├── cfgs
│   ├── voc.json
│   ├── __init__.py
│   └── config.py
├── lib
│   ├── __init__.py
│   ├── bbox
│   │   ├── __init__.py
│   │   ├── box_transform.py
│   │   └── bbox.py
│   ├── det_ops
│   │   ├── __init__.py
│   │   ├── anchors.py
│   │   ├── anchor_target.py
│   │   └── loss.py
│   └── nms
│       ├── .gitignore
│       ├── __init__.py
│       ├── Makefile
│       ├── gpu_nms.hpp
│       ├── nms_wrapper.py
│       ├── gpu_nms.pyx
│       ├── cpu_nms.pyx
│       ├── setup.py
│       ├── cpu_soft_nms.pyx
│       └── nms_kernel.cu
├── models
│   ├── __init__.py
│   ├── fpn.py
│   ├── retina.py
│   └── resnet.py
├── utils
│   ├── __init__.py
│   ├── logger.py
│   └── visualization.py
├── datasets
│   ├── __init__.py
│   ├── utils.py
│   ├── icdar15.py
│   ├── synthtext.py
│   └── minibatch.py
├── evaluation
│   ├── __init__.py
│   └── evaluation.py
├── .DS_Store
├── images
│   ├── .DS_Store
│   ├── icdar.png
│   └── synth.png
├── .idea
│   ├── vcs.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── RetinaNet-Text-Detection.iml
│   └── workspace.xml
├── README.md
├── LICENSE
├── train.py
└── test.py

/cfgs/voc.json:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/cfgs/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/bbox/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/det_ops/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/nms/.gitignore:
--------------------------------------------------------------------------------
1 | *.cpp
2 | 
--------------------------------------------------------------------------------
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wondervictor/RetinaNet-Text-Detection/HEAD/.DS_Store
--------------------------------------------------------------------------------
/lib/nms/__init__.py:
--------------------------------------------------------------------------------
1 | from .nms_wrapper import nms, soft_nms
2 | 
3 | __all__ = ['nms', 'soft_nms']
4 | 
--------------------------------------------------------------------------------
/images/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wondervictor/RetinaNet-Text-Detection/HEAD/images/.DS_Store
-------------------------------------------------------------------------------- /images/icdar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wondervictor/RetinaNet-Text-Detection/HEAD/images/icdar.png -------------------------------------------------------------------------------- /images/synth.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wondervictor/RetinaNet-Text-Detection/HEAD/images/synth.png -------------------------------------------------------------------------------- /lib/nms/Makefile: -------------------------------------------------------------------------------- 1 | 2 | all: 3 | echo "Compiling nms kernels..." 4 | python setup.py build_ext --inplace 5 | 6 | clean: 7 | rm -f *.so 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id, size_t base); 3 | size_t nms_Malloc(); 4 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## RetinaNet-Text-Detection 2 | 3 | **Work in progress (Fork of [wondervictor/RetinaNet](https://github.com/wondervictor/RetinaNet))** 4 | 5 | RetinaNet for `Text Detection` implemented with Pure PyTorch 6 | 7 | 8 | ### Results 9 | 10 | * ICDAR 11 | 12 | ![](images/icdar.png) 13 | 14 | * SynthText 15 | 16 | ![](images/synth.png) 17 | 18 | 19 | ### Licence 20 | 21 | This project is under the **MIT Licence** 22 | -------------------------------------------------------------------------------- /.idea/RetinaNet-Text-Detection.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /utils/logger.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Training Logger 4 | 5 | """ 6 | import torch 7 | 8 | 9 | class Logger: 10 | 11 | def __init__(self): 12 | pass 13 | 14 | 15 | def save_checkpoints(model, optimizer, epoch, iteration, path): 16 | 17 | state_dict = { 18 | "model": model.state_dict(), 19 | "optimizer": optimizer.state_dict(), 20 | "epoch": epoch, 21 | "iteration": iteration 22 | } 23 | 24 | torch.save(state_dict, path) 25 | 26 | 27 | def load_checkpoints(path): 28 | state_dict = torch.load(path) 29 | 30 | return state_dict['model'], state_dict['optimizer'], state_dict['epoch'], state_dict['iteration'] 31 | 32 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Vic Chan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /utils/visualization.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | """ 4 | 5 | import cv2 6 | import json 7 | import random 8 | import argparse 9 | 10 | 11 | def show_img(im_path, boxes): 12 | 13 | img = cv2.imread(im_path) 14 | for bb in boxes: 15 | if bb[4] < 0.3: 16 | continue 17 | img = cv2.rectangle(img, (int(bb[0]), int(bb[1])), (int(bb[2]), int(bb[3])), (0, 255, 0), 1) 18 | img = cv2.putText(img, '{}:{:.2f}'.format(bb[5], bb[4]), (int(bb[0]), int(bb[1])+10), cv2.FONT_HERSHEY_SIMPLEX, 19 | 0.5, (0, 255, 0), 1) 20 | cv2.imshow('img', img) 21 | 22 | cv2.waitKey(0) 23 | cv2.destroyAllWindows() 24 | 25 | 26 | def main(): 27 | parser = argparse.ArgumentParser() 28 | parser.add_argument('-d', '--dt', default='', type=str) 29 | args = parser.parse_args() 30 | with open(args.dt, 'r') as f: 31 | lines = f.readlines() 32 | lines = [json.loads(x.rstrip('\n')) for x in lines] 33 | random.shuffle(lines) 34 | dt = dict() 35 | for l in lines: 36 | name = l['image_id'] 37 | res = l['result'] 38 | _boxes = [] 39 | for bb in res: 40 | _boxes.append(bb['bbox']+[bb['prob'], bb['class']]) 41 | dt[name] = _boxes 42 | 43 | for k in dt.keys(): 44 | show_img('/public_datasets/SynthText/'+k, dt[k]) 45 | 46 | 47 | if __name__ == '__main__': 48 | 49 | main() 50 | -------------------------------------------------------------------------------- /lib/nms/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from .gpu_nms import gpu_nms 5 | from .cpu_nms import cpu_nms 6 | from .cpu_soft_nms import cpu_soft_nms 7 | 8 | 9 | def nms(dets, thresh, device_id=None): 10 | """Dispatch to either CPU or GPU NMS implementations.""" 11 | 12 | if isinstance(dets, torch.Tensor): 13 | if dets.is_cuda: 14 | device_id = dets.get_device() 15 | dets = dets.detach().cpu().numpy() 16 | assert isinstance(dets, np.ndarray) 17 | 18 | if dets.shape[0] == 0: 19 | inds = [] 20 | else: 21 | inds = (gpu_nms(dets, thresh, device_id=device_id) 22 | if device_id is not None else cpu_nms(dets, thresh)) 23 | 24 | if isinstance(dets, torch.Tensor): 25 | return dets.new_tensor(inds, dtype=torch.long) 26 | else: 
27 |         return np.array(inds, dtype=np.int64)
28 | 
29 | 
30 | def soft_nms(dets, Nt=0.3, method=1, sigma=0.5, min_score=0):
31 |     if isinstance(dets, torch.Tensor):
32 |         _dets = dets.detach().cpu().numpy()
33 |     else:
34 |         _dets = dets.copy()
35 |     assert isinstance(_dets, np.ndarray)
36 | 
37 |     new_dets, inds = cpu_soft_nms(
38 |         _dets, Nt=Nt, method=method, sigma=sigma, threshold=min_score)
39 | 
40 |     if isinstance(dets, torch.Tensor):
41 |         return dets.new_tensor(
42 |             inds, dtype=torch.long), dets.new_tensor(new_dets)
43 |     else:
44 |         return np.array(
45 |             inds, dtype=np.int64), np.array(
46 |             new_dets, dtype=np.float32)
47 | 
--------------------------------------------------------------------------------
/lib/nms/gpu_nms.pyx:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Faster R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 | 
8 | import numpy as np
9 | cimport numpy as np
10 | 
11 | assert sizeof(int) == sizeof(np.int32_t)
12 | 
13 | cdef extern from "gpu_nms.hpp":
14 |     void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int, size_t) nogil
15 |     size_t nms_Malloc() nogil
16 | 
17 | memory_pool = {}
18 | 
19 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
20 |             np.int32_t device_id=0):
21 |     cdef int boxes_num = dets.shape[0]
22 |     cdef int boxes_dim = dets.shape[1]
23 |     cdef int num_out
24 |     cdef size_t base
25 |     cdef np.ndarray[np.int32_t, ndim=1] \
26 |         keep = np.zeros(boxes_num, dtype=np.int32)
27 |     cdef np.ndarray[np.float32_t, ndim=1] \
28 |         scores = dets[:, 4]
29 |     cdef np.ndarray[np.int_t, ndim=1] \
30 |         order = scores.argsort()[::-1]
31 |     cdef np.ndarray[np.float32_t, ndim=2] \
32 |         sorted_dets = dets[order, :]
33 |     cdef float cthresh = thresh
34 |     if device_id not in memory_pool:
35 |         with nogil:
36 |             base = nms_Malloc()
37 |         memory_pool[device_id] = base
38 |         # print "malloc", base
39 |     base = memory_pool[device_id]
40 |     with nogil:
41 |         _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, cthresh, device_id, base)
42 |     keep = keep[:num_out]
43 |     return list(order[keep])
44 | 
--------------------------------------------------------------------------------
/datasets/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | 
3 | Dataset utils
4 | 
5 | """
6 | import cv2
7 | import numpy as np
8 | from PIL import Image
9 | 
10 | 
11 | def flip_pil_img_and_boxes(img, boxes=None):
12 |     """ Flip PIL Images and Boxes
13 |     Args:
14 |         img: PIL Image
15 |         boxes: [N, 4]
16 |     """
17 |     assert isinstance(img, Image.Image), "img should be PIL.Image"
18 |     w, h = img.size
19 |     flip_img = img.transpose(Image.FLIP_LEFT_RIGHT)
20 |     if boxes is not None:
21 |         flip_boxes = boxes.copy()
22 |         flip_boxes[:, 0] = w - boxes[:, 2] - 1
23 |         flip_boxes[:, 2] = w - boxes[:, 0] - 1
24 |         return flip_img, flip_boxes
25 |     else:
26 |         return flip_img
27 | 
28 | 
29 | def flip_img_boxes(img, boxes=None):
30 | 
31 |     h, w, c = img.shape
32 |     flip_img = cv2.flip(img, 1)
33 |     if boxes is not None:
34 |         flip_boxes = boxes.copy()
35 |         for i in range(flip_boxes.shape[0]):
36 |             flip_boxes[i, 0] = w - boxes[i, 2] - 1
37 |             flip_boxes[i, 2] = w - boxes[i, 0] - 1
38 |         return flip_img, flip_boxes
39 |     else:
40 |         return flip_img
41 | 
42 | 
43 | def normalize_image(img):
44 |     img = img / 255.0
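    # ImageNet channel statistics (RGB order); the image is scaled to [0, 1] first
45 |     mean = 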
np.array([.485, .456, .406]) 46 | std = np.array([.229, .224, .225]) 47 | img = (img - mean) / std 48 | return img 49 | 50 | 51 | def get_im_scale(h, w, target_size, max_size): 52 | img_min_size = min(h, w) 53 | img_max_size = max(h, w) 54 | scale = target_size / img_min_size 55 | if scale * img_max_size > max_size: 56 | scale = max_size / img_max_size 57 | 58 | return int(round(h*scale)), int(round(w*scale)), scale 59 | -------------------------------------------------------------------------------- /lib/det_ops/anchors.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate Anchors 3 | """ 4 | 5 | import math 6 | import torch 7 | 8 | 9 | def mesh_grid(x, y): 10 | """ mesh grid 11 | 12 | """ 13 | a = torch.arange(0, x) 14 | b = torch.arange(0, y) 15 | xx = a.repeat(y).view(-1, 1) 16 | yy = b.view(-1, 1).repeat(1, x).view(-1, 1) 17 | 18 | return torch.cat([xx, yy], dim=1).float() 19 | 20 | 21 | def compute_anchor_whs(num_layers, areas, aspect_ratios, sizes): 22 | anchors = [] 23 | for i in range(len(areas)): 24 | area = areas[i] 25 | for ar in aspect_ratios: 26 | h = math.sqrt(area / ar) 27 | w = h * ar 28 | for s in sizes: 29 | anchor_h = h * s 30 | anchor_w = w * s 31 | anchors.append([anchor_w, anchor_h]) 32 | # M * K * 2 33 | # Faster R-CNN: 1*K*2 (1x9x2) 34 | # FPN: 5*K*2 (5x3x2) 35 | # RetinaNet: 5*K*2 (5*9*2) 36 | return torch.Tensor(anchors).view(num_layers, -1, 2) 37 | 38 | 39 | def generate_anchors(anchor_whs, input_size, strides): 40 | """ generate anchors 41 | """ 42 | boxes = [] 43 | num_strides = len(strides) 44 | num_anchors = anchor_whs.shape[1] 45 | 46 | for i in range(num_strides): 47 | stride = strides[i] 48 | feature_size = input_size / stride 49 | fmw, fmh = int(math.ceil(feature_size[0])), int(math.ceil(feature_size[1])) 50 | xy = mesh_grid(fmh, fmw) + 0.5 # shift to center 51 | xy = (xy * stride).view(fmh, fmw, 1, 2).expand(fmh, fmw, num_anchors, 2) 52 | wh = anchor_whs[i].view(1, 1, num_anchors, 2).expand(fmh, fmw, num_anchors, 2) 53 | box = torch.cat([xy, wh], dim=3) 54 | boxes.append(box.view(-1, 4)) 55 | boxes = torch.cat(boxes, 0) 56 | # box: H * W * self._num_anchors * 2 57 | return boxes 58 | -------------------------------------------------------------------------------- /models/fpn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Feature Pyramid Network for Object Detection 3 | 4 | """ 5 | 6 | import torch 7 | import torch.nn.functional as F 8 | from torchvision.models import resnet 9 | import torch.nn as nn 10 | from .resnet import ResNet50Stages 11 | 12 | 13 | class FPN50(nn.Module): 14 | 15 | def __init__(self, pretrained_path): 16 | super(FPN50, self).__init__() 17 | self.backbone = ResNet50Stages(pretrained_path) 18 | 19 | self.lateral_layer1 = nn.Conv2d(2048, 256, 1) 20 | self.lateral_layer2 = nn.Conv2d(1024, 256, 1) 21 | self.lateral_layer3 = nn.Conv2d(512, 256, 1) 22 | 23 | self.conv6 = nn.Conv2d(2048, 256, 3, padding=1, stride=2) 24 | self.conv7 = nn.Conv2d(256, 256, 3, padding=1, stride=2) 25 | self.relu = nn.ReLU(inplace=True) 26 | 27 | self._weight_initialize() 28 | 29 | def _weight_initialize(self): 30 | 31 | self.lateral_layer1.weight.data.normal_(std=0.01) 32 | self.lateral_layer1.bias.data.fill_(0.0) 33 | 34 | self.lateral_layer2.weight.data.normal_(std=0.01) 35 | self.lateral_layer2.bias.data.fill_(0.0) 36 | 37 | self.lateral_layer3.weight.data.normal_(std=0.01) 38 | self.lateral_layer3.bias.data.fill_(0.0) 39 | 40 | 
self.conv6.weight.data.normal_(std=0.01)
41 |         self.conv6.bias.data.fill_(0.0)
42 | 
43 |         self.conv7.weight.data.normal_(std=0.01)
44 |         self.conv7.bias.data.fill_(0.0)
45 | 
46 |     def upsample_add(self, x, y):
47 |         _, _, h, w = y.size()
48 |         return F.interpolate(x, size=(h, w), mode='bilinear', align_corners=True) + y  # upsample top-down x, then add lateral y
49 | 
50 |     def forward(self, x):
51 |         c3, c4, c5 = self.backbone(x)
52 |         # c2: 64*4=256 c3: 128*4=512 c4: 256*4=1024 c5: 512*4=2048
53 | 
54 |         p5 = self.lateral_layer1(c5)
55 | 
56 |         p4 = self.lateral_layer2(c4)
57 |         p4 = self.upsample_add(p5, p4)
58 | 
59 |         p3 = self.lateral_layer3(c3)
60 |         p3 = self.upsample_add(p4, p3)
61 | 
62 |         p6 = self.conv6(c5)
63 |         p7 = self.conv7(self.relu(p6))
64 | 
65 |         return p3, p4, p5, p6, p7
66 | 
--------------------------------------------------------------------------------
/lib/bbox/box_transform.py:
--------------------------------------------------------------------------------
1 | """
2 | BBox transform
3 | """
4 | 
5 | import torch
6 | 
7 | 
8 | def bbox_transform(boxes, gtboxes):
9 |     """ Bounding Box Transform
10 |     from groundtruth boxes and proposal boxes to deltas
11 | 
12 |     Args:
13 |         boxes: [N, 4] torch.Tensor (xywh, anchor center format)
14 |         gtboxes: [N, 4] torch.Tensor (xyxy)
15 |     Return:
16 |         delta: [N, 4] torch.Tensor
17 |     """
18 |     gt_w = gtboxes[:, 2] - gtboxes[:, 0] + 1
19 |     gt_h = gtboxes[:, 3] - gtboxes[:, 1] + 1
20 |     # center
21 |     gt_x = gtboxes[:, 0] + 0.5 * gt_w
22 |     gt_y = gtboxes[:, 1] + 0.5 * gt_h
23 | 
24 |     # Anchors [x,y,w,h]
25 |     anchor_x = boxes[:, 0]
26 |     anchor_y = boxes[:, 1]
27 |     anchor_w = boxes[:, 2]
28 |     anchor_h = boxes[:, 3]
29 |     # anchor_w = boxes[:, 2] - boxes[:, 0] + 1
30 |     # anchor_h = boxes[:, 3] - boxes[:, 1] + 1
31 |     # # center
32 |     # anchor_x = boxes[:, 0] + 0.5 * anchor_w
33 |     # anchor_y = boxes[:, 1] + 0.5 * anchor_h
34 | 
35 |     delta_x = (gt_x - anchor_x) / anchor_w
36 |     delta_y = (gt_y - anchor_y) / anchor_h
37 |     delta_w = torch.log(gt_w / anchor_w)
38 |     delta_h = torch.log(gt_h / anchor_h)
39 | 
40 |     # [N, 4]
41 |     return torch.stack([delta_x, delta_y, delta_w, delta_h]).transpose(0, 1)
42 | 
43 | 
44 | def bbox_transform_inv(boxes, delta):
45 |     """ Inverse Bounding Box Transform
46 |     from deltas and proposal boxes to predicted boxes
47 |     Args:
48 |         boxes: [N, 4] torch.Tensor (xywh)
49 |         delta: [N, 4] torch.Tensor (xywh)
50 |     Return:
51 |         pred: [N, 4] torch.Tensor (xyxy)
52 |     """
53 |     pred_boxes = torch.zeros_like(boxes)
54 |     pred_x = boxes[:, 0] + boxes[:, 2] * delta[:, 0]
55 |     pred_y = boxes[:, 1] + boxes[:, 3] * delta[:, 1]
56 |     pred_w = boxes[:, 2] * torch.exp(delta[:, 2])
57 |     pred_h = boxes[:, 3] * torch.exp(delta[:, 3])
58 | 
59 |     pred_boxes[:, 0] = pred_x - 0.5 * pred_w
60 |     pred_boxes[:, 1] = pred_y - 0.5 * pred_h
61 |     pred_boxes[:, 2] = pred_x + 0.5 * pred_w
62 |     pred_boxes[:, 3] = pred_y + 0.5 * pred_h
63 | 
64 |     return pred_boxes
65 | 
66 | 
67 | if __name__ == '__main__':
68 | 
69 |     pass
70 | 
--------------------------------------------------------------------------------
/cfgs/config.py:
--------------------------------------------------------------------------------
1 | """
2 | Config File
3 | """
4 | 
5 | 
6 | config = {
7 | 
8 |     "synth_baseline": {
9 |         # lr and general config
10 |         'base_lr': 1e-2,
11 |         "lr_decay": [60000, 80000],
12 |         "workers": 8,
13 |         "num_classes": 21,
14 |         "weight_decay": 1e-4,
15 |         "epochs": 200,
16 | 
17 |         "basemodel_path": '/home/tianhengcheng/.torch/models/resnet50-19c8e357.pth',
18 |         "data_dir": "/public_datasets/SynthText",
19 | 
20 |         # anchor config
21 |         "positive_anchor_threshold": 0.5,
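        # IoU >= 0.5 marks an anchor positive, IoU < 0.4 negative; the band in
        # between is ignored by lib/det_ops/anchor_target.py. With 3 sizes x 3
        # aspect ratios there are 9 anchors per feature-map location.
22 | 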
"negative_anchor_threshold": 0.4, 23 | "anchor_sizes": [2 ** 0, 2 ** (1 / 3), 2 ** (2 / 3)], 24 | "aspect_ratios": [1, 3, 5], 25 | "anchor_areas": [32 ** 2, 64 ** 2, 128 ** 2, 256 ** 2, 512 ** 2], 26 | "strides": [8, 16, 32, 64, 128], 27 | "base_size": 8, 28 | 29 | # dataset 30 | "image_scales": [600], 31 | "max_image_size": 1000, 32 | 33 | # test config 34 | "pre_nms_boxes": 1000, 35 | "test_nms": 0.5, 36 | "test_max_boxes": 300, 37 | "cls_thresh": 0.05, 38 | 39 | # log 40 | "logdir": "log", 41 | "tb_dump_dir": "", 42 | "model_dump_dir": "", 43 | }, 44 | 45 | "icdar_baseline": { 46 | # lr and general config 47 | 'base_lr': 1e-2, 48 | "lr_decay": [60000, 80000], 49 | "workers": 8, 50 | "num_classes": 21, 51 | "weight_decay": 1e-4, 52 | "epochs": 200, 53 | 54 | "basemodel_path": '/home/tianhengcheng/.torch/models/resnet50-19c8e357.pth', 55 | "data_dir": "/public_datasets/Text/icdar2015/", 56 | 57 | # anchor config 58 | "positive_anchor_threshold": 0.5, 59 | "negative_anchor_threshold": 0.4, 60 | "anchor_sizes": [2 ** 0, 2 ** (1 / 3), 2 ** (2 / 3)], 61 | "aspect_ratios": [1, 3, 5], 62 | "anchor_areas": [32 ** 2, 64 ** 2, 128 ** 2, 256 ** 2, 512 ** 2], 63 | "strides": [8, 16, 32, 64, 128], 64 | "base_size": 8, 65 | 66 | # dataset 67 | "image_scales": [600], 68 | "max_image_size": 1000, 69 | 70 | # test config 71 | "pre_nms_boxes": 1000, 72 | "test_nms": 0.5, 73 | "test_max_boxes": 300, 74 | "cls_thresh": 0.05, 75 | 76 | # log 77 | "logdir": "log", 78 | "tb_dump_dir": "", 79 | "model_dump_dir": "", 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /lib/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i 
+ 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /lib/det_ops/anchor_target.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Anchor Layer 4 | 5 | 6 | """ 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | from lib.det_ops.anchors import compute_anchor_whs, generate_anchors 11 | import sys 12 | sys.path.append('../') 13 | from lib.bbox import bbox, box_transform 14 | from IPython import embed 15 | 16 | 17 | class AnchorLayer: 18 | """ Anchor Layer 19 | """ 20 | def __init__(self, strides, areas, aspect_ratios, sizes): 21 | self.aspect_ratios = aspect_ratios 22 | self.areas = areas 23 | self.strides = strides 24 | self.sizes = sizes 25 | 26 | self._anchor_sizes = self._compute_anchor_size() 27 | 28 | # self._num_anchors = len(self.scales) * len(self.aspect_ratios) 29 | 30 | def _compute_anchor_size(self): 31 | return compute_anchor_whs(len(self.strides), self.areas, self.aspect_ratios, self.sizes) 32 | 33 | def _generate_anchors(self, input_size): 34 | boxes = generate_anchors(self._anchor_sizes, input_size, self.strides) 35 | return boxes 36 | 37 | def assign(self, gt_boxes, labels, input_size, neg_thresh=0.4, pos_thresh=0.5): 38 | """ assign groundtruth box to anchor box 39 | 40 | """ 41 | anchor_boxes = self._generate_anchors(input_size) 42 | if labels.shape[0] == 0: 43 | return torch.LongTensor([0]*anchor_boxes.shape[0]), torch.zeros_like(anchor_boxes) 44 | # M * N 45 | xyxy_anchors = bbox.xywh2xyxy(anchor_boxes) 46 | ious = bbox.box_overlaps(xyxy_anchors, gt_boxes) 47 | max_ious, max_inds = ious.max(1) 48 | # M * 4 49 | matched_boxes = gt_boxes[max_inds] 50 | box_targets = box_transform.bbox_transform(anchor_boxes, matched_boxes) 51 | 52 | cls_targets = labels[max_inds] 53 | # negative 54 | cls_targets[max_ious < neg_thresh] = 0 55 | # ignore 56 | cls_targets[(max_ious > neg_thresh) & (max_ious < pos_thresh)] = -1 57 | return cls_targets, box_targets 58 | 59 | 60 | if __name__ == '__main__': 61 | # RetinaNet settings 62 | strides = [8, 16, 32, 64, 128] 63 | aspect_ratios = [0.5, 1, 2] 64 | sizes = [1, 2**(1/3), 2**(2/3)] 65 | areas = [32**2, 64**2, 128**2, 256**2, 512**2] 66 | anchor_layer = AnchorLayer(strides=strides, areas=areas, aspect_ratios=aspect_ratios, sizes=sizes) 67 | 68 | boxes = torch.Tensor([[10, 20, 44, 56], [50, 34, 260, 340], 69 | [70, 80, 190, 410], [360, 270, 500, 600]]) 70 | labels = torch.LongTensor([3, 1, 1, 4]) 71 | cls_target, box_target = anchor_layer.assign(boxes, labels, torch.FloatTensor([600, 600]), 0.4, 0.5) 72 | 73 | embed() 74 | -------------------------------------------------------------------------------- /lib/nms/setup.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | from distutils.core import setup, Extension 3 | 4 | import numpy as np 5 | from Cython.Build import cythonize 6 | from Cython.Distutils import build_ext 7 | 8 | # extensions 9 | ext_args = dict( 10 | include_dirs=[np.get_include()], 11 | language='c++', 12 | extra_compile_args={ 13 | 'cc': ['-Wno-unused-function', '-Wno-write-strings'], 14 | 'nvcc': 
['-c', '--compiler-options', '-fPIC'], 15 | }, 16 | ) 17 | 18 | extensions = [ 19 | Extension('cpu_nms', ['cpu_nms.pyx'], **ext_args), 20 | Extension('cpu_soft_nms', ['cpu_soft_nms.pyx'], **ext_args), 21 | Extension('gpu_nms', ['gpu_nms.pyx', 'nms_kernel.cu'], **ext_args), 22 | ] 23 | 24 | 25 | def customize_compiler_for_nvcc(self): 26 | """inject deep into distutils to customize how the dispatch 27 | to cc/nvcc works. 28 | If you subclass UnixCCompiler, it's not trivial to get your subclass 29 | injected in, and still have the right customizations (i.e. 30 | distutils.sysconfig.customize_compiler) run on it. So instead of going 31 | the OO route, I have this. Note, it's kindof like a wierd functional 32 | subclassing going on.""" 33 | 34 | # tell the compiler it can processes .cu 35 | self.src_extensions.append('.cu') 36 | 37 | # save references to the default compiler_so and _comple methods 38 | default_compiler_so = self.compiler_so 39 | super = self._compile 40 | 41 | # now redefine the _compile method. This gets executed for each 42 | # object but distutils doesn't have the ability to change compilers 43 | # based on source extension: we add it. 44 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 45 | if osp.splitext(src)[1] == '.cu': 46 | # use the cuda for .cu files 47 | self.set_executable('compiler_so', 'nvcc') 48 | # use only a subset of the extra_postargs, which are 1-1 translated 49 | # from the extra_compile_args in the Extension class 50 | postargs = extra_postargs['nvcc'] 51 | else: 52 | postargs = extra_postargs['cc'] 53 | 54 | super(obj, src, ext, cc_args, postargs, pp_opts) 55 | # reset the default compiler_so, which we might have changed for cuda 56 | self.compiler_so = default_compiler_so 57 | 58 | # inject our redefined _compile method into the class 59 | self._compile = _compile 60 | 61 | 62 | # run the customize_compiler 63 | class custom_build_ext(build_ext): 64 | 65 | def build_extensions(self): 66 | customize_compiler_for_nvcc(self.compiler) 67 | build_ext.build_extensions(self) 68 | 69 | 70 | setup( 71 | name='nms', 72 | cmdclass={'build_ext': custom_build_ext}, 73 | ext_modules=cythonize(extensions), 74 | ) 75 | -------------------------------------------------------------------------------- /datasets/icdar15.py: -------------------------------------------------------------------------------- 1 | """ 2 | ICDAR2015 for Text Detection 3 | """ 4 | 5 | import os 6 | import cv2 7 | import json 8 | import torch 9 | import random 10 | import numpy as np 11 | from PIL import Image 12 | from scipy.io import loadmat 13 | from torch.utils.data import Dataset 14 | from torchvision.transforms import transforms 15 | from datasets.utils import normalize_image, get_im_scale 16 | 17 | 18 | CLASSES = ('text',) 19 | NUM_CLASSES = 2 20 | 21 | 22 | class ICDAR15(Dataset): 23 | 24 | def __init__(self, dataroot, config, imageset='train'): 25 | assert imageset == 'train' or imageset == 'val' or imageset == 'all' 26 | self._imageset = imageset 27 | self._annotation_file = os.path.join(dataroot, '{}.odgt'.format(imageset)) 28 | self._base_dir = os.path.join(dataroot, '{}_images'.format(imageset)) 29 | self.name = 'ICDAR15' 30 | self.config = config 31 | self.annotations = self._read_annotations() 32 | 33 | def _read_annotations(self): 34 | # im path -> annotations 35 | with open(self._annotation_file, 'r') as f: 36 | lines = f.readlines() 37 | lines = list(map(lambda x: json.loads(x.rstrip('\n')), lines)) 38 | return lines 39 | 40 | def __len__(self): 41 | return 
len(self.annotations) 42 | 43 | def __getitem__(self, idx): 44 | annotation = self.annotations[idx] 45 | im_name = annotation['im_name'] 46 | gt_boxes = annotation['gtboxes'] 47 | try: 48 | img = Image.open(os.path.join(self._base_dir, im_name)) 49 | except OSError as e: 50 | idx = random.randint(0, len(self)) 51 | annotation = self.annotations[idx] 52 | im_name = annotation['im_name'] 53 | gt_boxes = annotation['gtboxes'] 54 | img = Image.open(os.path.join(self._base_dir, im_name)) 55 | 56 | if self._imageset == 'val': 57 | # testing or validation mode, original scale 58 | img = np.array(img).astype('float32') 59 | h, w = img.shape[:2] 60 | resize_h, resize_w, scale = get_im_scale(h, w, target_size=self.config['test_image_size'][0], 61 | max_size=self.config['test_max_image_size']) 62 | img = cv2.resize(img, (resize_w, resize_h)) 63 | img = normalize_image(img) 64 | img = img.transpose(2, 0, 1) 65 | img = torch.Tensor(img) 66 | return img, im_name, scale, (h, w) 67 | 68 | img = np.array(img).astype('float32') 69 | labels = np.ones(len(gt_boxes), dtype=np.int32) 70 | labels = torch.LongTensor(labels) 71 | boxes = np.array(gt_boxes, dtype=np.float32) 72 | # C, H, W 73 | 74 | return img, labels, boxes 75 | 76 | 77 | -------------------------------------------------------------------------------- /datasets/synthtext.py: -------------------------------------------------------------------------------- 1 | """ 2 | Synth800K for Text Detection 3 | """ 4 | 5 | import os 6 | import cv2 7 | import json 8 | import torch 9 | import random 10 | import numpy as np 11 | from PIL import Image 12 | from scipy.io import loadmat 13 | from torch.utils.data import Dataset 14 | from torchvision.transforms import transforms 15 | from datasets.utils import normalize_image, get_im_scale 16 | 17 | 18 | CLASSES = ('text',) 19 | NUM_CLASSES = 2 20 | SYNTHTEXT_ROOT = '/public_datasets/SynthText' 21 | 22 | 23 | class SynthText(Dataset): 24 | 25 | def __init__(self, dataroot,config, imageset='train'): 26 | assert imageset == 'train' or imageset == 'val' or imageset == 'all' 27 | self._imageset = imageset 28 | self._annotation_file = os.path.join(dataroot, '{}.odgt'.format(imageset)) 29 | self._base_dir = dataroot 30 | self.config = config 31 | self.name = 'SynthText80K' 32 | self.annotations = self._read_annotations() 33 | 34 | def _read_annotations(self): 35 | # im path -> annotations 36 | with open(self._annotation_file, 'r') as f: 37 | lines = f.readlines() 38 | lines = list(map(lambda x: json.loads(x.rstrip('\n')), lines)) 39 | return lines 40 | 41 | def __len__(self): 42 | return len(self.annotations) 43 | 44 | def __getitem__(self, idx): 45 | annotation = self.annotations[idx] 46 | im_name = annotation['im_name'] 47 | gt_boxes = annotation['gtboxes'] 48 | try: 49 | img = Image.open(os.path.join(self._base_dir, im_name)) 50 | except OSError as e: 51 | idx = random.randint(0, len(self)) 52 | annotation = self.annotations[idx] 53 | im_name = annotation['im_name'] 54 | gt_boxes = annotation['gtboxes'] 55 | img = Image.open(os.path.join(self._base_dir, im_name)) 56 | 57 | if self._imageset == 'val': 58 | # testing or validation mode, original scale 59 | img = np.array(img).astype('float32') 60 | h, w = img.shape[:2] 61 | resize_h, resize_w, scale = get_im_scale(h, w, target_size=self.config['test_image_size'][0], 62 | max_size=self.config['test_max_image_size']) 63 | img = cv2.resize(img, (resize_w, resize_h)) 64 | img = normalize_image(img) 65 | img = img.transpose(2, 0, 1) 66 | img = torch.Tensor(img) 67 | return 
img, im_name, scale, (h, w) 68 | 69 | img = np.array(img).astype('float32') 70 | labels = np.ones(len(gt_boxes), dtype=np.int32) 71 | labels = torch.LongTensor(labels) 72 | boxes = np.array(gt_boxes, dtype=np.float32) 73 | # C, H, W 74 | 75 | return img, labels, boxes 76 | 77 | -------------------------------------------------------------------------------- /lib/det_ops/loss.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Loss functions for Detection 4 | 5 | """ 6 | import math 7 | import numpy as np 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | from IPython import embed 12 | __all__ = ['SmoothL1Loss', 'SoftmaxCrossEntropy', 'SigmoidCrossEntropy', 'SigmoidFocalLoss'] 13 | 14 | 15 | SoftmaxCrossEntropy = nn.CrossEntropyLoss 16 | SigmoidCrossEntropy = nn.BCEWithLogitsLoss 17 | 18 | 19 | class SigmoidFocalLoss(nn.Module): 20 | """ Focal Loss 21 | 22 | Args: 23 | 24 | Input: 25 | pred: [] 26 | target: [] 27 | Output: 28 | 29 | """ 30 | def __init__(self, background=0, gamma=2, alpha=0.25): 31 | super(SigmoidFocalLoss, self).__init__() 32 | self.gamma = gamma 33 | self.alpha = alpha 34 | self.background = background 35 | 36 | def forward(self, pred, target): 37 | # pred.shape = [B, K, C] 38 | # target.shape = [B, N] 39 | B, N, C = pred.size() 40 | pred_sigmoid = pred.sigmoid() 41 | # # ignore: [B, N] 42 | # keep_mask = target > -1 43 | # # [B, N], ignore and background shared 0 index 44 | # keep_mask = keep_mask.long() 45 | mask = (target > -1) 46 | new_target = target * mask.long() 47 | new_target = new_target.reshape((new_target.size()[0], new_target.size()[1], 1)) 48 | onehot_target = torch.zeros((B, N, C+1)).cuda() 49 | onehot_target.scatter_(2, new_target, 1.0) 50 | onehot = onehot_target[:, :, 1:].float() 51 | mask = mask.unsqueeze(2).float() 52 | 53 | # pred_sigmoid = pred_sigmoid.clamp(min=1e-6, max=1-1e-6) 54 | # bce = -(onehot * torch.log(pred_sigmoid) + (1-onehot)*torch.log(1-pred_sigmoid)) 55 | 56 | weight = self.alpha*((1-pred_sigmoid).pow(self.gamma))*onehot \ 57 | + (1-self.alpha)*(pred_sigmoid.pow(self.gamma))*(1-onehot) 58 | weight = weight * mask 59 | avg_factor = torch.sum(target > 0, dim=1).float() 60 | # embed() 61 | loss = F.binary_cross_entropy_with_logits(pred, onehot, weight, reduction='none').sum(dim=1).sum(dim=1) 62 | loss = loss.div_(avg_factor.clamp(min=1.0)).mean() 63 | 64 | return loss 65 | 66 | 67 | class SmoothL1Loss(nn.Module): 68 | 69 | def __init__(self): 70 | super(SmoothL1Loss, self).__init__() 71 | self.smooth_l1 = nn.SmoothL1Loss(reduction='none') 72 | 73 | def forward(self, offset, target, cls_target): 74 | # ignore background and ignore label 75 | # offset B*N*4 76 | # target B*N*4 77 | # cls: B*N*C 78 | # bg_mask = cls_target == 0 79 | # ig_mask = cls_target == -1 80 | 81 | mask = cls_target > 0 # ig_mask * bg_mask 82 | mask = mask.float() 83 | loss_raw = (self.smooth_l1(offset, target).sum(2)) * mask 84 | loss = loss_raw.sum(dim=1).div_(mask.sum(dim=1).clamp(min=1.0)).mean() 85 | return loss -------------------------------------------------------------------------------- /models/retina.py: -------------------------------------------------------------------------------- 1 | """ 2 | RetinaNet Model 3 | backbone: resnet50 + FPN 4 | """ 5 | 6 | import torch 7 | import torch.nn as nn 8 | import numpy as np 9 | from models import fpn 10 | 11 | 12 | class RetinaNetHead(nn.Module): 13 | 14 | def __init__(self, num_classes, num_anchors): 15 | super(RetinaNetHead, 
self).__init__() 16 | self.num_classes = num_classes 17 | 18 | self.cls_branch = nn.Sequential( 19 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 20 | nn.ReLU(inplace=True), 21 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 22 | nn.ReLU(inplace=True), 23 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 24 | nn.ReLU(inplace=True), 25 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 26 | nn.ReLU(inplace=True) 27 | ) 28 | self.cls_score = nn.Conv2d(256, out_channels=num_classes*num_anchors, kernel_size=3, stride=1, padding=1) 29 | 30 | self.bbox_branch = nn.Sequential( 31 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 32 | nn.ReLU(inplace=True), 33 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 34 | nn.ReLU(inplace=True), 35 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 36 | nn.ReLU(inplace=True), 37 | nn.Conv2d(256, out_channels=256, kernel_size=3, stride=1, padding=1), 38 | nn.ReLU(inplace=True), 39 | nn.Conv2d(256, out_channels=num_anchors*4, kernel_size=3, stride=1, padding=1) 40 | ) 41 | self._initialize_weights() 42 | 43 | def _initialize_weights(self): 44 | for m in self.cls_branch.modules(): 45 | if isinstance(m, nn.Conv2d): 46 | m.weight.data.normal_(0, 0.01) 47 | m.bias.data.fill_(0) 48 | 49 | for m in self.bbox_branch.modules(): 50 | if isinstance(m, nn.Conv2d): 51 | m.weight.data.normal_(0, 0.01) 52 | m.bias.data.fill_(0) 53 | 54 | self.cls_score.weight.data.normal_(0, 0.01) 55 | pi = 0.01 56 | self.cls_score.bias.data.fill_(-np.log((1 - pi) / pi)) 57 | 58 | def forward(self, x): 59 | bbox_output = self.bbox_branch(x) 60 | bbox_output = bbox_output.permute(0, 2, 3, 1).contiguous().view(x.size()[0], -1, 4) 61 | cls_output = self.cls_score(self.cls_branch(x)) 62 | cls_output = cls_output.permute(0, 2, 3, 1).contiguous().view(x.size()[0], -1, self.num_classes) 63 | return cls_output, bbox_output 64 | 65 | 66 | class RetinaNet(nn.Module): 67 | 68 | def __init__(self, num_classes, num_anchors, pretrained_path): 69 | super(RetinaNet, self).__init__() 70 | self.fpn = fpn.FPN50(pretrained_path) 71 | self.head = RetinaNetHead(num_classes, num_anchors) 72 | 73 | def forward(self, x): 74 | # [P3, P4, P5, P6, P7] 75 | # stride: [8, 16, 32, 64, 128] 76 | feature_pyramids = self.fpn(x) 77 | cls_outputs = [] 78 | bbox_outputs = [] 79 | for fp in feature_pyramids: 80 | cls_output, bbox_output = self.head(fp) 81 | cls_outputs.append(cls_output) 82 | bbox_outputs.append(bbox_output) 83 | 84 | cls_outputs = torch.cat(cls_outputs, dim=1) 85 | bbox_outputs = torch.cat(bbox_outputs, dim=1) 86 | 87 | return cls_outputs, bbox_outputs 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /datasets/minibatch.py: -------------------------------------------------------------------------------- 1 | """ 2 | Create Mini Batch 3 | """ 4 | import cv2 5 | import torch 6 | import random 7 | import numpy as np 8 | from datasets.utils import flip_img_boxes 9 | from lib.det_ops import anchor_target 10 | from datasets.utils import normalize_image, get_im_scale 11 | 12 | 13 | def create_minibatch_func(config): 14 | aspect_ratios = config['aspect_ratios'] 15 | anchor_sizes = config['anchor_sizes'] 16 | anchor_areas = config['anchor_areas'] 17 | strides = config['strides'] 18 | 19 | anchor_layer = anchor_target.AnchorLayer(aspect_ratios=aspect_ratios, 20 | sizes=anchor_sizes, 21 | areas=anchor_areas, 22 | 
strides=strides) 23 | 24 | def collate_minibatch(batch): 25 | # (img, labels, boxes) 26 | # img: [H, W, C] 27 | # labels: [N] 28 | # boxes: [N, 4] 29 | batch_size = len(batch) 30 | max_size = config['max_image_size'] 31 | # [N, 1] 32 | target_size_inds = np.random.randint( 33 | 0, high=len(config['image_scales']), size=batch_size 34 | ) 35 | 36 | image_shapes = np.zeros((batch_size, 2), dtype=np.int) 37 | image_scales = np.zeros(batch_size, dtype=np.float) 38 | batch_height = 0 39 | batch_width = 0 40 | for i in range(batch_size): 41 | h, w = batch[i][0].shape[:2] 42 | target_size = config['image_scales'][target_size_inds[i]] 43 | h_, w_, s_ = get_im_scale(h, w, target_size, max_size) 44 | image_shapes[i, 0] = h_ 45 | image_shapes[i, 1] = w_ 46 | image_scales[i] = s_ 47 | batch_height = max(h_, batch_height) 48 | batch_width = max(w_, batch_width) 49 | 50 | # pad images to support the last stride 51 | max_stride = strides[-1] 52 | batch_width = int(np.ceil(batch_width/max_stride)*max_stride) 53 | batch_height = int(np.ceil(batch_height/max_stride)*max_stride) 54 | 55 | labels = [] 56 | gtboxes = [] 57 | batch_images = torch.zeros((batch_size, 3, batch_height, batch_width)) 58 | input_size = np.array([batch_height, batch_width]) 59 | for i in range(batch_size): 60 | img, label, boxes = batch[i] 61 | boxes = boxes.astype('float32') 62 | h, w = image_shapes[i] 63 | scale = image_scales[i] 64 | img = cv2.resize(img, (w, h)) 65 | 66 | # OpenCV resize (W, H) 67 | boxes = boxes * scale 68 | if random.random() < 0.5: 69 | img, boxes = flip_img_boxes(img, boxes) 70 | 71 | # transform or data augmentation 72 | img = normalize_image(img) 73 | img = img.transpose(2, 0, 1) 74 | img = torch.Tensor(img) 75 | # assign anchors 76 | boxes = torch.Tensor(boxes) 77 | label, boxes = anchor_layer.assign(boxes, label, input_size=input_size, 78 | neg_thresh=config['negative_anchor_threshold'], 79 | pos_thresh=config['positive_anchor_threshold']) 80 | 81 | labels.append(label.unsqueeze(0)) 82 | gtboxes.append(boxes.unsqueeze(0)) 83 | # print(img.shape, batch_images.shape) 84 | batch_images[i, :, :h, :w] = img 85 | 86 | labels = torch.cat(labels, dim=0) 87 | gtboxes = torch.cat(gtboxes, dim=0) 88 | return batch_images, labels, gtboxes 89 | 90 | return collate_minibatch 91 | -------------------------------------------------------------------------------- /evaluation/evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | """ 4 | import json 5 | import argparse 6 | import numpy as np 7 | from IPython import embed 8 | 9 | 10 | def calculate_ap(recall, precision): 11 | mrec = np.concatenate(([0.], recall, [1.])) 12 | mpre = np.concatenate(([0.], precision, [0.])) 13 | 14 | # compute the precision envelope 15 | for i in range(mpre.size - 1, 0, -1): 16 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 17 | 18 | # to calculate area under PR curve, look for points 19 | # where X axis (recall) changes value 20 | i = np.where(mrec[1:] != mrec[:-1])[0] 21 | 22 | # and sum (\Delta recall) * prec 23 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 24 | 25 | return ap 26 | 27 | 28 | def eval_ap(predict_path, gt_path, iou_thresh): 29 | 30 | with open(predict_path, 'r') as f: 31 | lines = f.readlines() 32 | predictions = [json.loads(x.rstrip('\n')) for x in lines] 33 | 34 | with open(gt_path, 'r') as f: 35 | lines = f.readlines() 36 | gt = [json.loads(x.rstrip('\n')) for x in lines] 37 | 38 | predict_boxes = [] 39 | for p in predictions: 40 | im_name = p['image_id'] 41 | 
boxes = p['result'] 42 | for bb in boxes: 43 | bb['im_name'] = im_name 44 | predict_boxes.append(bb) 45 | 46 | gt_boxes = dict() 47 | npos = 0 48 | for g in gt: 49 | gt_boxes[g['im_name']] = {'box': np.array(g['gtboxes']), 50 | 'flag': np.zeros(len(g['gtboxes']), dtype=int)} 51 | npos += len(g['gtboxes']) 52 | 53 | # sort 54 | predict_boxes = sorted(predict_boxes, key=lambda x: x['prob'], reverse=True) 55 | tp = np.zeros(len(predict_boxes)) 56 | fp = np.zeros(len(predict_boxes)) 57 | for i in range(len(predict_boxes)): 58 | box = predict_boxes[i] 59 | im_name = box['im_name'] 60 | _gt_boxes = gt_boxes[im_name]['box'] 61 | bb = box['bbox'] 62 | bb = np.array(bb) 63 | 64 | if len(_gt_boxes) > 0: 65 | 66 | ixmin = np.maximum(_gt_boxes[:, 0], bb[0]) 67 | iymin = np.maximum(_gt_boxes[:, 1], bb[1]) 68 | ixmax = np.minimum(_gt_boxes[:, 2], bb[2]) 69 | iymax = np.minimum(_gt_boxes[:, 3], bb[3]) 70 | iw = np.maximum(ixmax - ixmin + 1., 0.) 71 | ih = np.maximum(iymax - iymin + 1., 0.) 72 | inters = iw * ih 73 | 74 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 75 | (_gt_boxes[:, 2] - _gt_boxes[:, 0] + 1.) * 76 | (_gt_boxes[:, 3] - _gt_boxes[:, 1] + 1.) - inters) 77 | 78 | overlaps = inters / uni 79 | ovmax = np.max(overlaps) 80 | jmax = np.argmax(overlaps) 81 | 82 | if ovmax > iou_thresh: 83 | if gt_boxes[im_name]['flag'][jmax] > 0: 84 | fp[i] = 1 85 | else: 86 | tp[i] = 1 87 | gt_boxes[im_name]['flag'][jmax] = 1 88 | else: 89 | fp[i] = 1 90 | 91 | fp = np.cumsum(fp) 92 | tp = np.cumsum(tp) 93 | recall = tp / float(npos) 94 | # avoid divide by zero in case the first detection matches a difficult 95 | # ground truth 96 | precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 97 | 98 | ap = calculate_ap(recall, precision) 99 | 100 | return ap 101 | 102 | 103 | def main(): 104 | 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument('-p', '--predict', type=str, default='', required=True) 107 | parser.add_argument('-g', '--gt', type=str, default='/public_datasets/SynthText/val.odgt') 108 | parser.add_argument('-t', '--thresh', type=float, default=0.5) 109 | 110 | args = parser.parse_args() 111 | 112 | ap = eval_ap(args.predict, args.gt, args.thresh) 113 | 114 | print("eval finished, ap={:.3f}".format(ap)) 115 | 116 | 117 | if __name__ == '__main__': 118 | 119 | main() 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /lib/bbox/bbox.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Bounding Box 4 | 5 | 6 | """ 7 | 8 | import torch 9 | import numpy as np 10 | from IPython import embed 11 | 12 | def np_xywh2xyxy(boxes): 13 | # [x1,y1,w,h] 14 | boxes = np.hstack( 15 | (boxes[:, 0:2], boxes[:, 0:2] + np.maximum(0, boxes[:, 2:4] - 1)) 16 | ) 17 | 18 | return boxes 19 | 20 | 21 | def clip_boxes(boxes, image_height, image_width): 22 | boxes[:, 0] = torch.clamp(boxes[:, 0], min=0, max=image_width-1) 23 | boxes[:, 1] = torch.clamp(boxes[:, 1], min=0, max=image_width-1) 24 | boxes[:, 2] = torch.clamp(boxes[:, 2], min=0, max=image_height-1) 25 | boxes[:, 3] = torch.clamp(boxes[:, 3], min=0, max=image_height-1) 26 | return boxes 27 | 28 | 29 | def filter_boxes(boxes): 30 | 31 | keep = [] 32 | for i in range(boxes.shape[0]): 33 | if boxes[i, 0] < boxes[i, 2] and boxes[i, 1] < boxes[i, 3]: 34 | keep.append(i) 35 | boxes = boxes[keep] 36 | return boxes 37 | 38 | 39 | def 
xywh2xyxy(boxes): 40 | """ xywh -> xyxy 41 | (xmin,ymin,xmax,ymax) and (xcenter,ycenter,width,height) 42 | Args: 43 | boxes: torch.FloatTensor[N,4] 44 | """ 45 | 46 | x1 = boxes[:, 0] - 0.5 * boxes[:, 2] 47 | y1 = boxes[:, 1] - 0.5 * boxes[:, 3] 48 | x2 = boxes[:, 0] + 0.5 * boxes[:, 2] 49 | y2 = boxes[:, 1] + 0.5 * boxes[:, 3] 50 | 51 | return torch.stack([x1, y1, x2, y2]).transpose(0, 1) 52 | 53 | 54 | def xyxy2xywh(boxes): 55 | """ xyxy -> xywh 56 | (xmin,ymin,xmax,ymax) and (xcenter,ycenter,width,height) 57 | Args: 58 | boxes: torch.FloatTensor[N,4] 59 | """ 60 | w = boxes[:, 2] - boxes[:, 0] + 1 61 | h = boxes[:, 3] - boxes[:, 1] + 1 62 | # center 63 | x = boxes[:, 0] + 0.5 * w 64 | y = boxes[:, 1] + 0.5 * h 65 | 66 | return torch.stack([x, y, w, h]).transpose(0, 1) 67 | 68 | 69 | def box_overlaps(box1, box2): 70 | """ Box IoU(Insertion over Union)(xmin,ymin,xmax,ymax) 71 | Args: 72 | box1: torch.FloatTensor[N, 4], 73 | box2: torch.FloatTensor[M, 4] 74 | mode: box representation format 75 | """ 76 | # N = box1.size()[0] 77 | # M = box2.size()[0] 78 | 79 | # NxMx2 80 | lo = torch.max(box1[:, None, :2], box2[:, :2]) 81 | hi = torch.min(box1[:, None, 2:], box2[:, 2:]) 82 | 83 | inner_rect = (hi - lo + 1).clamp(0) 84 | # NxMx1 85 | inner = inner_rect[:, :, 0] * inner_rect[:, :, 1] 86 | 87 | area1 = (box1[:, 2]-box1[:, 0]+1)*(box1[:, 3]-box1[:, 1]+1) 88 | area2 = (box2[:, 2]-box2[:, 0]+1)*(box2[:, 3]-box2[:, 1]+1) 89 | 90 | iou = inner / (area1[:, None] + area2 - inner) 91 | 92 | return iou 93 | 94 | 95 | def box_nms(boxes, scores, threshold): 96 | """Non maximum suppression. 97 | Args: 98 | boxes: (tensor) bounding boxes, sized [N,4]. 99 | scores: (tensor) bbox scores, sized [N,]. 100 | threshold: (float) overlap threshold. 101 | Returns: 102 | keep: (tensor) selected indices. 103 | Reference: 104 | https://github.com/rbgirshick/py-faster-rcnn/blob/master/lib/nms/py_cpu_nms.py 105 | """ 106 | x1 = boxes[:, 0] 107 | y1 = boxes[:, 1] 108 | x2 = boxes[:, 2] 109 | y2 = boxes[:, 3] 110 | 111 | areas = (x2-x1+1) * (y2-y1+1) 112 | _, order = scores.sort(0, descending=True) 113 | keep = [] 114 | while order.numel() > 0: 115 | i = order[0] 116 | keep.append(i) 117 | 118 | if order.numel() == 1: 119 | break 120 | 121 | xx1 = x1[order[1:]].clamp(min=x1[i].item()) 122 | yy1 = y1[order[1:]].clamp(min=y1[i].item()) 123 | xx2 = x2[order[1:]].clamp(max=x2[i].item()) 124 | yy2 = y2[order[1:]].clamp(max=y2[i].item()) 125 | 126 | w = (xx2-xx1+1).clamp(min=0) 127 | h = (yy2-yy1+1).clamp(min=0) 128 | inter = w*h 129 | 130 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 131 | 132 | ids = (ovr <= threshold).nonzero().squeeze() 133 | if ids.numel() == 0: 134 | break 135 | order = order[ids+1] 136 | return torch.LongTensor(keep) 137 | 138 | 139 | if __name__ == '__main__': 140 | # TODO: Test it! 
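    # minimal smoke test (a hedged sketch exercising the helpers above):
    # two heavily overlapping boxes and one disjoint box
    b = torch.Tensor([[0, 0, 9, 9], [1, 1, 10, 10], [50, 50, 60, 60]])
    scores = torch.Tensor([0.9, 0.8, 0.7])
    # IoU(b0, b1) = 81 / (100 + 100 - 81) ~ 0.68, so NMS at 0.5 keeps boxes 0 and 2
    print(box_overlaps(b, b))                  # diagonal entries are 1.0
    print(box_nms(b, scores, threshold=0.5))   # expected kept indices: 0 and 2
    # round trip: xyxy2xywh uses the +1 width convention while xywh2xyxy does not,
    # so x2/y2 come back larger by one pixel
    print(xywh2xyxy(xyxy2xywh(b)))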
141 | pass -------------------------------------------------------------------------------- /lib/nms/cpu_soft_nms.pyx: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------- 2 | # Soft-NMS: Improving Object Detection With One Line of Code 3 | # Copyright (c) University of Maryland, College Park 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Navaneeth Bodla and Bharat Singh 6 | # ---------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | 12 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 13 | return a if a >= b else b 14 | 15 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 16 | return a if a <= b else b 17 | 18 | def cpu_soft_nms( 19 | np.ndarray[float, ndim=2] boxes_in, 20 | float sigma=0.5, 21 | float Nt=0.3, 22 | float threshold=0.001, 23 | unsigned int method=0 24 | ): 25 | boxes = boxes_in.copy() 26 | cdef unsigned int N = boxes.shape[0] 27 | cdef float iw, ih, box_area 28 | cdef float ua 29 | cdef int pos = 0 30 | cdef float maxscore = 0 31 | cdef int maxpos = 0 32 | cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov 33 | inds = np.arange(N) 34 | 35 | for i in range(N): 36 | maxscore = boxes[i, 4] 37 | maxpos = i 38 | 39 | tx1 = boxes[i,0] 40 | ty1 = boxes[i,1] 41 | tx2 = boxes[i,2] 42 | ty2 = boxes[i,3] 43 | ts = boxes[i,4] 44 | ti = inds[i] 45 | 46 | pos = i + 1 47 | # get max box 48 | while pos < N: 49 | if maxscore < boxes[pos, 4]: 50 | maxscore = boxes[pos, 4] 51 | maxpos = pos 52 | pos = pos + 1 53 | 54 | # add max box as a detection 55 | boxes[i,0] = boxes[maxpos,0] 56 | boxes[i,1] = boxes[maxpos,1] 57 | boxes[i,2] = boxes[maxpos,2] 58 | boxes[i,3] = boxes[maxpos,3] 59 | boxes[i,4] = boxes[maxpos,4] 60 | inds[i] = inds[maxpos] 61 | 62 | # swap ith box with position of max box 63 | boxes[maxpos,0] = tx1 64 | boxes[maxpos,1] = ty1 65 | boxes[maxpos,2] = tx2 66 | boxes[maxpos,3] = ty2 67 | boxes[maxpos,4] = ts 68 | inds[maxpos] = ti 69 | 70 | tx1 = boxes[i,0] 71 | ty1 = boxes[i,1] 72 | tx2 = boxes[i,2] 73 | ty2 = boxes[i,3] 74 | ts = boxes[i,4] 75 | 76 | pos = i + 1 77 | # NMS iterations, note that N changes if detection boxes fall below 78 | # threshold 79 | while pos < N: 80 | x1 = boxes[pos, 0] 81 | y1 = boxes[pos, 1] 82 | x2 = boxes[pos, 2] 83 | y2 = boxes[pos, 3] 84 | s = boxes[pos, 4] 85 | 86 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 87 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 88 | if iw > 0: 89 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 90 | if ih > 0: 91 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 92 | ov = iw * ih / ua #iou between max box and detection box 93 | 94 | if method == 1: # linear 95 | if ov > Nt: 96 | weight = 1 - ov 97 | else: 98 | weight = 1 99 | elif method == 2: # gaussian 100 | weight = np.exp(-(ov * ov)/sigma) 101 | else: # original NMS 102 | if ov > Nt: 103 | weight = 0 104 | else: 105 | weight = 1 106 | 107 | boxes[pos, 4] = weight*boxes[pos, 4] 108 | 109 | # if box score falls below threshold, discard the box by 110 | # swapping with last box update N 111 | if boxes[pos, 4] < threshold: 112 | boxes[pos,0] = boxes[N-1, 0] 113 | boxes[pos,1] = boxes[N-1, 1] 114 | boxes[pos,2] = boxes[N-1, 2] 115 | boxes[pos,3] = boxes[N-1, 3] 116 | boxes[pos,4] = boxes[N-1, 4] 117 | inds[pos] = inds[N-1] 118 | N = N - 1 119 | pos = pos - 1 120 | 121 | pos = pos + 1 122 | 123 | return boxes[:N], inds[:N] 
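# Minimal usage sketch through the wrapper in lib/nms/nms_wrapper.py (assumes the
# extensions were built, e.g. via `make` in lib/nms):
#
#     import numpy as np
#     from lib.nms import soft_nms
#     dets = np.array([[0, 0, 10, 10, 0.9],
#                      [1, 1, 11, 11, 0.8]], dtype=np.float32)
#     # method: 0 = hard NMS, 1 = linear decay, 2 = Gaussian score decay
#     keep, new_dets = soft_nms(dets, Nt=0.3, method=2, sigma=0.5)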
-------------------------------------------------------------------------------- /models/resnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | Basemodel: ResNet 3 | 4 | """ 5 | 6 | import torch 7 | from torchvision.models import resnet 8 | import torch.nn as nn 9 | 10 | Bottleneck = resnet.Bottleneck 11 | 12 | 13 | class ResNet50Stages(nn.Module): 14 | 15 | def __init__(self, pretrained_path): 16 | super(ResNet50Stages, self).__init__() 17 | self.inplanes = 64 18 | self.stages = [3, 4, 6, 3] 19 | self.mid_outputs = [64, 128, 256, 512] 20 | 21 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 22 | bias=False) 23 | self.bn1 = nn.BatchNorm2d(64) 24 | self.relu = nn.ReLU(inplace=True) 25 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 26 | self.layer1 = self._make_layer(Bottleneck, self.mid_outputs[0], self.stages[0]) 27 | self.layer2 = self._make_layer(Bottleneck, self.mid_outputs[1], self.stages[1], stride=2) 28 | self.layer3 = self._make_layer(Bottleneck, self.mid_outputs[2], self.stages[2], stride=2) 29 | self.layer4 = self._make_layer(Bottleneck, self.mid_outputs[3], self.stages[3], stride=2) 30 | 31 | # self.load_state_dict(torch.load(pretrained_path)) 32 | self.load_pretrained(pretrained_path) 33 | 34 | def _make_layer(self, block, planes, blocks, stride=1): 35 | downsample = None 36 | if stride != 1 or self.inplanes != planes * block.expansion: 37 | downsample = nn.Sequential( 38 | nn.Conv2d(self.inplanes, planes * block.expansion, 39 | kernel_size=1, stride=stride, bias=False), 40 | nn.BatchNorm2d(planes * block.expansion), 41 | ) 42 | 43 | layers = [] 44 | layers.append(block(self.inplanes, planes, stride, downsample)) 45 | self.inplanes = planes * block.expansion 46 | for i in range(1, blocks): 47 | layers.append(block(self.inplanes, planes)) 48 | 49 | return nn.Sequential(*layers) 50 | 51 | def freeze_bn(self): 52 | pass 53 | 54 | def load_pretrained(self, mpath): 55 | 56 | pretrained_dict = torch.load(mpath) 57 | model_dict = self.state_dict() 58 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} 59 | self.load_state_dict(pretrained_dict) 60 | 61 | def forward(self, x): 62 | x = self.conv1(x) 63 | x = self.bn1(x) 64 | x = self.relu(x) 65 | x = self.maxpool(x) 66 | x1 = self.layer1(x) 67 | x2 = self.layer2(x1) 68 | x3 = self.layer3(x2) 69 | x4 = self.layer4(x3) 70 | return [x2, x3, x4] 71 | 72 | 73 | class ResNet50(nn.Module): 74 | 75 | def __init__(self, pretrained_path): 76 | super(ResNet50, self).__init__() 77 | self.layers = ResNet50Stages(pretrained_path) 78 | 79 | def forward(self, x): 80 | return self.layers(x)[-1] 81 | 82 | 83 | class ResNet50C4(nn.Module): 84 | 85 | def __init__(self, pretrained_path): 86 | super(ResNet50C4, self).__init__() 87 | self.inplanes = 64 88 | self.stages = [3, 4, 6] 89 | self.mid_outputs = [64, 128, 256, 512] 90 | 91 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 92 | bias=False) 93 | self.bn1 = nn.BatchNorm2d(64) 94 | self.relu = nn.ReLU(inplace=True) 95 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 96 | self.layer1 = self._make_layer(Bottleneck, self.mid_outputs[0], self.stages[0]) 97 | self.layer2 = self._make_layer(Bottleneck, self.mid_outputs[1], self.stages[1], stride=2) 98 | self.layer3 = self._make_layer(Bottleneck, self.mid_outputs[2], self.stages[2], stride=2) 99 | 100 | # self.load_state_dict(torch.load(pretrained_path)) 101 | self.load_pretrained(pretrained_path) 102 | 103 | def 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
"""

Training RetinaNet


"""
import os
import tqdm
import argparse
import numpy as np
import torch.nn as nn
import torch.optim as optim
import tensorboardX
from torch.utils.data import DataLoader
from models import retina
from datasets import synthtext, icdar15, minibatch
from lib.det_ops.loss import SigmoidFocalLoss, SmoothL1Loss
from utils import logger
from cfgs import config as cfg


def initialize(config, args):

    logdir = config['logdir']
    if not os.path.exists(logdir):
        os.mkdir(logdir)
    if not os.path.exists(os.path.join(logdir, args.experiment)):
        os.mkdir(os.path.join(logdir, args.experiment))

    model_dump_dir = os.path.join(logdir, args.experiment, 'model_dump')
    tb_dump = os.path.join(logdir, args.experiment, 'tb_dump')

    if not os.path.exists(model_dump_dir):
        os.mkdir(model_dump_dir)

    if not os.path.exists(tb_dump):
        os.mkdir(tb_dump)

    config['tb_dump_dir'] = tb_dump
    config['model_dump_dir'] = model_dump_dir


def learning_rate_decay(optimizer, step, config):
    # Step schedule: 0.1x after the first decay step, 0.01x after the second.
    base_lr = config['base_lr']
    lr = base_lr
    if step >= config['lr_decay'][0]:
        lr = base_lr * 0.1
    if step >= config['lr_decay'][1]:
        lr = base_lr * 0.01

    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def train(args, config):
    anchor_scales = config['anchor_sizes']
    anchor_aspect_ratios = config['anchor_aspect_ratios']
    num_anchors = len(anchor_scales) * len(anchor_aspect_ratios)

    model = retina.RetinaNet(config['num_classes'], num_anchors, config['basemodel_path']).cuda()
    model = nn.DataParallel(model, device_ids=list(range(args.device)))

    if args.dataset == 'SynthText':
        train_dataset = synthtext.SynthText(dataroot=config['data_dir'], imageset=args.imageset, config=config)
    elif args.dataset == 'ICDAR':
        train_dataset = icdar15.ICDAR15(dataroot=config['data_dir'], imageset=args.imageset, config=config)
    else:
        raise NotImplementedError()

    collate_minibatch = minibatch.create_minibatch_func(config)

    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=args.batch_size * args.device,
        shuffle=True,
        num_workers=config['workers'],
        collate_fn=collate_minibatch
    )

    writer = tensorboardX.SummaryWriter(config['tb_dump_dir'])

    optimizer = optim.SGD(lr=config['base_lr'], params=model.parameters(),
                          weight_decay=config['weight_decay'], momentum=0.9)

    cls_criterion = SigmoidFocalLoss().cuda()
    box_criterion = SmoothL1Loss().cuda()

    start_epoch = 0
    global_step = 0

    # Resume from a saved checkpoint
    if len(args.continue_path) > 0:
        model_state, optimizer_state, epoch, step = logger.load_checkpoints(args.continue_path)
        model.module.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
        global_step = step + 1
        start_epoch = epoch + 1

    for epoch in range(start_epoch, config['epochs']):
        losses = []
        data_iter = iter(train_loader)
        pbar = tqdm.tqdm(range(len(train_loader)))
        for i in pbar:
            img, labels, boxes = next(data_iter)
            img = img.cuda()
            labels = labels.long().cuda()
            boxes = boxes.cuda()
            cls_outputs, bbox_outputs = model(img)
            cls_loss = cls_criterion(cls_outputs, labels)
            box_loss = box_criterion(bbox_outputs, boxes, labels)
            loss = cls_loss + box_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            writer.add_scalar('train/box_loss', box_loss.item(), global_step)
            writer.add_scalar('train/cls_loss', cls_loss.item(), global_step)
            global_step += 1
            pbar.set_description('e:{} i:{} loss:{:.3f} cls_loss:{:.3f} box_loss:{:.3f}'.format(
                epoch, i + 1, loss.item(), cls_loss.item(), box_loss.item()
            ))
            losses.append(loss.item())

            # learning rate decay
            learning_rate_decay(optimizer, global_step, config)

        print("e:{} loss: {}".format(epoch, np.mean(losses)))
        logger.save_checkpoints(model.module, optimizer, epoch, global_step,
                                path=os.path.join(config['model_dump_dir'],
                                                  'epoch-{}-iter-{}.pth'.format(epoch, global_step)))


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--device', type=int, default=1, help='number of GPUs to train with')
    parser.add_argument('-b', '--batch_size', type=int, default=4, help='training batch size per GPU')
    parser.add_argument('-c', '--continue_path', type=str, default='', help='checkpoint path to continue training from')
    parser.add_argument('-e', '--experiment', type=str, default='synth_baseline',
                        help='experiment name, corresponding to an entry in `config.py`')
    parser.add_argument('-i', '--imageset', type=str, default='train', help='image set to train on')
    parser.add_argument('-ds', '--dataset', type=str, default='SynthText', help='dataset')

    _args = parser.parse_args()
    config = cfg.config[_args.experiment]
    initialize(config, _args)
    train(_args, config)
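`cfgs/voc.json` and `cfgs/config.py` are empty in this snapshot, so the exact experiment settings are unknown. Reconstructed from the keys that train.py and test.py actually read, a minimal `synth_baseline` entry might look like the sketch below; every value is illustrative, not the author's.

# cfgs/config.py -- hypothetical minimal entry; keys taken from the code,
# values are placeholders only.
config = {
    'synth_baseline': {
        'logdir': 'logs',
        'data_dir': 'data/SynthText',
        'basemodel_path': 'data/resnet50.pth',
        'workers': 4,
        'num_classes': 2,                     # text vs. background
        'anchor_sizes': [1.0, 1.26, 1.59],    # per-level scale multipliers
        'anchor_aspect_ratios': [0.5, 1.0, 2.0],
        'anchor_areas': [32 * 32, 64 * 64, 128 * 128, 256 * 256, 512 * 512],
        'strides': [8, 16, 32, 64, 128],
        'base_lr': 0.01,
        'lr_decay': [60000, 80000],           # steps for the 0.1x / 0.01x drops
        'weight_decay': 1e-4,
        'epochs': 20,
        'cls_thresh': 0.3,
        'test_nms': 0.3,
        'test_max_boxes': 300,
    }
}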
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
"""

Test scripts

"""
import argparse
import json
import tqdm
import torch
import numpy as np
from lib.det_ops.anchors import compute_anchor_whs, generate_anchors
from lib.bbox import bbox, box_transform
from lib.nms import nms
from utils.logger import load_checkpoints
from models import retina
from datasets import synthtext, icdar15
from cfgs import config as cfg


def inference(model, dataset, anchor_wh, strides, result_file, config):

    model.eval()
    num_samples = len(dataset)
    pbar = tqdm.tqdm(range(num_samples))
    with torch.no_grad():
        for idx in pbar:
            img, im_name, scale, im_size = dataset[idx]
            h, w = img.shape[1], img.shape[2]
            img = img.cuda()
            cls_pred, bbox_pred = model(img.unsqueeze(0))
            scores = cls_pred.sigmoid()
            # bbox [N, 4]
            bbox_pred = bbox_pred[0]
            # cls [N, C]
            scores = scores[0]

            anchors = generate_anchors(anchor_wh, input_size=np.array([h, w]),
                                       strides=strides)
            anchors = anchors.cuda()

            # transform regression outputs back to boxes in the original image
            boxes = box_transform.bbox_transform_inv(anchors, bbox_pred)
            boxes = boxes / scale
            boxes = bbox.clip_boxes(boxes, im_size[0], im_size[1])

            # keep only well-formed boxes (x1 < x2 and y1 < y2)
            valid_inds = (boxes[:, 0] < boxes[:, 2]) & (boxes[:, 1] < boxes[:, 3])
            boxes = boxes[valid_inds]
            scores = scores[valid_inds]

            result_boxes = []
            # for every class:
            # 1. keep anchors whose max detection score falls on this class
            # 2. apply the score threshold
            # 3. run NMS
            # 4. keep the top-k boxes overall
            max_labels = torch.argmax(scores, dim=1)

            for cls in range(config['num_classes'] - 1):

                # filter predictions through the classification threshold
                score = scores[:, cls]
                cls_inds = score > config['cls_thresh']
                # current class has the max score over all classes
                max_inds = max_labels == cls
                cls_inds = max_inds & cls_inds
                if cls_inds.sum() < 1:
                    continue
                # score [K]
                score = score[cls_inds]

                # _boxes [K, 4]
                _boxes = boxes[cls_inds]

                # NMS removes duplicates
                keep = nms(torch.cat([_boxes, score.unsqueeze(1)], 1), config['test_nms'])

                score = score[keep]
                _boxes = _boxes[keep]

                for i in range(_boxes.shape[0]):
                    result_boxes.append((cls, score[i].item(), _boxes[i].cpu().data.numpy().tolist()))

            # keep at most `test_max_boxes` detections per image
            if len(result_boxes) > config['test_max_boxes']:
                result_boxes = sorted(result_boxes, key=lambda x: x[1], reverse=True)
                result_boxes = result_boxes[:config['test_max_boxes']]
            pbar.set_description('im_det:{}/{}'.format(idx, num_samples))

            if len(result_boxes) == 0:
                continue

            result = dict()
            result['image_id'] = im_name
            det = []
            for i in range(len(result_boxes)):
                cls, s, b = result_boxes[i]
                current_det = dict()
                current_det['prob'] = s
                current_det['class'] = cls + 1
                current_det['bbox'] = b
                det.append(current_det)
            result['result'] = det

            with open(result_file, 'a+') as f:
                f.write('{}\n'.format(json.dumps(result)))

    print("Det Finished!")


def validate(args, config):

    anchor_scales = config['anchor_sizes']
    anchor_aspect_ratios = config['anchor_aspect_ratios']
    num_anchors = len(anchor_scales) * len(anchor_aspect_ratios)

    model = retina.RetinaNet(config['num_classes'] - 1, num_anchors, config['basemodel_path']).cuda()

    model_path = args.model_path
    output_file = args.output
    if args.dataset == 'SynthText':
        dataset = synthtext.SynthText(dataroot=config['data_dir'], imageset=args.imageset, config=config)
    elif args.dataset == 'ICDAR':
        dataset = icdar15.ICDAR15(dataroot=config['data_dir'], imageset=args.imageset, config=config)
    else:
        raise NotImplementedError()
    state_dict, _, _, _ = load_checkpoints(model_path)
    model.load_state_dict(state_dict)

    anchor_whs = compute_anchor_whs(len(config['strides']), areas=config['anchor_areas'],
                                    aspect_ratios=anchor_aspect_ratios,
                                    sizes=anchor_scales)

    inference(model, dataset, anchor_whs, config['strides'], result_file=output_file, config=config)


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('-o', '--output', type=str, default='result.det', help='output file path')
    parser.add_argument('-m', '--model_path', type=str, help='saved model path')
    parser.add_argument('-i', '--imageset', type=str, default='val', help='image set to evaluate')
    parser.add_argument('-e', '--experiment', type=str, default='synth_baseline',
                        help='experiment name, corresponding to an entry in `config.py`')
    parser.add_argument('-ds', '--dataset', type=str, default='SynthText', help='dataset')
    _args = parser.parse_args()
    config = cfg.config[_args.experiment]
    validate(_args, config)
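`inference()` appends one JSON object per line to the result file, so downstream evaluation only needs a line-by-line reader. A hypothetical sketch (the `read_results` helper is not part of the repository):

# Reads the detection file written above: one JSON object per line,
# each with an 'image_id' and a list of {prob, class, bbox} records.
import json

def read_results(path):
    results = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            results[record['image_id']] = record['result']
    return results

# e.g. dets = read_results('result.det')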
--------------------------------------------------------------------------------
/lib/nms/nms_kernel.cu:
--------------------------------------------------------------------------------
// ------------------------------------------------------------------
// Faster R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Shaoqing Ren
// ------------------------------------------------------------------

#include <iostream>
#include <vector>
#include <cstring>
#include "gpu_nms.hpp"

#define CUDA_CHECK(condition)                                    \
  /* Code block avoids redefinition of cudaError_t error */      \
  do {                                                           \
    cudaError_t error = condition;                               \
    if (error != cudaSuccess) {                                  \
      std::cout << cudaGetErrorString(error) << std::endl;       \
    }                                                            \
  } while (0)

#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define MULTIPLIER 16
#define LONGLONG_SIZE 64

int const threadsPerBlock =
    sizeof(unsigned long long) * 8 *
    MULTIPLIER;  // number of bits of a long long variable, times MULTIPLIER

__device__ inline float devIoU(float const* const a, float const* const b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f),
        height = max(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}

__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float* dev_boxes,
                           unsigned long long* dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size =
      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  __shared__ float block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  unsigned long long ts[MULTIPLIER];

  if (threadIdx.x < row_size) {
#pragma unroll
    for (int i = 0; i < MULTIPLIER; ++i) {
      ts[i] = 0;
    }
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const float* cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    int start = 0;
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        ts[i / LONGLONG_SIZE] |= 1ULL << (i % LONGLONG_SIZE);
      }
    }
    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);

#pragma unroll
    for (int i = 0; i < MULTIPLIER; ++i) {
      dev_mask[(cur_box_idx * col_blocks + col_start) * MULTIPLIER + i] =
          ts[i];
    }
  }
}

void _set_device(int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
}

const size_t MEMORY_SIZE = 500000000;
size_t nms_Malloc() {
  float* boxes_dev = NULL;
  CUDA_CHECK(cudaMalloc(&boxes_dev, MEMORY_SIZE));
  return size_t(boxes_dev);
}

void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
          int boxes_dim, float nms_overlap_thresh, int device_id, size_t base) {
  _set_device(device_id);

  float* boxes_dev = NULL;
  unsigned long long* mask_dev = NULL;

  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);

  if (base > 0) {
    size_t require_mem =
        boxes_num * boxes_dim * sizeof(float) +
        boxes_num * col_blocks * sizeof(unsigned long long) * MULTIPLIER;
    if (require_mem >= MEMORY_SIZE) {
      std::cout << "require_mem: " << require_mem << std::endl;
    }
    boxes_dev = (float*)(base);
    mask_dev =
        (unsigned long long*)(base +
                              512 * ((unsigned long long)(boxes_num *
                                                          boxes_dim *
                                                          sizeof(float) /
                                                          512) +
                                     1));
  } else {
    CUDA_CHECK(
        cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&mask_dev, MULTIPLIER * boxes_num * col_blocks *
                                         sizeof(unsigned long long)));
  }
  CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host,
                        boxes_num * boxes_dim * sizeof(float),
                        cudaMemcpyHostToDevice));

  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
              DIVUP(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev,
                                  mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks *
                                            MULTIPLIER);
  CUDA_CHECK(cudaMemcpy(
      &mask_host[0], mask_dev,
      sizeof(unsigned long long) * boxes_num * col_blocks * MULTIPLIER,
      cudaMemcpyDeviceToHost));

  std::vector<unsigned long long> remv(col_blocks * MULTIPLIER);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks * MULTIPLIER);

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;
    int offset = inblock / LONGLONG_SIZE;
    int bit_pos = inblock % LONGLONG_SIZE;

    if (!(remv[nblock * MULTIPLIER + offset] & (1ULL << bit_pos))) {
      keep_out[num_to_keep++] = i;
      unsigned long long* p = &mask_host[0] + i * col_blocks * MULTIPLIER;
      for (int j = nblock * MULTIPLIER + offset;
           j < col_blocks * MULTIPLIER; j++) {
        remv[j] |= p[j];
      }
    }
  }
  *num_out = num_to_keep;

  if (!base) {
    CUDA_CHECK(cudaFree(boxes_dev));
    CUDA_CHECK(cudaFree(mask_dev));
  }
}
--------------------------------------------------------------------------------
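The kernel computes, for each box, a bitmask of the later-indexed boxes it suppresses; the host loop then walks boxes in order and ORs those masks together, so suppression matches greedy NMS when the rows are sorted by descending score. From Python the compiled extension is reached through the `nms` wrapper, with the same [N, 5] (x1, y1, x2, y2, score) layout test.py passes in. A hypothetical smoke test follows (build the extension with `make` in lib/nms first; the tensors are moved to the GPU to mirror test.py's usage):

import torch
from lib.nms import nms

# rows are (x1, y1, x2, y2, score), already sorted by descending score
dets = torch.tensor([
    [10.0, 10.0, 50.0, 50.0, 0.9],
    [12.0, 12.0, 52.0, 52.0, 0.8],       # IoU ~0.83 with the first row
    [100.0, 100.0, 150.0, 150.0, 0.7],   # no overlap with the others
]).cuda()

keep = nms(dets, 0.5)
print(keep)  # expected to keep rows 0 and 2 and suppress row 1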