├── data ├── result │ └── .keep ├── ICDAR_submit │ └── .keep ├── pretrained_model │ └── .keep ├── demo │ └── 007.jpg └── cfgs │ ├── mobile.yml │ ├── res50.yml │ ├── squeeze.yml │ ├── vgg16.yml │ └── res101.yml ├── lib ├── nms │ ├── .gitignore │ ├── __init__.py │ ├── gpu_nms.hpp │ ├── py_cpu_nms.py │ ├── gpu_nms.pyx │ ├── cpu_nms.pyx │ └── nms_kernel.cu ├── nets │ ├── __init__.py │ ├── mobilenet │ │ ├── __init__.py │ │ ├── mobilenet_v2.py │ │ └── conv_blocks.py │ ├── vgg16.py │ ├── mobilenet_v2.py │ ├── squeezenet.py │ └── resnet_v1.py ├── layer_utils │ ├── __init__.py │ ├── proposal_top_layer.py │ ├── proposal_layer.py │ ├── generate_anchors.py │ └── anchor_target_layer.py ├── model │ ├── __init__.py │ ├── nms_wrapper.py │ ├── bbox_transform.py │ ├── test.py │ └── config.py ├── utils │ ├── .gitignore │ ├── common.py │ ├── helper.py │ ├── __init__.py │ ├── timer.py │ ├── blob.py │ ├── bbox.pyx │ └── visualization.py ├── text_connector │ ├── __init__.py │ ├── text_connect_cfg.py │ ├── other.py │ ├── detectors.py │ ├── text_proposal_connector.py │ ├── text_proposal_graph_builder.py │ └── text_proposal_connector_oriented.py ├── Makefile ├── datasets │ ├── __init__.py │ ├── factory.py │ ├── ds_utils.py │ ├── voc_eval.py │ ├── imdb.py │ └── pascal_voc.py ├── roi_data_layer │ ├── __init__.py │ ├── roidb.py │ ├── minibatch.py │ └── layer.py ├── setup_cpu_win.py ├── setup_cpu.py └── setup.py ├── tools ├── ICDAR15 │ ├── __init__.py │ └── readme.txt ├── _init_paths.py ├── ICDAR13 │ └── readme.txt ├── ICDAR13_Det │ └── readme.txt ├── anchor_drawer.py ├── convert_utils.py ├── freeze_graph.py ├── icdar13_split_label.py ├── icdar13_to_voc.py ├── trainval_net.py ├── icdar.py └── demo.py ├── requirements.txt ├── .gitignore ├── main.py ├── LICENSE └── README.md /data/result/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/nms/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/ICDAR_submit/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/nets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/nms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/pretrained_model/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/layer_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/ICDAR15/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/nets/mobilenet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/__init__.py: 
-------------------------------------------------------------------------------- 1 | from . import config 2 | -------------------------------------------------------------------------------- /lib/utils/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp 3 | *.h 4 | *.hpp 5 | -------------------------------------------------------------------------------- /data/demo/007.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sanster/tf_ctpn/HEAD/data/demo/007.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython 2 | numpy 3 | opencv-python 4 | tensorflow-gpu 5 | easydict 6 | -------------------------------------------------------------------------------- /lib/text_connector/__init__.py: -------------------------------------------------------------------------------- 1 | from .detectors import TextDetector 2 | from .text_connect_cfg import Config 3 | -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python3 setup.py build_ext --inplace 3 | rm -rf build 4 | clean: 5 | rm -rf */*.pyc 6 | rm -rf */*.so 7 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /lib/utils/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def check_dir(dir_path): 4 | """ 5 | create dir if dir not exist 6 | """ 7 | if not os.path.exists(dir_path): 8 | os.makedirs(dir_path) 9 | -------------------------------------------------------------------------------- /lib/utils/helper.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | 4 | def read_rgb_img(img_file_path): 5 | bgr = cv2.imread(img_file_path) 6 | rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB) 7 | return rgb 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | *.pyc 3 | output 4 | tensorboard 5 | lib/build 6 | .idea 7 | data/demo/* 8 | data/VOCdevkit2007 9 | data/cache 10 | data/pretrained_model/*.ckpt 11 | data/result/* 12 | !data/result/.keep 13 | submit.zip 14 | *.zip 15 | model/ 16 | -------------------------------------------------------------------------------- /lib/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # 
-------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/roi_data_layer/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /tools/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import sys 3 | 4 | def add_path(path): 5 | if path not in sys.path: 6 | sys.path.insert(0, path) 7 | 8 | this_dir = osp.dirname(__file__) 9 | 10 | # Add lib to PYTHONPATH 11 | lib_path = osp.join(this_dir, '..', 'lib') 12 | add_path(lib_path) 13 | -------------------------------------------------------------------------------- /data/cfgs/mobile.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: mobile 2 | TRAIN: 3 | IMS_PER_BATCH: 1 4 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 5 | RPN_POSITIVE_OVERLAP: 0.7 6 | RPN_BATCHSIZE: 256 7 | PROPOSAL_METHOD: gt 8 | BG_THRESH_LO: 0.0 9 | DISPLAY: 20 10 | DOUBLE_BIAS: False 11 | SNAPSHOT_PREFIX: mobile_ctpn 12 | LEARNING_RATE: 0.00001 13 | -------------------------------------------------------------------------------- /data/cfgs/res50.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: res50 2 | TRAIN: 3 | IMS_PER_BATCH: 1 4 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 5 | RPN_POSITIVE_OVERLAP: 0.7 6 | RPN_BATCHSIZE: 256 7 | PROPOSAL_METHOD: gt 8 | BG_THRESH_LO: 0.0 9 | DISPLAY: 20 10 | DOUBLE_BIAS: False 11 | SNAPSHOT_PREFIX: res50_ctpn 12 | LEARNING_RATE: 0.0001 13 | -------------------------------------------------------------------------------- /data/cfgs/squeeze.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: squeeze 2 | TRAIN: 3 | IMS_PER_BATCH: 1 4 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 5 | RPN_POSITIVE_OVERLAP: 0.7 6 | RPN_BATCHSIZE: 256 7 | PROPOSAL_METHOD: gt 8 | BG_THRESH_LO: 0.0 9 | DISPLAY: 20 10 | DOUBLE_BIAS: False 11 | SNAPSHOT_PREFIX: squeeze_ctpn 12 | LEARNING_RATE: 0.001 13 | -------------------------------------------------------------------------------- /data/cfgs/vgg16.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: vgg16 2 | TRAIN: 3 | USE_FLIPPED: False 4 | IMS_PER_BATCH: 1 5 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 6 | RPN_POSITIVE_OVERLAP: 0.7 7 | RPN_BATCHSIZE: 128 8 | PROPOSAL_METHOD: gt 9 | BG_THRESH_LO: 0.0 10 | DISPLAY: 20 11 | DOUBLE_BIAS: False 12 | SNAPSHOT_PREFIX: vgg16_ctpn 13 | LEARNING_RATE: 0.00001 14 | -------------------------------------------------------------------------------- /data/cfgs/res101.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: res101 2 | TRAIN: 3 | USE_FLIPPED: False 4 | LEARNING_RATE: 0.00001 5 | IMS_PER_BATCH: 1 6 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 7 | RPN_POSITIVE_OVERLAP: 0.7 8 | RPN_BATCHSIZE: 
128 9 | PROPOSAL_METHOD: gt 10 | BG_THRESH_LO: 0.0 11 | DISPLAY: 20 12 | DOUBLE_BIAS: False 13 | SNAPSHOT_PREFIX: res101_ctpn 14 | -------------------------------------------------------------------------------- /lib/text_connector/text_connect_cfg.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | SCALE = 600 3 | MAX_SCALE = 1200 4 | TEXT_PROPOSALS_WIDTH = 16 5 | MIN_NUM_PROPOSALS = 2 6 | MIN_RATIO = 0.5 7 | LINE_MIN_SCORE = 0.9 8 | MAX_HORIZONTAL_GAP = 50 9 | TEXT_PROPOSALS_MIN_SCORE = 0.7 10 | TEXT_PROPOSALS_NMS_THRESH = 0.2 11 | MIN_V_OVERLAPS = 0.7 12 | MIN_SIZE_SIM = 0.7 13 | -------------------------------------------------------------------------------- /lib/model/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | from model.config import cfg 12 | 13 | 14 | def nms(dets, thresh, force_cpu=False): 15 | """Dispatch to either CPU or GPU NMS implementations.""" 16 | 17 | if dets.shape[0] == 0: 18 | return [] 19 | if cfg.USE_GPU_NMS and not force_cpu: 20 | from nms.gpu_nms import gpu_nms 21 | return gpu_nms(dets, thresh, device_id=0) 22 | else: 23 | from nms.cpu_nms import cpu_nms 24 | return cpu_nms(dets, thresh) 25 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # checkpoint_dir = "/home/cwq/data/model/slim/mobilenetv2" 4 | checkpoint_dir = "/home/cwq/data/checkpoint/tf_crnn/simple_no_lstm_more_bg" 5 | print("Restoring checkpoint from: " + checkpoint_dir) 6 | 7 | ckpt = tf.train.latest_checkpoint(checkpoint_dir) 8 | if ckpt is None: 9 | print("Checkpoint not found") 10 | exit(-1) 11 | 12 | meta_file = ckpt + '.meta' 13 | 14 | print('Restore variables from {}'.format(ckpt)) 15 | print('Restore meta_file from {}'.format(meta_file)) 16 | 17 | config = tf.ConfigProto(allow_soft_placement=True) 18 | with tf.Session(config=config) as sess: 19 | saver = tf.train.import_meta_graph(meta_file) 20 | saver.restore(sess, ckpt) 21 | 22 | input_graph_def = tf.get_default_graph().as_graph_def() 23 | 24 | # Print all node name in graph 25 | for node in input_graph_def.node: 26 | print(node.name) 27 | -------------------------------------------------------------------------------- /lib/text_connector/other.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def threshold(coords, min_, max_): 5 | return np.maximum(np.minimum(coords, max_), min_) 6 | 7 | def clip_boxes(boxes, im_shape): 8 | """ 9 | Clip boxes to image boundaries. 
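    Here im_shape is (height, width): even-indexed columns (x coordinates) are clamped to [0, width - 1] and odd-indexed columns (y coordinates) to [0, height - 1]; boxes is modified in place and returned.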
10 | """ 11 | boxes[:, 0::2]=threshold(boxes[:, 0::2], 0, im_shape[1]-1) 12 | boxes[:, 1::2]=threshold(boxes[:, 1::2], 0, im_shape[0]-1) 13 | return boxes 14 | 15 | 16 | class Graph: 17 | def __init__(self, graph): 18 | self.graph=graph 19 | 20 | def sub_graphs_connected(self): 21 | sub_graphs=[] 22 | for index in range(self.graph.shape[0]): 23 | if not self.graph[:, index].any() and self.graph[index, :].any(): 24 | v=index 25 | sub_graphs.append([v]) 26 | while self.graph[v, :].any(): 27 | v=np.where(self.graph[v, :])[0][0] 28 | sub_graphs[-1].append(v) 29 | return sub_graphs 30 | 31 | -------------------------------------------------------------------------------- /lib/utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | class Timer(object): 11 | """A simple timer.""" 12 | def __init__(self): 13 | self.total_time = 0. 14 | self.calls = 0 15 | self.start_time = 0. 16 | self.diff = 0. 17 | self.average_time = 0. 18 | 19 | def tic(self): 20 | # using time.time instead of time.clock because time time.clock 21 | # does not normalize for multithreading 22 | self.start_time = time.time() 23 | 24 | def toc(self, average=True): 25 | self.diff = time.time() - self.start_time 26 | self.total_time += self.diff 27 | self.calls += 1 28 | self.average_time = self.total_time / self.calls 29 | if average: 30 | return self.average_time 31 | else: 32 | return self.diff 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Xinlei Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lib/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /lib/datasets/factory.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Factory method for easily getting imdbs by name.""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | __sets = {} 14 | from datasets.pascal_voc import pascal_voc 15 | 16 | # Set up voc__ 17 | for year in ['2007', '2012']: 18 | for split in ['train', 'val', 'trainval', 'test']: 19 | name = 'voc_{}_{}'.format(year, split) 20 | __sets[name] = (lambda split=split, 
year=year: pascal_voc(split, year)) 21 | 22 | for year in ['2007', '2012']: 23 | for split in ['train', 'val', 'trainval', 'test']: 24 | name = 'voc_{}_{}_diff'.format(year, split) 25 | __sets[name] = (lambda split=split, year=year: pascal_voc(split, year, use_diff=True)) 26 | 27 | 28 | def get_imdb(name): 29 | """Get an imdb (image database) by name.""" 30 | if name not in __sets: 31 | raise KeyError('Unknown dataset: {}'.format(name)) 32 | return __sets[name]() 33 | 34 | 35 | def list_imdbs(): 36 | """List all registered imdbs.""" 37 | return list(__sets.keys()) 38 | -------------------------------------------------------------------------------- /tools/ICDAR13/readme.txt: -------------------------------------------------------------------------------- 1 | INSTRUCTIONS FOR THE STANDALONE SCRIPTS 2 | Requirements: 3 | - Python version 2.7. 4 | - Each Task requires different Python modules. When running the script, if some module is not installed you will see a notification and installation instructions. 5 | 6 | Procedure: 7 | Download the ZIP file for the requested script and unzip it to a directory. 8 | 9 | Open a terminal in the directory and run the command: 10 | python script.py –g=gt.zip –s=submit.zip 11 | 12 | If you have already installed all the required modules, then you will see the method’s results or an error message if the submitted file is not correct. 13 | 14 | parameters: 15 | -g: Path of the Ground Truth file. In most cases, the Ground Truth will be included in the same Zip file named 'gt.zip', gt.txt' or 'gt.json'. If not, you will be able to get it on the Downloads page of the Task. 16 | -s: Path of your method's results file. 17 | 18 | Optional parameters: 19 | -o: Path to a directory where to copy the file ‘results.zip’ that contains per-sample results. 20 | -p: JSON string parameters to override the script default parameters. The parameters that can be overrided are inside the function 'default_evaluation_params' located at the begining of the evaluation Script. 21 | 22 | Example: python script.py –g=gt.zip –s=submit.zip –o=./ -p='{" IOU_CONSTRAINT" = 0.8}' -------------------------------------------------------------------------------- /tools/ICDAR13_Det/readme.txt: -------------------------------------------------------------------------------- 1 | INSTRUCTIONS FOR THE STANDALONE SCRIPTS 2 | Requirements: 3 | - Python version 2.7. 4 | - Each Task requires different Python modules. When running the script, if some module is not installed you will see a notification and installation instructions. 5 | 6 | Procedure: 7 | Download the ZIP file for the requested script and unzip it to a directory. 8 | 9 | Open a terminal in the directory and run the command: 10 | python script.py –g=gt.zip –s=submit.zip 11 | 12 | If you have already installed all the required modules, then you will see the method’s results or an error message if the submitted file is not correct. 13 | 14 | parameters: 15 | -g: Path of the Ground Truth file. In most cases, the Ground Truth will be included in the same Zip file named 'gt.zip', gt.txt' or 'gt.json'. If not, you will be able to get it on the Downloads page of the Task. 16 | -s: Path of your method's results file. 17 | 18 | Optional parameters: 19 | -o: Path to a directory where to copy the file ‘results.zip’ that contains per-sample results. 20 | -p: JSON string parameters to override the script default parameters. 
The parameters that can be overrided are inside the function 'default_evaluation_params' located at the begining of the evaluation Script. 21 | 22 | Example: python script.py –g=gt.zip –s=submit.zip –o=./ -p='{" IOU_CONSTRAINT" = 0.8}' -------------------------------------------------------------------------------- /tools/ICDAR15/readme.txt: -------------------------------------------------------------------------------- 1 | INSTRUCTIONS FOR THE STANDALONE SCRIPTS 2 | Requirements: 3 | - Python version 2.7. 4 | - Each Task requires different Python modules. When running the script, if some module is not installed you will see a notification and installation instructions. 5 | 6 | Procedure: 7 | Download the ZIP file for the requested script and unzip it to a directory. 8 | 9 | Open a terminal in the directory and run the command: 10 | python script.py –g=gt.zip –s=submit.zip 11 | 12 | If you have already installed all the required modules, then you will see the method’s results or an error message if the submitted file is not correct. 13 | 14 | parameters: 15 | -g: Path of the Ground Truth file. In most cases, the Ground Truth will be included in the same Zip file named 'gt.zip', gt.txt' or 'gt.json'. If not, you will be able to get it on the Downloads page of the Task. 16 | -s: Path of your method's results file. 17 | 18 | Optional parameters: 19 | -o: Path to a directory where to copy the file ‘results.zip’ that contains per-sample results. 20 | -p: JSON string parameters to override the script default parameters. The parameters that can be overrided are inside the function 'default_evaluation_params' located at the begining of the evaluation Script. 21 | 22 | Example: python script.py –g=gt.zip –s=submit.zip –o=./ -p='{" IOU_CONSTRAINT" = 0.8}' -------------------------------------------------------------------------------- /lib/setup_cpu_win.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join as pjoin 3 | import numpy as np 4 | from distutils.core import setup 5 | from distutils.extension import Extension 6 | from Cython.Distutils import build_ext 7 | 8 | def find_in_path(name, path): 9 | "Find a file in a search path" 10 | #adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 11 | for dir in path.split(os.pathsep): 12 | binpath = pjoin(dir, name) 13 | if os.path.exists(binpath): 14 | return os.path.abspath(binpath) 15 | return None 16 | 17 | # Obtain the numpy include directory. This logic works across numpy versions. 
18 | try: 19 | numpy_include = np.get_include() 20 | except AttributeError: 21 | numpy_include = np.get_numpy_include() 22 | 23 | class custom_build_ext(build_ext): 24 | def build_extensions(self): 25 | build_ext.build_extensions(self) 26 | 27 | ext_modules = [ 28 | Extension( 29 | "utils.cython_bbox", 30 | ["utils/bbox.pyx"], 31 | include_dirs = [numpy_include] 32 | ), 33 | Extension( 34 | "nms.cpu_nms", 35 | ["nms/cpu_nms.pyx"], 36 | include_dirs = [numpy_include] 37 | ) 38 | ] 39 | 40 | setup( 41 | name='tf_faster_rcnn', 42 | ext_modules=ext_modules, 43 | # inject our custom trigger 44 | cmdclass={'build_ext': custom_build_ext}, 45 | ) 46 | -------------------------------------------------------------------------------- /lib/datasets/ds_utils.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Ross Girshick 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | 12 | 13 | def unique_boxes(boxes, scale=1.0): 14 | """Return indices of unique boxes.""" 15 | v = np.array([1, 1e3, 1e6, 1e9]) 16 | hashes = np.round(boxes * scale).dot(v) 17 | _, index = np.unique(hashes, return_index=True) 18 | return np.sort(index) 19 | 20 | 21 | def xywh_to_xyxy(boxes): 22 | """Convert [x y w h] box format to [x1 y1 x2 y2] format.""" 23 | return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1)) 24 | 25 | 26 | def xyxy_to_xywh(boxes): 27 | """Convert [x1 y1 x2 y2] box format to [x y w h] format.""" 28 | return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1)) 29 | 30 | 31 | def validate_boxes(boxes, width=0, height=0): 32 | """Check that a set of boxes are valid.""" 33 | x1 = boxes[:, 0] 34 | y1 = boxes[:, 1] 35 | x2 = boxes[:, 2] 36 | y2 = boxes[:, 3] 37 | assert (x1 >= 0).all() 38 | assert (y1 >= 0).all() 39 | assert (x2 >= x1).all() 40 | assert (y2 >= y1).all() 41 | assert (x2 < width).all() 42 | assert (y2 < height).all() 43 | 44 | 45 | def filter_small_boxes(boxes, min_size): 46 | w = boxes[:, 2] - boxes[:, 0] 47 | h = boxes[:, 3] - boxes[:, 1] 48 | keep = np.where((w >= min_size) & (h > min_size))[0] 49 | return keep 50 | -------------------------------------------------------------------------------- /lib/utils/blob.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Blob helper functions.""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import numpy as np 14 | import cv2 15 | 16 | 17 | def im_list_to_blob(ims): 18 | """Convert a list of images into a network input. 19 | 20 | Assumes images are already prepared (means subtracted, BGR order, ...). 
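    The returned blob has shape (num_images, max_height, max_width, 3), where the max is taken over the input list; smaller images are zero-padded at the bottom and right. For example, inputs of shapes (300, 400, 3) and (320, 380, 3) produce a blob of shape (2, 320, 400, 3).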
21 | """ 22 | max_shape = np.array([im.shape for im in ims]).max(axis=0) 23 | num_images = len(ims) 24 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 25 | dtype=np.float32) 26 | for i in range(num_images): 27 | im = ims[i] 28 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 29 | 30 | return blob 31 | 32 | 33 | def prep_im_for_blob(im, pixel_means, target_size, max_size): 34 | """Mean subtract and scale an image for use in a blob.""" 35 | im = im.astype(np.float32, copy=False) 36 | im -= pixel_means 37 | im_shape = im.shape 38 | im_size_min = np.min(im_shape[0:2]) 39 | im_size_max = np.max(im_shape[0:2]) 40 | im_scale = float(target_size) / float(im_size_min) 41 | # Prevent the biggest axis from being more than MAX_SIZE 42 | if np.round(im_scale * im_size_max) > max_size: 43 | im_scale = float(max_size) / float(im_size_max) 44 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 45 | interpolation=cv2.INTER_LINEAR) 46 | 47 | return im, im_scale 48 | -------------------------------------------------------------------------------- /lib/utils/bbox.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps 56 | 57 | -------------------------------------------------------------------------------- /lib/layer_utils/proposal_top_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from model.config import cfg 12 | from model.bbox_transform import bbox_transform_inv, clip_boxes 13 | import numpy.random as npr 14 | 15 | 16 | def proposal_top_layer(rpn_cls_prob, 
rpn_bbox_pred, im_info, anchors, num_anchors): 17 | """A layer that just selects the top region proposals 18 | without using non-maximal suppression, 19 | For details please see the technical report 20 | """ 21 | rpn_top_n = cfg.TEST.RPN_TOP_N 22 | 23 | scores = rpn_cls_prob[:, :, :, num_anchors:] 24 | 25 | rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4)) 26 | scores = scores.reshape((-1, 1)) 27 | 28 | length = scores.shape[0] 29 | if length < rpn_top_n: 30 | # Random selection, maybe unnecessary and loses good proposals 31 | # But such case rarely happens 32 | top_inds = npr.choice(length, size=rpn_top_n, replace=True) 33 | else: 34 | top_inds = scores.argsort(0)[::-1] 35 | top_inds = top_inds[:rpn_top_n] 36 | top_inds = top_inds.reshape(rpn_top_n, ) 37 | 38 | # Do the selection here 39 | anchors = anchors[top_inds, :] 40 | rpn_bbox_pred = rpn_bbox_pred[top_inds, :] 41 | scores = scores[top_inds] 42 | 43 | # Convert anchors into proposals via bbox transformations 44 | proposals = bbox_transform_inv(anchors, rpn_bbox_pred) 45 | 46 | # Clip predicted boxes to image 47 | proposals = clip_boxes(proposals, im_info[:2]) 48 | 49 | # Output rois blob 50 | # Our RPN implementation only supports a single input image, so all 51 | # batch inds are 0 52 | batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) 53 | blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) 54 | return blob, scores 55 | -------------------------------------------------------------------------------- /lib/roi_data_layer/roidb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Transform a roidb into a trainable roidb by adding a bunch of metadata.""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import numpy as np 14 | from model.config import cfg 15 | from model.bbox_transform import bbox_transform 16 | from utils.cython_bbox import bbox_overlaps 17 | import PIL 18 | 19 | 20 | def prepare_roidb(imdb): 21 | """Enrich the imdb's roidb by adding some derived quantities that 22 | are useful for training. This function precomputes the maximum 23 | overlap, taken over ground-truth boxes, between each ROI and 24 | each ground-truth box. The class with maximum overlap is also 25 | recorded. 
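    Each roidb entry is also augmented with its image path, width, and height.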
26 | """ 27 | roidb = imdb.roidb 28 | sizes = [PIL.Image.open(imdb.image_path_at(i)).size 29 | for i in range(imdb.num_images)] 30 | 31 | for i in range(len(imdb.image_index)): 32 | roidb[i]['image'] = imdb.image_path_at(i) 33 | roidb[i]['width'] = sizes[i][0] 34 | roidb[i]['height'] = sizes[i][1] 35 | # need gt_overlaps as a dense array for argmax 36 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 37 | # max overlap with gt over classes (columns) 38 | max_overlaps = gt_overlaps.max(axis=1) 39 | # gt class that had the max overlap 40 | max_classes = gt_overlaps.argmax(axis=1) 41 | roidb[i]['max_classes'] = max_classes 42 | roidb[i]['max_overlaps'] = max_overlaps 43 | # sanity checks 44 | # max overlap of 0 => class should be zero (background) 45 | zero_inds = np.where(max_overlaps == 0)[0] 46 | assert all(max_classes[zero_inds] == 0) 47 | # max overlap > 0 => class should not be zero (must be a fg class) 48 | nonzero_inds = np.where(max_overlaps > 0)[0] 49 | assert all(max_classes[nonzero_inds] != 0) 50 | -------------------------------------------------------------------------------- /lib/text_connector/detectors.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import numpy as np 3 | 4 | from model.config import cfg 5 | from model.nms_wrapper import nms 6 | from .text_proposal_connector import TextProposalConnector 7 | from .text_proposal_connector_oriented import TextProposalConnector as TextProposalConnectorOriented 8 | from .text_connect_cfg import Config as TextLineCfg 9 | 10 | 11 | class TextDetector: 12 | def __init__(self, oriented): 13 | if oriented: 14 | print('Use TextProposalConnectorOriented') 15 | self.text_proposal_connector = TextProposalConnectorOriented() 16 | else: 17 | self.text_proposal_connector = TextProposalConnector() 18 | 19 | @staticmethod 20 | def pre_process(text_proposals, scores): 21 | keep_inds = np.where(scores > TextLineCfg.TEXT_PROPOSALS_MIN_SCORE)[0] 22 | text_proposals, scores = text_proposals[keep_inds], scores[keep_inds] 23 | 24 | # 按得分排序 25 | sorted_indices = np.argsort(scores.ravel())[::-1] 26 | text_proposals, scores = text_proposals[sorted_indices], scores[sorted_indices] 27 | 28 | # 对proposal做nms 29 | keep_inds = nms(np.hstack((text_proposals, scores)), TextLineCfg.TEXT_PROPOSALS_NMS_THRESH) 30 | text_proposals, scores = text_proposals[keep_inds], scores[keep_inds] 31 | 32 | return text_proposals, scores 33 | 34 | def detect(self, text_proposals, scores, size): 35 | text_proposals, scores = self.pre_process(text_proposals, scores) 36 | 37 | # 获取检测结果 38 | text_recs = self.text_proposal_connector.get_text_lines(text_proposals, scores, size) 39 | keep_inds = self.filter_boxes(text_recs) 40 | return text_recs[keep_inds] 41 | 42 | def filter_boxes(self, boxes): 43 | heights = np.zeros((len(boxes), 1), np.float) 44 | widths = np.zeros((len(boxes), 1), np.float) 45 | scores = np.zeros((len(boxes), 1), np.float) 46 | index = 0 47 | for box in boxes: 48 | heights[index] = (abs(box[5] - box[1]) + abs(box[7] - box[3])) / 2.0 + 1 49 | widths[index] = (abs(box[2] - box[0]) + abs(box[6] - box[4])) / 2.0 + 1 50 | scores[index] = box[8] 51 | index += 1 52 | 53 | return np.where((widths / heights > TextLineCfg.MIN_RATIO) & (scores > TextLineCfg.LINE_MIN_SCORE) & 54 | (widths > (TextLineCfg.TEXT_PROPOSALS_WIDTH * TextLineCfg.MIN_NUM_PROPOSALS)))[0] 55 | -------------------------------------------------------------------------------- /lib/nms/cpu_nms.pyx: 
-------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /tools/anchor_drawer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualization anchors on scaled image 3 | """ 4 | import _init_paths 5 | import os 6 | import argparse 7 | 8 | import numpy as np 9 | import cv2 10 | 11 | from layer_utils.generate_anchors import generate_anchors 12 | 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--img', default='') 17 | parser.add_argument('--scale', type=int, default=600) 18 | parser.add_argument('--max_scale', default=1200) 19 | args = parser.parse_args() 20 | 21 | if not os.path.exists(args.img): 22 | parser.error('Image not exist.') 23 | return args 24 | 25 | 26 | def draw_anchors(img, heights, width, start_center): 27 | anchors = [] 28 | center = start_center 29 | for height in heights: 30 | anchors.append(( 31 | center[0] - width // 2, 32 | center[1] - height // 2, 33 | center[0] + width // 2, 34 | center[1] + height // 2 35 | )) 36 | center = (center[0] + width, center[1]) 37 | 38 | for anchor in anchors: 39 | img = cv2.rectangle(img, (anchor[0], anchor[1]), (anchor[2], anchor[3]), color=(255, 0, 0)) 40 | return img 41 | 42 
| 43 | if __name__ == '__main__': 44 | args = parse_args() 45 | img = cv2.imread(args.img) 46 | im_size_min = min(img.shape) 47 | im_size_max = max(img.shape) 48 | 49 | im_scale = float(args.scale) / float(im_size_min) 50 | # Prevent the biggest axis from being more than MAX_SIZE 51 | if np.round(im_scale * im_size_max) > args.max_scale: 52 | im_scale = float(args.max_scale) / float(im_size_max) 53 | 54 | img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR) 55 | print("Scaled image size") 56 | print(img.shape) 57 | width = img.shape[1] 58 | height = img.shape[0] 59 | 60 | base_anchors = generate_anchors(base_height=11, 61 | num_anchors=10, 62 | anchor_width=16, 63 | h_ratio_step=0.7) 64 | 65 | heights = [x[3] - x[1] for x in base_anchors] 66 | 67 | img = draw_anchors(img, heights, 16, (width // 2, height // 2)) 68 | img = draw_anchors(img, heights, 16, (100, 150)) 69 | img = draw_anchors(img, heights, 16, (width - 300, 150)) 70 | img = draw_anchors(img, heights, 16, (100, height - 150)) 71 | img = draw_anchors(img, heights, 16, (width - 300, height - 150)) 72 | 73 | cv2.namedWindow('test') 74 | cv2.imshow('test', img) 75 | cv2.waitKey() 76 | -------------------------------------------------------------------------------- /lib/layer_utils/proposal_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Ross Girshick and Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from model.config import cfg 12 | from model.bbox_transform import bbox_transform_inv, clip_boxes 13 | from model.nms_wrapper import nms 14 | 15 | 16 | def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, anchors, num_anchors): 17 | """ 18 | A simplified version compared to fast/er RCNN 19 | For details please see the technical report 20 | :param 21 | rpn_cls_prob: (1, H, W, Ax2) softmax result of rpn scores 22 | rpn_bbox_pred: (1, H, W, Ax4) 1x1 conv result for rpn bbox 23 | """ 24 | if type(cfg_key) == bytes: 25 | cfg_key = cfg_key.decode('utf-8') 26 | pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N 27 | post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N 28 | nms_thresh = cfg[cfg_key].RPN_NMS_THRESH 29 | 30 | # Get the scores and bounding boxes for foreground (text) 31 | # The order in last dim is related to network.py: 32 | # self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob") 33 | # scores = rpn_cls_prob[:, :, :, num_anchors:] # old 34 | 35 | height, width = rpn_cls_prob.shape[1:3] # feature-map的高宽 36 | scores = np.reshape(np.reshape(rpn_cls_prob, [1, height, width, num_anchors, 2])[:, :, :, :, 1], 37 | [1, height, width, num_anchors]) 38 | 39 | rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4)) 40 | scores = scores.reshape((-1, 1)) 41 | proposals = bbox_transform_inv(anchors, rpn_bbox_pred) 42 | proposals = clip_boxes(proposals, im_info[:2]) 43 | 44 | # Pick the top region proposals 45 | order = scores.ravel().argsort()[::-1] 46 | if pre_nms_topN > 0: 47 | order = order[:pre_nms_topN] 48 | proposals = proposals[order, :] 49 | scores = scores[order] 50 | 51 | # Non-maximal suppression 52 | keep = nms(np.hstack((proposals, scores)), nms_thresh, not cfg.USE_GPU_NMS) 53 | 54 | # Pick 
the top region proposals after NMS 55 | if post_nms_topN > 0: 56 | keep = keep[:post_nms_topN] 57 | proposals = proposals[keep, :] 58 | scores = scores[keep] 59 | 60 | # Only support single image as input 61 | blob = np.hstack((scores.astype(np.float32, copy=False), proposals.astype(np.float32, copy=False))) 62 | return blob, scores 63 | -------------------------------------------------------------------------------- /lib/text_connector/text_proposal_connector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .other import clip_boxes 3 | from .text_proposal_graph_builder import TextProposalGraphBuilder 4 | 5 | 6 | class TextProposalConnector: 7 | def __init__(self): 8 | self.graph_builder = TextProposalGraphBuilder() 9 | 10 | def group_text_proposals(self, text_proposals, scores, im_size): 11 | graph = self.graph_builder.build_graph(text_proposals, scores, im_size) 12 | return graph.sub_graphs_connected() 13 | 14 | def fit_y(self, X, Y, x1, x2): 15 | assert len(X) != 0 16 | # if X only includes one point, the function returns the line y=Y[0] 17 | if np.sum(X == X[0]) == len(X): 18 | return Y[0], Y[0] 19 | p = np.poly1d(np.polyfit(X, Y, 1)) 20 | return p(x1), p(x2) 21 | 22 | def get_text_lines(self, text_proposals, scores, im_size): 23 | # tp=text proposal 24 | tp_groups = self.group_text_proposals(text_proposals, scores, im_size) 25 | text_lines = np.zeros((len(tp_groups), 5), np.float32) 26 | 27 | for index, tp_indices in enumerate(tp_groups): 28 | text_line_boxes = text_proposals[list(tp_indices)] 29 | 30 | x0 = np.min(text_line_boxes[:, 0]) 31 | x1 = np.max(text_line_boxes[:, 2]) 32 | 33 | offset = (text_line_boxes[0, 2] - text_line_boxes[0, 0]) * 0.5 34 | 35 | lt_y, rt_y = self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 1], x0 + offset, x1 - offset) 36 | lb_y, rb_y = self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 3], x0 + offset, x1 - offset) 37 | 38 | # the score of a text line is the average of the scores 39 | # of all text proposals contained in the text line 40 | score = scores[list(tp_indices)].sum() / float(len(tp_indices)) 41 | 42 | text_lines[index, 0] = x0 43 | text_lines[index, 1] = min(lt_y, rt_y) 44 | text_lines[index, 2] = x1 45 | text_lines[index, 3] = max(lb_y, rb_y) 46 | text_lines[index, 4] = score 47 | 48 | text_lines = clip_boxes(text_lines, im_size) 49 | 50 | text_recs = np.zeros((len(text_lines), 9), np.float) 51 | index = 0 52 | for line in text_lines: 53 | xmin, ymin, xmax, ymax = line[0], line[1], line[2], line[3] 54 | text_recs[index, 0] = xmin 55 | text_recs[index, 1] = ymin 56 | text_recs[index, 2] = xmax 57 | text_recs[index, 3] = ymin 58 | text_recs[index, 4] = xmin 59 | text_recs[index, 5] = ymax 60 | text_recs[index, 6] = xmax 61 | text_recs[index, 7] = ymax 62 | text_recs[index, 8] = line[4] 63 | index = index + 1 64 | 65 | return text_recs 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tf_ctpn 2 | 3 | A TensorFlow implementation of [CTPN: 4 | Detecting Text in Natural Image with Connectionist Text Proposal Network](https://arxiv.org/abs/1609.03605). 
5 | 6 | Most of the code in this project is adapted from 7 | [CTPN](https://github.com/tianzhi0549/CTPN), [tf-faster-rcnn](https://github.com/endernewton/tf-faster-rcnn) and [text-detection-ctpn](https://github.com/eragonruan/text-detection-ctpn). 8 | 9 | Results of the pretrained models on ICDAR13: 10 | 11 | | Net | Dataset | Recall | Precision | Hmean | 12 | |-------|----------|---------|-------------|------------| 13 | | Original CTPN | ICDAR13 training data + ? | 73.72% | 92.77% | 82.15% | 14 | | vgg16 | MLT17 latin/chn + ICDAR13 training data | 74.26% | 82.46% | 78.15% | 15 | 16 | If you want an end-to-end OCR service, check this repo: https://github.com/Sanster/DeepOcrService 17 | 18 | # Setup 19 | Install dependencies: 20 | ``` 21 | pip3 install -r requirements.txt 22 | ``` 23 | 24 | Build the Cython extensions (needed for both the demo and training): 25 | ``` 26 | cd lib/ 27 | make clean 28 | make 29 | ``` 30 | 31 | # Quick start 32 | Download the pre-trained CTPN model (based on vgg16) from [google drive](https://drive.google.com/open?id=1f8YZCQxmhpXfwGM0KXjoxsoqujg5ruyC) and put it in `output/vgg16/voc_2007_trainval/default`. 33 | Then run: 34 | ``` 35 | python3 tools/demo.py 36 | ``` 37 | 38 | This model was trained on a 1080Ti for 80k iterations using commit `dc533e030e5431212c1d4dbca0bcd7e594a8a368`. 39 | 40 | 41 | # Training 42 | 1. Download the training dataset from [google drive](https://drive.google.com/open?id=1S9K9NKkA0RYlBswCfyUI0dv_fI4r5bcX). 43 | This dataset contains 3727 images from MLT17 (latin + chinese) and the ICDAR13 training set. 44 | Ground-truth anchors are generated from the `minAreaRect` of each text area; see [eragonruan/text-detection-ctpn#issues215](https://github.com/eragonruan/text-detection-ctpn/issues/215) for more details. You can use [tools/mlt17_to_voc.py](https://github.com/Sanster/tf_ctpn/blob/master/tools/mlt17_to_voc.py) to build your own training data. 45 | Put the downloaded data in `./data/VOCdevkit2007/VOC2007`. 46 | 47 | 1. Download the pre-trained slim vgg16 model from [here](https://github.com/tensorflow/models/tree/master/research/slim#pre-trained-models). 48 | Put the pretrained model in `./data/pretrained_model`. 49 | 50 | 1. Start training: 51 | ``` 52 | python3 tools/trainval_net.py 53 | ``` 54 | The output checkpoint files will be saved in `./output/vgg16/voc_2007_trainval/default`. 55 | 56 | 1. 
Start TensorBoard: 57 | ``` 58 | tensorboard --logdir=./tensorboard 59 | ``` 60 | 61 | # Test on ICDAR13 62 | ``` 63 | python3 tools/icdar.py --img_dir=path/to/ICDAR13/Challenge2_Test_Task12_Images/ -c=ICDAR13 64 | ``` 65 | 66 | After it finishes, a submit.zip file will be generated in `data/ICDAR_submit`; then run: 67 | 68 | ``` 69 | cd tools/ICDAR13 70 | # use python2 71 | python script.py -g=gt.zip -s=submit.zip 72 | ``` 73 | -------------------------------------------------------------------------------- /lib/nets/vgg16.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import tensorflow as tf 11 | import tensorflow.contrib.slim as slim 12 | from tensorflow.contrib.slim import losses 13 | from tensorflow.contrib.slim import arg_scope 14 | import numpy as np 15 | 16 | from nets.network import Network 17 | from model.config import cfg 18 | 19 | 20 | class vgg16(Network): 21 | def __init__(self): 22 | Network.__init__(self) 23 | self._feat_stride = [16, ] 24 | self._scope = 'vgg_16' 25 | 26 | def _image_to_head(self, is_training, reuse=None): 27 | with tf.variable_scope(self._scope, self._scope, reuse=reuse): 28 | net = slim.repeat(self._image, 2, slim.conv2d, 64, [3, 3], 29 | trainable=True, scope='conv1') 30 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool1') 31 | net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], 32 | trainable=True, scope='conv2') 33 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool2') 34 | net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], 35 | trainable=True, scope='conv3') 36 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool3') 37 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], 38 | trainable=True, scope='conv4') 39 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool4') 40 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], 41 | trainable=True, scope='conv5') 42 | 43 | self._act_summaries.append(net) 44 | self._layers['head'] = net 45 | 46 | return net 47 | 48 | def get_variables_to_restore(self, variables, var_keep_dic): 49 | variables_to_restore = [] 50 | 51 | for v in variables: 52 | # exclude the conv weights that are fc weights in vgg16 53 | if v.name == (self._scope + '/fc6/weights:0') or \ 54 | v.name == (self._scope + '/fc7/weights:0'): 55 | continue 56 | # exclude the first conv layer to swap RGB to BGR 57 | if v.name == (self._scope + '/conv1/conv1_1/weights:0'): 58 | self._variables_to_fix[v.name] = v 59 | continue 60 | if v.name.split(':')[0] in var_keep_dic: 61 | print('Variables restored: %s' % v.name) 62 | variables_to_restore.append(v) 63 | 64 | return variables_to_restore 65 | -------------------------------------------------------------------------------- /lib/roi_data_layer/minibatch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Xinlei Chen 6 | # -------------------------------------------------------- 7 | 8 | """Compute minibatch blobs 
for training a Fast R-CNN network.""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import numpy as np 14 | import numpy.random as npr 15 | import cv2 16 | from model.config import cfg 17 | from utils.blob import prep_im_for_blob, im_list_to_blob 18 | 19 | from utils import helper 20 | 21 | 22 | def get_minibatch(roidb, num_classes): 23 | """Given a roidb, construct a minibatch sampled from it.""" 24 | num_images = len(roidb) 25 | # Sample random scales to use for each image in this batch 26 | random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES), size=num_images) 27 | 28 | # Get the input image blob, formatted for caffe 29 | im_blob, im_scales = _get_image_blob(roidb, random_scale_inds) 30 | 31 | blobs = {'data': im_blob} 32 | 33 | assert len(im_scales) == 1, "Single batch only" 34 | assert len(roidb) == 1, "Single batch only" 35 | 36 | # gt boxes: (x1, y1, x2, y2, cls) 37 | if cfg.TRAIN.USE_ALL_GT: 38 | # Include all ground truth boxes 39 | gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] 40 | else: 41 | # For the COCO ground truth boxes, exclude the ones that are ''iscrowd'' 42 | gt_inds = np.where(roidb[0]['gt_classes'] != 0 & np.all(roidb[0]['gt_overlaps'].toarray() > -1.0, axis=1))[0] 43 | gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32) 44 | gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0] 45 | gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] 46 | blobs['gt_boxes'] = gt_boxes 47 | blobs['im_info'] = np.array( 48 | [im_blob.shape[1], im_blob.shape[2], im_scales[0]], 49 | dtype=np.float32) 50 | 51 | return blobs 52 | 53 | 54 | def _get_image_blob(roidb, scale_inds): 55 | """Builds an input blob from the images in the roidb at the specified 56 | scales. 
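    Each image is read as RGB, flipped if the roidb entry says so, mean-subtracted, and resized so its shorter side matches the sampled cfg.TRAIN.SCALES entry (capped so the longer side does not exceed cfg.TRAIN.MAX_SIZE). The per-image resize factors are returned as im_scales.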
57 | """ 58 | num_images = len(roidb) 59 | processed_ims = [] 60 | im_scales = [] 61 | for i in range(num_images): 62 | im = helper.read_rgb_img(roidb[i]['image']) 63 | if roidb[i]['flipped']: 64 | im = im[:, ::-1, :] 65 | target_size = cfg.TRAIN.SCALES[scale_inds[i]] 66 | im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size, 67 | cfg.TRAIN.MAX_SIZE) 68 | im_scales.append(im_scale) 69 | processed_ims.append(im) 70 | 71 | # Create a blob to hold the input images 72 | blob = im_list_to_blob(processed_ims) 73 | 74 | return blob, im_scales 75 | -------------------------------------------------------------------------------- /lib/model/bbox_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | 13 | 14 | def bbox_transform(ex_rois, gt_rois): 15 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 16 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 17 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 18 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 19 | 20 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 21 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 22 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths 23 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights 24 | 25 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 26 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 27 | targets_dw = np.log(gt_widths / ex_widths) 28 | targets_dh = np.log(gt_heights / ex_heights) 29 | 30 | targets = np.vstack( 31 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() 32 | return targets 33 | 34 | 35 | def bbox_transform_inv(boxes, deltas): 36 | if boxes.shape[0] == 0: 37 | return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) 38 | 39 | boxes = boxes.astype(deltas.dtype, copy=False) 40 | widths = boxes[:, 2] - boxes[:, 0] + 1.0 41 | heights = boxes[:, 3] - boxes[:, 1] + 1.0 42 | ctr_x = boxes[:, 0] + 0.5 * widths 43 | ctr_y = boxes[:, 1] + 0.5 * heights 44 | 45 | # dx, dw are not used in CTPN 46 | dx = deltas[:, 0::4] 47 | dy = deltas[:, 1::4] 48 | dw = deltas[:, 2::4] 49 | dh = deltas[:, 3::4] 50 | 51 | pred_ctr_x = ctr_x[:, np.newaxis] 52 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] 53 | pred_w = widths[:, np.newaxis] 54 | pred_h = np.exp(dh) * heights[:, np.newaxis] 55 | 56 | pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) 57 | # x1 58 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 59 | # y1 60 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 61 | # x2 62 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w 63 | # y2 64 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h 65 | 66 | return pred_boxes 67 | 68 | 69 | def clip_boxes(boxes, im_shape): 70 | """ 71 | Clip boxes to image boundaries. 
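    Worked example: with im_shape = (480, 640), a box (-5, 10, 700, 300) becomes
    (0, 10, 639, 300), since x is limited to [0, 639] and y to [0, 479].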
72 | """ 73 | 74 | # x1 >= 0 75 | boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) 76 | # y1 >= 0 77 | boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) 78 | # x2 < im_shape[1] 79 | boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) 80 | # y2 < im_shape[0] 81 | boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) 82 | return boxes 83 | -------------------------------------------------------------------------------- /lib/setup_cpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join as pjoin 3 | import numpy as np 4 | from distutils.core import setup 5 | from distutils.extension import Extension 6 | from Cython.Distutils import build_ext 7 | 8 | def find_in_path(name, path): 9 | "Find a file in a search path" 10 | #adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 11 | for dir in path.split(os.pathsep): 12 | binpath = pjoin(dir, name) 13 | if os.path.exists(binpath): 14 | return os.path.abspath(binpath) 15 | return None 16 | 17 | # Obtain the numpy include directory. This logic works across numpy versions. 18 | try: 19 | numpy_include = np.get_include() 20 | except AttributeError: 21 | numpy_include = np.get_numpy_include() 22 | 23 | def customize_compiler_for_nvcc(self): 24 | """inject deep into distutils to customize how the dispatch 25 | to gcc/nvcc works. 26 | 27 | If you subclass UnixCCompiler, it's not trivial to get your subclass 28 | injected in, and still have the right customizations (i.e. 29 | distutils.sysconfig.customize_compiler) run on it. So instead of going 30 | the OO route, I have this. Note, it's kindof like a wierd functional 31 | subclassing going on.""" 32 | 33 | # tell the compiler it can processes .cu 34 | self.src_extensions.append('.cu') 35 | 36 | # save references to the default compiler_so and _comple methods 37 | default_compiler_so = self.compiler_so 38 | super = self._compile 39 | 40 | # now redefine the _compile method. This gets executed for each 41 | # object but distutils doesn't have the ability to change compilers 42 | # based on source extension: we add it. 
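    # In this CPU-only build no '.cu' sources are listed, so the redefined _compile
    # below always applies the 'gcc' entry of extra_compile_args declared in
    # ext_modules further down.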
43 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 44 | print(extra_postargs) 45 | postargs = extra_postargs['gcc'] 46 | 47 | super(obj, src, ext, cc_args, postargs, pp_opts) 48 | # reset the default compiler_so, which we might have changed for cuda 49 | self.compiler_so = default_compiler_so 50 | 51 | # inject our redefined _compile method into the class 52 | self._compile = _compile 53 | 54 | # run the customize_compiler 55 | class custom_build_ext(build_ext): 56 | def build_extensions(self): 57 | customize_compiler_for_nvcc(self.compiler) 58 | build_ext.build_extensions(self) 59 | 60 | ext_modules = [ 61 | Extension( 62 | "utils.cython_bbox", 63 | ["utils/bbox.pyx"], 64 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 65 | include_dirs = [numpy_include] 66 | ), 67 | Extension( 68 | "nms.cpu_nms", 69 | ["nms/cpu_nms.pyx"], 70 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 71 | include_dirs = [numpy_include] 72 | ) 73 | ] 74 | 75 | setup( 76 | name='tf_faster_rcnn', 77 | ext_modules=ext_modules, 78 | # inject our custom trigger 79 | cmdclass={'build_ext': custom_build_ext}, 80 | ) 81 | -------------------------------------------------------------------------------- /lib/nets/mobilenet_v2.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import tensorflow as tf 11 | import tensorflow.contrib.slim as slim 12 | from nets.mobilenet import conv_blocks as ops 13 | from nets.mobilenet import mobilenet as lib 14 | from nets.mobilenet import mobilenet_v2 as mobilenet_v2 15 | 16 | from nets.network import Network 17 | from model.config import cfg 18 | 19 | from nets.mobilenet.mobilenet_v2 import expand_input 20 | 21 | from nets.mobilenet.mobilenet_v2 import op 22 | 23 | CTPN_DEF = dict( 24 | defaults={ 25 | # Note: these parameters of batch norm affect the architecture 26 | # that's why they are here and not in training_scope. 
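        # Compared with the stock MobileNetV2 conv_defs, the spec below keeps the first
        # conv at stride 1, so its four stride-2 expanded_conv blocks give a total
        # feature stride of 16, matching self._feat_stride = [16, ] in MobileNetV2 below.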
27 | (slim.batch_norm,): {'center': True, 'scale': True}, 28 | (slim.conv2d, slim.fully_connected, slim.separable_conv2d): { 29 | 'normalizer_fn': slim.batch_norm, 'activation_fn': tf.nn.relu6 30 | }, 31 | (ops.expanded_conv,): { 32 | 'expansion_size': expand_input(6), 33 | 'split_expansion': 1, 34 | 'normalizer_fn': slim.batch_norm, 35 | 'residual': True 36 | }, 37 | (slim.conv2d, slim.separable_conv2d): {'padding': 'SAME'} 38 | }, 39 | spec=[ 40 | op(slim.conv2d, stride=1, num_outputs=32, kernel_size=[3, 3]), 41 | op(ops.expanded_conv, 42 | expansion_size=expand_input(1, divisible_by=1), 43 | num_outputs=16), 44 | op(ops.expanded_conv, stride=2, num_outputs=24), 45 | op(ops.expanded_conv, stride=1, num_outputs=24), 46 | op(ops.expanded_conv, stride=2, num_outputs=32), 47 | op(ops.expanded_conv, stride=1, num_outputs=32), 48 | op(ops.expanded_conv, stride=1, num_outputs=32), 49 | op(ops.expanded_conv, stride=2, num_outputs=64), 50 | op(ops.expanded_conv, stride=1, num_outputs=64), 51 | op(ops.expanded_conv, stride=1, num_outputs=64), 52 | op(ops.expanded_conv, stride=1, num_outputs=64), 53 | op(ops.expanded_conv, stride=1, num_outputs=96), 54 | op(ops.expanded_conv, stride=1, num_outputs=96), 55 | op(ops.expanded_conv, stride=1, num_outputs=96), 56 | op(ops.expanded_conv, stride=2, num_outputs=160), 57 | op(ops.expanded_conv, stride=1, num_outputs=160), 58 | op(ops.expanded_conv, stride=1, num_outputs=160), 59 | op(ops.expanded_conv, stride=1, num_outputs=320), 60 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1280) 61 | ], 62 | ) 63 | 64 | 65 | class MobileNetV2(Network): 66 | def __init__(self): 67 | Network.__init__(self) 68 | self._feat_stride = [16, ] 69 | self._scope = 'mobilenet_v2' 70 | 71 | def _image_to_head(self, is_training, reuse=None): 72 | with slim.arg_scope(mobilenet_v2.training_scope(is_training=is_training)): 73 | net, endpoints = mobilenet_v2.mobilenet_base(self._image, conv_defs=CTPN_DEF) 74 | 75 | self.variables_to_restore = slim.get_variables_to_restore() 76 | 77 | self._act_summaries.append(net) 78 | self._layers['head'] = net 79 | 80 | return net 81 | 82 | def get_variables_to_restore(self, variables, var_keep_dic): 83 | pass 84 | -------------------------------------------------------------------------------- /lib/model/test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import cv2 11 | import numpy as np 12 | 13 | try: 14 | import cPickle as pickle 15 | except ImportError: 16 | import pickle 17 | import os 18 | import math 19 | 20 | from utils.timer import Timer 21 | from utils.blob import im_list_to_blob 22 | 23 | from model.config import cfg, get_output_dir 24 | from model.bbox_transform import clip_boxes, bbox_transform_inv 25 | from model.nms_wrapper import nms 26 | 27 | 28 | def _get_image_blob(im): 29 | """Converts an image into a network input. 
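    Note: im_detect below asserts a single scale, so cfg.TEST.SCALES is expected
    to hold exactly one value; the pyramid then has a single level.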
30 | Arguments: 31 | im (ndarray): a color image in BGR order 32 | Returns: 33 | blob (ndarray): a data blob holding an image pyramid 34 | im_scale_factors (list): list of image scales (relative to im) used 35 | in the image pyramid 36 | """ 37 | im_orig = im.astype(np.float32, copy=True) 38 | im_orig -= cfg.PIXEL_MEANS 39 | 40 | im_shape = im_orig.shape 41 | im_size_min = np.min(im_shape[0:2]) 42 | im_size_max = np.max(im_shape[0:2]) 43 | 44 | processed_ims = [] 45 | im_scale_factors = [] 46 | 47 | for target_size in cfg.TEST.SCALES: 48 | im_scale = float(target_size) / float(im_size_min) 49 | # Prevent the biggest axis from being more than MAX_SIZE 50 | if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: 51 | im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) 52 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, 53 | interpolation=cv2.INTER_LINEAR) 54 | im_scale_factors.append(im_scale) 55 | processed_ims.append(im) 56 | 57 | # Create a blob to hold the input images 58 | blob = im_list_to_blob(processed_ims) 59 | 60 | return blob, np.array(im_scale_factors) 61 | 62 | 63 | def _get_blobs(im): 64 | """Convert an image and RoIs within that image into network inputs.""" 65 | blobs = {} 66 | blobs['data'], im_scale_factors = _get_image_blob(im) 67 | 68 | return blobs, im_scale_factors 69 | 70 | 71 | def _clip_boxes(boxes, im_shape): 72 | """Clip boxes to image boundaries.""" 73 | if len(boxes) == 0: 74 | return boxes 75 | # x1 >= 0 76 | boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0) 77 | # y1 >= 0 78 | boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0) 79 | # x2 < im_shape[1] 80 | boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1) 81 | # y2 < im_shape[0] 82 | boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1) 83 | return boxes 84 | 85 | 86 | def im_detect(sess, net, im): 87 | blobs, im_scales = _get_blobs(im) 88 | assert len(im_scales) == 1, "Only single-image batch implemented" 89 | 90 | resized_im_blob = blobs['data'] 91 | blobs['im_info'] = np.array([resized_im_blob.shape[1], resized_im_blob.shape[2], im_scales[0]], dtype=np.float32) 92 | 93 | rois = net.test_image(sess, blobs['data'], blobs['im_info']) 94 | 95 | boxes = rois[:, 1:5] 96 | boxes = _clip_boxes(boxes, resized_im_blob.shape[1:3]) 97 | 98 | scores = rois[:, 0] 99 | 100 | return scores, boxes, resized_im_blob.shape[1:3], im_scales[0] 101 | -------------------------------------------------------------------------------- /lib/roi_data_layer/layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Xinlei Chen 6 | # -------------------------------------------------------- 7 | 8 | """The data layer used during training to train a Fast R-CNN network. 9 | 10 | RoIDataLayer implements a Caffe Python layer. 
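Minimal usage sketch (names from this file):

    layer = RoIDataLayer(roidb, num_classes)
    blobs = layer.forward()  # dict with 'data', 'gt_boxes' and 'im_info'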
11 | """ 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | from model.config import cfg 17 | from roi_data_layer.minibatch import get_minibatch 18 | import numpy as np 19 | import time 20 | 21 | 22 | class RoIDataLayer(object): 23 | """Fast R-CNN data layer used for training.""" 24 | 25 | def __init__(self, roidb, num_classes, random=False): 26 | """Set the roidb to be used by this layer during training.""" 27 | self._roidb = roidb 28 | self._num_classes = num_classes 29 | # Also set a random flag 30 | self._random = random 31 | self._shuffle_roidb_inds() 32 | 33 | def _shuffle_roidb_inds(self): 34 | """Randomly permute the training roidb.""" 35 | # If the random flag is set, 36 | # then the database is shuffled according to system time 37 | # Useful for the validation set 38 | if self._random: 39 | st0 = np.random.get_state() 40 | millis = int(round(time.time() * 1000)) % 4294967295 41 | np.random.seed(millis) 42 | 43 | if cfg.TRAIN.ASPECT_GROUPING: 44 | widths = np.array([r['width'] for r in self._roidb]) 45 | heights = np.array([r['height'] for r in self._roidb]) 46 | horz = (widths >= heights) 47 | vert = np.logical_not(horz) 48 | horz_inds = np.where(horz)[0] 49 | vert_inds = np.where(vert)[0] 50 | inds = np.hstack(( 51 | np.random.permutation(horz_inds), 52 | np.random.permutation(vert_inds))) 53 | inds = np.reshape(inds, (-1, 2)) 54 | row_perm = np.random.permutation(np.arange(inds.shape[0])) 55 | inds = np.reshape(inds[row_perm, :], (-1,)) 56 | self._perm = inds 57 | else: 58 | self._perm = np.random.permutation(np.arange(len(self._roidb))) 59 | # Restore the random state 60 | if self._random: 61 | np.random.set_state(st0) 62 | 63 | self._cur = 0 64 | 65 | def _get_next_minibatch_inds(self): 66 | """Return the roidb indices for the next minibatch.""" 67 | 68 | if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb): 69 | self._shuffle_roidb_inds() 70 | 71 | db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] 72 | self._cur += cfg.TRAIN.IMS_PER_BATCH 73 | 74 | return db_inds 75 | 76 | def _get_next_minibatch(self): 77 | """Return the blobs to be used for the next minibatch. 78 | 79 | If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a 80 | separate process and made available through self._blob_queue. 
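        In this TensorFlow port the prefetch path is not used: blobs are computed
        synchronously by get_minibatch on each call.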
81 | """ 82 | db_inds = self._get_next_minibatch_inds() 83 | minibatch_db = [self._roidb[i] for i in db_inds] 84 | return get_minibatch(minibatch_db, self._num_classes) 85 | 86 | def forward(self): 87 | """Get blobs and copy them into this layer's top blob vector.""" 88 | blobs = self._get_next_minibatch() 89 | return blobs 90 | -------------------------------------------------------------------------------- /tools/convert_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from xml.dom.minidom import Document 4 | import numpy as np 5 | 6 | 7 | def build_voc_dirs(outdir): 8 | mkdir = lambda dir: os.makedirs(dir) if not os.path.exists(dir) else None 9 | mkdir(outdir) 10 | mkdir(os.path.join(outdir, 'Annotations')) 11 | mkdir(os.path.join(outdir, 'ImageSets')) 12 | mkdir(os.path.join(outdir, 'ImageSets', 'Layout')) 13 | mkdir(os.path.join(outdir, 'ImageSets', 'Main')) 14 | mkdir(os.path.join(outdir, 'ImageSets', 'Segmentation')) 15 | mkdir(os.path.join(outdir, 'JPEGImages')) 16 | mkdir(os.path.join(outdir, 'SegmentationClass')) 17 | mkdir(os.path.join(outdir, 'SegmentationObject')) 18 | return os.path.join(outdir, 'Annotations'), os.path.join(outdir, 'JPEGImages'), os.path.join(outdir, 'ImageSets', 19 | 'Main') 20 | 21 | 22 | def generate_xml(img_name, positions, img_size, database, cls='text'): 23 | """ 24 | :param positions: [(xmin, ymin, xmax, ymax)] 25 | """ 26 | doc = Document() 27 | 28 | def append_xml_node_attr(child, parent=None, text=None): 29 | ele = doc.createElement(child) 30 | if not text is None: 31 | text_node = doc.createTextNode(text) 32 | ele.appendChild(text_node) 33 | parent = doc if parent is None else parent 34 | parent.appendChild(ele) 35 | return ele 36 | 37 | # create header 38 | annotation = append_xml_node_attr('annotation') 39 | append_xml_node_attr('folder', parent=annotation, text='text') 40 | append_xml_node_attr('filename', parent=annotation, text=img_name) 41 | source = append_xml_node_attr('source', parent=annotation) 42 | append_xml_node_attr('database', parent=source, text=database) 43 | append_xml_node_attr('annotation', parent=source, text='text') 44 | append_xml_node_attr('image', parent=source, text='text') 45 | append_xml_node_attr('flickrid', parent=source, text='000000') 46 | owner = append_xml_node_attr('owner', parent=annotation) 47 | append_xml_node_attr('name', parent=owner, text='ms') 48 | size = append_xml_node_attr('size', annotation) 49 | append_xml_node_attr('width', size, str(img_size[1])) 50 | append_xml_node_attr('height', size, str(img_size[0])) 51 | append_xml_node_attr('depth', size, str(img_size[2])) 52 | append_xml_node_attr('segmented', parent=annotation, text='0') 53 | 54 | # create objects 55 | objs = [] 56 | for pos in positions: 57 | obj = append_xml_node_attr('object', parent=annotation) 58 | occlusion = int(0) 59 | x1, y1, x2, y2 = int(pos[0]), int(pos[1]), int(pos[2]), int(pos[3]) 60 | 61 | truncation = float(0) 62 | difficult = 0 63 | truncted = 0 if truncation < 0.5 else 1 64 | 65 | append_xml_node_attr('name', parent=obj, text=cls) 66 | append_xml_node_attr('pose', parent=obj, text='none') 67 | append_xml_node_attr('truncated', parent=obj, text=str(truncted)) 68 | append_xml_node_attr('difficult', parent=obj, text=str(int(difficult))) 69 | bb = append_xml_node_attr('bndbox', parent=obj) 70 | append_xml_node_attr('xmin', parent=bb, text=str(x1)) 71 | append_xml_node_attr('ymin', parent=bb, text=str(y1)) 72 | append_xml_node_attr('xmax', parent=bb, text=str(x2)) 
73 | append_xml_node_attr('ymax', parent=bb, text=str(y2)) 74 | 75 | o = {'class': cls, 'box': np.asarray([x1, y1, x2, y2], dtype=float), 76 | 'truncation': truncation, 'difficult': difficult, 'occlusion': occlusion} 77 | objs.append(o) 78 | 79 | return doc, objs 80 | -------------------------------------------------------------------------------- /lib/text_connector/text_proposal_graph_builder.py: -------------------------------------------------------------------------------- 1 | from .text_connect_cfg import Config as TextLineCfg 2 | from .other import Graph 3 | import numpy as np 4 | 5 | 6 | # noinspection PyAttributeOutsideInit 7 | class TextProposalGraphBuilder: 8 | """ 9 | Build Text proposals into a graph. 10 | """ 11 | 12 | def get_successions(self, index): 13 | box = self.text_proposals[index] 14 | results = [] 15 | for left in range(int(box[0]) + 1, min(int(box[0]) + TextLineCfg.MAX_HORIZONTAL_GAP + 1, self.im_size[1])): 16 | adj_box_indices = self.boxes_table[left] 17 | for adj_box_index in adj_box_indices: 18 | if self.meet_v_iou(adj_box_index, index): 19 | results.append(adj_box_index) 20 | if len(results) != 0: 21 | return results 22 | return results 23 | 24 | def get_precursors(self, index): 25 | box = self.text_proposals[index] 26 | results = [] 27 | for left in range(int(box[0]) - 1, max(int(box[0] - TextLineCfg.MAX_HORIZONTAL_GAP), 0) - 1, -1): 28 | adj_box_indices = self.boxes_table[left] 29 | for adj_box_index in adj_box_indices: 30 | if self.meet_v_iou(adj_box_index, index): 31 | results.append(adj_box_index) 32 | if len(results) != 0: 33 | return results 34 | return results 35 | 36 | def is_succession_node(self, index, succession_index): 37 | precursors = self.get_precursors(succession_index) 38 | if self.scores[index] >= np.max(self.scores[precursors]): 39 | return True 40 | return False 41 | 42 | def meet_v_iou(self, index1, index2): 43 | def overlaps_v(index1, index2): 44 | h1 = self.heights[index1] 45 | h2 = self.heights[index2] 46 | y0 = max(self.text_proposals[index2][1], self.text_proposals[index1][1]) 47 | y1 = min(self.text_proposals[index2][3], self.text_proposals[index1][3]) 48 | return max(0, y1 - y0 + 1) / min(h1, h2) 49 | 50 | def size_similarity(index1, index2): 51 | h1 = self.heights[index1] 52 | h2 = self.heights[index2] 53 | return min(h1, h2) / max(h1, h2) 54 | 55 | v_overlap = overlaps_v(index1, index2) 56 | size_sim = size_similarity(index1, index2) 57 | # print("v_overlap %f" % v_overlap) 58 | # print("size_sim %f" % size_sim) 59 | return v_overlap >= TextLineCfg.MIN_V_OVERLAPS and size_sim >= TextLineCfg.MIN_SIZE_SIM 60 | 61 | def build_graph(self, text_proposals, scores, im_size): 62 | self.text_proposals = text_proposals 63 | self.scores = scores 64 | self.im_size = im_size 65 | self.heights = text_proposals[:, 3] - text_proposals[:, 1] + 1 66 | 67 | boxes_table = [[] for _ in range(self.im_size[1])] 68 | for index, box in enumerate(text_proposals): 69 | boxes_table[int(box[0])].append(index) 70 | self.boxes_table = boxes_table 71 | 72 | graph = np.zeros((text_proposals.shape[0], text_proposals.shape[0]), np.bool) 73 | 74 | for index, box in enumerate(text_proposals): 75 | successions = self.get_successions(index) 76 | if len(successions) == 0: 77 | continue 78 | succession_index = successions[np.argmax(scores[successions])] 79 | if self.is_succession_node(index, succession_index): 80 | # NOTE: a box can have multiple successions(precursors) if multiple successions(precursors) 81 | # have equal scores. 
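                # graph[i, j] == True marks proposal j as the chosen succession of
                # proposal i; connected components of this boolean matrix are later
                # merged into text lines via Graph.sub_graphs_connected.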
82 | graph[index, succession_index] = True 83 | return Graph(graph) 84 | -------------------------------------------------------------------------------- /lib/nets/squeezenet.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.slim as slim 3 | 4 | from nets.network import Network 5 | 6 | 7 | class SqueezeNet(Network): 8 | def __init__(self): 9 | super().__init__() 10 | self._feat_stride = [16, ] 11 | self._scope = 'squeezenet' 12 | 13 | def _arg_scope(self, is_training, reuse=None): 14 | weight_decay = 0.0 15 | keep_probability = 1.0 16 | 17 | batch_norm_params = { 18 | 'is_training': is_training, 19 | # Decay for the moving averages. 20 | 'decay': 0.995, 21 | # epsilon to prevent 0s in variance. 22 | 'epsilon': 0.001 23 | } 24 | 25 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 26 | weights_initializer=slim.xavier_initializer_conv2d(uniform=True), 27 | weights_regularizer=slim.l2_regularizer(weight_decay), 28 | normalizer_fn=slim.batch_norm, 29 | normalizer_params=batch_norm_params): 30 | with tf.variable_scope(self._scope, self._scope, reuse=reuse): 31 | with slim.arg_scope([slim.batch_norm, slim.dropout], 32 | is_training=is_training) as sc: 33 | return sc 34 | 35 | def get_variables_to_restore(self, variables, var_keep_dic): 36 | pass 37 | 38 | def _image_to_head(self, is_training, reuse=None): 39 | with slim.arg_scope(self._arg_scope(is_training, reuse)): 40 | net = slim.conv2d(self._image, 96, [3, 3], stride=1, scope='conv1') 41 | net = slim.max_pool2d(net, [2, 2], stride=2, scope='maxpool1') 42 | net = self.fire_module(net, 16, 64, scope='fire2') 43 | net = self.fire_module(net, 16, 64, scope='fire3') 44 | net = self.fire_module(net, 32, 128, scope='fire4') 45 | net = slim.max_pool2d(net, [2, 2], stride=2, scope='maxpool4') 46 | net = self.fire_module(net, 32, 128, scope='fire5') 47 | net = self.fire_module(net, 48, 192, scope='fire6') 48 | net = self.fire_module(net, 48, 192, scope='fire7') 49 | net = self.fire_module(net, 64, 256, scope='fire8') 50 | net = slim.max_pool2d(net, [2, 2], stride=2, scope='maxpool8', padding='SAME') 51 | net = self.fire_module(net, 64, 256, scope='fire9') 52 | net = slim.max_pool2d(net, [2, 2], stride=2, scope='maxpool9', padding='SAME') 53 | net = self.fire_module(net, 64, 512, scope='fire10') 54 | 55 | self._act_summaries.append(net) 56 | self._layers['head'] = net 57 | 58 | return net 59 | 60 | def fire_module(self, inputs, 61 | squeeze_depth, 62 | expand_depth, 63 | reuse=None, 64 | scope=None, 65 | outputs_collections=None): 66 | with tf.variable_scope(scope, 'fire', [inputs], reuse=reuse): 67 | with slim.arg_scope([slim.conv2d, slim.max_pool2d], 68 | outputs_collections=None): 69 | net = self.squeeze(inputs, squeeze_depth) 70 | outputs = self.expand(net, expand_depth) 71 | return outputs 72 | 73 | def squeeze(self, inputs, num_outputs): 74 | return slim.conv2d(inputs, num_outputs, [1, 1], stride=1, scope='squeeze') 75 | 76 | def expand(self, inputs, num_outputs): 77 | with tf.variable_scope('expand'): 78 | e1x1 = slim.conv2d(inputs, num_outputs, [1, 1], stride=1, scope='1x1') 79 | e3x3 = slim.conv2d(inputs, num_outputs, [3, 3], scope='3x3') 80 | return tf.concat([e1x1, e3x3], 3) 81 | -------------------------------------------------------------------------------- /tools/freeze_graph.py: -------------------------------------------------------------------------------- 1 | """ 2 | bazel-bin/tensorflow/tools/graph_transforms/transform_graph \ 3 | 
--in_graph=/home/cwq/code/tf_ctpn/model/ctpn.pb \ 4 | --out_graph=/home/cwq/code/tf_ctpn/model/ctpn_optimized.pb \ 5 | --inputs='input' \ 6 | --outputs='vgg_16_1/rpn_bbox_pred/Conv2D,vgg_16_1/rpn_cls_score_reshape' \ 7 | --transforms=' 8 | strip_unused_nodes(name=inputs, type_for_name=float, shape_for_name="1,-1,-1,3") 9 | remove_nodes(op=Identity, op=CheckNumerics) 10 | fold_constants(ignore_errors=true) 11 | fold_batch_norms 12 | fold_old_batch_norms' 13 | """ 14 | 15 | import tensorflow as tf 16 | import argparse 17 | import os 18 | import sys 19 | 20 | 21 | def main(args): 22 | with tf.Graph().as_default(): 23 | with tf.Session() as sess: 24 | # Load the model metagraph and checkpoint 25 | print('Model directory: %s' % args.ckpt_dir) 26 | meta_file, ckpt_file = get_model_filenames(args.ckpt_dir) 27 | 28 | print('Metagraph file: %s' % meta_file) 29 | print('Checkpoint file: %s' % ckpt_file) 30 | 31 | saver = tf.train.import_meta_graph(meta_file, clear_devices=True) 32 | tf.get_default_session().run(tf.global_variables_initializer()) 33 | tf.get_default_session().run(tf.local_variables_initializer()) 34 | saver.restore(sess, ckpt_file) 35 | 36 | input_graph_def = tf.get_default_graph().as_graph_def() 37 | 38 | for node in input_graph_def.node: 39 | if node.name == "vgg_16_1/rpn_bbox_pred/Conv2D": 40 | node.name = "RPN/rpn_bbox_pred/Conv2D" 41 | 42 | if node.name == "vgg_16_1/rpn_cls_score_reshape": 43 | node.name = "RPN/rpn_cls_score_reshape" 44 | 45 | output_node_names = ['RPN/rpn_bbox_pred/Conv2D', 'RPN/rpn_cls_prob_reshape'] 46 | 47 | # We use a built-in TF helper to export variables to constants 48 | output_graph_def = tf.graph_util.convert_variables_to_constants( 49 | sess, # The session is used to retrieve the weights 50 | input_graph_def, # The graph_def is used to retrieve the nodes 51 | output_node_names # The output node names are used to select the usefull nodes 52 | ) 53 | 54 | # Serialize and dump the output graph to the filesystem 55 | with tf.gfile.GFile(args.output_file, 'wb') as f: 56 | f.write(output_graph_def.SerializeToString()) 57 | pb_file_size = f.size() / 1024. / 1024. 
58 | print("%d ops in the final graph: %s, size: %d mb" % 59 | (len(output_graph_def.node), args.output_file, pb_file_size)) 60 | 61 | 62 | def get_model_filenames(model_dir): 63 | ckpt = tf.train.get_checkpoint_state(model_dir) 64 | if ckpt and ckpt.model_checkpoint_path: 65 | ckpt_file_basename = os.path.basename(ckpt.model_checkpoint_path) 66 | meta_file = os.path.join(model_dir, ckpt_file_basename + '.meta') 67 | return meta_file, ckpt.model_checkpoint_path 68 | 69 | 70 | if __name__ == '__main__': 71 | parser = argparse.ArgumentParser() 72 | 73 | parser.add_argument('--ckpt_dir', type=str, default='./checkpoint/crnn', 74 | help='Directory containing the metagraph (.meta) file and the checkpoint (ckpt) file containing model parameters') 75 | 76 | parser.add_argument('--output_file', type=str, default='./model/ctpn.pb', 77 | help='Filename for the exported graphdef protobuf (.pb)') 78 | 79 | args, _ = parser.parse_known_args() 80 | 81 | output_dir = os.path.dirname(args.output_file) 82 | if not os.path.exists(output_dir): 83 | os.makedirs(output_dir) 84 | 85 | main(args) 86 | -------------------------------------------------------------------------------- /lib/utils/visualization.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from six.moves import range 12 | import PIL.Image as Image 13 | import PIL.ImageColor as ImageColor 14 | import PIL.ImageDraw as ImageDraw 15 | import PIL.ImageFont as ImageFont 16 | 17 | STANDARD_COLORS = [ 18 | 'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque', 19 | 'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite', 20 | 'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan', 21 | 'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange', 22 | 'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet', 23 | 'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite', 24 | 'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod', 25 | 'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki', 26 | 'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue', 27 | 'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey', 28 | 'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue', 29 | 'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime', 30 | 'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid', 31 | 'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen', 32 | 'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin', 33 | 'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed', 34 | 'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed', 35 | 'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple', 36 | 'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown', 37 | 'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue', 38 | 'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow', 39 | 'Teal', 
'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White', 40 | 'WhiteSmoke', 'Yellow', 'YellowGreen' 41 | ] 42 | 43 | NUM_COLORS = len(STANDARD_COLORS) 44 | 45 | try: 46 | FONT = ImageFont.truetype('arial.ttf', 24) 47 | except IOError: 48 | FONT = ImageFont.load_default() 49 | 50 | def _draw_single_box(image, xmin, ymin, xmax, ymax, display_str, font, color='black', thickness=4): 51 | draw = ImageDraw.Draw(image) 52 | (left, right, top, bottom) = (xmin, xmax, ymin, ymax) 53 | draw.line([(left, top), (left, bottom), (right, bottom), 54 | (right, top), (left, top)], width=thickness, fill=color) 55 | text_bottom = bottom 56 | # Reverse list and print from bottom to top. 57 | text_width, text_height = font.getsize(display_str) 58 | 59 | margin = np.ceil(0.05 * text_height) 60 | draw.rectangle( 61 | [(left, text_bottom - text_height - 2 * margin), (left + text_width, 62 | text_bottom)], 63 | fill=color) 64 | # draw.text( 65 | # (left + margin, text_bottom - text_height - margin), 66 | # display_str, 67 | # fill='black', 68 | # font=font) 69 | 70 | return image 71 | 72 | def draw_bounding_boxes(image, gt_boxes, im_info): 73 | num_boxes = gt_boxes.shape[0] 74 | gt_boxes_new = gt_boxes.copy() 75 | gt_boxes_new[:,:4] = np.round(gt_boxes_new[:,:4].copy() / im_info[2]) 76 | disp_image = Image.fromarray(np.uint8(image[0])) 77 | 78 | for i in range(num_boxes): 79 | this_class = int(gt_boxes_new[i, 4]) 80 | disp_image = _draw_single_box(disp_image, 81 | gt_boxes_new[i, 0], 82 | gt_boxes_new[i, 1], 83 | gt_boxes_new[i, 2], 84 | gt_boxes_new[i, 3], 85 | 'N%02d-C%02d' % (i, this_class), 86 | FONT, 87 | color=STANDARD_COLORS[this_class % NUM_COLORS]) 88 | 89 | image[0, :] = np.array(disp_image) 90 | return image 91 | -------------------------------------------------------------------------------- /lib/text_connector/text_proposal_connector_oriented.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import numpy as np 3 | from .text_proposal_graph_builder import TextProposalGraphBuilder 4 | 5 | 6 | class TextProposalConnector: 7 | """ 8 | Connect text proposals into text lines 9 | """ 10 | 11 | def __init__(self): 12 | self.graph_builder = TextProposalGraphBuilder() 13 | 14 | def group_text_proposals(self, text_proposals, scores, im_size): 15 | graph = self.graph_builder.build_graph(text_proposals, scores, im_size) 16 | return graph.sub_graphs_connected() 17 | 18 | def fit_y(self, X, Y, x1, x2): 19 | len(X) != 0 20 | # if X only include one point, the function will get line y=Y[0] 21 | if np.sum(X == X[0]) == len(X): 22 | return Y[0], Y[0] 23 | p = np.poly1d(np.polyfit(X, Y, 1)) 24 | return p(x1), p(x2) 25 | 26 | def get_text_lines(self, text_proposals, scores, im_size): 27 | """ 28 | text_proposals:boxes 29 | 30 | """ 31 | # tp=text proposal 32 | tp_groups = self.group_text_proposals(text_proposals, scores, im_size) # 首先还是建图,获取到文本行由哪几个小框构成 33 | 34 | text_lines = np.zeros((len(tp_groups), 8), np.float32) 35 | 36 | for index, tp_indices in enumerate(tp_groups): 37 | text_line_boxes = text_proposals[list(tp_indices)] # 每个文本行的全部小框 38 | X = (text_line_boxes[:, 0] + text_line_boxes[:, 2]) / 2 # 求每一个小框的中心x,y坐标 39 | Y = (text_line_boxes[:, 1] + text_line_boxes[:, 3]) / 2 40 | 41 | z1 = np.polyfit(X, Y, 1) # 多项式拟合,根据之前求的中心店拟合一条直线(最小二乘) 42 | 43 | x0 = np.min(text_line_boxes[:, 0]) # 文本行x坐标最小值 44 | x1 = np.max(text_line_boxes[:, 2]) # 文本行x坐标最大值 45 | 46 | offset = (text_line_boxes[0, 2] - text_line_boxes[0, 0]) * 0.5 # 小框宽度的一半 47 | 48 | # 
以全部小框的左上角这个点去拟合一条直线,然后计算一下文本行x坐标的极左极右对应的y坐标 49 | lt_y, rt_y = self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 1], x0 + offset, x1 - offset) 50 | # 以全部小框的左下角这个点去拟合一条直线,然后计算一下文本行x坐标的极左极右对应的y坐标 51 | lb_y, rb_y = self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 3], x0 + offset, x1 - offset) 52 | 53 | score = scores[list(tp_indices)].sum() / float(len(tp_indices)) # 求全部小框得分的均值作为文本行的均值 54 | 55 | text_lines[index, 0] = x0 56 | text_lines[index, 1] = min(lt_y, rt_y) # 文本行上端 线段 的y坐标的小值 57 | text_lines[index, 2] = x1 58 | text_lines[index, 3] = max(lb_y, rb_y) # 文本行下端 线段 的y坐标的大值 59 | text_lines[index, 4] = score # 文本行得分 60 | text_lines[index, 5] = z1[0] # 根据中心点拟合的直线的k,b 61 | text_lines[index, 6] = z1[1] 62 | height = np.mean((text_line_boxes[:, 3] - text_line_boxes[:, 1])) # 小框平均高度 63 | text_lines[index, 7] = height + 2.5 64 | 65 | text_recs = np.zeros((len(text_lines), 9), np.float) 66 | index = 0 67 | for line in text_lines: 68 | b1 = line[6] - line[7] / 2 # 根据高度和文本行中心线,求取文本行上下两条线的b值 69 | b2 = line[6] + line[7] / 2 70 | x1 = line[0] 71 | y1 = line[5] * line[0] + b1 # 左上 72 | x2 = line[2] 73 | y2 = line[5] * line[2] + b1 # 右上 74 | x3 = line[0] 75 | y3 = line[5] * line[0] + b2 # 左下 76 | x4 = line[2] 77 | y4 = line[5] * line[2] + b2 # 右下 78 | disX = x2 - x1 79 | disY = y2 - y1 80 | width = np.sqrt(disX * disX + disY * disY) # 文本行宽度 81 | 82 | fTmp0 = y3 - y1 # 文本行高度 83 | fTmp1 = fTmp0 * disY / width 84 | x = np.fabs(fTmp1 * disX / width) # 做补偿 85 | y = np.fabs(fTmp1 * disY / width) 86 | if line[5] < 0: 87 | x1 -= x 88 | y1 += y 89 | x4 += x 90 | y4 -= y 91 | else: 92 | x2 += x 93 | y2 += y 94 | x3 -= x 95 | y3 -= y 96 | text_recs[index, 0] = x1 97 | text_recs[index, 1] = y1 98 | text_recs[index, 2] = x2 99 | text_recs[index, 3] = y2 100 | text_recs[index, 4] = x3 101 | text_recs[index, 5] = y3 102 | text_recs[index, 6] = x4 103 | text_recs[index, 7] = y4 104 | text_recs[index, 8] = line[4] 105 | index = index + 1 106 | 107 | return text_recs 108 | -------------------------------------------------------------------------------- /lib/layer_utils/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | 13 | 14 | def generate_anchors(base_height=11, num_anchors=10, anchor_width=16, h_ratio_step=0.7): 15 | """ 16 | Generate anchor windows template by using different hight start from base_size 17 | According to the ctpn paper, anchor's width is always 16 pixels 18 | 19 | Anchor heights in ctpn sorce code: [11, 16, 23, 33, 48, 68, 97, 139, 198, 283] 20 | """ 21 | base_anchor = np.array([1, 1, anchor_width, anchor_width]) - 1 22 | h_ratios = h_ratio_step ** np.arange(0, num_anchors) 23 | 24 | w, h, x_ctr, y_ctr = _whctrs(base_anchor) 25 | ws = np.array([16 for _ in range(num_anchors)]) 26 | 27 | hs = np.ceil(base_height / h_ratios) 28 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 29 | return anchors 30 | 31 | 32 | def generate_anchors_pre(height, width, feat_stride, num_anchors=10, anchor_width=16, anchor_h_ratio_step=0.7): 33 | """ 34 | A wrapper function to generate anchors given by different height scale 35 | :arg 36 | 
height/width: height/width of last shared cnn layer feature map 37 | feat_stride: total stride until the last shared cnn layer 38 | 39 | :returns 40 | anchors: anchors on input image 41 | length: The total number of anchors 42 | """ 43 | # print("width: %d, height: %d" %(width,height)) 44 | anchors = generate_anchors(num_anchors=num_anchors, h_ratio_step=anchor_h_ratio_step, anchor_width=anchor_width) 45 | A = anchors.shape[0] 46 | shift_x = np.arange(0, width) * feat_stride 47 | shift_y = np.arange(0, height) * feat_stride 48 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 49 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose() 50 | K = shifts.shape[0] 51 | # width changes faster, so here it is H, W, C 52 | anchors = anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2)) 53 | anchors = anchors.reshape((K * A, 4)).astype(np.float32, copy=False) 54 | length = np.int32(anchors.shape[0]) 55 | 56 | return anchors, length 57 | 58 | 59 | def _whctrs(anchor): 60 | """ 61 | Return width, height, x center, and y center for an anchor (window). 62 | """ 63 | 64 | w = anchor[2] - anchor[0] + 1 65 | h = anchor[3] - anchor[1] + 1 66 | x_ctr = anchor[0] + 0.5 * (w - 1) 67 | y_ctr = anchor[1] + 0.5 * (h - 1) 68 | return w, h, x_ctr, y_ctr 69 | 70 | 71 | def _mkanchors(ws, hs, x_ctr, y_ctr): 72 | """ 73 | Given a vector of widths (ws) and heights (hs) around a center 74 | (x_ctr, y_ctr), output a set of anchors (windows). 75 | """ 76 | 77 | ws = ws[:, np.newaxis] 78 | hs = hs[:, np.newaxis] 79 | anchors = np.hstack((x_ctr - ws / 2, 80 | y_ctr - hs / 2, 81 | x_ctr + ws / 2, 82 | y_ctr + hs / 2)).astype(np.int32) 83 | return anchors 84 | 85 | 86 | def _ratio_enum(anchor, ratios): 87 | """ 88 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 89 | """ 90 | 91 | w, h, x_ctr, y_ctr = _whctrs(anchor) 92 | size = w * h 93 | size_ratios = size / ratios 94 | ws = np.round(np.sqrt(size_ratios)) 95 | hs = np.round(ws * ratios) 96 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 97 | return anchors 98 | 99 | 100 | def _scale_enum(anchor, scales): 101 | """ 102 | Enumerate a set of anchors for each scale wrt an anchor. 
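    Kept from the original Faster R-CNN code; CTPN's generate_anchors above fixes
    the anchor width at 16 and varies only the height, so this helper is not used
    by the anchor generation in this file.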
103 | """ 104 | 105 | w, h, x_ctr, y_ctr = _whctrs(anchor) 106 | ws = w * scales 107 | hs = h * scales 108 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 109 | return anchors 110 | 111 | 112 | if __name__ == '__main__': 113 | import time 114 | 115 | t = time.time() 116 | anchors = generate_anchors(base_height=11, num_anchors=6, anchor_width=16, h_ratio_step=0.7) 117 | print(anchors) 118 | for anchor in anchors: 119 | print(anchor[3] - anchor[1]) 120 | 121 | # c, length = generate_anchors_pre(47, 37, 16, 6) 122 | # print(c) 123 | # print(length) 124 | -------------------------------------------------------------------------------- /tools/icdar13_split_label.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import math 4 | import cv2 as cv 5 | 6 | # path = '/media/D/code/OCR/text-detection-ctpn/data/mlt_english+chinese/image' 7 | # gt_path = '/media/D/code/OCR/text-detection-ctpn/data/mlt_english+chinese/label' 8 | 9 | path = '/home/cwq/data/ICDAR13/Challenge2_Training_Task12_Images' 10 | gt_path = '/home/cwq/data/ICDAR13/Challenge2_Training_Task1_GT' 11 | 12 | out_path = '/home/cwq/data/ICDAR13/Challenge2_Training_Task12_Images_splited' 13 | label_out_path = '/home/cwq/data/ICDAR13/Challenge2_Training_Task1_GT_splited' 14 | if not os.path.exists(out_path): 15 | os.makedirs(out_path) 16 | files = os.listdir(path) 17 | files.sort() 18 | # files=files[:100] 19 | for file in files: 20 | _, basename = os.path.split(file) 21 | if basename.lower().split('.')[-1] not in ['jpg', 'png']: 22 | continue 23 | stem, ext = os.path.splitext(basename) 24 | gt_file = os.path.join(gt_path, 'gt_' + stem + '.txt') 25 | img_path = os.path.join(path, file) 26 | print(img_path) 27 | img = cv.imread(img_path) 28 | img_size = img.shape 29 | im_size_min = np.min(img_size[0:2]) 30 | im_size_max = np.max(img_size[0:2]) 31 | 32 | im_scale = float(600) / float(im_size_min) 33 | if np.round(im_scale * im_size_max) > 1200: 34 | im_scale = float(1200) / float(im_size_max) 35 | re_im = cv.resize(img, None, None, fx=im_scale, fy=im_scale, interpolation=cv.INTER_AREA) 36 | re_size = re_im.shape 37 | cv.imwrite(os.path.join(out_path, stem) + '.jpg', re_im) 38 | 39 | with open(gt_file, 'r') as f: 40 | lines = f.readlines() 41 | for line in lines: 42 | splitted_line = line.strip().lower().split(' ') 43 | splitted_line = [int(n) for n in splitted_line[:-1]] 44 | 45 | xmin = int(splitted_line[0] * im_scale) 46 | ymin = int(splitted_line[1] * im_scale) 47 | xmax = int(splitted_line[2] * im_scale) 48 | ymax = int(splitted_line[3] * im_scale) 49 | 50 | if xmin < 0: 51 | xmin = 0 52 | if xmax > re_size[1] - 1: 53 | xmax = re_size[1] - 1 54 | if ymin < 0: 55 | ymin = 0 56 | if ymax > re_size[0] - 1: 57 | ymax = re_size[0] - 1 58 | 59 | width = xmax - xmin 60 | height = ymax - ymin 61 | 62 | # 将完整的文字区域切分为宽度为 16 的小区域 63 | step = 16.0 64 | xmins = [] 65 | 66 | anchor_count = int(math.ceil(width / step)) 67 | for i in range(anchor_count): 68 | xmins.append(i * int(step) + xmin) 69 | 70 | if not os.path.exists(label_out_path): 71 | os.makedirs(label_out_path) 72 | 73 | with open(os.path.join(label_out_path, "gt_" + stem) + '.txt', 'a') as f: 74 | for i in range(len(xmins)): 75 | f.writelines(str(xmins[i])) 76 | f.writelines(" ") 77 | f.writelines(str(int(ymin))) 78 | f.writelines(" ") 79 | # anchor box 的宽度为 16, 80 | f.writelines(str(int(xmins[i] + step - 1))) 81 | f.writelines(" ") 82 | f.writelines(str(int(ymax))) 83 | f.writelines("\n") 84 | 85 | # reimplement 86 | # 
step = 16.0 87 | # x_left = [] 88 | # x_right = [] 89 | # x_left.append(xmin) 90 | # x_left_start = int(math.ceil(xmin / 16.0) * 16.0) 91 | # if x_left_start == xmin: 92 | # x_left_start = xmin + 16 93 | # for i in np.arange(x_left_start, xmax, 16): 94 | # x_left.append(i) 95 | # x_left = np.array(x_left) 96 | # 97 | # x_right.append(x_left_start - 1) 98 | # for i in range(1, len(x_left) - 1): 99 | # x_right.append(x_left[i] + 15) 100 | # x_right.append(xmax) 101 | # x_right = np.array(x_right) 102 | # 103 | # idx = np.where(x_left == x_right) 104 | # x_left = np.delete(x_left, idx, axis=0) 105 | # x_right = np.delete(x_right, idx, axis=0) 106 | # 107 | # if not os.path.exists(label_out_path): 108 | # os.makedirs(label_out_path) 109 | # 110 | # with open(os.path.join(label_out_path, "gt_" + stem) + '.txt', 'a') as f: 111 | # for i in range(len(x_left)): 112 | # f.writelines(str(int(x_left[i]))) 113 | # f.writelines(" ") 114 | # f.writelines(str(int(ymin))) 115 | # f.writelines(" ") 116 | # f.writelines(str(int(x_right[i]))) 117 | # f.writelines(" ") 118 | # f.writelines(str(int(ymax))) 119 | # f.writelines("\n") 120 | -------------------------------------------------------------------------------- /tools/icdar13_to_voc.py: -------------------------------------------------------------------------------- 1 | from xml.dom.minidom import Document 2 | import cv2 3 | import os 4 | import glob 5 | import shutil 6 | import numpy as np 7 | 8 | from tools.convert_utils import build_voc_dirs 9 | 10 | 11 | def generate_xml(img_name, lines, img_size, class_sets): 12 | doc = Document() 13 | 14 | def append_xml_node_attr(child, parent=None, text=None): 15 | ele = doc.createElement(child) 16 | if not text is None: 17 | text_node = doc.createTextNode(text) 18 | ele.appendChild(text_node) 19 | parent = doc if parent is None else parent 20 | parent.appendChild(ele) 21 | return ele 22 | 23 | cls = 'text' 24 | 25 | # create header 26 | annotation = append_xml_node_attr('annotation') 27 | append_xml_node_attr('folder', parent=annotation, text='text') 28 | append_xml_node_attr('filename', parent=annotation, text=img_name) 29 | source = append_xml_node_attr('source', parent=annotation) 30 | append_xml_node_attr('database', parent=source, text='coco_text_database') 31 | append_xml_node_attr('annotation', parent=source, text='text') 32 | append_xml_node_attr('image', parent=source, text='text') 33 | append_xml_node_attr('flickrid', parent=source, text='000000') 34 | owner = append_xml_node_attr('owner', parent=annotation) 35 | append_xml_node_attr('name', parent=owner, text='ms') 36 | size = append_xml_node_attr('size', annotation) 37 | append_xml_node_attr('width', size, str(img_size[1])) 38 | append_xml_node_attr('height', size, str(img_size[0])) 39 | append_xml_node_attr('depth', size, str(img_size[2])) 40 | append_xml_node_attr('segmented', parent=annotation, text='0') 41 | 42 | # create objects 43 | objs = [] 44 | for line in lines: 45 | splitted_line = line.strip().lower().split() 46 | 47 | obj = append_xml_node_attr('object', parent=annotation) 48 | occlusion = int(0) 49 | x1, y1, x2, y2 = int(float(splitted_line[0])), int(float(splitted_line[1])), \ 50 | int(float(splitted_line[2])), int(float(splitted_line[3])) 51 | 52 | truncation = float(0) 53 | difficult = 0 54 | truncted = 0 if truncation < 0.5 else 1 55 | 56 | append_xml_node_attr('name', parent=obj, text=cls) 57 | append_xml_node_attr('pose', parent=obj, text='none') 58 | append_xml_node_attr('truncated', parent=obj, text=str(truncted)) 59 | 
append_xml_node_attr('difficult', parent=obj, text=str(int(difficult))) 60 | bb = append_xml_node_attr('bndbox', parent=obj) 61 | append_xml_node_attr('xmin', parent=bb, text=str(x1)) 62 | append_xml_node_attr('ymin', parent=bb, text=str(y1)) 63 | append_xml_node_attr('xmax', parent=bb, text=str(x2)) 64 | append_xml_node_attr('ymax', parent=bb, text=str(y2)) 65 | 66 | o = {'class': cls, 'box': np.asarray([x1, y1, x2, y2], dtype=float), 67 | 'truncation': truncation, 'difficult': difficult, 'occlusion': occlusion} 68 | objs.append(o) 69 | 70 | return doc, objs 71 | 72 | 73 | def _is_hard(cls, truncation, occlusion, x1, y1, x2, y2): 74 | hard = False 75 | if y2 - y1 < 25 and occlusion >= 2: 76 | hard = True 77 | return hard 78 | if occlusion >= 3: 79 | hard = True 80 | return hard 81 | if truncation > 0.8: 82 | hard = True 83 | return hard 84 | return hard 85 | 86 | 87 | if __name__ == '__main__': 88 | outdir = '/home/cwq/data/ICDAR13/icdar13_voc' 89 | dest_label_dir, dest_img_dir, dest_set_dir = build_voc_dirs(outdir) 90 | 91 | for dset in ['train']: 92 | _labeldir = '/home/cwq/data/ICDAR13/Challenge2_Training_Task1_GT_splited' 93 | _imagedir = '/home/cwq/data/ICDAR13/Challenge2_Training_Task12_Images_splited' 94 | class_sets = ('text', 'dontcare') 95 | class_sets_dict = dict((k, i) for i, k in enumerate(class_sets)) 96 | fs = [open(os.path.join(dest_set_dir, cls + '_' + dset + '.txt'), 'w') for cls in class_sets] 97 | ftrain = open(os.path.join(dest_set_dir, dset + '.txt'), 'w') 98 | 99 | files = glob.glob(os.path.join(_labeldir, '*.txt')) 100 | files.sort() 101 | for file in files: 102 | path, basename = os.path.split(file) 103 | stem, ext = os.path.splitext(basename) 104 | img_id = stem.split('_')[1] 105 | img_name = img_id + '.jpg' 106 | stem = "icdar13_" + img_id 107 | with open(file, 'r') as f: 108 | lines = f.readlines() 109 | img_file = os.path.join(_imagedir, img_name) 110 | 111 | print(img_file) 112 | img = cv2.imread(img_file) 113 | img_size = img.shape 114 | 115 | save_img_name = "icdar13_" + img_name 116 | doc, objs = generate_xml(save_img_name, lines, img_size, class_sets=class_sets) 117 | 118 | cv2.imwrite(os.path.join(dest_img_dir, save_img_name), img) 119 | xmlfile = os.path.join(dest_label_dir, stem + '.xml') 120 | 121 | with open(xmlfile, 'w') as f: 122 | f.write(doc.toprettyxml(indent=' ')) 123 | 124 | ftrain.writelines(stem + '\n') 125 | 126 | (f.close() for f in fs) 127 | ftrain.close() 128 | -------------------------------------------------------------------------------- /tools/trainval_net.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Zheqi He, Xinlei Chen, based on code from Ross Girshick 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import _init_paths 11 | from model.train_val import get_training_roidb, train_net 12 | from model.config import cfg, cfg_from_file, cfg_from_list, get_output_dir, get_output_tb_dir 13 | from datasets.factory import get_imdb 14 | import datasets.imdb 15 | import argparse 16 | import pprint 17 | import numpy as np 18 | import sys 19 | 20 | import tensorflow as tf 21 | from nets.vgg16 import vgg16 22 | from nets.resnet_v1 import Resnetv1 23 | from nets.squeezenet import SqueezeNet 24 | 
from nets.mobilenet_v2 import MobileNetV2 25 | 26 | 27 | def parse_args(): 28 | """ 29 | Parse input arguments 30 | """ 31 | parser = argparse.ArgumentParser(description='Train a CTPN network') 32 | parser.add_argument('--cfg', dest='cfg_file', 33 | help='optional config file', 34 | default='./data/cfgs/vgg16.yml', type=str) 35 | parser.add_argument('--pretrained_model', 36 | default=None, 37 | help='path to pretrained model, initialize with pretrained model weights', 38 | type=str) 39 | parser.add_argument('--imdb', dest='imdb_name', 40 | help='dataset to train on', 41 | default='voc_2007_trainval', type=str) 42 | parser.add_argument('--imdbval', dest='imdbval_name', 43 | help='dataset to validate on', 44 | default='voc_2007_test', type=str) 45 | parser.add_argument('--iters', dest='max_iters', 46 | help='number of iterations to train', 47 | default=50000, type=int) 48 | parser.add_argument('--tag', dest='tag', 49 | help='tag of the model', 50 | default=None, type=str) 51 | parser.add_argument('--net', dest='net', 52 | help='vgg16, res50, res101, res152, mobile, squeeze', 53 | choices=['vgg16', 'res50', 'res101', 'res152', 'mobile', 'squeeze'], 54 | default='vgg16', type=str) 55 | parser.add_argument('--set', dest='set_cfgs', 56 | help='set config keys', default=None, 57 | nargs=argparse.REMAINDER) 58 | 59 | args = parser.parse_args() 60 | return args 61 | 62 | 63 | def combined_roidb(imdb_names): 64 | """ 65 | Combine multiple roidbs 66 | """ 67 | 68 | def get_roidb(imdb_name): 69 | imdb = get_imdb(imdb_name) 70 | print('Loaded dataset `{:s}` for training'.format(imdb.name)) 71 | imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD) 72 | print('Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)) 73 | roidb = get_training_roidb(imdb) 74 | return roidb 75 | 76 | roidbs = [get_roidb(s) for s in imdb_names.split('+')] 77 | roidb = roidbs[0] 78 | if len(roidbs) > 1: 79 | for r in roidbs[1:]: 80 | roidb.extend(r) 81 | tmp = get_imdb(imdb_names.split('+')[1]) 82 | imdb = datasets.imdb.imdb(imdb_names, tmp.classes) 83 | else: 84 | imdb = get_imdb(imdb_names) 85 | return imdb, roidb 86 | 87 | 88 | if __name__ == '__main__': 89 | args = parse_args() 90 | 91 | print('Called with args:') 92 | print(args) 93 | 94 | if args.cfg_file is not None: 95 | cfg_from_file(args.cfg_file) 96 | if args.set_cfgs is not None: 97 | cfg_from_list(args.set_cfgs) 98 | 99 | print('Using config:') 100 | pprint.pprint(cfg) 101 | 102 | np.random.seed(cfg.RNG_SEED) 103 | 104 | # train set 105 | # imdb, roidb = combined_roidb(args.imdb_name) 106 | imdb = get_imdb(args.imdb_name) 107 | roidb = get_training_roidb(imdb) 108 | print('{:d} roidb entries'.format(len(roidb))) 109 | 110 | # output directory where the models are saved 111 | output_dir = get_output_dir(imdb, args.tag) 112 | print('Output will be saved to `{:s}`'.format(output_dir)) 113 | 114 | # tensorboard directory where the summaries are saved during training 115 | tb_dir = get_output_tb_dir(imdb, args.tag) 116 | print('TensorFlow summaries will be saved to `{:s}`'.format(tb_dir)) 117 | 118 | # also add the validation set, but with no flipping images 119 | orgflip = cfg.TRAIN.USE_FLIPPED 120 | cfg.TRAIN.USE_FLIPPED = False 121 | _, valroidb = combined_roidb(args.imdbval_name) 122 | print('{:d} validation roidb entries'.format(len(valroidb))) 123 | cfg.TRAIN.USE_FLIPPED = orgflip 124 | 125 | # load network 126 | if args.net == 'vgg16': 127 | net = vgg16() 128 | elif args.net == 'res50': 129 | net = Resnetv1(num_layers=50) 130 | elif args.net == 'res101': 
131 | net = Resnetv1(num_layers=101) 132 | elif args.net == 'res152': 133 | net = Resnetv1(num_layers=152) 134 | elif args.net == 'mobile': 135 | net = MobileNetV2() 136 | elif args.net == 'squeeze': 137 | net = SqueezeNet() 138 | else: 139 | raise NotImplementedError 140 | 141 | train_net(net, imdb, roidb, valroidb, output_dir, tb_dir, 142 | pretrained_model=args.pretrained_model, 143 | max_iters=args.max_iters) 144 | -------------------------------------------------------------------------------- /lib/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include 10 | #include 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks 
+ col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(&current_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<<blocks, threads>>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector<unsigned long long> mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector<unsigned long long> remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /lib/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | import numpy as np 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # adapted from http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 19 | for dir in path.split(os.pathsep): 20 | binpath = pjoin(dir, name) 21 | if os.path.exists(binpath): 22 | return os.path.abspath(binpath) 23 | return None 24 | 25 | 26 | def locate_cuda(): 27 | """Locate the CUDA environment on the system 28 | 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | 32 | Starts by looking for the CUDAHOME env variable. If not found, everything 33 | is based on finding 'nvcc' in the PATH.
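# [Editor's note] The CUDA kernel in lib/nms/nms_kernel.cu above computes pairwise
# IoU suppression bitmasks on the GPU, and the host loop in _nms() then walks the
# score-sorted boxes and keeps each box not yet suppressed by an earlier kept box.
# A minimal NumPy sketch of that greedy rule, for reference only; the function name
# and code are the editor's illustration, not part of this repository:
import numpy as np

def greedy_nms(boxes, thresh):
    """boxes: (N, 4) array [x1, y1, x2, y2], already sorted by descending score."""
    keep = []
    suppressed = np.zeros(len(boxes), dtype=bool)
    areas = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
    for i in range(len(boxes)):
        if suppressed[i]:
            continue
        keep.append(i)
        # IoU of box i against all later boxes, same formula as devIoU above
        xx1 = np.maximum(boxes[i, 0], boxes[i + 1:, 0])
        yy1 = np.maximum(boxes[i, 1], boxes[i + 1:, 1])
        xx2 = np.minimum(boxes[i, 2], boxes[i + 1:, 2])
        yy2 = np.minimum(boxes[i, 3], boxes[i + 1:, 3])
        inter = np.maximum(xx2 - xx1 + 1, 0) * np.maximum(yy2 - yy1 + 1, 0)
        iou = inter / (areas[i] + areas[i + 1:] - inter)
        suppressed[i + 1:] |= iou > thresh
    return keep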
34 | """ 35 | 36 | # first check if the CUDAHOME env variable is in use 37 | if 'CUDAHOME' in os.environ: 38 | home = os.environ['CUDAHOME'] 39 | nvcc = pjoin(home, 'bin', 'nvcc') 40 | else: 41 | # otherwise, search the PATH for NVCC 42 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 43 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 44 | if nvcc is None: 45 | raise EnvironmentError('The nvcc binary could not be ' 46 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 47 | home = os.path.dirname(os.path.dirname(nvcc)) 48 | 49 | cudaconfig = {'home': home, 'nvcc': nvcc, 50 | 'include': pjoin(home, 'include'), 51 | 'lib64': pjoin(home, 'lib64')} 52 | for k, v in cudaconfig.items(): 53 | if not os.path.exists(v): 54 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 55 | 56 | return cudaconfig 57 | 58 | 59 | CUDA = locate_cuda() 60 | 61 | # Obtain the numpy include directory. This logic works across numpy versions. 62 | try: 63 | numpy_include = np.get_include() 64 | except AttributeError: 65 | numpy_include = np.get_numpy_include() 66 | 67 | 68 | def customize_compiler_for_nvcc(self): 69 | """inject deep into distutils to customize how the dispatch 70 | to gcc/nvcc works. 71 | 72 | If you subclass UnixCCompiler, it's not trivial to get your subclass 73 | injected in, and still have the right customizations (i.e. 74 | distutils.sysconfig.customize_compiler) run on it. So instead of going 75 | the OO route, I have this. Note, it's kindof like a wierd functional 76 | subclassing going on.""" 77 | 78 | # tell the compiler it can processes .cu 79 | self.src_extensions.append('.cu') 80 | 81 | # save references to the default compiler_so and _comple methods 82 | default_compiler_so = self.compiler_so 83 | super = self._compile 84 | 85 | # now redefine the _compile method. This gets executed for each 86 | # object but distutils doesn't have the ability to change compilers 87 | # based on source extension: we add it. 
88 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 89 | print(extra_postargs) 90 | if os.path.splitext(src)[1] == '.cu': 91 | # use the cuda for .cu files 92 | self.set_executable('compiler_so', CUDA['nvcc']) 93 | # use only a subset of the extra_postargs, which are 1-1 translated 94 | # from the extra_compile_args in the Extension class 95 | postargs = extra_postargs['nvcc'] 96 | else: 97 | postargs = extra_postargs['gcc'] 98 | 99 | super(obj, src, ext, cc_args, postargs, pp_opts) 100 | # reset the default compiler_so, which we might have changed for cuda 101 | self.compiler_so = default_compiler_so 102 | 103 | # inject our redefined _compile method into the class 104 | self._compile = _compile 105 | 106 | 107 | # run the customize_compiler 108 | class custom_build_ext(build_ext): 109 | def build_extensions(self): 110 | customize_compiler_for_nvcc(self.compiler) 111 | build_ext.build_extensions(self) 112 | 113 | 114 | ext_modules = [ 115 | Extension( 116 | "utils.cython_bbox", 117 | ["utils/bbox.pyx"], 118 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 119 | include_dirs=[numpy_include] 120 | ), 121 | Extension( 122 | "nms.cpu_nms", 123 | ["nms/cpu_nms.pyx"], 124 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 125 | include_dirs=[numpy_include] 126 | ), 127 | Extension('nms.gpu_nms', 128 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 129 | library_dirs=[CUDA['lib64']], 130 | libraries=['cudart'], 131 | language='c++', 132 | runtime_library_dirs=[CUDA['lib64']], 133 | # this syntax is specific to this build system 134 | # we're only going to use certain compiler args with nvcc and not with gcc 135 | # the implementation of this trick is in customize_compiler() below 136 | extra_compile_args={'gcc': ["-Wno-unused-function"], 137 | 'nvcc': ['-arch=sm_52', 138 | '--ptxas-options=-v', 139 | '-c', 140 | '--compiler-options', 141 | "'-fPIC'"]}, 142 | include_dirs=[numpy_include, CUDA['include']] 143 | ) 144 | ] 145 | 146 | setup( 147 | name='tf_faster_rcnn', 148 | ext_modules=ext_modules, 149 | # inject our custom trigger 150 | cmdclass={'build_ext': custom_build_ext}, 151 | ) 152 | -------------------------------------------------------------------------------- /tools/icdar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import glob 8 | import time 9 | from zipfile import ZipFile 10 | 11 | import _init_paths 12 | from model.config import cfg 13 | from model.test import im_detect 14 | from model.nms_wrapper import nms 15 | from text_connector import TextDetector 16 | 17 | from utils.timer import Timer 18 | import tensorflow as tf 19 | import matplotlib.pyplot as plt 20 | import numpy as np 21 | import os 22 | import cv2 23 | import argparse 24 | 25 | from nets.vgg16 import vgg16 26 | from nets.resnet_v1 import Resnetv1 27 | from nets.mobilenet_v2 import MobileNetV2 28 | 29 | from utils import helper 30 | 31 | from demo import recover_scale 32 | 33 | CLASSES = ('__background__', 'text') 34 | 35 | 36 | def demo(sess, net, im_file, icdar_dir, oriented=False, ltrb=False): 37 | """Detect object classes in an image using pre-computed object proposals.""" 38 | 39 | # Load the demo image 40 | im = helper.read_rgb_img(im_file) 41 | 42 | # Detect all object classes and regress object bounds 43 | timer = Timer() 44 | timer.tic() 45 | scores, boxes, 
resized_im_shape, im_scale = im_detect(sess, net, im) 46 | timer.toc() 47 | 48 | # Run TextDetector to merge small box 49 | line_detector = TextDetector(oriented) 50 | 51 | # text_lines point order: left-top, right-top, left-bottom, right-bottom 52 | text_lines = line_detector.detect(boxes, scores[:, np.newaxis], resized_im_shape) 53 | print("Image %s, detect %d text lines in %.3fs" % (im_file, len(text_lines), timer.diff)) 54 | 55 | if len(text_lines) != 0: 56 | text_lines = recover_scale(text_lines, im_scale) 57 | 58 | return save_result_txt(text_lines, icdar_dir, im_file, ltrb) 59 | 60 | 61 | def save_result_txt(text_lines, icdar_dir, im_file, ltrb=False): 62 | # ICDAR need box points in clockwise 63 | boxes = [[l[0], l[1], l[2], l[3], l[6], l[7], l[4], l[5]] for l in text_lines] 64 | 65 | im_name = im_file.split('/')[-1].split('.')[0] 66 | res_file = os.path.join(icdar_dir, 'res_%s.txt' % im_name) 67 | if not os.path.exists(icdar_dir): 68 | os.makedirs(icdar_dir) 69 | 70 | with open(res_file, mode='w') as f: 71 | for line in boxes: 72 | if ltrb: 73 | min_x = min([line[0], line[2], line[4], line[6]]) 74 | min_y = min([line[1], line[3], line[5], line[7]]) 75 | max_x = max([line[0], line[2], line[4], line[6]]) 76 | max_y = max([line[1], line[3], line[5], line[7]]) 77 | 78 | f.write('%d,%d,%d,%d\n' % (min_x, min_y, max_x, max_y)) 79 | else: 80 | f.write('%d,%d,%d,%d,%d,%d,%d,%d\n' % (line[0], line[1], line[2], line[3], 81 | line[4], line[5], line[6], line[7])) 82 | return res_file 83 | 84 | 85 | def parse_args(): 86 | """Parse input arguments.""" 87 | parser = argparse.ArgumentParser(description='Test images, and save result as ICDAR challenge format') 88 | parser.add_argument('--net', dest='net', choices=['vgg16', 'res101'], default='vgg16') 89 | parser.add_argument('--img_dir', default='./data/demo') 90 | parser.add_argument('--dataset', dest='dataset', help='model tag', default='voc_2007_trainval') 91 | parser.add_argument('--tag', dest='tag', help='model tag', default='default') 92 | parser.add_argument('-o', '--oriented', action='store_true', default=False, help='output rotated detect box') 93 | parser.add_argument('-c', '--challenge', type=str, help='Which challenge to run', 94 | choices=[ 95 | 'ICDAR13', # ICDAR13 - Focused Scene Text 96 | 'ICDAR15', # ICDAR15 - Challenge 4 - Incidental Scene Text 97 | 'MLT17' # Multi-lingual scene text detection 98 | ]) 99 | args = parser.parse_args() 100 | 101 | if not os.path.exists(args.img_dir): 102 | print("img dir not exists.") 103 | exit(-1) 104 | 105 | args.result_dir = os.path.join('./data/result', args.tag) 106 | if not os.path.exists(args.result_dir): 107 | os.makedirs(args.result_dir) 108 | 109 | return args 110 | 111 | 112 | if __name__ == '__main__': 113 | args = parse_args() 114 | 115 | # model path 116 | netname = args.net 117 | dataset = args.dataset 118 | 119 | ckpt_dir = os.path.join('output', netname, dataset, args.tag) 120 | ckpt = tf.train.get_checkpoint_state(ckpt_dir) 121 | 122 | # set config 123 | tfconfig = tf.ConfigProto(allow_soft_placement=True) 124 | tfconfig.gpu_options.allow_growth = True 125 | 126 | # init session 127 | sess = tf.Session(config=tfconfig) 128 | # load network 129 | if netname == 'vgg16': 130 | net = vgg16() 131 | elif netname == 'res101': 132 | net = Resnetv1(num_layers=101) 133 | elif netname == 'mobile': 134 | net = MobileNetV2() 135 | else: 136 | raise NotImplementedError 137 | 138 | cfg.USE_GPU_NMS = True 139 | net.create_architecture("TEST", 140 | num_classes=len(CLASSES), 141 | tag=args.tag, 
142 | anchor_width=cfg.CTPN.ANCHOR_WIDTH, 143 | anchor_h_ratio_step=cfg.CTPN.H_RADIO_STEP, 144 | num_anchors=cfg.CTPN.NUM_ANCHORS) 145 | saver = tf.train.Saver() 146 | saver.restore(sess, ckpt.model_checkpoint_path) 147 | 148 | print('Loaded network {:s}'.format(ckpt.model_checkpoint_path)) 149 | 150 | txt_files = [] 151 | icdar_dir = os.path.join(args.result_dir, args.challenge) 152 | 153 | ltrb = False 154 | if args.challenge in ['ICDAR13', 'ICDAR13_Det']: 155 | ltrb = True 156 | 157 | im_files = glob.glob(args.img_dir + "/*.*") 158 | for im_file in im_files: 159 | txt_file = demo(sess, net, im_file, icdar_dir, oriented=args.oriented, ltrb=ltrb) 160 | txt_files.append(txt_file) 161 | 162 | zip_path = os.path.join('./data/ICDAR_submit', '%s_%s_submit.zip' % (args.challenge, args.tag)) 163 | print(os.path.abspath(zip_path)) 164 | with ZipFile(zip_path, 'w') as f: 165 | for txt in txt_files: 166 | f.write(txt, txt.split('/')[-1]) 167 | -------------------------------------------------------------------------------- /lib/nets/resnet_v1.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Zheqi He and Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import tensorflow as tf 11 | import tensorflow.contrib.slim as slim 12 | from tensorflow.contrib.slim import losses 13 | from tensorflow.contrib.slim import arg_scope 14 | from tensorflow.contrib.slim.python.slim.nets import resnet_utils 15 | from tensorflow.contrib.slim.python.slim.nets import resnet_v1 16 | from tensorflow.contrib.slim.python.slim.nets.resnet_v1 import resnet_v1_block 17 | import numpy as np 18 | 19 | from nets.network import Network 20 | from model.config import cfg 21 | 22 | 23 | def resnet_arg_scope(is_training=True, 24 | batch_norm_decay=0.997, 25 | batch_norm_epsilon=1e-5, 26 | batch_norm_scale=True): 27 | batch_norm_params = { 28 | 'is_training': False, 29 | 'decay': batch_norm_decay, 30 | 'epsilon': batch_norm_epsilon, 31 | 'scale': batch_norm_scale, 32 | 'trainable': False, 33 | 'updates_collections': tf.GraphKeys.UPDATE_OPS 34 | } 35 | 36 | with arg_scope( 37 | [slim.conv2d], 38 | weights_regularizer=slim.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY), 39 | weights_initializer=slim.variance_scaling_initializer(), 40 | trainable=is_training, 41 | activation_fn=tf.nn.relu, 42 | normalizer_fn=slim.batch_norm, 43 | normalizer_params=batch_norm_params): 44 | with arg_scope([slim.batch_norm], **batch_norm_params) as arg_sc: 45 | return arg_sc 46 | 47 | 48 | class Resnetv1(Network): 49 | def __init__(self, num_layers=50): 50 | Network.__init__(self) 51 | self._feat_stride = [16, ] 52 | self._num_layers = num_layers 53 | self._scope = 'resnet_v1_%d' % num_layers 54 | self._decide_blocks() 55 | 56 | # Do the first few layers manually, because 'SAME' padding can behave inconsistently 57 | # for images of different sizes: sometimes 0, sometimes 1 58 | def _build_base(self): 59 | with tf.variable_scope(self._scope, self._scope): 60 | net = resnet_utils.conv2d_same(self._image, 64, 7, stride=2, scope='conv1') 61 | net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]]) 62 | net = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', scope='pool1') 63 | 64 | return net 65 | 66 | def 
_image_to_head(self, is_training, reuse=None): 67 | assert (0 <= cfg.RESNET.FIXED_BLOCKS <= 3) 68 | # Now the base is always fixed during training 69 | with slim.arg_scope(resnet_arg_scope(is_training=False)): 70 | net_conv = self._build_base() 71 | if cfg.RESNET.FIXED_BLOCKS > 0: 72 | with slim.arg_scope(resnet_arg_scope(is_training=False)): 73 | net_conv, _ = resnet_v1.resnet_v1(net_conv, 74 | self._blocks[0:cfg.RESNET.FIXED_BLOCKS], 75 | global_pool=False, 76 | include_root_block=False, 77 | reuse=reuse, 78 | scope=self._scope) 79 | if cfg.RESNET.FIXED_BLOCKS < 3: 80 | with slim.arg_scope(resnet_arg_scope(is_training=is_training)): 81 | net_conv, _ = resnet_v1.resnet_v1(net_conv, 82 | self._blocks[cfg.RESNET.FIXED_BLOCKS:-1], 83 | global_pool=False, 84 | include_root_block=False, 85 | reuse=reuse, 86 | scope=self._scope) 87 | 88 | self._act_summaries.append(net_conv) 89 | self._layers['head'] = net_conv 90 | 91 | return net_conv 92 | 93 | def _decide_blocks(self): 94 | # choose different blocks for different number of layers 95 | if self._num_layers == 50: 96 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), 97 | resnet_v1_block('block2', base_depth=128, num_units=4, stride=2), 98 | # use stride 1 for the last conv4 layer 99 | resnet_v1_block('block3', base_depth=256, num_units=6, stride=1), 100 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)] 101 | 102 | elif self._num_layers == 101: 103 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), 104 | resnet_v1_block('block2', base_depth=128, num_units=4, stride=2), 105 | # use stride 1 for the last conv4 layer 106 | resnet_v1_block('block3', base_depth=256, num_units=23, stride=1), 107 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)] 108 | 109 | elif self._num_layers == 152: 110 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), 111 | resnet_v1_block('block2', base_depth=128, num_units=8, stride=2), 112 | # use stride 1 for the last conv4 layer 113 | resnet_v1_block('block3', base_depth=256, num_units=36, stride=1), 114 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)] 115 | 116 | else: 117 | # other numbers are not supported 118 | raise NotImplementedError 119 | 120 | def get_variables_to_restore(self, variables, var_keep_dic): 121 | variables_to_restore = [] 122 | 123 | for v in variables: 124 | # exclude the first conv layer to swap RGB to BGR 125 | if v.name == (self._scope + '/conv1/weights:0'): 126 | self._variables_to_fix[v.name] = v 127 | continue 128 | if v.name.split(':')[0] in var_keep_dic: 129 | print('Variables restored: %s' % v.name) 130 | variables_to_restore.append(v) 131 | 132 | return variables_to_restore 133 | 134 | -------------------------------------------------------------------------------- /lib/nets/mobilenet/mobilenet_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Implementation of Mobilenet V2. 16 | Architecture: https://arxiv.org/abs/1801.04381 17 | The base model gives 72.2% accuracy on ImageNet, with 300MMadds, 18 | 3.4 M parameters. 19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import copy 26 | 27 | import tensorflow as tf 28 | 29 | from nets.mobilenet import conv_blocks as ops 30 | from nets.mobilenet import mobilenet as lib 31 | 32 | slim = tf.contrib.slim 33 | op = lib.op 34 | 35 | expand_input = ops.expand_input_by_factor 36 | 37 | # Architecture: https://arxiv.org/abs/1801.04381 38 | V2_DEF = dict( 39 | defaults={ 40 | # Note: these parameters of batch norm affect the architecture 41 | # that's why they are here and not in training_scope. 42 | (slim.batch_norm,): {'center': True, 'scale': True}, 43 | (slim.conv2d, slim.fully_connected, slim.separable_conv2d): { 44 | 'normalizer_fn': slim.batch_norm, 'activation_fn': tf.nn.relu6 45 | }, 46 | (ops.expanded_conv,): { 47 | 'expansion_size': expand_input(6), 48 | 'split_expansion': 1, 49 | 'normalizer_fn': slim.batch_norm, 50 | 'residual': True 51 | }, 52 | (slim.conv2d, slim.separable_conv2d): {'padding': 'SAME'} 53 | }, 54 | spec=[ 55 | op(slim.conv2d, stride=2, num_outputs=32, kernel_size=[3, 3]), 56 | op(ops.expanded_conv, 57 | expansion_size=expand_input(1, divisible_by=1), 58 | num_outputs=16), 59 | op(ops.expanded_conv, stride=2, num_outputs=24), 60 | op(ops.expanded_conv, stride=1, num_outputs=24), 61 | op(ops.expanded_conv, stride=2, num_outputs=32), 62 | op(ops.expanded_conv, stride=1, num_outputs=32), 63 | op(ops.expanded_conv, stride=1, num_outputs=32), 64 | op(ops.expanded_conv, stride=2, num_outputs=64), 65 | op(ops.expanded_conv, stride=1, num_outputs=64), 66 | op(ops.expanded_conv, stride=1, num_outputs=64), 67 | op(ops.expanded_conv, stride=1, num_outputs=64), 68 | op(ops.expanded_conv, stride=1, num_outputs=96), 69 | op(ops.expanded_conv, stride=1, num_outputs=96), 70 | op(ops.expanded_conv, stride=1, num_outputs=96), 71 | op(ops.expanded_conv, stride=2, num_outputs=160), 72 | op(ops.expanded_conv, stride=1, num_outputs=160), 73 | op(ops.expanded_conv, stride=1, num_outputs=160), 74 | op(ops.expanded_conv, stride=1, num_outputs=320), 75 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1280) 76 | ], 77 | ) 78 | 79 | 80 | @slim.add_arg_scope 81 | def mobilenet(input_tensor, 82 | num_classes=1001, 83 | depth_multiplier=1.0, 84 | scope='MobilenetV2', 85 | conv_defs=None, 86 | finegrain_classification_mode=False, 87 | min_depth=None, 88 | divisible_by=None, 89 | **kwargs): 90 | """Creates mobilenet V2 network. 91 | Inference mode is created by default. To create training use training_scope 92 | below. 93 | with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): 94 | logits, endpoints = mobilenet_v2.mobilenet(input_tensor) 95 | Args: 96 | input_tensor: The input tensor 97 | num_classes: number of classes 98 | depth_multiplier: The multiplier applied to scale number of 99 | channels in each layer. Note: this is called depth multiplier in the 100 | paper but the name is kept for consistency with slim's model builder. 101 | scope: Scope of the operator 102 | conv_defs: Allows to override default conv def. 
103 | finegrain_classification_mode: When set to True, the model 104 | will keep the last layer large even for small multipliers. Following 105 | https://arxiv.org/abs/1801.04381 106 | suggests that it improves performance for ImageNet-type of problems. 107 | *Note* ignored if final_endpoint makes the builder exit earlier. 108 | min_depth: If provided, will ensure that all layers will have that 109 | many channels after application of depth multiplier. 110 | divisible_by: If provided will ensure that all layers # channels 111 | will be divisible by this number. 112 | **kwargs: passed directly to mobilenet.mobilenet: 113 | prediction_fn- what prediction function to use. 114 | reuse-: whether to reuse variables (if reuse set to true, scope 115 | must be given). 116 | Returns: 117 | logits/endpoints pair 118 | Raises: 119 | ValueError: On invalid arguments 120 | """ 121 | if conv_defs is None: 122 | conv_defs = V2_DEF 123 | if 'multiplier' in kwargs: 124 | raise ValueError('mobilenetv2 doesn\'t support generic ' 125 | 'multiplier parameter use "depth_multiplier" instead.') 126 | if finegrain_classification_mode: 127 | conv_defs = copy.deepcopy(conv_defs) 128 | if depth_multiplier < 1: 129 | conv_defs['spec'][-1].params['num_outputs'] /= depth_multiplier 130 | 131 | depth_args = {} 132 | # NB: do not set depth_args unless they are provided to avoid overriding 133 | # whatever default depth_multiplier might have thanks to arg_scope. 134 | if min_depth is not None: 135 | depth_args['min_depth'] = min_depth 136 | if divisible_by is not None: 137 | depth_args['divisible_by'] = divisible_by 138 | 139 | with slim.arg_scope((lib.depth_multiplier,), **depth_args): 140 | return lib.mobilenet( 141 | input_tensor, 142 | num_classes=num_classes, 143 | conv_defs=conv_defs, 144 | scope=scope, 145 | multiplier=depth_multiplier, 146 | **kwargs) 147 | 148 | 149 | @slim.add_arg_scope 150 | def mobilenet_base(input_tensor, depth_multiplier=1.0, **kwargs): 151 | """Creates base of the mobilenet (no pooling and no logits) .""" 152 | return mobilenet(input_tensor, 153 | depth_multiplier=depth_multiplier, 154 | base_only=True, **kwargs) 155 | 156 | 157 | def training_scope(**kwargs): 158 | """Defines MobilenetV2 training scope. 159 | Usage: 160 | with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): 161 | logits, endpoints = mobilenet_v2.mobilenet(input_tensor) 162 | with slim. 163 | Args: 164 | **kwargs: Passed to mobilenet.training_scope. The following parameters 165 | are supported: 166 | weight_decay- The weight decay to use for regularizing the model. 167 | stddev- Standard deviation for initialization, if negative uses xavier. 168 | dropout_keep_prob- dropout keep probability 169 | bn_decay- decay for the batch norm moving averages. 170 | Returns: 171 | An `arg_scope` to use for the mobilenet v2 model. 
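# [Editor's note] As the docstring above describes, depth_multiplier scales the
# channel count of every layer: with depth_multiplier=0.5 the first 32-channel conv
# in V2_DEF is built with roughly 16 channels (subject to min_depth / divisible_by
# rounding), while finegrain_classification_mode keeps the final 1280-channel layer
# at full width even for small multipliers.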
172 | """ 173 | return lib.training_scope(**kwargs) 174 | 175 | 176 | __all__ = ['training_scope', 'mobilenet_base', 'mobilenet', 'V2_DEF'] 177 | -------------------------------------------------------------------------------- /lib/layer_utils/anchor_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Xinlei Chen 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | from model.config import cfg 13 | import numpy as np 14 | import numpy.random as npr 15 | from utils.cython_bbox import bbox_overlaps 16 | from model.bbox_transform import bbox_transform 17 | 18 | 19 | def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, all_anchors, num_anchors): 20 | """ 21 | Same as the anchor target layer in original Fast/er RCNN 22 | :param 23 | rpn_cls_score: (1, H, W, Ax2) bg/fg scores of previous conv layer 24 | gt_boxes: (G, 5) vstack of [x1, y1, x2, y2, class] 25 | im_info: [image_height, image_width] 26 | all_anchors: all anchors pre generated 27 | num_anchors: 28 | :returns 29 | rpn_labels: (1, H, W, A) 30 | rpn_bbox_targets: (1, H, W, Ax4) 31 | """ 32 | A = num_anchors 33 | total_anchors = all_anchors.shape[0] 34 | K = total_anchors / num_anchors 35 | 36 | # allow boxes to sit over the edge by a small amount 37 | _allowed_border = 0 38 | 39 | # map of shape (..., H, W), height/width for feature map 40 | height, width = rpn_cls_score.shape[1:3] 41 | 42 | # print("rpn: gt_boxes.shape %d" % gt_boxes.shape) 43 | # print("rpn: gt_boxes", gt_boxes) 44 | # only keep anchors inside the image 45 | inds_inside = np.where( 46 | (all_anchors[:, 0] >= -_allowed_border) & 47 | (all_anchors[:, 1] >= -_allowed_border) & 48 | (all_anchors[:, 2] < im_info[1] + _allowed_border) & # width 49 | (all_anchors[:, 3] < im_info[0] + _allowed_border) # height 50 | )[0] 51 | # print("total_anchors %d" % total_anchors) 52 | # print("inds_inside %d" % len(inds_inside)) 53 | 54 | # keep only inside anchors 55 | anchors = all_anchors[inds_inside, :] 56 | 57 | # label: 1 is positive, 0 is negative, -1 is dont care 58 | labels = np.empty((len(inds_inside),), dtype=np.float32) 59 | labels.fill(-1) 60 | 61 | # overlaps between the anchors and the gt boxes 62 | # overlaps (ex, gt) 63 | overlaps = bbox_overlaps( 64 | np.ascontiguousarray(anchors, dtype=np.float), 65 | np.ascontiguousarray(gt_boxes, dtype=np.float)) 66 | argmax_overlaps = overlaps.argmax(axis=1) 67 | max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] 68 | gt_argmax_overlaps = overlaps.argmax(axis=0) 69 | gt_max_overlaps = overlaps[gt_argmax_overlaps, 70 | np.arange(overlaps.shape[1])] 71 | gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] 72 | 73 | if not cfg.TRAIN.RPN_CLOBBER_POSITIVES: 74 | # assign bg labels first so that positive labels can clobber them 75 | # first set the negatives 76 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 77 | 78 | # fg label: for each gt, anchor with highest overlap 79 | labels[gt_argmax_overlaps] = 1 80 | 81 | # fg label: above threshold IOU 82 | labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 83 | 84 | if cfg.TRAIN.RPN_CLOBBER_POSITIVES: 85 | # assign bg labels last so 
that negative labels can clobber positives 86 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 87 | 88 | # subsample positive labels if we have too many 89 | num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) 90 | fg_inds = np.where(labels == 1)[0] 91 | # print("fg_inds.shape", fg_inds.shape) 92 | if len(fg_inds) > num_fg: 93 | disable_inds = npr.choice( 94 | fg_inds, size=(len(fg_inds) - num_fg), replace=False) 95 | labels[disable_inds] = -1 96 | 97 | # subsample negative labels if we have too many 98 | num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) 99 | bg_inds = np.where(labels == 0)[0] 100 | # print("bg_inds.shape", bg_inds.shape) 101 | if len(bg_inds) > num_bg: 102 | disable_inds = npr.choice( 103 | bg_inds, size=(len(bg_inds) - num_bg), replace=False) 104 | labels[disable_inds] = -1 105 | 106 | bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) 107 | # get rpn_bbox_targets in delta format, the predict result of rpn is (tx, ty, tw, th) 108 | bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) 109 | # print("bbox_targets!===================", bbox_targets) 110 | 111 | bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 112 | # only the positive ones have regression targets 113 | bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS) 114 | 115 | bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 116 | if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0: 117 | # uniform weighting of examples (given non-uniform sampling) 118 | num_examples = np.sum(labels >= 0) 119 | # positive_weights = np.ones((1, 4)) * 1.0 / num_examples 120 | # negative_weights = np.ones((1, 4)) * 1.0 / num_examples 121 | # CTPN: 122 | positive_weights = np.ones((1, 4)) 123 | negative_weights = np.zeros((1, 4)) 124 | else: 125 | assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) & 126 | (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1)) 127 | positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT / 128 | np.sum(labels == 1)) 129 | negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / 130 | np.sum(labels == 0)) 131 | bbox_outside_weights[labels == 1, :] = positive_weights 132 | bbox_outside_weights[labels == 0, :] = negative_weights 133 | 134 | # map up to original set of anchors 135 | labels = _unmap(labels, total_anchors, inds_inside, fill=-1) 136 | bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) 137 | bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0) 138 | bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0) 139 | 140 | # labels 141 | labels = labels.reshape((1, height, width, A)) 142 | rpn_labels = labels 143 | 144 | # bbox_targets 145 | bbox_targets = bbox_targets.reshape((1, height, width, A * 4)) 146 | rpn_bbox_targets = bbox_targets 147 | 148 | # bbox_inside_weights 149 | bbox_inside_weights = bbox_inside_weights.reshape((1, height, width, A * 4)) 150 | rpn_bbox_inside_weights = bbox_inside_weights 151 | 152 | # bbox_outside_weights 153 | bbox_outside_weights = bbox_outside_weights.reshape((1, height, width, A * 4)) 154 | rpn_bbox_outside_weights = bbox_outside_weights 155 | 156 | return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights 157 | 158 | 159 | def _unmap(data, count, inds, fill=0): 160 | """ Unmap a subset of item (data) back to the original set of items (of 161 | size count) """ 162 | if len(data.shape) == 1: 163 | ret = np.empty((count,), dtype=np.float32) 164 | ret.fill(fill) 
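# [Editor's note] Illustration of _unmap (not part of this file): with count=6,
# inds=[1, 3, 4], data=[0.5, 1.0, 0.25] and fill=-1 the result is
# [-1, 0.5, -1, 1.0, 0.25, -1], i.e. values computed only for the anchors inside
# the image are scattered back to the full anchor set.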
165 | ret[inds] = data 166 | else: 167 | ret = np.empty((count,) + data.shape[1:], dtype=np.float32) 168 | ret.fill(fill) 169 | ret[inds, :] = data 170 | return ret 171 | 172 | 173 | def _compute_targets(ex_rois, gt_rois): 174 | """Compute bounding-box regression targets for an image.""" 175 | 176 | assert ex_rois.shape[0] == gt_rois.shape[0] 177 | assert ex_rois.shape[1] == 4 178 | assert gt_rois.shape[1] == 5 179 | 180 | targets = bbox_transform(ex_rois, gt_rois) 181 | 182 | return targets 183 | -------------------------------------------------------------------------------- /lib/datasets/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import xml.etree.ElementTree as ET 11 | import os 12 | import pickle 13 | import numpy as np 14 | 15 | 16 | def parse_rec(filename): 17 | """ Parse a PASCAL VOC xml file """ 18 | tree = ET.parse(filename) 19 | objects = [] 20 | for obj in tree.findall('object'): 21 | obj_struct = {} 22 | obj_struct['name'] = obj.find('name').text 23 | obj_struct['pose'] = obj.find('pose').text 24 | obj_struct['truncated'] = int(obj.find('truncated').text) 25 | obj_struct['difficult'] = int(obj.find('difficult').text) 26 | bbox = obj.find('bndbox') 27 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 28 | int(bbox.find('ymin').text), 29 | int(bbox.find('xmax').text), 30 | int(bbox.find('ymax').text)] 31 | objects.append(obj_struct) 32 | 33 | return objects 34 | 35 | 36 | def voc_ap(rec, prec, use_07_metric=False): 37 | """ ap = voc_ap(rec, prec, [use_07_metric]) 38 | Compute VOC AP given precision and recall. 39 | If use_07_metric is true, uses the 40 | VOC 07 11 point method (default:False). 41 | """ 42 | if use_07_metric: 43 | # 11 point metric 44 | ap = 0. 45 | for t in np.arange(0., 1.1, 0.1): 46 | if np.sum(rec >= t) == 0: 47 | p = 0 48 | else: 49 | p = np.max(prec[rec >= t]) 50 | ap = ap + p / 11. 51 | else: 52 | # correct AP calculation 53 | # first append sentinel values at the end 54 | mrec = np.concatenate(([0.], rec, [1.])) 55 | mpre = np.concatenate(([0.], prec, [0.])) 56 | 57 | # compute the precision envelope 58 | for i in range(mpre.size - 1, 0, -1): 59 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 60 | 61 | # to calculate area under PR curve, look for points 62 | # where X axis (recall) changes value 63 | i = np.where(mrec[1:] != mrec[:-1])[0] 64 | 65 | # and sum (\Delta recall) * prec 66 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 67 | return ap 68 | 69 | 70 | def voc_eval(detpath, 71 | annopath, 72 | imagesetfile, 73 | classname, 74 | cachedir, 75 | ovthresh=0.5, 76 | use_07_metric=False, 77 | use_diff=False): 78 | """rec, prec, ap = voc_eval(detpath, 79 | annopath, 80 | imagesetfile, 81 | classname, 82 | [ovthresh], 83 | [use_07_metric]) 84 | 85 | Top level function that does the PASCAL VOC evaluation. 86 | 87 | detpath: Path to detections 88 | detpath.format(classname) should produce the detection results file. 89 | annopath: Path to annotations 90 | annopath.format(imagename) should be the xml annotations file. 91 | imagesetfile: Text file containing the list of images, one image per line. 
92 | classname: Category name (duh) 93 | cachedir: Directory for caching the annotations 94 | [ovthresh]: Overlap threshold (default = 0.5) 95 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 96 | (default False) 97 | """ 98 | # assumes detections are in detpath.format(classname) 99 | # assumes annotations are in annopath.format(imagename) 100 | # assumes imagesetfile is a text file with each line an image name 101 | # cachedir caches the annotations in a pickle file 102 | 103 | # first load gt 104 | if not os.path.isdir(cachedir): 105 | os.mkdir(cachedir) 106 | cachefile = os.path.join(cachedir, '%s_annots.pkl' % imagesetfile) 107 | # read list of images 108 | with open(imagesetfile, 'r') as f: 109 | lines = f.readlines() 110 | imagenames = [x.strip() for x in lines] 111 | 112 | if not os.path.isfile(cachefile): 113 | # load annotations 114 | recs = {} 115 | for i, imagename in enumerate(imagenames): 116 | recs[imagename] = parse_rec(annopath.format(imagename)) 117 | if i % 100 == 0: 118 | print('Reading annotation for {:d}/{:d}'.format( 119 | i + 1, len(imagenames))) 120 | # save 121 | print('Saving cached annotations to {:s}'.format(cachefile)) 122 | with open(cachefile, 'wb') as f: 123 | pickle.dump(recs, f) 124 | else: 125 | # load 126 | with open(cachefile, 'rb') as f: 127 | try: 128 | recs = pickle.load(f) 129 | except: 130 | recs = pickle.load(f, encoding='bytes') 131 | 132 | # extract gt objects for this class 133 | class_recs = {} 134 | npos = 0 135 | for imagename in imagenames: 136 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 137 | bbox = np.array([x['bbox'] for x in R]) 138 | if use_diff: 139 | difficult = np.array([False for x in R]).astype(np.bool) 140 | else: 141 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 142 | det = [False] * len(R) 143 | npos = npos + sum(~difficult) 144 | class_recs[imagename] = {'bbox': bbox, 145 | 'difficult': difficult, 146 | 'det': det} 147 | 148 | # read dets 149 | detfile = detpath.format(classname) 150 | with open(detfile, 'r') as f: 151 | lines = f.readlines() 152 | 153 | splitlines = [x.strip().split(' ') for x in lines] 154 | image_ids = [x[0] for x in splitlines] 155 | confidence = np.array([float(x[1]) for x in splitlines]) 156 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 157 | 158 | nd = len(image_ids) 159 | tp = np.zeros(nd) 160 | fp = np.zeros(nd) 161 | 162 | if BB.shape[0] > 0: 163 | # sort by confidence 164 | sorted_ind = np.argsort(-confidence) 165 | sorted_scores = np.sort(-confidence) 166 | BB = BB[sorted_ind, :] 167 | image_ids = [image_ids[x] for x in sorted_ind] 168 | 169 | # go down dets and mark TPs and FPs 170 | for d in range(nd): 171 | R = class_recs[image_ids[d]] 172 | bb = BB[d, :].astype(float) 173 | ovmax = -np.inf 174 | BBGT = R['bbox'].astype(float) 175 | 176 | if BBGT.size > 0: 177 | # compute overlaps 178 | # intersection 179 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 180 | iymin = np.maximum(BBGT[:, 1], bb[1]) 181 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 182 | iymax = np.minimum(BBGT[:, 3], bb[3]) 183 | iw = np.maximum(ixmax - ixmin + 1., 0.) 184 | ih = np.maximum(iymax - iymin + 1., 0.) 185 | inters = iw * ih 186 | 187 | # union 188 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 189 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 190 | (BBGT[:, 3] - BBGT[:, 1] + 1.)
- inters) 191 | 192 | overlaps = inters / uni 193 | ovmax = np.max(overlaps) 194 | jmax = np.argmax(overlaps) 195 | 196 | if ovmax > ovthresh: 197 | if not R['difficult'][jmax]: 198 | if not R['det'][jmax]: 199 | tp[d] = 1. 200 | R['det'][jmax] = 1 201 | else: 202 | fp[d] = 1. 203 | else: 204 | fp[d] = 1. 205 | 206 | # compute precision recall 207 | fp = np.cumsum(fp) 208 | tp = np.cumsum(tp) 209 | rec = tp / float(npos) 210 | # avoid divide by zero in case the first detection matches a difficult 211 | # ground truth 212 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 213 | ap = voc_ap(rec, prec, use_07_metric) 214 | 215 | return rec, prec, ap 216 | -------------------------------------------------------------------------------- /tools/demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -------------------------------------------------------- 4 | # Tensorflow Faster R-CNN 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Written by Xinlei Chen, based on code from Ross Girshick 7 | # -------------------------------------------------------- 8 | 9 | """ 10 | Demo script showing detections in sample images. 11 | 12 | See README.md for installation instructions before running. 13 | """ 14 | from __future__ import absolute_import 15 | from __future__ import division 16 | from __future__ import print_function 17 | 18 | import glob 19 | import time 20 | 21 | import _init_paths 22 | from model.config import cfg 23 | from model.test import im_detect 24 | from model.nms_wrapper import nms 25 | from text_connector import TextDetector 26 | 27 | from utils.timer import Timer 28 | import tensorflow as tf 29 | import matplotlib.pyplot as plt 30 | import numpy as np 31 | import os 32 | import cv2 33 | import argparse 34 | 35 | from nets.vgg16 import vgg16 36 | from nets.resnet_v1 import Resnetv1 37 | from nets.squeezenet import SqueezeNet 38 | from nets.mobilenet_v2 import MobileNetV2 39 | 40 | from utils import helper 41 | 42 | CLASSES = ('__background__', 'text') 43 | 44 | 45 | def vis_detections(im, class_name, dets, thresh=0.5, text=False): 46 | """Draw detected bounding boxes.""" 47 | inds = np.where(dets[:, -1] >= thresh)[0] 48 | if len(inds) == 0: 49 | return 50 | 51 | im = im[:, :, (2, 1, 0)] 52 | fig, ax = plt.subplots(figsize=(12, 12)) 53 | ax.imshow(im, aspect='equal') 54 | for i in inds: 55 | bbox = dets[i, :8] 56 | score = dets[i, -1] 57 | 58 | ax.add_line( 59 | plt.Line2D([bbox[0], bbox[2], bbox[6], bbox[4], bbox[0]], 60 | [bbox[1], bbox[3], bbox[7], bbox[5], bbox[1]], 61 | color='red', linewidth=3) 62 | ) 63 | 64 | if text: 65 | ax.text(bbox[0], bbox[1] - 2, 66 | '{:s} {:.3f}'.format(class_name, score), 67 | bbox=dict(facecolor='blue', alpha=0.5), 68 | fontsize=14, color='white') 69 | 70 | ax.set_title(('{} detections with ' 71 | 'p({} | box) >= {:.1f}').format(class_name, class_name, 72 | thresh), 73 | fontsize=14) 74 | plt.axis('off') 75 | plt.tight_layout() 76 | plt.draw() 77 | plt.show() 78 | 79 | 80 | def save_result(img, img_name, text_lines, result_dir): 81 | dst = img 82 | color = (0, 150, 0) 83 | for bbox in text_lines: 84 | bbox = [int(x) for x in bbox] 85 | p1 = (bbox[0], bbox[1]) 86 | p2 = (bbox[2], bbox[3]) 87 | p3 = (bbox[6], bbox[7]) 88 | p4 = (bbox[4], bbox[5]) 89 | dst = cv2.line(dst, p1, p2, color, 2) 90 | dst = cv2.line(dst, p2, p3, color, 2) 91 | dst = cv2.line(dst, p3, p4, color, 2) 92 | dst = cv2.line(dst, p4, p1, color, 2) 93 | 94 | img_path = 
os.path.join(result_dir, img_name[0:-4] + '.jpg') 95 | cv2.imwrite(img_path, dst) 96 | 97 | 98 | def recover_scale(boxes, scale): 99 | """ 100 | :param boxes: [(x1, y1, x2, y2)] 101 | :param scale: image scale 102 | :return: 103 | """ 104 | tmp_boxes = [] 105 | for b in boxes: 106 | tmp_boxes.append([int(x / scale) for x in b]) 107 | return np.asarray(tmp_boxes).astype(np.float32) 108 | 109 | 110 | def draw_rpn_boxes(img, img_name, boxes, scores, im_scale, nms, save_dir): 111 | """ 112 | :param boxes: [(x1, y1, x2, y2)] 113 | """ 114 | boxes = recover_scale(boxes, im_scale) 115 | 116 | base_name = img_name.split('/')[-1] 117 | color = (0, 255, 0) 118 | out = img.copy() 119 | 120 | if nms: 121 | boxes, scores = TextDetector.pre_process(boxes, scores) 122 | file_name = "%s_rpn_nms.jpg" % base_name 123 | else: 124 | file_name = "%s_rpn.jpg" % base_name 125 | 126 | for i, box in enumerate(boxes): 127 | cv2.rectangle(out, (box[0], box[1]), (box[2], box[3]), color, 2) 128 | cx = int((box[0] + box[2]) / 2) 129 | cy = int((box[1] + box[3]) / 2) 130 | cv2.putText(out, "%.01f" % scores[i], (cx, cy), cv2.FONT_HERSHEY_SIMPLEX, 0.2, (255, 0, 0)) 131 | 132 | cv2.imwrite(os.path.join(save_dir, file_name), out) 133 | 134 | 135 | def demo(sess, net, im_file, result_dir, viz=False, oriented=False): 136 | """Detect object classes in an image using pre-computed object proposals.""" 137 | 138 | # Load the demo image 139 | im = helper.read_rgb_img(im_file) 140 | 141 | # Detect all object classes and regress object bounds 142 | timer = Timer() 143 | timer.tic() 144 | scores, boxes, resized_im_shape, im_scale = im_detect(sess, net, im) 145 | timer.toc() 146 | 147 | im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) 148 | img_name = im_file.split('/')[-1] 149 | 150 | draw_rpn_boxes(im, img_name, boxes, scores[:, np.newaxis], im_scale, True, result_dir) 151 | draw_rpn_boxes(im, img_name, boxes, scores[:, np.newaxis], im_scale, False, result_dir) 152 | 153 | # Run TextDetector to merge small box 154 | line_detector = TextDetector(oriented) 155 | 156 | # The input to line_detector must be the boxes on the scaled (resized) image!! 157 | # If the boxes are mapped back to the original image before line construction, the original image may be too large, so each anchor's width becomes large and MAX_HORIZONTAL_GAP ends up relatively too small 158 | # text_lines point order: left-top, right-top, left-bottom, right-bottom 159 | text_lines = line_detector.detect(boxes, scores[:, np.newaxis], resized_im_shape) 160 | print("Image %s, detect %d text lines in %.3fs" % (im_file, len(text_lines), timer.diff)) 161 | 162 | if len(text_lines) != 0: 163 | text_lines = recover_scale(text_lines, im_scale) 164 | save_result(im, img_name, text_lines, result_dir) 165 | 166 | # Visualize detections 167 | if viz: 168 | vis_detections(im, CLASSES[1], text_lines) 169 | 170 | 171 | def parse_args(): 172 | """Parse input arguments.""" 173 | parser = argparse.ArgumentParser(description='Tensorflow CTPN demo') 174 | parser.add_argument('--net', dest='net', choices=['vgg16', 'squeeze', 'mobile'], default='vgg16') 175 | parser.add_argument('--img_dir', default='/home/cwq/data/ICDAR13/123') 176 | parser.add_argument('--dataset', dest='dataset', help='model tag', default='voc_2007_trainval') 177 | parser.add_argument('--tag', dest='tag', help='model tag', default='vgg_latin_chn_newdata') 178 | parser.add_argument('--viz', action='store_true', default=False, help='show result') 179 | parser.add_argument('-o', '--oriented', action='store_true', default=False, help='output rotated detect box') 180 | args = parser.parse_args() 181 | 182 | if not os.path.exists(args.img_dir): 183 | print("img dir not exists.") 184 | exit(-1) 185 | 186 |
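# [Editor's note] recover_scale() above maps boxes from the resized image back to
# the original image by dividing every coordinate by im_scale; e.g. a box
# (150, 60, 300, 90) detected at im_scale=0.5 becomes (300, 120, 600, 180) in
# original-image coordinates. This is why demo() runs TextDetector on the resized
# image first and only rescales the merged text_lines afterwards.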
args.result_dir = os.path.join('./data/result', args.tag) 187 | if not os.path.exists(args.result_dir): 188 | os.makedirs(args.result_dir) 189 | 190 | return args 191 | 192 | 193 | if __name__ == '__main__': 194 | args = parse_args() 195 | 196 | # model path 197 | netname = args.net 198 | dataset = args.dataset 199 | 200 | ckpt_dir = os.path.join('output', netname, dataset, args.tag) 201 | ckpt = tf.train.get_checkpoint_state(ckpt_dir) 202 | 203 | # set config 204 | tfconfig = tf.ConfigProto(allow_soft_placement=True) 205 | tfconfig.gpu_options.allow_growth = True 206 | 207 | # init session 208 | sess = tf.Session(config=tfconfig) 209 | # load network 210 | if netname == 'vgg16': 211 | net = vgg16() 212 | elif netname == 'res101': 213 | net = Resnetv1(num_layers=101) 214 | elif netname == 'mobile': 215 | net = MobileNetV2() 216 | elif args.net == 'squeeze': 217 | net = SqueezeNet() 218 | else: 219 | raise NotImplementedError 220 | 221 | net.create_architecture("TEST", 222 | num_classes=len(CLASSES), 223 | tag=args.tag, 224 | anchor_width=cfg.CTPN.ANCHOR_WIDTH, 225 | anchor_h_ratio_step=cfg.CTPN.H_RADIO_STEP, 226 | num_anchors=cfg.CTPN.NUM_ANCHORS) 227 | saver = tf.train.Saver() 228 | saver.restore(sess, ckpt.model_checkpoint_path) 229 | 230 | print('Loaded network {:s}'.format(ckpt.model_checkpoint_path)) 231 | 232 | im_files = glob.glob(args.img_dir + "/*.*") 233 | for im_file in im_files: 234 | demo(sess, net, im_file, args.result_dir, args.viz, args.oriented) 235 | -------------------------------------------------------------------------------- /lib/datasets/imdb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Xinlei Chen 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import os.path as osp 13 | import PIL 14 | from utils.cython_bbox import bbox_overlaps 15 | import numpy as np 16 | import scipy.sparse 17 | from model.config import cfg 18 | 19 | 20 | class imdb(object): 21 | """Image database.""" 22 | 23 | def __init__(self, name, classes=None): 24 | self._name = name 25 | self._num_classes = 0 26 | if not classes: 27 | self._classes = [] 28 | else: 29 | self._classes = classes 30 | self._image_index = [] 31 | self._obj_proposer = 'gt' 32 | self._roidb = None 33 | self._roidb_handler = self.default_roidb 34 | # Use this dict for storing dataset specific config options 35 | self.config = {} 36 | 37 | @property 38 | def name(self): 39 | return self._name 40 | 41 | @property 42 | def num_classes(self): 43 | return len(self._classes) 44 | 45 | @property 46 | def classes(self): 47 | return self._classes 48 | 49 | @property 50 | def image_index(self): 51 | return self._image_index 52 | 53 | @property 54 | def roidb_handler(self): 55 | return self._roidb_handler 56 | 57 | @roidb_handler.setter 58 | def roidb_handler(self, val): 59 | self._roidb_handler = val 60 | 61 | def set_proposal_method(self, method): 62 | method = eval('self.' 
+ method + '_roidb') 63 | self.roidb_handler = method 64 | 65 | @property 66 | def roidb(self): 67 | # A roidb is a list of dictionaries, each with the following keys: 68 | # boxes 69 | # gt_overlaps 70 | # gt_classes 71 | # flipped 72 | if self._roidb is not None: 73 | return self._roidb 74 | self._roidb = self.roidb_handler() 75 | return self._roidb 76 | 77 | @property 78 | def cache_path(self): 79 | cache_path = osp.abspath(osp.join(cfg.DATA_DIR, 'cache')) 80 | if not os.path.exists(cache_path): 81 | os.makedirs(cache_path) 82 | return cache_path 83 | 84 | @property 85 | def num_images(self): 86 | return len(self.image_index) 87 | 88 | def image_path_at(self, i): 89 | raise NotImplementedError 90 | 91 | def default_roidb(self): 92 | raise NotImplementedError 93 | 94 | def evaluate_detections(self, all_boxes, output_dir=None): 95 | """ 96 | all_boxes is a list of length number-of-classes. 97 | Each list element is a list of length number-of-images. 98 | Each of those list elements is either an empty list [] 99 | or a numpy array of detection. 100 | 101 | all_boxes[class][image] = [] or np.array of shape #dets x 5 102 | """ 103 | raise NotImplementedError 104 | 105 | def _get_widths(self): 106 | return [PIL.Image.open(self.image_path_at(i)).size[0] 107 | for i in range(self.num_images)] 108 | 109 | def append_flipped_images(self): 110 | num_images = self.num_images 111 | widths = self._get_widths() 112 | for i in range(num_images): 113 | boxes = self.roidb[i]['boxes'].copy() 114 | oldx1 = boxes[:, 0].copy() 115 | oldx2 = boxes[:, 2].copy() 116 | boxes[:, 0] = widths[i] - oldx2 - 1 117 | boxes[:, 2] = widths[i] - oldx1 - 1 118 | for b in range(len(boxes)): 119 | if boxes[b][2] < boxes[b][0]: 120 | boxes[b][0] = 0 121 | assert (boxes[:, 2] >= boxes[:, 0]).all() 122 | entry = {'boxes': boxes, 123 | 'gt_overlaps': self.roidb[i]['gt_overlaps'], 124 | 'gt_classes': self.roidb[i]['gt_classes'], 125 | 'flipped': True} 126 | 127 | if 'gt_ishard' in self.roidb[i] and 'dontcare_areas' in self.roidb[i]: 128 | entry['gt_ishard'] = self.roidb[i]['gt_ishard'].copy() 129 | dontcare_areas = self.roidb[i]['dontcare_areas'].copy() 130 | oldx1 = dontcare_areas[:, 0].copy() 131 | oldx2 = dontcare_areas[:, 2].copy() 132 | dontcare_areas[:, 0] = widths[i] - oldx2 - 1 133 | dontcare_areas[:, 2] = widths[i] - oldx1 - 1 134 | entry['dontcare_areas'] = dontcare_areas 135 | 136 | self.roidb.append(entry) 137 | 138 | self._image_index = self._image_index * 2 139 | 140 | def evaluate_recall(self, candidate_boxes=None, thresholds=None, 141 | area='all', limit=None): 142 | """Evaluate detection proposal recall metrics. 
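# [Editor's note] In append_flipped_images above a box is mirrored horizontally with
# x1' = width - x2 - 1 and x2' = width - x1 - 1; for an image 800 pixels wide the
# box (100, 50, 300, 80) flips to (499, 50, 699, 80), with y coordinates unchanged.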
143 | 144 | Returns: 145 | results: dictionary of results with keys 146 | 'ar': average recall 147 | 'recalls': vector recalls at each IoU overlap threshold 148 | 'thresholds': vector of IoU overlap thresholds 149 | 'gt_overlaps': vector of all ground-truth overlaps 150 | """ 151 | # Record max overlap value for each gt box 152 | # Return vector of overlap values 153 | areas = {'all': 0, 'small': 1, 'medium': 2, 'large': 3, 154 | '96-128': 4, '128-256': 5, '256-512': 6, '512-inf': 7} 155 | area_ranges = [[0 ** 2, 1e5 ** 2], # all 156 | [0 ** 2, 32 ** 2], # small 157 | [32 ** 2, 96 ** 2], # medium 158 | [96 ** 2, 1e5 ** 2], # large 159 | [96 ** 2, 128 ** 2], # 96-128 160 | [128 ** 2, 256 ** 2], # 128-256 161 | [256 ** 2, 512 ** 2], # 256-512 162 | [512 ** 2, 1e5 ** 2], # 512-inf 163 | ] 164 | assert area in areas, 'unknown area range: {}'.format(area) 165 | area_range = area_ranges[areas[area]] 166 | gt_overlaps = np.zeros(0) 167 | num_pos = 0 168 | for i in range(self.num_images): 169 | # Checking for max_overlaps == 1 avoids including crowd annotations 170 | # (...pretty hacking :/) 171 | max_gt_overlaps = self.roidb[i]['gt_overlaps'].toarray().max(axis=1) 172 | gt_inds = np.where((self.roidb[i]['gt_classes'] > 0) & 173 | (max_gt_overlaps == 1))[0] 174 | gt_boxes = self.roidb[i]['boxes'][gt_inds, :] 175 | gt_areas = self.roidb[i]['seg_areas'][gt_inds] 176 | valid_gt_inds = np.where((gt_areas >= area_range[0]) & 177 | (gt_areas <= area_range[1]))[0] 178 | gt_boxes = gt_boxes[valid_gt_inds, :] 179 | num_pos += len(valid_gt_inds) 180 | 181 | if candidate_boxes is None: 182 | # If candidate_boxes is not supplied, the default is to use the 183 | # non-ground-truth boxes from this roidb 184 | non_gt_inds = np.where(self.roidb[i]['gt_classes'] == 0)[0] 185 | boxes = self.roidb[i]['boxes'][non_gt_inds, :] 186 | else: 187 | boxes = candidate_boxes[i] 188 | if boxes.shape[0] == 0: 189 | continue 190 | if limit is not None and boxes.shape[0] > limit: 191 | boxes = boxes[:limit, :] 192 | 193 | overlaps = bbox_overlaps(boxes.astype(np.float), 194 | gt_boxes.astype(np.float)) 195 | 196 | _gt_overlaps = np.zeros((gt_boxes.shape[0])) 197 | for j in range(gt_boxes.shape[0]): 198 | # find which proposal box maximally covers each gt box 199 | argmax_overlaps = overlaps.argmax(axis=0) 200 | # and get the iou amount of coverage for each gt box 201 | max_overlaps = overlaps.max(axis=0) 202 | # find which gt box is 'best' covered (i.e. 
'best' = most iou) 203 | gt_ind = max_overlaps.argmax() 204 | gt_ovr = max_overlaps.max() 205 | assert (gt_ovr >= 0) 206 | # find the proposal box that covers the best covered gt box 207 | box_ind = argmax_overlaps[gt_ind] 208 | # record the iou coverage of this gt box 209 | _gt_overlaps[j] = overlaps[box_ind, gt_ind] 210 | assert (_gt_overlaps[j] == gt_ovr) 211 | # mark the proposal box and the gt box as used 212 | overlaps[box_ind, :] = -1 213 | overlaps[:, gt_ind] = -1 214 | # append recorded iou coverage level 215 | gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) 216 | 217 | gt_overlaps = np.sort(gt_overlaps) 218 | if thresholds is None: 219 | step = 0.05 220 | thresholds = np.arange(0.5, 0.95 + 1e-5, step) 221 | recalls = np.zeros_like(thresholds) 222 | # compute recall for each iou threshold 223 | for i, t in enumerate(thresholds): 224 | recalls[i] = (gt_overlaps >= t).sum() / float(num_pos) 225 | # ar = 2 * np.trapz(recalls, thresholds) 226 | ar = recalls.mean() 227 | return {'ar': ar, 'recalls': recalls, 'thresholds': thresholds, 228 | 'gt_overlaps': gt_overlaps} 229 | 230 | def create_roidb_from_box_list(self, box_list, gt_roidb): 231 | assert len(box_list) == self.num_images, \ 232 | 'Number of boxes must match number of ground-truth images' 233 | roidb = [] 234 | for i in range(self.num_images): 235 | boxes = box_list[i] 236 | num_boxes = boxes.shape[0] 237 | overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32) 238 | 239 | if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0: 240 | gt_boxes = gt_roidb[i]['boxes'] 241 | gt_classes = gt_roidb[i]['gt_classes'] 242 | gt_overlaps = bbox_overlaps(boxes.astype(np.float), 243 | gt_boxes.astype(np.float)) 244 | argmaxes = gt_overlaps.argmax(axis=1) 245 | maxes = gt_overlaps.max(axis=1) 246 | I = np.where(maxes > 0)[0] 247 | overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] 248 | 249 | overlaps = scipy.sparse.csr_matrix(overlaps) 250 | roidb.append({ 251 | 'boxes': boxes, 252 | 'gt_classes': np.zeros((num_boxes,), dtype=np.int32), 253 | 'gt_overlaps': overlaps, 254 | 'flipped': False, 255 | 'seg_areas': np.zeros((num_boxes,), dtype=np.float32), 256 | }) 257 | return roidb 258 | 259 | @staticmethod 260 | def merge_roidbs(a, b): 261 | assert len(a) == len(b) 262 | for i in range(len(a)): 263 | a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes'])) 264 | a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'], 265 | b[i]['gt_classes'])) 266 | a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'], 267 | b[i]['gt_overlaps']]) 268 | a[i]['seg_areas'] = np.hstack((a[i]['seg_areas'], 269 | b[i]['seg_areas'])) 270 | return a 271 | 272 | def competition_mode(self, on): 273 | """Turn competition mode on or off.""" 274 | pass 275 | -------------------------------------------------------------------------------- /lib/model/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | import os.path as osp 7 | import numpy as np 8 | # `pip install easydict` if you don't have it 9 | from easydict import EasyDict as edict 10 | 11 | __C = edict() 12 | # Consumers can get config by: 13 | # from fast_rcnn_config import cfg 14 | cfg = __C 15 | 16 | # 17 | # Training options 18 | # 19 | __C.TRAIN = edict() 20 | 21 | # Initial learning rate 22 | __C.TRAIN.LEARNING_RATE = 0.001 23 | 24 | # Optimizer Adam, Momentum, RMS 25 | __C.TRAIN.OPTIMIZER = 'Adam' 26 | 27 
| # Momentum 28 | __C.TRAIN.MOMENTUM = 0.9 29 | 30 | # Weight decay, for regularization 31 | __C.TRAIN.WEIGHT_DECAY = 0.0005 32 | 33 | # Factor for reducing the learning rate 34 | __C.TRAIN.GAMMA = 0.1 35 | 36 | # Step size for reducing the learning rate, currently only support one step 37 | __C.TRAIN.STEPSIZE = [50000] 38 | 39 | # Iteration intervals for showing the loss during training, on command line interface 40 | __C.TRAIN.DISPLAY = 10 41 | 42 | # Whether to double the learning rate for bias 43 | __C.TRAIN.DOUBLE_BIAS = True 44 | 45 | # Whether to initialize the weights with truncated normal distribution 46 | __C.TRAIN.TRUNCATED = False 47 | 48 | # Whether to have weight decay on bias as well 49 | __C.TRAIN.BIAS_DECAY = False 50 | 51 | # Whether to add ground truth boxes to the pool when sampling regions 52 | __C.TRAIN.USE_GT = False 53 | 54 | # Whether to use aspect-ratio grouping of training images, introduced merely for saving 55 | # GPU memory 56 | __C.TRAIN.ASPECT_GROUPING = False 57 | 58 | # The number of snapshots kept, older ones are deleted to save space 59 | __C.TRAIN.SNAPSHOT_KEPT = 3 60 | 61 | # The time interval for saving tensorflow summaries 62 | __C.TRAIN.SUMMARY_INTERVAL = 30 63 | 64 | # Scale to use during training (can list multiple scales) 65 | # The scale is the pixel size of an image's shortest side 66 | __C.TRAIN.SCALES = (600,) 67 | 68 | # Max pixel size of the longest side of a scaled input image 69 | __C.TRAIN.MAX_SIZE = 1200 70 | 71 | # Images to use per minibatch 72 | __C.TRAIN.IMS_PER_BATCH = 1 73 | 74 | # Fraction of minibatch that is labeled foreground (i.e. class > 0) 75 | __C.TRAIN.FG_FRACTION = 0.3 76 | 77 | # Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH) 78 | __C.TRAIN.FG_THRESH = 0.5 79 | 80 | # Overlap threshold for a ROI to be considered background (class = 0 if 81 | # overlap in [LO, HI)) 82 | __C.TRAIN.BG_THRESH_HI = 0.5 83 | __C.TRAIN.BG_THRESH_LO = 0.1 84 | 85 | __C.TRAIN.USE_FLIPPED = True 86 | 87 | # Train bounding-box regressors 88 | __C.TRAIN.BBOX_REG = True 89 | 90 | # Overlap required between a ROI and ground-truth box in order for that ROI to 91 | # be used as a bounding-box regression training example 92 | __C.TRAIN.BBOX_THRESH = 0.5 93 | 94 | # Iterations between snapshots 95 | __C.TRAIN.SNAPSHOT_ITERS = 5000 96 | 97 | # solver.prototxt specifies the snapshot path prefix, this adds an optional 98 | # infix to yield the path: [_]_iters_XYZ.caffemodel 99 | __C.TRAIN.SNAPSHOT_PREFIX = 'res101_faster_rcnn' 100 | 101 | # Normalize the targets (subtract empirical mean, divide by empirical stddev) 102 | __C.TRAIN.BBOX_NORMALIZE_TARGETS = True 103 | 104 | # Deprecated (inside weights) useless in CTPN 105 | __C.TRAIN.BBOX_INSIDE_WEIGHTS = (0.0, 1.0, 0.0, 1.0) 106 | 107 | # Normalize the targets using "precomputed" (or made up) means and stdevs 108 | # (BBOX_NORMALIZE_TARGETS must also be True) 109 | __C.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED = True 110 | 111 | __C.TRAIN.BBOX_NORMALIZE_MEANS = (0.0, 0.0, 0.0, 0.0) 112 | 113 | __C.TRAIN.BBOX_NORMALIZE_STDS = (0.1, 0.1, 0.2, 0.2) 114 | 115 | # Train using these proposals 116 | __C.TRAIN.PROPOSAL_METHOD = 'gt' 117 | 118 | # Make minibatches from images that have similar aspect ratios (i.e. both 119 | # tall and thin or both short and wide) in order to avoid wasting computation 120 | # on zero-padding. 
121 | 122 | # IOU >= thresh: positive example 123 | __C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7 124 | 125 | # IOU < thresh: negative example 126 | __C.TRAIN.RPN_NEGATIVE_OVERLAP = 0.5 127 | 128 | # If an anchor satisfied by positive and negative conditions set to negative 129 | __C.TRAIN.RPN_CLOBBER_POSITIVES = False 130 | 131 | # Max number of foreground examples 132 | __C.TRAIN.RPN_FG_FRACTION = 0.5 133 | 134 | # Total number of examples 135 | __C.TRAIN.RPN_BATCHSIZE = 128 136 | 137 | # NMS threshold used on RPN proposals 138 | __C.TRAIN.RPN_NMS_THRESH = 0.7 139 | 140 | # Number of top scoring boxes to keep before apply NMS to RPN proposals 141 | __C.TRAIN.RPN_PRE_NMS_TOP_N = 12000 142 | 143 | # Number of top scoring boxes to keep after applying NMS to RPN proposals 144 | __C.TRAIN.RPN_POST_NMS_TOP_N = 2000 145 | 146 | # The order of weights see lib/model/bbox_transform.py bbox_transform() 147 | # Weights for (x, y, w, h), for CTPN it should be (0.,1.,0.,1.) 148 | __C.TRAIN.RPN_BBOX_INSIDE_WEIGHTS = (0.0, 1.0, 0.0, 1.0) 149 | 150 | # Give the positive RPN examples weight of p * 1 / {num positives} 151 | # and give negatives a weight of (1 - p) 152 | # Set to -1.0 to use uniform example weighting 153 | __C.TRAIN.RPN_POSITIVE_WEIGHT = -1.0 154 | 155 | # Whether to use all ground truth bounding boxes for training, 156 | # For COCO, setting USE_ALL_GT to False will exclude boxes that are flagged as ''iscrowd'' 157 | __C.TRAIN.USE_ALL_GT = True 158 | 159 | # 160 | # Testing options 161 | # 162 | __C.TEST = edict() 163 | 164 | # Scale to use during testing (can NOT list multiple scales) 165 | # The scale is the pixel size of an image's shortest side 166 | __C.TEST.SCALES = (600,) 167 | 168 | # Max pixel size of the longest side of a scaled input image 169 | __C.TEST.MAX_SIZE = 1200 170 | 171 | # Overlap threshold used for non-maximum suppression (suppress boxes with 172 | # IoU >= this threshold) 173 | __C.TEST.NMS = 0.3 174 | 175 | # Test using bounding-box regressors 176 | __C.TEST.BBOX_REG = True 177 | 178 | # Test using these proposals 179 | __C.TEST.PROPOSAL_METHOD = 'gt' 180 | 181 | ## NMS threshold used on RPN proposals 182 | __C.TEST.RPN_NMS_THRESH = 0.7 183 | 184 | # Number of top scoring boxes to keep before apply NMS to RPN proposals 185 | __C.TEST.RPN_PRE_NMS_TOP_N = 12000 186 | 187 | # Number of top scoring boxes to keep after applying NMS to RPN proposals 188 | __C.TEST.RPN_POST_NMS_TOP_N = 1000 189 | 190 | # Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale) 191 | # __C.TEST.RPN_MIN_SIZE = 16 192 | 193 | # Testing mode, default to be 'nms', 'top' is slower but better 194 | # See report for details 195 | __C.TEST.MODE = 'nms' 196 | 197 | # Only useful when TEST.MODE is 'top', specifies the number of top proposals to select 198 | __C.TEST.RPN_TOP_N = 5000 199 | 200 | # 201 | # ResNet options 202 | # 203 | 204 | __C.RESNET = edict() 205 | 206 | # Number of fixed blocks during training, by default the first of all 4 blocks is fixed 207 | # Range: 0 (none) to 3 (all) 208 | __C.RESNET.FIXED_BLOCKS = 1 209 | 210 | # 211 | # MobileNet options 212 | # 213 | 214 | __C.MOBILENET = edict() 215 | 216 | # Whether to regularize the depth-wise filters during training 217 | __C.MOBILENET.REGU_DEPTH = False 218 | 219 | # Number of fixed layers during training, by default the bottom 5 of 14 layers is fixed 220 | # Range: 0 (none) to 12 (all) 221 | __C.MOBILENET.FIXED_LAYERS = 5 222 | 223 | # Weight decay for the mobilenet weights 224 | __C.MOBILENET.WEIGHT_DECAY = 
0.00004 225 | 226 | # Depth multiplier 227 | __C.MOBILENET.DEPTH_MULTIPLIER = 1. 228 | 229 | # 230 | # MISC 231 | # 232 | 233 | # Pixel mean values (BGR order) as a (1, 1, 3) array 234 | # Means for VGG, from https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/vgg_preprocessing.py 235 | __C.PIXEL_MEANS = np.array([[[102.9801, 115.9465, 122.7717]]]) 236 | # __C.PIXEL_MEANS = np.array([[[103.94, 116.78, 123.68]]]) 237 | 238 | # For reproducibility 239 | __C.RNG_SEED = 3 240 | 241 | # Root directory of project 242 | __C.ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) 243 | 244 | # Data directory 245 | __C.DATA_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'data')) 246 | 247 | # Name (or path to) the matlab executable 248 | __C.MATLAB = 'matlab' 249 | 250 | # Place outputs under an experiments directory 251 | __C.EXP_DIR = 'default' 252 | 253 | # Use GPU implementation of non-maximum suppression 254 | __C.USE_GPU_NMS = True 255 | 256 | # Anchor scales for RPN 257 | __C.ANCHOR_SCALES = [8, 16, 32] 258 | 259 | # Anchor ratios for RPN 260 | __C.ANCHOR_RATIOS = [0.5, 1, 2] 261 | 262 | # Number of filters for the RPN layer 263 | __C.RPN_CHANNELS = 512 264 | 265 | # 266 | # CTPN options 267 | # 268 | 269 | __C.CTPN = edict() 270 | 271 | __C.CTPN.NUM_ANCHORS = 10 272 | __C.CTPN.ANCHOR_WIDTH = 16 273 | __C.CTPN.H_RADIO_STEP = 0.7 274 | 275 | 276 | def get_output_dir(imdb, weights_filename): 277 | """Return the directory where experimental artifacts are placed. 278 | If the directory does not exist, it is created. 279 | 280 | A canonical path is built using the name from an imdb and a network 281 | (if not None). 282 | """ 283 | outdir = osp.abspath(osp.join(__C.ROOT_DIR, 'output', __C.EXP_DIR, imdb.name)) 284 | if weights_filename is None: 285 | weights_filename = 'default' 286 | outdir = osp.join(outdir, weights_filename) 287 | if not os.path.exists(outdir): 288 | os.makedirs(outdir) 289 | return outdir 290 | 291 | 292 | def get_output_tb_dir(imdb, weights_filename): 293 | """Return the directory where tensorflow summaries are placed. 294 | If the directory does not exist, it is created. 295 | 296 | A canonical path is built using the name from an imdb and a network 297 | (if not None). 298 | """ 299 | outdir = osp.abspath(osp.join(__C.ROOT_DIR, 'tensorboard', __C.EXP_DIR, imdb.name)) 300 | if weights_filename is None: 301 | weights_filename = 'default' 302 | outdir = osp.join(outdir, weights_filename) 303 | if not os.path.exists(outdir): 304 | os.makedirs(outdir) 305 | return outdir 306 | 307 | 308 | def _merge_a_into_b(a, b): 309 | """Merge config dictionary a into config dictionary b, clobbering the 310 | options in b whenever they are also specified in a. 311 | """ 312 | if type(a) is not edict: 313 | return 314 | 315 | for k, v in a.items(): 316 | # a must specify keys that are in b 317 | if k not in b: 318 | raise KeyError('{} is not a valid config key'.format(k)) 319 | 320 | # the types must match, too 321 | old_type = type(b[k]) 322 | if old_type is not type(v): 323 | if isinstance(b[k], np.ndarray): 324 | v = np.array(v, dtype=b[k].dtype) 325 | else: 326 | raise ValueError(('Type mismatch ({} vs. 
{}) ' 327 | 'for config key: {}').format(type(b[k]), 328 | type(v), k)) 329 | 330 | # recursively merge dicts 331 | if type(v) is edict: 332 | try: 333 | _merge_a_into_b(a[k], b[k]) 334 | except: 335 | print(('Error under config key: {}'.format(k))) 336 | raise 337 | else: 338 | b[k] = v 339 | 340 | 341 | def cfg_from_file(filename): 342 | """Load a config file and merge it into the default options.""" 343 | import yaml 344 | with open(filename, 'r') as f: 345 | yaml_cfg = edict(yaml.load(f)) 346 | 347 | _merge_a_into_b(yaml_cfg, __C) 348 | 349 | 350 | def cfg_from_list(cfg_list): 351 | """Set config keys via list (e.g., from command line).""" 352 | from ast import literal_eval 353 | assert len(cfg_list) % 2 == 0 354 | for k, v in zip(cfg_list[0::2], cfg_list[1::2]): 355 | key_list = k.split('.') 356 | d = __C 357 | for subkey in key_list[:-1]: 358 | assert subkey in d 359 | d = d[subkey] 360 | subkey = key_list[-1] 361 | assert subkey in d 362 | try: 363 | value = literal_eval(v) 364 | except: 365 | # handle the case when v is a string literal 366 | value = v 367 | assert type(value) == type(d[subkey]), \ 368 | 'type {} does not match original type {}'.format( 369 | type(value), type(d[subkey])) 370 | d[subkey] = value 371 | -------------------------------------------------------------------------------- /lib/datasets/pascal_voc.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Xinlei Chen 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | from datasets.imdb import imdb 13 | import datasets.ds_utils as ds_utils 14 | import xml.etree.ElementTree as ET 15 | import numpy as np 16 | import scipy.sparse 17 | import scipy.io as sio 18 | import utils.cython_bbox 19 | import pickle 20 | import subprocess 21 | import uuid 22 | from .voc_eval import voc_eval 23 | from model.config import cfg 24 | 25 | 26 | class pascal_voc(imdb): 27 | def __init__(self, image_set, year, use_diff=False): 28 | name = 'voc_' + year + '_' + image_set 29 | if use_diff: 30 | name += '_diff' 31 | imdb.__init__(self, name) 32 | self._year = year 33 | self._image_set = image_set 34 | self._devkit_path = self._get_default_path() 35 | self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year) 36 | self._classes = ('__background__', # always index 0 37 | 'text') 38 | self._class_to_ind = dict(list(zip(self.classes, list(range(self.num_classes))))) 39 | self._image_ext = '.jpg' 40 | self._image_index = self._load_image_set_index() 41 | # Default to roidb handler 42 | self._roidb_handler = self.gt_roidb 43 | self._salt = str(uuid.uuid4()) 44 | self._comp_id = 'comp4' 45 | 46 | # PASCAL specific config options 47 | self.config = {'cleanup': True, 48 | 'use_salt': True, 49 | 'use_diff': use_diff, 50 | 'matlab_eval': False, 51 | 'rpn_file': None} 52 | 53 | assert os.path.exists(self._devkit_path), \ 54 | 'VOCdevkit path does not exist: {}'.format(self._devkit_path) 55 | assert os.path.exists(self._data_path), \ 56 | 'Path does not exist: {}'.format(self._data_path) 57 | 58 | def image_path_at(self, i): 59 | """ 60 | Return the absolute path to image i in the image sequence. 
61 | """ 62 | return self.image_path_from_index(self._image_index[i]) 63 | 64 | def image_path_from_index(self, index): 65 | """ 66 | Construct an image path from the image's "index" identifier. 67 | """ 68 | image_path = os.path.join(self._data_path, 'JPEGImages', 69 | index + self._image_ext) 70 | assert os.path.exists(image_path), \ 71 | 'Path does not exist: {}'.format(image_path) 72 | return image_path 73 | 74 | def _load_image_set_index(self): 75 | """ 76 | Load the indexes listed in this dataset's image set file. 77 | """ 78 | # Example path to image set file: 79 | # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt 80 | image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main', 81 | self._image_set + '.txt') 82 | assert os.path.exists(image_set_file), \ 83 | 'Path does not exist: {}'.format(image_set_file) 84 | with open(image_set_file) as f: 85 | image_index = [x.strip() for x in f.readlines()] 86 | return image_index 87 | 88 | def _get_default_path(self): 89 | """ 90 | Return the default path where PASCAL VOC is expected to be installed. 91 | """ 92 | return os.path.join(cfg.DATA_DIR, 'VOCdevkit' + self._year) 93 | 94 | def gt_roidb(self): 95 | """ 96 | Return the database of ground-truth regions of interest. 97 | 98 | This function loads/saves from/to a cache file to speed up future calls. 99 | """ 100 | cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') 101 | if os.path.exists(cache_file): 102 | with open(cache_file, 'rb') as fid: 103 | try: 104 | roidb = pickle.load(fid) 105 | except: 106 | roidb = pickle.load(fid, encoding='bytes') 107 | print('{} gt roidb loaded from {}'.format(self.name, cache_file)) 108 | return roidb 109 | 110 | gt_roidb = [self._load_pascal_annotation(index) 111 | for index in self.image_index] 112 | with open(cache_file, 'wb') as fid: 113 | pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL) 114 | print('wrote gt roidb to {}'.format(cache_file)) 115 | 116 | return gt_roidb 117 | 118 | def rpn_roidb(self): 119 | if int(self._year) == 2007 or self._image_set != 'test': 120 | gt_roidb = self.gt_roidb() 121 | rpn_roidb = self._load_rpn_roidb(gt_roidb) 122 | roidb = imdb.merge_roidbs(gt_roidb, rpn_roidb) 123 | else: 124 | roidb = self._load_rpn_roidb(None) 125 | 126 | return roidb 127 | 128 | def _load_rpn_roidb(self, gt_roidb): 129 | filename = self.config['rpn_file'] 130 | print('loading {}'.format(filename)) 131 | assert os.path.exists(filename), \ 132 | 'rpn data not found at: {}'.format(filename) 133 | with open(filename, 'rb') as f: 134 | box_list = pickle.load(f) 135 | return self.create_roidb_from_box_list(box_list, gt_roidb) 136 | 137 | def _load_pascal_annotation(self, index): 138 | """ 139 | Load image and bounding boxes info from XML file in the PASCAL VOC 140 | format. 
141 | """ 142 | filename = os.path.join(self._data_path, 'Annotations', index + '.xml') 143 | tree = ET.parse(filename) 144 | objs = tree.findall('object') 145 | if not self.config['use_diff']: 146 | # Exclude the samples labeled as difficult 147 | non_diff_objs = [ 148 | obj for obj in objs if int(obj.find('difficult').text) == 0] 149 | # if len(non_diff_objs) != len(objs): 150 | # print 'Removed {} difficult objects'.format( 151 | # len(objs) - len(non_diff_objs)) 152 | objs = non_diff_objs 153 | num_objs = len(objs) 154 | 155 | boxes = np.zeros((num_objs, 4), dtype=np.uint16) 156 | gt_classes = np.zeros((num_objs), dtype=np.int32) 157 | overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) 158 | # "Seg" area for pascal is just the box area 159 | seg_areas = np.zeros((num_objs), dtype=np.float32) 160 | 161 | # Load object bounding boxes into a data frame. 162 | for ix, obj in enumerate(objs): 163 | bbox = obj.find('bndbox') 164 | # Make pixel indexes 0-based 165 | # Origin faster rcnn 166 | # x1 = float(bbox.find('xmin').text) - 1 167 | # y1 = float(bbox.find('ymin').text) - 1 168 | # x2 = float(bbox.find('xmax').text) - 1 169 | # y2 = float(bbox.find('ymax').text) - 1 170 | 171 | x1 = float(bbox.find('xmin').text) 172 | y1 = float(bbox.find('ymin').text) 173 | x2 = float(bbox.find('xmax').text) 174 | y2 = float(bbox.find('ymax').text) 175 | 176 | cls = self._class_to_ind[obj.find('name').text.lower().strip()] 177 | boxes[ix, :] = [x1, y1, x2, y2] 178 | gt_classes[ix] = cls 179 | overlaps[ix, cls] = 1.0 180 | seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1) 181 | 182 | overlaps = scipy.sparse.csr_matrix(overlaps) 183 | 184 | return {'boxes': boxes, 185 | 'gt_classes': gt_classes, 186 | 'gt_overlaps': overlaps, 187 | 'flipped': False, 188 | 'seg_areas': seg_areas} 189 | 190 | def _get_comp_id(self): 191 | comp_id = (self._comp_id + '_' + self._salt if self.config['use_salt'] 192 | else self._comp_id) 193 | return comp_id 194 | 195 | def _get_voc_results_file_template(self): 196 | # VOCdevkit/results/VOC2007/Main/_det_test_aeroplane.txt 197 | filename = self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt' 198 | path = os.path.join( 199 | self._devkit_path, 200 | 'results', 201 | 'VOC' + self._year, 202 | 'Main', 203 | filename) 204 | return path 205 | 206 | def _write_voc_results_file(self, all_boxes): 207 | for cls_ind, cls in enumerate(self.classes): 208 | if cls == '__background__': 209 | continue 210 | print('Writing {} VOC results file'.format(cls)) 211 | filename = self._get_voc_results_file_template().format(cls) 212 | with open(filename, 'wt') as f: 213 | for im_ind, index in enumerate(self.image_index): 214 | dets = all_boxes[cls_ind][im_ind] 215 | if dets == []: 216 | continue 217 | # the VOCdevkit expects 1-based indices 218 | for k in range(dets.shape[0]): 219 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 
220 | format(index, dets[k, -1], 221 | dets[k, 0] + 1, dets[k, 1] + 1, 222 | dets[k, 2] + 1, dets[k, 3] + 1)) 223 | 224 | def _do_python_eval(self, output_dir='output'): 225 | annopath = os.path.join( 226 | self._devkit_path, 227 | 'VOC' + self._year, 228 | 'Annotations', 229 | '{:s}.xml') 230 | imagesetfile = os.path.join( 231 | self._devkit_path, 232 | 'VOC' + self._year, 233 | 'ImageSets', 234 | 'Main', 235 | self._image_set + '.txt') 236 | cachedir = os.path.join(self._devkit_path, 'annotations_cache') 237 | aps = [] 238 | # The PASCAL VOC metric changed in 2010 239 | use_07_metric = True if int(self._year) < 2010 else False 240 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 241 | if not os.path.isdir(output_dir): 242 | os.mkdir(output_dir) 243 | for i, cls in enumerate(self._classes): 244 | if cls == '__background__': 245 | continue 246 | filename = self._get_voc_results_file_template().format(cls) 247 | rec, prec, ap = voc_eval( 248 | filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, 249 | use_07_metric=use_07_metric, use_diff=self.config['use_diff']) 250 | aps += [ap] 251 | print(('AP for {} = {:.4f}'.format(cls, ap))) 252 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 253 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 254 | print(('Mean AP = {:.4f}'.format(np.mean(aps)))) 255 | print('~~~~~~~~') 256 | print('Results:') 257 | for ap in aps: 258 | print(('{:.3f}'.format(ap))) 259 | print(('{:.3f}'.format(np.mean(aps)))) 260 | print('~~~~~~~~') 261 | print('') 262 | print('--------------------------------------------------------------') 263 | print('Results computed with the **unofficial** Python eval code.') 264 | print('Results should be very close to the official MATLAB eval code.') 265 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 266 | print('-- Thanks, The Management') 267 | print('--------------------------------------------------------------') 268 | 269 | def _do_matlab_eval(self, output_dir='output'): 270 | print('-----------------------------------------------------') 271 | print('Computing results with the official MATLAB eval code.') 272 | print('-----------------------------------------------------') 273 | path = os.path.join(cfg.ROOT_DIR, 'lib', 'datasets', 274 | 'VOCdevkit-matlab-wrapper') 275 | cmd = 'cd {} && '.format(path) 276 | cmd += '{:s} -nodisplay -nodesktop '.format(cfg.MATLAB) 277 | cmd += '-r "dbstop if error; ' 278 | cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"' \ 279 | .format(self._devkit_path, self._get_comp_id(), 280 | self._image_set, output_dir) 281 | print(('Running:\n{}'.format(cmd))) 282 | status = subprocess.call(cmd, shell=True) 283 | 284 | def evaluate_detections(self, all_boxes, output_dir): 285 | self._write_voc_results_file(all_boxes) 286 | self._do_python_eval(output_dir) 287 | if self.config['matlab_eval']: 288 | self._do_matlab_eval(output_dir) 289 | if self.config['cleanup']: 290 | for cls in self._classes: 291 | if cls == '__background__': 292 | continue 293 | filename = self._get_voc_results_file_template().format(cls) 294 | os.remove(filename) 295 | 296 | def competition_mode(self, on): 297 | if on: 298 | self.config['use_salt'] = False 299 | self.config['cleanup'] = False 300 | else: 301 | self.config['use_salt'] = True 302 | self.config['cleanup'] = True 303 | 304 | 305 | if __name__ == '__main__': 306 | from datasets.pascal_voc import pascal_voc 307 | 308 | d = pascal_voc('trainval', '2007') 309 | res = d.roidb 310 | from IPython 
import embed; 311 | 312 | embed() 313 | -------------------------------------------------------------------------------- /lib/nets/mobilenet/conv_blocks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Convolution blocks for mobilenet.""" 16 | import contextlib 17 | import functools 18 | 19 | import tensorflow as tf 20 | 21 | slim = tf.contrib.slim 22 | 23 | 24 | def _fixed_padding(inputs, kernel_size, rate=1): 25 | """Pads the input along the spatial dimensions independently of input size. 26 | Pads the input such that if it was used in a convolution with 'VALID' padding, 27 | the output would have the same dimensions as if the unpadded input was used 28 | in a convolution with 'SAME' padding. 29 | Args: 30 | inputs: A tensor of size [batch, height_in, width_in, channels]. 31 | kernel_size: The kernel to be used in the conv2d or max_pool2d operation. 32 | rate: An integer, rate for atrous convolution. 33 | Returns: 34 | output: A tensor of size [batch, height_out, width_out, channels] with the 35 | input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). 36 | """ 37 | kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), 38 | kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] 39 | pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] 40 | pad_beg = [pad_total[0] // 2, pad_total[1] // 2] 41 | pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] 42 | padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], 43 | [pad_beg[1], pad_end[1]], [0, 0]]) 44 | return padded_inputs 45 | 46 | 47 | def _make_divisible(v, divisor, min_value=None): 48 | if min_value is None: 49 | min_value = divisor 50 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) 51 | # Make sure that round down does not go down by more than 10%. 52 | if new_v < 0.9 * v: 53 | new_v += divisor 54 | return new_v 55 | 56 | 57 | def _split_divisible(num, num_ways, divisible_by=8): 58 | """Evenly splits num, num_ways so each piece is a multiple of divisible_by.""" 59 | assert num % divisible_by == 0 60 | assert num / num_ways >= divisible_by 61 | # Note: want to round down, we adjust each split to match the total. 62 | base = num // num_ways // divisible_by * divisible_by 63 | result = [] 64 | accumulated = 0 65 | for i in range(num_ways): 66 | r = base 67 | while accumulated + r < num * (i + 1) / num_ways: 68 | r += divisible_by 69 | result.append(r) 70 | accumulated += r 71 | assert accumulated == num 72 | return result 73 | 74 | 75 | @contextlib.contextmanager 76 | def _v1_compatible_scope_naming(scope): 77 | if scope is None: # Create uniqified separable blocks. 
78 | with tf.variable_scope(None, default_name='separable') as s, \ 79 | tf.name_scope(s.original_name_scope): 80 | yield '' 81 | else: 82 | # We use scope_depthwise, scope_pointwise for compatibility with V1 ckpts. 83 | # which provide numbered scopes. 84 | scope += '_' 85 | yield scope 86 | 87 | 88 | @slim.add_arg_scope 89 | def split_separable_conv2d(input_tensor, 90 | num_outputs, 91 | scope=None, 92 | normalizer_fn=None, 93 | stride=1, 94 | rate=1, 95 | endpoints=None, 96 | use_explicit_padding=False): 97 | """Separable mobilenet V1 style convolution. 98 | Depthwise convolution, with default non-linearity, 99 | followed by 1x1 depthwise convolution. This is similar to 100 | slim.separable_conv2d, but differs in tha it applies batch 101 | normalization and non-linearity to depthwise. This matches 102 | the basic building of Mobilenet Paper 103 | (https://arxiv.org/abs/1704.04861) 104 | Args: 105 | input_tensor: input 106 | num_outputs: number of outputs 107 | scope: optional name of the scope. Note if provided it will use 108 | scope_depthwise for deptwhise, and scope_pointwise for pointwise. 109 | normalizer_fn: which normalizer function to use for depthwise/pointwise 110 | stride: stride 111 | rate: output rate (also known as dilation rate) 112 | endpoints: optional, if provided, will export additional tensors to it. 113 | use_explicit_padding: Use 'VALID' padding for convolutions, but prepad 114 | inputs so that the output dimensions are the same as if 'SAME' padding 115 | were used. 116 | Returns: 117 | output tesnor 118 | """ 119 | 120 | with _v1_compatible_scope_naming(scope) as scope: 121 | dw_scope = scope + 'depthwise' 122 | endpoints = endpoints if endpoints is not None else {} 123 | kernel_size = [3, 3] 124 | padding = 'SAME' 125 | if use_explicit_padding: 126 | padding = 'VALID' 127 | input_tensor = _fixed_padding(input_tensor, kernel_size, rate) 128 | net = slim.separable_conv2d( 129 | input_tensor, 130 | None, 131 | kernel_size, 132 | depth_multiplier=1, 133 | stride=stride, 134 | rate=rate, 135 | normalizer_fn=normalizer_fn, 136 | padding=padding, 137 | scope=dw_scope) 138 | 139 | endpoints[dw_scope] = net 140 | 141 | pw_scope = scope + 'pointwise' 142 | net = slim.conv2d( 143 | net, 144 | num_outputs, [1, 1], 145 | stride=1, 146 | normalizer_fn=normalizer_fn, 147 | scope=pw_scope) 148 | endpoints[pw_scope] = net 149 | return net 150 | 151 | 152 | def expand_input_by_factor(n, divisible_by=8): 153 | return lambda num_inputs, **_: _make_divisible(num_inputs * n, divisible_by) 154 | 155 | 156 | @slim.add_arg_scope 157 | def expanded_conv(input_tensor, 158 | num_outputs, 159 | expansion_size=expand_input_by_factor(6), 160 | stride=1, 161 | rate=1, 162 | kernel_size=(3, 3), 163 | residual=True, 164 | normalizer_fn=None, 165 | split_projection=1, 166 | split_expansion=1, 167 | expansion_transform=None, 168 | depthwise_location='expansion', 169 | depthwise_channel_multiplier=1, 170 | endpoints=None, 171 | use_explicit_padding=False, 172 | padding='SAME', 173 | scope=None): 174 | """Depthwise Convolution Block with expansion. 175 | Builds a composite convolution that has the following structure 176 | expansion (1x1) -> depthwise (kernel_size) -> projection (1x1) 177 | Args: 178 | input_tensor: input 179 | num_outputs: number of outputs in the final layer. 180 | expansion_size: the size of expansion, could be a constant or a callable. 181 | If latter it will be provided 'num_inputs' as an input. For forward 182 | compatibility it should accept arbitrary keyword arguments. 
183 | Default will expand the input by factor of 6. 184 | stride: depthwise stride 185 | rate: depthwise rate 186 | kernel_size: depthwise kernel 187 | residual: whether to include residual connection between input 188 | and output. 189 | normalizer_fn: batchnorm or otherwise 190 | split_projection: how many ways to split projection operator 191 | (that is conv expansion->bottleneck) 192 | split_expansion: how many ways to split expansion op 193 | (that is conv bottleneck->expansion) ops will keep depth divisible 194 | by this value. 195 | expansion_transform: Optional function that takes expansion 196 | as a single input and returns output. 197 | depthwise_location: where to put depthwise covnvolutions supported 198 | values None, 'input', 'output', 'expansion' 199 | depthwise_channel_multiplier: depthwise channel multiplier: 200 | each input will replicated (with different filters) 201 | that many times. So if input had c channels, 202 | output will have c x depthwise_channel_multpilier. 203 | endpoints: An optional dictionary into which intermediate endpoints are 204 | placed. The keys "expansion_output", "depthwise_output", 205 | "projection_output" and "expansion_transform" are always populated, even 206 | if the corresponding functions are not invoked. 207 | use_explicit_padding: Use 'VALID' padding for convolutions, but prepad 208 | inputs so that the output dimensions are the same as if 'SAME' padding 209 | were used. 210 | padding: Padding type to use if `use_explicit_padding` is not set. 211 | scope: optional scope. 212 | Returns: 213 | Tensor of depth num_outputs 214 | Raises: 215 | TypeError: on inval 216 | """ 217 | with tf.variable_scope(scope, default_name='expanded_conv') as s, \ 218 | tf.name_scope(s.original_name_scope): 219 | prev_depth = input_tensor.get_shape().as_list()[3] 220 | if depthwise_location not in [None, 'input', 'output', 'expansion']: 221 | raise TypeError('%r is unknown value for depthwise_location' % 222 | depthwise_location) 223 | if use_explicit_padding: 224 | if padding != 'SAME': 225 | raise TypeError('`use_explicit_padding` should only be used with ' 226 | '"SAME" padding.') 227 | padding = 'VALID' 228 | depthwise_func = functools.partial( 229 | slim.separable_conv2d, 230 | num_outputs=None, 231 | kernel_size=kernel_size, 232 | depth_multiplier=depthwise_channel_multiplier, 233 | stride=stride, 234 | rate=rate, 235 | normalizer_fn=normalizer_fn, 236 | padding=padding, 237 | scope='depthwise') 238 | # b1 -> b2 * r -> b2 239 | # i -> (o * r) (bottleneck) -> o 240 | input_tensor = tf.identity(input_tensor, 'input') 241 | net = input_tensor 242 | 243 | if depthwise_location == 'input': 244 | if use_explicit_padding: 245 | net = _fixed_padding(net, kernel_size, rate) 246 | net = depthwise_func(net, activation_fn=None) 247 | 248 | if callable(expansion_size): 249 | inner_size = expansion_size(num_inputs=prev_depth) 250 | else: 251 | inner_size = expansion_size 252 | 253 | if inner_size > net.shape[3]: 254 | net = split_conv( 255 | net, 256 | inner_size, 257 | num_ways=split_expansion, 258 | scope='expand', 259 | stride=1, 260 | normalizer_fn=normalizer_fn) 261 | net = tf.identity(net, 'expansion_output') 262 | if endpoints is not None: 263 | endpoints['expansion_output'] = net 264 | 265 | if depthwise_location == 'expansion': 266 | if use_explicit_padding: 267 | net = _fixed_padding(net, kernel_size, rate) 268 | net = depthwise_func(net) 269 | 270 | net = tf.identity(net, name='depthwise_output') 271 | if endpoints is not None: 272 | 
endpoints['depthwise_output'] = net 273 | if expansion_transform: 274 | net = expansion_transform(expansion_tensor=net, input_tensor=input_tensor) 275 | # Note in contrast with expansion, we always have 276 | # projection to produce the desired output size. 277 | net = split_conv( 278 | net, 279 | num_outputs, 280 | num_ways=split_projection, 281 | stride=1, 282 | scope='project', 283 | normalizer_fn=normalizer_fn, 284 | activation_fn=tf.identity) 285 | if endpoints is not None: 286 | endpoints['projection_output'] = net 287 | if depthwise_location == 'output': 288 | if use_explicit_padding: 289 | net = _fixed_padding(net, kernel_size, rate) 290 | net = depthwise_func(net, activation_fn=None) 291 | 292 | if callable(residual): # custom residual 293 | net = residual(input_tensor=input_tensor, output_tensor=net) 294 | elif (residual and 295 | # stride check enforces that we don't add residuals when spatial 296 | # dimensions are None 297 | stride == 1 and 298 | # Depth matches 299 | net.get_shape().as_list()[3] == 300 | input_tensor.get_shape().as_list()[3]): 301 | net += input_tensor 302 | return tf.identity(net, name='output') 303 | 304 | 305 | def split_conv(input_tensor, 306 | num_outputs, 307 | num_ways, 308 | scope, 309 | divisible_by=8, 310 | **kwargs): 311 | """Creates a split convolution. 312 | Split convolution splits the input and output into 313 | 'num_blocks' blocks of approximately the same size each, 314 | and only connects $i$-th input to $i$ output. 315 | Args: 316 | input_tensor: input tensor 317 | num_outputs: number of output filters 318 | num_ways: num blocks to split by. 319 | scope: scope for all the operators. 320 | divisible_by: make sure that every part is divisiable by this. 321 | **kwargs: will be passed directly into conv2d operator 322 | Returns: 323 | tensor 324 | """ 325 | b = input_tensor.get_shape().as_list()[3] 326 | 327 | if num_ways == 1 or min(b // num_ways, 328 | num_outputs // num_ways) < divisible_by: 329 | # Don't do any splitting if we end up with less than 8 filters 330 | # on either side. 331 | return slim.conv2d(input_tensor, num_outputs, [1, 1], scope=scope, **kwargs) 332 | 333 | outs = [] 334 | input_splits = _split_divisible(b, num_ways, divisible_by=divisible_by) 335 | output_splits = _split_divisible( 336 | num_outputs, num_ways, divisible_by=divisible_by) 337 | inputs = tf.split(input_tensor, input_splits, axis=3, name='split_' + scope) 338 | base = scope 339 | for i, (input_tensor, out_size) in enumerate(zip(inputs, output_splits)): 340 | scope = base + '_part_%d' % (i,) 341 | n = slim.conv2d(input_tensor, out_size, [1, 1], scope=scope, **kwargs) 342 | n = tf.identity(n, scope + '_output') 343 | outs.append(n) 344 | return tf.concat(outs, 3, name=scope + '_concat') 345 | --------------------------------------------------------------------------------
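As a brief, hedged illustration of how the configuration machinery defined in lib/model/config.py above is normally driven, the following minimal sketch (not part of the repository; it assumes the lib/ directory has already been added to sys.path) overrides two training defaults with cfg_from_list() and reads them back through the shared cfg object:

    # Minimal sketch, assuming lib/ is on sys.path so that `model.config` imports.
    from model.config import cfg, cfg_from_list

    # Each key must already exist in the default config, and the new value must
    # keep the original type; otherwise cfg_from_list() fails its assertions.
    cfg_from_list(['TRAIN.LEARNING_RATE', '0.0005',
                   'TRAIN.SNAPSHOT_ITERS', '10000'])

    print(cfg.TRAIN.LEARNING_RATE)   # 0.0005
    print(cfg.CTPN.NUM_ANCHORS)      # 10 (unchanged default)

A YAML config file can be merged the same way with cfg_from_file(), which applies _merge_a_into_b() recursively and rejects any key or type that does not match the defaults.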