├── data ├── result │ └── .keep ├── ICDAR_submit │ └── .keep ├── pretrained_model │ └── .keep ├── demo │ └── 007.jpg └── cfgs │ ├── mobile.yml │ ├── res50.yml │ ├── squeeze.yml │ ├── vgg16.yml │ └── res101.yml ├── lib ├── nms │ ├── .gitignore │ ├── __init__.py │ ├── gpu_nms.hpp │ ├── py_cpu_nms.py │ ├── gpu_nms.pyx │ ├── cpu_nms.pyx │ └── nms_kernel.cu ├── nets │ ├── __init__.py │ ├── mobilenet │ │ ├── __init__.py │ │ ├── mobilenet_v2.py │ │ └── conv_blocks.py │ ├── vgg16.py │ ├── mobilenet_v2.py │ ├── squeezenet.py │ └── resnet_v1.py ├── layer_utils │ ├── __init__.py │ ├── proposal_top_layer.py │ ├── proposal_layer.py │ ├── generate_anchors.py │ └── anchor_target_layer.py ├── model │ ├── __init__.py │ ├── nms_wrapper.py │ ├── bbox_transform.py │ ├── test.py │ └── config.py ├── utils │ ├── .gitignore │ ├── common.py │ ├── helper.py │ ├── __init__.py │ ├── timer.py │ ├── blob.py │ ├── bbox.pyx │ └── visualization.py ├── text_connector │ ├── __init__.py │ ├── text_connect_cfg.py │ ├── other.py │ ├── detectors.py │ ├── text_proposal_connector.py │ ├── text_proposal_graph_builder.py │ └── text_proposal_connector_oriented.py ├── Makefile ├── datasets │ ├── __init__.py │ ├── factory.py │ ├── ds_utils.py │ ├── voc_eval.py │ ├── imdb.py │ └── pascal_voc.py ├── roi_data_layer │ ├── __init__.py │ ├── roidb.py │ ├── minibatch.py │ └── layer.py ├── setup_cpu_win.py ├── setup_cpu.py └── setup.py ├── tools ├── ICDAR15 │ ├── __init__.py │ └── readme.txt ├── _init_paths.py ├── ICDAR13 │ └── readme.txt ├── ICDAR13_Det │ └── readme.txt ├── anchor_drawer.py ├── convert_utils.py ├── freeze_graph.py ├── icdar13_split_label.py ├── icdar13_to_voc.py ├── trainval_net.py ├── icdar.py └── demo.py ├── requirements.txt ├── .gitignore ├── main.py ├── LICENSE └── README.md /data/result/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/nms/.gitignore: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/ICDAR_submit/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/nets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/nms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/pretrained_model/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/layer_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/ICDAR15/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/nets/mobilenet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/model/__init__.py: 
-------------------------------------------------------------------------------- 1 | from . import config 2 | -------------------------------------------------------------------------------- /lib/utils/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp 3 | *.h 4 | *.hpp 5 | -------------------------------------------------------------------------------- /data/demo/007.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sanster/tf_ctpn/HEAD/data/demo/007.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython 2 | numpy 3 | opencv-python 4 | tensorflow-gpu 5 | easydict 6 | -------------------------------------------------------------------------------- /lib/text_connector/__init__.py: -------------------------------------------------------------------------------- 1 | from .detectors import TextDetector 2 | from .text_connect_cfg import Config 3 | -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python3 setup.py build_ext --inplace 3 | rm -rf build 4 | clean: 5 | rm -rf */*.pyc 6 | rm -rf */*.so 7 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /lib/utils/common.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def check_dir(dir_path): 4 | """ 5 | create dir if dir not exist 6 | """ 7 | if not os.path.exists(dir_path): 8 | os.makedirs(dir_path) 9 | -------------------------------------------------------------------------------- /lib/utils/helper.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | 4 | def read_rgb_img(img_file_path): 5 | bgr = cv2.imread(img_file_path) 6 | rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB) 7 | return rgb 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | *.pyc 3 | output 4 | tensorboard 5 | lib/build 6 | .idea 7 | data/demo/* 8 | data/VOCdevkit2007 9 | data/cache 10 | data/pretrained_model/*.ckpt 11 | data/result/* 12 | !data/result/.keep 13 | submit.zip 14 | *.zip 15 | model/ 16 | -------------------------------------------------------------------------------- /lib/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # 
-------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/roi_data_layer/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /tools/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import sys 3 | 4 | def add_path(path): 5 | if path not in sys.path: 6 | sys.path.insert(0, path) 7 | 8 | this_dir = osp.dirname(__file__) 9 | 10 | # Add lib to PYTHONPATH 11 | lib_path = osp.join(this_dir, '..', 'lib') 12 | add_path(lib_path) 13 | -------------------------------------------------------------------------------- /data/cfgs/mobile.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: mobile 2 | TRAIN: 3 | IMS_PER_BATCH: 1 4 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 5 | RPN_POSITIVE_OVERLAP: 0.7 6 | RPN_BATCHSIZE: 256 7 | PROPOSAL_METHOD: gt 8 | BG_THRESH_LO: 0.0 9 | DISPLAY: 20 10 | DOUBLE_BIAS: False 11 | SNAPSHOT_PREFIX: mobile_ctpn 12 | LEARNING_RATE: 0.00001 13 | -------------------------------------------------------------------------------- /data/cfgs/res50.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: res50 2 | TRAIN: 3 | IMS_PER_BATCH: 1 4 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 5 | RPN_POSITIVE_OVERLAP: 0.7 6 | RPN_BATCHSIZE: 256 7 | PROPOSAL_METHOD: gt 8 | BG_THRESH_LO: 0.0 9 | DISPLAY: 20 10 | DOUBLE_BIAS: False 11 | SNAPSHOT_PREFIX: res50_ctpn 12 | LEARNING_RATE: 0.0001 13 | -------------------------------------------------------------------------------- /data/cfgs/squeeze.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: squeeze 2 | TRAIN: 3 | IMS_PER_BATCH: 1 4 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 5 | RPN_POSITIVE_OVERLAP: 0.7 6 | RPN_BATCHSIZE: 256 7 | PROPOSAL_METHOD: gt 8 | BG_THRESH_LO: 0.0 9 | DISPLAY: 20 10 | DOUBLE_BIAS: False 11 | SNAPSHOT_PREFIX: squeeze_ctpn 12 | LEARNING_RATE: 0.001 13 | -------------------------------------------------------------------------------- /data/cfgs/vgg16.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: vgg16 2 | TRAIN: 3 | USE_FLIPPED: False 4 | IMS_PER_BATCH: 1 5 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 6 | RPN_POSITIVE_OVERLAP: 0.7 7 | RPN_BATCHSIZE: 128 8 | PROPOSAL_METHOD: gt 9 | BG_THRESH_LO: 0.0 10 | DISPLAY: 20 11 | DOUBLE_BIAS: False 12 | SNAPSHOT_PREFIX: vgg16_ctpn 13 | LEARNING_RATE: 0.00001 14 | -------------------------------------------------------------------------------- /data/cfgs/res101.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: res101 2 | TRAIN: 3 | USE_FLIPPED: False 4 | LEARNING_RATE: 0.00001 5 | IMS_PER_BATCH: 1 6 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 7 | RPN_POSITIVE_OVERLAP: 0.7 8 | RPN_BATCHSIZE: 
128 9 | PROPOSAL_METHOD: gt 10 | BG_THRESH_LO: 0.0 11 | DISPLAY: 20 12 | DOUBLE_BIAS: False 13 | SNAPSHOT_PREFIX: res101_ctpn 14 | -------------------------------------------------------------------------------- /lib/text_connector/text_connect_cfg.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | SCALE = 600 3 | MAX_SCALE = 1200 4 | TEXT_PROPOSALS_WIDTH = 16 5 | MIN_NUM_PROPOSALS = 2 6 | MIN_RATIO = 0.5 7 | LINE_MIN_SCORE = 0.9 8 | MAX_HORIZONTAL_GAP = 50 9 | TEXT_PROPOSALS_MIN_SCORE = 0.7 10 | TEXT_PROPOSALS_NMS_THRESH = 0.2 11 | MIN_V_OVERLAPS = 0.7 12 | MIN_SIZE_SIM = 0.7 13 | -------------------------------------------------------------------------------- /lib/model/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | from model.config import cfg 12 | 13 | 14 | def nms(dets, thresh, force_cpu=False): 15 | """Dispatch to either CPU or GPU NMS implementations.""" 16 | 17 | if dets.shape[0] == 0: 18 | return [] 19 | if cfg.USE_GPU_NMS and not force_cpu: 20 | from nms.gpu_nms import gpu_nms 21 | return gpu_nms(dets, thresh, device_id=0) 22 | else: 23 | from nms.cpu_nms import cpu_nms 24 | return cpu_nms(dets, thresh) 25 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | # checkpoint_dir = "/home/cwq/data/model/slim/mobilenetv2" 4 | checkpoint_dir = "/home/cwq/data/checkpoint/tf_crnn/simple_no_lstm_more_bg" 5 | print("Restoring checkpoint from: " + checkpoint_dir) 6 | 7 | ckpt = tf.train.latest_checkpoint(checkpoint_dir) 8 | if ckpt is None: 9 | print("Checkpoint not found") 10 | exit(-1) 11 | 12 | meta_file = ckpt + '.meta' 13 | 14 | print('Restore variables from {}'.format(ckpt)) 15 | print('Restore meta_file from {}'.format(meta_file)) 16 | 17 | config = tf.ConfigProto(allow_soft_placement=True) 18 | with tf.Session(config=config) as sess: 19 | saver = tf.train.import_meta_graph(meta_file) 20 | saver.restore(sess, ckpt) 21 | 22 | input_graph_def = tf.get_default_graph().as_graph_def() 23 | 24 | # Print all node name in graph 25 | for node in input_graph_def.node: 26 | print(node.name) 27 | -------------------------------------------------------------------------------- /lib/text_connector/other.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def threshold(coords, min_, max_): 5 | return np.maximum(np.minimum(coords, max_), min_) 6 | 7 | def clip_boxes(boxes, im_shape): 8 | """ 9 | Clip boxes to image boundaries. 
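    Here im_shape is (height, width): even-indexed columns (x coordinates) are clamped to [0, width - 1] and odd-indexed columns (y coordinates) to [0, height - 1]; boxes is modified in place and returned.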
10 | """ 11 | boxes[:, 0::2]=threshold(boxes[:, 0::2], 0, im_shape[1]-1) 12 | boxes[:, 1::2]=threshold(boxes[:, 1::2], 0, im_shape[0]-1) 13 | return boxes 14 | 15 | 16 | class Graph: 17 | def __init__(self, graph): 18 | self.graph=graph 19 | 20 | def sub_graphs_connected(self): 21 | sub_graphs=[] 22 | for index in range(self.graph.shape[0]): 23 | if not self.graph[:, index].any() and self.graph[index, :].any(): 24 | v=index 25 | sub_graphs.append([v]) 26 | while self.graph[v, :].any(): 27 | v=np.where(self.graph[v, :])[0][0] 28 | sub_graphs[-1].append(v) 29 | return sub_graphs 30 | 31 | -------------------------------------------------------------------------------- /lib/utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | class Timer(object): 11 | """A simple timer.""" 12 | def __init__(self): 13 | self.total_time = 0. 14 | self.calls = 0 15 | self.start_time = 0. 16 | self.diff = 0. 17 | self.average_time = 0. 18 | 19 | def tic(self): 20 | # using time.time instead of time.clock because time time.clock 21 | # does not normalize for multithreading 22 | self.start_time = time.time() 23 | 24 | def toc(self, average=True): 25 | self.diff = time.time() - self.start_time 26 | self.total_time += self.diff 27 | self.calls += 1 28 | self.average_time = self.total_time / self.calls 29 | if average: 30 | return self.average_time 31 | else: 32 | return self.diff 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Xinlei Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lib/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /lib/datasets/factory.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Factory method for easily getting imdbs by name.""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | __sets = {} 14 | from datasets.pascal_voc import pascal_voc 15 | 16 | # Set up voc__ 17 | for year in ['2007', '2012']: 18 | for split in ['train', 'val', 'trainval', 'test']: 19 | name = 'voc_{}_{}'.format(year, split) 20 | __sets[name] = (lambda split=split, 
year=year: pascal_voc(split, year)) 21 | 22 | for year in ['2007', '2012']: 23 | for split in ['train', 'val', 'trainval', 'test']: 24 | name = 'voc_{}_{}_diff'.format(year, split) 25 | __sets[name] = (lambda split=split, year=year: pascal_voc(split, year, use_diff=True)) 26 | 27 | 28 | def get_imdb(name): 29 | """Get an imdb (image database) by name.""" 30 | if name not in __sets: 31 | raise KeyError('Unknown dataset: {}'.format(name)) 32 | return __sets[name]() 33 | 34 | 35 | def list_imdbs(): 36 | """List all registered imdbs.""" 37 | return list(__sets.keys()) 38 | -------------------------------------------------------------------------------- /tools/ICDAR13/readme.txt: -------------------------------------------------------------------------------- 1 | INSTRUCTIONS FOR THE STANDALONE SCRIPTS 2 | Requirements: 3 | - Python version 2.7. 4 | - Each Task requires different Python modules. When running the script, if some module is not installed you will see a notification and installation instructions. 5 | 6 | Procedure: 7 | Download the ZIP file for the requested script and unzip it to a directory. 8 | 9 | Open a terminal in the directory and run the command: 10 | python script.py –g=gt.zip –s=submit.zip 11 | 12 | If you have already installed all the required modules, then you will see the method’s results or an error message if the submitted file is not correct. 13 | 14 | parameters: 15 | -g: Path of the Ground Truth file. In most cases, the Ground Truth will be included in the same Zip file named 'gt.zip', gt.txt' or 'gt.json'. If not, you will be able to get it on the Downloads page of the Task. 16 | -s: Path of your method's results file. 17 | 18 | Optional parameters: 19 | -o: Path to a directory where to copy the file ‘results.zip’ that contains per-sample results. 20 | -p: JSON string parameters to override the script default parameters. The parameters that can be overrided are inside the function 'default_evaluation_params' located at the begining of the evaluation Script. 21 | 22 | Example: python script.py –g=gt.zip –s=submit.zip –o=./ -p='{" IOU_CONSTRAINT" = 0.8}' -------------------------------------------------------------------------------- /tools/ICDAR13_Det/readme.txt: -------------------------------------------------------------------------------- 1 | INSTRUCTIONS FOR THE STANDALONE SCRIPTS 2 | Requirements: 3 | - Python version 2.7. 4 | - Each Task requires different Python modules. When running the script, if some module is not installed you will see a notification and installation instructions. 5 | 6 | Procedure: 7 | Download the ZIP file for the requested script and unzip it to a directory. 8 | 9 | Open a terminal in the directory and run the command: 10 | python script.py –g=gt.zip –s=submit.zip 11 | 12 | If you have already installed all the required modules, then you will see the method’s results or an error message if the submitted file is not correct. 13 | 14 | parameters: 15 | -g: Path of the Ground Truth file. In most cases, the Ground Truth will be included in the same Zip file named 'gt.zip', gt.txt' or 'gt.json'. If not, you will be able to get it on the Downloads page of the Task. 16 | -s: Path of your method's results file. 17 | 18 | Optional parameters: 19 | -o: Path to a directory where to copy the file ‘results.zip’ that contains per-sample results. 20 | -p: JSON string parameters to override the script default parameters. 
The parameters that can be overrided are inside the function 'default_evaluation_params' located at the begining of the evaluation Script. 21 | 22 | Example: python script.py –g=gt.zip –s=submit.zip –o=./ -p='{" IOU_CONSTRAINT" = 0.8}' -------------------------------------------------------------------------------- /tools/ICDAR15/readme.txt: -------------------------------------------------------------------------------- 1 | INSTRUCTIONS FOR THE STANDALONE SCRIPTS 2 | Requirements: 3 | - Python version 2.7. 4 | - Each Task requires different Python modules. When running the script, if some module is not installed you will see a notification and installation instructions. 5 | 6 | Procedure: 7 | Download the ZIP file for the requested script and unzip it to a directory. 8 | 9 | Open a terminal in the directory and run the command: 10 | python script.py –g=gt.zip –s=submit.zip 11 | 12 | If you have already installed all the required modules, then you will see the method’s results or an error message if the submitted file is not correct. 13 | 14 | parameters: 15 | -g: Path of the Ground Truth file. In most cases, the Ground Truth will be included in the same Zip file named 'gt.zip', gt.txt' or 'gt.json'. If not, you will be able to get it on the Downloads page of the Task. 16 | -s: Path of your method's results file. 17 | 18 | Optional parameters: 19 | -o: Path to a directory where to copy the file ‘results.zip’ that contains per-sample results. 20 | -p: JSON string parameters to override the script default parameters. The parameters that can be overrided are inside the function 'default_evaluation_params' located at the begining of the evaluation Script. 21 | 22 | Example: python script.py –g=gt.zip –s=submit.zip –o=./ -p='{" IOU_CONSTRAINT" = 0.8}' -------------------------------------------------------------------------------- /lib/setup_cpu_win.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join as pjoin 3 | import numpy as np 4 | from distutils.core import setup 5 | from distutils.extension import Extension 6 | from Cython.Distutils import build_ext 7 | 8 | def find_in_path(name, path): 9 | "Find a file in a search path" 10 | #adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 11 | for dir in path.split(os.pathsep): 12 | binpath = pjoin(dir, name) 13 | if os.path.exists(binpath): 14 | return os.path.abspath(binpath) 15 | return None 16 | 17 | # Obtain the numpy include directory. This logic works across numpy versions. 
18 | try: 19 | numpy_include = np.get_include() 20 | except AttributeError: 21 | numpy_include = np.get_numpy_include() 22 | 23 | class custom_build_ext(build_ext): 24 | def build_extensions(self): 25 | build_ext.build_extensions(self) 26 | 27 | ext_modules = [ 28 | Extension( 29 | "utils.cython_bbox", 30 | ["utils/bbox.pyx"], 31 | include_dirs = [numpy_include] 32 | ), 33 | Extension( 34 | "nms.cpu_nms", 35 | ["nms/cpu_nms.pyx"], 36 | include_dirs = [numpy_include] 37 | ) 38 | ] 39 | 40 | setup( 41 | name='tf_faster_rcnn', 42 | ext_modules=ext_modules, 43 | # inject our custom trigger 44 | cmdclass={'build_ext': custom_build_ext}, 45 | ) 46 | -------------------------------------------------------------------------------- /lib/datasets/ds_utils.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Ross Girshick 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | 12 | 13 | def unique_boxes(boxes, scale=1.0): 14 | """Return indices of unique boxes.""" 15 | v = np.array([1, 1e3, 1e6, 1e9]) 16 | hashes = np.round(boxes * scale).dot(v) 17 | _, index = np.unique(hashes, return_index=True) 18 | return np.sort(index) 19 | 20 | 21 | def xywh_to_xyxy(boxes): 22 | """Convert [x y w h] box format to [x1 y1 x2 y2] format.""" 23 | return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1)) 24 | 25 | 26 | def xyxy_to_xywh(boxes): 27 | """Convert [x1 y1 x2 y2] box format to [x y w h] format.""" 28 | return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1)) 29 | 30 | 31 | def validate_boxes(boxes, width=0, height=0): 32 | """Check that a set of boxes are valid.""" 33 | x1 = boxes[:, 0] 34 | y1 = boxes[:, 1] 35 | x2 = boxes[:, 2] 36 | y2 = boxes[:, 3] 37 | assert (x1 >= 0).all() 38 | assert (y1 >= 0).all() 39 | assert (x2 >= x1).all() 40 | assert (y2 >= y1).all() 41 | assert (x2 < width).all() 42 | assert (y2 < height).all() 43 | 44 | 45 | def filter_small_boxes(boxes, min_size): 46 | w = boxes[:, 2] - boxes[:, 0] 47 | h = boxes[:, 3] - boxes[:, 1] 48 | keep = np.where((w >= min_size) & (h > min_size))[0] 49 | return keep 50 | -------------------------------------------------------------------------------- /lib/utils/blob.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Blob helper functions.""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import numpy as np 14 | import cv2 15 | 16 | 17 | def im_list_to_blob(ims): 18 | """Convert a list of images into a network input. 19 | 20 | Assumes images are already prepared (means subtracted, BGR order, ...). 
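    The returned blob has shape (num_images, max_height, max_width, 3), where the max is taken over the input list; smaller images are zero-padded at the bottom and right. For example, inputs of shapes (300, 400, 3) and (320, 380, 3) produce a blob of shape (2, 320, 400, 3).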
21 | """ 22 | max_shape = np.array([im.shape for im in ims]).max(axis=0) 23 | num_images = len(ims) 24 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 25 | dtype=np.float32) 26 | for i in range(num_images): 27 | im = ims[i] 28 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 29 | 30 | return blob 31 | 32 | 33 | def prep_im_for_blob(im, pixel_means, target_size, max_size): 34 | """Mean subtract and scale an image for use in a blob.""" 35 | im = im.astype(np.float32, copy=False) 36 | im -= pixel_means 37 | im_shape = im.shape 38 | im_size_min = np.min(im_shape[0:2]) 39 | im_size_max = np.max(im_shape[0:2]) 40 | im_scale = float(target_size) / float(im_size_min) 41 | # Prevent the biggest axis from being more than MAX_SIZE 42 | if np.round(im_scale * im_size_max) > max_size: 43 | im_scale = float(max_size) / float(im_size_max) 44 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 45 | interpolation=cv2.INTER_LINEAR) 46 | 47 | return im, im_scale 48 | -------------------------------------------------------------------------------- /lib/utils/bbox.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps 56 | 57 | -------------------------------------------------------------------------------- /lib/layer_utils/proposal_top_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from model.config import cfg 12 | from model.bbox_transform import bbox_transform_inv, clip_boxes 13 | import numpy.random as npr 14 | 15 | 16 | def proposal_top_layer(rpn_cls_prob, 
rpn_bbox_pred, im_info, anchors, num_anchors): 17 | """A layer that just selects the top region proposals 18 | without using non-maximal suppression, 19 | For details please see the technical report 20 | """ 21 | rpn_top_n = cfg.TEST.RPN_TOP_N 22 | 23 | scores = rpn_cls_prob[:, :, :, num_anchors:] 24 | 25 | rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4)) 26 | scores = scores.reshape((-1, 1)) 27 | 28 | length = scores.shape[0] 29 | if length < rpn_top_n: 30 | # Random selection, maybe unnecessary and loses good proposals 31 | # But such case rarely happens 32 | top_inds = npr.choice(length, size=rpn_top_n, replace=True) 33 | else: 34 | top_inds = scores.argsort(0)[::-1] 35 | top_inds = top_inds[:rpn_top_n] 36 | top_inds = top_inds.reshape(rpn_top_n, ) 37 | 38 | # Do the selection here 39 | anchors = anchors[top_inds, :] 40 | rpn_bbox_pred = rpn_bbox_pred[top_inds, :] 41 | scores = scores[top_inds] 42 | 43 | # Convert anchors into proposals via bbox transformations 44 | proposals = bbox_transform_inv(anchors, rpn_bbox_pred) 45 | 46 | # Clip predicted boxes to image 47 | proposals = clip_boxes(proposals, im_info[:2]) 48 | 49 | # Output rois blob 50 | # Our RPN implementation only supports a single input image, so all 51 | # batch inds are 0 52 | batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) 53 | blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) 54 | return blob, scores 55 | -------------------------------------------------------------------------------- /lib/roi_data_layer/roidb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Transform a roidb into a trainable roidb by adding a bunch of metadata.""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import numpy as np 14 | from model.config import cfg 15 | from model.bbox_transform import bbox_transform 16 | from utils.cython_bbox import bbox_overlaps 17 | import PIL 18 | 19 | 20 | def prepare_roidb(imdb): 21 | """Enrich the imdb's roidb by adding some derived quantities that 22 | are useful for training. This function precomputes the maximum 23 | overlap, taken over ground-truth boxes, between each ROI and 24 | each ground-truth box. The class with maximum overlap is also 25 | recorded. 
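    Each roidb entry is also augmented with its image path, width, and height.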
26 | """ 27 | roidb = imdb.roidb 28 | sizes = [PIL.Image.open(imdb.image_path_at(i)).size 29 | for i in range(imdb.num_images)] 30 | 31 | for i in range(len(imdb.image_index)): 32 | roidb[i]['image'] = imdb.image_path_at(i) 33 | roidb[i]['width'] = sizes[i][0] 34 | roidb[i]['height'] = sizes[i][1] 35 | # need gt_overlaps as a dense array for argmax 36 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 37 | # max overlap with gt over classes (columns) 38 | max_overlaps = gt_overlaps.max(axis=1) 39 | # gt class that had the max overlap 40 | max_classes = gt_overlaps.argmax(axis=1) 41 | roidb[i]['max_classes'] = max_classes 42 | roidb[i]['max_overlaps'] = max_overlaps 43 | # sanity checks 44 | # max overlap of 0 => class should be zero (background) 45 | zero_inds = np.where(max_overlaps == 0)[0] 46 | assert all(max_classes[zero_inds] == 0) 47 | # max overlap > 0 => class should not be zero (must be a fg class) 48 | nonzero_inds = np.where(max_overlaps > 0)[0] 49 | assert all(max_classes[nonzero_inds] != 0) 50 | -------------------------------------------------------------------------------- /lib/text_connector/detectors.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import numpy as np 3 | 4 | from model.config import cfg 5 | from model.nms_wrapper import nms 6 | from .text_proposal_connector import TextProposalConnector 7 | from .text_proposal_connector_oriented import TextProposalConnector as TextProposalConnectorOriented 8 | from .text_connect_cfg import Config as TextLineCfg 9 | 10 | 11 | class TextDetector: 12 | def __init__(self, oriented): 13 | if oriented: 14 | print('Use TextProposalConnectorOriented') 15 | self.text_proposal_connector = TextProposalConnectorOriented() 16 | else: 17 | self.text_proposal_connector = TextProposalConnector() 18 | 19 | @staticmethod 20 | def pre_process(text_proposals, scores): 21 | keep_inds = np.where(scores > TextLineCfg.TEXT_PROPOSALS_MIN_SCORE)[0] 22 | text_proposals, scores = text_proposals[keep_inds], scores[keep_inds] 23 | 24 | # 按得分排序 25 | sorted_indices = np.argsort(scores.ravel())[::-1] 26 | text_proposals, scores = text_proposals[sorted_indices], scores[sorted_indices] 27 | 28 | # 对proposal做nms 29 | keep_inds = nms(np.hstack((text_proposals, scores)), TextLineCfg.TEXT_PROPOSALS_NMS_THRESH) 30 | text_proposals, scores = text_proposals[keep_inds], scores[keep_inds] 31 | 32 | return text_proposals, scores 33 | 34 | def detect(self, text_proposals, scores, size): 35 | text_proposals, scores = self.pre_process(text_proposals, scores) 36 | 37 | # 获取检测结果 38 | text_recs = self.text_proposal_connector.get_text_lines(text_proposals, scores, size) 39 | keep_inds = self.filter_boxes(text_recs) 40 | return text_recs[keep_inds] 41 | 42 | def filter_boxes(self, boxes): 43 | heights = np.zeros((len(boxes), 1), np.float) 44 | widths = np.zeros((len(boxes), 1), np.float) 45 | scores = np.zeros((len(boxes), 1), np.float) 46 | index = 0 47 | for box in boxes: 48 | heights[index] = (abs(box[5] - box[1]) + abs(box[7] - box[3])) / 2.0 + 1 49 | widths[index] = (abs(box[2] - box[0]) + abs(box[6] - box[4])) / 2.0 + 1 50 | scores[index] = box[8] 51 | index += 1 52 | 53 | return np.where((widths / heights > TextLineCfg.MIN_RATIO) & (scores > TextLineCfg.LINE_MIN_SCORE) & 54 | (widths > (TextLineCfg.TEXT_PROPOSALS_WIDTH * TextLineCfg.MIN_NUM_PROPOSALS)))[0] 55 | -------------------------------------------------------------------------------- /lib/nms/cpu_nms.pyx: 
-------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /tools/anchor_drawer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Visualization anchors on scaled image 3 | """ 4 | import _init_paths 5 | import os 6 | import argparse 7 | 8 | import numpy as np 9 | import cv2 10 | 11 | from layer_utils.generate_anchors import generate_anchors 12 | 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--img', default='') 17 | parser.add_argument('--scale', type=int, default=600) 18 | parser.add_argument('--max_scale', default=1200) 19 | args = parser.parse_args() 20 | 21 | if not os.path.exists(args.img): 22 | parser.error('Image not exist.') 23 | return args 24 | 25 | 26 | def draw_anchors(img, heights, width, start_center): 27 | anchors = [] 28 | center = start_center 29 | for height in heights: 30 | anchors.append(( 31 | center[0] - width // 2, 32 | center[1] - height // 2, 33 | center[0] + width // 2, 34 | center[1] + height // 2 35 | )) 36 | center = (center[0] + width, center[1]) 37 | 38 | for anchor in anchors: 39 | img = cv2.rectangle(img, (anchor[0], anchor[1]), (anchor[2], anchor[3]), color=(255, 0, 0)) 40 | return img 41 | 42 
| 43 | if __name__ == '__main__': 44 | args = parse_args() 45 | img = cv2.imread(args.img) 46 | im_size_min = min(img.shape) 47 | im_size_max = max(img.shape) 48 | 49 | im_scale = float(args.scale) / float(im_size_min) 50 | # Prevent the biggest axis from being more than MAX_SIZE 51 | if np.round(im_scale * im_size_max) > args.max_scale: 52 | im_scale = float(args.max_scale) / float(im_size_max) 53 | 54 | img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR) 55 | print("Scaled image size") 56 | print(img.shape) 57 | width = img.shape[1] 58 | height = img.shape[0] 59 | 60 | base_anchors = generate_anchors(base_height=11, 61 | num_anchors=10, 62 | anchor_width=16, 63 | h_ratio_step=0.7) 64 | 65 | heights = [x[3] - x[1] for x in base_anchors] 66 | 67 | img = draw_anchors(img, heights, 16, (width // 2, height // 2)) 68 | img = draw_anchors(img, heights, 16, (100, 150)) 69 | img = draw_anchors(img, heights, 16, (width - 300, 150)) 70 | img = draw_anchors(img, heights, 16, (100, height - 150)) 71 | img = draw_anchors(img, heights, 16, (width - 300, height - 150)) 72 | 73 | cv2.namedWindow('test') 74 | cv2.imshow('test', img) 75 | cv2.waitKey() 76 | -------------------------------------------------------------------------------- /lib/layer_utils/proposal_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Ross Girshick and Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from model.config import cfg 12 | from model.bbox_transform import bbox_transform_inv, clip_boxes 13 | from model.nms_wrapper import nms 14 | 15 | 16 | def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, anchors, num_anchors): 17 | """ 18 | A simplified version compared to fast/er RCNN 19 | For details please see the technical report 20 | :param 21 | rpn_cls_prob: (1, H, W, Ax2) softmax result of rpn scores 22 | rpn_bbox_pred: (1, H, W, Ax4) 1x1 conv result for rpn bbox 23 | """ 24 | if type(cfg_key) == bytes: 25 | cfg_key = cfg_key.decode('utf-8') 26 | pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N 27 | post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N 28 | nms_thresh = cfg[cfg_key].RPN_NMS_THRESH 29 | 30 | # Get the scores and bounding boxes for foreground (text) 31 | # The order in last dim is related to network.py: 32 | # self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob") 33 | # scores = rpn_cls_prob[:, :, :, num_anchors:] # old 34 | 35 | height, width = rpn_cls_prob.shape[1:3] # feature-map的高宽 36 | scores = np.reshape(np.reshape(rpn_cls_prob, [1, height, width, num_anchors, 2])[:, :, :, :, 1], 37 | [1, height, width, num_anchors]) 38 | 39 | rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4)) 40 | scores = scores.reshape((-1, 1)) 41 | proposals = bbox_transform_inv(anchors, rpn_bbox_pred) 42 | proposals = clip_boxes(proposals, im_info[:2]) 43 | 44 | # Pick the top region proposals 45 | order = scores.ravel().argsort()[::-1] 46 | if pre_nms_topN > 0: 47 | order = order[:pre_nms_topN] 48 | proposals = proposals[order, :] 49 | scores = scores[order] 50 | 51 | # Non-maximal suppression 52 | keep = nms(np.hstack((proposals, scores)), nms_thresh, not cfg.USE_GPU_NMS) 53 | 54 | # Pick 
the top region proposals after NMS 55 | if post_nms_topN > 0: 56 | keep = keep[:post_nms_topN] 57 | proposals = proposals[keep, :] 58 | scores = scores[keep] 59 | 60 | # Only support single image as input 61 | blob = np.hstack((scores.astype(np.float32, copy=False), proposals.astype(np.float32, copy=False))) 62 | return blob, scores 63 | -------------------------------------------------------------------------------- /lib/text_connector/text_proposal_connector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .other import clip_boxes 3 | from .text_proposal_graph_builder import TextProposalGraphBuilder 4 | 5 | 6 | class TextProposalConnector: 7 | def __init__(self): 8 | self.graph_builder = TextProposalGraphBuilder() 9 | 10 | def group_text_proposals(self, text_proposals, scores, im_size): 11 | graph = self.graph_builder.build_graph(text_proposals, scores, im_size) 12 | return graph.sub_graphs_connected() 13 | 14 | def fit_y(self, X, Y, x1, x2): 15 | assert len(X) != 0 16 | # if X only includes one point, the function returns the line y=Y[0] 17 | if np.sum(X == X[0]) == len(X): 18 | return Y[0], Y[0] 19 | p = np.poly1d(np.polyfit(X, Y, 1)) 20 | return p(x1), p(x2) 21 | 22 | def get_text_lines(self, text_proposals, scores, im_size): 23 | # tp=text proposal 24 | tp_groups = self.group_text_proposals(text_proposals, scores, im_size) 25 | text_lines = np.zeros((len(tp_groups), 5), np.float32) 26 | 27 | for index, tp_indices in enumerate(tp_groups): 28 | text_line_boxes = text_proposals[list(tp_indices)] 29 | 30 | x0 = np.min(text_line_boxes[:, 0]) 31 | x1 = np.max(text_line_boxes[:, 2]) 32 | 33 | offset = (text_line_boxes[0, 2] - text_line_boxes[0, 0]) * 0.5 34 | 35 | lt_y, rt_y = self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 1], x0 + offset, x1 - offset) 36 | lb_y, rb_y = self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 3], x0 + offset, x1 - offset) 37 | 38 | # the score of a text line is the average of the scores 39 | # of all text proposals contained in the text line 40 | score = scores[list(tp_indices)].sum() / float(len(tp_indices)) 41 | 42 | text_lines[index, 0] = x0 43 | text_lines[index, 1] = min(lt_y, rt_y) 44 | text_lines[index, 2] = x1 45 | text_lines[index, 3] = max(lb_y, rb_y) 46 | text_lines[index, 4] = score 47 | 48 | text_lines = clip_boxes(text_lines, im_size) 49 | 50 | text_recs = np.zeros((len(text_lines), 9), np.float) 51 | index = 0 52 | for line in text_lines: 53 | xmin, ymin, xmax, ymax = line[0], line[1], line[2], line[3] 54 | text_recs[index, 0] = xmin 55 | text_recs[index, 1] = ymin 56 | text_recs[index, 2] = xmax 57 | text_recs[index, 3] = ymin 58 | text_recs[index, 4] = xmin 59 | text_recs[index, 5] = ymax 60 | text_recs[index, 6] = xmax 61 | text_recs[index, 7] = ymax 62 | text_recs[index, 8] = line[4] 63 | index = index + 1 64 | 65 | return text_recs 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # tf_ctpn 2 | 3 | A TensorFlow implementation of [CTPN: 4 | Detecting Text in Natural Image with Connectionist Text Proposal Network](https://arxiv.org/abs/1609.03605). 
5 | 6 | Most of the code in this project is adapted from 7 | [CTPN](https://github.com/tianzhi0549/CTPN), [tf-faster-rcnn](https://github.com/endernewton/tf-faster-rcnn) and [text-detection-ctpn](https://github.com/eragonruan/text-detection-ctpn). 8 | 9 | Results of the pretrained models on ICDAR13: 10 | 11 | | Net | Dataset | Recall | Precision | Hmean | 12 | |-------|----------|---------|-------------|------------| 13 | | Original CTPN | ICDAR13 training data + ? | 73.72% | 92.77% | 82.15% | 14 | | vgg16 | MLT17 latin/chn + ICDAR13 training data | 74.26% | 82.46% | 78.15% | 15 | 16 | If you want an end-to-end OCR service, check this repo: https://github.com/Sanster/DeepOcrService 17 | 18 | # Setup 19 | Install dependencies: 20 | ``` 21 | pip3 install -r requirements.txt 22 | ``` 23 | 24 | Build the Cython extensions (needed for both the demo and training): 25 | ``` 26 | cd lib/ 27 | make clean 28 | make 29 | ``` 30 | 31 | # Quick start 32 | Download the pre-trained CTPN model (based on vgg16) from [google drive](https://drive.google.com/open?id=1f8YZCQxmhpXfwGM0KXjoxsoqujg5ruyC) and put it in `output/vgg16/voc_2007_trainval/default`. 33 | Then run: 34 | ``` 35 | python3 tools/demo.py 36 | ``` 37 | 38 | This model was trained on a 1080Ti for 80k iterations using commit `dc533e030e5431212c1d4dbca0bcd7e594a8a368`. 39 | 40 | 41 | # Training 42 | 1. Download the training dataset from [google drive](https://drive.google.com/open?id=1S9K9NKkA0RYlBswCfyUI0dv_fI4r5bcX). 43 | This dataset contains 3727 images from MLT17 (latin + chinese) and the ICDAR13 training set. 44 | Ground-truth anchors are generated from the `minAreaRect` of each text area; see [eragonruan/text-detection-ctpn#issues215](https://github.com/eragonruan/text-detection-ctpn/issues/215) for more details. You can use [tools/mlt17_to_voc.py](https://github.com/Sanster/tf_ctpn/blob/master/tools/mlt17_to_voc.py) to build your own training data. 45 | Put the downloaded data in `./data/VOCdevkit2007/VOC2007`. 46 | 47 | 1. Download the pre-trained slim vgg16 model from [here](https://github.com/tensorflow/models/tree/master/research/slim#pre-trained-models). 48 | Put the pretrained model in `./data/pretrained_model`. 49 | 50 | 1. Start training: 51 | ``` 52 | python3 tools/trainval_net.py 53 | ``` 54 | The output checkpoint files will be saved in `./output/vgg16/voc_2007_trainval/default`. 55 | 56 | 1. 
Start TensorBoard: 57 | ``` 58 | tensorboard --logdir=./tensorboard 59 | ``` 60 | 61 | # Test on ICDAR13 62 | ``` 63 | python3 tools/icdar.py --img_dir=path/to/ICDAR13/Challenge2_Test_Task12_Images/ -c=ICDAR13 64 | ``` 65 | 66 | After it finishes, a submit.zip file will be generated in `data/ICDAR_submit`; then run: 67 | 68 | ``` 69 | cd tools/ICDAR13 70 | # use python2 71 | python script.py -g=gt.zip -s=submit.zip 72 | ``` 73 | -------------------------------------------------------------------------------- /lib/nets/vgg16.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import tensorflow as tf 11 | import tensorflow.contrib.slim as slim 12 | from tensorflow.contrib.slim import losses 13 | from tensorflow.contrib.slim import arg_scope 14 | import numpy as np 15 | 16 | from nets.network import Network 17 | from model.config import cfg 18 | 19 | 20 | class vgg16(Network): 21 | def __init__(self): 22 | Network.__init__(self) 23 | self._feat_stride = [16, ] 24 | self._scope = 'vgg_16' 25 | 26 | def _image_to_head(self, is_training, reuse=None): 27 | with tf.variable_scope(self._scope, self._scope, reuse=reuse): 28 | net = slim.repeat(self._image, 2, slim.conv2d, 64, [3, 3], 29 | trainable=True, scope='conv1') 30 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool1') 31 | net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], 32 | trainable=True, scope='conv2') 33 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool2') 34 | net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], 35 | trainable=True, scope='conv3') 36 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool3') 37 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], 38 | trainable=True, scope='conv4') 39 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool4') 40 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], 41 | trainable=True, scope='conv5') 42 | 43 | self._act_summaries.append(net) 44 | self._layers['head'] = net 45 | 46 | return net 47 | 48 | def get_variables_to_restore(self, variables, var_keep_dic): 49 | variables_to_restore = [] 50 | 51 | for v in variables: 52 | # exclude the conv weights that are fc weights in vgg16 53 | if v.name == (self._scope + '/fc6/weights:0') or \ 54 | v.name == (self._scope + '/fc7/weights:0'): 55 | continue 56 | # exclude the first conv layer to swap RGB to BGR 57 | if v.name == (self._scope + '/conv1/conv1_1/weights:0'): 58 | self._variables_to_fix[v.name] = v 59 | continue 60 | if v.name.split(':')[0] in var_keep_dic: 61 | print('Variables restored: %s' % v.name) 62 | variables_to_restore.append(v) 63 | 64 | return variables_to_restore 65 | -------------------------------------------------------------------------------- /lib/roi_data_layer/minibatch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Xinlei Chen 6 | # -------------------------------------------------------- 7 | 8 | """Compute minibatch blobs 
for training a Fast R-CNN network.""" 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | import numpy as np 14 | import numpy.random as npr 15 | import cv2 16 | from model.config import cfg 17 | from utils.blob import prep_im_for_blob, im_list_to_blob 18 | 19 | from utils import helper 20 | 21 | 22 | def get_minibatch(roidb, num_classes): 23 | """Given a roidb, construct a minibatch sampled from it.""" 24 | num_images = len(roidb) 25 | # Sample random scales to use for each image in this batch 26 | random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES), size=num_images) 27 | 28 | # Get the input image blob, formatted for caffe 29 | im_blob, im_scales = _get_image_blob(roidb, random_scale_inds) 30 | 31 | blobs = {'data': im_blob} 32 | 33 | assert len(im_scales) == 1, "Single batch only" 34 | assert len(roidb) == 1, "Single batch only" 35 | 36 | # gt boxes: (x1, y1, x2, y2, cls) 37 | if cfg.TRAIN.USE_ALL_GT: 38 | # Include all ground truth boxes 39 | gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] 40 | else: 41 | # For the COCO ground truth boxes, exclude the ones that are ''iscrowd'' 42 | gt_inds = np.where(roidb[0]['gt_classes'] != 0 & np.all(roidb[0]['gt_overlaps'].toarray() > -1.0, axis=1))[0] 43 | gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32) 44 | gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0] 45 | gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] 46 | blobs['gt_boxes'] = gt_boxes 47 | blobs['im_info'] = np.array( 48 | [im_blob.shape[1], im_blob.shape[2], im_scales[0]], 49 | dtype=np.float32) 50 | 51 | return blobs 52 | 53 | 54 | def _get_image_blob(roidb, scale_inds): 55 | """Builds an input blob from the images in the roidb at the specified 56 | scales. 
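    Each image is read as RGB, flipped if the roidb entry says so, mean-subtracted, and resized so its shorter side matches the sampled cfg.TRAIN.SCALES entry (capped so the longer side does not exceed cfg.TRAIN.MAX_SIZE). The per-image resize factors are returned as im_scales.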
57 | """ 58 | num_images = len(roidb) 59 | processed_ims = [] 60 | im_scales = [] 61 | for i in range(num_images): 62 | im = helper.read_rgb_img(roidb[i]['image']) 63 | if roidb[i]['flipped']: 64 | im = im[:, ::-1, :] 65 | target_size = cfg.TRAIN.SCALES[scale_inds[i]] 66 | im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size, 67 | cfg.TRAIN.MAX_SIZE) 68 | im_scales.append(im_scale) 69 | processed_ims.append(im) 70 | 71 | # Create a blob to hold the input images 72 | blob = im_list_to_blob(processed_ims) 73 | 74 | return blob, im_scales 75 | -------------------------------------------------------------------------------- /lib/model/bbox_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | 13 | 14 | def bbox_transform(ex_rois, gt_rois): 15 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 16 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 17 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 18 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 19 | 20 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 21 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 22 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths 23 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights 24 | 25 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 26 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 27 | targets_dw = np.log(gt_widths / ex_widths) 28 | targets_dh = np.log(gt_heights / ex_heights) 29 | 30 | targets = np.vstack( 31 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() 32 | return targets 33 | 34 | 35 | def bbox_transform_inv(boxes, deltas): 36 | if boxes.shape[0] == 0: 37 | return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) 38 | 39 | boxes = boxes.astype(deltas.dtype, copy=False) 40 | widths = boxes[:, 2] - boxes[:, 0] + 1.0 41 | heights = boxes[:, 3] - boxes[:, 1] + 1.0 42 | ctr_x = boxes[:, 0] + 0.5 * widths 43 | ctr_y = boxes[:, 1] + 0.5 * heights 44 | 45 | # dx, dw are not used in CTPN 46 | dx = deltas[:, 0::4] 47 | dy = deltas[:, 1::4] 48 | dw = deltas[:, 2::4] 49 | dh = deltas[:, 3::4] 50 | 51 | pred_ctr_x = ctr_x[:, np.newaxis] 52 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] 53 | pred_w = widths[:, np.newaxis] 54 | pred_h = np.exp(dh) * heights[:, np.newaxis] 55 | 56 | pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) 57 | # x1 58 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 59 | # y1 60 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 61 | # x2 62 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w 63 | # y2 64 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h 65 | 66 | return pred_boxes 67 | 68 | 69 | def clip_boxes(boxes, im_shape): 70 | """ 71 | Clip boxes to image boundaries. 
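    Worked example: with im_shape = (480, 640), a box (-5, 10, 700, 300) becomes
    (0, 10, 639, 300), since x is limited to [0, 639] and y to [0, 479].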
72 | """ 73 | 74 | # x1 >= 0 75 | boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) 76 | # y1 >= 0 77 | boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) 78 | # x2 < im_shape[1] 79 | boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) 80 | # y2 < im_shape[0] 81 | boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) 82 | return boxes 83 | -------------------------------------------------------------------------------- /lib/setup_cpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | from os.path import join as pjoin 3 | import numpy as np 4 | from distutils.core import setup 5 | from distutils.extension import Extension 6 | from Cython.Distutils import build_ext 7 | 8 | def find_in_path(name, path): 9 | "Find a file in a search path" 10 | #adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 11 | for dir in path.split(os.pathsep): 12 | binpath = pjoin(dir, name) 13 | if os.path.exists(binpath): 14 | return os.path.abspath(binpath) 15 | return None 16 | 17 | # Obtain the numpy include directory. This logic works across numpy versions. 18 | try: 19 | numpy_include = np.get_include() 20 | except AttributeError: 21 | numpy_include = np.get_numpy_include() 22 | 23 | def customize_compiler_for_nvcc(self): 24 | """inject deep into distutils to customize how the dispatch 25 | to gcc/nvcc works. 26 | 27 | If you subclass UnixCCompiler, it's not trivial to get your subclass 28 | injected in, and still have the right customizations (i.e. 29 | distutils.sysconfig.customize_compiler) run on it. So instead of going 30 | the OO route, I have this. Note, it's kindof like a wierd functional 31 | subclassing going on.""" 32 | 33 | # tell the compiler it can processes .cu 34 | self.src_extensions.append('.cu') 35 | 36 | # save references to the default compiler_so and _comple methods 37 | default_compiler_so = self.compiler_so 38 | super = self._compile 39 | 40 | # now redefine the _compile method. This gets executed for each 41 | # object but distutils doesn't have the ability to change compilers 42 | # based on source extension: we add it. 
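    # In this CPU-only build no '.cu' sources are listed, so the redefined _compile
    # below always applies the 'gcc' entry of extra_compile_args declared in
    # ext_modules further down.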
43 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 44 | print(extra_postargs) 45 | postargs = extra_postargs['gcc'] 46 | 47 | super(obj, src, ext, cc_args, postargs, pp_opts) 48 | # reset the default compiler_so, which we might have changed for cuda 49 | self.compiler_so = default_compiler_so 50 | 51 | # inject our redefined _compile method into the class 52 | self._compile = _compile 53 | 54 | # run the customize_compiler 55 | class custom_build_ext(build_ext): 56 | def build_extensions(self): 57 | customize_compiler_for_nvcc(self.compiler) 58 | build_ext.build_extensions(self) 59 | 60 | ext_modules = [ 61 | Extension( 62 | "utils.cython_bbox", 63 | ["utils/bbox.pyx"], 64 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 65 | include_dirs = [numpy_include] 66 | ), 67 | Extension( 68 | "nms.cpu_nms", 69 | ["nms/cpu_nms.pyx"], 70 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 71 | include_dirs = [numpy_include] 72 | ) 73 | ] 74 | 75 | setup( 76 | name='tf_faster_rcnn', 77 | ext_modules=ext_modules, 78 | # inject our custom trigger 79 | cmdclass={'build_ext': custom_build_ext}, 80 | ) 81 | -------------------------------------------------------------------------------- /lib/nets/mobilenet_v2.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import tensorflow as tf 11 | import tensorflow.contrib.slim as slim 12 | from nets.mobilenet import conv_blocks as ops 13 | from nets.mobilenet import mobilenet as lib 14 | from nets.mobilenet import mobilenet_v2 as mobilenet_v2 15 | 16 | from nets.network import Network 17 | from model.config import cfg 18 | 19 | from nets.mobilenet.mobilenet_v2 import expand_input 20 | 21 | from nets.mobilenet.mobilenet_v2 import op 22 | 23 | CTPN_DEF = dict( 24 | defaults={ 25 | # Note: these parameters of batch norm affect the architecture 26 | # that's why they are here and not in training_scope. 
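        # Compared with the stock MobileNetV2 conv_defs, the spec below keeps the first
        # conv at stride 1, so its four stride-2 expanded_conv blocks give a total
        # feature stride of 16, matching self._feat_stride = [16, ] in MobileNetV2 below.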
27 | (slim.batch_norm,): {'center': True, 'scale': True}, 28 | (slim.conv2d, slim.fully_connected, slim.separable_conv2d): { 29 | 'normalizer_fn': slim.batch_norm, 'activation_fn': tf.nn.relu6 30 | }, 31 | (ops.expanded_conv,): { 32 | 'expansion_size': expand_input(6), 33 | 'split_expansion': 1, 34 | 'normalizer_fn': slim.batch_norm, 35 | 'residual': True 36 | }, 37 | (slim.conv2d, slim.separable_conv2d): {'padding': 'SAME'} 38 | }, 39 | spec=[ 40 | op(slim.conv2d, stride=1, num_outputs=32, kernel_size=[3, 3]), 41 | op(ops.expanded_conv, 42 | expansion_size=expand_input(1, divisible_by=1), 43 | num_outputs=16), 44 | op(ops.expanded_conv, stride=2, num_outputs=24), 45 | op(ops.expanded_conv, stride=1, num_outputs=24), 46 | op(ops.expanded_conv, stride=2, num_outputs=32), 47 | op(ops.expanded_conv, stride=1, num_outputs=32), 48 | op(ops.expanded_conv, stride=1, num_outputs=32), 49 | op(ops.expanded_conv, stride=2, num_outputs=64), 50 | op(ops.expanded_conv, stride=1, num_outputs=64), 51 | op(ops.expanded_conv, stride=1, num_outputs=64), 52 | op(ops.expanded_conv, stride=1, num_outputs=64), 53 | op(ops.expanded_conv, stride=1, num_outputs=96), 54 | op(ops.expanded_conv, stride=1, num_outputs=96), 55 | op(ops.expanded_conv, stride=1, num_outputs=96), 56 | op(ops.expanded_conv, stride=2, num_outputs=160), 57 | op(ops.expanded_conv, stride=1, num_outputs=160), 58 | op(ops.expanded_conv, stride=1, num_outputs=160), 59 | op(ops.expanded_conv, stride=1, num_outputs=320), 60 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1280) 61 | ], 62 | ) 63 | 64 | 65 | class MobileNetV2(Network): 66 | def __init__(self): 67 | Network.__init__(self) 68 | self._feat_stride = [16, ] 69 | self._scope = 'mobilenet_v2' 70 | 71 | def _image_to_head(self, is_training, reuse=None): 72 | with slim.arg_scope(mobilenet_v2.training_scope(is_training=is_training)): 73 | net, endpoints = mobilenet_v2.mobilenet_base(self._image, conv_defs=CTPN_DEF) 74 | 75 | self.variables_to_restore = slim.get_variables_to_restore() 76 | 77 | self._act_summaries.append(net) 78 | self._layers['head'] = net 79 | 80 | return net 81 | 82 | def get_variables_to_restore(self, variables, var_keep_dic): 83 | pass 84 | -------------------------------------------------------------------------------- /lib/model/test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import cv2 11 | import numpy as np 12 | 13 | try: 14 | import cPickle as pickle 15 | except ImportError: 16 | import pickle 17 | import os 18 | import math 19 | 20 | from utils.timer import Timer 21 | from utils.blob import im_list_to_blob 22 | 23 | from model.config import cfg, get_output_dir 24 | from model.bbox_transform import clip_boxes, bbox_transform_inv 25 | from model.nms_wrapper import nms 26 | 27 | 28 | def _get_image_blob(im): 29 | """Converts an image into a network input. 
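    Note: im_detect below asserts a single scale, so cfg.TEST.SCALES is expected
    to hold exactly one value; the pyramid then has a single level.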
30 | Arguments: 31 | im (ndarray): a color image in BGR order 32 | Returns: 33 | blob (ndarray): a data blob holding an image pyramid 34 | im_scale_factors (list): list of image scales (relative to im) used 35 | in the image pyramid 36 | """ 37 | im_orig = im.astype(np.float32, copy=True) 38 | im_orig -= cfg.PIXEL_MEANS 39 | 40 | im_shape = im_orig.shape 41 | im_size_min = np.min(im_shape[0:2]) 42 | im_size_max = np.max(im_shape[0:2]) 43 | 44 | processed_ims = [] 45 | im_scale_factors = [] 46 | 47 | for target_size in cfg.TEST.SCALES: 48 | im_scale = float(target_size) / float(im_size_min) 49 | # Prevent the biggest axis from being more than MAX_SIZE 50 | if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: 51 | im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) 52 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, 53 | interpolation=cv2.INTER_LINEAR) 54 | im_scale_factors.append(im_scale) 55 | processed_ims.append(im) 56 | 57 | # Create a blob to hold the input images 58 | blob = im_list_to_blob(processed_ims) 59 | 60 | return blob, np.array(im_scale_factors) 61 | 62 | 63 | def _get_blobs(im): 64 | """Convert an image and RoIs within that image into network inputs.""" 65 | blobs = {} 66 | blobs['data'], im_scale_factors = _get_image_blob(im) 67 | 68 | return blobs, im_scale_factors 69 | 70 | 71 | def _clip_boxes(boxes, im_shape): 72 | """Clip boxes to image boundaries.""" 73 | if len(boxes) == 0: 74 | return boxes 75 | # x1 >= 0 76 | boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0) 77 | # y1 >= 0 78 | boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0) 79 | # x2 < im_shape[1] 80 | boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1) 81 | # y2 < im_shape[0] 82 | boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1) 83 | return boxes 84 | 85 | 86 | def im_detect(sess, net, im): 87 | blobs, im_scales = _get_blobs(im) 88 | assert len(im_scales) == 1, "Only single-image batch implemented" 89 | 90 | resized_im_blob = blobs['data'] 91 | blobs['im_info'] = np.array([resized_im_blob.shape[1], resized_im_blob.shape[2], im_scales[0]], dtype=np.float32) 92 | 93 | rois = net.test_image(sess, blobs['data'], blobs['im_info']) 94 | 95 | boxes = rois[:, 1:5] 96 | boxes = _clip_boxes(boxes, resized_im_blob.shape[1:3]) 97 | 98 | scores = rois[:, 0] 99 | 100 | return scores, boxes, resized_im_blob.shape[1:3], im_scales[0] 101 | -------------------------------------------------------------------------------- /lib/roi_data_layer/layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Xinlei Chen 6 | # -------------------------------------------------------- 7 | 8 | """The data layer used during training to train a Fast R-CNN network. 9 | 10 | RoIDataLayer implements a Caffe Python layer. 
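Minimal usage sketch (names from this file):

    layer = RoIDataLayer(roidb, num_classes)
    blobs = layer.forward()  # dict with 'data', 'gt_boxes' and 'im_info'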
11 | """ 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | from model.config import cfg 17 | from roi_data_layer.minibatch import get_minibatch 18 | import numpy as np 19 | import time 20 | 21 | 22 | class RoIDataLayer(object): 23 | """Fast R-CNN data layer used for training.""" 24 | 25 | def __init__(self, roidb, num_classes, random=False): 26 | """Set the roidb to be used by this layer during training.""" 27 | self._roidb = roidb 28 | self._num_classes = num_classes 29 | # Also set a random flag 30 | self._random = random 31 | self._shuffle_roidb_inds() 32 | 33 | def _shuffle_roidb_inds(self): 34 | """Randomly permute the training roidb.""" 35 | # If the random flag is set, 36 | # then the database is shuffled according to system time 37 | # Useful for the validation set 38 | if self._random: 39 | st0 = np.random.get_state() 40 | millis = int(round(time.time() * 1000)) % 4294967295 41 | np.random.seed(millis) 42 | 43 | if cfg.TRAIN.ASPECT_GROUPING: 44 | widths = np.array([r['width'] for r in self._roidb]) 45 | heights = np.array([r['height'] for r in self._roidb]) 46 | horz = (widths >= heights) 47 | vert = np.logical_not(horz) 48 | horz_inds = np.where(horz)[0] 49 | vert_inds = np.where(vert)[0] 50 | inds = np.hstack(( 51 | np.random.permutation(horz_inds), 52 | np.random.permutation(vert_inds))) 53 | inds = np.reshape(inds, (-1, 2)) 54 | row_perm = np.random.permutation(np.arange(inds.shape[0])) 55 | inds = np.reshape(inds[row_perm, :], (-1,)) 56 | self._perm = inds 57 | else: 58 | self._perm = np.random.permutation(np.arange(len(self._roidb))) 59 | # Restore the random state 60 | if self._random: 61 | np.random.set_state(st0) 62 | 63 | self._cur = 0 64 | 65 | def _get_next_minibatch_inds(self): 66 | """Return the roidb indices for the next minibatch.""" 67 | 68 | if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb): 69 | self._shuffle_roidb_inds() 70 | 71 | db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] 72 | self._cur += cfg.TRAIN.IMS_PER_BATCH 73 | 74 | return db_inds 75 | 76 | def _get_next_minibatch(self): 77 | """Return the blobs to be used for the next minibatch. 78 | 79 | If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a 80 | separate process and made available through self._blob_queue. 
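        In this TensorFlow port the prefetch path is not used: blobs are computed
        synchronously by get_minibatch on each call.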
81 | """ 82 | db_inds = self._get_next_minibatch_inds() 83 | minibatch_db = [self._roidb[i] for i in db_inds] 84 | return get_minibatch(minibatch_db, self._num_classes) 85 | 86 | def forward(self): 87 | """Get blobs and copy them into this layer's top blob vector.""" 88 | blobs = self._get_next_minibatch() 89 | return blobs 90 | -------------------------------------------------------------------------------- /tools/convert_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from xml.dom.minidom import Document 4 | import numpy as np 5 | 6 | 7 | def build_voc_dirs(outdir): 8 | mkdir = lambda dir: os.makedirs(dir) if not os.path.exists(dir) else None 9 | mkdir(outdir) 10 | mkdir(os.path.join(outdir, 'Annotations')) 11 | mkdir(os.path.join(outdir, 'ImageSets')) 12 | mkdir(os.path.join(outdir, 'ImageSets', 'Layout')) 13 | mkdir(os.path.join(outdir, 'ImageSets', 'Main')) 14 | mkdir(os.path.join(outdir, 'ImageSets', 'Segmentation')) 15 | mkdir(os.path.join(outdir, 'JPEGImages')) 16 | mkdir(os.path.join(outdir, 'SegmentationClass')) 17 | mkdir(os.path.join(outdir, 'SegmentationObject')) 18 | return os.path.join(outdir, 'Annotations'), os.path.join(outdir, 'JPEGImages'), os.path.join(outdir, 'ImageSets', 19 | 'Main') 20 | 21 | 22 | def generate_xml(img_name, positions, img_size, database, cls='text'): 23 | """ 24 | :param positions: [(xmin, ymin, xmax, ymax)] 25 | """ 26 | doc = Document() 27 | 28 | def append_xml_node_attr(child, parent=None, text=None): 29 | ele = doc.createElement(child) 30 | if not text is None: 31 | text_node = doc.createTextNode(text) 32 | ele.appendChild(text_node) 33 | parent = doc if parent is None else parent 34 | parent.appendChild(ele) 35 | return ele 36 | 37 | # create header 38 | annotation = append_xml_node_attr('annotation') 39 | append_xml_node_attr('folder', parent=annotation, text='text') 40 | append_xml_node_attr('filename', parent=annotation, text=img_name) 41 | source = append_xml_node_attr('source', parent=annotation) 42 | append_xml_node_attr('database', parent=source, text=database) 43 | append_xml_node_attr('annotation', parent=source, text='text') 44 | append_xml_node_attr('image', parent=source, text='text') 45 | append_xml_node_attr('flickrid', parent=source, text='000000') 46 | owner = append_xml_node_attr('owner', parent=annotation) 47 | append_xml_node_attr('name', parent=owner, text='ms') 48 | size = append_xml_node_attr('size', annotation) 49 | append_xml_node_attr('width', size, str(img_size[1])) 50 | append_xml_node_attr('height', size, str(img_size[0])) 51 | append_xml_node_attr('depth', size, str(img_size[2])) 52 | append_xml_node_attr('segmented', parent=annotation, text='0') 53 | 54 | # create objects 55 | objs = [] 56 | for pos in positions: 57 | obj = append_xml_node_attr('object', parent=annotation) 58 | occlusion = int(0) 59 | x1, y1, x2, y2 = int(pos[0]), int(pos[1]), int(pos[2]), int(pos[3]) 60 | 61 | truncation = float(0) 62 | difficult = 0 63 | truncted = 0 if truncation < 0.5 else 1 64 | 65 | append_xml_node_attr('name', parent=obj, text=cls) 66 | append_xml_node_attr('pose', parent=obj, text='none') 67 | append_xml_node_attr('truncated', parent=obj, text=str(truncted)) 68 | append_xml_node_attr('difficult', parent=obj, text=str(int(difficult))) 69 | bb = append_xml_node_attr('bndbox', parent=obj) 70 | append_xml_node_attr('xmin', parent=bb, text=str(x1)) 71 | append_xml_node_attr('ymin', parent=bb, text=str(y1)) 72 | append_xml_node_attr('xmax', parent=bb, text=str(x2)) 
73 | append_xml_node_attr('ymax', parent=bb, text=str(y2)) 74 | 75 | o = {'class': cls, 'box': np.asarray([x1, y1, x2, y2], dtype=float), 76 | 'truncation': truncation, 'difficult': difficult, 'occlusion': occlusion} 77 | objs.append(o) 78 | 79 | return doc, objs 80 | -------------------------------------------------------------------------------- /lib/text_connector/text_proposal_graph_builder.py: -------------------------------------------------------------------------------- 1 | from .text_connect_cfg import Config as TextLineCfg 2 | from .other import Graph 3 | import numpy as np 4 | 5 | 6 | # noinspection PyAttributeOutsideInit 7 | class TextProposalGraphBuilder: 8 | """ 9 | Build Text proposals into a graph. 10 | """ 11 | 12 | def get_successions(self, index): 13 | box = self.text_proposals[index] 14 | results = [] 15 | for left in range(int(box[0]) + 1, min(int(box[0]) + TextLineCfg.MAX_HORIZONTAL_GAP + 1, self.im_size[1])): 16 | adj_box_indices = self.boxes_table[left] 17 | for adj_box_index in adj_box_indices: 18 | if self.meet_v_iou(adj_box_index, index): 19 | results.append(adj_box_index) 20 | if len(results) != 0: 21 | return results 22 | return results 23 | 24 | def get_precursors(self, index): 25 | box = self.text_proposals[index] 26 | results = [] 27 | for left in range(int(box[0]) - 1, max(int(box[0] - TextLineCfg.MAX_HORIZONTAL_GAP), 0) - 1, -1): 28 | adj_box_indices = self.boxes_table[left] 29 | for adj_box_index in adj_box_indices: 30 | if self.meet_v_iou(adj_box_index, index): 31 | results.append(adj_box_index) 32 | if len(results) != 0: 33 | return results 34 | return results 35 | 36 | def is_succession_node(self, index, succession_index): 37 | precursors = self.get_precursors(succession_index) 38 | if self.scores[index] >= np.max(self.scores[precursors]): 39 | return True 40 | return False 41 | 42 | def meet_v_iou(self, index1, index2): 43 | def overlaps_v(index1, index2): 44 | h1 = self.heights[index1] 45 | h2 = self.heights[index2] 46 | y0 = max(self.text_proposals[index2][1], self.text_proposals[index1][1]) 47 | y1 = min(self.text_proposals[index2][3], self.text_proposals[index1][3]) 48 | return max(0, y1 - y0 + 1) / min(h1, h2) 49 | 50 | def size_similarity(index1, index2): 51 | h1 = self.heights[index1] 52 | h2 = self.heights[index2] 53 | return min(h1, h2) / max(h1, h2) 54 | 55 | v_overlap = overlaps_v(index1, index2) 56 | size_sim = size_similarity(index1, index2) 57 | # print("v_overlap %f" % v_overlap) 58 | # print("size_sim %f" % size_sim) 59 | return v_overlap >= TextLineCfg.MIN_V_OVERLAPS and size_sim >= TextLineCfg.MIN_SIZE_SIM 60 | 61 | def build_graph(self, text_proposals, scores, im_size): 62 | self.text_proposals = text_proposals 63 | self.scores = scores 64 | self.im_size = im_size 65 | self.heights = text_proposals[:, 3] - text_proposals[:, 1] + 1 66 | 67 | boxes_table = [[] for _ in range(self.im_size[1])] 68 | for index, box in enumerate(text_proposals): 69 | boxes_table[int(box[0])].append(index) 70 | self.boxes_table = boxes_table 71 | 72 | graph = np.zeros((text_proposals.shape[0], text_proposals.shape[0]), np.bool) 73 | 74 | for index, box in enumerate(text_proposals): 75 | successions = self.get_successions(index) 76 | if len(successions) == 0: 77 | continue 78 | succession_index = successions[np.argmax(scores[successions])] 79 | if self.is_succession_node(index, succession_index): 80 | # NOTE: a box can have multiple successions(precursors) if multiple successions(precursors) 81 | # have equal scores. 
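                # graph[i, j] == True marks proposal j as the chosen succession of
                # proposal i; connected components of this boolean matrix are later
                # merged into text lines via Graph.sub_graphs_connected.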
82 | graph[index, succession_index] = True 83 | return Graph(graph) 84 | -------------------------------------------------------------------------------- /lib/nets/squeezenet.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.slim as slim 3 | 4 | from nets.network import Network 5 | 6 | 7 | class SqueezeNet(Network): 8 | def __init__(self): 9 | super().__init__() 10 | self._feat_stride = [16, ] 11 | self._scope = 'squeezenet' 12 | 13 | def _arg_scope(self, is_training, reuse=None): 14 | weight_decay = 0.0 15 | keep_probability = 1.0 16 | 17 | batch_norm_params = { 18 | 'is_training': is_training, 19 | # Decay for the moving averages. 20 | 'decay': 0.995, 21 | # epsilon to prevent 0s in variance. 22 | 'epsilon': 0.001 23 | } 24 | 25 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 26 | weights_initializer=slim.xavier_initializer_conv2d(uniform=True), 27 | weights_regularizer=slim.l2_regularizer(weight_decay), 28 | normalizer_fn=slim.batch_norm, 29 | normalizer_params=batch_norm_params): 30 | with tf.variable_scope(self._scope, self._scope, reuse=reuse): 31 | with slim.arg_scope([slim.batch_norm, slim.dropout], 32 | is_training=is_training) as sc: 33 | return sc 34 | 35 | def get_variables_to_restore(self, variables, var_keep_dic): 36 | pass 37 | 38 | def _image_to_head(self, is_training, reuse=None): 39 | with slim.arg_scope(self._arg_scope(is_training, reuse)): 40 | net = slim.conv2d(self._image, 96, [3, 3], stride=1, scope='conv1') 41 | net = slim.max_pool2d(net, [2, 2], stride=2, scope='maxpool1') 42 | net = self.fire_module(net, 16, 64, scope='fire2') 43 | net = self.fire_module(net, 16, 64, scope='fire3') 44 | net = self.fire_module(net, 32, 128, scope='fire4') 45 | net = slim.max_pool2d(net, [2, 2], stride=2, scope='maxpool4') 46 | net = self.fire_module(net, 32, 128, scope='fire5') 47 | net = self.fire_module(net, 48, 192, scope='fire6') 48 | net = self.fire_module(net, 48, 192, scope='fire7') 49 | net = self.fire_module(net, 64, 256, scope='fire8') 50 | net = slim.max_pool2d(net, [2, 2], stride=2, scope='maxpool8', padding='SAME') 51 | net = self.fire_module(net, 64, 256, scope='fire9') 52 | net = slim.max_pool2d(net, [2, 2], stride=2, scope='maxpool9', padding='SAME') 53 | net = self.fire_module(net, 64, 512, scope='fire10') 54 | 55 | self._act_summaries.append(net) 56 | self._layers['head'] = net 57 | 58 | return net 59 | 60 | def fire_module(self, inputs, 61 | squeeze_depth, 62 | expand_depth, 63 | reuse=None, 64 | scope=None, 65 | outputs_collections=None): 66 | with tf.variable_scope(scope, 'fire', [inputs], reuse=reuse): 67 | with slim.arg_scope([slim.conv2d, slim.max_pool2d], 68 | outputs_collections=None): 69 | net = self.squeeze(inputs, squeeze_depth) 70 | outputs = self.expand(net, expand_depth) 71 | return outputs 72 | 73 | def squeeze(self, inputs, num_outputs): 74 | return slim.conv2d(inputs, num_outputs, [1, 1], stride=1, scope='squeeze') 75 | 76 | def expand(self, inputs, num_outputs): 77 | with tf.variable_scope('expand'): 78 | e1x1 = slim.conv2d(inputs, num_outputs, [1, 1], stride=1, scope='1x1') 79 | e3x3 = slim.conv2d(inputs, num_outputs, [3, 3], scope='3x3') 80 | return tf.concat([e1x1, e3x3], 3) 81 | -------------------------------------------------------------------------------- /tools/freeze_graph.py: -------------------------------------------------------------------------------- 1 | """ 2 | bazel-bin/tensorflow/tools/graph_transforms/transform_graph \ 3 | 
--in_graph=/home/cwq/code/tf_ctpn/model/ctpn.pb \ 4 | --out_graph=/home/cwq/code/tf_ctpn/model/ctpn_optimized.pb \ 5 | --inputs='input' \ 6 | --outputs='vgg_16_1/rpn_bbox_pred/Conv2D,vgg_16_1/rpn_cls_score_reshape' \ 7 | --transforms=' 8 | strip_unused_nodes(name=inputs, type_for_name=float, shape_for_name="1,-1,-1,3") 9 | remove_nodes(op=Identity, op=CheckNumerics) 10 | fold_constants(ignore_errors=true) 11 | fold_batch_norms 12 | fold_old_batch_norms' 13 | """ 14 | 15 | import tensorflow as tf 16 | import argparse 17 | import os 18 | import sys 19 | 20 | 21 | def main(args): 22 | with tf.Graph().as_default(): 23 | with tf.Session() as sess: 24 | # Load the model metagraph and checkpoint 25 | print('Model directory: %s' % args.ckpt_dir) 26 | meta_file, ckpt_file = get_model_filenames(args.ckpt_dir) 27 | 28 | print('Metagraph file: %s' % meta_file) 29 | print('Checkpoint file: %s' % ckpt_file) 30 | 31 | saver = tf.train.import_meta_graph(meta_file, clear_devices=True) 32 | tf.get_default_session().run(tf.global_variables_initializer()) 33 | tf.get_default_session().run(tf.local_variables_initializer()) 34 | saver.restore(sess, ckpt_file) 35 | 36 | input_graph_def = tf.get_default_graph().as_graph_def() 37 | 38 | for node in input_graph_def.node: 39 | if node.name == "vgg_16_1/rpn_bbox_pred/Conv2D": 40 | node.name = "RPN/rpn_bbox_pred/Conv2D" 41 | 42 | if node.name == "vgg_16_1/rpn_cls_score_reshape": 43 | node.name = "RPN/rpn_cls_score_reshape" 44 | 45 | output_node_names = ['RPN/rpn_bbox_pred/Conv2D', 'RPN/rpn_cls_prob_reshape'] 46 | 47 | # We use a built-in TF helper to export variables to constants 48 | output_graph_def = tf.graph_util.convert_variables_to_constants( 49 | sess, # The session is used to retrieve the weights 50 | input_graph_def, # The graph_def is used to retrieve the nodes 51 | output_node_names # The output node names are used to select the usefull nodes 52 | ) 53 | 54 | # Serialize and dump the output graph to the filesystem 55 | with tf.gfile.GFile(args.output_file, 'wb') as f: 56 | f.write(output_graph_def.SerializeToString()) 57 | pb_file_size = f.size() / 1024. / 1024. 
58 | print("%d ops in the final graph: %s, size: %d mb" % 59 | (len(output_graph_def.node), args.output_file, pb_file_size)) 60 | 61 | 62 | def get_model_filenames(model_dir): 63 | ckpt = tf.train.get_checkpoint_state(model_dir) 64 | if ckpt and ckpt.model_checkpoint_path: 65 | ckpt_file_basename = os.path.basename(ckpt.model_checkpoint_path) 66 | meta_file = os.path.join(model_dir, ckpt_file_basename + '.meta') 67 | return meta_file, ckpt.model_checkpoint_path 68 | 69 | 70 | if __name__ == '__main__': 71 | parser = argparse.ArgumentParser() 72 | 73 | parser.add_argument('--ckpt_dir', type=str, default='./checkpoint/crnn', 74 | help='Directory containing the metagraph (.meta) file and the checkpoint (ckpt) file containing model parameters') 75 | 76 | parser.add_argument('--output_file', type=str, default='./model/ctpn.pb', 77 | help='Filename for the exported graphdef protobuf (.pb)') 78 | 79 | args, _ = parser.parse_known_args() 80 | 81 | output_dir = os.path.dirname(args.output_file) 82 | if not os.path.exists(output_dir): 83 | os.makedirs(output_dir) 84 | 85 | main(args) 86 | -------------------------------------------------------------------------------- /lib/utils/visualization.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from six.moves import range 12 | import PIL.Image as Image 13 | import PIL.ImageColor as ImageColor 14 | import PIL.ImageDraw as ImageDraw 15 | import PIL.ImageFont as ImageFont 16 | 17 | STANDARD_COLORS = [ 18 | 'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque', 19 | 'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite', 20 | 'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan', 21 | 'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange', 22 | 'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet', 23 | 'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite', 24 | 'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod', 25 | 'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki', 26 | 'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue', 27 | 'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey', 28 | 'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue', 29 | 'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime', 30 | 'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid', 31 | 'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen', 32 | 'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin', 33 | 'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed', 34 | 'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed', 35 | 'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple', 36 | 'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown', 37 | 'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue', 38 | 'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow', 39 | 'Teal', 
'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White', 40 | 'WhiteSmoke', 'Yellow', 'YellowGreen' 41 | ] 42 | 43 | NUM_COLORS = len(STANDARD_COLORS) 44 | 45 | try: 46 | FONT = ImageFont.truetype('arial.ttf', 24) 47 | except IOError: 48 | FONT = ImageFont.load_default() 49 | 50 | def _draw_single_box(image, xmin, ymin, xmax, ymax, display_str, font, color='black', thickness=4): 51 | draw = ImageDraw.Draw(image) 52 | (left, right, top, bottom) = (xmin, xmax, ymin, ymax) 53 | draw.line([(left, top), (left, bottom), (right, bottom), 54 | (right, top), (left, top)], width=thickness, fill=color) 55 | text_bottom = bottom 56 | # Reverse list and print from bottom to top. 57 | text_width, text_height = font.getsize(display_str) 58 | 59 | margin = np.ceil(0.05 * text_height) 60 | draw.rectangle( 61 | [(left, text_bottom - text_height - 2 * margin), (left + text_width, 62 | text_bottom)], 63 | fill=color) 64 | # draw.text( 65 | # (left + margin, text_bottom - text_height - margin), 66 | # display_str, 67 | # fill='black', 68 | # font=font) 69 | 70 | return image 71 | 72 | def draw_bounding_boxes(image, gt_boxes, im_info): 73 | num_boxes = gt_boxes.shape[0] 74 | gt_boxes_new = gt_boxes.copy() 75 | gt_boxes_new[:,:4] = np.round(gt_boxes_new[:,:4].copy() / im_info[2]) 76 | disp_image = Image.fromarray(np.uint8(image[0])) 77 | 78 | for i in range(num_boxes): 79 | this_class = int(gt_boxes_new[i, 4]) 80 | disp_image = _draw_single_box(disp_image, 81 | gt_boxes_new[i, 0], 82 | gt_boxes_new[i, 1], 83 | gt_boxes_new[i, 2], 84 | gt_boxes_new[i, 3], 85 | 'N%02d-C%02d' % (i, this_class), 86 | FONT, 87 | color=STANDARD_COLORS[this_class % NUM_COLORS]) 88 | 89 | image[0, :] = np.array(disp_image) 90 | return image 91 | -------------------------------------------------------------------------------- /lib/text_connector/text_proposal_connector_oriented.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import numpy as np 3 | from .text_proposal_graph_builder import TextProposalGraphBuilder 4 | 5 | 6 | class TextProposalConnector: 7 | """ 8 | Connect text proposals into text lines 9 | """ 10 | 11 | def __init__(self): 12 | self.graph_builder = TextProposalGraphBuilder() 13 | 14 | def group_text_proposals(self, text_proposals, scores, im_size): 15 | graph = self.graph_builder.build_graph(text_proposals, scores, im_size) 16 | return graph.sub_graphs_connected() 17 | 18 | def fit_y(self, X, Y, x1, x2): 19 | len(X) != 0 20 | # if X only include one point, the function will get line y=Y[0] 21 | if np.sum(X == X[0]) == len(X): 22 | return Y[0], Y[0] 23 | p = np.poly1d(np.polyfit(X, Y, 1)) 24 | return p(x1), p(x2) 25 | 26 | def get_text_lines(self, text_proposals, scores, im_size): 27 | """ 28 | text_proposals:boxes 29 | 30 | """ 31 | # tp=text proposal 32 | tp_groups = self.group_text_proposals(text_proposals, scores, im_size) # 首先还是建图,获取到文本行由哪几个小框构成 33 | 34 | text_lines = np.zeros((len(tp_groups), 8), np.float32) 35 | 36 | for index, tp_indices in enumerate(tp_groups): 37 | text_line_boxes = text_proposals[list(tp_indices)] # 每个文本行的全部小框 38 | X = (text_line_boxes[:, 0] + text_line_boxes[:, 2]) / 2 # 求每一个小框的中心x,y坐标 39 | Y = (text_line_boxes[:, 1] + text_line_boxes[:, 3]) / 2 40 | 41 | z1 = np.polyfit(X, Y, 1) # 多项式拟合,根据之前求的中心店拟合一条直线(最小二乘) 42 | 43 | x0 = np.min(text_line_boxes[:, 0]) # 文本行x坐标最小值 44 | x1 = np.max(text_line_boxes[:, 2]) # 文本行x坐标最大值 45 | 46 | offset = (text_line_boxes[0, 2] - text_line_boxes[0, 0]) * 0.5 # 小框宽度的一半 47 | 48 | # 
以全部小框的左上角这个点去拟合一条直线,然后计算一下文本行x坐标的极左极右对应的y坐标 49 | lt_y, rt_y = self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 1], x0 + offset, x1 - offset) 50 | # 以全部小框的左下角这个点去拟合一条直线,然后计算一下文本行x坐标的极左极右对应的y坐标 51 | lb_y, rb_y = self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 3], x0 + offset, x1 - offset) 52 | 53 | score = scores[list(tp_indices)].sum() / float(len(tp_indices)) # 求全部小框得分的均值作为文本行的均值 54 | 55 | text_lines[index, 0] = x0 56 | text_lines[index, 1] = min(lt_y, rt_y) # 文本行上端 线段 的y坐标的小值 57 | text_lines[index, 2] = x1 58 | text_lines[index, 3] = max(lb_y, rb_y) # 文本行下端 线段 的y坐标的大值 59 | text_lines[index, 4] = score # 文本行得分 60 | text_lines[index, 5] = z1[0] # 根据中心点拟合的直线的k,b 61 | text_lines[index, 6] = z1[1] 62 | height = np.mean((text_line_boxes[:, 3] - text_line_boxes[:, 1])) # 小框平均高度 63 | text_lines[index, 7] = height + 2.5 64 | 65 | text_recs = np.zeros((len(text_lines), 9), np.float) 66 | index = 0 67 | for line in text_lines: 68 | b1 = line[6] - line[7] / 2 # 根据高度和文本行中心线,求取文本行上下两条线的b值 69 | b2 = line[6] + line[7] / 2 70 | x1 = line[0] 71 | y1 = line[5] * line[0] + b1 # 左上 72 | x2 = line[2] 73 | y2 = line[5] * line[2] + b1 # 右上 74 | x3 = line[0] 75 | y3 = line[5] * line[0] + b2 # 左下 76 | x4 = line[2] 77 | y4 = line[5] * line[2] + b2 # 右下 78 | disX = x2 - x1 79 | disY = y2 - y1 80 | width = np.sqrt(disX * disX + disY * disY) # 文本行宽度 81 | 82 | fTmp0 = y3 - y1 # 文本行高度 83 | fTmp1 = fTmp0 * disY / width 84 | x = np.fabs(fTmp1 * disX / width) # 做补偿 85 | y = np.fabs(fTmp1 * disY / width) 86 | if line[5] < 0: 87 | x1 -= x 88 | y1 += y 89 | x4 += x 90 | y4 -= y 91 | else: 92 | x2 += x 93 | y2 += y 94 | x3 -= x 95 | y3 -= y 96 | text_recs[index, 0] = x1 97 | text_recs[index, 1] = y1 98 | text_recs[index, 2] = x2 99 | text_recs[index, 3] = y2 100 | text_recs[index, 4] = x3 101 | text_recs[index, 5] = y3 102 | text_recs[index, 6] = x4 103 | text_recs[index, 7] = y4 104 | text_recs[index, 8] = line[4] 105 | index = index + 1 106 | 107 | return text_recs 108 | -------------------------------------------------------------------------------- /lib/layer_utils/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | 13 | 14 | def generate_anchors(base_height=11, num_anchors=10, anchor_width=16, h_ratio_step=0.7): 15 | """ 16 | Generate anchor windows template by using different hight start from base_size 17 | According to the ctpn paper, anchor's width is always 16 pixels 18 | 19 | Anchor heights in ctpn sorce code: [11, 16, 23, 33, 48, 68, 97, 139, 198, 283] 20 | """ 21 | base_anchor = np.array([1, 1, anchor_width, anchor_width]) - 1 22 | h_ratios = h_ratio_step ** np.arange(0, num_anchors) 23 | 24 | w, h, x_ctr, y_ctr = _whctrs(base_anchor) 25 | ws = np.array([16 for _ in range(num_anchors)]) 26 | 27 | hs = np.ceil(base_height / h_ratios) 28 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 29 | return anchors 30 | 31 | 32 | def generate_anchors_pre(height, width, feat_stride, num_anchors=10, anchor_width=16, anchor_h_ratio_step=0.7): 33 | """ 34 | A wrapper function to generate anchors given by different height scale 35 | :arg 36 | 
height/width: height/width of last shared cnn layer feature map 37 | feat_stride: total stride until the last shared cnn layer 38 | 39 | :returns 40 | anchors: anchors on input image 41 | length: The total number of anchors 42 | """ 43 | # print("width: %d, height: %d" %(width,height)) 44 | anchors = generate_anchors(num_anchors=num_anchors, h_ratio_step=anchor_h_ratio_step, anchor_width=anchor_width) 45 | A = anchors.shape[0] 46 | shift_x = np.arange(0, width) * feat_stride 47 | shift_y = np.arange(0, height) * feat_stride 48 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 49 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose() 50 | K = shifts.shape[0] 51 | # width changes faster, so here it is H, W, C 52 | anchors = anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2)) 53 | anchors = anchors.reshape((K * A, 4)).astype(np.float32, copy=False) 54 | length = np.int32(anchors.shape[0]) 55 | 56 | return anchors, length 57 | 58 | 59 | def _whctrs(anchor): 60 | """ 61 | Return width, height, x center, and y center for an anchor (window). 62 | """ 63 | 64 | w = anchor[2] - anchor[0] + 1 65 | h = anchor[3] - anchor[1] + 1 66 | x_ctr = anchor[0] + 0.5 * (w - 1) 67 | y_ctr = anchor[1] + 0.5 * (h - 1) 68 | return w, h, x_ctr, y_ctr 69 | 70 | 71 | def _mkanchors(ws, hs, x_ctr, y_ctr): 72 | """ 73 | Given a vector of widths (ws) and heights (hs) around a center 74 | (x_ctr, y_ctr), output a set of anchors (windows). 75 | """ 76 | 77 | ws = ws[:, np.newaxis] 78 | hs = hs[:, np.newaxis] 79 | anchors = np.hstack((x_ctr - ws / 2, 80 | y_ctr - hs / 2, 81 | x_ctr + ws / 2, 82 | y_ctr + hs / 2)).astype(np.int32) 83 | return anchors 84 | 85 | 86 | def _ratio_enum(anchor, ratios): 87 | """ 88 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 89 | """ 90 | 91 | w, h, x_ctr, y_ctr = _whctrs(anchor) 92 | size = w * h 93 | size_ratios = size / ratios 94 | ws = np.round(np.sqrt(size_ratios)) 95 | hs = np.round(ws * ratios) 96 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 97 | return anchors 98 | 99 | 100 | def _scale_enum(anchor, scales): 101 | """ 102 | Enumerate a set of anchors for each scale wrt an anchor. 
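    Kept from the original Faster R-CNN code; CTPN's generate_anchors above fixes
    the anchor width at 16 and varies only the height, so this helper is not used
    by the anchor generation in this file.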
103 | """ 104 | 105 | w, h, x_ctr, y_ctr = _whctrs(anchor) 106 | ws = w * scales 107 | hs = h * scales 108 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 109 | return anchors 110 | 111 | 112 | if __name__ == '__main__': 113 | import time 114 | 115 | t = time.time() 116 | anchors = generate_anchors(base_height=11, num_anchors=6, anchor_width=16, h_ratio_step=0.7) 117 | print(anchors) 118 | for anchor in anchors: 119 | print(anchor[3] - anchor[1]) 120 | 121 | # c, length = generate_anchors_pre(47, 37, 16, 6) 122 | # print(c) 123 | # print(length) 124 | -------------------------------------------------------------------------------- /tools/icdar13_split_label.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import math 4 | import cv2 as cv 5 | 6 | # path = '/media/D/code/OCR/text-detection-ctpn/data/mlt_english+chinese/image' 7 | # gt_path = '/media/D/code/OCR/text-detection-ctpn/data/mlt_english+chinese/label' 8 | 9 | path = '/home/cwq/data/ICDAR13/Challenge2_Training_Task12_Images' 10 | gt_path = '/home/cwq/data/ICDAR13/Challenge2_Training_Task1_GT' 11 | 12 | out_path = '/home/cwq/data/ICDAR13/Challenge2_Training_Task12_Images_splited' 13 | label_out_path = '/home/cwq/data/ICDAR13/Challenge2_Training_Task1_GT_splited' 14 | if not os.path.exists(out_path): 15 | os.makedirs(out_path) 16 | files = os.listdir(path) 17 | files.sort() 18 | # files=files[:100] 19 | for file in files: 20 | _, basename = os.path.split(file) 21 | if basename.lower().split('.')[-1] not in ['jpg', 'png']: 22 | continue 23 | stem, ext = os.path.splitext(basename) 24 | gt_file = os.path.join(gt_path, 'gt_' + stem + '.txt') 25 | img_path = os.path.join(path, file) 26 | print(img_path) 27 | img = cv.imread(img_path) 28 | img_size = img.shape 29 | im_size_min = np.min(img_size[0:2]) 30 | im_size_max = np.max(img_size[0:2]) 31 | 32 | im_scale = float(600) / float(im_size_min) 33 | if np.round(im_scale * im_size_max) > 1200: 34 | im_scale = float(1200) / float(im_size_max) 35 | re_im = cv.resize(img, None, None, fx=im_scale, fy=im_scale, interpolation=cv.INTER_AREA) 36 | re_size = re_im.shape 37 | cv.imwrite(os.path.join(out_path, stem) + '.jpg', re_im) 38 | 39 | with open(gt_file, 'r') as f: 40 | lines = f.readlines() 41 | for line in lines: 42 | splitted_line = line.strip().lower().split(' ') 43 | splitted_line = [int(n) for n in splitted_line[:-1]] 44 | 45 | xmin = int(splitted_line[0] * im_scale) 46 | ymin = int(splitted_line[1] * im_scale) 47 | xmax = int(splitted_line[2] * im_scale) 48 | ymax = int(splitted_line[3] * im_scale) 49 | 50 | if xmin < 0: 51 | xmin = 0 52 | if xmax > re_size[1] - 1: 53 | xmax = re_size[1] - 1 54 | if ymin < 0: 55 | ymin = 0 56 | if ymax > re_size[0] - 1: 57 | ymax = re_size[0] - 1 58 | 59 | width = xmax - xmin 60 | height = ymax - ymin 61 | 62 | # 将完整的文字区域切分为宽度为 16 的小区域 63 | step = 16.0 64 | xmins = [] 65 | 66 | anchor_count = int(math.ceil(width / step)) 67 | for i in range(anchor_count): 68 | xmins.append(i * int(step) + xmin) 69 | 70 | if not os.path.exists(label_out_path): 71 | os.makedirs(label_out_path) 72 | 73 | with open(os.path.join(label_out_path, "gt_" + stem) + '.txt', 'a') as f: 74 | for i in range(len(xmins)): 75 | f.writelines(str(xmins[i])) 76 | f.writelines(" ") 77 | f.writelines(str(int(ymin))) 78 | f.writelines(" ") 79 | # anchor box 的宽度为 16, 80 | f.writelines(str(int(xmins[i] + step - 1))) 81 | f.writelines(" ") 82 | f.writelines(str(int(ymax))) 83 | f.writelines("\n") 84 | 85 | # reimplement 86 | # 
step = 16.0 87 | # x_left = [] 88 | # x_right = [] 89 | # x_left.append(xmin) 90 | # x_left_start = int(math.ceil(xmin / 16.0) * 16.0) 91 | # if x_left_start == xmin: 92 | # x_left_start = xmin + 16 93 | # for i in np.arange(x_left_start, xmax, 16): 94 | # x_left.append(i) 95 | # x_left = np.array(x_left) 96 | # 97 | # x_right.append(x_left_start - 1) 98 | # for i in range(1, len(x_left) - 1): 99 | # x_right.append(x_left[i] + 15) 100 | # x_right.append(xmax) 101 | # x_right = np.array(x_right) 102 | # 103 | # idx = np.where(x_left == x_right) 104 | # x_left = np.delete(x_left, idx, axis=0) 105 | # x_right = np.delete(x_right, idx, axis=0) 106 | # 107 | # if not os.path.exists(label_out_path): 108 | # os.makedirs(label_out_path) 109 | # 110 | # with open(os.path.join(label_out_path, "gt_" + stem) + '.txt', 'a') as f: 111 | # for i in range(len(x_left)): 112 | # f.writelines(str(int(x_left[i]))) 113 | # f.writelines(" ") 114 | # f.writelines(str(int(ymin))) 115 | # f.writelines(" ") 116 | # f.writelines(str(int(x_right[i]))) 117 | # f.writelines(" ") 118 | # f.writelines(str(int(ymax))) 119 | # f.writelines("\n") 120 | -------------------------------------------------------------------------------- /tools/icdar13_to_voc.py: -------------------------------------------------------------------------------- 1 | from xml.dom.minidom import Document 2 | import cv2 3 | import os 4 | import glob 5 | import shutil 6 | import numpy as np 7 | 8 | from tools.convert_utils import build_voc_dirs 9 | 10 | 11 | def generate_xml(img_name, lines, img_size, class_sets): 12 | doc = Document() 13 | 14 | def append_xml_node_attr(child, parent=None, text=None): 15 | ele = doc.createElement(child) 16 | if not text is None: 17 | text_node = doc.createTextNode(text) 18 | ele.appendChild(text_node) 19 | parent = doc if parent is None else parent 20 | parent.appendChild(ele) 21 | return ele 22 | 23 | cls = 'text' 24 | 25 | # create header 26 | annotation = append_xml_node_attr('annotation') 27 | append_xml_node_attr('folder', parent=annotation, text='text') 28 | append_xml_node_attr('filename', parent=annotation, text=img_name) 29 | source = append_xml_node_attr('source', parent=annotation) 30 | append_xml_node_attr('database', parent=source, text='coco_text_database') 31 | append_xml_node_attr('annotation', parent=source, text='text') 32 | append_xml_node_attr('image', parent=source, text='text') 33 | append_xml_node_attr('flickrid', parent=source, text='000000') 34 | owner = append_xml_node_attr('owner', parent=annotation) 35 | append_xml_node_attr('name', parent=owner, text='ms') 36 | size = append_xml_node_attr('size', annotation) 37 | append_xml_node_attr('width', size, str(img_size[1])) 38 | append_xml_node_attr('height', size, str(img_size[0])) 39 | append_xml_node_attr('depth', size, str(img_size[2])) 40 | append_xml_node_attr('segmented', parent=annotation, text='0') 41 | 42 | # create objects 43 | objs = [] 44 | for line in lines: 45 | splitted_line = line.strip().lower().split() 46 | 47 | obj = append_xml_node_attr('object', parent=annotation) 48 | occlusion = int(0) 49 | x1, y1, x2, y2 = int(float(splitted_line[0])), int(float(splitted_line[1])), \ 50 | int(float(splitted_line[2])), int(float(splitted_line[3])) 51 | 52 | truncation = float(0) 53 | difficult = 0 54 | truncted = 0 if truncation < 0.5 else 1 55 | 56 | append_xml_node_attr('name', parent=obj, text=cls) 57 | append_xml_node_attr('pose', parent=obj, text='none') 58 | append_xml_node_attr('truncated', parent=obj, text=str(truncted)) 59 | 
append_xml_node_attr('difficult', parent=obj, text=str(int(difficult))) 60 | bb = append_xml_node_attr('bndbox', parent=obj) 61 | append_xml_node_attr('xmin', parent=bb, text=str(x1)) 62 | append_xml_node_attr('ymin', parent=bb, text=str(y1)) 63 | append_xml_node_attr('xmax', parent=bb, text=str(x2)) 64 | append_xml_node_attr('ymax', parent=bb, text=str(y2)) 65 | 66 | o = {'class': cls, 'box': np.asarray([x1, y1, x2, y2], dtype=float), 67 | 'truncation': truncation, 'difficult': difficult, 'occlusion': occlusion} 68 | objs.append(o) 69 | 70 | return doc, objs 71 | 72 | 73 | def _is_hard(cls, truncation, occlusion, x1, y1, x2, y2): 74 | hard = False 75 | if y2 - y1 < 25 and occlusion >= 2: 76 | hard = True 77 | return hard 78 | if occlusion >= 3: 79 | hard = True 80 | return hard 81 | if truncation > 0.8: 82 | hard = True 83 | return hard 84 | return hard 85 | 86 | 87 | if __name__ == '__main__': 88 | outdir = '/home/cwq/data/ICDAR13/icdar13_voc' 89 | dest_label_dir, dest_img_dir, dest_set_dir = build_voc_dirs(outdir) 90 | 91 | for dset in ['train']: 92 | _labeldir = '/home/cwq/data/ICDAR13/Challenge2_Training_Task1_GT_splited' 93 | _imagedir = '/home/cwq/data/ICDAR13/Challenge2_Training_Task12_Images_splited' 94 | class_sets = ('text', 'dontcare') 95 | class_sets_dict = dict((k, i) for i, k in enumerate(class_sets)) 96 | fs = [open(os.path.join(dest_set_dir, cls + '_' + dset + '.txt'), 'w') for cls in class_sets] 97 | ftrain = open(os.path.join(dest_set_dir, dset + '.txt'), 'w') 98 | 99 | files = glob.glob(os.path.join(_labeldir, '*.txt')) 100 | files.sort() 101 | for file in files: 102 | path, basename = os.path.split(file) 103 | stem, ext = os.path.splitext(basename) 104 | img_id = stem.split('_')[1] 105 | img_name = img_id + '.jpg' 106 | stem = "icdar13_" + img_id 107 | with open(file, 'r') as f: 108 | lines = f.readlines() 109 | img_file = os.path.join(_imagedir, img_name) 110 | 111 | print(img_file) 112 | img = cv2.imread(img_file) 113 | img_size = img.shape 114 | 115 | save_img_name = "icdar13_" + img_name 116 | doc, objs = generate_xml(save_img_name, lines, img_size, class_sets=class_sets) 117 | 118 | cv2.imwrite(os.path.join(dest_img_dir, save_img_name), img) 119 | xmlfile = os.path.join(dest_label_dir, stem + '.xml') 120 | 121 | with open(xmlfile, 'w') as f: 122 | f.write(doc.toprettyxml(indent=' ')) 123 | 124 | ftrain.writelines(stem + '\n') 125 | 126 | (f.close() for f in fs) 127 | ftrain.close() 128 | -------------------------------------------------------------------------------- /tools/trainval_net.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Zheqi He, Xinlei Chen, based on code from Ross Girshick 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import _init_paths 11 | from model.train_val import get_training_roidb, train_net 12 | from model.config import cfg, cfg_from_file, cfg_from_list, get_output_dir, get_output_tb_dir 13 | from datasets.factory import get_imdb 14 | import datasets.imdb 15 | import argparse 16 | import pprint 17 | import numpy as np 18 | import sys 19 | 20 | import tensorflow as tf 21 | from nets.vgg16 import vgg16 22 | from nets.resnet_v1 import Resnetv1 23 | from nets.squeezenet import SqueezeNet 24 | 
from nets.mobilenet_v2 import MobileNetV2 25 | 26 | 27 | def parse_args(): 28 | """ 29 | Parse input arguments 30 | """ 31 | parser = argparse.ArgumentParser(description='Train a CTPN network') 32 | parser.add_argument('--cfg', dest='cfg_file', 33 | help='optional config file', 34 | default='./data/cfgs/vgg16.yml', type=str) 35 | parser.add_argument('--pretrained_model', 36 | default=None, 37 | help='path to pretrained model, initialize with pretrained model weights', 38 | type=str) 39 | parser.add_argument('--imdb', dest='imdb_name', 40 | help='dataset to train on', 41 | default='voc_2007_trainval', type=str) 42 | parser.add_argument('--imdbval', dest='imdbval_name', 43 | help='dataset to validate on', 44 | default='voc_2007_test', type=str) 45 | parser.add_argument('--iters', dest='max_iters', 46 | help='number of iterations to train', 47 | default=50000, type=int) 48 | parser.add_argument('--tag', dest='tag', 49 | help='tag of the model', 50 | default=None, type=str) 51 | parser.add_argument('--net', dest='net', 52 | help='vgg16, res50, res101, res152, mobile, squeeze', 53 | choices=['vgg16', 'res50', 'res101', 'res152', 'mobile', 'squeeze'], 54 | default='vgg16', type=str) 55 | parser.add_argument('--set', dest='set_cfgs', 56 | help='set config keys', default=None, 57 | nargs=argparse.REMAINDER) 58 | 59 | args = parser.parse_args() 60 | return args 61 | 62 | 63 | def combined_roidb(imdb_names): 64 | """ 65 | Combine multiple roidbs 66 | """ 67 | 68 | def get_roidb(imdb_name): 69 | imdb = get_imdb(imdb_name) 70 | print('Loaded dataset `{:s}` for training'.format(imdb.name)) 71 | imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD) 72 | print('Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)) 73 | roidb = get_training_roidb(imdb) 74 | return roidb 75 | 76 | roidbs = [get_roidb(s) for s in imdb_names.split('+')] 77 | roidb = roidbs[0] 78 | if len(roidbs) > 1: 79 | for r in roidbs[1:]: 80 | roidb.extend(r) 81 | tmp = get_imdb(imdb_names.split('+')[1]) 82 | imdb = datasets.imdb.imdb(imdb_names, tmp.classes) 83 | else: 84 | imdb = get_imdb(imdb_names) 85 | return imdb, roidb 86 | 87 | 88 | if __name__ == '__main__': 89 | args = parse_args() 90 | 91 | print('Called with args:') 92 | print(args) 93 | 94 | if args.cfg_file is not None: 95 | cfg_from_file(args.cfg_file) 96 | if args.set_cfgs is not None: 97 | cfg_from_list(args.set_cfgs) 98 | 99 | print('Using config:') 100 | pprint.pprint(cfg) 101 | 102 | np.random.seed(cfg.RNG_SEED) 103 | 104 | # train set 105 | # imdb, roidb = combined_roidb(args.imdb_name) 106 | imdb = get_imdb(args.imdb_name) 107 | roidb = get_training_roidb(imdb) 108 | print('{:d} roidb entries'.format(len(roidb))) 109 | 110 | # output directory where the models are saved 111 | output_dir = get_output_dir(imdb, args.tag) 112 | print('Output will be saved to `{:s}`'.format(output_dir)) 113 | 114 | # tensorboard directory where the summaries are saved during training 115 | tb_dir = get_output_tb_dir(imdb, args.tag) 116 | print('TensorFlow summaries will be saved to `{:s}`'.format(tb_dir)) 117 | 118 | # also add the validation set, but with no flipping images 119 | orgflip = cfg.TRAIN.USE_FLIPPED 120 | cfg.TRAIN.USE_FLIPPED = False 121 | _, valroidb = combined_roidb(args.imdbval_name) 122 | print('{:d} validation roidb entries'.format(len(valroidb))) 123 | cfg.TRAIN.USE_FLIPPED = orgflip 124 | 125 | # load network 126 | if args.net == 'vgg16': 127 | net = vgg16() 128 | elif args.net == 'res50': 129 | net = Resnetv1(num_layers=50) 130 | elif args.net == 'res101': 
131 | net = Resnetv1(num_layers=101) 132 | elif args.net == 'res152': 133 | net = Resnetv1(num_layers=152) 134 | elif args.net == 'mobile': 135 | net = MobileNetV2() 136 | elif args.net == 'squeeze': 137 | net = SqueezeNet() 138 | else: 139 | raise NotImplementedError 140 | 141 | train_net(net, imdb, roidb, valroidb, output_dir, tb_dir, 142 | pretrained_model=args.pretrained_model, 143 | max_iters=args.max_iters) 144 | -------------------------------------------------------------------------------- /lib/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include 10 | #include 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks 
+ col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(&current_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<<blocks, threads>>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector<unsigned long long> mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector<unsigned long long> remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /lib/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | import numpy as np 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # adapted from http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 19 | for dir in path.split(os.pathsep): 20 | binpath = pjoin(dir, name) 21 | if os.path.exists(binpath): 22 | return os.path.abspath(binpath) 23 | return None 24 | 25 | 26 | def locate_cuda(): 27 | """Locate the CUDA environment on the system 28 | 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | 32 | Starts by looking for the CUDAHOME env variable. If not found, everything 33 | is based on finding 'nvcc' in the PATH.
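# [Editor's note] The CUDA kernel in lib/nms/nms_kernel.cu above computes pairwise
# IoU suppression bitmasks on the GPU, and the host loop in _nms() then walks the
# score-sorted boxes and keeps each box not yet suppressed by an earlier kept box.
# A minimal NumPy sketch of that greedy rule, for reference only; the function name
# and code are the editor's illustration, not part of this repository:
import numpy as np

def greedy_nms(boxes, thresh):
    """boxes: (N, 4) array [x1, y1, x2, y2], already sorted by descending score."""
    keep = []
    suppressed = np.zeros(len(boxes), dtype=bool)
    areas = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
    for i in range(len(boxes)):
        if suppressed[i]:
            continue
        keep.append(i)
        # IoU of box i against all later boxes, same formula as devIoU above
        xx1 = np.maximum(boxes[i, 0], boxes[i + 1:, 0])
        yy1 = np.maximum(boxes[i, 1], boxes[i + 1:, 1])
        xx2 = np.minimum(boxes[i, 2], boxes[i + 1:, 2])
        yy2 = np.minimum(boxes[i, 3], boxes[i + 1:, 3])
        inter = np.maximum(xx2 - xx1 + 1, 0) * np.maximum(yy2 - yy1 + 1, 0)
        iou = inter / (areas[i] + areas[i + 1:] - inter)
        suppressed[i + 1:] |= iou > thresh
    return keep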
34 | """ 35 | 36 | # first check if the CUDAHOME env variable is in use 37 | if 'CUDAHOME' in os.environ: 38 | home = os.environ['CUDAHOME'] 39 | nvcc = pjoin(home, 'bin', 'nvcc') 40 | else: 41 | # otherwise, search the PATH for NVCC 42 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 43 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 44 | if nvcc is None: 45 | raise EnvironmentError('The nvcc binary could not be ' 46 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 47 | home = os.path.dirname(os.path.dirname(nvcc)) 48 | 49 | cudaconfig = {'home': home, 'nvcc': nvcc, 50 | 'include': pjoin(home, 'include'), 51 | 'lib64': pjoin(home, 'lib64')} 52 | for k, v in cudaconfig.items(): 53 | if not os.path.exists(v): 54 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 55 | 56 | return cudaconfig 57 | 58 | 59 | CUDA = locate_cuda() 60 | 61 | # Obtain the numpy include directory. This logic works across numpy versions. 62 | try: 63 | numpy_include = np.get_include() 64 | except AttributeError: 65 | numpy_include = np.get_numpy_include() 66 | 67 | 68 | def customize_compiler_for_nvcc(self): 69 | """inject deep into distutils to customize how the dispatch 70 | to gcc/nvcc works. 71 | 72 | If you subclass UnixCCompiler, it's not trivial to get your subclass 73 | injected in, and still have the right customizations (i.e. 74 | distutils.sysconfig.customize_compiler) run on it. So instead of going 75 | the OO route, I have this. Note, it's kindof like a wierd functional 76 | subclassing going on.""" 77 | 78 | # tell the compiler it can processes .cu 79 | self.src_extensions.append('.cu') 80 | 81 | # save references to the default compiler_so and _comple methods 82 | default_compiler_so = self.compiler_so 83 | super = self._compile 84 | 85 | # now redefine the _compile method. This gets executed for each 86 | # object but distutils doesn't have the ability to change compilers 87 | # based on source extension: we add it. 
88 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 89 | print(extra_postargs) 90 | if os.path.splitext(src)[1] == '.cu': 91 | # use the cuda for .cu files 92 | self.set_executable('compiler_so', CUDA['nvcc']) 93 | # use only a subset of the extra_postargs, which are 1-1 translated 94 | # from the extra_compile_args in the Extension class 95 | postargs = extra_postargs['nvcc'] 96 | else: 97 | postargs = extra_postargs['gcc'] 98 | 99 | super(obj, src, ext, cc_args, postargs, pp_opts) 100 | # reset the default compiler_so, which we might have changed for cuda 101 | self.compiler_so = default_compiler_so 102 | 103 | # inject our redefined _compile method into the class 104 | self._compile = _compile 105 | 106 | 107 | # run the customize_compiler 108 | class custom_build_ext(build_ext): 109 | def build_extensions(self): 110 | customize_compiler_for_nvcc(self.compiler) 111 | build_ext.build_extensions(self) 112 | 113 | 114 | ext_modules = [ 115 | Extension( 116 | "utils.cython_bbox", 117 | ["utils/bbox.pyx"], 118 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 119 | include_dirs=[numpy_include] 120 | ), 121 | Extension( 122 | "nms.cpu_nms", 123 | ["nms/cpu_nms.pyx"], 124 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 125 | include_dirs=[numpy_include] 126 | ), 127 | Extension('nms.gpu_nms', 128 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 129 | library_dirs=[CUDA['lib64']], 130 | libraries=['cudart'], 131 | language='c++', 132 | runtime_library_dirs=[CUDA['lib64']], 133 | # this syntax is specific to this build system 134 | # we're only going to use certain compiler args with nvcc and not with gcc 135 | # the implementation of this trick is in customize_compiler() below 136 | extra_compile_args={'gcc': ["-Wno-unused-function"], 137 | 'nvcc': ['-arch=sm_52', 138 | '--ptxas-options=-v', 139 | '-c', 140 | '--compiler-options', 141 | "'-fPIC'"]}, 142 | include_dirs=[numpy_include, CUDA['include']] 143 | ) 144 | ] 145 | 146 | setup( 147 | name='tf_faster_rcnn', 148 | ext_modules=ext_modules, 149 | # inject our custom trigger 150 | cmdclass={'build_ext': custom_build_ext}, 151 | ) 152 | -------------------------------------------------------------------------------- /tools/icdar.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import glob 8 | import time 9 | from zipfile import ZipFile 10 | 11 | import _init_paths 12 | from model.config import cfg 13 | from model.test import im_detect 14 | from model.nms_wrapper import nms 15 | from text_connector import TextDetector 16 | 17 | from utils.timer import Timer 18 | import tensorflow as tf 19 | import matplotlib.pyplot as plt 20 | import numpy as np 21 | import os 22 | import cv2 23 | import argparse 24 | 25 | from nets.vgg16 import vgg16 26 | from nets.resnet_v1 import Resnetv1 27 | from nets.mobilenet_v2 import MobileNetV2 28 | 29 | from utils import helper 30 | 31 | from demo import recover_scale 32 | 33 | CLASSES = ('__background__', 'text') 34 | 35 | 36 | def demo(sess, net, im_file, icdar_dir, oriented=False, ltrb=False): 37 | """Detect object classes in an image using pre-computed object proposals.""" 38 | 39 | # Load the demo image 40 | im = helper.read_rgb_img(im_file) 41 | 42 | # Detect all object classes and regress object bounds 43 | timer = Timer() 44 | timer.tic() 45 | scores, boxes, 
resized_im_shape, im_scale = im_detect(sess, net, im) 46 | timer.toc() 47 | 48 | # Run TextDetector to merge small box 49 | line_detector = TextDetector(oriented) 50 | 51 | # text_lines point order: left-top, right-top, left-bottom, right-bottom 52 | text_lines = line_detector.detect(boxes, scores[:, np.newaxis], resized_im_shape) 53 | print("Image %s, detect %d text lines in %.3fs" % (im_file, len(text_lines), timer.diff)) 54 | 55 | if len(text_lines) != 0: 56 | text_lines = recover_scale(text_lines, im_scale) 57 | 58 | return save_result_txt(text_lines, icdar_dir, im_file, ltrb) 59 | 60 | 61 | def save_result_txt(text_lines, icdar_dir, im_file, ltrb=False): 62 | # ICDAR need box points in clockwise 63 | boxes = [[l[0], l[1], l[2], l[3], l[6], l[7], l[4], l[5]] for l in text_lines] 64 | 65 | im_name = im_file.split('/')[-1].split('.')[0] 66 | res_file = os.path.join(icdar_dir, 'res_%s.txt' % im_name) 67 | if not os.path.exists(icdar_dir): 68 | os.makedirs(icdar_dir) 69 | 70 | with open(res_file, mode='w') as f: 71 | for line in boxes: 72 | if ltrb: 73 | min_x = min([line[0], line[2], line[4], line[6]]) 74 | min_y = min([line[1], line[3], line[5], line[7]]) 75 | max_x = max([line[0], line[2], line[4], line[6]]) 76 | max_y = max([line[1], line[3], line[5], line[7]]) 77 | 78 | f.write('%d,%d,%d,%d\n' % (min_x, min_y, max_x, max_y)) 79 | else: 80 | f.write('%d,%d,%d,%d,%d,%d,%d,%d\n' % (line[0], line[1], line[2], line[3], 81 | line[4], line[5], line[6], line[7])) 82 | return res_file 83 | 84 | 85 | def parse_args(): 86 | """Parse input arguments.""" 87 | parser = argparse.ArgumentParser(description='Test images, and save result as ICDAR challenge format') 88 | parser.add_argument('--net', dest='net', choices=['vgg16', 'res101'], default='vgg16') 89 | parser.add_argument('--img_dir', default='./data/demo') 90 | parser.add_argument('--dataset', dest='dataset', help='model tag', default='voc_2007_trainval') 91 | parser.add_argument('--tag', dest='tag', help='model tag', default='default') 92 | parser.add_argument('-o', '--oriented', action='store_true', default=False, help='output rotated detect box') 93 | parser.add_argument('-c', '--challenge', type=str, help='Which challenge to run', 94 | choices=[ 95 | 'ICDAR13', # ICDAR13 - Focused Scene Text 96 | 'ICDAR15', # ICDAR15 - Challenge 4 - Incidental Scene Text 97 | 'MLT17' # Multi-lingual scene text detection 98 | ]) 99 | args = parser.parse_args() 100 | 101 | if not os.path.exists(args.img_dir): 102 | print("img dir not exists.") 103 | exit(-1) 104 | 105 | args.result_dir = os.path.join('./data/result', args.tag) 106 | if not os.path.exists(args.result_dir): 107 | os.makedirs(args.result_dir) 108 | 109 | return args 110 | 111 | 112 | if __name__ == '__main__': 113 | args = parse_args() 114 | 115 | # model path 116 | netname = args.net 117 | dataset = args.dataset 118 | 119 | ckpt_dir = os.path.join('output', netname, dataset, args.tag) 120 | ckpt = tf.train.get_checkpoint_state(ckpt_dir) 121 | 122 | # set config 123 | tfconfig = tf.ConfigProto(allow_soft_placement=True) 124 | tfconfig.gpu_options.allow_growth = True 125 | 126 | # init session 127 | sess = tf.Session(config=tfconfig) 128 | # load network 129 | if netname == 'vgg16': 130 | net = vgg16() 131 | elif netname == 'res101': 132 | net = Resnetv1(num_layers=101) 133 | elif netname == 'mobile': 134 | net = MobileNetV2() 135 | else: 136 | raise NotImplementedError 137 | 138 | cfg.USE_GPU_NMS = True 139 | net.create_architecture("TEST", 140 | num_classes=len(CLASSES), 141 | tag=args.tag, 
142 | anchor_width=cfg.CTPN.ANCHOR_WIDTH, 143 | anchor_h_ratio_step=cfg.CTPN.H_RADIO_STEP, 144 | num_anchors=cfg.CTPN.NUM_ANCHORS) 145 | saver = tf.train.Saver() 146 | saver.restore(sess, ckpt.model_checkpoint_path) 147 | 148 | print('Loaded network {:s}'.format(ckpt.model_checkpoint_path)) 149 | 150 | txt_files = [] 151 | icdar_dir = os.path.join(args.result_dir, args.challenge) 152 | 153 | ltrb = False 154 | if args.challenge in ['ICDAR13', 'ICDAR13_Det']: 155 | ltrb = True 156 | 157 | im_files = glob.glob(args.img_dir + "/*.*") 158 | for im_file in im_files: 159 | txt_file = demo(sess, net, im_file, icdar_dir, oriented=args.oriented, ltrb=ltrb) 160 | txt_files.append(txt_file) 161 | 162 | zip_path = os.path.join('./data/ICDAR_submit', '%s_%s_submit.zip' % (args.challenge, args.tag)) 163 | print(os.path.abspath(zip_path)) 164 | with ZipFile(zip_path, 'w') as f: 165 | for txt in txt_files: 166 | f.write(txt, txt.split('/')[-1]) 167 | -------------------------------------------------------------------------------- /lib/nets/resnet_v1.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Zheqi He and Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import tensorflow as tf 11 | import tensorflow.contrib.slim as slim 12 | from tensorflow.contrib.slim import losses 13 | from tensorflow.contrib.slim import arg_scope 14 | from tensorflow.contrib.slim.python.slim.nets import resnet_utils 15 | from tensorflow.contrib.slim.python.slim.nets import resnet_v1 16 | from tensorflow.contrib.slim.python.slim.nets.resnet_v1 import resnet_v1_block 17 | import numpy as np 18 | 19 | from nets.network import Network 20 | from model.config import cfg 21 | 22 | 23 | def resnet_arg_scope(is_training=True, 24 | batch_norm_decay=0.997, 25 | batch_norm_epsilon=1e-5, 26 | batch_norm_scale=True): 27 | batch_norm_params = { 28 | 'is_training': False, 29 | 'decay': batch_norm_decay, 30 | 'epsilon': batch_norm_epsilon, 31 | 'scale': batch_norm_scale, 32 | 'trainable': False, 33 | 'updates_collections': tf.GraphKeys.UPDATE_OPS 34 | } 35 | 36 | with arg_scope( 37 | [slim.conv2d], 38 | weights_regularizer=slim.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY), 39 | weights_initializer=slim.variance_scaling_initializer(), 40 | trainable=is_training, 41 | activation_fn=tf.nn.relu, 42 | normalizer_fn=slim.batch_norm, 43 | normalizer_params=batch_norm_params): 44 | with arg_scope([slim.batch_norm], **batch_norm_params) as arg_sc: 45 | return arg_sc 46 | 47 | 48 | class Resnetv1(Network): 49 | def __init__(self, num_layers=50): 50 | Network.__init__(self) 51 | self._feat_stride = [16, ] 52 | self._num_layers = num_layers 53 | self._scope = 'resnet_v1_%d' % num_layers 54 | self._decide_blocks() 55 | 56 | # Do the first few layers manually, because 'SAME' padding can behave inconsistently 57 | # for images of different sizes: sometimes 0, sometimes 1 58 | def _build_base(self): 59 | with tf.variable_scope(self._scope, self._scope): 60 | net = resnet_utils.conv2d_same(self._image, 64, 7, stride=2, scope='conv1') 61 | net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]]) 62 | net = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', scope='pool1') 63 | 64 | return net 65 | 66 | def 
_image_to_head(self, is_training, reuse=None): 67 | assert (0 <= cfg.RESNET.FIXED_BLOCKS <= 3) 68 | # Now the base is always fixed during training 69 | with slim.arg_scope(resnet_arg_scope(is_training=False)): 70 | net_conv = self._build_base() 71 | if cfg.RESNET.FIXED_BLOCKS > 0: 72 | with slim.arg_scope(resnet_arg_scope(is_training=False)): 73 | net_conv, _ = resnet_v1.resnet_v1(net_conv, 74 | self._blocks[0:cfg.RESNET.FIXED_BLOCKS], 75 | global_pool=False, 76 | include_root_block=False, 77 | reuse=reuse, 78 | scope=self._scope) 79 | if cfg.RESNET.FIXED_BLOCKS < 3: 80 | with slim.arg_scope(resnet_arg_scope(is_training=is_training)): 81 | net_conv, _ = resnet_v1.resnet_v1(net_conv, 82 | self._blocks[cfg.RESNET.FIXED_BLOCKS:-1], 83 | global_pool=False, 84 | include_root_block=False, 85 | reuse=reuse, 86 | scope=self._scope) 87 | 88 | self._act_summaries.append(net_conv) 89 | self._layers['head'] = net_conv 90 | 91 | return net_conv 92 | 93 | def _decide_blocks(self): 94 | # choose different blocks for different number of layers 95 | if self._num_layers == 50: 96 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), 97 | resnet_v1_block('block2', base_depth=128, num_units=4, stride=2), 98 | # use stride 1 for the last conv4 layer 99 | resnet_v1_block('block3', base_depth=256, num_units=6, stride=1), 100 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)] 101 | 102 | elif self._num_layers == 101: 103 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), 104 | resnet_v1_block('block2', base_depth=128, num_units=4, stride=2), 105 | # use stride 1 for the last conv4 layer 106 | resnet_v1_block('block3', base_depth=256, num_units=23, stride=1), 107 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)] 108 | 109 | elif self._num_layers == 152: 110 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), 111 | resnet_v1_block('block2', base_depth=128, num_units=8, stride=2), 112 | # use stride 1 for the last conv4 layer 113 | resnet_v1_block('block3', base_depth=256, num_units=36, stride=1), 114 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)] 115 | 116 | else: 117 | # other numbers are not supported 118 | raise NotImplementedError 119 | 120 | def get_variables_to_restore(self, variables, var_keep_dic): 121 | variables_to_restore = [] 122 | 123 | for v in variables: 124 | # exclude the first conv layer to swap RGB to BGR 125 | if v.name == (self._scope + '/conv1/weights:0'): 126 | self._variables_to_fix[v.name] = v 127 | continue 128 | if v.name.split(':')[0] in var_keep_dic: 129 | print('Variables restored: %s' % v.name) 130 | variables_to_restore.append(v) 131 | 132 | return variables_to_restore 133 | 134 | -------------------------------------------------------------------------------- /lib/nets/mobilenet/mobilenet_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Implementation of Mobilenet V2. 16 | Architecture: https://arxiv.org/abs/1801.04381 17 | The base model gives 72.2% accuracy on ImageNet, with 300MMadds, 18 | 3.4 M parameters. 19 | """ 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | 25 | import copy 26 | 27 | import tensorflow as tf 28 | 29 | from nets.mobilenet import conv_blocks as ops 30 | from nets.mobilenet import mobilenet as lib 31 | 32 | slim = tf.contrib.slim 33 | op = lib.op 34 | 35 | expand_input = ops.expand_input_by_factor 36 | 37 | # Architecture: https://arxiv.org/abs/1801.04381 38 | V2_DEF = dict( 39 | defaults={ 40 | # Note: these parameters of batch norm affect the architecture 41 | # that's why they are here and not in training_scope. 42 | (slim.batch_norm,): {'center': True, 'scale': True}, 43 | (slim.conv2d, slim.fully_connected, slim.separable_conv2d): { 44 | 'normalizer_fn': slim.batch_norm, 'activation_fn': tf.nn.relu6 45 | }, 46 | (ops.expanded_conv,): { 47 | 'expansion_size': expand_input(6), 48 | 'split_expansion': 1, 49 | 'normalizer_fn': slim.batch_norm, 50 | 'residual': True 51 | }, 52 | (slim.conv2d, slim.separable_conv2d): {'padding': 'SAME'} 53 | }, 54 | spec=[ 55 | op(slim.conv2d, stride=2, num_outputs=32, kernel_size=[3, 3]), 56 | op(ops.expanded_conv, 57 | expansion_size=expand_input(1, divisible_by=1), 58 | num_outputs=16), 59 | op(ops.expanded_conv, stride=2, num_outputs=24), 60 | op(ops.expanded_conv, stride=1, num_outputs=24), 61 | op(ops.expanded_conv, stride=2, num_outputs=32), 62 | op(ops.expanded_conv, stride=1, num_outputs=32), 63 | op(ops.expanded_conv, stride=1, num_outputs=32), 64 | op(ops.expanded_conv, stride=2, num_outputs=64), 65 | op(ops.expanded_conv, stride=1, num_outputs=64), 66 | op(ops.expanded_conv, stride=1, num_outputs=64), 67 | op(ops.expanded_conv, stride=1, num_outputs=64), 68 | op(ops.expanded_conv, stride=1, num_outputs=96), 69 | op(ops.expanded_conv, stride=1, num_outputs=96), 70 | op(ops.expanded_conv, stride=1, num_outputs=96), 71 | op(ops.expanded_conv, stride=2, num_outputs=160), 72 | op(ops.expanded_conv, stride=1, num_outputs=160), 73 | op(ops.expanded_conv, stride=1, num_outputs=160), 74 | op(ops.expanded_conv, stride=1, num_outputs=320), 75 | op(slim.conv2d, stride=1, kernel_size=[1, 1], num_outputs=1280) 76 | ], 77 | ) 78 | 79 | 80 | @slim.add_arg_scope 81 | def mobilenet(input_tensor, 82 | num_classes=1001, 83 | depth_multiplier=1.0, 84 | scope='MobilenetV2', 85 | conv_defs=None, 86 | finegrain_classification_mode=False, 87 | min_depth=None, 88 | divisible_by=None, 89 | **kwargs): 90 | """Creates mobilenet V2 network. 91 | Inference mode is created by default. To create training use training_scope 92 | below. 93 | with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): 94 | logits, endpoints = mobilenet_v2.mobilenet(input_tensor) 95 | Args: 96 | input_tensor: The input tensor 97 | num_classes: number of classes 98 | depth_multiplier: The multiplier applied to scale number of 99 | channels in each layer. Note: this is called depth multiplier in the 100 | paper but the name is kept for consistency with slim's model builder. 101 | scope: Scope of the operator 102 | conv_defs: Allows to override default conv def. 
103 | finegrain_classification_mode: When set to True, the model 104 | will keep the last layer large even for small multipliers. Following 105 | https://arxiv.org/abs/1801.04381 106 | suggests that it improves performance for ImageNet-type of problems. 107 | *Note* ignored if final_endpoint makes the builder exit earlier. 108 | min_depth: If provided, will ensure that all layers will have that 109 | many channels after application of depth multiplier. 110 | divisible_by: If provided will ensure that all layers # channels 111 | will be divisible by this number. 112 | **kwargs: passed directly to mobilenet.mobilenet: 113 | prediction_fn- what prediction function to use. 114 | reuse-: whether to reuse variables (if reuse set to true, scope 115 | must be given). 116 | Returns: 117 | logits/endpoints pair 118 | Raises: 119 | ValueError: On invalid arguments 120 | """ 121 | if conv_defs is None: 122 | conv_defs = V2_DEF 123 | if 'multiplier' in kwargs: 124 | raise ValueError('mobilenetv2 doesn\'t support generic ' 125 | 'multiplier parameter use "depth_multiplier" instead.') 126 | if finegrain_classification_mode: 127 | conv_defs = copy.deepcopy(conv_defs) 128 | if depth_multiplier < 1: 129 | conv_defs['spec'][-1].params['num_outputs'] /= depth_multiplier 130 | 131 | depth_args = {} 132 | # NB: do not set depth_args unless they are provided to avoid overriding 133 | # whatever default depth_multiplier might have thanks to arg_scope. 134 | if min_depth is not None: 135 | depth_args['min_depth'] = min_depth 136 | if divisible_by is not None: 137 | depth_args['divisible_by'] = divisible_by 138 | 139 | with slim.arg_scope((lib.depth_multiplier,), **depth_args): 140 | return lib.mobilenet( 141 | input_tensor, 142 | num_classes=num_classes, 143 | conv_defs=conv_defs, 144 | scope=scope, 145 | multiplier=depth_multiplier, 146 | **kwargs) 147 | 148 | 149 | @slim.add_arg_scope 150 | def mobilenet_base(input_tensor, depth_multiplier=1.0, **kwargs): 151 | """Creates base of the mobilenet (no pooling and no logits) .""" 152 | return mobilenet(input_tensor, 153 | depth_multiplier=depth_multiplier, 154 | base_only=True, **kwargs) 155 | 156 | 157 | def training_scope(**kwargs): 158 | """Defines MobilenetV2 training scope. 159 | Usage: 160 | with tf.contrib.slim.arg_scope(mobilenet_v2.training_scope()): 161 | logits, endpoints = mobilenet_v2.mobilenet(input_tensor) 162 | with slim. 163 | Args: 164 | **kwargs: Passed to mobilenet.training_scope. The following parameters 165 | are supported: 166 | weight_decay- The weight decay to use for regularizing the model. 167 | stddev- Standard deviation for initialization, if negative uses xavier. 168 | dropout_keep_prob- dropout keep probability 169 | bn_decay- decay for the batch norm moving averages. 170 | Returns: 171 | An `arg_scope` to use for the mobilenet v2 model. 
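# [Editor's note] As the docstring above describes, depth_multiplier scales the
# channel count of every layer: with depth_multiplier=0.5 the first 32-channel conv
# in V2_DEF is built with roughly 16 channels (subject to min_depth / divisible_by
# rounding), while finegrain_classification_mode keeps the final 1280-channel layer
# at full width even for small multipliers.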
172 | """ 173 | return lib.training_scope(**kwargs) 174 | 175 | 176 | __all__ = ['training_scope', 'mobilenet_base', 'mobilenet', 'V2_DEF'] 177 | -------------------------------------------------------------------------------- /lib/layer_utils/anchor_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Xinlei Chen 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | from model.config import cfg 13 | import numpy as np 14 | import numpy.random as npr 15 | from utils.cython_bbox import bbox_overlaps 16 | from model.bbox_transform import bbox_transform 17 | 18 | 19 | def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, all_anchors, num_anchors): 20 | """ 21 | Same as the anchor target layer in original Fast/er RCNN 22 | :param 23 | rpn_cls_score: (1, H, W, Ax2) bg/fg scores of previous conv layer 24 | gt_boxes: (G, 5) vstack of [x1, y1, x2, y2, class] 25 | im_info: [image_height, image_width] 26 | all_anchors: all anchors pre generated 27 | num_anchors: 28 | :returns 29 | rpn_labels: (1, H, W, A) 30 | rpn_bbox_targets: (1, H, W, Ax4) 31 | """ 32 | A = num_anchors 33 | total_anchors = all_anchors.shape[0] 34 | K = total_anchors / num_anchors 35 | 36 | # allow boxes to sit over the edge by a small amount 37 | _allowed_border = 0 38 | 39 | # map of shape (..., H, W), height/width for feature map 40 | height, width = rpn_cls_score.shape[1:3] 41 | 42 | # print("rpn: gt_boxes.shape %d" % gt_boxes.shape) 43 | # print("rpn: gt_boxes", gt_boxes) 44 | # only keep anchors inside the image 45 | inds_inside = np.where( 46 | (all_anchors[:, 0] >= -_allowed_border) & 47 | (all_anchors[:, 1] >= -_allowed_border) & 48 | (all_anchors[:, 2] < im_info[1] + _allowed_border) & # width 49 | (all_anchors[:, 3] < im_info[0] + _allowed_border) # height 50 | )[0] 51 | # print("total_anchors %d" % total_anchors) 52 | # print("inds_inside %d" % len(inds_inside)) 53 | 54 | # keep only inside anchors 55 | anchors = all_anchors[inds_inside, :] 56 | 57 | # label: 1 is positive, 0 is negative, -1 is dont care 58 | labels = np.empty((len(inds_inside),), dtype=np.float32) 59 | labels.fill(-1) 60 | 61 | # overlaps between the anchors and the gt boxes 62 | # overlaps (ex, gt) 63 | overlaps = bbox_overlaps( 64 | np.ascontiguousarray(anchors, dtype=np.float), 65 | np.ascontiguousarray(gt_boxes, dtype=np.float)) 66 | argmax_overlaps = overlaps.argmax(axis=1) 67 | max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] 68 | gt_argmax_overlaps = overlaps.argmax(axis=0) 69 | gt_max_overlaps = overlaps[gt_argmax_overlaps, 70 | np.arange(overlaps.shape[1])] 71 | gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] 72 | 73 | if not cfg.TRAIN.RPN_CLOBBER_POSITIVES: 74 | # assign bg labels first so that positive labels can clobber them 75 | # first set the negatives 76 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 77 | 78 | # fg label: for each gt, anchor with highest overlap 79 | labels[gt_argmax_overlaps] = 1 80 | 81 | # fg label: above threshold IOU 82 | labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 83 | 84 | if cfg.TRAIN.RPN_CLOBBER_POSITIVES: 85 | # assign bg labels last so 
that negative labels can clobber positives 86 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 87 | 88 | # subsample positive labels if we have too many 89 | num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) 90 | fg_inds = np.where(labels == 1)[0] 91 | # print("fg_inds.shape", fg_inds.shape) 92 | if len(fg_inds) > num_fg: 93 | disable_inds = npr.choice( 94 | fg_inds, size=(len(fg_inds) - num_fg), replace=False) 95 | labels[disable_inds] = -1 96 | 97 | # subsample negative labels if we have too many 98 | num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) 99 | bg_inds = np.where(labels == 0)[0] 100 | # print("bg_inds.shape", bg_inds.shape) 101 | if len(bg_inds) > num_bg: 102 | disable_inds = npr.choice( 103 | bg_inds, size=(len(bg_inds) - num_bg), replace=False) 104 | labels[disable_inds] = -1 105 | 106 | bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) 107 | # get rpn_bbox_targets in delta format, the predict result of rpn is (tx, ty, tw, th) 108 | bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) 109 | # print("bbox_targets!===================", bbox_targets) 110 | 111 | bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 112 | # only the positive ones have regression targets 113 | bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS) 114 | 115 | bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 116 | if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0: 117 | # uniform weighting of examples (given non-uniform sampling) 118 | num_examples = np.sum(labels >= 0) 119 | # positive_weights = np.ones((1, 4)) * 1.0 / num_examples 120 | # negative_weights = np.ones((1, 4)) * 1.0 / num_examples 121 | # CTPN: 122 | positive_weights = np.ones((1, 4)) 123 | negative_weights = np.zeros((1, 4)) 124 | else: 125 | assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) & 126 | (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1)) 127 | positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT / 128 | np.sum(labels == 1)) 129 | negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / 130 | np.sum(labels == 0)) 131 | bbox_outside_weights[labels == 1, :] = positive_weights 132 | bbox_outside_weights[labels == 0, :] = negative_weights 133 | 134 | # map up to original set of anchors 135 | labels = _unmap(labels, total_anchors, inds_inside, fill=-1) 136 | bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) 137 | bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0) 138 | bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0) 139 | 140 | # labels 141 | labels = labels.reshape((1, height, width, A)) 142 | rpn_labels = labels 143 | 144 | # bbox_targets 145 | bbox_targets = bbox_targets.reshape((1, height, width, A * 4)) 146 | rpn_bbox_targets = bbox_targets 147 | 148 | # bbox_inside_weights 149 | bbox_inside_weights = bbox_inside_weights.reshape((1, height, width, A * 4)) 150 | rpn_bbox_inside_weights = bbox_inside_weights 151 | 152 | # bbox_outside_weights 153 | bbox_outside_weights = bbox_outside_weights.reshape((1, height, width, A * 4)) 154 | rpn_bbox_outside_weights = bbox_outside_weights 155 | 156 | return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights 157 | 158 | 159 | def _unmap(data, count, inds, fill=0): 160 | """ Unmap a subset of item (data) back to the original set of items (of 161 | size count) """ 162 | if len(data.shape) == 1: 163 | ret = np.empty((count,), dtype=np.float32) 164 | ret.fill(fill) 
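# [Editor's note] Illustration of _unmap (not part of this file): with count=6,
# inds=[1, 3, 4], data=[0.5, 1.0, 0.25] and fill=-1 the result is
# [-1, 0.5, -1, 1.0, 0.25, -1], i.e. values computed only for the anchors inside
# the image are scattered back to the full anchor set.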
165 | ret[inds] = data 166 | else: 167 | ret = np.empty((count,) + data.shape[1:], dtype=np.float32) 168 | ret.fill(fill) 169 | ret[inds, :] = data 170 | return ret 171 | 172 | 173 | def _compute_targets(ex_rois, gt_rois): 174 | """Compute bounding-box regression targets for an image.""" 175 | 176 | assert ex_rois.shape[0] == gt_rois.shape[0] 177 | assert ex_rois.shape[1] == 4 178 | assert gt_rois.shape[1] == 5 179 | 180 | targets = bbox_transform(ex_rois, gt_rois) 181 | 182 | return targets 183 | -------------------------------------------------------------------------------- /lib/datasets/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import xml.etree.ElementTree as ET 11 | import os 12 | import pickle 13 | import numpy as np 14 | 15 | 16 | def parse_rec(filename): 17 | """ Parse a PASCAL VOC xml file """ 18 | tree = ET.parse(filename) 19 | objects = [] 20 | for obj in tree.findall('object'): 21 | obj_struct = {} 22 | obj_struct['name'] = obj.find('name').text 23 | obj_struct['pose'] = obj.find('pose').text 24 | obj_struct['truncated'] = int(obj.find('truncated').text) 25 | obj_struct['difficult'] = int(obj.find('difficult').text) 26 | bbox = obj.find('bndbox') 27 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 28 | int(bbox.find('ymin').text), 29 | int(bbox.find('xmax').text), 30 | int(bbox.find('ymax').text)] 31 | objects.append(obj_struct) 32 | 33 | return objects 34 | 35 | 36 | def voc_ap(rec, prec, use_07_metric=False): 37 | """ ap = voc_ap(rec, prec, [use_07_metric]) 38 | Compute VOC AP given precision and recall. 39 | If use_07_metric is true, uses the 40 | VOC 07 11 point method (default:False). 41 | """ 42 | if use_07_metric: 43 | # 11 point metric 44 | ap = 0. 45 | for t in np.arange(0., 1.1, 0.1): 46 | if np.sum(rec >= t) == 0: 47 | p = 0 48 | else: 49 | p = np.max(prec[rec >= t]) 50 | ap = ap + p / 11. 51 | else: 52 | # correct AP calculation 53 | # first append sentinel values at the end 54 | mrec = np.concatenate(([0.], rec, [1.])) 55 | mpre = np.concatenate(([0.], prec, [0.])) 56 | 57 | # compute the precision envelope 58 | for i in range(mpre.size - 1, 0, -1): 59 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 60 | 61 | # to calculate area under PR curve, look for points 62 | # where X axis (recall) changes value 63 | i = np.where(mrec[1:] != mrec[:-1])[0] 64 | 65 | # and sum (\Delta recall) * prec 66 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 67 | return ap 68 | 69 | 70 | def voc_eval(detpath, 71 | annopath, 72 | imagesetfile, 73 | classname, 74 | cachedir, 75 | ovthresh=0.5, 76 | use_07_metric=False, 77 | use_diff=False): 78 | """rec, prec, ap = voc_eval(detpath, 79 | annopath, 80 | imagesetfile, 81 | classname, 82 | [ovthresh], 83 | [use_07_metric]) 84 | 85 | Top level function that does the PASCAL VOC evaluation. 86 | 87 | detpath: Path to detections 88 | detpath.format(classname) should produce the detection results file. 89 | annopath: Path to annotations 90 | annopath.format(imagename) should be the xml annotations file. 91 | imagesetfile: Text file containing the list of images, one image per line. 
92 | classname: Category name (duh) 93 | cachedir: Directory for caching the annotations 94 | [ovthresh]: Overlap threshold (default = 0.5) 95 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 96 | (default False) 97 | """ 98 | # assumes detections are in detpath.format(classname) 99 | # assumes annotations are in annopath.format(imagename) 100 | # assumes imagesetfile is a text file with each line an image name 101 | # cachedir caches the annotations in a pickle file 102 | 103 | # first load gt 104 | if not os.path.isdir(cachedir): 105 | os.mkdir(cachedir) 106 | cachefile = os.path.join(cachedir, '%s_annots.pkl' % imagesetfile) 107 | # read list of images 108 | with open(imagesetfile, 'r') as f: 109 | lines = f.readlines() 110 | imagenames = [x.strip() for x in lines] 111 | 112 | if not os.path.isfile(cachefile): 113 | # load annotations 114 | recs = {} 115 | for i, imagename in enumerate(imagenames): 116 | recs[imagename] = parse_rec(annopath.format(imagename)) 117 | if i % 100 == 0: 118 | print('Reading annotation for {:d}/{:d}'.format( 119 | i + 1, len(imagenames))) 120 | # save 121 | print('Saving cached annotations to {:s}'.format(cachefile)) 122 | with open(cachefile, 'wb') as f: 123 | pickle.dump(recs, f) 124 | else: 125 | # load 126 | with open(cachefile, 'rb') as f: 127 | try: 128 | recs = pickle.load(f) 129 | except: 130 | recs = pickle.load(f, encoding='bytes') 131 | 132 | # extract gt objects for this class 133 | class_recs = {} 134 | npos = 0 135 | for imagename in imagenames: 136 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 137 | bbox = np.array([x['bbox'] for x in R]) 138 | if use_diff: 139 | difficult = np.array([False for x in R]).astype(np.bool) 140 | else: 141 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 142 | det = [False] * len(R) 143 | npos = npos + sum(~difficult) 144 | class_recs[imagename] = {'bbox': bbox, 145 | 'difficult': difficult, 146 | 'det': det} 147 | 148 | # read dets 149 | detfile = detpath.format(classname) 150 | with open(detfile, 'r') as f: 151 | lines = f.readlines() 152 | 153 | splitlines = [x.strip().split(' ') for x in lines] 154 | image_ids = [x[0] for x in splitlines] 155 | confidence = np.array([float(x[1]) for x in splitlines]) 156 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 157 | 158 | nd = len(image_ids) 159 | tp = np.zeros(nd) 160 | fp = np.zeros(nd) 161 | 162 | if BB.shape[0] > 0: 163 | # sort by confidence 164 | sorted_ind = np.argsort(-confidence) 165 | sorted_scores = np.sort(-confidence) 166 | BB = BB[sorted_ind, :] 167 | image_ids = [image_ids[x] for x in sorted_ind] 168 | 169 | # go down dets and mark TPs and FPs 170 | for d in range(nd): 171 | R = class_recs[image_ids[d]] 172 | bb = BB[d, :].astype(float) 173 | ovmax = -np.inf 174 | BBGT = R['bbox'].astype(float) 175 | 176 | if BBGT.size > 0: 177 | # compute overlaps 178 | # intersection 179 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 180 | iymin = np.maximum(BBGT[:, 1], bb[1]) 181 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 182 | iymax = np.minimum(BBGT[:, 3], bb[3]) 183 | iw = np.maximum(ixmax - ixmin + 1., 0.) 184 | ih = np.maximum(iymax - iymin + 1., 0.) 185 | inters = iw * ih 186 | 187 | # union 188 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 189 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 190 | (BBGT[:, 3] - BBGT[:, 1] + 1.)
- inters) 191 | 192 | overlaps = inters / uni 193 | ovmax = np.max(overlaps) 194 | jmax = np.argmax(overlaps) 195 | 196 | if ovmax > ovthresh: 197 | if not R['difficult'][jmax]: 198 | if not R['det'][jmax]: 199 | tp[d] = 1. 200 | R['det'][jmax] = 1 201 | else: 202 | fp[d] = 1. 203 | else: 204 | fp[d] = 1. 205 | 206 | # compute precision recall 207 | fp = np.cumsum(fp) 208 | tp = np.cumsum(tp) 209 | rec = tp / float(npos) 210 | # avoid divide by zero in case the first detection matches a difficult 211 | # ground truth 212 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 213 | ap = voc_ap(rec, prec, use_07_metric) 214 | 215 | return rec, prec, ap 216 | -------------------------------------------------------------------------------- /tools/demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -------------------------------------------------------- 4 | # Tensorflow Faster R-CNN 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Written by Xinlei Chen, based on code from Ross Girshick 7 | # -------------------------------------------------------- 8 | 9 | """ 10 | Demo script showing detections in sample images. 11 | 12 | See README.md for installation instructions before running. 13 | """ 14 | from __future__ import absolute_import 15 | from __future__ import division 16 | from __future__ import print_function 17 | 18 | import glob 19 | import time 20 | 21 | import _init_paths 22 | from model.config import cfg 23 | from model.test import im_detect 24 | from model.nms_wrapper import nms 25 | from text_connector import TextDetector 26 | 27 | from utils.timer import Timer 28 | import tensorflow as tf 29 | import matplotlib.pyplot as plt 30 | import numpy as np 31 | import os 32 | import cv2 33 | import argparse 34 | 35 | from nets.vgg16 import vgg16 36 | from nets.resnet_v1 import Resnetv1 37 | from nets.squeezenet import SqueezeNet 38 | from nets.mobilenet_v2 import MobileNetV2 39 | 40 | from utils import helper 41 | 42 | CLASSES = ('__background__', 'text') 43 | 44 | 45 | def vis_detections(im, class_name, dets, thresh=0.5, text=False): 46 | """Draw detected bounding boxes.""" 47 | inds = np.where(dets[:, -1] >= thresh)[0] 48 | if len(inds) == 0: 49 | return 50 | 51 | im = im[:, :, (2, 1, 0)] 52 | fig, ax = plt.subplots(figsize=(12, 12)) 53 | ax.imshow(im, aspect='equal') 54 | for i in inds: 55 | bbox = dets[i, :8] 56 | score = dets[i, -1] 57 | 58 | ax.add_line( 59 | plt.Line2D([bbox[0], bbox[2], bbox[6], bbox[4], bbox[0]], 60 | [bbox[1], bbox[3], bbox[7], bbox[5], bbox[1]], 61 | color='red', linewidth=3) 62 | ) 63 | 64 | if text: 65 | ax.text(bbox[0], bbox[1] - 2, 66 | '{:s} {:.3f}'.format(class_name, score), 67 | bbox=dict(facecolor='blue', alpha=0.5), 68 | fontsize=14, color='white') 69 | 70 | ax.set_title(('{} detections with ' 71 | 'p({} | box) >= {:.1f}').format(class_name, class_name, 72 | thresh), 73 | fontsize=14) 74 | plt.axis('off') 75 | plt.tight_layout() 76 | plt.draw() 77 | plt.show() 78 | 79 | 80 | def save_result(img, img_name, text_lines, result_dir): 81 | dst = img 82 | color = (0, 150, 0) 83 | for bbox in text_lines: 84 | bbox = [int(x) for x in bbox] 85 | p1 = (bbox[0], bbox[1]) 86 | p2 = (bbox[2], bbox[3]) 87 | p3 = (bbox[6], bbox[7]) 88 | p4 = (bbox[4], bbox[5]) 89 | dst = cv2.line(dst, p1, p2, color, 2) 90 | dst = cv2.line(dst, p2, p3, color, 2) 91 | dst = cv2.line(dst, p3, p4, color, 2) 92 | dst = cv2.line(dst, p4, p1, color, 2) 93 | 94 | img_path = 
os.path.join(result_dir, img_name[0:-4] + '.jpg') 95 | cv2.imwrite(img_path, dst) 96 | 97 | 98 | def recover_scale(boxes, scale): 99 | """ 100 | :param boxes: [(x1, y1, x2, y2)] 101 | :param scale: image scale 102 | :return: 103 | """ 104 | tmp_boxes = [] 105 | for b in boxes: 106 | tmp_boxes.append([int(x / scale) for x in b]) 107 | return np.asarray(tmp_boxes).astype(np.float32) 108 | 109 | 110 | def draw_rpn_boxes(img, img_name, boxes, scores, im_scale, nms, save_dir): 111 | """ 112 | :param boxes: [(x1, y1, x2, y2)] 113 | """ 114 | boxes = recover_scale(boxes, im_scale) 115 | 116 | base_name = img_name.split('/')[-1] 117 | color = (0, 255, 0) 118 | out = img.copy() 119 | 120 | if nms: 121 | boxes, scores = TextDetector.pre_process(boxes, scores) 122 | file_name = "%s_rpn_nms.jpg" % base_name 123 | else: 124 | file_name = "%s_rpn.jpg" % base_name 125 | 126 | for i, box in enumerate(boxes): 127 | cv2.rectangle(out, (box[0], box[1]), (box[2], box[3]), color, 2) 128 | cx = int((box[0] + box[2]) / 2) 129 | cy = int((box[1] + box[3]) / 2) 130 | cv2.putText(out, "%.01f" % scores[i], (cx, cy), cv2.FONT_HERSHEY_SIMPLEX, 0.2, (255, 0, 0)) 131 | 132 | cv2.imwrite(os.path.join(save_dir, file_name), out) 133 | 134 | 135 | def demo(sess, net, im_file, result_dir, viz=False, oriented=False): 136 | """Detect object classes in an image using pre-computed object proposals.""" 137 | 138 | # Load the demo image 139 | im = helper.read_rgb_img(im_file) 140 | 141 | # Detect all object classes and regress object bounds 142 | timer = Timer() 143 | timer.tic() 144 | scores, boxes, resized_im_shape, im_scale = im_detect(sess, net, im) 145 | timer.toc() 146 | 147 | im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) 148 | img_name = im_file.split('/')[-1] 149 | 150 | draw_rpn_boxes(im, img_name, boxes, scores[:, np.newaxis], im_scale, True, result_dir) 151 | draw_rpn_boxes(im, img_name, boxes, scores[:, np.newaxis], im_scale, False, result_dir) 152 | 153 | # Run TextDetector to merge small box 154 | line_detector = TextDetector(oriented) 155 | 156 | # The input to line_detector must be the boxes on the scaled (resized) image!! 157 | # If the boxes are mapped back to the original image before line construction, the original image may be too large, so each anchor's width becomes large and MAX_HORIZONTAL_GAP ends up relatively too small 158 | # text_lines point order: left-top, right-top, left-bottom, right-bottom 159 | text_lines = line_detector.detect(boxes, scores[:, np.newaxis], resized_im_shape) 160 | print("Image %s, detect %d text lines in %.3fs" % (im_file, len(text_lines), timer.diff)) 161 | 162 | if len(text_lines) != 0: 163 | text_lines = recover_scale(text_lines, im_scale) 164 | save_result(im, img_name, text_lines, result_dir) 165 | 166 | # Visualize detections 167 | if viz: 168 | vis_detections(im, CLASSES[1], text_lines) 169 | 170 | 171 | def parse_args(): 172 | """Parse input arguments.""" 173 | parser = argparse.ArgumentParser(description='Tensorflow CTPN demo') 174 | parser.add_argument('--net', dest='net', choices=['vgg16', 'squeeze', 'mobile'], default='vgg16') 175 | parser.add_argument('--img_dir', default='/home/cwq/data/ICDAR13/123') 176 | parser.add_argument('--dataset', dest='dataset', help='model tag', default='voc_2007_trainval') 177 | parser.add_argument('--tag', dest='tag', help='model tag', default='vgg_latin_chn_newdata') 178 | parser.add_argument('--viz', action='store_true', default=False, help='show result') 179 | parser.add_argument('-o', '--oriented', action='store_true', default=False, help='output rotated detect box') 180 | args = parser.parse_args() 181 | 182 | if not os.path.exists(args.img_dir): 183 | print("img dir not exists.") 184 | exit(-1) 185 | 186 |
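# [Editor's note] recover_scale() above maps boxes from the resized image back to
# the original image by dividing every coordinate by im_scale; e.g. a box
# (150, 60, 300, 90) detected at im_scale=0.5 becomes (300, 120, 600, 180) in
# original-image coordinates. This is why demo() runs TextDetector on the resized
# image first and only rescales the merged text_lines afterwards.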
args.result_dir = os.path.join('./data/result', args.tag) 187 | if not os.path.exists(args.result_dir): 188 | os.makedirs(args.result_dir) 189 | 190 | return args 191 | 192 | 193 | if __name__ == '__main__': 194 | args = parse_args() 195 | 196 | # model path 197 | netname = args.net 198 | dataset = args.dataset 199 | 200 | ckpt_dir = os.path.join('output', netname, dataset, args.tag) 201 | ckpt = tf.train.get_checkpoint_state(ckpt_dir) 202 | 203 | # set config 204 | tfconfig = tf.ConfigProto(allow_soft_placement=True) 205 | tfconfig.gpu_options.allow_growth = True 206 | 207 | # init session 208 | sess = tf.Session(config=tfconfig) 209 | # load network 210 | if netname == 'vgg16': 211 | net = vgg16() 212 | elif netname == 'res101': 213 | net = Resnetv1(num_layers=101) 214 | elif netname == 'mobile': 215 | net = MobileNetV2() 216 | elif args.net == 'squeeze': 217 | net = SqueezeNet() 218 | else: 219 | raise NotImplementedError 220 | 221 | net.create_architecture("TEST", 222 | num_classes=len(CLASSES), 223 | tag=args.tag, 224 | anchor_width=cfg.CTPN.ANCHOR_WIDTH, 225 | anchor_h_ratio_step=cfg.CTPN.H_RADIO_STEP, 226 | num_anchors=cfg.CTPN.NUM_ANCHORS) 227 | saver = tf.train.Saver() 228 | saver.restore(sess, ckpt.model_checkpoint_path) 229 | 230 | print('Loaded network {:s}'.format(ckpt.model_checkpoint_path)) 231 | 232 | im_files = glob.glob(args.img_dir + "/*.*") 233 | for im_file in im_files: 234 | demo(sess, net, im_file, args.result_dir, args.viz, args.oriented) 235 | -------------------------------------------------------------------------------- /lib/datasets/imdb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Xinlei Chen 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | import os.path as osp 13 | import PIL 14 | from utils.cython_bbox import bbox_overlaps 15 | import numpy as np 16 | import scipy.sparse 17 | from model.config import cfg 18 | 19 | 20 | class imdb(object): 21 | """Image database.""" 22 | 23 | def __init__(self, name, classes=None): 24 | self._name = name 25 | self._num_classes = 0 26 | if not classes: 27 | self._classes = [] 28 | else: 29 | self._classes = classes 30 | self._image_index = [] 31 | self._obj_proposer = 'gt' 32 | self._roidb = None 33 | self._roidb_handler = self.default_roidb 34 | # Use this dict for storing dataset specific config options 35 | self.config = {} 36 | 37 | @property 38 | def name(self): 39 | return self._name 40 | 41 | @property 42 | def num_classes(self): 43 | return len(self._classes) 44 | 45 | @property 46 | def classes(self): 47 | return self._classes 48 | 49 | @property 50 | def image_index(self): 51 | return self._image_index 52 | 53 | @property 54 | def roidb_handler(self): 55 | return self._roidb_handler 56 | 57 | @roidb_handler.setter 58 | def roidb_handler(self, val): 59 | self._roidb_handler = val 60 | 61 | def set_proposal_method(self, method): 62 | method = eval('self.' 
+ method + '_roidb') 63 | self.roidb_handler = method 64 | 65 | @property 66 | def roidb(self): 67 | # A roidb is a list of dictionaries, each with the following keys: 68 | # boxes 69 | # gt_overlaps 70 | # gt_classes 71 | # flipped 72 | if self._roidb is not None: 73 | return self._roidb 74 | self._roidb = self.roidb_handler() 75 | return self._roidb 76 | 77 | @property 78 | def cache_path(self): 79 | cache_path = osp.abspath(osp.join(cfg.DATA_DIR, 'cache')) 80 | if not os.path.exists(cache_path): 81 | os.makedirs(cache_path) 82 | return cache_path 83 | 84 | @property 85 | def num_images(self): 86 | return len(self.image_index) 87 | 88 | def image_path_at(self, i): 89 | raise NotImplementedError 90 | 91 | def default_roidb(self): 92 | raise NotImplementedError 93 | 94 | def evaluate_detections(self, all_boxes, output_dir=None): 95 | """ 96 | all_boxes is a list of length number-of-classes. 97 | Each list element is a list of length number-of-images. 98 | Each of those list elements is either an empty list [] 99 | or a numpy array of detection. 100 | 101 | all_boxes[class][image] = [] or np.array of shape #dets x 5 102 | """ 103 | raise NotImplementedError 104 | 105 | def _get_widths(self): 106 | return [PIL.Image.open(self.image_path_at(i)).size[0] 107 | for i in range(self.num_images)] 108 | 109 | def append_flipped_images(self): 110 | num_images = self.num_images 111 | widths = self._get_widths() 112 | for i in range(num_images): 113 | boxes = self.roidb[i]['boxes'].copy() 114 | oldx1 = boxes[:, 0].copy() 115 | oldx2 = boxes[:, 2].copy() 116 | boxes[:, 0] = widths[i] - oldx2 - 1 117 | boxes[:, 2] = widths[i] - oldx1 - 1 118 | for b in range(len(boxes)): 119 | if boxes[b][2] < boxes[b][0]: 120 | boxes[b][0] = 0 121 | assert (boxes[:, 2] >= boxes[:, 0]).all() 122 | entry = {'boxes': boxes, 123 | 'gt_overlaps': self.roidb[i]['gt_overlaps'], 124 | 'gt_classes': self.roidb[i]['gt_classes'], 125 | 'flipped': True} 126 | 127 | if 'gt_ishard' in self.roidb[i] and 'dontcare_areas' in self.roidb[i]: 128 | entry['gt_ishard'] = self.roidb[i]['gt_ishard'].copy() 129 | dontcare_areas = self.roidb[i]['dontcare_areas'].copy() 130 | oldx1 = dontcare_areas[:, 0].copy() 131 | oldx2 = dontcare_areas[:, 2].copy() 132 | dontcare_areas[:, 0] = widths[i] - oldx2 - 1 133 | dontcare_areas[:, 2] = widths[i] - oldx1 - 1 134 | entry['dontcare_areas'] = dontcare_areas 135 | 136 | self.roidb.append(entry) 137 | 138 | self._image_index = self._image_index * 2 139 | 140 | def evaluate_recall(self, candidate_boxes=None, thresholds=None, 141 | area='all', limit=None): 142 | """Evaluate detection proposal recall metrics. 
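# [Editor's note] In append_flipped_images above a box is mirrored horizontally with
# x1' = width - x2 - 1 and x2' = width - x1 - 1; for an image 800 pixels wide the
# box (100, 50, 300, 80) flips to (499, 50, 699, 80), with y coordinates unchanged.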
143 | 144 | Returns: 145 | results: dictionary of results with keys 146 | 'ar': average recall 147 | 'recalls': vector recalls at each IoU overlap threshold 148 | 'thresholds': vector of IoU overlap thresholds 149 | 'gt_overlaps': vector of all ground-truth overlaps 150 | """ 151 | # Record max overlap value for each gt box 152 | # Return vector of overlap values 153 | areas = {'all': 0, 'small': 1, 'medium': 2, 'large': 3, 154 | '96-128': 4, '128-256': 5, '256-512': 6, '512-inf': 7} 155 | area_ranges = [[0 ** 2, 1e5 ** 2], # all 156 | [0 ** 2, 32 ** 2], # small 157 | [32 ** 2, 96 ** 2], # medium 158 | [96 ** 2, 1e5 ** 2], # large 159 | [96 ** 2, 128 ** 2], # 96-128 160 | [128 ** 2, 256 ** 2], # 128-256 161 | [256 ** 2, 512 ** 2], # 256-512 162 | [512 ** 2, 1e5 ** 2], # 512-inf 163 | ] 164 | assert area in areas, 'unknown area range: {}'.format(area) 165 | area_range = area_ranges[areas[area]] 166 | gt_overlaps = np.zeros(0) 167 | num_pos = 0 168 | for i in range(self.num_images): 169 | # Checking for max_overlaps == 1 avoids including crowd annotations 170 | # (...pretty hacking :/) 171 | max_gt_overlaps = self.roidb[i]['gt_overlaps'].toarray().max(axis=1) 172 | gt_inds = np.where((self.roidb[i]['gt_classes'] > 0) & 173 | (max_gt_overlaps == 1))[0] 174 | gt_boxes = self.roidb[i]['boxes'][gt_inds, :] 175 | gt_areas = self.roidb[i]['seg_areas'][gt_inds] 176 | valid_gt_inds = np.where((gt_areas >= area_range[0]) & 177 | (gt_areas <= area_range[1]))[0] 178 | gt_boxes = gt_boxes[valid_gt_inds, :] 179 | num_pos += len(valid_gt_inds) 180 | 181 | if candidate_boxes is None: 182 | # If candidate_boxes is not supplied, the default is to use the 183 | # non-ground-truth boxes from this roidb 184 | non_gt_inds = np.where(self.roidb[i]['gt_classes'] == 0)[0] 185 | boxes = self.roidb[i]['boxes'][non_gt_inds, :] 186 | else: 187 | boxes = candidate_boxes[i] 188 | if boxes.shape[0] == 0: 189 | continue 190 | if limit is not None and boxes.shape[0] > limit: 191 | boxes = boxes[:limit, :] 192 | 193 | overlaps = bbox_overlaps(boxes.astype(np.float), 194 | gt_boxes.astype(np.float)) 195 | 196 | _gt_overlaps = np.zeros((gt_boxes.shape[0])) 197 | for j in range(gt_boxes.shape[0]): 198 | # find which proposal box maximally covers each gt box 199 | argmax_overlaps = overlaps.argmax(axis=0) 200 | # and get the iou amount of coverage for each gt box 201 | max_overlaps = overlaps.max(axis=0) 202 | # find which gt box is 'best' covered (i.e. 
'best' = most iou) 203 | gt_ind = max_overlaps.argmax() 204 | gt_ovr = max_overlaps.max() 205 | assert (gt_ovr >= 0) 206 | # find the proposal box that covers the best covered gt box 207 | box_ind = argmax_overlaps[gt_ind] 208 | # record the iou coverage of this gt box 209 | _gt_overlaps[j] = overlaps[box_ind, gt_ind] 210 | assert (_gt_overlaps[j] == gt_ovr) 211 | # mark the proposal box and the gt box as used 212 | overlaps[box_ind, :] = -1 213 | overlaps[:, gt_ind] = -1 214 | # append recorded iou coverage level 215 | gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) 216 | 217 | gt_overlaps = np.sort(gt_overlaps) 218 | if thresholds is None: 219 | step = 0.05 220 | thresholds = np.arange(0.5, 0.95 + 1e-5, step) 221 | recalls = np.zeros_like(thresholds) 222 | # compute recall for each iou threshold 223 | for i, t in enumerate(thresholds): 224 | recalls[i] = (gt_overlaps >= t).sum() / float(num_pos) 225 | # ar = 2 * np.trapz(recalls, thresholds) 226 | ar = recalls.mean() 227 | return {'ar': ar, 'recalls': recalls, 'thresholds': thresholds, 228 | 'gt_overlaps': gt_overlaps} 229 | 230 | def create_roidb_from_box_list(self, box_list, gt_roidb): 231 | assert len(box_list) == self.num_images, \ 232 | 'Number of boxes must match number of ground-truth images' 233 | roidb = [] 234 | for i in range(self.num_images): 235 | boxes = box_list[i] 236 | num_boxes = boxes.shape[0] 237 | overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32) 238 | 239 | if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0: 240 | gt_boxes = gt_roidb[i]['boxes'] 241 | gt_classes = gt_roidb[i]['gt_classes'] 242 | gt_overlaps = bbox_overlaps(boxes.astype(np.float), 243 | gt_boxes.astype(np.float)) 244 | argmaxes = gt_overlaps.argmax(axis=1) 245 | maxes = gt_overlaps.max(axis=1) 246 | I = np.where(maxes > 0)[0] 247 | overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] 248 | 249 | overlaps = scipy.sparse.csr_matrix(overlaps) 250 | roidb.append({ 251 | 'boxes': boxes, 252 | 'gt_classes': np.zeros((num_boxes,), dtype=np.int32), 253 | 'gt_overlaps': overlaps, 254 | 'flipped': False, 255 | 'seg_areas': np.zeros((num_boxes,), dtype=np.float32), 256 | }) 257 | return roidb 258 | 259 | @staticmethod 260 | def merge_roidbs(a, b): 261 | assert len(a) == len(b) 262 | for i in range(len(a)): 263 | a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes'])) 264 | a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'], 265 | b[i]['gt_classes'])) 266 | a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'], 267 | b[i]['gt_overlaps']]) 268 | a[i]['seg_areas'] = np.hstack((a[i]['seg_areas'], 269 | b[i]['seg_areas'])) 270 | return a 271 | 272 | def competition_mode(self, on): 273 | """Turn competition mode on or off.""" 274 | pass 275 | -------------------------------------------------------------------------------- /lib/model/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | import os.path as osp 7 | import numpy as np 8 | # `pip install easydict` if you don't have it 9 | from easydict import EasyDict as edict 10 | 11 | __C = edict() 12 | # Consumers can get config by: 13 | # from fast_rcnn_config import cfg 14 | cfg = __C 15 | 16 | # 17 | # Training options 18 | # 19 | __C.TRAIN = edict() 20 | 21 | # Initial learning rate 22 | __C.TRAIN.LEARNING_RATE = 0.001 23 | 24 | # Optimizer Adam, Momentum, RMS 25 | __C.TRAIN.OPTIMIZER = 'Adam' 26 | 27 
| # Momentum 28 | __C.TRAIN.MOMENTUM = 0.9 29 | 30 | # Weight decay, for regularization 31 | __C.TRAIN.WEIGHT_DECAY = 0.0005 32 | 33 | # Factor for reducing the learning rate 34 | __C.TRAIN.GAMMA = 0.1 35 | 36 | # Step size for reducing the learning rate, currently only support one step 37 | __C.TRAIN.STEPSIZE = [50000] 38 | 39 | # Iteration intervals for showing the loss during training, on command line interface 40 | __C.TRAIN.DISPLAY = 10 41 | 42 | # Whether to double the learning rate for bias 43 | __C.TRAIN.DOUBLE_BIAS = True 44 | 45 | # Whether to initialize the weights with truncated normal distribution 46 | __C.TRAIN.TRUNCATED = False 47 | 48 | # Whether to have weight decay on bias as well 49 | __C.TRAIN.BIAS_DECAY = False 50 | 51 | # Whether to add ground truth boxes to the pool when sampling regions 52 | __C.TRAIN.USE_GT = False 53 | 54 | # Whether to use aspect-ratio grouping of training images, introduced merely for saving 55 | # GPU memory 56 | __C.TRAIN.ASPECT_GROUPING = False 57 | 58 | # The number of snapshots kept, older ones are deleted to save space 59 | __C.TRAIN.SNAPSHOT_KEPT = 3 60 | 61 | # The time interval for saving tensorflow summaries 62 | __C.TRAIN.SUMMARY_INTERVAL = 30 63 | 64 | # Scale to use during training (can list multiple scales) 65 | # The scale is the pixel size of an image's shortest side 66 | __C.TRAIN.SCALES = (600,) 67 | 68 | # Max pixel size of the longest side of a scaled input image 69 | __C.TRAIN.MAX_SIZE = 1200 70 | 71 | # Images to use per minibatch 72 | __C.TRAIN.IMS_PER_BATCH = 1 73 | 74 | # Fraction of minibatch that is labeled foreground (i.e. class > 0) 75 | __C.TRAIN.FG_FRACTION = 0.3 76 | 77 | # Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH) 78 | __C.TRAIN.FG_THRESH = 0.5 79 | 80 | # Overlap threshold for a ROI to be considered background (class = 0 if 81 | # overlap in [LO, HI)) 82 | __C.TRAIN.BG_THRESH_HI = 0.5 83 | __C.TRAIN.BG_THRESH_LO = 0.1 84 | 85 | __C.TRAIN.USE_FLIPPED = True 86 | 87 | # Train bounding-box regressors 88 | __C.TRAIN.BBOX_REG = True 89 | 90 | # Overlap required between a ROI and ground-truth box in order for that ROI to 91 | # be used as a bounding-box regression training example 92 | __C.TRAIN.BBOX_THRESH = 0.5 93 | 94 | # Iterations between snapshots 95 | __C.TRAIN.SNAPSHOT_ITERS = 5000 96 | 97 | # solver.prototxt specifies the snapshot path prefix, this adds an optional 98 | # infix to yield the path: [_]_iters_XYZ.caffemodel 99 | __C.TRAIN.SNAPSHOT_PREFIX = 'res101_faster_rcnn' 100 | 101 | # Normalize the targets (subtract empirical mean, divide by empirical stddev) 102 | __C.TRAIN.BBOX_NORMALIZE_TARGETS = True 103 | 104 | # Deprecated (inside weights) useless in CTPN 105 | __C.TRAIN.BBOX_INSIDE_WEIGHTS = (0.0, 1.0, 0.0, 1.0) 106 | 107 | # Normalize the targets using "precomputed" (or made up) means and stdevs 108 | # (BBOX_NORMALIZE_TARGETS must also be True) 109 | __C.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED = True 110 | 111 | __C.TRAIN.BBOX_NORMALIZE_MEANS = (0.0, 0.0, 0.0, 0.0) 112 | 113 | __C.TRAIN.BBOX_NORMALIZE_STDS = (0.1, 0.1, 0.2, 0.2) 114 | 115 | # Train using these proposals 116 | __C.TRAIN.PROPOSAL_METHOD = 'gt' 117 | 118 | # Make minibatches from images that have similar aspect ratios (i.e. both 119 | # tall and thin or both short and wide) in order to avoid wasting computation 120 | # on zero-padding. 
121 | 122 | # IOU >= thresh: positive example 123 | __C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7 124 | 125 | # IOU < thresh: negative example 126 | __C.TRAIN.RPN_NEGATIVE_OVERLAP = 0.5 127 | 128 | # If an anchor satisfied by positive and negative conditions set to negative 129 | __C.TRAIN.RPN_CLOBBER_POSITIVES = False 130 | 131 | # Max number of foreground examples 132 | __C.TRAIN.RPN_FG_FRACTION = 0.5 133 | 134 | # Total number of examples 135 | __C.TRAIN.RPN_BATCHSIZE = 128 136 | 137 | # NMS threshold used on RPN proposals 138 | __C.TRAIN.RPN_NMS_THRESH = 0.7 139 | 140 | # Number of top scoring boxes to keep before apply NMS to RPN proposals 141 | __C.TRAIN.RPN_PRE_NMS_TOP_N = 12000 142 | 143 | # Number of top scoring boxes to keep after applying NMS to RPN proposals 144 | __C.TRAIN.RPN_POST_NMS_TOP_N = 2000 145 | 146 | # The order of weights see lib/model/bbox_transform.py bbox_transform() 147 | # Weights for (x, y, w, h), for CTPN it should be (0.,1.,0.,1.) 148 | __C.TRAIN.RPN_BBOX_INSIDE_WEIGHTS = (0.0, 1.0, 0.0, 1.0) 149 | 150 | # Give the positive RPN examples weight of p * 1 / {num positives} 151 | # and give negatives a weight of (1 - p) 152 | # Set to -1.0 to use uniform example weighting 153 | __C.TRAIN.RPN_POSITIVE_WEIGHT = -1.0 154 | 155 | # Whether to use all ground truth bounding boxes for training, 156 | # For COCO, setting USE_ALL_GT to False will exclude boxes that are flagged as ''iscrowd'' 157 | __C.TRAIN.USE_ALL_GT = True 158 | 159 | # 160 | # Testing options 161 | # 162 | __C.TEST = edict() 163 | 164 | # Scale to use during testing (can NOT list multiple scales) 165 | # The scale is the pixel size of an image's shortest side 166 | __C.TEST.SCALES = (600,) 167 | 168 | # Max pixel size of the longest side of a scaled input image 169 | __C.TEST.MAX_SIZE = 1200 170 | 171 | # Overlap threshold used for non-maximum suppression (suppress boxes with 172 | # IoU >= this threshold) 173 | __C.TEST.NMS = 0.3 174 | 175 | # Test using bounding-box regressors 176 | __C.TEST.BBOX_REG = True 177 | 178 | # Test using these proposals 179 | __C.TEST.PROPOSAL_METHOD = 'gt' 180 | 181 | ## NMS threshold used on RPN proposals 182 | __C.TEST.RPN_NMS_THRESH = 0.7 183 | 184 | # Number of top scoring boxes to keep before apply NMS to RPN proposals 185 | __C.TEST.RPN_PRE_NMS_TOP_N = 12000 186 | 187 | # Number of top scoring boxes to keep after applying NMS to RPN proposals 188 | __C.TEST.RPN_POST_NMS_TOP_N = 1000 189 | 190 | # Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale) 191 | # __C.TEST.RPN_MIN_SIZE = 16 192 | 193 | # Testing mode, default to be 'nms', 'top' is slower but better 194 | # See report for details 195 | __C.TEST.MODE = 'nms' 196 | 197 | # Only useful when TEST.MODE is 'top', specifies the number of top proposals to select 198 | __C.TEST.RPN_TOP_N = 5000 199 | 200 | # 201 | # ResNet options 202 | # 203 | 204 | __C.RESNET = edict() 205 | 206 | # Number of fixed blocks during training, by default the first of all 4 blocks is fixed 207 | # Range: 0 (none) to 3 (all) 208 | __C.RESNET.FIXED_BLOCKS = 1 209 | 210 | # 211 | # MobileNet options 212 | # 213 | 214 | __C.MOBILENET = edict() 215 | 216 | # Whether to regularize the depth-wise filters during training 217 | __C.MOBILENET.REGU_DEPTH = False 218 | 219 | # Number of fixed layers during training, by default the bottom 5 of 14 layers is fixed 220 | # Range: 0 (none) to 12 (all) 221 | __C.MOBILENET.FIXED_LAYERS = 5 222 | 223 | # Weight decay for the mobilenet weights 224 | __C.MOBILENET.WEIGHT_DECAY = 
0.00004 225 | 226 | # Depth multiplier 227 | __C.MOBILENET.DEPTH_MULTIPLIER = 1. 228 | 229 | # 230 | # MISC 231 | # 232 | 233 | # Pixel mean values (BGR order) as a (1, 1, 3) array 234 | # Means for VGG, from https://github.com/tensorflow/models/blob/master/research/slim/preprocessing/vgg_preprocessing.py 235 | __C.PIXEL_MEANS = np.array([[[102.9801, 115.9465, 122.7717]]]) 236 | # __C.PIXEL_MEANS = np.array([[[103.94, 116.78, 123.68]]]) 237 | 238 | # For reproducibility 239 | __C.RNG_SEED = 3 240 | 241 | # Root directory of project 242 | __C.ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) 243 | 244 | # Data directory 245 | __C.DATA_DIR = osp.abspath(osp.join(__C.ROOT_DIR, 'data')) 246 | 247 | # Name (or path to) the matlab executable 248 | __C.MATLAB = 'matlab' 249 | 250 | # Place outputs under an experiments directory 251 | __C.EXP_DIR = 'default' 252 | 253 | # Use GPU implementation of non-maximum suppression 254 | __C.USE_GPU_NMS = True 255 | 256 | # Anchor scales for RPN 257 | __C.ANCHOR_SCALES = [8, 16, 32] 258 | 259 | # Anchor ratios for RPN 260 | __C.ANCHOR_RATIOS = [0.5, 1, 2] 261 | 262 | # Number of filters for the RPN layer 263 | __C.RPN_CHANNELS = 512 264 | 265 | # 266 | # CTPN options 267 | # 268 | 269 | __C.CTPN = edict() 270 | 271 | __C.CTPN.NUM_ANCHORS = 10 272 | __C.CTPN.ANCHOR_WIDTH = 16 273 | __C.CTPN.H_RADIO_STEP = 0.7 274 | 275 | 276 | def get_output_dir(imdb, weights_filename): 277 | """Return the directory where experimental artifacts are placed. 278 | If the directory does not exist, it is created. 279 | 280 | A canonical path is built using the name from an imdb and a network 281 | (if not None). 282 | """ 283 | outdir = osp.abspath(osp.join(__C.ROOT_DIR, 'output', __C.EXP_DIR, imdb.name)) 284 | if weights_filename is None: 285 | weights_filename = 'default' 286 | outdir = osp.join(outdir, weights_filename) 287 | if not os.path.exists(outdir): 288 | os.makedirs(outdir) 289 | return outdir 290 | 291 | 292 | def get_output_tb_dir(imdb, weights_filename): 293 | """Return the directory where tensorflow summaries are placed. 294 | If the directory does not exist, it is created. 295 | 296 | A canonical path is built using the name from an imdb and a network 297 | (if not None). 298 | """ 299 | outdir = osp.abspath(osp.join(__C.ROOT_DIR, 'tensorboard', __C.EXP_DIR, imdb.name)) 300 | if weights_filename is None: 301 | weights_filename = 'default' 302 | outdir = osp.join(outdir, weights_filename) 303 | if not os.path.exists(outdir): 304 | os.makedirs(outdir) 305 | return outdir 306 | 307 | 308 | def _merge_a_into_b(a, b): 309 | """Merge config dictionary a into config dictionary b, clobbering the 310 | options in b whenever they are also specified in a. 311 | """ 312 | if type(a) is not edict: 313 | return 314 | 315 | for k, v in a.items(): 316 | # a must specify keys that are in b 317 | if k not in b: 318 | raise KeyError('{} is not a valid config key'.format(k)) 319 | 320 | # the types must match, too 321 | old_type = type(b[k]) 322 | if old_type is not type(v): 323 | if isinstance(b[k], np.ndarray): 324 | v = np.array(v, dtype=b[k].dtype) 325 | else: 326 | raise ValueError(('Type mismatch ({} vs. 
{}) ' 327 | 'for config key: {}').format(type(b[k]), 328 | type(v), k)) 329 | 330 | # recursively merge dicts 331 | if type(v) is edict: 332 | try: 333 | _merge_a_into_b(a[k], b[k]) 334 | except: 335 | print(('Error under config key: {}'.format(k))) 336 | raise 337 | else: 338 | b[k] = v 339 | 340 | 341 | def cfg_from_file(filename): 342 | """Load a config file and merge it into the default options.""" 343 | import yaml 344 | with open(filename, 'r') as f: 345 | yaml_cfg = edict(yaml.load(f)) 346 | 347 | _merge_a_into_b(yaml_cfg, __C) 348 | 349 | 350 | def cfg_from_list(cfg_list): 351 | """Set config keys via list (e.g., from command line).""" 352 | from ast import literal_eval 353 | assert len(cfg_list) % 2 == 0 354 | for k, v in zip(cfg_list[0::2], cfg_list[1::2]): 355 | key_list = k.split('.') 356 | d = __C 357 | for subkey in key_list[:-1]: 358 | assert subkey in d 359 | d = d[subkey] 360 | subkey = key_list[-1] 361 | assert subkey in d 362 | try: 363 | value = literal_eval(v) 364 | except: 365 | # handle the case when v is a string literal 366 | value = v 367 | assert type(value) == type(d[subkey]), \ 368 | 'type {} does not match original type {}'.format( 369 | type(value), type(d[subkey])) 370 | d[subkey] = value 371 | -------------------------------------------------------------------------------- /lib/datasets/pascal_voc.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Xinlei Chen 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | from datasets.imdb import imdb 13 | import datasets.ds_utils as ds_utils 14 | import xml.etree.ElementTree as ET 15 | import numpy as np 16 | import scipy.sparse 17 | import scipy.io as sio 18 | import utils.cython_bbox 19 | import pickle 20 | import subprocess 21 | import uuid 22 | from .voc_eval import voc_eval 23 | from model.config import cfg 24 | 25 | 26 | class pascal_voc(imdb): 27 | def __init__(self, image_set, year, use_diff=False): 28 | name = 'voc_' + year + '_' + image_set 29 | if use_diff: 30 | name += '_diff' 31 | imdb.__init__(self, name) 32 | self._year = year 33 | self._image_set = image_set 34 | self._devkit_path = self._get_default_path() 35 | self._data_path = os.path.join(self._devkit_path, 'VOC' + self._year) 36 | self._classes = ('__background__', # always index 0 37 | 'text') 38 | self._class_to_ind = dict(list(zip(self.classes, list(range(self.num_classes))))) 39 | self._image_ext = '.jpg' 40 | self._image_index = self._load_image_set_index() 41 | # Default to roidb handler 42 | self._roidb_handler = self.gt_roidb 43 | self._salt = str(uuid.uuid4()) 44 | self._comp_id = 'comp4' 45 | 46 | # PASCAL specific config options 47 | self.config = {'cleanup': True, 48 | 'use_salt': True, 49 | 'use_diff': use_diff, 50 | 'matlab_eval': False, 51 | 'rpn_file': None} 52 | 53 | assert os.path.exists(self._devkit_path), \ 54 | 'VOCdevkit path does not exist: {}'.format(self._devkit_path) 55 | assert os.path.exists(self._data_path), \ 56 | 'Path does not exist: {}'.format(self._data_path) 57 | 58 | def image_path_at(self, i): 59 | """ 60 | Return the absolute path to image i in the image sequence. 
61 | """ 62 | return self.image_path_from_index(self._image_index[i]) 63 | 64 | def image_path_from_index(self, index): 65 | """ 66 | Construct an image path from the image's "index" identifier. 67 | """ 68 | image_path = os.path.join(self._data_path, 'JPEGImages', 69 | index + self._image_ext) 70 | assert os.path.exists(image_path), \ 71 | 'Path does not exist: {}'.format(image_path) 72 | return image_path 73 | 74 | def _load_image_set_index(self): 75 | """ 76 | Load the indexes listed in this dataset's image set file. 77 | """ 78 | # Example path to image set file: 79 | # self._devkit_path + /VOCdevkit2007/VOC2007/ImageSets/Main/val.txt 80 | image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main', 81 | self._image_set + '.txt') 82 | assert os.path.exists(image_set_file), \ 83 | 'Path does not exist: {}'.format(image_set_file) 84 | with open(image_set_file) as f: 85 | image_index = [x.strip() for x in f.readlines()] 86 | return image_index 87 | 88 | def _get_default_path(self): 89 | """ 90 | Return the default path where PASCAL VOC is expected to be installed. 91 | """ 92 | return os.path.join(cfg.DATA_DIR, 'VOCdevkit' + self._year) 93 | 94 | def gt_roidb(self): 95 | """ 96 | Return the database of ground-truth regions of interest. 97 | 98 | This function loads/saves from/to a cache file to speed up future calls. 99 | """ 100 | cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') 101 | if os.path.exists(cache_file): 102 | with open(cache_file, 'rb') as fid: 103 | try: 104 | roidb = pickle.load(fid) 105 | except: 106 | roidb = pickle.load(fid, encoding='bytes') 107 | print('{} gt roidb loaded from {}'.format(self.name, cache_file)) 108 | return roidb 109 | 110 | gt_roidb = [self._load_pascal_annotation(index) 111 | for index in self.image_index] 112 | with open(cache_file, 'wb') as fid: 113 | pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL) 114 | print('wrote gt roidb to {}'.format(cache_file)) 115 | 116 | return gt_roidb 117 | 118 | def rpn_roidb(self): 119 | if int(self._year) == 2007 or self._image_set != 'test': 120 | gt_roidb = self.gt_roidb() 121 | rpn_roidb = self._load_rpn_roidb(gt_roidb) 122 | roidb = imdb.merge_roidbs(gt_roidb, rpn_roidb) 123 | else: 124 | roidb = self._load_rpn_roidb(None) 125 | 126 | return roidb 127 | 128 | def _load_rpn_roidb(self, gt_roidb): 129 | filename = self.config['rpn_file'] 130 | print('loading {}'.format(filename)) 131 | assert os.path.exists(filename), \ 132 | 'rpn data not found at: {}'.format(filename) 133 | with open(filename, 'rb') as f: 134 | box_list = pickle.load(f) 135 | return self.create_roidb_from_box_list(box_list, gt_roidb) 136 | 137 | def _load_pascal_annotation(self, index): 138 | """ 139 | Load image and bounding boxes info from XML file in the PASCAL VOC 140 | format. 
141 | """ 142 | filename = os.path.join(self._data_path, 'Annotations', index + '.xml') 143 | tree = ET.parse(filename) 144 | objs = tree.findall('object') 145 | if not self.config['use_diff']: 146 | # Exclude the samples labeled as difficult 147 | non_diff_objs = [ 148 | obj for obj in objs if int(obj.find('difficult').text) == 0] 149 | # if len(non_diff_objs) != len(objs): 150 | # print 'Removed {} difficult objects'.format( 151 | # len(objs) - len(non_diff_objs)) 152 | objs = non_diff_objs 153 | num_objs = len(objs) 154 | 155 | boxes = np.zeros((num_objs, 4), dtype=np.uint16) 156 | gt_classes = np.zeros((num_objs), dtype=np.int32) 157 | overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) 158 | # "Seg" area for pascal is just the box area 159 | seg_areas = np.zeros((num_objs), dtype=np.float32) 160 | 161 | # Load object bounding boxes into a data frame. 162 | for ix, obj in enumerate(objs): 163 | bbox = obj.find('bndbox') 164 | # Make pixel indexes 0-based 165 | # Origin faster rcnn 166 | # x1 = float(bbox.find('xmin').text) - 1 167 | # y1 = float(bbox.find('ymin').text) - 1 168 | # x2 = float(bbox.find('xmax').text) - 1 169 | # y2 = float(bbox.find('ymax').text) - 1 170 | 171 | x1 = float(bbox.find('xmin').text) 172 | y1 = float(bbox.find('ymin').text) 173 | x2 = float(bbox.find('xmax').text) 174 | y2 = float(bbox.find('ymax').text) 175 | 176 | cls = self._class_to_ind[obj.find('name').text.lower().strip()] 177 | boxes[ix, :] = [x1, y1, x2, y2] 178 | gt_classes[ix] = cls 179 | overlaps[ix, cls] = 1.0 180 | seg_areas[ix] = (x2 - x1 + 1) * (y2 - y1 + 1) 181 | 182 | overlaps = scipy.sparse.csr_matrix(overlaps) 183 | 184 | return {'boxes': boxes, 185 | 'gt_classes': gt_classes, 186 | 'gt_overlaps': overlaps, 187 | 'flipped': False, 188 | 'seg_areas': seg_areas} 189 | 190 | def _get_comp_id(self): 191 | comp_id = (self._comp_id + '_' + self._salt if self.config['use_salt'] 192 | else self._comp_id) 193 | return comp_id 194 | 195 | def _get_voc_results_file_template(self): 196 | # VOCdevkit/results/VOC2007/Main/_det_test_aeroplane.txt 197 | filename = self._get_comp_id() + '_det_' + self._image_set + '_{:s}.txt' 198 | path = os.path.join( 199 | self._devkit_path, 200 | 'results', 201 | 'VOC' + self._year, 202 | 'Main', 203 | filename) 204 | return path 205 | 206 | def _write_voc_results_file(self, all_boxes): 207 | for cls_ind, cls in enumerate(self.classes): 208 | if cls == '__background__': 209 | continue 210 | print('Writing {} VOC results file'.format(cls)) 211 | filename = self._get_voc_results_file_template().format(cls) 212 | with open(filename, 'wt') as f: 213 | for im_ind, index in enumerate(self.image_index): 214 | dets = all_boxes[cls_ind][im_ind] 215 | if dets == []: 216 | continue 217 | # the VOCdevkit expects 1-based indices 218 | for k in range(dets.shape[0]): 219 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 
220 | format(index, dets[k, -1], 221 | dets[k, 0] + 1, dets[k, 1] + 1, 222 | dets[k, 2] + 1, dets[k, 3] + 1)) 223 | 224 | def _do_python_eval(self, output_dir='output'): 225 | annopath = os.path.join( 226 | self._devkit_path, 227 | 'VOC' + self._year, 228 | 'Annotations', 229 | '{:s}.xml') 230 | imagesetfile = os.path.join( 231 | self._devkit_path, 232 | 'VOC' + self._year, 233 | 'ImageSets', 234 | 'Main', 235 | self._image_set + '.txt') 236 | cachedir = os.path.join(self._devkit_path, 'annotations_cache') 237 | aps = [] 238 | # The PASCAL VOC metric changed in 2010 239 | use_07_metric = True if int(self._year) < 2010 else False 240 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 241 | if not os.path.isdir(output_dir): 242 | os.mkdir(output_dir) 243 | for i, cls in enumerate(self._classes): 244 | if cls == '__background__': 245 | continue 246 | filename = self._get_voc_results_file_template().format(cls) 247 | rec, prec, ap = voc_eval( 248 | filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, 249 | use_07_metric=use_07_metric, use_diff=self.config['use_diff']) 250 | aps += [ap] 251 | print(('AP for {} = {:.4f}'.format(cls, ap))) 252 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 253 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 254 | print(('Mean AP = {:.4f}'.format(np.mean(aps)))) 255 | print('~~~~~~~~') 256 | print('Results:') 257 | for ap in aps: 258 | print(('{:.3f}'.format(ap))) 259 | print(('{:.3f}'.format(np.mean(aps)))) 260 | print('~~~~~~~~') 261 | print('') 262 | print('--------------------------------------------------------------') 263 | print('Results computed with the **unofficial** Python eval code.') 264 | print('Results should be very close to the official MATLAB eval code.') 265 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 266 | print('-- Thanks, The Management') 267 | print('--------------------------------------------------------------') 268 | 269 | def _do_matlab_eval(self, output_dir='output'): 270 | print('-----------------------------------------------------') 271 | print('Computing results with the official MATLAB eval code.') 272 | print('-----------------------------------------------------') 273 | path = os.path.join(cfg.ROOT_DIR, 'lib', 'datasets', 274 | 'VOCdevkit-matlab-wrapper') 275 | cmd = 'cd {} && '.format(path) 276 | cmd += '{:s} -nodisplay -nodesktop '.format(cfg.MATLAB) 277 | cmd += '-r "dbstop if error; ' 278 | cmd += 'voc_eval(\'{:s}\',\'{:s}\',\'{:s}\',\'{:s}\'); quit;"' \ 279 | .format(self._devkit_path, self._get_comp_id(), 280 | self._image_set, output_dir) 281 | print(('Running:\n{}'.format(cmd))) 282 | status = subprocess.call(cmd, shell=True) 283 | 284 | def evaluate_detections(self, all_boxes, output_dir): 285 | self._write_voc_results_file(all_boxes) 286 | self._do_python_eval(output_dir) 287 | if self.config['matlab_eval']: 288 | self._do_matlab_eval(output_dir) 289 | if self.config['cleanup']: 290 | for cls in self._classes: 291 | if cls == '__background__': 292 | continue 293 | filename = self._get_voc_results_file_template().format(cls) 294 | os.remove(filename) 295 | 296 | def competition_mode(self, on): 297 | if on: 298 | self.config['use_salt'] = False 299 | self.config['cleanup'] = False 300 | else: 301 | self.config['use_salt'] = True 302 | self.config['cleanup'] = True 303 | 304 | 305 | if __name__ == '__main__': 306 | from datasets.pascal_voc import pascal_voc 307 | 308 | d = pascal_voc('trainval', '2007') 309 | res = d.roidb 310 | from IPython 
import embed; 311 | 312 | embed() 313 | -------------------------------------------------------------------------------- /lib/nets/mobilenet/conv_blocks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Convolution blocks for mobilenet.""" 16 | import contextlib 17 | import functools 18 | 19 | import tensorflow as tf 20 | 21 | slim = tf.contrib.slim 22 | 23 | 24 | def _fixed_padding(inputs, kernel_size, rate=1): 25 | """Pads the input along the spatial dimensions independently of input size. 26 | Pads the input such that if it was used in a convolution with 'VALID' padding, 27 | the output would have the same dimensions as if the unpadded input was used 28 | in a convolution with 'SAME' padding. 29 | Args: 30 | inputs: A tensor of size [batch, height_in, width_in, channels]. 31 | kernel_size: The kernel to be used in the conv2d or max_pool2d operation. 32 | rate: An integer, rate for atrous convolution. 33 | Returns: 34 | output: A tensor of size [batch, height_out, width_out, channels] with the 35 | input, either intact (if kernel_size == 1) or padded (if kernel_size > 1). 36 | """ 37 | kernel_size_effective = [kernel_size[0] + (kernel_size[0] - 1) * (rate - 1), 38 | kernel_size[0] + (kernel_size[0] - 1) * (rate - 1)] 39 | pad_total = [kernel_size_effective[0] - 1, kernel_size_effective[1] - 1] 40 | pad_beg = [pad_total[0] // 2, pad_total[1] // 2] 41 | pad_end = [pad_total[0] - pad_beg[0], pad_total[1] - pad_beg[1]] 42 | padded_inputs = tf.pad(inputs, [[0, 0], [pad_beg[0], pad_end[0]], 43 | [pad_beg[1], pad_end[1]], [0, 0]]) 44 | return padded_inputs 45 | 46 | 47 | def _make_divisible(v, divisor, min_value=None): 48 | if min_value is None: 49 | min_value = divisor 50 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) 51 | # Make sure that round down does not go down by more than 10%. 52 | if new_v < 0.9 * v: 53 | new_v += divisor 54 | return new_v 55 | 56 | 57 | def _split_divisible(num, num_ways, divisible_by=8): 58 | """Evenly splits num, num_ways so each piece is a multiple of divisible_by.""" 59 | assert num % divisible_by == 0 60 | assert num / num_ways >= divisible_by 61 | # Note: want to round down, we adjust each split to match the total. 62 | base = num // num_ways // divisible_by * divisible_by 63 | result = [] 64 | accumulated = 0 65 | for i in range(num_ways): 66 | r = base 67 | while accumulated + r < num * (i + 1) / num_ways: 68 | r += divisible_by 69 | result.append(r) 70 | accumulated += r 71 | assert accumulated == num 72 | return result 73 | 74 | 75 | @contextlib.contextmanager 76 | def _v1_compatible_scope_naming(scope): 77 | if scope is None: # Create uniqified separable blocks. 
78 | with tf.variable_scope(None, default_name='separable') as s, \ 79 | tf.name_scope(s.original_name_scope): 80 | yield '' 81 | else: 82 | # We use scope_depthwise, scope_pointwise for compatibility with V1 ckpts. 83 | # which provide numbered scopes. 84 | scope += '_' 85 | yield scope 86 | 87 | 88 | @slim.add_arg_scope 89 | def split_separable_conv2d(input_tensor, 90 | num_outputs, 91 | scope=None, 92 | normalizer_fn=None, 93 | stride=1, 94 | rate=1, 95 | endpoints=None, 96 | use_explicit_padding=False): 97 | """Separable mobilenet V1 style convolution. 98 | Depthwise convolution, with default non-linearity, 99 | followed by 1x1 depthwise convolution. This is similar to 100 | slim.separable_conv2d, but differs in tha it applies batch 101 | normalization and non-linearity to depthwise. This matches 102 | the basic building of Mobilenet Paper 103 | (https://arxiv.org/abs/1704.04861) 104 | Args: 105 | input_tensor: input 106 | num_outputs: number of outputs 107 | scope: optional name of the scope. Note if provided it will use 108 | scope_depthwise for deptwhise, and scope_pointwise for pointwise. 109 | normalizer_fn: which normalizer function to use for depthwise/pointwise 110 | stride: stride 111 | rate: output rate (also known as dilation rate) 112 | endpoints: optional, if provided, will export additional tensors to it. 113 | use_explicit_padding: Use 'VALID' padding for convolutions, but prepad 114 | inputs so that the output dimensions are the same as if 'SAME' padding 115 | were used. 116 | Returns: 117 | output tesnor 118 | """ 119 | 120 | with _v1_compatible_scope_naming(scope) as scope: 121 | dw_scope = scope + 'depthwise' 122 | endpoints = endpoints if endpoints is not None else {} 123 | kernel_size = [3, 3] 124 | padding = 'SAME' 125 | if use_explicit_padding: 126 | padding = 'VALID' 127 | input_tensor = _fixed_padding(input_tensor, kernel_size, rate) 128 | net = slim.separable_conv2d( 129 | input_tensor, 130 | None, 131 | kernel_size, 132 | depth_multiplier=1, 133 | stride=stride, 134 | rate=rate, 135 | normalizer_fn=normalizer_fn, 136 | padding=padding, 137 | scope=dw_scope) 138 | 139 | endpoints[dw_scope] = net 140 | 141 | pw_scope = scope + 'pointwise' 142 | net = slim.conv2d( 143 | net, 144 | num_outputs, [1, 1], 145 | stride=1, 146 | normalizer_fn=normalizer_fn, 147 | scope=pw_scope) 148 | endpoints[pw_scope] = net 149 | return net 150 | 151 | 152 | def expand_input_by_factor(n, divisible_by=8): 153 | return lambda num_inputs, **_: _make_divisible(num_inputs * n, divisible_by) 154 | 155 | 156 | @slim.add_arg_scope 157 | def expanded_conv(input_tensor, 158 | num_outputs, 159 | expansion_size=expand_input_by_factor(6), 160 | stride=1, 161 | rate=1, 162 | kernel_size=(3, 3), 163 | residual=True, 164 | normalizer_fn=None, 165 | split_projection=1, 166 | split_expansion=1, 167 | expansion_transform=None, 168 | depthwise_location='expansion', 169 | depthwise_channel_multiplier=1, 170 | endpoints=None, 171 | use_explicit_padding=False, 172 | padding='SAME', 173 | scope=None): 174 | """Depthwise Convolution Block with expansion. 175 | Builds a composite convolution that has the following structure 176 | expansion (1x1) -> depthwise (kernel_size) -> projection (1x1) 177 | Args: 178 | input_tensor: input 179 | num_outputs: number of outputs in the final layer. 180 | expansion_size: the size of expansion, could be a constant or a callable. 181 | If latter it will be provided 'num_inputs' as an input. For forward 182 | compatibility it should accept arbitrary keyword arguments. 
183 | Default will expand the input by factor of 6. 184 | stride: depthwise stride 185 | rate: depthwise rate 186 | kernel_size: depthwise kernel 187 | residual: whether to include residual connection between input 188 | and output. 189 | normalizer_fn: batchnorm or otherwise 190 | split_projection: how many ways to split projection operator 191 | (that is conv expansion->bottleneck) 192 | split_expansion: how many ways to split expansion op 193 | (that is conv bottleneck->expansion) ops will keep depth divisible 194 | by this value. 195 | expansion_transform: Optional function that takes expansion 196 | as a single input and returns output. 197 | depthwise_location: where to put depthwise covnvolutions supported 198 | values None, 'input', 'output', 'expansion' 199 | depthwise_channel_multiplier: depthwise channel multiplier: 200 | each input will replicated (with different filters) 201 | that many times. So if input had c channels, 202 | output will have c x depthwise_channel_multpilier. 203 | endpoints: An optional dictionary into which intermediate endpoints are 204 | placed. The keys "expansion_output", "depthwise_output", 205 | "projection_output" and "expansion_transform" are always populated, even 206 | if the corresponding functions are not invoked. 207 | use_explicit_padding: Use 'VALID' padding for convolutions, but prepad 208 | inputs so that the output dimensions are the same as if 'SAME' padding 209 | were used. 210 | padding: Padding type to use if `use_explicit_padding` is not set. 211 | scope: optional scope. 212 | Returns: 213 | Tensor of depth num_outputs 214 | Raises: 215 | TypeError: on inval 216 | """ 217 | with tf.variable_scope(scope, default_name='expanded_conv') as s, \ 218 | tf.name_scope(s.original_name_scope): 219 | prev_depth = input_tensor.get_shape().as_list()[3] 220 | if depthwise_location not in [None, 'input', 'output', 'expansion']: 221 | raise TypeError('%r is unknown value for depthwise_location' % 222 | depthwise_location) 223 | if use_explicit_padding: 224 | if padding != 'SAME': 225 | raise TypeError('`use_explicit_padding` should only be used with ' 226 | '"SAME" padding.') 227 | padding = 'VALID' 228 | depthwise_func = functools.partial( 229 | slim.separable_conv2d, 230 | num_outputs=None, 231 | kernel_size=kernel_size, 232 | depth_multiplier=depthwise_channel_multiplier, 233 | stride=stride, 234 | rate=rate, 235 | normalizer_fn=normalizer_fn, 236 | padding=padding, 237 | scope='depthwise') 238 | # b1 -> b2 * r -> b2 239 | # i -> (o * r) (bottleneck) -> o 240 | input_tensor = tf.identity(input_tensor, 'input') 241 | net = input_tensor 242 | 243 | if depthwise_location == 'input': 244 | if use_explicit_padding: 245 | net = _fixed_padding(net, kernel_size, rate) 246 | net = depthwise_func(net, activation_fn=None) 247 | 248 | if callable(expansion_size): 249 | inner_size = expansion_size(num_inputs=prev_depth) 250 | else: 251 | inner_size = expansion_size 252 | 253 | if inner_size > net.shape[3]: 254 | net = split_conv( 255 | net, 256 | inner_size, 257 | num_ways=split_expansion, 258 | scope='expand', 259 | stride=1, 260 | normalizer_fn=normalizer_fn) 261 | net = tf.identity(net, 'expansion_output') 262 | if endpoints is not None: 263 | endpoints['expansion_output'] = net 264 | 265 | if depthwise_location == 'expansion': 266 | if use_explicit_padding: 267 | net = _fixed_padding(net, kernel_size, rate) 268 | net = depthwise_func(net) 269 | 270 | net = tf.identity(net, name='depthwise_output') 271 | if endpoints is not None: 272 | 
endpoints['depthwise_output'] = net 273 | if expansion_transform: 274 | net = expansion_transform(expansion_tensor=net, input_tensor=input_tensor) 275 | # Note in contrast with expansion, we always have 276 | # projection to produce the desired output size. 277 | net = split_conv( 278 | net, 279 | num_outputs, 280 | num_ways=split_projection, 281 | stride=1, 282 | scope='project', 283 | normalizer_fn=normalizer_fn, 284 | activation_fn=tf.identity) 285 | if endpoints is not None: 286 | endpoints['projection_output'] = net 287 | if depthwise_location == 'output': 288 | if use_explicit_padding: 289 | net = _fixed_padding(net, kernel_size, rate) 290 | net = depthwise_func(net, activation_fn=None) 291 | 292 | if callable(residual): # custom residual 293 | net = residual(input_tensor=input_tensor, output_tensor=net) 294 | elif (residual and 295 | # stride check enforces that we don't add residuals when spatial 296 | # dimensions are None 297 | stride == 1 and 298 | # Depth matches 299 | net.get_shape().as_list()[3] == 300 | input_tensor.get_shape().as_list()[3]): 301 | net += input_tensor 302 | return tf.identity(net, name='output') 303 | 304 | 305 | def split_conv(input_tensor, 306 | num_outputs, 307 | num_ways, 308 | scope, 309 | divisible_by=8, 310 | **kwargs): 311 | """Creates a split convolution. 312 | Split convolution splits the input and output into 313 | 'num_blocks' blocks of approximately the same size each, 314 | and only connects $i$-th input to $i$ output. 315 | Args: 316 | input_tensor: input tensor 317 | num_outputs: number of output filters 318 | num_ways: num blocks to split by. 319 | scope: scope for all the operators. 320 | divisible_by: make sure that every part is divisiable by this. 321 | **kwargs: will be passed directly into conv2d operator 322 | Returns: 323 | tensor 324 | """ 325 | b = input_tensor.get_shape().as_list()[3] 326 | 327 | if num_ways == 1 or min(b // num_ways, 328 | num_outputs // num_ways) < divisible_by: 329 | # Don't do any splitting if we end up with less than 8 filters 330 | # on either side. 331 | return slim.conv2d(input_tensor, num_outputs, [1, 1], scope=scope, **kwargs) 332 | 333 | outs = [] 334 | input_splits = _split_divisible(b, num_ways, divisible_by=divisible_by) 335 | output_splits = _split_divisible( 336 | num_outputs, num_ways, divisible_by=divisible_by) 337 | inputs = tf.split(input_tensor, input_splits, axis=3, name='split_' + scope) 338 | base = scope 339 | for i, (input_tensor, out_size) in enumerate(zip(inputs, output_splits)): 340 | scope = base + '_part_%d' % (i,) 341 | n = slim.conv2d(input_tensor, out_size, [1, 1], scope=scope, **kwargs) 342 | n = tf.identity(n, scope + '_output') 343 | outs.append(n) 344 | return tf.concat(outs, 3, name=scope + '_concat') 345 | --------------------------------------------------------------------------------
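As a brief, hedged illustration of how the configuration machinery defined in lib/model/config.py above is normally driven, the following minimal sketch (not part of the repository; it assumes the lib/ directory has already been added to sys.path) overrides two training defaults with cfg_from_list() and reads them back through the shared cfg object:

    # Minimal sketch, assuming lib/ is on sys.path so that `model.config` imports.
    from model.config import cfg, cfg_from_list

    # Each key must already exist in the default config, and the new value must
    # keep the original type; otherwise cfg_from_list() fails its assertions.
    cfg_from_list(['TRAIN.LEARNING_RATE', '0.0005',
                   'TRAIN.SNAPSHOT_ITERS', '10000'])

    print(cfg.TRAIN.LEARNING_RATE)   # 0.0005
    print(cfg.CTPN.NUM_ANCHORS)      # 10 (unchanged default)

A YAML config file can be merged the same way with cfg_from_file(), which applies _merge_a_into_b() recursively and rejects any key or type that does not match the defaults.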