├── src
│   ├── utils
│   │   ├── __init__.py
│   │   ├── timer.py
│   │   └── cpu_nms.pyx
│   ├── layers
│   │   ├── __init__.py
│   │   └── text_proposal_layer.py
│   ├── text_proposal_connector.py
│   ├── detectors.py
│   ├── anchor.py
│   ├── other.py
│   └── text_proposal_graph_builder.py
├── .gitignore
├── demo_images
│   ├── img_1.jpg
│   ├── img_2.jpg
│   └── img_3.jpg
├── .gitmodules
├── Makefile
├── tools
│   ├── cfg.py
│   ├── demo.py
│   └── demo_vid.py
├── LICENSE
├── README.md
└── models
    └── deploy.prototxt

/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'zhitian'
2 | 
--------------------------------------------------------------------------------
/src/layers/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tianzhi'
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.caffemodel
3 | .idea
4 | *.so
5 | results
6 | *.xml
7 | 
8 | 
--------------------------------------------------------------------------------
/demo_images/img_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qingswu/CTPN/HEAD/demo_images/img_1.jpg
--------------------------------------------------------------------------------
/demo_images/img_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qingswu/CTPN/HEAD/demo_images/img_2.jpg
--------------------------------------------------------------------------------
/demo_images/img_3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/qingswu/CTPN/HEAD/demo_images/img_3.jpg
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "caffe"]
2 | 	path = caffe
3 | 	url = https://github.com/qingswu/caffe
4 | 	branch = CTPN
5 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | 	cython src/utils/cpu_nms.pyx
3 | 	gcc -shared -pthread -fPIC -fwrapv -O2 -Wall -fno-strict-aliasing \
4 | 	-I/usr/include/python2.7 -o src/utils/cpu_nms.so src/utils/cpu_nms.c
5 | 	rm -rf src/utils/cpu_nms.c
6 | 
--------------------------------------------------------------------------------
/tools/cfg.py:
--------------------------------------------------------------------------------
 1 | # MUST be imported first
 2 | import sys
 3 | import numpy as np
 4 | 
 5 | class Config:
 6 |     MEAN=np.float32([102.9801, 115.9465, 122.7717])
 7 |     TEST_GPU_ID=0
 8 |     SCALE=600
 9 |     MAX_SCALE=1000
10 | 
11 |     LINE_MIN_SCORE=0.7
12 |     TEXT_PROPOSALS_MIN_SCORE=0.7
13 |     TEXT_PROPOSALS_NMS_THRESH=0.3
14 |     MAX_HORIZONTAL_GAP=50
15 |     TEXT_LINE_NMS_THRESH=0.3
16 |     MIN_NUM_PROPOSALS=2
17 |     MIN_RATIO=1.2
18 |     MIN_V_OVERLAPS=0.7
19 |     MIN_SIZE_SIM=0.7
20 |     TEXT_PROPOSALS_WIDTH=16
21 | 
22 | def init():
23 |     sys.path.insert(0, "./tools")
24 |     sys.path.insert(0, "./caffe/python")
25 |     sys.path.insert(0, "./src")
26 | init()
27 | 
--------------------------------------------------------------------------------
/src/utils/timer.py:
--------------------------------------------------------------------------------
 1 | # --------------------------------------------------------
 2 | # Fast R-CNN
 3 | # Copyright (c) 2015 Microsoft
 4 | # Licensed under The MIT License [see LICENSE for details]
 5 | # Written by Ross Girshick
 6 | # --------------------------------------------------------
 7 | 
 8 | import time
 9 | 
10 | class Timer(object):
11 |     """A simple timer."""
12 |     def __init__(self):
13 |         self.total_time = 0.
14 |         self.calls = 0
15 |         self.start_time = 0.
16 |         self.diff = 0.
17 |         self.average_time = 0.
18 | 
19 |     def tic(self):
20 |         # using time.time instead of time.clock because time.clock
21 |         # does not normalize for multithreading
22 |         self.start_time = time.time()
23 | 
24 |     def toc(self, average=True):
25 |         self.diff = time.time() - self.start_time
26 |         self.total_time += self.diff
27 |         self.calls += 1
28 |         self.average_time = self.total_time / self.calls
29 |         if average:
30 |             return self.average_time
31 |         else:
32 |             return self.diff
33 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2016 Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | ALL THIRD PARTY CODES ARE LICENSED TO YOU UNDER THEIR ORIGINAL LICENSE TERMS.
24 | 
--------------------------------------------------------------------------------
/src/layers/text_proposal_layer.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import yaml, caffe
 3 | from other import clip_boxes
 4 | from anchor import AnchorText
 5 | 
 6 | 
 7 | class ProposalLayer(caffe.Layer):
 8 |     def setup(self, bottom, top):
 9 |         # parse the layer parameter string, which must be valid YAML
10 |         layer_params = yaml.load(self.param_str)
11 | 
12 |         self._feat_stride = layer_params['feat_stride']
13 |         self.anchor_generator=AnchorText()
14 |         self._num_anchors = self.anchor_generator.anchor_num
15 | 
16 |         top[0].reshape(1, 4)
17 |         top[1].reshape(1, 1, 1, 1)
18 | 
19 |     def forward(self, bottom, top):
20 |         assert bottom[0].data.shape[0]==1, \
21 |             'Only single item batches are supported'
22 | 
23 |         scores = bottom[0].data[:, self._num_anchors:, :, :]
24 | 
25 |         bbox_deltas = bottom[1].data
26 |         im_info = bottom[2].data[0, :]
27 |         height, width = scores.shape[-2:]
28 | 
29 |         anchors=self.anchor_generator.locate_anchors((height, width), self._feat_stride)
30 | 
31 |         scores=scores.transpose((0, 2, 3, 1)).reshape(-1, 1)
32 |         bbox_deltas=bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 2))
33 | 
34 |         proposals=self.anchor_generator.apply_deltas_to_anchors(bbox_deltas, anchors)
35 | 
36 |         # clip proposals that exceed the image boundaries
37 |         proposals=clip_boxes(proposals, im_info[:2])
38 | 
39 |         blob=proposals.astype(np.float32, copy=False)
40 |         top[0].reshape(*(blob.shape))
41 |         top[0].data[...]=blob
42 | 
43 |         top[1].reshape(*(scores.shape))
44 |         top[1].data[...]=scores
45 | 
46 |     def backward(self, top, propagate_down, bottom):
47 |         pass
48 | 
49 |     def reshape(self, bottom, top):
50 |         pass
51 | 
--------------------------------------------------------------------------------
/src/text_proposal_connector.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from other import clip_boxes
 3 | from text_proposal_graph_builder import TextProposalGraphBuilder
 4 | 
 5 | class TextProposalConnector:
 6 |     """
 7 |     Connect text proposals into text lines
 8 |     """
 9 |     def __init__(self):
10 |         self.graph_builder=TextProposalGraphBuilder()
11 | 
12 |     def group_text_proposals(self, text_proposals, scores, im_size):
13 |         graph=self.graph_builder.build_graph(text_proposals, scores, im_size)
14 |         return graph.sub_graphs_connected()
15 | 
16 |     def fit_y(self, X, Y, x1, x2):
17 |         assert len(X)!=0
18 |         # if X only includes one point, the function returns the line y=Y[0]
19 |         if np.sum(X==X[0])==len(X):
20 |             return Y[0], Y[0]
21 |         p=np.poly1d(np.polyfit(X, Y, 1))
22 |         return p(x1), p(x2)
23 | 
24 |     def get_text_lines(self, text_proposals, scores, im_size):
25 |         # tp=text proposal
26 |         tp_groups=self.group_text_proposals(text_proposals, scores, im_size)
27 |         text_lines=np.zeros((len(tp_groups), 5), np.float32)
28 | 
29 |         for index, tp_indices in enumerate(tp_groups):
30 |             text_line_boxes=text_proposals[list(tp_indices)]
31 | 
32 |             x0=np.min(text_line_boxes[:, 0])
33 |             x1=np.max(text_line_boxes[:, 2])
34 | 
35 |             offset=(text_line_boxes[0, 2]-text_line_boxes[0, 0])*0.5
36 | 
37 |             lt_y, rt_y=self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 1], x0+offset, x1-offset)
38 |             lb_y, rb_y=self.fit_y(text_line_boxes[:, 0], text_line_boxes[:, 3], x0+offset, x1-offset)
39 | 
40 |             # the score of a text line is the average score of the scores
41 |             # of all text proposals contained in the text line
42 |             score=scores[list(tp_indices)].sum()/float(len(tp_indices))
43 | 
44 |             text_lines[index, 0]=x0
45 |             text_lines[index, 1]=min(lt_y, rt_y)
46 |             text_lines[index, 2]=x1
47 |             text_lines[index, 3]=max(lb_y, rb_y)
48 |             text_lines[index, 4]=score
49 | 
50 |         text_lines=clip_boxes(text_lines, im_size)
51 | 
52 |         return text_lines
53 | 
--------------------------------------------------------------------------------
/tools/demo.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # The codes are used for implementing CTPN for scene text detection, described in:
 4 | #
 5 | # Z. Tian, W. Huang, T. He, P. He and Y. Qiao: Detecting Text in Natural Image with
 6 | # Connectionist Text Proposal Network, ECCV, 2016.
 7 | #
 8 | # Online demo is available at: textdet.com
 9 | #
10 | # These demo codes (with our trained model) are for text-line detection (without
11 | # side-refinement part).
12 | #
13 | #
14 | # ====== Copyright by Zhi Tian, Weilin Huang, Tong He, Pan He and Yu Qiao==========
15 | 
16 | # Email: zhi.tian@siat.ac.cn; wl.huang@siat.ac.cn
17 | #
18 | # Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences
19 | #
20 | #
21 | 
22 | from cfg import Config as cfg
23 | from other import draw_boxes, resize_im, CaffeModel
24 | import cv2, os, caffe, sys
25 | from detectors import TextProposalDetector, TextDetector
26 | import os.path as osp
27 | from utils.timer import Timer
28 | 
29 | DEMO_IMAGE_DIR="demo_images/"
30 | NET_DEF_FILE="models/deploy.prototxt"
31 | MODEL_FILE="models/ctpn_trained_model.caffemodel"
32 | 
33 | if len(sys.argv)>1 and sys.argv[1]=="--no-gpu":
34 |     caffe.set_mode_cpu()
35 | else:
36 |     caffe.set_mode_gpu()
37 |     caffe.set_device(cfg.TEST_GPU_ID)
38 | 
39 | # initialize the detectors
40 | text_proposals_detector=TextProposalDetector(CaffeModel(NET_DEF_FILE, MODEL_FILE))
41 | text_detector=TextDetector(text_proposals_detector)
42 | 
43 | demo_imnames=os.listdir(DEMO_IMAGE_DIR)
44 | timer=Timer()
45 | 
46 | for im_name in demo_imnames:
47 |     print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
48 |     print "Image: %s"%im_name
49 | 
50 |     im_file=osp.join(DEMO_IMAGE_DIR, im_name)
51 |     im=cv2.imread(im_file)
52 | 
53 |     timer.tic()
54 | 
55 |     im, f=resize_im(im, cfg.SCALE, cfg.MAX_SCALE)
56 |     text_lines=text_detector.detect(im)
57 | 
58 |     print "Number of the detected text lines: %s"%len(text_lines)
59 |     print "Time: %f"%timer.toc()
60 | 
61 |     im_with_text_lines=draw_boxes(im, text_lines, caption=im_name, wait=False)
62 | 
63 | print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
64 | print "Thank you for trying our demo. Press any key to exit..."
65 | cv2.waitKey(0)
66 | 
--------------------------------------------------------------------------------
/tools/demo_vid.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #
 3 | # The codes are used for implementing CTPN for scene text detection, described in:
 4 | #
 5 | # Z. Tian, W. Huang, T. He, P. He and Y. Qiao: Detecting Text in Natural Image with
 6 | # Connectionist Text Proposal Network, ECCV, 2016.
 7 | #
 8 | # Online demo is available at: textdet.com
 9 | #
10 | # These demo codes (with our trained model) are for text-line detection (without
11 | # side-refinement part).
12 | # 13 | # 14 | # ====== Copyright by Zhi Tian, Weilin Huang, Tong He, Pan He and Yu Qiao========== 15 | 16 | # Email: zhi.tian@siat.ac.cn; wl.huang@siat.ac.cn 17 | # 18 | # Shenzhen Institutes of Advanced Technology, Chinese Academy of Sciences 19 | # 20 | # 21 | 22 | from cfg import Config as cfg 23 | from other import draw_boxes, resize_im, CaffeModel 24 | import cv2, os, caffe, sys 25 | from detectors import TextProposalDetector, TextDetector 26 | import os.path as osp 27 | from utils.timer import Timer 28 | 29 | NET_DEF_FILE="models/deploy.prototxt" 30 | MODEL_FILE="models/ctpn_trained_model.caffemodel" 31 | 32 | if len(sys.argv)>1 and sys.argv[1]=="--no-gpu": 33 | caffe.set_mode_cpu() 34 | else: 35 | caffe.set_mode_gpu() 36 | caffe.set_device(cfg.TEST_GPU_ID) 37 | 38 | vid = 0 39 | if len(sys.argv) == 2 and sys.argv[1] != "--no-gpu": 40 | if os.path.exists(sys.argv[1]): 41 | vid = sys.argv[1] 42 | elif len(sys.argv) == 3: 43 | if os.path.exists(sys.argv[2]): 44 | vid = sys.argv[2] 45 | elif len(sys.argv) > 3: 46 | print "Wrong parameter." 47 | exit() 48 | 49 | # initialize the detectors 50 | text_proposals_detector=TextProposalDetector(CaffeModel(NET_DEF_FILE, MODEL_FILE)) 51 | text_detector=TextDetector(text_proposals_detector) 52 | 53 | timer=Timer() 54 | cam = cv2.VideoCapture(vid) 55 | 56 | while True: 57 | ret, im = cam.read() 58 | if not ret: 59 | break 60 | 61 | timer.tic() 62 | 63 | im, f=resize_im(im, cfg.SCALE, cfg.MAX_SCALE) 64 | text_lines=text_detector.detect(im) 65 | 66 | print "Number of the detected text lines: %s"%len(text_lines) 67 | print "Time: %f"%timer.toc() 68 | 69 | im_with_text_lines=draw_boxes(im, text_lines, caption="Text Detection", wait=False) 70 | key = cv2.waitKey(1) 71 | if key == 27: 72 | break 73 | -------------------------------------------------------------------------------- /src/utils/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep 
= [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /src/detectors.py: -------------------------------------------------------------------------------- 1 | from cfg import Config as cfg 2 | from other import prepare_img, normalize 3 | import numpy as np 4 | from utils.cpu_nms import cpu_nms as nms 5 | from text_proposal_connector import TextProposalConnector 6 | 7 | 8 | class TextProposalDetector: 9 | """ 10 | Detect text proposals in an image 11 | """ 12 | def __init__(self, caffe_model): 13 | self.caffe_model=caffe_model 14 | 15 | def detect(self, im, mean): 16 | im_data=prepare_img(im, mean) 17 | _=self.caffe_model.forward2({ 18 | "data": im_data[np.newaxis, :], 19 | "im_info": np.array([[im_data.shape[1], im_data.shape[2]]], np.float32) 20 | }) 21 | rois=self.caffe_model.blob("rois") 22 | scores=self.caffe_model.blob("scores") 23 | return rois, scores 24 | 25 | 26 | class TextDetector: 27 | """ 28 | Detect text from an image 29 | """ 30 | def __init__(self, text_proposal_detector): 31 | self.text_proposal_detector=text_proposal_detector 32 | self.text_proposal_connector=TextProposalConnector() 33 | 34 | def detect(self, im): 35 | """ 36 | Detecting texts from an image 37 | :return: the bounding boxes of the detected texts 38 | """ 39 | text_proposals, scores=self.text_proposal_detector.detect(im, cfg.MEAN) 40 | keep_inds=np.where(scores>cfg.TEXT_PROPOSALS_MIN_SCORE)[0] 41 | text_proposals, scores=text_proposals[keep_inds], scores[keep_inds] 42 | 43 | sorted_indices=np.argsort(scores.ravel())[::-1] 44 | text_proposals, scores=text_proposals[sorted_indices], scores[sorted_indices] 45 | 46 | # nms for text proposals 47 | keep_inds=nms(np.hstack((text_proposals, scores)), cfg.TEXT_PROPOSALS_NMS_THRESH) 48 | text_proposals, scores=text_proposals[keep_inds], scores[keep_inds] 49 | 50 | scores=normalize(scores) 51 | 52 | text_lines=self.text_proposal_connector.get_text_lines(text_proposals, scores, im.shape[:2]) 53 | 54 | keep_inds=self.filter_boxes(text_lines) 55 | text_lines=text_lines[keep_inds] 56 | 57 | # nms for text lines 58 | if text_lines.shape[0]!=0: 59 | keep_inds=nms(text_lines, cfg.TEXT_LINE_NMS_THRESH) 60 | text_lines=text_lines[keep_inds] 61 | 62 | return text_lines 63 | 64 | def filter_boxes(self, boxes): 65 | heights=boxes[:, 3]-boxes[:, 1]+1 66 | widths=boxes[:, 2]-boxes[:, 0]+1 67 | scores=boxes[:, -1] 68 | return np.where((widths/heights>cfg.MIN_RATIO) & (scores>cfg.LINE_MIN_SCORE) & 69 | (widths>(cfg.TEXT_PROPOSALS_WIDTH*cfg.MIN_NUM_PROPOSALS)))[0] 70 | -------------------------------------------------------------------------------- /src/anchor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class AnchorText: 5 | def __init__(self): 6 | self.anchor_num=10 7 | 8 | def generate_basic_anchors(self, sizes, base_size=16): 9 | """ 10 | :param sizes: [(h1, w1), (h2, w2)...] 
11 |         :param base_size:
12 |         :return:
13 |         """
14 |         assert(self.anchor_num==len(sizes))
15 |         base_anchor=np.array([0, 0, base_size-1, base_size-1], np.int32)
16 |         anchors=np.zeros((len(sizes), 4), np.int32)
17 |         index=0
18 |         for h, w in sizes:
19 |             anchors[index]=self.scale_anchor(base_anchor, h, w)
20 |             index+=1
21 |         return anchors
22 | 
23 |     def scale_anchor(self, anchor, h, w):
24 |         x_ctr=(anchor[0]+anchor[2])*0.5
25 |         y_ctr=(anchor[1]+anchor[3])*0.5
26 |         scaled_anchor=anchor.copy()
27 |         scaled_anchor[0]=x_ctr-w/2
28 |         scaled_anchor[2]=x_ctr+w/2
29 |         scaled_anchor[1]=y_ctr-h/2
30 |         scaled_anchor[3]=y_ctr+h/2
31 |         return scaled_anchor
32 | 
33 |     def apply_deltas_to_anchors(self, boxes_delta, anchors):
34 |         """
35 |         :return [l t r b]
36 |         """
37 |         anchor_y_ctr=(anchors[:, 1]+anchors[:, 3])/2.
38 |         anchor_h=anchors[:, 3]-anchors[:, 1]+1.
39 |         global_coords=np.zeros_like(boxes_delta, np.float32)
40 |         global_coords[:, 1]=np.exp(boxes_delta[:, 1])*anchor_h
41 |         global_coords[:, 0]=boxes_delta[:, 0]*anchor_h+anchor_y_ctr-global_coords[:, 1]/2.
42 |         return np.hstack((anchors[:, [0]], global_coords[:, [0]], anchors[:, [2]],
43 |                           global_coords[:, [0]]+global_coords[:, [1]])).astype(np.float32)
44 | 
45 |     def basic_anchors(self):
46 |         """
47 |         anchor [l t r b]
48 |         """
49 |         heights=[11, 16, 23, 33, 48, 68, 97, 139, 198, 283]
50 |         widths=[16]
51 |         sizes=[]
52 |         for h in heights:
53 |             for w in widths:
54 |                 sizes.append((h, w))
55 |         return self.generate_basic_anchors(sizes)
56 | 
57 |     def locate_anchors(self, feat_map_size, feat_stride):
58 |         """
59 |         return all anchors on the feature map
60 |         """
61 |         basic_anchors_=self.basic_anchors()
62 |         anchors=np.zeros((basic_anchors_.shape[0]*feat_map_size[0]*feat_map_size[1], 4), np.int32)
63 |         index=0
64 |         for y_ in range(feat_map_size[0]):
65 |             for x_ in range(feat_map_size[1]):
66 |                 shift=np.array([x_, y_, x_, y_])*feat_stride
67 |                 anchors[index:index+basic_anchors_.shape[0], :]=basic_anchors_+shift
68 |                 index+=basic_anchors_.shape[0]
69 |         return anchors
70 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CUDA 8.0 compatible version
 2 | 1. Updated Caffe to the current version, keeping the files that the official version doesn't have.
 3 | 
 4 | 2. Small fix in the CTPN code to adapt to the new Caffe.
 5 | ```{bash}
 6 | git clone --recursive https://github.com/qingswu/CTPN.git
 7 | # ...compile caffe following the official steps
 8 | # then go to the root folder and compile the cython code
 9 | make
10 | # download model
11 | wget http://textdet.com/downloads/ctpn_trained_model.caffemodel -P models/
12 | # run the demo
13 | ./tools/demo.py
14 | ```
15 | 
16 | # Detecting Text in Natural Image with Connectionist Text Proposal Network
17 | This code implements CTPN for scene text detection, described in:
18 | 
19 | Z. Tian, W. Huang, T. He, P. He and Y. Qiao: Detecting Text in Natural Image with
20 | Connectionist Text Proposal Network, ECCV, 2016.
21 | 
22 | Online demo is available at: [textdet.com](http://textdet.com)
23 | 
24 | These demo codes (with our trained model) are for text-line detection (without
25 | the side-refinement part).
26 | 
27 | # Required hardware
28 | You need a GPU. If you use CUDNN, about 1.5GB of free memory is required. If you don't use CUDNN, you will need about 5GB of free memory, and the testing time will increase slightly. Therefore, we strongly recommend using CUDNN.
29 | 
30 | It's also possible to run the program on CPU only, but it's extremely slow due to the non-optimal CPU implementation.
31 | # Required software
32 | Python 2.7, Cython, and everything Caffe depends on.
33 | 
34 | # How to run this code
35 | 
36 | 1. Clone this repository with `git clone https://github.com/tianzhi0549/CTPN.git`. It will check out the code of CTPN and the Caffe we ship.
37 | 
38 | 2. Install the Caffe we ship with the steps below.
39 |     * Install Caffe's dependencies. You can follow [this tutorial](http://caffe.berkeleyvision.org/installation.html). *Note: we need Python support. The CUDA version we need is 7.0.*
40 |     * Enter the directory `caffe`.
41 |     * Run `cp Makefile.config.example Makefile.config`.
42 |     * Open Makefile.config and set `WITH_PYTHON_LAYER := 1`. If you want to use CUDNN, please also set `CUDNN := 1`. Uncomment `CPU_ONLY := 1` if you want to compile it without GPU.
43 | 
44 |     *Note: To use CUDNN, you need to download CUDNN from NVIDIA's official website and install it in advance. The CUDNN version we use is 3.0.*
45 |     * Run `make -j && make pycaffe`.
46 | 
47 | 3. After Caffe is set up, you need to download a trained model (about 78M) from [Google Drive](https://drive.google.com/open?id=0B7c5Ix-XO7hqQWtKQ0lxTko4ZGs) or [our website](http://textdet.com/downloads/ctpn_trained_model.caffemodel), and then put it into the directory `models`. The model's name should be `ctpn_trained_model.caffemodel`.
48 | 
49 | 4. Now, be sure you are in the root directory of the code. Run `make` to compile some Cython files.
50 | 
51 | 5. Run `python tools/demo.py` for a demo. Or `python tools/demo.py --no-gpu` to run it in CPU mode.
52 | 
53 | # License
54 | The code is released under the MIT License.
55 | 
--------------------------------------------------------------------------------
/src/other.py:
--------------------------------------------------------------------------------
 1 | import cv2, caffe
 2 | import numpy as np
 3 | from matplotlib import cm
 4 | 
 5 | 
 6 | def prepare_img(im, mean):
 7 |     """
 8 |     transform img into caffe's input img.
 9 |     """
10 |     im_data=np.transpose(im-mean, (2, 0, 1))
11 |     return im_data
12 | 
13 | 
14 | def draw_boxes(im, bboxes, is_display=True, color=None, caption="Image", wait=True):
15 |     """
16 |     boxes: bounding boxes
17 |     """
18 |     im=im.copy()
19 |     for box in bboxes:
20 |         if color is None:
21 |             if len(box)==5 or len(box)==9:
22 |                 c=tuple(cm.jet([box[-1]])[0, 2::-1]*255)
23 |             else:
24 |                 c=tuple(np.random.randint(0, 256, 3))
25 |         else:
26 |             c=color
27 |         cv2.rectangle(im, tuple(box[:2]), tuple(box[2:4]), c)
28 |     if is_display:
29 |         cv2.imshow(caption, im)
30 |         if wait:
31 |             cv2.waitKey(0)
32 |     return im
33 | 
34 | 
35 | def threshold(coords, min_, max_):
36 |     return np.maximum(np.minimum(coords, max_), min_)
37 | 
38 | 
39 | def clip_boxes(boxes, im_shape):
40 |     """
41 |     Clip boxes to image boundaries.
42 |     """
43 |     boxes[:, 0::2]=threshold(boxes[:, 0::2], 0, im_shape[1]-1)
44 |     boxes[:, 1::2]=threshold(boxes[:, 1::2], 0, im_shape[0]-1)
45 |     return boxes
46 | 
47 | 
48 | def normalize(data):
49 |     if data.shape[0]==0:
50 |         return data
51 |     max_=data.max()
52 |     min_=data.min()
53 |     return (data-min_)/(max_-min_) if max_-min_!=0 else data-min_
54 | 
55 | 
56 | def resize_im(im, scale, max_scale=None):
57 |     f=float(scale)/min(im.shape[0], im.shape[1])
58 |     if max_scale is not None and f*max(im.shape[0], im.shape[1])>max_scale:
59 |         f=float(max_scale)/max(im.shape[0], im.shape[1])
60 |     return cv2.resize(im, (0, 0), fx=f, fy=f), f
61 | 
62 | 
63 | class Graph:
64 |     def __init__(self, graph):
65 |         self.graph=graph
66 | 
67 |     def sub_graphs_connected(self):
68 |         sub_graphs=[]
69 |         for index in xrange(self.graph.shape[0]):
70 |             if not self.graph[:, index].any() and self.graph[index, :].any():
71 |                 v=index
72 |                 sub_graphs.append([v])
73 |                 while self.graph[v, :].any():
74 |                     v=np.where(self.graph[v, :])[0][0]
75 |                     sub_graphs[-1].append(v)
76 |         return sub_graphs
77 | 
78 | 
79 | class CaffeModel:
80 |     def __init__(self, net_def_file, model_file):
81 |         self.net_def_file=net_def_file
82 |         self.net=caffe.Net(net_def_file, model_file, caffe.TEST)
83 | 
84 |     def blob(self, key):
85 |         return self.net.blobs[key].data.copy()
86 | 
87 |     def forward(self, input_data):
88 |         return self.forward2({"data": input_data[np.newaxis, :]})
89 | 
90 |     def forward2(self, input_data):
91 |         for k, v in input_data.items():
92 |             self.net.blobs[k].reshape(*v.shape)
93 |             self.net.blobs[k].data[...]=v
94 |         return self.net.forward()
95 | 
96 |     def net_def_file(self):
97 |         return self.net_def_file
98 | 
--------------------------------------------------------------------------------
/src/text_proposal_graph_builder.py:
--------------------------------------------------------------------------------
 1 | from cfg import Config as cfg
 2 | import numpy as np
 3 | from other import Graph
 4 | 
 5 | 
 6 | class TextProposalGraphBuilder:
 7 |     """
 8 |     Build text proposals into a graph.
 9 |     """
10 |     def get_successions(self, index):
11 |         box=self.text_proposals[index]
12 |         results=[]
13 |         for left in range(int(box[0])+1, min(int(box[0])+cfg.MAX_HORIZONTAL_GAP+1, self.im_size[1])):
14 |             adj_box_indices=self.boxes_table[left]
15 |             for adj_box_index in adj_box_indices:
16 |                 if self.meet_v_iou(adj_box_index, index):
17 |                     results.append(adj_box_index)
18 |             if len(results)!=0:
19 |                 return results
20 |         return results
21 | 
22 |     def get_precursors(self, index):
23 |         box=self.text_proposals[index]
24 |         results=[]
25 |         for left in range(int(box[0])-1, max(int(box[0]-cfg.MAX_HORIZONTAL_GAP), 0)-1, -1):
26 |             adj_box_indices=self.boxes_table[left]
27 |             for adj_box_index in adj_box_indices:
28 |                 if self.meet_v_iou(adj_box_index, index):
29 |                     results.append(adj_box_index)
30 |             if len(results)!=0:
31 |                 return results
32 |         return results
33 | 
34 |     def is_succession_node(self, index, succession_index):
35 |         precursors=self.get_precursors(succession_index)
36 |         if self.scores[index]>=np.max(self.scores[precursors]):
37 |             return True
38 |         return False
39 | 
40 |     def meet_v_iou(self, index1, index2):
41 |         def overlaps_v(index1, index2):
42 |             h1=self.heights[index1]
43 |             h2=self.heights[index2]
44 |             y0=max(self.text_proposals[index2][1], self.text_proposals[index1][1])
45 |             y1=min(self.text_proposals[index2][3], self.text_proposals[index1][3])
46 |             return max(0, y1-y0+1)/min(h1, h2)
47 | 
48 |         def size_similarity(index1, index2):
49 |             h1=self.heights[index1]
50 |             h2=self.heights[index2]
51 |             return min(h1, h2)/max(h1, h2)
52 | 
53 |         return overlaps_v(index1, index2)>=cfg.MIN_V_OVERLAPS and \
54 |                size_similarity(index1, index2)>=cfg.MIN_SIZE_SIM
55 | 
56 |     def build_graph(self, text_proposals, scores, im_size):
57 |         self.text_proposals=text_proposals
58 |         self.scores=scores
59 |         self.im_size=im_size
60 |         self.heights=text_proposals[:, 3]-text_proposals[:, 1]+1
61 | 
62 |         boxes_table=[[] for _ in range(self.im_size[1])]
63 |         for index, box in enumerate(text_proposals):
64 |             boxes_table[int(box[0])].append(index)
65 |         self.boxes_table=boxes_table
66 | 
67 |         graph=np.zeros((text_proposals.shape[0], text_proposals.shape[0]), np.bool)
68 | 
69 |         for index, box in enumerate(text_proposals):
70 |             successions=self.get_successions(index)
71 |             if len(successions)==0:
72 |                 continue
73 |             succession_index=successions[np.argmax(scores[successions])]
74 |             if self.is_succession_node(index, succession_index):
75 |                 # NOTE: a box can have multiple successions (or precursors) if several
76 |                 # of them have equal scores.
77 | graph[index, succession_index]=True 78 | return Graph(graph) 79 | -------------------------------------------------------------------------------- /models/deploy.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_ILSVRC_16_layers" 2 | 3 | input: 'data' 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 600 8 | dim: 900 9 | } 10 | 11 | input: 'im_info' 12 | input_shape { 13 | dim: 1 14 | dim: 3 15 | } 16 | 17 | layer { 18 | name: "conv1_1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1_1" 22 | param { 23 | lr_mult: 0 24 | decay_mult: 0 25 | } 26 | param { 27 | lr_mult: 0 28 | decay_mult: 0 29 | } 30 | convolution_param { 31 | num_output: 64 32 | pad: 1 33 | kernel_size: 3 34 | } 35 | } 36 | layer { 37 | name: "relu1_1" 38 | type: "ReLU" 39 | bottom: "conv1_1" 40 | top: "conv1_1" 41 | } 42 | layer { 43 | name: "conv1_2" 44 | type: "Convolution" 45 | bottom: "conv1_1" 46 | top: "conv1_2" 47 | param { 48 | lr_mult: 0 49 | decay_mult: 0 50 | } 51 | param { 52 | lr_mult: 0 53 | decay_mult: 0 54 | } 55 | convolution_param { 56 | num_output: 64 57 | pad: 1 58 | kernel_size: 3 59 | } 60 | } 61 | layer { 62 | name: "relu1_2" 63 | type: "ReLU" 64 | bottom: "conv1_2" 65 | top: "conv1_2" 66 | } 67 | layer { 68 | name: "pool1" 69 | type: "Pooling" 70 | bottom: "conv1_2" 71 | top: "pool1" 72 | pooling_param { 73 | pool: MAX 74 | kernel_size: 2 75 | stride: 2 76 | } 77 | } 78 | layer { 79 | name: "conv2_1" 80 | type: "Convolution" 81 | bottom: "pool1" 82 | top: "conv2_1" 83 | param { 84 | lr_mult: 0 85 | decay_mult: 0 86 | } 87 | param { 88 | lr_mult: 0 89 | decay_mult: 0 90 | } 91 | convolution_param { 92 | num_output: 128 93 | pad: 1 94 | kernel_size: 3 95 | } 96 | } 97 | layer { 98 | name: "relu2_1" 99 | type: "ReLU" 100 | bottom: "conv2_1" 101 | top: "conv2_1" 102 | } 103 | layer { 104 | name: "conv2_2" 105 | type: "Convolution" 106 | bottom: "conv2_1" 107 | top: "conv2_2" 108 | param { 109 | lr_mult: 0 110 | decay_mult: 0 111 | } 112 | param { 113 | lr_mult: 0 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 128 118 | pad: 1 119 | kernel_size: 3 120 | } 121 | } 122 | layer { 123 | name: "relu2_2" 124 | type: "ReLU" 125 | bottom: "conv2_2" 126 | top: "conv2_2" 127 | } 128 | layer { 129 | name: "pool2" 130 | type: "Pooling" 131 | bottom: "conv2_2" 132 | top: "pool2" 133 | pooling_param { 134 | pool: MAX 135 | kernel_size: 2 136 | stride: 2 137 | } 138 | } 139 | layer { 140 | name: "conv3_1" 141 | type: "Convolution" 142 | bottom: "pool2" 143 | top: "conv3_1" 144 | param { 145 | lr_mult: 1 146 | } 147 | param { 148 | lr_mult: 2 149 | } 150 | convolution_param { 151 | num_output: 256 152 | pad: 1 153 | kernel_size: 3 154 | } 155 | } 156 | layer { 157 | name: "relu3_1" 158 | type: "ReLU" 159 | bottom: "conv3_1" 160 | top: "conv3_1" 161 | } 162 | layer { 163 | name: "conv3_2" 164 | type: "Convolution" 165 | bottom: "conv3_1" 166 | top: "conv3_2" 167 | param { 168 | lr_mult: 1 169 | } 170 | param { 171 | lr_mult: 2 172 | } 173 | convolution_param { 174 | num_output: 256 175 | pad: 1 176 | kernel_size: 3 177 | } 178 | } 179 | layer { 180 | name: "relu3_2" 181 | type: "ReLU" 182 | bottom: "conv3_2" 183 | top: "conv3_2" 184 | } 185 | layer { 186 | name: "conv3_3" 187 | type: "Convolution" 188 | bottom: "conv3_2" 189 | top: "conv3_3" 190 | param { 191 | lr_mult: 1 192 | } 193 | param { 194 | lr_mult: 2 195 | } 196 | convolution_param { 197 | num_output: 256 198 | pad: 1 199 | kernel_size: 3 200 | } 201 | } 202 | 
layer { 203 | name: "relu3_3" 204 | type: "ReLU" 205 | bottom: "conv3_3" 206 | top: "conv3_3" 207 | } 208 | layer { 209 | name: "pool3" 210 | type: "Pooling" 211 | bottom: "conv3_3" 212 | top: "pool3" 213 | pooling_param { 214 | pool: MAX 215 | kernel_size: 2 216 | stride: 2 217 | } 218 | } 219 | layer { 220 | name: "conv4_1" 221 | type: "Convolution" 222 | bottom: "pool3" 223 | top: "conv4_1" 224 | param { 225 | lr_mult: 1 226 | } 227 | param { 228 | lr_mult: 2 229 | } 230 | convolution_param { 231 | num_output: 512 232 | pad: 1 233 | kernel_size: 3 234 | } 235 | } 236 | layer { 237 | name: "relu4_1" 238 | type: "ReLU" 239 | bottom: "conv4_1" 240 | top: "conv4_1" 241 | } 242 | layer { 243 | name: "conv4_2" 244 | type: "Convolution" 245 | bottom: "conv4_1" 246 | top: "conv4_2" 247 | param { 248 | lr_mult: 1 249 | } 250 | param { 251 | lr_mult: 2 252 | } 253 | convolution_param { 254 | num_output: 512 255 | pad: 1 256 | kernel_size: 3 257 | } 258 | } 259 | layer { 260 | name: "relu4_2" 261 | type: "ReLU" 262 | bottom: "conv4_2" 263 | top: "conv4_2" 264 | } 265 | layer { 266 | name: "conv4_3" 267 | type: "Convolution" 268 | bottom: "conv4_2" 269 | top: "conv4_3" 270 | param { 271 | lr_mult: 1 272 | } 273 | param { 274 | lr_mult: 2 275 | } 276 | convolution_param { 277 | num_output: 512 278 | pad: 1 279 | kernel_size: 3 280 | } 281 | } 282 | layer { 283 | name: "relu4_3" 284 | type: "ReLU" 285 | bottom: "conv4_3" 286 | top: "conv4_3" 287 | } 288 | layer { 289 | name: "pool4" 290 | type: "Pooling" 291 | bottom: "conv4_3" 292 | top: "pool4" 293 | pooling_param { 294 | pool: MAX 295 | kernel_size: 2 296 | stride: 2 297 | } 298 | } 299 | layer { 300 | name: "conv5_1" 301 | type: "Convolution" 302 | bottom: "pool4" 303 | top: "conv5_1" 304 | param { 305 | lr_mult: 1 306 | } 307 | param { 308 | lr_mult: 2 309 | } 310 | convolution_param { 311 | num_output: 512 312 | pad: 1 313 | kernel_size: 3 314 | } 315 | } 316 | layer { 317 | name: "relu5_1" 318 | type: "ReLU" 319 | bottom: "conv5_1" 320 | top: "conv5_1" 321 | } 322 | layer { 323 | name: "conv5_2" 324 | type: "Convolution" 325 | bottom: "conv5_1" 326 | top: "conv5_2" 327 | param { 328 | lr_mult: 1 329 | } 330 | param { 331 | lr_mult: 2 332 | } 333 | convolution_param { 334 | num_output: 512 335 | pad: 1 336 | kernel_size: 3 337 | } 338 | } 339 | layer { 340 | name: "relu5_2" 341 | type: "ReLU" 342 | bottom: "conv5_2" 343 | top: "conv5_2" 344 | } 345 | layer { 346 | name: "conv5_3" 347 | type: "Convolution" 348 | bottom: "conv5_2" 349 | top: "conv5_3" 350 | param { 351 | lr_mult: 1 352 | } 353 | param { 354 | lr_mult: 2 355 | } 356 | convolution_param { 357 | num_output: 512 358 | pad: 1 359 | kernel_size: 3 360 | } 361 | } 362 | layer { 363 | name: "relu5_3" 364 | type: "ReLU" 365 | bottom: "conv5_3" 366 | top: "conv5_3" 367 | } 368 | 369 | #========= RPN ============ 370 | 371 | # prepare lstm inputs 372 | layer { 373 | name: "im2col" 374 | bottom: "conv5_3" 375 | top: "im2col" 376 | type: "Im2col" 377 | convolution_param { 378 | pad: 1 379 | kernel_size: 3 380 | } 381 | } 382 | layer { 383 | name: "im2col_transpose" 384 | top: "im2col_transpose" 385 | bottom: "im2col" 386 | type: "Transpose" 387 | transpose_param { 388 | dim: 3 389 | dim: 2 390 | dim: 0 391 | dim: 1 392 | } 393 | } 394 | layer { 395 | name: "lstm_input" 396 | type: "Reshape" 397 | bottom: "im2col_transpose" 398 | top: "lstm_input" 399 | reshape_param { 400 | shape { dim: -1 } 401 | axis: 1 402 | num_axes: 2 403 | } 404 | } 405 | 406 | layer { 407 | name: "lstm" 408 | type: 
"Lstm" 409 | bottom: "lstm_input" 410 | top: "lstm" 411 | lstm_param { 412 | num_output: 128 413 | weight_filler { 414 | type: "gaussian" 415 | std: 0.01 416 | } 417 | bias_filler { 418 | type: "constant" 419 | } 420 | clipping_threshold: 1 421 | } 422 | } 423 | 424 | 425 | # ===================== rlstm =================== 426 | layer { 427 | name: "lstm-reverse1" 428 | type: "Reverse" 429 | bottom: "lstm_input" 430 | top: "rlstm_input" 431 | reverse_param { 432 | axis: 0 433 | } 434 | } 435 | layer { 436 | name: "rlstm" 437 | type: "Lstm" 438 | bottom: "rlstm_input" 439 | top: "rlstm-output" 440 | lstm_param { 441 | num_output: 128 442 | } 443 | } 444 | layer { 445 | name: "lstm-reverse2" 446 | type: "Reverse" 447 | bottom: "rlstm-output" 448 | top: "rlstm" 449 | reverse_param { 450 | axis: 0 451 | } 452 | } 453 | 454 | 455 | # merge lstm and rlstm 456 | layer { 457 | name: "merge_lstm_rlstm" 458 | type: "Concat" 459 | bottom: "lstm" 460 | bottom: "rlstm" 461 | top: "merge_lstm_rlstm" 462 | concat_param { 463 | axis: 2 464 | } 465 | } 466 | layer { 467 | name: "lstm_output_reshape" 468 | type: "Reshape" 469 | bottom: "merge_lstm_rlstm" 470 | top: "lstm_output_reshape" 471 | reshape_param { 472 | shape { dim: -1 dim: 1 } 473 | axis: 1 474 | num_axes: 1 475 | } 476 | } 477 | # transpose size of output as (N, C, H, W) 478 | layer { 479 | name: "lstm_output" 480 | type: "Transpose" 481 | bottom: "lstm_output_reshape" 482 | top: "lstm_output" 483 | transpose_param { 484 | dim: 2 485 | dim: 3 486 | dim: 1 487 | dim: 0 488 | } 489 | } 490 | layer { 491 | name: "fc" 492 | bottom: "lstm_output" 493 | top: "fc" 494 | type: "Convolution" 495 | convolution_param { 496 | num_output: 512 497 | kernel_size: 1 498 | } 499 | } 500 | layer { 501 | name: "relu_fc" 502 | type: "ReLU" 503 | bottom: "fc" 504 | top: "fc" 505 | } 506 | layer { 507 | name: "rpn_cls_score" 508 | type: "Convolution" 509 | bottom: "fc" 510 | top: "rpn_cls_score" 511 | param { lr_mult: 1.0 } 512 | param { lr_mult: 2.0 } 513 | convolution_param { 514 | num_output: 20 515 | kernel_size: 1 pad: 0 stride: 1 516 | } 517 | } 518 | layer { 519 | bottom: "rpn_cls_score" 520 | top: "rpn_cls_score_reshape" 521 | name: "rpn_cls_score_reshape" 522 | type: "Reshape" 523 | reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } 524 | } 525 | layer { 526 | name: "rpn_bbox_pred" 527 | type: "Convolution" 528 | bottom: "fc" 529 | top: "rpn_bbox_pred" 530 | param { lr_mult: 1.0 } 531 | param { lr_mult: 2.0 } 532 | convolution_param { 533 | num_output: 20 534 | kernel_size: 1 pad: 0 stride: 1 535 | } 536 | } 537 | layer { 538 | name: "rpn_cls_prob" 539 | type: "Softmax" 540 | bottom: "rpn_cls_score_reshape" 541 | top: "rpn_cls_prob" 542 | } 543 | 544 | layer { 545 | name: 'rpn_cls_prob_reshape' 546 | type: 'Reshape' 547 | bottom: 'rpn_cls_prob' 548 | top: 'rpn_cls_prob_reshape' 549 | reshape_param { shape { dim: 0 dim: 20 dim: -1 dim: 0 } } 550 | } 551 | 552 | layer { 553 | name: 'proposal' 554 | type: 'Python' 555 | bottom: 'rpn_cls_prob_reshape' 556 | bottom: 'rpn_bbox_pred' 557 | bottom: 'im_info' 558 | top: 'rois' 559 | top: 'scores' 560 | python_param { 561 | module: 'layers.text_proposal_layer' 562 | layer: 'ProposalLayer' 563 | param_str: "'feat_stride': 16" 564 | } 565 | } 566 | --------------------------------------------------------------------------------