├── .idea
├── SSD_scene_text_detection.iml
├── inspectionProfiles
│ └── Project_Default.xml
├── markdown-navigator.xml
├── markdown-navigator
│ └── profiles_settings.xml
├── misc.xml
├── modules.xml
├── vcs.xml
└── workspace.xml
├── README.md
├── create_lmdbdata_scenetext.sh
├── create_train_test_file.py
├── create_xml.py
├── ssd_icdar_scenetext.py
├── test_file
├── 101.jpg
├── 104.jpg
├── 120.jpg
├── output_101.png
├── output_104.png
├── output_120.png
└── scenetext.ipynb
├── test_name_size.py
└── training_file
├── VGG_scenetext_SSD_300x300.sh
├── deploy.prototxt
├── labelmap_voc.prototxt
├── solver.prototxt
├── test.prototxt
├── test.txt
├── test_name_size.txt
├── train.prototxt
└── trainval.txt
/.idea/SSD_scene_text_detection.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/.idea/markdown-navigator.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
--------------------------------------------------------------------------------
/.idea/markdown-navigator/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Note
2 | This repository is not being actively maintained due to lack of time and interest. My sincerest apologies to the open source community for allowing this project to stagnate. I hope it was useful for some of you as a jumping-off point.
3 |
4 | ## SSD_scene-text-detection
5 |
6 | The note about the original paper: [SSD: Single Shot MultiBox Detector](https://github.com/weiliu89/caffe/tree/ssd) can be found [here](http://blog.csdn.net/u010167269/article/details/52563573).
7 |
8 | This project is inspired by [ssd-plate_detection](https://github.com/hyh21521038/ssd-plate_detection).
9 |
10 | The details of the above code are explained in my blog post (written in Chinese): http://blog.csdn.net/u010167269/article/details/52851667
11 |
12 | Meanwhile, I have uploaded my trained caffemodel to BaiduYun, Google Drive, and Dropbox:
13 | - BaiduYun: https://pan.baidu.com/s/1dE0Ok3v
14 | - Google Drive: https://drive.google.com/open?id=0B65vBUruA6N4bzNCSGxTcnEtNjg
15 | - Dropbox: https://www.dropbox.com/s/o3mrsfoiyfp21ou/VGG_scenetext_SSD_300x300_iter_60000.caffemodel?dl=0
16 |
17 | Some examples of the scene text detection:
18 | ### Good Cases
19 |
20 |
21 |
22 | ### Bad Cases
23 |
24 |
25 | ## Note
26 | Currently, I mainly focus on image/video captioning.
27 |
--------------------------------------------------------------------------------
/create_lmdbdata_scenetext.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# Build the "test" and "trainval" LMDB databases for the scenetext dataset by
# calling scripts/create_annoset.py once per subset.
#
# The script locates the Caffe root as two directories above its own location
# and runs create_annoset.py from there.

cur_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
root_dir="$cur_dir/../.."

# Abort instead of running create_annoset.py from the wrong directory.
cd "$root_dir" || exit 1

# Set to 1 to pass --redo to create_annoset.py; any other value disables it.
redo=1
data_root_dir="$HOME/data/VOCdevkit"
dataset_name="scenetext"
# NOTE(review): ssd_icdar_scenetext.py reads data/scenetext/labelmap_voc.prototxt;
# confirm which of the two label-map file names actually exists.
mapfile="$root_dir/data/$dataset_name/labelmap_voc_scenetext.prototxt"
anno_type="detection"
db="lmdb"
min_dim=0
max_dim=0
width=0
height=0

extra_cmd="--encode-type=jpg --encoded"
# Compare explicitly: a bare `[ $redo ]` is true for ANY non-empty value,
# so redo=0 would still have added --redo.
if [ "$redo" = 1 ]
then
  extra_cmd="$extra_cmd --redo"
fi
for subset in test trainval
do
  # $extra_cmd is deliberately left unquoted: it holds several flags that
  # must be word-split into separate arguments.
  python "$root_dir/scripts/create_annoset.py" --anno-type="$anno_type" --label-map-file="$mapfile" \
    --min-dim="$min_dim" --max-dim="$max_dim" --resize-width="$width" --resize-height="$height" \
    --check-label $extra_cmd "$data_root_dir" "$root_dir/data/$dataset_name/$subset.txt" \
    "$data_root_dir/$dataset_name/$db/${dataset_name}_${subset}_$db" "examples/$dataset_name"
done
29 |
30 |
--------------------------------------------------------------------------------
/create_train_test_file.py:
--------------------------------------------------------------------------------
#! /usr/bin/python
"""Write the trainval.txt and test.txt list files for the scenetext dataset.

Each output line pairs an image path with its annotation path:

    scenetext/JPEGImages/<name>.jpg scenetext/Annotations/<name>.xml

The image names are taken from the *.jpg files found in ``trainval_dir`` and
``test_dir``.
"""

import os, sys
import glob

trainval_dir = "/home/chenxp/data/VOCdevkit/scenetext/trainval"
test_dir = "/home/chenxp/data/VOCdevkit/scenetext/test"

dist_img_dir = "scenetext/JPEGImages"
dist_anno_dir = "scenetext/Annotations"


def _image_names(img_dir):
    """Return the extension-less names of every *.jpg file in ``img_dir``.

    The result is sorted so the generated list files are identical from run
    to run; glob.glob() itself returns entries in arbitrary order.
    """
    return sorted(os.path.splitext(os.path.basename(path))[0]
                  for path in glob.glob(img_dir + '/*.jpg'))


def _write_list(list_path, names):
    """Write one "<image path> <annotation path>" line per name to ``list_path``."""
    # The with-block guarantees the file is flushed and closed, which the
    # original bare open() calls never did.
    with open(list_path, 'w') as list_fd:
        for name in names:
            list_fd.write(dist_img_dir + '/' + str(name) + '.jpg' + ' '
                          + dist_anno_dir + '/' + str(name) + '.xml\n')


trainval_img_names = _image_names(trainval_dir)
test_img_names = _image_names(test_dir)

_write_list("/home/chenxp/caffe/data/scenetext/trainval.txt", trainval_img_names)
_write_list("/home/chenxp/caffe/data/scenetext/test.txt", test_img_names)
32 |
33 |
--------------------------------------------------------------------------------
/create_xml.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/python
2 |
3 | import os, sys
4 | import glob
5 | from PIL import Image
6 |
7 | src_img_dir = "/media/chenxp/Datadisk/ocr_dataset/ICDAR2011/train-textloc"
8 | src_txt_dir = "/media/chenxp/Datadisk/ocr_dataset/ICDAR2011/train-textloc"
9 |
10 | img_Lists = glob.glob(src_img_dir + '/*.jpg')
11 |
12 | img_basenames = [] # e.g. 100.jpg
13 | for item in img_Lists:
14 | img_basenames.append(os.path.basename(item))
15 |
16 | img_names = [] # e.g. 100
17 | for item in img_basenames:
18 | temp1, temp2 = os.path.splitext(item)
19 | img_names.append(temp1)
20 |
21 | for img in img_names:
22 | im = Image.open((src_img_dir + '/' + img + '.jpg'))
23 | width, height = im.size
24 |
25 | # open the corresponding txt file
26 | gt = open(src_txt_dir + '/gt_' + img + '.txt').read().splitlines()
27 |
28 | # write in xml file
29 | os.mknod(src_txt_dir + '/' + img + '.xml')
30 | xml_file = open((src_txt_dir + '/' + img + '.xml'), 'w')
31 | xml_file.write('\n')
32 | xml_file.write(' VOC2007\n')
33 | xml_file.write(' ' + str(img) + '.jpg' + '\n')
34 | xml_file.write(' \n')
35 | xml_file.write(' ' + str(width) + '\n')
36 | xml_file.write(' ' + str(height) + '\n')
37 | xml_file.write(' 3\n')
38 | xml_file.write(' \n')
39 |
40 | # write the region of text on xml file
41 | for img_each_label in gt:
42 | spt = img_each_label.split(',')
43 | xml_file.write(' \n')
55 |
56 | xml_file.write('')
57 |
58 |
--------------------------------------------------------------------------------
/ssd_icdar_scenetext.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import caffe
3 | from caffe.model_libs import *
4 | from google.protobuf import text_format
5 |
6 | import math
7 | import os
8 | import shutil
9 | import stat
10 | import subprocess
11 | import sys
12 |
# Add extra layers on top of a "base" network (e.g. VGGNet or Inception).
def AddExtraLayers(net, use_batchnorm=True):
    """Append the extra feature layers to ``net`` and return it.

    Starting from the last layer currently in ``net``, this adds conv6_1,
    conv6_2, then conv7_1/conv7_2 and conv8_1/conv8_2, and finally a global
    average pooling layer named pool6.

    Args:
        net: the network spec being built; it is modified in place.
        use_batchnorm: forwarded unchanged to every ConvBNLayer call.

    Returns:
        The same ``net`` object.
    """
    use_relu = True

    # Add additional convolutional layers.
    # list() keeps the indexing valid whether keys() returns a list or a view.
    from_layer = list(net.keys())[-1]
    # TODO(weiliu89): Construct the name using the last layer to avoid duplication.
    # NOTE(review): the four trailing ConvBNLayer arguments are presumably
    # (num_output, kernel_size, pad, stride) -- confirm against caffe.model_libs.
    out_layer = "conv6_1"
    ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1)

    from_layer = out_layer
    out_layer = "conv6_2"
    ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2)

    # range() instead of xrange(): same iteration on Python 2, and it no
    # longer raises NameError on Python 3.
    for i in range(7, 9):
        from_layer = out_layer
        out_layer = "conv{}_1".format(i)
        ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1)

        from_layer = out_layer
        out_layer = "conv{}_2".format(i)
        ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2)

    # Add global pooling layer.
    name = list(net.keys())[-1]
    net.pool6 = L.Pooling(net[name], pool=P.Pooling.AVE, global_pooling=True)

    return net
41 |
42 |
43 | ### Modify the following parameters accordingly ###
44 | # The directory which contains the caffe code.
45 | # We assume you are running the script at the CAFFE_ROOT.
46 | caffe_root = os.getcwd()
47 |
48 | # Set true if you want to start training right after generating all files.
49 | run_soon = True
50 | # Set true if you want to load from most recently saved snapshot.
51 | # Otherwise, we will load from the pretrain_model defined below.
52 | resume_training = True
53 | # If true, Remove old model files.
54 | remove_old_models = False
55 |
56 | # The database file for training data. Created by data/VOC0712/create_data.sh
57 | train_data = "examples/scenetext_trainval_lmdb"
58 | # The database file for testing data. Created by data/VOC0712/create_data.sh
59 | test_data = "examples/scenetext_test_lmdb"
60 | # Specify the batch sampler.
61 | resize_width = 300
62 | resize_height = 300
63 | resize = "{}x{}".format(resize_width, resize_height)
64 | batch_sampler = [
65 | {
66 | 'sampler': {
67 | },
68 | 'max_trials': 1,
69 | 'max_sample': 1,
70 | },
71 | {
72 | 'sampler': {
73 | 'min_scale': 0.3,
74 | 'max_scale': 1.0,
75 | 'min_aspect_ratio': 0.5,
76 | 'max_aspect_ratio': 2.0,
77 | },
78 | 'sample_constraint': {
79 | 'min_jaccard_overlap': 0.1,
80 | },
81 | 'max_trials': 50,
82 | 'max_sample': 1,
83 | },
84 | {
85 | 'sampler': {
86 | 'min_scale': 0.3,
87 | 'max_scale': 1.0,
88 | 'min_aspect_ratio': 0.5,
89 | 'max_aspect_ratio': 2.0,
90 | },
91 | 'sample_constraint': {
92 | 'min_jaccard_overlap': 0.3,
93 | },
94 | 'max_trials': 50,
95 | 'max_sample': 1,
96 | },
97 | {
98 | 'sampler': {
99 | 'min_scale': 0.3,
100 | 'max_scale': 1.0,
101 | 'min_aspect_ratio': 0.5,
102 | 'max_aspect_ratio': 2.0,
103 | },
104 | 'sample_constraint': {
105 | 'min_jaccard_overlap': 0.5,
106 | },
107 | 'max_trials': 50,
108 | 'max_sample': 1,
109 | },
110 | {
111 | 'sampler': {
112 | 'min_scale': 0.3,
113 | 'max_scale': 1.0,
114 | 'min_aspect_ratio': 0.5,
115 | 'max_aspect_ratio': 2.0,
116 | },
117 | 'sample_constraint': {
118 | 'min_jaccard_overlap': 0.7,
119 | },
120 | 'max_trials': 50,
121 | 'max_sample': 1,
122 | },
123 | {
124 | 'sampler': {
125 | 'min_scale': 0.3,
126 | 'max_scale': 1.0,
127 | 'min_aspect_ratio': 0.5,
128 | 'max_aspect_ratio': 2.0,
129 | },
130 | 'sample_constraint': {
131 | 'min_jaccard_overlap': 0.9,
132 | },
133 | 'max_trials': 50,
134 | 'max_sample': 1,
135 | },
136 | {
137 | 'sampler': {
138 | 'min_scale': 0.3,
139 | 'max_scale': 1.0,
140 | 'min_aspect_ratio': 0.5,
141 | 'max_aspect_ratio': 2.0,
142 | },
143 | 'sample_constraint': {
144 | 'max_jaccard_overlap': 1.0,
145 | },
146 | 'max_trials': 50,
147 | 'max_sample': 1,
148 | },
149 | ]
150 | train_transform_param = {
151 | 'mirror': True,
152 | 'mean_value': [104, 117, 123],
153 | 'resize_param': {
154 | 'prob': 1,
155 | 'resize_mode': P.Resize.WARP,
156 | 'height': resize_height,
157 | 'width': resize_width,
158 | 'interp_mode': [
159 | P.Resize.LINEAR,
160 | P.Resize.AREA,
161 | P.Resize.NEAREST,
162 | P.Resize.CUBIC,
163 | P.Resize.LANCZOS4,
164 | ],
165 | },
166 | 'emit_constraint': {
167 | 'emit_type': caffe_pb2.EmitConstraint.CENTER,
168 | }
169 | }
170 | test_transform_param = {
171 | 'mean_value': [104, 117, 123],
172 | 'resize_param': {
173 | 'prob': 1,
174 | 'resize_mode': P.Resize.WARP,
175 | 'height': resize_height,
176 | 'width': resize_width,
177 | 'interp_mode': [P.Resize.LINEAR],
178 | },
179 | }
180 |
181 | # If true, use batch norm for all newly added layers.
182 | # Currently only the non batch norm version has been tested.
183 | use_batchnorm = False
184 | # Use different initial learning rate.
185 | if use_batchnorm:
186 | base_lr = 0.0004
187 | else:
188 | # A learning rate for batch_size = 1, num_gpus = 1.
189 | base_lr = 0.00004
190 |
191 | # Modify the job name if you want.
192 | job_name = "SSD_{}".format(resize)
193 | # The name of the model. Modify it if you want.
194 | model_name = "VGG_scenetext_{}".format(job_name)
195 |
196 | # Directory which stores the model .prototxt file.
197 | save_dir = "models/VGGNet/scenetext/{}".format(job_name)
198 | # Directory which stores the snapshot of models.
199 | snapshot_dir = "models/VGGNet/scenetext/{}".format(job_name)
200 | # Directory which stores the job script and log file.
201 | job_dir = "jobs/VGGNet/scenetext/{}".format(job_name)
202 | # Directory which stores the detection results.
203 | output_result_dir = "{}/data/VOCdevkit/results/{}".format(os.environ['HOME'], job_name)
204 |
205 | # model definition files.
206 | train_net_file = "{}/train.prototxt".format(save_dir)
207 | test_net_file = "{}/test.prototxt".format(save_dir)
208 | deploy_net_file = "{}/deploy.prototxt".format(save_dir)
209 | solver_file = "{}/solver.prototxt".format(save_dir)
210 | # snapshot prefix.
211 | snapshot_prefix = "{}/{}".format(snapshot_dir, model_name)
212 | # job script path.
213 | job_file = "{}/{}.sh".format(job_dir, model_name)
214 |
215 | # Stores the test image names and sizes. Created by data/VOC0712/create_list.sh
216 | name_size_file = "data/scenetext/test_name_size.txt"
217 | # The pretrained model. We use the Fully convolutional reduced (atrous) VGGNet.
218 | pretrain_model = "models/VGGNet/VGG_ILSVRC_16_layers_fc_reduced.caffemodel"
219 | # Stores LabelMapItem.
220 | label_map_file = "data/scenetext/labelmap_voc.prototxt"
221 |
222 | # MultiBoxLoss parameters.
223 | num_classes = 2
224 | share_location = True
225 | background_label_id=0
226 | train_on_diff_gt = True
227 | normalization_mode = P.Loss.VALID
228 | code_type = P.PriorBox.CENTER_SIZE
229 | neg_pos_ratio = 3.
230 | loc_weight = (neg_pos_ratio + 1.) / 4.
231 | multibox_loss_param = {
232 | 'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1,
233 | 'conf_loss_type': P.MultiBoxLoss.SOFTMAX,
234 | 'loc_weight': loc_weight,
235 | 'num_classes': num_classes,
236 | 'share_location': share_location,
237 | 'match_type': P.MultiBoxLoss.PER_PREDICTION,
238 | 'overlap_threshold': 0.5,
239 | 'use_prior_for_matching': True,
240 | 'background_label_id': background_label_id,
241 | 'use_difficult_gt': train_on_diff_gt,
242 | 'do_neg_mining': True,
243 | 'neg_pos_ratio': neg_pos_ratio,
244 | 'neg_overlap': 0.5,
245 | 'code_type': code_type,
246 | }
247 | loss_param = {
248 | 'normalization': normalization_mode,
249 | }
250 |
# parameters for generating priors.
# minimum dimension of input image
min_dim = 300
# conv4_3 ==> 38 x 38
# fc7 ==> 19 x 19
# conv6_2 ==> 10 x 10
# conv7_2 ==> 5 x 5
# conv8_2 ==> 3 x 3
# pool6 ==> 1 x 1
mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'pool6']
# in percent %
min_ratio = 20
max_ratio = 95
# Spread the ratios evenly over every source layer except the first, which
# gets its own fixed 10% size below. Floor division gives the same integer
# step on Python 2 and Python 3.
step = (max_ratio - min_ratio) // (len(mbox_source_layers) - 2)
min_sizes = []
max_sizes = []
# range() instead of xrange() so the loop also runs on Python 3.
for ratio in range(min_ratio, max_ratio + 1, step):
    min_sizes.append(min_dim * ratio / 100.)
    max_sizes.append(min_dim * (ratio + step) / 100.)
# The first source layer uses a 10% min size and has no max size.
min_sizes = [min_dim * 10 / 100.] + min_sizes
max_sizes = [[]] + max_sizes
272 | aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]]
273 | # L2 normalize conv4_3.
274 | normalizations = [20, -1, -1, -1, -1, -1]
275 | # variance used to encode/decode prior bboxes.
276 | if code_type == P.PriorBox.CENTER_SIZE:
277 | prior_variance = [0.1, 0.1, 0.2, 0.2]
278 | else:
279 | prior_variance = [0.1]
280 | flip = True
281 | clip = True
282 |
# Solver parameters.
# Defining which GPUs to use. Set to "" to run on the CPU.
gpus = "0,1,2,3"
# Drop empty entries: "".split(",") returns [''], which previously made
# num_gpus == 1 and crashed on int('') instead of selecting CPU mode.
gpulist = [gpu_id for gpu_id in gpus.split(",") if gpu_id.strip()]
num_gpus = len(gpulist)

# Divide the mini-batch to different GPUs.
batch_size = 32
accum_batch_size = 32
# Floor division keeps iter_size an integer on Python 3 as well as Python 2.
iter_size = accum_batch_size // batch_size
solver_mode = P.Solver.CPU
device_id = 0
batch_size_per_device = batch_size
if num_gpus > 0:
    # Round up so the per-device batches together cover at least batch_size.
    batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus))
    iter_size = int(math.ceil(float(accum_batch_size) / (batch_size_per_device * num_gpus)))
    solver_mode = P.Solver.GPU
    device_id = int(gpulist[0])
301 |
302 | if normalization_mode == P.Loss.NONE:
303 | base_lr /= batch_size_per_device
304 | elif normalization_mode == P.Loss.VALID:
305 | base_lr *= 25. / loc_weight
306 | elif normalization_mode == P.Loss.FULL:
307 | # Roughly there are 2000 prior bboxes per image.
308 | # TODO(weiliu89): Estimate the exact # of priors.
309 | base_lr *= 2000.
310 |
311 | # Which layers to freeze (no backward) during training.
312 | freeze_layers = ['conv1_1', 'conv1_2', 'conv2_1', 'conv2_2']
313 |
# Evaluate on whole test set.
num_test_image = 70
test_batch_size = 1
# Floor division reproduces the Python 2 result and keeps test_iter an int on
# Python 3, where "/" would produce a float.
test_iter = num_test_image // test_batch_size
318 |
319 | solver_param = {
320 | # Train parameters
321 | 'base_lr': base_lr,
322 | 'weight_decay': 0.0005,
323 | 'lr_policy': "step",
324 | 'stepsize': 40000,
325 | 'gamma': 0.1,
326 | 'momentum': 0.9,
327 | 'iter_size': iter_size,
328 | 'max_iter': 60000,
329 | 'snapshot': 40000,
330 | 'display': 10,
331 | 'average_loss': 10,
332 | 'type': "SGD",
333 | 'solver_mode': solver_mode,
334 | 'device_id': device_id,
335 | 'debug_info': False,
336 | 'snapshot_after_train': True,
337 | # Test parameters
338 | 'test_iter': [test_iter],
339 | 'test_interval': 10000,
340 | 'eval_type': "detection",
341 | 'ap_version': "11point",
342 | 'test_initialization': False,
343 | }
344 |
345 | # parameters for generating detection output.
346 | det_out_param = {
347 | 'num_classes': num_classes,
348 | 'share_location': share_location,
349 | 'background_label_id': background_label_id,
350 | 'nms_param': {'nms_threshold': 0.45, 'top_k': 400},
351 | 'save_output_param': {
352 | 'output_directory': output_result_dir,
353 | 'output_name_prefix': "comp4_det_test_",
354 | 'output_format': "VOC",
355 | 'label_map_file': label_map_file,
356 | 'name_size_file': name_size_file,
357 | 'num_test_image': num_test_image,
358 | },
359 | 'keep_top_k': 200,
360 | 'confidence_threshold': 0.01,
361 | 'code_type': code_type,
362 | }
363 |
364 | # parameters for evaluating detection results.
365 | det_eval_param = {
366 | 'num_classes': num_classes,
367 | 'background_label_id': background_label_id,
368 | 'overlap_threshold': 0.5,
369 | 'evaluate_difficult_gt': False,
370 | 'name_size_file': name_size_file,
371 | }
372 |
373 | ### Hopefully you don't need to change the following ###
374 | # Check file.
375 | check_if_exist(train_data)
376 | check_if_exist(test_data)
377 | check_if_exist(label_map_file)
378 | check_if_exist(pretrain_model)
379 | make_if_not_exist(save_dir)
380 | make_if_not_exist(job_dir)
381 | make_if_not_exist(snapshot_dir)
382 |
383 | # Create train net.
384 | net = caffe.NetSpec()
385 | net.data, net.label = CreateAnnotatedDataLayer(train_data, batch_size=batch_size_per_device,
386 | train=True, output_label=True, label_map_file=label_map_file,
387 | transform_param=train_transform_param, batch_sampler=batch_sampler)
388 |
389 | VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True,
390 | dropout=False, freeze_layers=freeze_layers)
391 |
392 | AddExtraLayers(net, use_batchnorm)
393 |
394 | mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers,
395 | use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes,
396 | aspect_ratios=aspect_ratios, normalizations=normalizations,
397 | num_classes=num_classes, share_location=share_location, flip=flip, clip=clip,
398 | prior_variance=prior_variance, kernel_size=3, pad=1)
399 |
400 | # Create the MultiBoxLossLayer.
401 | name = "mbox_loss"
402 | mbox_layers.append(net.label)
403 | net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param,
404 | loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')),
405 | propagate_down=[True, True, False, False])
406 |
407 | with open(train_net_file, 'w') as f:
408 | print('name: "{}_train"'.format(model_name), file=f)
409 | print(net.to_proto(), file=f)
410 | shutil.copy(train_net_file, job_dir)
411 |
412 | # Create test net.
413 | net = caffe.NetSpec()
414 | net.data, net.label = CreateAnnotatedDataLayer(test_data, batch_size=test_batch_size,
415 | train=False, output_label=True, label_map_file=label_map_file,
416 | transform_param=test_transform_param)
417 |
418 | VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True,
419 | dropout=False, freeze_layers=freeze_layers)
420 |
421 | AddExtraLayers(net, use_batchnorm)
422 |
423 | mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers,
424 | use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes,
425 | aspect_ratios=aspect_ratios, normalizations=normalizations,
426 | num_classes=num_classes, share_location=share_location, flip=flip, clip=clip,
427 | prior_variance=prior_variance, kernel_size=3, pad=1)
428 |
429 | conf_name = "mbox_conf"
430 | if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX:
431 | reshape_name = "{}_reshape".format(conf_name)
432 | net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes]))
433 | softmax_name = "{}_softmax".format(conf_name)
434 | net[softmax_name] = L.Softmax(net[reshape_name], axis=2)
435 | flatten_name = "{}_flatten".format(conf_name)
436 | net[flatten_name] = L.Flatten(net[softmax_name], axis=1)
437 | mbox_layers[1] = net[flatten_name]
438 | elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC:
439 | sigmoid_name = "{}_sigmoid".format(conf_name)
440 | net[sigmoid_name] = L.Sigmoid(net[conf_name])
441 | mbox_layers[1] = net[sigmoid_name]
442 |
443 | net.detection_out = L.DetectionOutput(*mbox_layers,
444 | detection_output_param=det_out_param,
445 | include=dict(phase=caffe_pb2.Phase.Value('TEST')))
446 | net.detection_eval = L.DetectionEvaluate(net.detection_out, net.label,
447 | detection_evaluate_param=det_eval_param,
448 | include=dict(phase=caffe_pb2.Phase.Value('TEST')))
449 |
450 | with open(test_net_file, 'w') as f:
451 | print('name: "{}_test"'.format(model_name), file=f)
452 | print(net.to_proto(), file=f)
453 | shutil.copy(test_net_file, job_dir)
454 |
455 | # Create deploy net.
456 | # Remove the first and last layer from test net.
457 | deploy_net = net
458 | with open(deploy_net_file, 'w') as f:
459 | net_param = deploy_net.to_proto()
460 | # Remove the first (AnnotatedData) and last (DetectionEvaluate) layer from test net.
461 | del net_param.layer[0]
462 | del net_param.layer[-1]
463 | net_param.name = '{}_deploy'.format(model_name)
464 | net_param.input.extend(['data'])
465 | net_param.input_shape.extend([
466 | caffe_pb2.BlobShape(dim=[1, 3, resize_height, resize_width])])
467 | print(net_param, file=f)
468 | shutil.copy(deploy_net_file, job_dir)
469 |
470 | # Create solver.
471 | solver = caffe_pb2.SolverParameter(
472 | train_net=train_net_file,
473 | test_net=[test_net_file],
474 | snapshot_prefix=snapshot_prefix,
475 | **solver_param)
476 |
477 | with open(solver_file, 'w') as f:
478 | print(solver, file=f)
479 | shutil.copy(solver_file, job_dir)
480 |
def _snapshot_iter(filename, extension):
    """Return the iteration number encoded in a snapshot file name, or None.

    Snapshot files are named "<model_name>_iter_<N><extension>". A file with a
    different extension, a different prefix, or a non-numeric <N> yields None,
    so unrelated files in snapshot_dir are skipped instead of aborting the
    script with IndexError/ValueError.
    """
    if not filename.endswith(extension):
        return None
    stem = os.path.splitext(filename)[0]
    prefix = "{}_iter_".format(model_name)
    if not stem.startswith(prefix):
        return None
    try:
        return int(stem[len(prefix):])
    except ValueError:
        return None


max_iter = 0
# Find most recent snapshot.
# Loop variables are named so they no longer shadow the builtins file/iter.
for snapshot_name in os.listdir(snapshot_dir):
    snapshot_iter = _snapshot_iter(snapshot_name, ".solverstate")
    if snapshot_iter is not None and snapshot_iter > max_iter:
        max_iter = snapshot_iter

# Start from the pretrained weights unless a snapshot exists and resuming is enabled.
train_src_param = '--weights="{}" \\\n'.format(pretrain_model)
if resume_training:
    if max_iter > 0:
        train_src_param = '--snapshot="{}_iter_{}.solverstate" \\\n'.format(snapshot_prefix, max_iter)

if remove_old_models:
    # Remove any snapshots smaller than max_iter.
    for snapshot_name in os.listdir(snapshot_dir):
        for extension in (".solverstate", ".caffemodel"):
            snapshot_iter = _snapshot_iter(snapshot_name, extension)
            if snapshot_iter is not None and max_iter > snapshot_iter:
                os.remove("{}/{}".format(snapshot_dir, snapshot_name))
508 |
509 | # Create job file.
510 | with open(job_file, 'w') as f:
511 | f.write('cd {}\n'.format(caffe_root))
512 | f.write('./build/tools/caffe train \\\n')
513 | f.write('--solver="{}" \\\n'.format(solver_file))
514 | f.write(train_src_param)
515 | if solver_param['solver_mode'] == P.Solver.GPU:
516 | f.write('--gpu {} 2>&1 | tee {}/{}.log\n'.format(gpus, job_dir, model_name))
517 | else:
518 | f.write('2>&1 | tee {}/{}.log\n'.format(job_dir, model_name))
519 |
520 | # Copy the python script to job_dir.
521 | py_file = os.path.abspath(__file__)
522 | shutil.copy(py_file, job_dir)
523 |
524 | # Run the job.
525 | os.chmod(job_file, stat.S_IRWXU)
526 | if run_soon:
527 | subprocess.call(job_file, shell=True)
528 |
529 |
--------------------------------------------------------------------------------
/test_file/101.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenxinpeng/SSD_scene_text_detection/0972baf67e88a736d4147f874e1f056597be1c57/test_file/101.jpg
--------------------------------------------------------------------------------
/test_file/104.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenxinpeng/SSD_scene_text_detection/0972baf67e88a736d4147f874e1f056597be1c57/test_file/104.jpg
--------------------------------------------------------------------------------
/test_file/120.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenxinpeng/SSD_scene_text_detection/0972baf67e88a736d4147f874e1f056597be1c57/test_file/120.jpg
--------------------------------------------------------------------------------
/test_file/output_101.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenxinpeng/SSD_scene_text_detection/0972baf67e88a736d4147f874e1f056597be1c57/test_file/output_101.png
--------------------------------------------------------------------------------
/test_file/output_104.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenxinpeng/SSD_scene_text_detection/0972baf67e88a736d4147f874e1f056597be1c57/test_file/output_104.png
--------------------------------------------------------------------------------
/test_file/output_120.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chenxinpeng/SSD_scene_text_detection/0972baf67e88a736d4147f874e1f056597be1c57/test_file/output_120.png
--------------------------------------------------------------------------------
/test_name_size.py:
--------------------------------------------------------------------------------
#! /usr/bin/python
"""Write test_name_size.txt: one "<name> <height> <width>" line per image.

Every *.jpg in ``img_dir`` is opened to read its size; <name> is the file name
without its extension.
"""

import os, sys
import glob
from PIL import Image

img_dir = "/home/chenxp/data/VOCdevkit/scenetext/JPEGImages"

# Sorted so the output is identical from run to run; glob.glob() returns
# entries in arbitrary order.
img_lists = sorted(glob.glob(img_dir + '/*.jpg'))

# The with-block guarantees the list file is flushed and closed, which the
# original bare open() never did.
with open('/home/chenxp/caffe/data/scenetext/test_name_size.txt', 'w') as test_name_size:
    for item in img_lists:
        img = Image.open(item)
        width, height = img.size
        temp1, temp2 = os.path.splitext(os.path.basename(item))
        # Height is written before width.
        test_name_size.write(temp1 + ' ' + str(height) + ' ' + str(width) + '\n')
18 |
19 |
--------------------------------------------------------------------------------
/training_file/VGG_scenetext_SSD_300x300.sh:
--------------------------------------------------------------------------------
# Train the scenetext SSD 300x300 model on GPU 0, starting from the reduced
# VGG16 weights, and save the console output to a log file.
# Abort if the Caffe root is missing rather than running from the wrong directory.
cd /home/chenxp/caffe || exit 1
# 2>&1 merges stderr into the pipe so tee also records it, matching the job
# file that ssd_icdar_scenetext.py generates ("--gpu ... 2>&1 | tee ...").
# NOTE(review): Caffe is understood to log to stderr; without the redirect the
# .log file would miss that output -- confirm against your Caffe build.
./build/tools/caffe train \
--solver="models/VGGNet/scenetext/SSD_300x300/solver.prototxt" \
--weights="models/VGGNet/VGG_ILSVRC_16_layers_fc_reduced.caffemodel" \
--gpu 0 2>&1 | tee jobs/VGGNet/scenetext/SSD_300x300/VGG_scenetext_SSD_300x300.log
6 |
--------------------------------------------------------------------------------
/training_file/deploy.prototxt:
--------------------------------------------------------------------------------
1 | name: "VGG_scenetext_SSD_300x300_deploy"
2 | input: "data"
3 | input_shape {
4 | dim: 1
5 | dim: 3
6 | dim: 300
7 | dim: 300
8 | }
9 | layer {
10 | name: "conv1_1"
11 | type: "Convolution"
12 | bottom: "data"
13 | top: "conv1_1"
14 | param {
15 | lr_mult: 0
16 | decay_mult: 0
17 | }
18 | param {
19 | lr_mult: 0
20 | decay_mult: 0
21 | }
22 | convolution_param {
23 | num_output: 64
24 | pad: 1
25 | kernel_size: 3
26 | weight_filler {
27 | type: "xavier"
28 | }
29 | bias_filler {
30 | type: "constant"
31 | value: 0
32 | }
33 | }
34 | }
35 | layer {
36 | name: "relu1_1"
37 | type: "ReLU"
38 | bottom: "conv1_1"
39 | top: "conv1_1"
40 | }
41 | layer {
42 | name: "conv1_2"
43 | type: "Convolution"
44 | bottom: "conv1_1"
45 | top: "conv1_2"
46 | param {
47 | lr_mult: 0
48 | decay_mult: 0
49 | }
50 | param {
51 | lr_mult: 0
52 | decay_mult: 0
53 | }
54 | convolution_param {
55 | num_output: 64
56 | pad: 1
57 | kernel_size: 3
58 | weight_filler {
59 | type: "xavier"
60 | }
61 | bias_filler {
62 | type: "constant"
63 | value: 0
64 | }
65 | }
66 | }
67 | layer {
68 | name: "relu1_2"
69 | type: "ReLU"
70 | bottom: "conv1_2"
71 | top: "conv1_2"
72 | }
73 | layer {
74 | name: "pool1"
75 | type: "Pooling"
76 | bottom: "conv1_2"
77 | top: "pool1"
78 | pooling_param {
79 | pool: MAX
80 | kernel_size: 2
81 | stride: 2
82 | }
83 | }
84 | layer {
85 | name: "conv2_1"
86 | type: "Convolution"
87 | bottom: "pool1"
88 | top: "conv2_1"
89 | param {
90 | lr_mult: 0
91 | decay_mult: 0
92 | }
93 | param {
94 | lr_mult: 0
95 | decay_mult: 0
96 | }
97 | convolution_param {
98 | num_output: 128
99 | pad: 1
100 | kernel_size: 3
101 | weight_filler {
102 | type: "xavier"
103 | }
104 | bias_filler {
105 | type: "constant"
106 | value: 0
107 | }
108 | }
109 | }
110 | layer {
111 | name: "relu2_1"
112 | type: "ReLU"
113 | bottom: "conv2_1"
114 | top: "conv2_1"
115 | }
116 | layer {
117 | name: "conv2_2"
118 | type: "Convolution"
119 | bottom: "conv2_1"
120 | top: "conv2_2"
121 | param {
122 | lr_mult: 0
123 | decay_mult: 0
124 | }
125 | param {
126 | lr_mult: 0
127 | decay_mult: 0
128 | }
129 | convolution_param {
130 | num_output: 128
131 | pad: 1
132 | kernel_size: 3
133 | weight_filler {
134 | type: "xavier"
135 | }
136 | bias_filler {
137 | type: "constant"
138 | value: 0
139 | }
140 | }
141 | }
142 | layer {
143 | name: "relu2_2"
144 | type: "ReLU"
145 | bottom: "conv2_2"
146 | top: "conv2_2"
147 | }
148 | layer {
149 | name: "pool2"
150 | type: "Pooling"
151 | bottom: "conv2_2"
152 | top: "pool2"
153 | pooling_param {
154 | pool: MAX
155 | kernel_size: 2
156 | stride: 2
157 | }
158 | }
159 | layer {
160 | name: "conv3_1"
161 | type: "Convolution"
162 | bottom: "pool2"
163 | top: "conv3_1"
164 | param {
165 | lr_mult: 1
166 | decay_mult: 1
167 | }
168 | param {
169 | lr_mult: 2
170 | decay_mult: 0
171 | }
172 | convolution_param {
173 | num_output: 256
174 | pad: 1
175 | kernel_size: 3
176 | weight_filler {
177 | type: "xavier"
178 | }
179 | bias_filler {
180 | type: "constant"
181 | value: 0
182 | }
183 | }
184 | }
185 | layer {
186 | name: "relu3_1"
187 | type: "ReLU"
188 | bottom: "conv3_1"
189 | top: "conv3_1"
190 | }
191 | layer {
192 | name: "conv3_2"
193 | type: "Convolution"
194 | bottom: "conv3_1"
195 | top: "conv3_2"
196 | param {
197 | lr_mult: 1
198 | decay_mult: 1
199 | }
200 | param {
201 | lr_mult: 2
202 | decay_mult: 0
203 | }
204 | convolution_param {
205 | num_output: 256
206 | pad: 1
207 | kernel_size: 3
208 | weight_filler {
209 | type: "xavier"
210 | }
211 | bias_filler {
212 | type: "constant"
213 | value: 0
214 | }
215 | }
216 | }
217 | layer {
218 | name: "relu3_2"
219 | type: "ReLU"
220 | bottom: "conv3_2"
221 | top: "conv3_2"
222 | }
223 | layer {
224 | name: "conv3_3"
225 | type: "Convolution"
226 | bottom: "conv3_2"
227 | top: "conv3_3"
228 | param {
229 | lr_mult: 1
230 | decay_mult: 1
231 | }
232 | param {
233 | lr_mult: 2
234 | decay_mult: 0
235 | }
236 | convolution_param {
237 | num_output: 256
238 | pad: 1
239 | kernel_size: 3
240 | weight_filler {
241 | type: "xavier"
242 | }
243 | bias_filler {
244 | type: "constant"
245 | value: 0
246 | }
247 | }
248 | }
249 | layer {
250 | name: "relu3_3"
251 | type: "ReLU"
252 | bottom: "conv3_3"
253 | top: "conv3_3"
254 | }
255 | layer {
256 | name: "pool3"
257 | type: "Pooling"
258 | bottom: "conv3_3"
259 | top: "pool3"
260 | pooling_param {
261 | pool: MAX
262 | kernel_size: 2
263 | stride: 2
264 | }
265 | }
266 | layer {
267 | name: "conv4_1"
268 | type: "Convolution"
269 | bottom: "pool3"
270 | top: "conv4_1"
271 | param {
272 | lr_mult: 1
273 | decay_mult: 1
274 | }
275 | param {
276 | lr_mult: 2
277 | decay_mult: 0
278 | }
279 | convolution_param {
280 | num_output: 512
281 | pad: 1
282 | kernel_size: 3
283 | weight_filler {
284 | type: "xavier"
285 | }
286 | bias_filler {
287 | type: "constant"
288 | value: 0
289 | }
290 | }
291 | }
292 | layer {
293 | name: "relu4_1"
294 | type: "ReLU"
295 | bottom: "conv4_1"
296 | top: "conv4_1"
297 | }
298 | layer {
299 | name: "conv4_2"
300 | type: "Convolution"
301 | bottom: "conv4_1"
302 | top: "conv4_2"
303 | param {
304 | lr_mult: 1
305 | decay_mult: 1
306 | }
307 | param {
308 | lr_mult: 2
309 | decay_mult: 0
310 | }
311 | convolution_param {
312 | num_output: 512
313 | pad: 1
314 | kernel_size: 3
315 | weight_filler {
316 | type: "xavier"
317 | }
318 | bias_filler {
319 | type: "constant"
320 | value: 0
321 | }
322 | }
323 | }
324 | layer {
325 | name: "relu4_2"
326 | type: "ReLU"
327 | bottom: "conv4_2"
328 | top: "conv4_2"
329 | }
330 | layer {
331 | name: "conv4_3"
332 | type: "Convolution"
333 | bottom: "conv4_2"
334 | top: "conv4_3"
335 | param {
336 | lr_mult: 1
337 | decay_mult: 1
338 | }
339 | param {
340 | lr_mult: 2
341 | decay_mult: 0
342 | }
343 | convolution_param {
344 | num_output: 512
345 | pad: 1
346 | kernel_size: 3
347 | weight_filler {
348 | type: "xavier"
349 | }
350 | bias_filler {
351 | type: "constant"
352 | value: 0
353 | }
354 | }
355 | }
356 | layer {
357 | name: "relu4_3"
358 | type: "ReLU"
359 | bottom: "conv4_3"
360 | top: "conv4_3"
361 | }
362 | layer {
363 | name: "pool4"
364 | type: "Pooling"
365 | bottom: "conv4_3"
366 | top: "pool4"
367 | pooling_param {
368 | pool: MAX
369 | kernel_size: 2
370 | stride: 2
371 | }
372 | }
373 | layer {
374 | name: "conv5_1"
375 | type: "Convolution"
376 | bottom: "pool4"
377 | top: "conv5_1"
378 | param {
379 | lr_mult: 1
380 | decay_mult: 1
381 | }
382 | param {
383 | lr_mult: 2
384 | decay_mult: 0
385 | }
386 | convolution_param {
387 | num_output: 512
388 | pad: 1
389 | kernel_size: 3
390 | weight_filler {
391 | type: "xavier"
392 | }
393 | bias_filler {
394 | type: "constant"
395 | value: 0
396 | }
397 | }
398 | }
399 | layer {
400 | name: "relu5_1"
401 | type: "ReLU"
402 | bottom: "conv5_1"
403 | top: "conv5_1"
404 | }
405 | layer {
406 | name: "conv5_2"
407 | type: "Convolution"
408 | bottom: "conv5_1"
409 | top: "conv5_2"
410 | param {
411 | lr_mult: 1
412 | decay_mult: 1
413 | }
414 | param {
415 | lr_mult: 2
416 | decay_mult: 0
417 | }
418 | convolution_param {
419 | num_output: 512
420 | pad: 1
421 | kernel_size: 3
422 | weight_filler {
423 | type: "xavier"
424 | }
425 | bias_filler {
426 | type: "constant"
427 | value: 0
428 | }
429 | }
430 | }
431 | layer {
432 | name: "relu5_2"
433 | type: "ReLU"
434 | bottom: "conv5_2"
435 | top: "conv5_2"
436 | }
437 | layer {
438 | name: "conv5_3"
439 | type: "Convolution"
440 | bottom: "conv5_2"
441 | top: "conv5_3"
442 | param {
443 | lr_mult: 1
444 | decay_mult: 1
445 | }
446 | param {
447 | lr_mult: 2
448 | decay_mult: 0
449 | }
450 | convolution_param {
451 | num_output: 512
452 | pad: 1
453 | kernel_size: 3
454 | weight_filler {
455 | type: "xavier"
456 | }
457 | bias_filler {
458 | type: "constant"
459 | value: 0
460 | }
461 | }
462 | }
463 | layer {
464 | name: "relu5_3"
465 | type: "ReLU"
466 | bottom: "conv5_3"
467 | top: "conv5_3"
468 | }
469 | layer {
470 | name: "pool5"
471 | type: "Pooling"
472 | bottom: "conv5_3"
473 | top: "pool5"
474 | pooling_param {
475 | pool: MAX
476 | kernel_size: 3
477 | stride: 1
478 | pad: 1
479 | }
480 | }
481 | layer {
482 | name: "fc6"
483 | type: "Convolution"
484 | bottom: "pool5"
485 | top: "fc6"
486 | param {
487 | lr_mult: 1
488 | decay_mult: 1
489 | }
490 | param {
491 | lr_mult: 2
492 | decay_mult: 0
493 | }
494 | convolution_param {
495 | num_output: 1024
496 | pad: 6
497 | kernel_size: 3
498 | weight_filler {
499 | type: "xavier"
500 | }
501 | bias_filler {
502 | type: "constant"
503 | value: 0
504 | }
505 | dilation: 6
506 | }
507 | }
508 | layer {
509 | name: "relu6"
510 | type: "ReLU"
511 | bottom: "fc6"
512 | top: "fc6"
513 | }
514 | layer {
515 | name: "fc7"
516 | type: "Convolution"
517 | bottom: "fc6"
518 | top: "fc7"
519 | param {
520 | lr_mult: 1
521 | decay_mult: 1
522 | }
523 | param {
524 | lr_mult: 2
525 | decay_mult: 0
526 | }
527 | convolution_param {
528 | num_output: 1024
529 | kernel_size: 1
530 | weight_filler {
531 | type: "xavier"
532 | }
533 | bias_filler {
534 | type: "constant"
535 | value: 0
536 | }
537 | }
538 | }
539 | layer {
540 | name: "relu7"
541 | type: "ReLU"
542 | bottom: "fc7"
543 | top: "fc7"
544 | }
545 | layer {
546 | name: "conv6_1"
547 | type: "Convolution"
548 | bottom: "fc7"
549 | top: "conv6_1"
550 | param {
551 | lr_mult: 1
552 | decay_mult: 1
553 | }
554 | param {
555 | lr_mult: 2
556 | decay_mult: 0
557 | }
558 | convolution_param {
559 | num_output: 256
560 | pad: 0
561 | kernel_size: 1
562 | stride: 1
563 | weight_filler {
564 | type: "xavier"
565 | }
566 | bias_filler {
567 | type: "constant"
568 | value: 0
569 | }
570 | }
571 | }
572 | layer {
573 | name: "conv6_1_relu"
574 | type: "ReLU"
575 | bottom: "conv6_1"
576 | top: "conv6_1"
577 | }
578 | layer {
579 | name: "conv6_2"
580 | type: "Convolution"
581 | bottom: "conv6_1"
582 | top: "conv6_2"
583 | param {
584 | lr_mult: 1
585 | decay_mult: 1
586 | }
587 | param {
588 | lr_mult: 2
589 | decay_mult: 0
590 | }
591 | convolution_param {
592 | num_output: 512
593 | pad: 1
594 | kernel_size: 3
595 | stride: 2
596 | weight_filler {
597 | type: "xavier"
598 | }
599 | bias_filler {
600 | type: "constant"
601 | value: 0
602 | }
603 | }
604 | }
605 | layer {
606 | name: "conv6_2_relu"
607 | type: "ReLU"
608 | bottom: "conv6_2"
609 | top: "conv6_2"
610 | }
611 | layer {
612 | name: "conv7_1"
613 | type: "Convolution"
614 | bottom: "conv6_2"
615 | top: "conv7_1"
616 | param {
617 | lr_mult: 1
618 | decay_mult: 1
619 | }
620 | param {
621 | lr_mult: 2
622 | decay_mult: 0
623 | }
624 | convolution_param {
625 | num_output: 128
626 | pad: 0
627 | kernel_size: 1
628 | stride: 1
629 | weight_filler {
630 | type: "xavier"
631 | }
632 | bias_filler {
633 | type: "constant"
634 | value: 0
635 | }
636 | }
637 | }
638 | layer {
639 | name: "conv7_1_relu"
640 | type: "ReLU"
641 | bottom: "conv7_1"
642 | top: "conv7_1"
643 | }
644 | layer {
645 | name: "conv7_2"
646 | type: "Convolution"
647 | bottom: "conv7_1"
648 | top: "conv7_2"
649 | param {
650 | lr_mult: 1
651 | decay_mult: 1
652 | }
653 | param {
654 | lr_mult: 2
655 | decay_mult: 0
656 | }
657 | convolution_param {
658 | num_output: 256
659 | pad: 1
660 | kernel_size: 3
661 | stride: 2
662 | weight_filler {
663 | type: "xavier"
664 | }
665 | bias_filler {
666 | type: "constant"
667 | value: 0
668 | }
669 | }
670 | }
671 | layer {
672 | name: "conv7_2_relu"
673 | type: "ReLU"
674 | bottom: "conv7_2"
675 | top: "conv7_2"
676 | }
677 | layer {
678 | name: "conv8_1"
679 | type: "Convolution"
680 | bottom: "conv7_2"
681 | top: "conv8_1"
682 | param {
683 | lr_mult: 1
684 | decay_mult: 1
685 | }
686 | param {
687 | lr_mult: 2
688 | decay_mult: 0
689 | }
690 | convolution_param {
691 | num_output: 128
692 | pad: 0
693 | kernel_size: 1
694 | stride: 1
695 | weight_filler {
696 | type: "xavier"
697 | }
698 | bias_filler {
699 | type: "constant"
700 | value: 0
701 | }
702 | }
703 | }
704 | layer {
705 | name: "conv8_1_relu"
706 | type: "ReLU"
707 | bottom: "conv8_1"
708 | top: "conv8_1"
709 | }
710 | layer {
711 | name: "conv8_2"
712 | type: "Convolution"
713 | bottom: "conv8_1"
714 | top: "conv8_2"
715 | param {
716 | lr_mult: 1
717 | decay_mult: 1
718 | }
719 | param {
720 | lr_mult: 2
721 | decay_mult: 0
722 | }
723 | convolution_param {
724 | num_output: 256
725 | pad: 1
726 | kernel_size: 3
727 | stride: 2
728 | weight_filler {
729 | type: "xavier"
730 | }
731 | bias_filler {
732 | type: "constant"
733 | value: 0
734 | }
735 | }
736 | }
737 | layer {
738 | name: "conv8_2_relu"
739 | type: "ReLU"
740 | bottom: "conv8_2"
741 | top: "conv8_2"
742 | }
743 | layer {
744 | name: "pool6"  # averages conv8_2; feeds the pool6_mbox_loc/conf/priorbox layers below
745 | type: "Pooling"
746 | bottom: "conv8_2"
747 | top: "pool6"
748 | pooling_param {
749 | pool: AVE
750 | global_pooling: true  # pool over the whole feature map rather than a fixed kernel
751 | }
752 | }
753 | layer {
754 | name: "conv4_3_norm"  # normalised copy of conv4_3 used by the conv4_3_norm_mbox_* layers
755 | type: "Normalize"
756 | bottom: "conv4_3"
757 | top: "conv4_3_norm"
758 | norm_param {
759 | across_spatial: false  # normalise each spatial position on its own (across channels only)
760 | scale_filler {
761 | type: "constant"
762 | value: 20  # filler (initial) value of the scale parameter
763 | }
764 | channel_shared: false  # one scale value per channel instead of a single shared scalar
765 | }
766 | }
767 | layer {
768 | name: "conv4_3_norm_mbox_loc"
769 | type: "Convolution"
770 | bottom: "conv4_3_norm"
771 | top: "conv4_3_norm_mbox_loc"
772 | param {
773 | lr_mult: 1
774 | decay_mult: 1
775 | }
776 | param {
777 | lr_mult: 2
778 | decay_mult: 0
779 | }
780 | convolution_param {
781 | num_output: 12
782 | pad: 1
783 | kernel_size: 3
784 | stride: 1
785 | weight_filler {
786 | type: "xavier"
787 | }
788 | bias_filler {
789 | type: "constant"
790 | value: 0
791 | }
792 | }
793 | }
794 | layer {
795 | name: "conv4_3_norm_mbox_loc_perm"
796 | type: "Permute"
797 | bottom: "conv4_3_norm_mbox_loc"
798 | top: "conv4_3_norm_mbox_loc_perm"
799 | permute_param {
800 | order: 0
801 | order: 2
802 | order: 3
803 | order: 1
804 | }
805 | }
806 | layer {
807 | name: "conv4_3_norm_mbox_loc_flat"
808 | type: "Flatten"
809 | bottom: "conv4_3_norm_mbox_loc_perm"
810 | top: "conv4_3_norm_mbox_loc_flat"
811 | flatten_param {
812 | axis: 1
813 | }
814 | }
815 | layer {
816 | name: "conv4_3_norm_mbox_conf"
817 | type: "Convolution"
818 | bottom: "conv4_3_norm"
819 | top: "conv4_3_norm_mbox_conf"
820 | param {
821 | lr_mult: 1
822 | decay_mult: 1
823 | }
824 | param {
825 | lr_mult: 2
826 | decay_mult: 0
827 | }
828 | convolution_param {
829 | num_output: 6
830 | pad: 1
831 | kernel_size: 3
832 | stride: 1
833 | weight_filler {
834 | type: "xavier"
835 | }
836 | bias_filler {
837 | type: "constant"
838 | value: 0
839 | }
840 | }
841 | }
842 | layer {
843 | name: "conv4_3_norm_mbox_conf_perm"
844 | type: "Permute"
845 | bottom: "conv4_3_norm_mbox_conf"
846 | top: "conv4_3_norm_mbox_conf_perm"
847 | permute_param {
848 | order: 0
849 | order: 2
850 | order: 3
851 | order: 1
852 | }
853 | }
854 | layer {
855 | name: "conv4_3_norm_mbox_conf_flat"
856 | type: "Flatten"
857 | bottom: "conv4_3_norm_mbox_conf_perm"
858 | top: "conv4_3_norm_mbox_conf_flat"
859 | flatten_param {
860 | axis: 1
861 | }
862 | }
863 | layer {
864 | name: "conv4_3_norm_mbox_priorbox"  # prior (default) boxes for the conv4_3_norm feature map
865 | type: "PriorBox"
866 | bottom: "conv4_3_norm"  # feature map the priors are laid out on
867 | bottom: "data"  # network input blob
868 | top: "conv4_3_norm_mbox_priorbox"
869 | prior_box_param {
870 | min_size: 30.0  # no max_size here, unlike the other PriorBox layers in this file
871 | aspect_ratio: 2  # with the implicit 1:1 box and flip: 3 priors per location (12 loc / 6 conf outputs in the sibling heads)
872 | flip: true  # also use the reciprocal aspect ratio (1/2)
873 | clip: true  # clip prior coordinates to the image bounds
874 | variance: 0.1  # four variance entries: 0.1, 0.1, 0.2, 0.2
875 | variance: 0.1
876 | variance: 0.2
877 | variance: 0.2
878 | }
879 | }
880 | layer {
881 | name: "fc7_mbox_loc"
882 | type: "Convolution"
883 | bottom: "fc7"
884 | top: "fc7_mbox_loc"
885 | param {
886 | lr_mult: 1
887 | decay_mult: 1
888 | }
889 | param {
890 | lr_mult: 2
891 | decay_mult: 0
892 | }
893 | convolution_param {
894 | num_output: 24
895 | pad: 1
896 | kernel_size: 3
897 | stride: 1
898 | weight_filler {
899 | type: "xavier"
900 | }
901 | bias_filler {
902 | type: "constant"
903 | value: 0
904 | }
905 | }
906 | }
907 | layer {
908 | name: "fc7_mbox_loc_perm"
909 | type: "Permute"
910 | bottom: "fc7_mbox_loc"
911 | top: "fc7_mbox_loc_perm"
912 | permute_param {
913 | order: 0
914 | order: 2
915 | order: 3
916 | order: 1
917 | }
918 | }
919 | layer {
920 | name: "fc7_mbox_loc_flat"
921 | type: "Flatten"
922 | bottom: "fc7_mbox_loc_perm"
923 | top: "fc7_mbox_loc_flat"
924 | flatten_param {
925 | axis: 1
926 | }
927 | }
928 | layer {
929 | name: "fc7_mbox_conf"
930 | type: "Convolution"
931 | bottom: "fc7"
932 | top: "fc7_mbox_conf"
933 | param {
934 | lr_mult: 1
935 | decay_mult: 1
936 | }
937 | param {
938 | lr_mult: 2
939 | decay_mult: 0
940 | }
941 | convolution_param {
942 | num_output: 12
943 | pad: 1
944 | kernel_size: 3
945 | stride: 1
946 | weight_filler {
947 | type: "xavier"
948 | }
949 | bias_filler {
950 | type: "constant"
951 | value: 0
952 | }
953 | }
954 | }
955 | layer {
956 | name: "fc7_mbox_conf_perm"
957 | type: "Permute"
958 | bottom: "fc7_mbox_conf"
959 | top: "fc7_mbox_conf_perm"
960 | permute_param {
961 | order: 0
962 | order: 2
963 | order: 3
964 | order: 1
965 | }
966 | }
967 | layer {
968 | name: "fc7_mbox_conf_flat"
969 | type: "Flatten"
970 | bottom: "fc7_mbox_conf_perm"
971 | top: "fc7_mbox_conf_flat"
972 | flatten_param {
973 | axis: 1
974 | }
975 | }
976 | layer {
977 | name: "fc7_mbox_priorbox"  # prior (default) boxes for the fc7 feature map
978 | type: "PriorBox"
979 | bottom: "fc7"  # feature map the priors are laid out on
980 | bottom: "data"  # network input blob
981 | top: "fc7_mbox_priorbox"
982 | prior_box_param {
983 | min_size: 60.0
984 | max_size: 114.0  # having a max_size adds one extra 1:1 prior
985 | aspect_ratio: 2  # ratios 2 and 3, their flips, and the two 1:1 boxes: 6 priors per location
986 | aspect_ratio: 3  # (consistent with 24 loc / 12 conf outputs in fc7_mbox_loc / fc7_mbox_conf)
987 | flip: true  # also use the reciprocal aspect ratios (1/2, 1/3)
988 | clip: true  # clip prior coordinates to the image bounds
989 | variance: 0.1  # four variance entries: 0.1, 0.1, 0.2, 0.2
990 | variance: 0.1
991 | variance: 0.2
992 | variance: 0.2
993 | }
994 | }
995 | layer {
996 | name: "conv6_2_mbox_loc"
997 | type: "Convolution"
998 | bottom: "conv6_2"
999 | top: "conv6_2_mbox_loc"
1000 | param {
1001 | lr_mult: 1
1002 | decay_mult: 1
1003 | }
1004 | param {
1005 | lr_mult: 2
1006 | decay_mult: 0
1007 | }
1008 | convolution_param {
1009 | num_output: 24
1010 | pad: 1
1011 | kernel_size: 3
1012 | stride: 1
1013 | weight_filler {
1014 | type: "xavier"
1015 | }
1016 | bias_filler {
1017 | type: "constant"
1018 | value: 0
1019 | }
1020 | }
1021 | }
1022 | layer {
1023 | name: "conv6_2_mbox_loc_perm"
1024 | type: "Permute"
1025 | bottom: "conv6_2_mbox_loc"
1026 | top: "conv6_2_mbox_loc_perm"
1027 | permute_param {
1028 | order: 0
1029 | order: 2
1030 | order: 3
1031 | order: 1
1032 | }
1033 | }
1034 | layer {
1035 | name: "conv6_2_mbox_loc_flat"
1036 | type: "Flatten"
1037 | bottom: "conv6_2_mbox_loc_perm"
1038 | top: "conv6_2_mbox_loc_flat"
1039 | flatten_param {
1040 | axis: 1
1041 | }
1042 | }
1043 | layer {
1044 | name: "conv6_2_mbox_conf"
1045 | type: "Convolution"
1046 | bottom: "conv6_2"
1047 | top: "conv6_2_mbox_conf"
1048 | param {
1049 | lr_mult: 1
1050 | decay_mult: 1
1051 | }
1052 | param {
1053 | lr_mult: 2
1054 | decay_mult: 0
1055 | }
1056 | convolution_param {
1057 | num_output: 12
1058 | pad: 1
1059 | kernel_size: 3
1060 | stride: 1
1061 | weight_filler {
1062 | type: "xavier"
1063 | }
1064 | bias_filler {
1065 | type: "constant"
1066 | value: 0
1067 | }
1068 | }
1069 | }
1070 | layer {
1071 | name: "conv6_2_mbox_conf_perm"
1072 | type: "Permute"
1073 | bottom: "conv6_2_mbox_conf"
1074 | top: "conv6_2_mbox_conf_perm"
1075 | permute_param {
1076 | order: 0
1077 | order: 2
1078 | order: 3
1079 | order: 1
1080 | }
1081 | }
1082 | layer {
1083 | name: "conv6_2_mbox_conf_flat"
1084 | type: "Flatten"
1085 | bottom: "conv6_2_mbox_conf_perm"
1086 | top: "conv6_2_mbox_conf_flat"
1087 | flatten_param {
1088 | axis: 1
1089 | }
1090 | }
1091 | layer {
1092 | name: "conv6_2_mbox_priorbox"
1093 | type: "PriorBox"
1094 | bottom: "conv6_2"
1095 | bottom: "data"
1096 | top: "conv6_2_mbox_priorbox"
1097 | prior_box_param {
1098 | min_size: 114.0
1099 | max_size: 168.0
1100 | aspect_ratio: 2
1101 | aspect_ratio: 3
1102 | flip: true
1103 | clip: true
1104 | variance: 0.1
1105 | variance: 0.1
1106 | variance: 0.2
1107 | variance: 0.2
1108 | }
1109 | }
1110 | layer {
1111 | name: "conv7_2_mbox_loc"
1112 | type: "Convolution"
1113 | bottom: "conv7_2"
1114 | top: "conv7_2_mbox_loc"
1115 | param {
1116 | lr_mult: 1
1117 | decay_mult: 1
1118 | }
1119 | param {
1120 | lr_mult: 2
1121 | decay_mult: 0
1122 | }
1123 | convolution_param {
1124 | num_output: 24
1125 | pad: 1
1126 | kernel_size: 3
1127 | stride: 1
1128 | weight_filler {
1129 | type: "xavier"
1130 | }
1131 | bias_filler {
1132 | type: "constant"
1133 | value: 0
1134 | }
1135 | }
1136 | }
1137 | layer {
1138 | name: "conv7_2_mbox_loc_perm"
1139 | type: "Permute"
1140 | bottom: "conv7_2_mbox_loc"
1141 | top: "conv7_2_mbox_loc_perm"
1142 | permute_param {
1143 | order: 0
1144 | order: 2
1145 | order: 3
1146 | order: 1
1147 | }
1148 | }
1149 | layer {
1150 | name: "conv7_2_mbox_loc_flat"
1151 | type: "Flatten"
1152 | bottom: "conv7_2_mbox_loc_perm"
1153 | top: "conv7_2_mbox_loc_flat"
1154 | flatten_param {
1155 | axis: 1
1156 | }
1157 | }
1158 | layer {
1159 | name: "conv7_2_mbox_conf"
1160 | type: "Convolution"
1161 | bottom: "conv7_2"
1162 | top: "conv7_2_mbox_conf"
1163 | param {
1164 | lr_mult: 1
1165 | decay_mult: 1
1166 | }
1167 | param {
1168 | lr_mult: 2
1169 | decay_mult: 0
1170 | }
1171 | convolution_param {
1172 | num_output: 12
1173 | pad: 1
1174 | kernel_size: 3
1175 | stride: 1
1176 | weight_filler {
1177 | type: "xavier"
1178 | }
1179 | bias_filler {
1180 | type: "constant"
1181 | value: 0
1182 | }
1183 | }
1184 | }
1185 | layer {
1186 | name: "conv7_2_mbox_conf_perm"
1187 | type: "Permute"
1188 | bottom: "conv7_2_mbox_conf"
1189 | top: "conv7_2_mbox_conf_perm"
1190 | permute_param {
1191 | order: 0
1192 | order: 2
1193 | order: 3
1194 | order: 1
1195 | }
1196 | }
1197 | layer {
1198 | name: "conv7_2_mbox_conf_flat"
1199 | type: "Flatten"
1200 | bottom: "conv7_2_mbox_conf_perm"
1201 | top: "conv7_2_mbox_conf_flat"
1202 | flatten_param {
1203 | axis: 1
1204 | }
1205 | }
1206 | layer {
1207 | name: "conv7_2_mbox_priorbox"
1208 | type: "PriorBox"
1209 | bottom: "conv7_2"
1210 | bottom: "data"
1211 | top: "conv7_2_mbox_priorbox"
1212 | prior_box_param {
1213 | min_size: 168.0
1214 | max_size: 222.0
1215 | aspect_ratio: 2
1216 | aspect_ratio: 3
1217 | flip: true
1218 | clip: true
1219 | variance: 0.1
1220 | variance: 0.1
1221 | variance: 0.2
1222 | variance: 0.2
1223 | }
1224 | }
1225 | layer {
1226 | name: "conv8_2_mbox_loc"
1227 | type: "Convolution"
1228 | bottom: "conv8_2"
1229 | top: "conv8_2_mbox_loc"
1230 | param {
1231 | lr_mult: 1
1232 | decay_mult: 1
1233 | }
1234 | param {
1235 | lr_mult: 2
1236 | decay_mult: 0
1237 | }
1238 | convolution_param {
1239 | num_output: 24
1240 | pad: 1
1241 | kernel_size: 3
1242 | stride: 1
1243 | weight_filler {
1244 | type: "xavier"
1245 | }
1246 | bias_filler {
1247 | type: "constant"
1248 | value: 0
1249 | }
1250 | }
1251 | }
1252 | layer {
1253 | name: "conv8_2_mbox_loc_perm"
1254 | type: "Permute"
1255 | bottom: "conv8_2_mbox_loc"
1256 | top: "conv8_2_mbox_loc_perm"
1257 | permute_param {
1258 | order: 0
1259 | order: 2
1260 | order: 3
1261 | order: 1
1262 | }
1263 | }
1264 | layer {
1265 | name: "conv8_2_mbox_loc_flat"
1266 | type: "Flatten"
1267 | bottom: "conv8_2_mbox_loc_perm"
1268 | top: "conv8_2_mbox_loc_flat"
1269 | flatten_param {
1270 | axis: 1
1271 | }
1272 | }
1273 | layer {
1274 | name: "conv8_2_mbox_conf"
1275 | type: "Convolution"
1276 | bottom: "conv8_2"
1277 | top: "conv8_2_mbox_conf"
1278 | param {
1279 | lr_mult: 1
1280 | decay_mult: 1
1281 | }
1282 | param {
1283 | lr_mult: 2
1284 | decay_mult: 0
1285 | }
1286 | convolution_param {
1287 | num_output: 12
1288 | pad: 1
1289 | kernel_size: 3
1290 | stride: 1
1291 | weight_filler {
1292 | type: "xavier"
1293 | }
1294 | bias_filler {
1295 | type: "constant"
1296 | value: 0
1297 | }
1298 | }
1299 | }
1300 | layer {
1301 | name: "conv8_2_mbox_conf_perm"
1302 | type: "Permute"
1303 | bottom: "conv8_2_mbox_conf"
1304 | top: "conv8_2_mbox_conf_perm"
1305 | permute_param {
1306 | order: 0
1307 | order: 2
1308 | order: 3
1309 | order: 1
1310 | }
1311 | }
1312 | layer {
1313 | name: "conv8_2_mbox_conf_flat"
1314 | type: "Flatten"
1315 | bottom: "conv8_2_mbox_conf_perm"
1316 | top: "conv8_2_mbox_conf_flat"
1317 | flatten_param {
1318 | axis: 1
1319 | }
1320 | }
1321 | layer {
1322 | name: "conv8_2_mbox_priorbox"
1323 | type: "PriorBox"
1324 | bottom: "conv8_2"
1325 | bottom: "data"
1326 | top: "conv8_2_mbox_priorbox"
1327 | prior_box_param {
1328 | min_size: 222.0
1329 | max_size: 276.0
1330 | aspect_ratio: 2
1331 | aspect_ratio: 3
1332 | flip: true
1333 | clip: true
1334 | variance: 0.1
1335 | variance: 0.1
1336 | variance: 0.2
1337 | variance: 0.2
1338 | }
1339 | }
1340 | layer {
1341 | name: "pool6_mbox_loc"
1342 | type: "Convolution"
1343 | bottom: "pool6"
1344 | top: "pool6_mbox_loc"
1345 | param {
1346 | lr_mult: 1
1347 | decay_mult: 1
1348 | }
1349 | param {
1350 | lr_mult: 2
1351 | decay_mult: 0
1352 | }
1353 | convolution_param {
1354 | num_output: 24
1355 | pad: 1
1356 | kernel_size: 3
1357 | stride: 1
1358 | weight_filler {
1359 | type: "xavier"
1360 | }
1361 | bias_filler {
1362 | type: "constant"
1363 | value: 0
1364 | }
1365 | }
1366 | }
1367 | layer {
1368 | name: "pool6_mbox_loc_perm"
1369 | type: "Permute"
1370 | bottom: "pool6_mbox_loc"
1371 | top: "pool6_mbox_loc_perm"
1372 | permute_param {
1373 | order: 0
1374 | order: 2
1375 | order: 3
1376 | order: 1
1377 | }
1378 | }
1379 | layer {
1380 | name: "pool6_mbox_loc_flat"
1381 | type: "Flatten"
1382 | bottom: "pool6_mbox_loc_perm"
1383 | top: "pool6_mbox_loc_flat"
1384 | flatten_param {
1385 | axis: 1
1386 | }
1387 | }
1388 | layer {
1389 | name: "pool6_mbox_conf"
1390 | type: "Convolution"
1391 | bottom: "pool6"
1392 | top: "pool6_mbox_conf"
1393 | param {
1394 | lr_mult: 1
1395 | decay_mult: 1
1396 | }
1397 | param {
1398 | lr_mult: 2
1399 | decay_mult: 0
1400 | }
1401 | convolution_param {
1402 | num_output: 12
1403 | pad: 1
1404 | kernel_size: 3
1405 | stride: 1
1406 | weight_filler {
1407 | type: "xavier"
1408 | }
1409 | bias_filler {
1410 | type: "constant"
1411 | value: 0
1412 | }
1413 | }
1414 | }
1415 | layer {
1416 | name: "pool6_mbox_conf_perm"
1417 | type: "Permute"
1418 | bottom: "pool6_mbox_conf"
1419 | top: "pool6_mbox_conf_perm"
1420 | permute_param {
1421 | order: 0
1422 | order: 2
1423 | order: 3
1424 | order: 1
1425 | }
1426 | }
1427 | layer {
1428 | name: "pool6_mbox_conf_flat"
1429 | type: "Flatten"
1430 | bottom: "pool6_mbox_conf_perm"
1431 | top: "pool6_mbox_conf_flat"
1432 | flatten_param {
1433 | axis: 1
1434 | }
1435 | }
1436 | layer {
1437 | name: "pool6_mbox_priorbox"
1438 | type: "PriorBox"
1439 | bottom: "pool6"
1440 | bottom: "data"
1441 | top: "pool6_mbox_priorbox"
1442 | prior_box_param {
1443 | min_size: 276.0
1444 | max_size: 330.0
1445 | aspect_ratio: 2
1446 | aspect_ratio: 3
1447 | flip: true
1448 | clip: true
1449 | variance: 0.1
1450 | variance: 0.1
1451 | variance: 0.2
1452 | variance: 0.2
1453 | }
1454 | }
1455 | layer {
1456 | name: "mbox_loc"
1457 | type: "Concat"
1458 | bottom: "conv4_3_norm_mbox_loc_flat"
1459 | bottom: "fc7_mbox_loc_flat"
1460 | bottom: "conv6_2_mbox_loc_flat"
1461 | bottom: "conv7_2_mbox_loc_flat"
1462 | bottom: "conv8_2_mbox_loc_flat"
1463 | bottom: "pool6_mbox_loc_flat"
1464 | top: "mbox_loc"
1465 | concat_param {
1466 | axis: 1
1467 | }
1468 | }
1469 | layer {
1470 | name: "mbox_conf"
1471 | type: "Concat"
1472 | bottom: "conv4_3_norm_mbox_conf_flat"
1473 | bottom: "fc7_mbox_conf_flat"
1474 | bottom: "conv6_2_mbox_conf_flat"
1475 | bottom: "conv7_2_mbox_conf_flat"
1476 | bottom: "conv8_2_mbox_conf_flat"
1477 | bottom: "pool6_mbox_conf_flat"
1478 | top: "mbox_conf"
1479 | concat_param {
1480 | axis: 1
1481 | }
1482 | }
1483 | layer {
1484 | name: "mbox_priorbox"
1485 | type: "Concat"
1486 | bottom: "conv4_3_norm_mbox_priorbox"
1487 | bottom: "fc7_mbox_priorbox"
1488 | bottom: "conv6_2_mbox_priorbox"
1489 | bottom: "conv7_2_mbox_priorbox"
1490 | bottom: "conv8_2_mbox_priorbox"
1491 | bottom: "pool6_mbox_priorbox"
1492 | top: "mbox_priorbox"
1493 | concat_param {
1494 | axis: 2
1495 | }
1496 | }
1497 | layer {
1498 | name: "mbox_conf_reshape"  # regroups the flat mbox_conf vector into per-prior class scores
1499 | type: "Reshape"
1500 | bottom: "mbox_conf"
1501 | top: "mbox_conf_reshape"
1502 | reshape_param {
1503 | shape {
1504 | dim: 0  # 0 = copy this dimension (batch) from the bottom blob
1505 | dim: -1  # -1 = inferred from the remaining element count
1506 | dim: 2  # same value as num_classes in detection_out
1507 | }
1508 | }
1509 | }
1510 | layer {
1511 | name: "mbox_conf_softmax"  # turns the reshaped confidences into probabilities
1512 | type: "Softmax"
1513 | bottom: "mbox_conf_reshape"
1514 | top: "mbox_conf_softmax"
1515 | softmax_param {
1516 | axis: 2  # softmax over the last axis of mbox_conf_reshape (the one of size 2)
1517 | }
1518 | }
1519 | layer {
1520 | name: "mbox_conf_flatten"  # flattens the softmax output again for detection_out
1521 | type: "Flatten"
1522 | bottom: "mbox_conf_softmax"
1523 | top: "mbox_conf_flatten"
1524 | flatten_param {
1525 | axis: 1  # collapse every axis after the batch axis into one
1526 | }
1527 | }
1528 | layer {
1529 | name: "detection_out"  # combines box regressions, class scores and priors into final detections (with NMS)
1530 | type: "DetectionOutput"
1531 | bottom: "mbox_loc"  # concatenated box regression outputs
1532 | bottom: "mbox_conf_flatten"  # softmaxed, flattened class scores
1533 | bottom: "mbox_priorbox"  # concatenated prior boxes
1534 | top: "detection_out"
1535 | include {
1536 | phase: TEST
1537 | }
1538 | detection_output_param {
1539 | num_classes: 2  # matches the two items (labels 0 and 1) in labelmap_voc.prototxt
1540 | share_location: true
1541 | background_label_id: 0
1542 | nms_param {
1543 | nms_threshold: 0.45
1544 | top_k: 400  # NOTE(review): presumably the cap on candidates going into NMS - confirm against caffe.proto
1545 | }
1546 | save_output_param {
1547 | output_directory: "/home/chenxp/data/VOCdevkit/results/SSD_300x300"  # absolute, machine-specific path
1548 | output_name_prefix: "comp4_det_test_"
1549 | output_format: "VOC"
1550 | label_map_file: "data/scenetext/labelmap_voc.prototxt"  # relative path - NOTE(review): presumably resolved against the directory Caffe is started from; confirm
1551 | name_size_file: "data/scenetext/test_name_size.txt"  # relative path, same caveat as label_map_file
1552 | num_test_image: 70  # same value as test_iter in solver.prototxt
1553 | }
1554 | code_type: CENTER_SIZE
1555 | keep_top_k: 200  # NOTE(review): presumably the number of detections kept per image after NMS - confirm
1556 | confidence_threshold: 0.01
1557 | }
1558 | }
1559 |
--------------------------------------------------------------------------------
/training_file/labelmap_voc.prototxt:
--------------------------------------------------------------------------------
1 | item {
2 | name: "none_of_the_above"
3 | label: 0  # same id as background_label_id in deploy.prototxt
4 | display_name: "background"
5 | }
6 | item {
7 | name: "object"  # NOTE(review): presumably must match the object name used in the annotation files - confirm
8 | label: 1  # the only non-background class
9 | display_name: "text"
10 | }
--------------------------------------------------------------------------------
/training_file/solver.prototxt:
--------------------------------------------------------------------------------
1 | train_net: "models/VGGNet/scenetext/SSD_300x300/train.prototxt"  # relative path, like snapshot_prefix below
2 | test_net: "models/VGGNet/scenetext/SSD_300x300/test.prototxt"
3 | test_iter: 70  # test batches per evaluation; test.prototxt uses batch_size 1
4 | test_interval: 100  # run an evaluation every 100 training iterations
5 | base_lr: 0.0001
6 | display: 10  # log every 10 iterations
7 | max_iter: 10000
8 | lr_policy: "step"  # with gamma and stepsize below: multiply the learning rate by 0.1 every 2000 iterations
9 | gamma: 0.1
10 | momentum: 0.9
11 | weight_decay: 0.0005
12 | stepsize: 2000
13 | snapshot: 2000  # write a snapshot every 2000 iterations
14 | snapshot_prefix: "models/VGGNet/scenetext/SSD_300x300/VGG_scenetext_SSD_300x300"
15 | solver_mode: GPU
16 | device_id: 0
17 | debug_info: false
18 | snapshot_after_train: true  # also snapshot when training finishes
19 | test_initialization: false  # skip the evaluation pass before the first training iteration
20 | average_loss: 10  # reported loss is averaged over the last 10 iterations
21 | iter_size: 1  # no gradient accumulation across batches
22 | type: "SGD"
23 | eval_type: "detection"
24 | ap_version: "11point"
25 |
--------------------------------------------------------------------------------
/training_file/test.prototxt:
--------------------------------------------------------------------------------
1 | name: "VGG_scenetext_SSD_300x300_test"
2 | layer {
3 | name: "data"  # test-phase input layer reading annotated images from LMDB
4 | type: "AnnotatedData"
5 | top: "data"
6 | top: "label"
7 | include {
8 | phase: TEST
9 | }
10 | transform_param {
11 | mean_value: 104  # three per-channel mean values - NOTE(review): presumably B, G, R order; confirm
12 | mean_value: 117
13 | mean_value: 123
14 | resize_param {
15 | prob: 1  # resize is always applied
16 | resize_mode: WARP  # stretch to the target size; aspect ratio is not preserved
17 | height: 300  # same 300x300 size as input_shape in deploy.prototxt
18 | width: 300
19 | interp_mode: LINEAR
20 | }
21 | }
22 | data_param {
23 | source: "examples/scenetext_test_lmdb"  # relative path
24 | batch_size: 1  # one image per test iteration
25 | backend: LMDB
26 | }
27 | annotated_data_param {
28 | batch_sampler {
29 | }
30 | label_map_file: "data/scenetext/labelmap_voc.prototxt"  # same label map referenced by detection_out in deploy.prototxt
31 | }
32 | }
33 | layer {
34 | name: "conv1_1"
35 | type: "Convolution"
36 | bottom: "data"
37 | top: "conv1_1"
38 | param {
39 | lr_mult: 0
40 | decay_mult: 0
41 | }
42 | param {
43 | lr_mult: 0
44 | decay_mult: 0
45 | }
46 | convolution_param {
47 | num_output: 64
48 | pad: 1
49 | kernel_size: 3
50 | weight_filler {
51 | type: "xavier"
52 | }
53 | bias_filler {
54 | type: "constant"
55 | value: 0
56 | }
57 | }
58 | }
59 | layer {
60 | name: "relu1_1"
61 | type: "ReLU"
62 | bottom: "conv1_1"
63 | top: "conv1_1"
64 | }
65 | layer {
66 | name: "conv1_2"
67 | type: "Convolution"
68 | bottom: "conv1_1"
69 | top: "conv1_2"
70 | param {
71 | lr_mult: 0
72 | decay_mult: 0
73 | }
74 | param {
75 | lr_mult: 0
76 | decay_mult: 0
77 | }
78 | convolution_param {
79 | num_output: 64
80 | pad: 1
81 | kernel_size: 3
82 | weight_filler {
83 | type: "xavier"
84 | }
85 | bias_filler {
86 | type: "constant"
87 | value: 0
88 | }
89 | }
90 | }
91 | layer {
92 | name: "relu1_2"
93 | type: "ReLU"
94 | bottom: "conv1_2"
95 | top: "conv1_2"
96 | }
97 | layer {
98 | name: "pool1"
99 | type: "Pooling"
100 | bottom: "conv1_2"
101 | top: "pool1"
102 | pooling_param {
103 | pool: MAX
104 | kernel_size: 2
105 | stride: 2
106 | }
107 | }
108 | layer {  # conv2_1: 3x3 convolution over "pool1", 128 outputs, pad 1
109 | name: "conv2_1"
110 | type: "Convolution"
111 | bottom: "pool1"
112 | top: "conv2_1"
113 | param {  # both param blocks in this layer use lr_mult 0 / decay_mult 0 (conv3_1 onward use 1/1 and 2/0)
114 | lr_mult: 0
115 | decay_mult: 0
116 | }
117 | param {
118 | lr_mult: 0
119 | decay_mult: 0
120 | }
121 | convolution_param {
122 | num_output: 128
123 | pad: 1
124 | kernel_size: 3
125 | weight_filler {
126 | type: "xavier"
127 | }
128 | bias_filler {
129 | type: "constant"
130 | value: 0
131 | }
132 | }
133 | }
134 | layer {  # relu2_1: ReLU with the same blob ("conv2_1") as bottom and top
135 | name: "relu2_1"
136 | type: "ReLU"
137 | bottom: "conv2_1"
138 | top: "conv2_1"
139 | }
140 | layer {  # conv2_2: 3x3 convolution over "conv2_1", 128 outputs, pad 1
141 | name: "conv2_2"
142 | type: "Convolution"
143 | bottom: "conv2_1"
144 | top: "conv2_2"
145 | param {  # lr_mult 0 / decay_mult 0 on both param blocks, as in conv2_1
146 | lr_mult: 0
147 | decay_mult: 0
148 | }
149 | param {
150 | lr_mult: 0
151 | decay_mult: 0
152 | }
153 | convolution_param {
154 | num_output: 128
155 | pad: 1
156 | kernel_size: 3
157 | weight_filler {
158 | type: "xavier"
159 | }
160 | bias_filler {
161 | type: "constant"
162 | value: 0
163 | }
164 | }
165 | }
166 | layer {  # relu2_2: ReLU with the same blob ("conv2_2") as bottom and top
167 | name: "relu2_2"
168 | type: "ReLU"
169 | bottom: "conv2_2"
170 | top: "conv2_2"
171 | }
172 | layer {  # pool2: 2x2 max pooling with stride 2 over "conv2_2"
173 | name: "pool2"
174 | type: "Pooling"
175 | bottom: "conv2_2"
176 | top: "pool2"
177 | pooling_param {
178 | pool: MAX
179 | kernel_size: 2
180 | stride: 2
181 | }
182 | }
183 | layer {  # conv3_1: 3x3 convolution over "pool2", 256 outputs, pad 1
184 | name: "conv3_1"
185 | type: "Convolution"
186 | bottom: "pool2"
187 | top: "conv3_1"
188 | param {  # first param block: lr_mult 1, decay_mult 1 (presumably the weights; confirm Caffe param order)
189 | lr_mult: 1
190 | decay_mult: 1
191 | }
192 | param {  # second param block: lr_mult 2, decay_mult 0 (presumably the bias; confirm)
193 | lr_mult: 2
194 | decay_mult: 0
195 | }
196 | convolution_param {
197 | num_output: 256
198 | pad: 1
199 | kernel_size: 3
200 | weight_filler {
201 | type: "xavier"
202 | }
203 | bias_filler {
204 | type: "constant"
205 | value: 0
206 | }
207 | }
208 | }
209 | layer {  # relu3_1: ReLU with the same blob ("conv3_1") as bottom and top
210 | name: "relu3_1"
211 | type: "ReLU"
212 | bottom: "conv3_1"
213 | top: "conv3_1"
214 | }
215 | layer {  # conv3_2: 3x3 convolution over "conv3_1", 256 outputs, pad 1
216 | name: "conv3_2"
217 | type: "Convolution"
218 | bottom: "conv3_1"
219 | top: "conv3_2"
220 | param {
221 | lr_mult: 1
222 | decay_mult: 1
223 | }
224 | param {
225 | lr_mult: 2
226 | decay_mult: 0
227 | }
228 | convolution_param {
229 | num_output: 256
230 | pad: 1
231 | kernel_size: 3
232 | weight_filler {
233 | type: "xavier"
234 | }
235 | bias_filler {
236 | type: "constant"
237 | value: 0
238 | }
239 | }
240 | }
241 | layer {  # relu3_2: ReLU with the same blob ("conv3_2") as bottom and top
242 | name: "relu3_2"
243 | type: "ReLU"
244 | bottom: "conv3_2"
245 | top: "conv3_2"
246 | }
247 | layer {  # conv3_3: 3x3 convolution over "conv3_2", 256 outputs, pad 1
248 | name: "conv3_3"
249 | type: "Convolution"
250 | bottom: "conv3_2"
251 | top: "conv3_3"
252 | param {
253 | lr_mult: 1
254 | decay_mult: 1
255 | }
256 | param {
257 | lr_mult: 2
258 | decay_mult: 0
259 | }
260 | convolution_param {
261 | num_output: 256
262 | pad: 1
263 | kernel_size: 3
264 | weight_filler {
265 | type: "xavier"
266 | }
267 | bias_filler {
268 | type: "constant"
269 | value: 0
270 | }
271 | }
272 | }
273 | layer {  # relu3_3: ReLU with the same blob ("conv3_3") as bottom and top
274 | name: "relu3_3"
275 | type: "ReLU"
276 | bottom: "conv3_3"
277 | top: "conv3_3"
278 | }
279 | layer {  # pool3: 2x2 max pooling with stride 2 over "conv3_3"
280 | name: "pool3"
281 | type: "Pooling"
282 | bottom: "conv3_3"
283 | top: "pool3"
284 | pooling_param {
285 | pool: MAX
286 | kernel_size: 2
287 | stride: 2
288 | }
289 | }
290 | layer {  # conv4_1: 3x3 convolution over "pool3", 512 outputs, pad 1
291 | name: "conv4_1"
292 | type: "Convolution"
293 | bottom: "pool3"
294 | top: "conv4_1"
295 | param {
296 | lr_mult: 1
297 | decay_mult: 1
298 | }
299 | param {
300 | lr_mult: 2
301 | decay_mult: 0
302 | }
303 | convolution_param {
304 | num_output: 512
305 | pad: 1
306 | kernel_size: 3
307 | weight_filler {
308 | type: "xavier"
309 | }
310 | bias_filler {
311 | type: "constant"
312 | value: 0
313 | }
314 | }
315 | }
316 | layer {  # relu4_1: ReLU with the same blob ("conv4_1") as bottom and top
317 | name: "relu4_1"
318 | type: "ReLU"
319 | bottom: "conv4_1"
320 | top: "conv4_1"
321 | }
322 | layer {  # conv4_2: 3x3 convolution over "conv4_1", 512 outputs, pad 1
323 | name: "conv4_2"
324 | type: "Convolution"
325 | bottom: "conv4_1"
326 | top: "conv4_2"
327 | param {
328 | lr_mult: 1
329 | decay_mult: 1
330 | }
331 | param {
332 | lr_mult: 2
333 | decay_mult: 0
334 | }
335 | convolution_param {
336 | num_output: 512
337 | pad: 1
338 | kernel_size: 3
339 | weight_filler {
340 | type: "xavier"
341 | }
342 | bias_filler {
343 | type: "constant"
344 | value: 0
345 | }
346 | }
347 | }
348 | layer {  # relu4_2: ReLU with the same blob ("conv4_2") as bottom and top
349 | name: "relu4_2"
350 | type: "ReLU"
351 | bottom: "conv4_2"
352 | top: "conv4_2"
353 | }
354 | layer {  # conv4_3: 3x3 convolution over "conv4_2", 512 outputs; also feeds the conv4_3_norm layer further down
355 | name: "conv4_3"
356 | type: "Convolution"
357 | bottom: "conv4_2"
358 | top: "conv4_3"
359 | param {
360 | lr_mult: 1
361 | decay_mult: 1
362 | }
363 | param {
364 | lr_mult: 2
365 | decay_mult: 0
366 | }
367 | convolution_param {
368 | num_output: 512
369 | pad: 1
370 | kernel_size: 3
371 | weight_filler {
372 | type: "xavier"
373 | }
374 | bias_filler {
375 | type: "constant"
376 | value: 0
377 | }
378 | }
379 | }
380 | layer {  # relu4_3: ReLU with the same blob ("conv4_3") as bottom and top
381 | name: "relu4_3"
382 | type: "ReLU"
383 | bottom: "conv4_3"
384 | top: "conv4_3"
385 | }
386 | layer {  # pool4: 2x2 max pooling with stride 2 over "conv4_3"
387 | name: "pool4"
388 | type: "Pooling"
389 | bottom: "conv4_3"
390 | top: "pool4"
391 | pooling_param {
392 | pool: MAX
393 | kernel_size: 2
394 | stride: 2
395 | }
396 | }
397 | layer {  # conv5_1: 3x3 convolution over "pool4", 512 outputs, pad 1
398 | name: "conv5_1"
399 | type: "Convolution"
400 | bottom: "pool4"
401 | top: "conv5_1"
402 | param {
403 | lr_mult: 1
404 | decay_mult: 1
405 | }
406 | param {
407 | lr_mult: 2
408 | decay_mult: 0
409 | }
410 | convolution_param {
411 | num_output: 512
412 | pad: 1
413 | kernel_size: 3
414 | weight_filler {
415 | type: "xavier"
416 | }
417 | bias_filler {
418 | type: "constant"
419 | value: 0
420 | }
421 | }
422 | }
423 | layer {  # relu5_1: ReLU with the same blob ("conv5_1") as bottom and top
424 | name: "relu5_1"
425 | type: "ReLU"
426 | bottom: "conv5_1"
427 | top: "conv5_1"
428 | }
429 | layer {  # conv5_2: 3x3 convolution over "conv5_1", 512 outputs, pad 1
430 | name: "conv5_2"
431 | type: "Convolution"
432 | bottom: "conv5_1"
433 | top: "conv5_2"
434 | param {
435 | lr_mult: 1
436 | decay_mult: 1
437 | }
438 | param {
439 | lr_mult: 2
440 | decay_mult: 0
441 | }
442 | convolution_param {
443 | num_output: 512
444 | pad: 1
445 | kernel_size: 3
446 | weight_filler {
447 | type: "xavier"
448 | }
449 | bias_filler {
450 | type: "constant"
451 | value: 0
452 | }
453 | }
454 | }
455 | layer {  # relu5_2: ReLU with the same blob ("conv5_2") as bottom and top
456 | name: "relu5_2"
457 | type: "ReLU"
458 | bottom: "conv5_2"
459 | top: "conv5_2"
460 | }
461 | layer {  # conv5_3: 3x3 convolution over "conv5_2", 512 outputs, pad 1
462 | name: "conv5_3"
463 | type: "Convolution"
464 | bottom: "conv5_2"
465 | top: "conv5_3"
466 | param {
467 | lr_mult: 1
468 | decay_mult: 1
469 | }
470 | param {
471 | lr_mult: 2
472 | decay_mult: 0
473 | }
474 | convolution_param {
475 | num_output: 512
476 | pad: 1
477 | kernel_size: 3
478 | weight_filler {
479 | type: "xavier"
480 | }
481 | bias_filler {
482 | type: "constant"
483 | value: 0
484 | }
485 | }
486 | }
487 | layer {  # relu5_3: ReLU with the same blob ("conv5_3") as bottom and top
488 | name: "relu5_3"
489 | type: "ReLU"
490 | bottom: "conv5_3"
491 | top: "conv5_3"
492 | }
493 | layer {  # pool5: 3x3 max pooling, stride 1, pad 1 (pool1-pool4 use 2x2 with stride 2)
494 | name: "pool5"
495 | type: "Pooling"
496 | bottom: "conv5_3"
497 | top: "pool5"
498 | pooling_param {
499 | pool: MAX
500 | kernel_size: 3
501 | stride: 1
502 | pad: 1
503 | }
504 | }
505 | layer {  # fc6: declared as a Convolution layer; 3x3 kernel, 1024 outputs, pad 6, dilation 6, over "pool5"
506 | name: "fc6"
507 | type: "Convolution"
508 | bottom: "pool5"
509 | top: "fc6"
510 | param {
511 | lr_mult: 1
512 | decay_mult: 1
513 | }
514 | param {
515 | lr_mult: 2
516 | decay_mult: 0
517 | }
518 | convolution_param {
519 | num_output: 1024
520 | pad: 6
521 | kernel_size: 3
522 | weight_filler {
523 | type: "xavier"
524 | }
525 | bias_filler {
526 | type: "constant"
527 | value: 0
528 | }
529 | dilation: 6
530 | }
531 | }
532 | layer {  # relu6: ReLU with the same blob ("fc6") as bottom and top
533 | name: "relu6"
534 | type: "ReLU"
535 | bottom: "fc6"
536 | top: "fc6"
537 | }
538 | layer {  # fc7: declared as a Convolution layer; 1x1 kernel, 1024 outputs, over "fc6"
539 | name: "fc7"
540 | type: "Convolution"
541 | bottom: "fc6"
542 | top: "fc7"
543 | param {
544 | lr_mult: 1
545 | decay_mult: 1
546 | }
547 | param {
548 | lr_mult: 2
549 | decay_mult: 0
550 | }
551 | convolution_param {
552 | num_output: 1024
553 | kernel_size: 1
554 | weight_filler {
555 | type: "xavier"
556 | }
557 | bias_filler {
558 | type: "constant"
559 | value: 0
560 | }
561 | }
562 | }
563 | layer {  # relu7: ReLU with the same blob ("fc7") as bottom and top
564 | name: "relu7"
565 | type: "ReLU"
566 | bottom: "fc7"
567 | top: "fc7"
568 | }
569 | layer {  # conv6_1: 1x1 convolution over "fc7", 256 outputs, stride 1
570 | name: "conv6_1"
571 | type: "Convolution"
572 | bottom: "fc7"
573 | top: "conv6_1"
574 | param {
575 | lr_mult: 1
576 | decay_mult: 1
577 | }
578 | param {
579 | lr_mult: 2
580 | decay_mult: 0
581 | }
582 | convolution_param {
583 | num_output: 256
584 | pad: 0
585 | kernel_size: 1
586 | stride: 1
587 | weight_filler {
588 | type: "xavier"
589 | }
590 | bias_filler {
591 | type: "constant"
592 | value: 0
593 | }
594 | }
595 | }
596 | layer {  # conv6_1_relu: ReLU with the same blob ("conv6_1") as bottom and top
597 | name: "conv6_1_relu"
598 | type: "ReLU"
599 | bottom: "conv6_1"
600 | top: "conv6_1"
601 | }
602 | layer {  # conv6_2: 3x3 convolution over "conv6_1", 512 outputs, pad 1, stride 2
603 | name: "conv6_2"
604 | type: "Convolution"
605 | bottom: "conv6_1"
606 | top: "conv6_2"
607 | param {
608 | lr_mult: 1
609 | decay_mult: 1
610 | }
611 | param {
612 | lr_mult: 2
613 | decay_mult: 0
614 | }
615 | convolution_param {
616 | num_output: 512
617 | pad: 1
618 | kernel_size: 3
619 | stride: 2
620 | weight_filler {
621 | type: "xavier"
622 | }
623 | bias_filler {
624 | type: "constant"
625 | value: 0
626 | }
627 | }
628 | }
629 | layer {  # conv6_2_relu: ReLU with the same blob ("conv6_2") as bottom and top
630 | name: "conv6_2_relu"
631 | type: "ReLU"
632 | bottom: "conv6_2"
633 | top: "conv6_2"
634 | }
635 | layer {  # conv7_1: 1x1 convolution over "conv6_2", 128 outputs, stride 1
636 | name: "conv7_1"
637 | type: "Convolution"
638 | bottom: "conv6_2"
639 | top: "conv7_1"
640 | param {
641 | lr_mult: 1
642 | decay_mult: 1
643 | }
644 | param {
645 | lr_mult: 2
646 | decay_mult: 0
647 | }
648 | convolution_param {
649 | num_output: 128
650 | pad: 0
651 | kernel_size: 1
652 | stride: 1
653 | weight_filler {
654 | type: "xavier"
655 | }
656 | bias_filler {
657 | type: "constant"
658 | value: 0
659 | }
660 | }
661 | }
662 | layer {  # conv7_1_relu: ReLU with the same blob ("conv7_1") as bottom and top
663 | name: "conv7_1_relu"
664 | type: "ReLU"
665 | bottom: "conv7_1"
666 | top: "conv7_1"
667 | }
668 | layer {  # conv7_2: 3x3 convolution over "conv7_1", 256 outputs, pad 1, stride 2
669 | name: "conv7_2"
670 | type: "Convolution"
671 | bottom: "conv7_1"
672 | top: "conv7_2"
673 | param {
674 | lr_mult: 1
675 | decay_mult: 1
676 | }
677 | param {
678 | lr_mult: 2
679 | decay_mult: 0
680 | }
681 | convolution_param {
682 | num_output: 256
683 | pad: 1
684 | kernel_size: 3
685 | stride: 2
686 | weight_filler {
687 | type: "xavier"
688 | }
689 | bias_filler {
690 | type: "constant"
691 | value: 0
692 | }
693 | }
694 | }
695 | layer {  # conv7_2_relu: ReLU with the same blob ("conv7_2") as bottom and top
696 | name: "conv7_2_relu"
697 | type: "ReLU"
698 | bottom: "conv7_2"
699 | top: "conv7_2"
700 | }
701 | layer {  # conv8_1: 1x1 convolution over "conv7_2", 128 outputs, stride 1
702 | name: "conv8_1"
703 | type: "Convolution"
704 | bottom: "conv7_2"
705 | top: "conv8_1"
706 | param {
707 | lr_mult: 1
708 | decay_mult: 1
709 | }
710 | param {
711 | lr_mult: 2
712 | decay_mult: 0
713 | }
714 | convolution_param {
715 | num_output: 128
716 | pad: 0
717 | kernel_size: 1
718 | stride: 1
719 | weight_filler {
720 | type: "xavier"
721 | }
722 | bias_filler {
723 | type: "constant"
724 | value: 0
725 | }
726 | }
727 | }
728 | layer {  # conv8_1_relu: ReLU with the same blob ("conv8_1") as bottom and top
729 | name: "conv8_1_relu"
730 | type: "ReLU"
731 | bottom: "conv8_1"
732 | top: "conv8_1"
733 | }
734 | layer {  # conv8_2: 3x3 convolution over "conv8_1", 256 outputs, pad 1, stride 2
735 | name: "conv8_2"
736 | type: "Convolution"
737 | bottom: "conv8_1"
738 | top: "conv8_2"
739 | param {
740 | lr_mult: 1
741 | decay_mult: 1
742 | }
743 | param {
744 | lr_mult: 2
745 | decay_mult: 0
746 | }
747 | convolution_param {
748 | num_output: 256
749 | pad: 1
750 | kernel_size: 3
751 | stride: 2
752 | weight_filler {
753 | type: "xavier"
754 | }
755 | bias_filler {
756 | type: "constant"
757 | value: 0
758 | }
759 | }
760 | }
761 | layer {  # conv8_2_relu: ReLU with the same blob ("conv8_2") as bottom and top
762 | name: "conv8_2_relu"
763 | type: "ReLU"
764 | bottom: "conv8_2"
765 | top: "conv8_2"
766 | }
767 | layer {  # pool6: average pooling over "conv8_2" with global_pooling enabled
768 | name: "pool6"
769 | type: "Pooling"
770 | bottom: "conv8_2"
771 | top: "pool6"
772 | pooling_param {
773 | pool: AVE
774 | global_pooling: true
775 | }
776 | }
777 | layer {  # conv4_3_norm: Normalize layer over "conv4_3"; scale filled with constant 20, not shared across channels
778 | name: "conv4_3_norm"
779 | type: "Normalize"
780 | bottom: "conv4_3"
781 | top: "conv4_3_norm"
782 | norm_param {
783 | across_spatial: false
784 | scale_filler {
785 | type: "constant"
786 | value: 20
787 | }
788 | channel_shared: false
789 | }
790 | }
791 | layer {  # conv4_3_norm_mbox_loc: 3x3 convolution over "conv4_3_norm", 12 outputs
792 | name: "conv4_3_norm_mbox_loc"
793 | type: "Convolution"
794 | bottom: "conv4_3_norm"
795 | top: "conv4_3_norm_mbox_loc"
796 | param {
797 | lr_mult: 1
798 | decay_mult: 1
799 | }
800 | param {
801 | lr_mult: 2
802 | decay_mult: 0
803 | }
804 | convolution_param {
805 | num_output: 12  # NOTE(review): presumably 3 priors per location x 4 box offsets; confirm against the PriorBox layer below
806 | pad: 1
807 | kernel_size: 3
808 | stride: 1
809 | weight_filler {
810 | type: "xavier"
811 | }
812 | bias_filler {
813 | type: "constant"
814 | value: 0
815 | }
816 | }
817 | }
818 | layer {  # conv4_3_norm_mbox_loc_perm: permutes axes to order 0,2,3,1
819 | name: "conv4_3_norm_mbox_loc_perm"
820 | type: "Permute"
821 | bottom: "conv4_3_norm_mbox_loc"
822 | top: "conv4_3_norm_mbox_loc_perm"
823 | permute_param {
824 | order: 0
825 | order: 2
826 | order: 3
827 | order: 1
828 | }
829 | }
830 | layer {  # conv4_3_norm_mbox_loc_flat: flattens from axis 1; consumed by the "mbox_loc" Concat layer
831 | name: "conv4_3_norm_mbox_loc_flat"
832 | type: "Flatten"
833 | bottom: "conv4_3_norm_mbox_loc_perm"
834 | top: "conv4_3_norm_mbox_loc_flat"
835 | flatten_param {
836 | axis: 1
837 | }
838 | }
839 | layer {  # conv4_3_norm_mbox_conf: 3x3 convolution over "conv4_3_norm", 6 outputs
840 | name: "conv4_3_norm_mbox_conf"
841 | type: "Convolution"
842 | bottom: "conv4_3_norm"
843 | top: "conv4_3_norm_mbox_conf"
844 | param {
845 | lr_mult: 1
846 | decay_mult: 1
847 | }
848 | param {
849 | lr_mult: 2
850 | decay_mult: 0
851 | }
852 | convolution_param {
853 | num_output: 6  # NOTE(review): presumably 3 priors per location x 2 classes (num_classes: 2 in detection_out); confirm
854 | pad: 1
855 | kernel_size: 3
856 | stride: 1
857 | weight_filler {
858 | type: "xavier"
859 | }
860 | bias_filler {
861 | type: "constant"
862 | value: 0
863 | }
864 | }
865 | }
866 | layer {  # conv4_3_norm_mbox_conf_perm: permutes axes to order 0,2,3,1
867 | name: "conv4_3_norm_mbox_conf_perm"
868 | type: "Permute"
869 | bottom: "conv4_3_norm_mbox_conf"
870 | top: "conv4_3_norm_mbox_conf_perm"
871 | permute_param {
872 | order: 0
873 | order: 2
874 | order: 3
875 | order: 1
876 | }
877 | }
878 | layer {  # conv4_3_norm_mbox_conf_flat: flattens from axis 1; consumed by the "mbox_conf" Concat layer
879 | name: "conv4_3_norm_mbox_conf_flat"
880 | type: "Flatten"
881 | bottom: "conv4_3_norm_mbox_conf_perm"
882 | top: "conv4_3_norm_mbox_conf_flat"
883 | flatten_param {
884 | axis: 1
885 | }
886 | }
887 | layer {  # conv4_3_norm_mbox_priorbox: PriorBox taking "conv4_3_norm" and "data" as bottoms
888 | name: "conv4_3_norm_mbox_priorbox"
889 | type: "PriorBox"
890 | bottom: "conv4_3_norm"
891 | bottom: "data"
892 | top: "conv4_3_norm_mbox_priorbox"
893 | prior_box_param {
894 | min_size: 30.0  # no max_size and a single aspect_ratio here; the other PriorBox layers in this file set max_size and ratios 2 and 3
895 | aspect_ratio: 2
896 | flip: true
897 | clip: true
898 | variance: 0.1
899 | variance: 0.1
900 | variance: 0.2
901 | variance: 0.2
902 | }
903 | }
904 | layer {  # fc7_mbox_loc: 3x3 convolution over "fc7", 24 outputs
905 | name: "fc7_mbox_loc"
906 | type: "Convolution"
907 | bottom: "fc7"
908 | top: "fc7_mbox_loc"
909 | param {
910 | lr_mult: 1
911 | decay_mult: 1
912 | }
913 | param {
914 | lr_mult: 2
915 | decay_mult: 0
916 | }
917 | convolution_param {
918 | num_output: 24  # NOTE(review): presumably 6 priors per location x 4 box offsets; confirm against the PriorBox layer below
919 | pad: 1
920 | kernel_size: 3
921 | stride: 1
922 | weight_filler {
923 | type: "xavier"
924 | }
925 | bias_filler {
926 | type: "constant"
927 | value: 0
928 | }
929 | }
930 | }
931 | layer {  # fc7_mbox_loc_perm: permutes axes to order 0,2,3,1
932 | name: "fc7_mbox_loc_perm"
933 | type: "Permute"
934 | bottom: "fc7_mbox_loc"
935 | top: "fc7_mbox_loc_perm"
936 | permute_param {
937 | order: 0
938 | order: 2
939 | order: 3
940 | order: 1
941 | }
942 | }
943 | layer {  # fc7_mbox_loc_flat: flattens from axis 1; consumed by the "mbox_loc" Concat layer
944 | name: "fc7_mbox_loc_flat"
945 | type: "Flatten"
946 | bottom: "fc7_mbox_loc_perm"
947 | top: "fc7_mbox_loc_flat"
948 | flatten_param {
949 | axis: 1
950 | }
951 | }
952 | layer {  # fc7_mbox_conf: 3x3 convolution over "fc7", 12 outputs
953 | name: "fc7_mbox_conf"
954 | type: "Convolution"
955 | bottom: "fc7"
956 | top: "fc7_mbox_conf"
957 | param {
958 | lr_mult: 1
959 | decay_mult: 1
960 | }
961 | param {
962 | lr_mult: 2
963 | decay_mult: 0
964 | }
965 | convolution_param {
966 | num_output: 12  # NOTE(review): presumably 6 priors per location x 2 classes; confirm
967 | pad: 1
968 | kernel_size: 3
969 | stride: 1
970 | weight_filler {
971 | type: "xavier"
972 | }
973 | bias_filler {
974 | type: "constant"
975 | value: 0
976 | }
977 | }
978 | }
979 | layer {  # fc7_mbox_conf_perm: permutes axes to order 0,2,3,1
980 | name: "fc7_mbox_conf_perm"
981 | type: "Permute"
982 | bottom: "fc7_mbox_conf"
983 | top: "fc7_mbox_conf_perm"
984 | permute_param {
985 | order: 0
986 | order: 2
987 | order: 3
988 | order: 1
989 | }
990 | }
991 | layer {  # fc7_mbox_conf_flat: flattens from axis 1; consumed by the "mbox_conf" Concat layer
992 | name: "fc7_mbox_conf_flat"
993 | type: "Flatten"
994 | bottom: "fc7_mbox_conf_perm"
995 | top: "fc7_mbox_conf_flat"
996 | flatten_param {
997 | axis: 1
998 | }
999 | }
1000 | layer {  # fc7_mbox_priorbox: PriorBox taking "fc7" and "data" as bottoms; sizes 60-114, ratios 2 and 3 with flip
1001 | name: "fc7_mbox_priorbox"
1002 | type: "PriorBox"
1003 | bottom: "fc7"
1004 | bottom: "data"
1005 | top: "fc7_mbox_priorbox"
1006 | prior_box_param {
1007 | min_size: 60.0
1008 | max_size: 114.0
1009 | aspect_ratio: 2
1010 | aspect_ratio: 3
1011 | flip: true
1012 | clip: true
1013 | variance: 0.1
1014 | variance: 0.1
1015 | variance: 0.2
1016 | variance: 0.2
1017 | }
1018 | }
1019 | layer {  # conv6_2_mbox_loc: 3x3 convolution over "conv6_2", 24 outputs
1020 | name: "conv6_2_mbox_loc"
1021 | type: "Convolution"
1022 | bottom: "conv6_2"
1023 | top: "conv6_2_mbox_loc"
1024 | param {
1025 | lr_mult: 1
1026 | decay_mult: 1
1027 | }
1028 | param {
1029 | lr_mult: 2
1030 | decay_mult: 0
1031 | }
1032 | convolution_param {
1033 | num_output: 24  # NOTE(review): presumably 6 priors per location x 4 box offsets; confirm against the PriorBox layer below
1034 | pad: 1
1035 | kernel_size: 3
1036 | stride: 1
1037 | weight_filler {
1038 | type: "xavier"
1039 | }
1040 | bias_filler {
1041 | type: "constant"
1042 | value: 0
1043 | }
1044 | }
1045 | }
1046 | layer {  # conv6_2_mbox_loc_perm: permutes axes to order 0,2,3,1
1047 | name: "conv6_2_mbox_loc_perm"
1048 | type: "Permute"
1049 | bottom: "conv6_2_mbox_loc"
1050 | top: "conv6_2_mbox_loc_perm"
1051 | permute_param {
1052 | order: 0
1053 | order: 2
1054 | order: 3
1055 | order: 1
1056 | }
1057 | }
1058 | layer {  # conv6_2_mbox_loc_flat: flattens from axis 1; consumed by the "mbox_loc" Concat layer
1059 | name: "conv6_2_mbox_loc_flat"
1060 | type: "Flatten"
1061 | bottom: "conv6_2_mbox_loc_perm"
1062 | top: "conv6_2_mbox_loc_flat"
1063 | flatten_param {
1064 | axis: 1
1065 | }
1066 | }
1067 | layer {  # conv6_2_mbox_conf: 3x3 convolution over "conv6_2", 12 outputs
1068 | name: "conv6_2_mbox_conf"
1069 | type: "Convolution"
1070 | bottom: "conv6_2"
1071 | top: "conv6_2_mbox_conf"
1072 | param {
1073 | lr_mult: 1
1074 | decay_mult: 1
1075 | }
1076 | param {
1077 | lr_mult: 2
1078 | decay_mult: 0
1079 | }
1080 | convolution_param {
1081 | num_output: 12  # NOTE(review): presumably 6 priors per location x 2 classes; confirm
1082 | pad: 1
1083 | kernel_size: 3
1084 | stride: 1
1085 | weight_filler {
1086 | type: "xavier"
1087 | }
1088 | bias_filler {
1089 | type: "constant"
1090 | value: 0
1091 | }
1092 | }
1093 | }
1094 | layer {  # conv6_2_mbox_conf_perm: permutes axes to order 0,2,3,1
1095 | name: "conv6_2_mbox_conf_perm"
1096 | type: "Permute"
1097 | bottom: "conv6_2_mbox_conf"
1098 | top: "conv6_2_mbox_conf_perm"
1099 | permute_param {
1100 | order: 0
1101 | order: 2
1102 | order: 3
1103 | order: 1
1104 | }
1105 | }
1106 | layer {  # conv6_2_mbox_conf_flat: flattens from axis 1; consumed by the "mbox_conf" Concat layer
1107 | name: "conv6_2_mbox_conf_flat"
1108 | type: "Flatten"
1109 | bottom: "conv6_2_mbox_conf_perm"
1110 | top: "conv6_2_mbox_conf_flat"
1111 | flatten_param {
1112 | axis: 1
1113 | }
1114 | }
1115 | layer {  # conv6_2_mbox_priorbox: PriorBox taking "conv6_2" and "data" as bottoms; sizes 114-168, ratios 2 and 3 with flip
1116 | name: "conv6_2_mbox_priorbox"
1117 | type: "PriorBox"
1118 | bottom: "conv6_2"
1119 | bottom: "data"
1120 | top: "conv6_2_mbox_priorbox"
1121 | prior_box_param {
1122 | min_size: 114.0
1123 | max_size: 168.0
1124 | aspect_ratio: 2
1125 | aspect_ratio: 3
1126 | flip: true
1127 | clip: true
1128 | variance: 0.1
1129 | variance: 0.1
1130 | variance: 0.2
1131 | variance: 0.2
1132 | }
1133 | }
1134 | layer {  # conv7_2_mbox_loc: 3x3 convolution over "conv7_2", 24 outputs
1135 | name: "conv7_2_mbox_loc"
1136 | type: "Convolution"
1137 | bottom: "conv7_2"
1138 | top: "conv7_2_mbox_loc"
1139 | param {
1140 | lr_mult: 1
1141 | decay_mult: 1
1142 | }
1143 | param {
1144 | lr_mult: 2
1145 | decay_mult: 0
1146 | }
1147 | convolution_param {
1148 | num_output: 24  # NOTE(review): presumably 6 priors per location x 4 box offsets; confirm against the PriorBox layer below
1149 | pad: 1
1150 | kernel_size: 3
1151 | stride: 1
1152 | weight_filler {
1153 | type: "xavier"
1154 | }
1155 | bias_filler {
1156 | type: "constant"
1157 | value: 0
1158 | }
1159 | }
1160 | }
1161 | layer {  # conv7_2_mbox_loc_perm: permutes axes to order 0,2,3,1
1162 | name: "conv7_2_mbox_loc_perm"
1163 | type: "Permute"
1164 | bottom: "conv7_2_mbox_loc"
1165 | top: "conv7_2_mbox_loc_perm"
1166 | permute_param {
1167 | order: 0
1168 | order: 2
1169 | order: 3
1170 | order: 1
1171 | }
1172 | }
1173 | layer {  # conv7_2_mbox_loc_flat: flattens from axis 1; consumed by the "mbox_loc" Concat layer
1174 | name: "conv7_2_mbox_loc_flat"
1175 | type: "Flatten"
1176 | bottom: "conv7_2_mbox_loc_perm"
1177 | top: "conv7_2_mbox_loc_flat"
1178 | flatten_param {
1179 | axis: 1
1180 | }
1181 | }
1182 | layer {  # conv7_2_mbox_conf: 3x3 convolution over "conv7_2", 12 outputs
1183 | name: "conv7_2_mbox_conf"
1184 | type: "Convolution"
1185 | bottom: "conv7_2"
1186 | top: "conv7_2_mbox_conf"
1187 | param {
1188 | lr_mult: 1
1189 | decay_mult: 1
1190 | }
1191 | param {
1192 | lr_mult: 2
1193 | decay_mult: 0
1194 | }
1195 | convolution_param {
1196 | num_output: 12  # NOTE(review): presumably 6 priors per location x 2 classes; confirm
1197 | pad: 1
1198 | kernel_size: 3
1199 | stride: 1
1200 | weight_filler {
1201 | type: "xavier"
1202 | }
1203 | bias_filler {
1204 | type: "constant"
1205 | value: 0
1206 | }
1207 | }
1208 | }
1209 | layer {  # conv7_2_mbox_conf_perm: permutes axes to order 0,2,3,1
1210 | name: "conv7_2_mbox_conf_perm"
1211 | type: "Permute"
1212 | bottom: "conv7_2_mbox_conf"
1213 | top: "conv7_2_mbox_conf_perm"
1214 | permute_param {
1215 | order: 0
1216 | order: 2
1217 | order: 3
1218 | order: 1
1219 | }
1220 | }
1221 | layer {  # conv7_2_mbox_conf_flat: flattens from axis 1; consumed by the "mbox_conf" Concat layer
1222 | name: "conv7_2_mbox_conf_flat"
1223 | type: "Flatten"
1224 | bottom: "conv7_2_mbox_conf_perm"
1225 | top: "conv7_2_mbox_conf_flat"
1226 | flatten_param {
1227 | axis: 1
1228 | }
1229 | }
1230 | layer {  # conv7_2_mbox_priorbox: PriorBox taking "conv7_2" and "data" as bottoms; sizes 168-222, ratios 2 and 3 with flip
1231 | name: "conv7_2_mbox_priorbox"
1232 | type: "PriorBox"
1233 | bottom: "conv7_2"
1234 | bottom: "data"
1235 | top: "conv7_2_mbox_priorbox"
1236 | prior_box_param {
1237 | min_size: 168.0
1238 | max_size: 222.0
1239 | aspect_ratio: 2
1240 | aspect_ratio: 3
1241 | flip: true
1242 | clip: true
1243 | variance: 0.1
1244 | variance: 0.1
1245 | variance: 0.2
1246 | variance: 0.2
1247 | }
1248 | }
1249 | layer {  # conv8_2_mbox_loc: 3x3 convolution over "conv8_2", 24 outputs
1250 | name: "conv8_2_mbox_loc"
1251 | type: "Convolution"
1252 | bottom: "conv8_2"
1253 | top: "conv8_2_mbox_loc"
1254 | param {
1255 | lr_mult: 1
1256 | decay_mult: 1
1257 | }
1258 | param {
1259 | lr_mult: 2
1260 | decay_mult: 0
1261 | }
1262 | convolution_param {
1263 | num_output: 24  # NOTE(review): presumably 6 priors per location x 4 box offsets; confirm against the PriorBox layer below
1264 | pad: 1
1265 | kernel_size: 3
1266 | stride: 1
1267 | weight_filler {
1268 | type: "xavier"
1269 | }
1270 | bias_filler {
1271 | type: "constant"
1272 | value: 0
1273 | }
1274 | }
1275 | }
1276 | layer {  # conv8_2_mbox_loc_perm: permutes axes to order 0,2,3,1
1277 | name: "conv8_2_mbox_loc_perm"
1278 | type: "Permute"
1279 | bottom: "conv8_2_mbox_loc"
1280 | top: "conv8_2_mbox_loc_perm"
1281 | permute_param {
1282 | order: 0
1283 | order: 2
1284 | order: 3
1285 | order: 1
1286 | }
1287 | }
1288 | layer {  # conv8_2_mbox_loc_flat: flattens from axis 1; consumed by the "mbox_loc" Concat layer
1289 | name: "conv8_2_mbox_loc_flat"
1290 | type: "Flatten"
1291 | bottom: "conv8_2_mbox_loc_perm"
1292 | top: "conv8_2_mbox_loc_flat"
1293 | flatten_param {
1294 | axis: 1
1295 | }
1296 | }
1297 | layer {  # conv8_2_mbox_conf: 3x3 convolution over "conv8_2", 12 outputs
1298 | name: "conv8_2_mbox_conf"
1299 | type: "Convolution"
1300 | bottom: "conv8_2"
1301 | top: "conv8_2_mbox_conf"
1302 | param {
1303 | lr_mult: 1
1304 | decay_mult: 1
1305 | }
1306 | param {
1307 | lr_mult: 2
1308 | decay_mult: 0
1309 | }
1310 | convolution_param {
1311 | num_output: 12  # NOTE(review): presumably 6 priors per location x 2 classes; confirm
1312 | pad: 1
1313 | kernel_size: 3
1314 | stride: 1
1315 | weight_filler {
1316 | type: "xavier"
1317 | }
1318 | bias_filler {
1319 | type: "constant"
1320 | value: 0
1321 | }
1322 | }
1323 | }
1324 | layer {  # conv8_2_mbox_conf_perm: permutes axes to order 0,2,3,1
1325 | name: "conv8_2_mbox_conf_perm"
1326 | type: "Permute"
1327 | bottom: "conv8_2_mbox_conf"
1328 | top: "conv8_2_mbox_conf_perm"
1329 | permute_param {
1330 | order: 0
1331 | order: 2
1332 | order: 3
1333 | order: 1
1334 | }
1335 | }
1336 | layer {  # conv8_2_mbox_conf_flat: flattens from axis 1; consumed by the "mbox_conf" Concat layer
1337 | name: "conv8_2_mbox_conf_flat"
1338 | type: "Flatten"
1339 | bottom: "conv8_2_mbox_conf_perm"
1340 | top: "conv8_2_mbox_conf_flat"
1341 | flatten_param {
1342 | axis: 1
1343 | }
1344 | }
1345 | layer {  # conv8_2_mbox_priorbox: PriorBox taking "conv8_2" and "data" as bottoms; sizes 222-276, ratios 2 and 3 with flip
1346 | name: "conv8_2_mbox_priorbox"
1347 | type: "PriorBox"
1348 | bottom: "conv8_2"
1349 | bottom: "data"
1350 | top: "conv8_2_mbox_priorbox"
1351 | prior_box_param {
1352 | min_size: 222.0
1353 | max_size: 276.0
1354 | aspect_ratio: 2
1355 | aspect_ratio: 3
1356 | flip: true
1357 | clip: true
1358 | variance: 0.1
1359 | variance: 0.1
1360 | variance: 0.2
1361 | variance: 0.2
1362 | }
1363 | }
1364 | layer {  # pool6_mbox_loc: 3x3 convolution (pad 1) over the globally pooled "pool6" blob, 24 outputs
1365 | name: "pool6_mbox_loc"
1366 | type: "Convolution"
1367 | bottom: "pool6"
1368 | top: "pool6_mbox_loc"
1369 | param {
1370 | lr_mult: 1
1371 | decay_mult: 1
1372 | }
1373 | param {
1374 | lr_mult: 2
1375 | decay_mult: 0
1376 | }
1377 | convolution_param {
1378 | num_output: 24  # NOTE(review): presumably 6 priors per location x 4 box offsets; confirm against the PriorBox layer below
1379 | pad: 1
1380 | kernel_size: 3
1381 | stride: 1
1382 | weight_filler {
1383 | type: "xavier"
1384 | }
1385 | bias_filler {
1386 | type: "constant"
1387 | value: 0
1388 | }
1389 | }
1390 | }
1391 | layer {  # pool6_mbox_loc_perm: permutes axes to order 0,2,3,1
1392 | name: "pool6_mbox_loc_perm"
1393 | type: "Permute"
1394 | bottom: "pool6_mbox_loc"
1395 | top: "pool6_mbox_loc_perm"
1396 | permute_param {
1397 | order: 0
1398 | order: 2
1399 | order: 3
1400 | order: 1
1401 | }
1402 | }
1403 | layer {  # pool6_mbox_loc_flat: flattens from axis 1; consumed by the "mbox_loc" Concat layer
1404 | name: "pool6_mbox_loc_flat"
1405 | type: "Flatten"
1406 | bottom: "pool6_mbox_loc_perm"
1407 | top: "pool6_mbox_loc_flat"
1408 | flatten_param {
1409 | axis: 1
1410 | }
1411 | }
1412 | layer {  # pool6_mbox_conf: 3x3 convolution (pad 1) over "pool6", 12 outputs
1413 | name: "pool6_mbox_conf"
1414 | type: "Convolution"
1415 | bottom: "pool6"
1416 | top: "pool6_mbox_conf"
1417 | param {
1418 | lr_mult: 1
1419 | decay_mult: 1
1420 | }
1421 | param {
1422 | lr_mult: 2
1423 | decay_mult: 0
1424 | }
1425 | convolution_param {
1426 | num_output: 12  # NOTE(review): presumably 6 priors per location x 2 classes; confirm
1427 | pad: 1
1428 | kernel_size: 3
1429 | stride: 1
1430 | weight_filler {
1431 | type: "xavier"
1432 | }
1433 | bias_filler {
1434 | type: "constant"
1435 | value: 0
1436 | }
1437 | }
1438 | }
1439 | layer {  # pool6_mbox_conf_perm: permutes axes to order 0,2,3,1
1440 | name: "pool6_mbox_conf_perm"
1441 | type: "Permute"
1442 | bottom: "pool6_mbox_conf"
1443 | top: "pool6_mbox_conf_perm"
1444 | permute_param {
1445 | order: 0
1446 | order: 2
1447 | order: 3
1448 | order: 1
1449 | }
1450 | }
1451 | layer {  # pool6_mbox_conf_flat: flattens from axis 1; consumed by the "mbox_conf" Concat layer
1452 | name: "pool6_mbox_conf_flat"
1453 | type: "Flatten"
1454 | bottom: "pool6_mbox_conf_perm"
1455 | top: "pool6_mbox_conf_flat"
1456 | flatten_param {
1457 | axis: 1
1458 | }
1459 | }
1460 | layer {  # pool6_mbox_priorbox: PriorBox taking "pool6" and "data" as bottoms; sizes 276-330, ratios 2 and 3 with flip
1461 | name: "pool6_mbox_priorbox"
1462 | type: "PriorBox"
1463 | bottom: "pool6"
1464 | bottom: "data"
1465 | top: "pool6_mbox_priorbox"
1466 | prior_box_param {
1467 | min_size: 276.0
1468 | max_size: 330.0  # larger than the 300x300 resize set in the data layer's transform_param; clip is enabled below
1469 | aspect_ratio: 2
1470 | aspect_ratio: 3
1471 | flip: true
1472 | clip: true
1473 | variance: 0.1
1474 | variance: 0.1
1475 | variance: 0.2
1476 | variance: 0.2
1477 | }
1478 | }
1479 | layer {  # mbox_loc: concatenates the six flattened *_mbox_loc_flat blobs along axis 1
1480 | name: "mbox_loc"
1481 | type: "Concat"
1482 | bottom: "conv4_3_norm_mbox_loc_flat"
1483 | bottom: "fc7_mbox_loc_flat"
1484 | bottom: "conv6_2_mbox_loc_flat"
1485 | bottom: "conv7_2_mbox_loc_flat"
1486 | bottom: "conv8_2_mbox_loc_flat"
1487 | bottom: "pool6_mbox_loc_flat"
1488 | top: "mbox_loc"
1489 | concat_param {
1490 | axis: 1
1491 | }
1492 | }
1493 | layer {  # mbox_conf: concatenates the six flattened *_mbox_conf_flat blobs along axis 1, in the same source order as mbox_loc
1494 | name: "mbox_conf"
1495 | type: "Concat"
1496 | bottom: "conv4_3_norm_mbox_conf_flat"
1497 | bottom: "fc7_mbox_conf_flat"
1498 | bottom: "conv6_2_mbox_conf_flat"
1499 | bottom: "conv7_2_mbox_conf_flat"
1500 | bottom: "conv8_2_mbox_conf_flat"
1501 | bottom: "pool6_mbox_conf_flat"
1502 | top: "mbox_conf"
1503 | concat_param {
1504 | axis: 1
1505 | }
1506 | }
1507 | layer {  # mbox_priorbox: concatenates the six PriorBox outputs, same source order, along axis 2 (not axis 1)
1508 | name: "mbox_priorbox"
1509 | type: "Concat"
1510 | bottom: "conv4_3_norm_mbox_priorbox"
1511 | bottom: "fc7_mbox_priorbox"
1512 | bottom: "conv6_2_mbox_priorbox"
1513 | bottom: "conv7_2_mbox_priorbox"
1514 | bottom: "conv8_2_mbox_priorbox"
1515 | bottom: "pool6_mbox_priorbox"
1516 | top: "mbox_priorbox"
1517 | concat_param {
1518 | axis: 2
1519 | }
1520 | }
1521 | layer {  # mbox_conf_reshape: reshapes "mbox_conf" with dims (0, -1, 2)
1522 | name: "mbox_conf_reshape"
1523 | type: "Reshape"
1524 | bottom: "mbox_conf"
1525 | top: "mbox_conf_reshape"
1526 | reshape_param {
1527 | shape {
1528 | dim: 0
1529 | dim: -1
1530 | dim: 2  # same value as num_classes in detection_out; keep the two in sync
1531 | }
1532 | }
1533 | }
1534 | layer {  # mbox_conf_softmax: softmax over axis 2 of the reshaped blob (the axis given size 2 above)
1535 | name: "mbox_conf_softmax"
1536 | type: "Softmax"
1537 | bottom: "mbox_conf_reshape"
1538 | top: "mbox_conf_softmax"
1539 | softmax_param {
1540 | axis: 2
1541 | }
1542 | }
1543 | layer {  # mbox_conf_flatten: flattens the softmax output from axis 1; consumed by detection_out
1544 | name: "mbox_conf_flatten"
1545 | type: "Flatten"
1546 | bottom: "mbox_conf_softmax"
1547 | top: "mbox_conf_flatten"
1548 | flatten_param {
1549 | axis: 1
1550 | }
1551 | }
1552 | layer {  # detection_out: DetectionOutput over mbox_loc, mbox_conf_flatten and mbox_priorbox; TEST phase only
1553 | name: "detection_out"
1554 | type: "DetectionOutput"
1555 | bottom: "mbox_loc"
1556 | bottom: "mbox_conf_flatten"
1557 | bottom: "mbox_priorbox"
1558 | top: "detection_out"
1559 | include {
1560 | phase: TEST
1561 | }
1562 | detection_output_param {
1563 | num_classes: 2  # with background_label_id 0 below, this leaves one non-background class
1564 | share_location: true
1565 | background_label_id: 0
1566 | nms_param {
1567 | nms_threshold: 0.45
1568 | top_k: 400
1569 | }
1570 | save_output_param {
1571 | output_directory: "/home/chenxp/data/VOCdevkit/results/SSD_300x300"  # NOTE(review): absolute, user-specific path, unlike the relative paths used elsewhere in this file; adjust per machine
1572 | output_name_prefix: "comp4_det_test_"
1573 | output_format: "VOC"
1574 | label_map_file: "data/scenetext/labelmap_voc.prototxt"
1575 | name_size_file: "data/scenetext/test_name_size.txt"  # NOTE(review): training_file/test_name_size.txt in this repo begins with ids (106, 203, 258, ...) that do not appear in test.txt; confirm the list matches the test set order
1576 | num_test_image: 70  # equals the number of entries in training_file/test.txt
1577 | }
1578 | code_type: CENTER_SIZE
1579 | keep_top_k: 200
1580 | confidence_threshold: 0.01
1581 | }
1582 | }
1583 | layer {  # detection_eval: DetectionEvaluate comparing "detection_out" against "label"; TEST phase only
1584 | name: "detection_eval"
1585 | type: "DetectionEvaluate"
1586 | bottom: "detection_out"
1587 | bottom: "label"
1588 | top: "detection_eval"
1589 | include {
1590 | phase: TEST
1591 | }
1592 | detection_evaluate_param {
1593 | num_classes: 2  # same values as num_classes / background_label_id in detection_out
1594 | background_label_id: 0
1595 | overlap_threshold: 0.5
1596 | evaluate_difficult_gt: false
1597 | name_size_file: "data/scenetext/test_name_size.txt"  # NOTE(review): same file as in detection_out; the repo copy lists ids that are absent from test.txt, so confirm it is aligned with the test set
1598 | }
1599 | }
1600 |
1601 |
--------------------------------------------------------------------------------
/training_file/test.txt:
--------------------------------------------------------------------------------
1 | scenetext/JPEGImages/318.jpg scenetext/Annotations/318.xml
2 | scenetext/JPEGImages/320.jpg scenetext/Annotations/320.xml
3 | scenetext/JPEGImages/269.jpg scenetext/Annotations/269.xml
4 | scenetext/JPEGImages/299.jpg scenetext/Annotations/299.xml
5 | scenetext/JPEGImages/313.jpg scenetext/Annotations/313.xml
6 | scenetext/JPEGImages/265.jpg scenetext/Annotations/265.xml
7 | scenetext/JPEGImages/262.jpg scenetext/Annotations/262.xml
8 | scenetext/JPEGImages/317.jpg scenetext/Annotations/317.xml
9 | scenetext/JPEGImages/279.jpg scenetext/Annotations/279.xml
10 | scenetext/JPEGImages/302.jpg scenetext/Annotations/302.xml
11 | scenetext/JPEGImages/283.jpg scenetext/Annotations/283.xml
12 | scenetext/JPEGImages/271.jpg scenetext/Annotations/271.xml
13 | scenetext/JPEGImages/277.jpg scenetext/Annotations/277.xml
14 | scenetext/JPEGImages/327.jpg scenetext/Annotations/327.xml
15 | scenetext/JPEGImages/270.jpg scenetext/Annotations/270.xml
16 | scenetext/JPEGImages/307.jpg scenetext/Annotations/307.xml
17 | scenetext/JPEGImages/315.jpg scenetext/Annotations/315.xml
18 | scenetext/JPEGImages/282.jpg scenetext/Annotations/282.xml
19 | scenetext/JPEGImages/264.jpg scenetext/Annotations/264.xml
20 | scenetext/JPEGImages/326.jpg scenetext/Annotations/326.xml
21 | scenetext/JPEGImages/261.jpg scenetext/Annotations/261.xml
22 | scenetext/JPEGImages/300.jpg scenetext/Annotations/300.xml
23 | scenetext/JPEGImages/263.jpg scenetext/Annotations/263.xml
24 | scenetext/JPEGImages/285.jpg scenetext/Annotations/285.xml
25 | scenetext/JPEGImages/274.jpg scenetext/Annotations/274.xml
26 | scenetext/JPEGImages/266.jpg scenetext/Annotations/266.xml
27 | scenetext/JPEGImages/303.jpg scenetext/Annotations/303.xml
28 | scenetext/JPEGImages/314.jpg scenetext/Annotations/314.xml
29 | scenetext/JPEGImages/288.jpg scenetext/Annotations/288.xml
30 | scenetext/JPEGImages/328.jpg scenetext/Annotations/328.xml
31 | scenetext/JPEGImages/292.jpg scenetext/Annotations/292.xml
32 | scenetext/JPEGImages/287.jpg scenetext/Annotations/287.xml
33 | scenetext/JPEGImages/273.jpg scenetext/Annotations/273.xml
34 | scenetext/JPEGImages/291.jpg scenetext/Annotations/291.xml
35 | scenetext/JPEGImages/293.jpg scenetext/Annotations/293.xml
36 | scenetext/JPEGImages/319.jpg scenetext/Annotations/319.xml
37 | scenetext/JPEGImages/268.jpg scenetext/Annotations/268.xml
38 | scenetext/JPEGImages/308.jpg scenetext/Annotations/308.xml
39 | scenetext/JPEGImages/316.jpg scenetext/Annotations/316.xml
40 | scenetext/JPEGImages/324.jpg scenetext/Annotations/324.xml
41 | scenetext/JPEGImages/289.jpg scenetext/Annotations/289.xml
42 | scenetext/JPEGImages/294.jpg scenetext/Annotations/294.xml
43 | scenetext/JPEGImages/323.jpg scenetext/Annotations/323.xml
44 | scenetext/JPEGImages/286.jpg scenetext/Annotations/286.xml
45 | scenetext/JPEGImages/276.jpg scenetext/Annotations/276.xml
46 | scenetext/JPEGImages/297.jpg scenetext/Annotations/297.xml
47 | scenetext/JPEGImages/301.jpg scenetext/Annotations/301.xml
48 | scenetext/JPEGImages/296.jpg scenetext/Annotations/296.xml
49 | scenetext/JPEGImages/281.jpg scenetext/Annotations/281.xml
50 | scenetext/JPEGImages/272.jpg scenetext/Annotations/272.xml
51 | scenetext/JPEGImages/304.jpg scenetext/Annotations/304.xml
52 | scenetext/JPEGImages/275.jpg scenetext/Annotations/275.xml
53 | scenetext/JPEGImages/312.jpg scenetext/Annotations/312.xml
54 | scenetext/JPEGImages/259.jpg scenetext/Annotations/259.xml
55 | scenetext/JPEGImages/305.jpg scenetext/Annotations/305.xml
56 | scenetext/JPEGImages/298.jpg scenetext/Annotations/298.xml
57 | scenetext/JPEGImages/284.jpg scenetext/Annotations/284.xml
58 | scenetext/JPEGImages/310.jpg scenetext/Annotations/310.xml
59 | scenetext/JPEGImages/290.jpg scenetext/Annotations/290.xml
60 | scenetext/JPEGImages/321.jpg scenetext/Annotations/321.xml
61 | scenetext/JPEGImages/322.jpg scenetext/Annotations/322.xml
62 | scenetext/JPEGImages/325.jpg scenetext/Annotations/325.xml
63 | scenetext/JPEGImages/280.jpg scenetext/Annotations/280.xml
64 | scenetext/JPEGImages/260.jpg scenetext/Annotations/260.xml
65 | scenetext/JPEGImages/306.jpg scenetext/Annotations/306.xml
66 | scenetext/JPEGImages/278.jpg scenetext/Annotations/278.xml
67 | scenetext/JPEGImages/311.jpg scenetext/Annotations/311.xml
68 | scenetext/JPEGImages/295.jpg scenetext/Annotations/295.xml
69 | scenetext/JPEGImages/267.jpg scenetext/Annotations/267.xml
70 | scenetext/JPEGImages/309.jpg scenetext/Annotations/309.xml
71 |
--------------------------------------------------------------------------------
/training_file/test_name_size.txt:
--------------------------------------------------------------------------------
1 | 106 480 640
2 | 203 480 640
3 | 258 480 640
4 | 318 480 640
5 | 122 480 640
6 | 103 480 640
7 | 320 640 480
8 | 213 480 640
9 | 149 480 640
10 | 269 2592 3888
11 | 161 480 640
12 | 178 520 669
13 | 163 480 640
14 | 299 1200 1600
15 | 168 480 640
16 | 313 480 640
17 | 132 2592 3888
18 | 265 2592 3888
19 | 252 480 640
20 | 262 480 640
21 | 198 480 640
22 | 194 480 640
23 | 317 480 640
24 | 256 768 1024
25 | 279 480 640
26 | 231 480 640
27 | 181 480 640
28 | 180 480 640
29 | 222 768 1024
30 | 125 480 640
31 | 302 480 640
32 | 283 1209 893
33 | 271 480 640
34 | 277 179 626
35 | 124 480 640
36 | 177 480 640
37 | 201 480 640
38 | 204 480 640
39 | 228 480 640
40 | 219 480 640
41 | 241 960 1280
42 | 167 1200 1600
43 | 327 1200 1600
44 | 130 1200 1600
45 | 270 1200 1600
46 | 101 480 640
47 | 221 480 640
48 | 243 480 640
49 | 307 480 640
50 | 195 768 1024
51 | 208 480 640
52 | 138 960 1280
53 | 127 1200 1600
54 | 154 480 640
55 | 242 480 640
56 | 315 2592 3888
57 | 109 2592 3888
58 | 239 480 640
59 | 282 480 640
60 | 264 480 640
61 | 326 480 640
62 | 144 2592 3888
63 | 261 480 640
64 | 193 480 640
65 | 253 480 640
66 | 216 1200 1600
67 | 300 1200 1600
68 | 234 960 1280
69 | 263 208 640
70 | 179 480 640
71 | 285 2592 3888
72 | 274 480 640
73 | 137 480 640
74 | 189 640 480
75 | 209 480 640
76 | 116 480 640
77 | 266 2592 3888
78 | 196 2592 3888
79 | 162 1200 1600
80 | 303 2592 3888
81 | 314 480 640
82 | 288 2592 3888
83 | 174 2592 3888
84 | 328 2592 3888
85 | 226 2592 3888
86 | 292 480 640
87 | 287 960 1280
88 | 273 255 629
89 | 245 102 422
90 | 120 960 1280
91 | 155 480 640
92 | 291 480 640
93 | 293 480 640
94 | 148 755 627
95 | 319 960 1280
96 | 268 480 640
97 | 248 480 640
98 | 255 853 985
99 | 224 2592 3888
100 | 118 466 640
101 | 107 480 640
102 | 230 480 640
103 | 172 480 640
104 | 169 2592 3888
105 | 218 2592 3888
106 | 112 960 1280
107 | 229 480 640
108 | 308 922 1598
109 | 133 480 640
110 | 176 480 640
111 | 200 480 640
112 | 146 480 640
113 | 119 2592 3888
114 | 182 640 480
115 | 197 480 640
116 | 316 480 640
117 | 142 480 640
118 | 190 480 640
119 | 184 348 458
120 | 117 480 640
121 | 114 960 1280
122 | 152 1200 1600
123 | 140 2592 3888
124 | 324 2592 3888
125 | 170 279 640
126 | 166 480 640
127 | 111 480 640
128 | 157 387 587
129 | 235 480 640
130 | 223 167 640
131 | 145 480 640
132 | 187 480 640
133 | 108 611 1019
134 | 289 960 1280
135 | 110 301 640
136 | 294 2592 3888
137 | 212 1200 1600
138 | 323 480 640
139 | 286 2592 3888
140 | 247 1200 1600
141 | 276 480 640
142 | 192 480 640
143 | 297 480 640
144 | 175 480 640
145 | 301 2592 3888
146 | 257 480 640
147 | 296 1536 2048
148 | 100 480 640
149 | 151 480 640
150 | 171 480 640
151 | 227 2592 3888
152 | 237 480 640
153 | 210 480 640
154 | 250 465 640
155 | 281 480 640
156 | 104 640 480
157 | 160 252 537
158 | 272 480 640
159 | 147 960 1280
160 | 136 640 480
161 | 304 480 640
162 | 150 480 640
163 | 158 480 640
164 | 275 480 640
165 | 123 2592 3888
166 | 202 480 640
167 | 312 960 1280
168 | 259 480 640
169 | 113 480 640
170 | 305 731 713
171 | 298 430 1124
172 | 236 1200 1600
173 | 165 480 640
174 | 139 480 640
175 | 135 480 640
176 | 215 480 640
177 | 173 480 640
178 | 284 480 640
179 | 310 480 640
180 | 164 480 640
181 | 186 480 640
182 | 131 480 640
183 | 134 1200 1600
184 | 188 480 640
185 | 217 480 640
186 | 290 480 640
187 | 115 2592 3888
188 | 321 480 640
189 | 205 1200 1600
190 | 322 480 640
191 | 191 2592 3888
192 | 325 480 640
193 | 246 2592 3888
194 | 254 480 640
195 | 105 1280 960
196 | 126 1200 1600
197 | 153 2592 3888
198 | 280 2592 3888
199 | 185 480 640
200 | 225 1200 1600
201 | 260 480 640
202 | 220 960 1280
203 | 199 480 640
204 | 129 2592 3888
205 | 156 1280 960
206 | 306 2592 3888
207 | 278 480 640
208 | 233 960 1280
209 | 143 480 640
210 | 311 480 640
211 | 141 480 640
212 | 295 480 640
213 | 206 480 640
214 | 244 480 640
215 | 128 1200 1600
216 | 267 2592 3888
217 | 309 480 640
218 | 214 480 640
219 | 102 480 640
220 | 238 1200 1600
221 | 211 640 480
222 | 232 640 480
223 | 183 2592 3888
224 | 159 480 640
225 | 249 2592 3888
226 | 251 480 640
227 | 207 480 640
228 | 121 480 640
229 | 240 480 640
230 |
--------------------------------------------------------------------------------
/training_file/train.prototxt:
--------------------------------------------------------------------------------
1 | name: "VGG_scenetext_SSD_300x300_train"  # Name of this network definition.
2 | layer {  # Input layer: emits the "data" and "label" blobs; included only in the TRAIN phase.
3 | name: "data"
4 | type: "AnnotatedData"
5 | top: "data"
6 | top: "label"
7 | include {
8 | phase: TRAIN
9 | }
10 | transform_param {  # Preprocessing: mirroring enabled, three mean values, resize to 300x300.
11 | mirror: true
12 | mean_value: 104  # One mean per channel; presumably B, G, R order (Caffe/OpenCV convention) - TODO confirm.
13 | mean_value: 117
14 | mean_value: 123
15 | resize_param {  # Resize every sample (prob: 1) to 300x300 using WARP mode.
16 | prob: 1
17 | resize_mode: WARP
18 | height: 300
19 | width: 300
20 | interp_mode: LINEAR  # NOTE(review): five interpolation modes are listed; presumably one is picked per sample - confirm.
21 | interp_mode: AREA
22 | interp_mode: NEAREST
23 | interp_mode: CUBIC
24 | interp_mode: LANCZOS4
25 | }
26 | emit_constraint {
27 | emit_type: CENTER
28 | }
29 | }
30 | data_param {  # LMDB source and batch size.
31 | source: "examples/scenetext_trainval_lmdb"  # Relative path; presumably resolved from the directory training is launched in - confirm.
32 | batch_size: 8
33 | backend: LMDB
34 | }
35 | annotated_data_param {  # Seven batch samplers plus the label map file.
36 | batch_sampler {  # Sampler 1: no scale, aspect-ratio or overlap constraint; one sample, one trial.
37 | max_sample: 1
38 | max_trials: 1
39 | }
40 | batch_sampler {  # Sampler 2: scale 0.3-1.0, aspect ratio 0.5-2.0, min Jaccard overlap 0.1.
41 | sampler {
42 | min_scale: 0.3
43 | max_scale: 1.0
44 | min_aspect_ratio: 0.5
45 | max_aspect_ratio: 2.0
46 | }
47 | sample_constraint {
48 | min_jaccard_overlap: 0.1
49 | }
50 | max_sample: 1
51 | max_trials: 50
52 | }
53 | batch_sampler {  # Sampler 3: same scale/aspect ranges, min Jaccard overlap 0.3.
54 | sampler {
55 | min_scale: 0.3
56 | max_scale: 1.0
57 | min_aspect_ratio: 0.5
58 | max_aspect_ratio: 2.0
59 | }
60 | sample_constraint {
61 | min_jaccard_overlap: 0.3
62 | }
63 | max_sample: 1
64 | max_trials: 50
65 | }
66 | batch_sampler {  # Sampler 4: same scale/aspect ranges, min Jaccard overlap 0.5.
67 | sampler {
68 | min_scale: 0.3
69 | max_scale: 1.0
70 | min_aspect_ratio: 0.5
71 | max_aspect_ratio: 2.0
72 | }
73 | sample_constraint {
74 | min_jaccard_overlap: 0.5
75 | }
76 | max_sample: 1
77 | max_trials: 50
78 | }
79 | batch_sampler {  # Sampler 5: same scale/aspect ranges, min Jaccard overlap 0.7.
80 | sampler {
81 | min_scale: 0.3
82 | max_scale: 1.0
83 | min_aspect_ratio: 0.5
84 | max_aspect_ratio: 2.0
85 | }
86 | sample_constraint {
87 | min_jaccard_overlap: 0.7
88 | }
89 | max_sample: 1
90 | max_trials: 50
91 | }
92 | batch_sampler {  # Sampler 6: same scale/aspect ranges, min Jaccard overlap 0.9.
93 | sampler {
94 | min_scale: 0.3
95 | max_scale: 1.0
96 | min_aspect_ratio: 0.5
97 | max_aspect_ratio: 2.0
98 | }
99 | sample_constraint {
100 | min_jaccard_overlap: 0.9
101 | }
102 | max_sample: 1
103 | max_trials: 50
104 | }
105 | batch_sampler {  # Sampler 7: same scale/aspect ranges, but constrained by a MAX Jaccard overlap of 1.0.
106 | sampler {
107 | min_scale: 0.3
108 | max_scale: 1.0
109 | min_aspect_ratio: 0.5
110 | max_aspect_ratio: 2.0
111 | }
112 | sample_constraint {
113 | max_jaccard_overlap: 1.0
114 | }
115 | max_sample: 1
116 | max_trials: 50
117 | }
118 | label_map_file: "data/scenetext/labelmap_voc.prototxt"  # Label map; relative path, presumably resolved like the LMDB source - confirm.
119 | }
120 | }
121 | layer {  # conv1_1: 3x3 convolution on "data", 64 outputs, pad 1.
122 | name: "conv1_1"
123 | type: "Convolution"
124 | bottom: "data"
125 | top: "conv1_1"
126 | param {  # First param block (weights by Caffe convention): lr_mult 0, so its learning rate is scaled to zero.
127 | lr_mult: 0
128 | decay_mult: 0
129 | }
130 | param {  # Second param block (bias by Caffe convention): also lr_mult 0 / decay_mult 0.
131 | lr_mult: 0
132 | decay_mult: 0
133 | }
134 | convolution_param {
135 | num_output: 64
136 | pad: 1
137 | kernel_size: 3
138 | weight_filler {
139 | type: "xavier"
140 | }
141 | bias_filler {
142 | type: "constant"
143 | value: 0
144 | }
145 | }
146 | }
147 | layer {  # relu1_1: ReLU applied in place (bottom and top are both "conv1_1").
148 | name: "relu1_1"
149 | type: "ReLU"
150 | bottom: "conv1_1"
151 | top: "conv1_1"
152 | }
153 | layer {  # conv1_2: 3x3 convolution on "conv1_1", 64 outputs, pad 1.
154 | name: "conv1_2"
155 | type: "Convolution"
156 | bottom: "conv1_1"
157 | top: "conv1_2"
158 | param {  # Both param blocks use lr_mult 0 / decay_mult 0 (learning rate scaled to zero).
159 | lr_mult: 0
160 | decay_mult: 0
161 | }
162 | param {
163 | lr_mult: 0
164 | decay_mult: 0
165 | }
166 | convolution_param {
167 | num_output: 64
168 | pad: 1
169 | kernel_size: 3
170 | weight_filler {
171 | type: "xavier"
172 | }
173 | bias_filler {
174 | type: "constant"
175 | value: 0
176 | }
177 | }
178 | }
179 | layer {  # relu1_2: ReLU applied in place on "conv1_2".
180 | name: "relu1_2"
181 | type: "ReLU"
182 | bottom: "conv1_2"
183 | top: "conv1_2"
184 | }
185 | layer {  # pool1: 2x2 max pooling with stride 2 over "conv1_2".
186 | name: "pool1"
187 | type: "Pooling"
188 | bottom: "conv1_2"
189 | top: "pool1"
190 | pooling_param {
191 | pool: MAX
192 | kernel_size: 2
193 | stride: 2
194 | }
195 | }
196 | layer {  # conv2_1: 3x3 convolution on "pool1", 128 outputs, pad 1.
197 | name: "conv2_1"
198 | type: "Convolution"
199 | bottom: "pool1"
200 | top: "conv2_1"
201 | param {  # Both param blocks use lr_mult 0 / decay_mult 0 (learning rate scaled to zero).
202 | lr_mult: 0
203 | decay_mult: 0
204 | }
205 | param {
206 | lr_mult: 0
207 | decay_mult: 0
208 | }
209 | convolution_param {
210 | num_output: 128
211 | pad: 1
212 | kernel_size: 3
213 | weight_filler {
214 | type: "xavier"
215 | }
216 | bias_filler {
217 | type: "constant"
218 | value: 0
219 | }
220 | }
221 | }
222 | layer {  # relu2_1: ReLU applied in place on "conv2_1".
223 | name: "relu2_1"
224 | type: "ReLU"
225 | bottom: "conv2_1"
226 | top: "conv2_1"
227 | }
228 | layer {  # conv2_2: 3x3 convolution on "conv2_1", 128 outputs, pad 1.
229 | name: "conv2_2"
230 | type: "Convolution"
231 | bottom: "conv2_1"
232 | top: "conv2_2"
233 | param {  # Both param blocks use lr_mult 0 / decay_mult 0; this is the last layer in the file configured this way before conv3_1.
234 | lr_mult: 0
235 | decay_mult: 0
236 | }
237 | param {
238 | lr_mult: 0
239 | decay_mult: 0
240 | }
241 | convolution_param {
242 | num_output: 128
243 | pad: 1
244 | kernel_size: 3
245 | weight_filler {
246 | type: "xavier"
247 | }
248 | bias_filler {
249 | type: "constant"
250 | value: 0
251 | }
252 | }
253 | }
254 | layer {  # relu2_2: ReLU applied in place on "conv2_2".
255 | name: "relu2_2"
256 | type: "ReLU"
257 | bottom: "conv2_2"
258 | top: "conv2_2"
259 | }
260 | layer {  # pool2: 2x2 max pooling with stride 2 over "conv2_2".
261 | name: "pool2"
262 | type: "Pooling"
263 | bottom: "conv2_2"
264 | top: "pool2"
265 | pooling_param {
266 | pool: MAX
267 | kernel_size: 2
268 | stride: 2
269 | }
270 | }
271 | layer {  # conv3_1: 3x3 convolution on "pool2", 256 outputs, pad 1; first layer here with non-zero lr_mult.
272 | name: "conv3_1"
273 | type: "Convolution"
274 | bottom: "pool2"
275 | top: "conv3_1"
276 | param {  # First param block (weights by Caffe convention): lr_mult 1, decay_mult 1.
277 | lr_mult: 1
278 | decay_mult: 1
279 | }
280 | param {  # Second param block (bias by Caffe convention): lr_mult 2, no weight decay.
281 | lr_mult: 2
282 | decay_mult: 0
283 | }
284 | convolution_param {
285 | num_output: 256
286 | pad: 1
287 | kernel_size: 3
288 | weight_filler {
289 | type: "xavier"
290 | }
291 | bias_filler {
292 | type: "constant"
293 | value: 0
294 | }
295 | }
296 | }
297 | layer {  # relu3_1: ReLU applied in place on "conv3_1".
298 | name: "relu3_1"
299 | type: "ReLU"
300 | bottom: "conv3_1"
301 | top: "conv3_1"
302 | }
303 | layer {  # conv3_2: 3x3 convolution on "conv3_1", 256 outputs, pad 1.
304 | name: "conv3_2"
305 | type: "Convolution"
306 | bottom: "conv3_1"
307 | top: "conv3_2"
308 | param {  # First param block: lr_mult 1, decay_mult 1.
309 | lr_mult: 1
310 | decay_mult: 1
311 | }
312 | param {  # Second param block: lr_mult 2, decay_mult 0.
313 | lr_mult: 2
314 | decay_mult: 0
315 | }
316 | convolution_param {
317 | num_output: 256
318 | pad: 1
319 | kernel_size: 3
320 | weight_filler {
321 | type: "xavier"
322 | }
323 | bias_filler {
324 | type: "constant"
325 | value: 0
326 | }
327 | }
328 | }
329 | layer {  # relu3_2: ReLU applied in place on "conv3_2".
330 | name: "relu3_2"
331 | type: "ReLU"
332 | bottom: "conv3_2"
333 | top: "conv3_2"
334 | }
335 | layer {  # conv3_3: 3x3 convolution on "conv3_2", 256 outputs, pad 1.
336 | name: "conv3_3"
337 | type: "Convolution"
338 | bottom: "conv3_2"
339 | top: "conv3_3"
340 | param {  # First param block: lr_mult 1, decay_mult 1.
341 | lr_mult: 1
342 | decay_mult: 1
343 | }
344 | param {  # Second param block: lr_mult 2, decay_mult 0.
345 | lr_mult: 2
346 | decay_mult: 0
347 | }
348 | convolution_param {
349 | num_output: 256
350 | pad: 1
351 | kernel_size: 3
352 | weight_filler {
353 | type: "xavier"
354 | }
355 | bias_filler {
356 | type: "constant"
357 | value: 0
358 | }
359 | }
360 | }
361 | layer {  # relu3_3: ReLU applied in place on "conv3_3".
362 | name: "relu3_3"
363 | type: "ReLU"
364 | bottom: "conv3_3"
365 | top: "conv3_3"
366 | }
367 | layer {  # pool3: 2x2 max pooling with stride 2 over "conv3_3".
368 | name: "pool3"
369 | type: "Pooling"
370 | bottom: "conv3_3"
371 | top: "pool3"
372 | pooling_param {
373 | pool: MAX
374 | kernel_size: 2
375 | stride: 2
376 | }
377 | }
378 | layer {  # conv4_1: 3x3 convolution on "pool3", 512 outputs, pad 1.
379 | name: "conv4_1"
380 | type: "Convolution"
381 | bottom: "pool3"
382 | top: "conv4_1"
383 | param {  # First param block: lr_mult 1, decay_mult 1.
384 | lr_mult: 1
385 | decay_mult: 1
386 | }
387 | param {  # Second param block: lr_mult 2, decay_mult 0.
388 | lr_mult: 2
389 | decay_mult: 0
390 | }
391 | convolution_param {
392 | num_output: 512
393 | pad: 1
394 | kernel_size: 3
395 | weight_filler {
396 | type: "xavier"
397 | }
398 | bias_filler {
399 | type: "constant"
400 | value: 0
401 | }
402 | }
403 | }
404 | layer {  # relu4_1: ReLU applied in place on "conv4_1".
405 | name: "relu4_1"
406 | type: "ReLU"
407 | bottom: "conv4_1"
408 | top: "conv4_1"
409 | }
410 | layer {  # conv4_2: 3x3 convolution on "conv4_1", 512 outputs, pad 1.
411 | name: "conv4_2"
412 | type: "Convolution"
413 | bottom: "conv4_1"
414 | top: "conv4_2"
415 | param {  # First param block: lr_mult 1, decay_mult 1.
416 | lr_mult: 1
417 | decay_mult: 1
418 | }
419 | param {  # Second param block: lr_mult 2, decay_mult 0.
420 | lr_mult: 2
421 | decay_mult: 0
422 | }
423 | convolution_param {
424 | num_output: 512
425 | pad: 1
426 | kernel_size: 3
427 | weight_filler {
428 | type: "xavier"
429 | }
430 | bias_filler {
431 | type: "constant"
432 | value: 0
433 | }
434 | }
435 | }
436 | layer {  # relu4_2: ReLU applied in place on "conv4_2".
437 | name: "relu4_2"
438 | type: "ReLU"
439 | bottom: "conv4_2"
440 | top: "conv4_2"
441 | }
442 | layer {  # conv4_3: 3x3 convolution on "conv4_2", 512 outputs, pad 1; its output also feeds the conv4_3_norm layer.
443 | name: "conv4_3"
444 | type: "Convolution"
445 | bottom: "conv4_2"
446 | top: "conv4_3"
447 | param {  # First param block: lr_mult 1, decay_mult 1.
448 | lr_mult: 1
449 | decay_mult: 1
450 | }
451 | param {  # Second param block: lr_mult 2, decay_mult 0.
452 | lr_mult: 2
453 | decay_mult: 0
454 | }
455 | convolution_param {
456 | num_output: 512
457 | pad: 1
458 | kernel_size: 3
459 | weight_filler {
460 | type: "xavier"
461 | }
462 | bias_filler {
463 | type: "constant"
464 | value: 0
465 | }
466 | }
467 | }
468 | layer {  # relu4_3: ReLU applied in place on "conv4_3".
469 | name: "relu4_3"
470 | type: "ReLU"
471 | bottom: "conv4_3"
472 | top: "conv4_3"
473 | }
474 | layer {  # pool4: 2x2 max pooling with stride 2 over "conv4_3".
475 | name: "pool4"
476 | type: "Pooling"
477 | bottom: "conv4_3"
478 | top: "pool4"
479 | pooling_param {
480 | pool: MAX
481 | kernel_size: 2
482 | stride: 2
483 | }
484 | }
485 | layer {  # conv5_1: 3x3 convolution on "pool4", 512 outputs, pad 1.
486 | name: "conv5_1"
487 | type: "Convolution"
488 | bottom: "pool4"
489 | top: "conv5_1"
490 | param {  # First param block: lr_mult 1, decay_mult 1.
491 | lr_mult: 1
492 | decay_mult: 1
493 | }
494 | param {  # Second param block: lr_mult 2, decay_mult 0.
495 | lr_mult: 2
496 | decay_mult: 0
497 | }
498 | convolution_param {
499 | num_output: 512
500 | pad: 1
501 | kernel_size: 3
502 | weight_filler {
503 | type: "xavier"
504 | }
505 | bias_filler {
506 | type: "constant"
507 | value: 0
508 | }
509 | }
510 | }
511 | layer {  # relu5_1: ReLU applied in place on "conv5_1".
512 | name: "relu5_1"
513 | type: "ReLU"
514 | bottom: "conv5_1"
515 | top: "conv5_1"
516 | }
517 | layer {  # conv5_2: 3x3 convolution on "conv5_1", 512 outputs, pad 1.
518 | name: "conv5_2"
519 | type: "Convolution"
520 | bottom: "conv5_1"
521 | top: "conv5_2"
522 | param {  # First param block: lr_mult 1, decay_mult 1.
523 | lr_mult: 1
524 | decay_mult: 1
525 | }
526 | param {  # Second param block: lr_mult 2, decay_mult 0.
527 | lr_mult: 2
528 | decay_mult: 0
529 | }
530 | convolution_param {
531 | num_output: 512
532 | pad: 1
533 | kernel_size: 3
534 | weight_filler {
535 | type: "xavier"
536 | }
537 | bias_filler {
538 | type: "constant"
539 | value: 0
540 | }
541 | }
542 | }
543 | layer {  # relu5_2: ReLU applied in place on "conv5_2".
544 | name: "relu5_2"
545 | type: "ReLU"
546 | bottom: "conv5_2"
547 | top: "conv5_2"
548 | }
549 | layer {  # conv5_3: 3x3 convolution on "conv5_2", 512 outputs, pad 1.
550 | name: "conv5_3"
551 | type: "Convolution"
552 | bottom: "conv5_2"
553 | top: "conv5_3"
554 | param {  # First param block: lr_mult 1, decay_mult 1.
555 | lr_mult: 1
556 | decay_mult: 1
557 | }
558 | param {  # Second param block: lr_mult 2, decay_mult 0.
559 | lr_mult: 2
560 | decay_mult: 0
561 | }
562 | convolution_param {
563 | num_output: 512
564 | pad: 1
565 | kernel_size: 3
566 | weight_filler {
567 | type: "xavier"
568 | }
569 | bias_filler {
570 | type: "constant"
571 | value: 0
572 | }
573 | }
574 | }
575 | layer {  # relu5_3: ReLU applied in place on "conv5_3".
576 | name: "relu5_3"
577 | type: "ReLU"
578 | bottom: "conv5_3"
579 | top: "conv5_3"
580 | }
581 | layer {  # pool5: 3x3 max pooling with stride 1 and pad 1 over "conv5_3" (unlike pool1-pool4, which use 2x2 / stride 2).
582 | name: "pool5"
583 | type: "Pooling"
584 | bottom: "conv5_3"
585 | top: "pool5"
586 | pooling_param {
587 | pool: MAX
588 | kernel_size: 3
589 | stride: 1
590 | pad: 1
591 | }
592 | }
593 | layer {  # fc6: despite the name, a Convolution layer - 3x3 kernel with dilation 6 and pad 6, 1024 outputs, on "pool5".
594 | name: "fc6"
595 | type: "Convolution"
596 | bottom: "pool5"
597 | top: "fc6"
598 | param {  # First param block: lr_mult 1, decay_mult 1.
599 | lr_mult: 1
600 | decay_mult: 1
601 | }
602 | param {  # Second param block: lr_mult 2, decay_mult 0.
603 | lr_mult: 2
604 | decay_mult: 0
605 | }
606 | convolution_param {
607 | num_output: 1024
608 | pad: 6  # Pad equals the dilation, matching the reach of a dilated 3x3 kernel on each side.
609 | kernel_size: 3
610 | weight_filler {
611 | type: "xavier"
612 | }
613 | bias_filler {
614 | type: "constant"
615 | value: 0
616 | }
617 | dilation: 6
618 | }
619 | }
620 | layer {  # relu6: ReLU applied in place on "fc6".
621 | name: "relu6"
622 | type: "ReLU"
623 | bottom: "fc6"
624 | top: "fc6"
625 | }
626 | layer {  # fc7: despite the name, a 1x1 Convolution with 1024 outputs on "fc6"; no pad is specified.
627 | name: "fc7"
628 | type: "Convolution"
629 | bottom: "fc6"
630 | top: "fc7"
631 | param {  # First param block: lr_mult 1, decay_mult 1.
632 | lr_mult: 1
633 | decay_mult: 1
634 | }
635 | param {  # Second param block: lr_mult 2, decay_mult 0.
636 | lr_mult: 2
637 | decay_mult: 0
638 | }
639 | convolution_param {
640 | num_output: 1024
641 | kernel_size: 1
642 | weight_filler {
643 | type: "xavier"
644 | }
645 | bias_filler {
646 | type: "constant"
647 | value: 0
648 | }
649 | }
650 | }
651 | layer {  # relu7: ReLU applied in place on "fc7".
652 | name: "relu7"
653 | type: "ReLU"
654 | bottom: "fc7"
655 | top: "fc7"
656 | }
657 | layer {  # conv6_1: 1x1 convolution on "fc7", 256 outputs, stride 1, pad 0.
658 | name: "conv6_1"
659 | type: "Convolution"
660 | bottom: "fc7"
661 | top: "conv6_1"
662 | param {  # First param block: lr_mult 1, decay_mult 1.
663 | lr_mult: 1
664 | decay_mult: 1
665 | }
666 | param {  # Second param block: lr_mult 2, decay_mult 0.
667 | lr_mult: 2
668 | decay_mult: 0
669 | }
670 | convolution_param {
671 | num_output: 256
672 | pad: 0
673 | kernel_size: 1
674 | stride: 1
675 | weight_filler {
676 | type: "xavier"
677 | }
678 | bias_filler {
679 | type: "constant"
680 | value: 0
681 | }
682 | }
683 | }
684 | layer {  # conv6_1_relu: ReLU applied in place on "conv6_1".
685 | name: "conv6_1_relu"
686 | type: "ReLU"
687 | bottom: "conv6_1"
688 | top: "conv6_1"
689 | }
690 | layer {  # conv6_2: 3x3 convolution on "conv6_1", 512 outputs, pad 1, stride 2 (downsamples).
691 | name: "conv6_2"
692 | type: "Convolution"
693 | bottom: "conv6_1"
694 | top: "conv6_2"
695 | param {  # First param block: lr_mult 1, decay_mult 1.
696 | lr_mult: 1
697 | decay_mult: 1
698 | }
699 | param {  # Second param block: lr_mult 2, decay_mult 0.
700 | lr_mult: 2
701 | decay_mult: 0
702 | }
703 | convolution_param {
704 | num_output: 512
705 | pad: 1
706 | kernel_size: 3
707 | stride: 2
708 | weight_filler {
709 | type: "xavier"
710 | }
711 | bias_filler {
712 | type: "constant"
713 | value: 0
714 | }
715 | }
716 | }
717 | layer {  # conv6_2_relu: ReLU applied in place on "conv6_2".
718 | name: "conv6_2_relu"
719 | type: "ReLU"
720 | bottom: "conv6_2"
721 | top: "conv6_2"
722 | }
723 | layer {  # conv7_1: 1x1 convolution on "conv6_2", 128 outputs, stride 1, pad 0.
724 | name: "conv7_1"
725 | type: "Convolution"
726 | bottom: "conv6_2"
727 | top: "conv7_1"
728 | param {  # First param block: lr_mult 1, decay_mult 1.
729 | lr_mult: 1
730 | decay_mult: 1
731 | }
732 | param {  # Second param block: lr_mult 2, decay_mult 0.
733 | lr_mult: 2
734 | decay_mult: 0
735 | }
736 | convolution_param {
737 | num_output: 128
738 | pad: 0
739 | kernel_size: 1
740 | stride: 1
741 | weight_filler {
742 | type: "xavier"
743 | }
744 | bias_filler {
745 | type: "constant"
746 | value: 0
747 | }
748 | }
749 | }
750 | layer {  # conv7_1_relu: ReLU applied in place on "conv7_1".
751 | name: "conv7_1_relu"
752 | type: "ReLU"
753 | bottom: "conv7_1"
754 | top: "conv7_1"
755 | }
756 | layer {  # conv7_2: 3x3 convolution on "conv7_1", 256 outputs, pad 1, stride 2 (downsamples).
757 | name: "conv7_2"
758 | type: "Convolution"
759 | bottom: "conv7_1"
760 | top: "conv7_2"
761 | param {  # First param block: lr_mult 1, decay_mult 1.
762 | lr_mult: 1
763 | decay_mult: 1
764 | }
765 | param {  # Second param block: lr_mult 2, decay_mult 0.
766 | lr_mult: 2
767 | decay_mult: 0
768 | }
769 | convolution_param {
770 | num_output: 256
771 | pad: 1
772 | kernel_size: 3
773 | stride: 2
774 | weight_filler {
775 | type: "xavier"
776 | }
777 | bias_filler {
778 | type: "constant"
779 | value: 0
780 | }
781 | }
782 | }
783 | layer {  # conv7_2_relu: ReLU applied in place on "conv7_2".
784 | name: "conv7_2_relu"
785 | type: "ReLU"
786 | bottom: "conv7_2"
787 | top: "conv7_2"
788 | }
789 | layer {  # conv8_1: 1x1 convolution on "conv7_2", 128 outputs, stride 1, pad 0.
790 | name: "conv8_1"
791 | type: "Convolution"
792 | bottom: "conv7_2"
793 | top: "conv8_1"
794 | param {  # First param block: lr_mult 1, decay_mult 1.
795 | lr_mult: 1
796 | decay_mult: 1
797 | }
798 | param {  # Second param block: lr_mult 2, decay_mult 0.
799 | lr_mult: 2
800 | decay_mult: 0
801 | }
802 | convolution_param {
803 | num_output: 128
804 | pad: 0
805 | kernel_size: 1
806 | stride: 1
807 | weight_filler {
808 | type: "xavier"
809 | }
810 | bias_filler {
811 | type: "constant"
812 | value: 0
813 | }
814 | }
815 | }
816 | layer {  # conv8_1_relu: ReLU applied in place on "conv8_1".
817 | name: "conv8_1_relu"
818 | type: "ReLU"
819 | bottom: "conv8_1"
820 | top: "conv8_1"
821 | }
822 | layer {  # conv8_2: 3x3 convolution on "conv8_1", 256 outputs, pad 1, stride 2 (downsamples).
823 | name: "conv8_2"
824 | type: "Convolution"
825 | bottom: "conv8_1"
826 | top: "conv8_2"
827 | param {  # First param block: lr_mult 1, decay_mult 1.
828 | lr_mult: 1
829 | decay_mult: 1
830 | }
831 | param {  # Second param block: lr_mult 2, decay_mult 0.
832 | lr_mult: 2
833 | decay_mult: 0
834 | }
835 | convolution_param {
836 | num_output: 256
837 | pad: 1
838 | kernel_size: 3
839 | stride: 2
840 | weight_filler {
841 | type: "xavier"
842 | }
843 | bias_filler {
844 | type: "constant"
845 | value: 0
846 | }
847 | }
848 | }
849 | layer {  # conv8_2_relu: ReLU applied in place on "conv8_2".
850 | name: "conv8_2_relu"
851 | type: "ReLU"
852 | bottom: "conv8_2"
853 | top: "conv8_2"
854 | }
855 | layer {  # pool6: global average pooling over "conv8_2" (no kernel size given; global_pooling is true).
856 | name: "pool6"
857 | type: "Pooling"
858 | bottom: "conv8_2"
859 | top: "pool6"
860 | pooling_param {
861 | pool: AVE
862 | global_pooling: true
863 | }
864 | }
865 | layer {  # conv4_3_norm: Normalize layer on "conv4_3"; its output feeds the conv4_3_norm_mbox_* layers.
866 | name: "conv4_3_norm"
867 | type: "Normalize"
868 | bottom: "conv4_3"
869 | top: "conv4_3_norm"
870 | norm_param {
871 | across_spatial: false  # Normalisation is not done across spatial positions.
872 | scale_filler {  # Scale values are initialised to the constant 20.
873 | type: "constant"
874 | value: 20
875 | }
876 | channel_shared: false  # The scale is not shared between channels.
877 | }
878 | }
879 | layer {  # conv4_3_norm_mbox_loc: 3x3 convolution on "conv4_3_norm" (pad 1, stride 1) producing 12 channels.
880 | name: "conv4_3_norm_mbox_loc"
881 | type: "Convolution"
882 | bottom: "conv4_3_norm"
883 | top: "conv4_3_norm_mbox_loc"
884 | param {  # First param block: lr_mult 1, decay_mult 1.
885 | lr_mult: 1
886 | decay_mult: 1
887 | }
888 | param {  # Second param block: lr_mult 2, decay_mult 0.
889 | lr_mult: 2
890 | decay_mult: 0
891 | }
892 | convolution_param {
893 | num_output: 12  # NOTE(review): presumably 3 priors per location x 4 box offsets; keep in sync with conv4_3_norm_mbox_priorbox - confirm.
894 | pad: 1
895 | kernel_size: 3
896 | stride: 1
897 | weight_filler {
898 | type: "xavier"
899 | }
900 | bias_filler {
901 | type: "constant"
902 | value: 0
903 | }
904 | }
905 | }
906 | layer {  # Permute with axis order 0, 2, 3, 1: the original axis 1 becomes the last axis.
907 | name: "conv4_3_norm_mbox_loc_perm"
908 | type: "Permute"
909 | bottom: "conv4_3_norm_mbox_loc"
910 | top: "conv4_3_norm_mbox_loc_perm"
911 | permute_param {
912 | order: 0
913 | order: 2
914 | order: 3
915 | order: 1
916 | }
917 | }
918 | layer {  # Flatten the permuted blob starting at axis 1.
919 | name: "conv4_3_norm_mbox_loc_flat"
920 | type: "Flatten"
921 | bottom: "conv4_3_norm_mbox_loc_perm"
922 | top: "conv4_3_norm_mbox_loc_flat"
923 | flatten_param {
924 | axis: 1
925 | }
926 | }
927 | layer {  # conv4_3_norm_mbox_conf: 3x3 convolution on "conv4_3_norm" (pad 1, stride 1) producing 6 channels.
928 | name: "conv4_3_norm_mbox_conf"
929 | type: "Convolution"
930 | bottom: "conv4_3_norm"
931 | top: "conv4_3_norm_mbox_conf"
932 | param {  # First param block: lr_mult 1, decay_mult 1.
933 | lr_mult: 1
934 | decay_mult: 1
935 | }
936 | param {  # Second param block: lr_mult 2, decay_mult 0.
937 | lr_mult: 2
938 | decay_mult: 0
939 | }
940 | convolution_param {
941 | num_output: 6  # NOTE(review): presumably 3 priors per location x 2 classes (background + text) - confirm against the label map.
942 | pad: 1
943 | kernel_size: 3
944 | stride: 1
945 | weight_filler {
946 | type: "xavier"
947 | }
948 | bias_filler {
949 | type: "constant"
950 | value: 0
951 | }
952 | }
953 | }
954 | layer {  # Permute with axis order 0, 2, 3, 1: the original axis 1 becomes the last axis.
955 | name: "conv4_3_norm_mbox_conf_perm"
956 | type: "Permute"
957 | bottom: "conv4_3_norm_mbox_conf"
958 | top: "conv4_3_norm_mbox_conf_perm"
959 | permute_param {
960 | order: 0
961 | order: 2
962 | order: 3
963 | order: 1
964 | }
965 | }
966 | layer {  # Flatten the permuted blob starting at axis 1.
967 | name: "conv4_3_norm_mbox_conf_flat"
968 | type: "Flatten"
969 | bottom: "conv4_3_norm_mbox_conf_perm"
970 | top: "conv4_3_norm_mbox_conf_flat"
971 | flatten_param {
972 | axis: 1
973 | }
974 | }
975 | layer {  # PriorBox for the conv4_3_norm head; takes the feature map and the "data" blob as inputs.
976 | name: "conv4_3_norm_mbox_priorbox"
977 | type: "PriorBox"
978 | bottom: "conv4_3_norm"
979 | bottom: "data"
980 | top: "conv4_3_norm_mbox_priorbox"
981 | prior_box_param {
982 | min_size: 30.0  # Only min_size is given here; this head sets no max_size, unlike the other PriorBox layers in this file.
983 | aspect_ratio: 2  # With flip: true the reciprocal ratio is presumably added as well - confirm in the PriorBox implementation.
984 | flip: true
985 | clip: true
986 | variance: 0.1  # Four variance values: 0.1, 0.1, 0.2, 0.2.
987 | variance: 0.1
988 | variance: 0.2
989 | variance: 0.2
990 | }
991 | }
992 | layer {  # fc7_mbox_loc: 3x3 convolution on "fc7" (pad 1, stride 1) producing 24 channels.
993 | name: "fc7_mbox_loc"
994 | type: "Convolution"
995 | bottom: "fc7"
996 | top: "fc7_mbox_loc"
997 | param {  # First param block: lr_mult 1, decay_mult 1.
998 | lr_mult: 1
999 | decay_mult: 1
1000 | }
1001 | param {  # Second param block: lr_mult 2, decay_mult 0.
1002 | lr_mult: 2
1003 | decay_mult: 0
1004 | }
1005 | convolution_param {
1006 | num_output: 24  # NOTE(review): presumably 6 priors per location x 4 box offsets; keep in sync with fc7_mbox_priorbox - confirm.
1007 | pad: 1
1008 | kernel_size: 3
1009 | stride: 1
1010 | weight_filler {
1011 | type: "xavier"
1012 | }
1013 | bias_filler {
1014 | type: "constant"
1015 | value: 0
1016 | }
1017 | }
1018 | }
1019 | layer {  # Permute with axis order 0, 2, 3, 1: the original axis 1 becomes the last axis.
1020 | name: "fc7_mbox_loc_perm"
1021 | type: "Permute"
1022 | bottom: "fc7_mbox_loc"
1023 | top: "fc7_mbox_loc_perm"
1024 | permute_param {
1025 | order: 0
1026 | order: 2
1027 | order: 3
1028 | order: 1
1029 | }
1030 | }
1031 | layer {  # Flatten the permuted blob starting at axis 1.
1032 | name: "fc7_mbox_loc_flat"
1033 | type: "Flatten"
1034 | bottom: "fc7_mbox_loc_perm"
1035 | top: "fc7_mbox_loc_flat"
1036 | flatten_param {
1037 | axis: 1
1038 | }
1039 | }
1040 | layer {  # fc7_mbox_conf: 3x3 convolution on "fc7" (pad 1, stride 1) producing 12 channels.
1041 | name: "fc7_mbox_conf"
1042 | type: "Convolution"
1043 | bottom: "fc7"
1044 | top: "fc7_mbox_conf"
1045 | param {  # First param block: lr_mult 1, decay_mult 1.
1046 | lr_mult: 1
1047 | decay_mult: 1
1048 | }
1049 | param {  # Second param block: lr_mult 2, decay_mult 0.
1050 | lr_mult: 2
1051 | decay_mult: 0
1052 | }
1053 | convolution_param {
1054 | num_output: 12  # NOTE(review): presumably 6 priors per location x 2 classes - confirm against the label map.
1055 | pad: 1
1056 | kernel_size: 3
1057 | stride: 1
1058 | weight_filler {
1059 | type: "xavier"
1060 | }
1061 | bias_filler {
1062 | type: "constant"
1063 | value: 0
1064 | }
1065 | }
1066 | }
1067 | layer {  # Permute with axis order 0, 2, 3, 1: the original axis 1 becomes the last axis.
1068 | name: "fc7_mbox_conf_perm"
1069 | type: "Permute"
1070 | bottom: "fc7_mbox_conf"
1071 | top: "fc7_mbox_conf_perm"
1072 | permute_param {
1073 | order: 0
1074 | order: 2
1075 | order: 3
1076 | order: 1
1077 | }
1078 | }
1079 | layer {  # Flatten the permuted blob starting at axis 1.
1080 | name: "fc7_mbox_conf_flat"
1081 | type: "Flatten"
1082 | bottom: "fc7_mbox_conf_perm"
1083 | top: "fc7_mbox_conf_flat"
1084 | flatten_param {
1085 | axis: 1
1086 | }
1087 | }
1088 | layer {  # PriorBox for the fc7 head; takes the feature map and the "data" blob as inputs.
1089 | name: "fc7_mbox_priorbox"
1090 | type: "PriorBox"
1091 | bottom: "fc7"
1092 | bottom: "data"
1093 | top: "fc7_mbox_priorbox"
1094 | prior_box_param {
1095 | min_size: 60.0  # Box size range for this head: 60 to 114.
1096 | max_size: 114.0
1097 | aspect_ratio: 2  # Ratios 2 and 3; with flip: true the reciprocals are presumably added as well - confirm.
1098 | aspect_ratio: 3
1099 | flip: true
1100 | clip: true
1101 | variance: 0.1  # Four variance values: 0.1, 0.1, 0.2, 0.2.
1102 | variance: 0.1
1103 | variance: 0.2
1104 | variance: 0.2
1105 | }
1106 | }
1107 | layer {  # conv6_2_mbox_loc: 3x3 convolution on "conv6_2" (pad 1, stride 1) producing 24 channels.
1108 | name: "conv6_2_mbox_loc"
1109 | type: "Convolution"
1110 | bottom: "conv6_2"
1111 | top: "conv6_2_mbox_loc"
1112 | param {  # First param block: lr_mult 1, decay_mult 1.
1113 | lr_mult: 1
1114 | decay_mult: 1
1115 | }
1116 | param {  # Second param block: lr_mult 2, decay_mult 0.
1117 | lr_mult: 2
1118 | decay_mult: 0
1119 | }
1120 | convolution_param {
1121 | num_output: 24  # NOTE(review): presumably 6 priors per location x 4 box offsets; keep in sync with conv6_2_mbox_priorbox - confirm.
1122 | pad: 1
1123 | kernel_size: 3
1124 | stride: 1
1125 | weight_filler {
1126 | type: "xavier"
1127 | }
1128 | bias_filler {
1129 | type: "constant"
1130 | value: 0
1131 | }
1132 | }
1133 | }
1134 | layer {  # Permute with axis order 0, 2, 3, 1: the original axis 1 becomes the last axis.
1135 | name: "conv6_2_mbox_loc_perm"
1136 | type: "Permute"
1137 | bottom: "conv6_2_mbox_loc"
1138 | top: "conv6_2_mbox_loc_perm"
1139 | permute_param {
1140 | order: 0
1141 | order: 2
1142 | order: 3
1143 | order: 1
1144 | }
1145 | }
1146 | layer {  # Flatten the permuted blob starting at axis 1.
1147 | name: "conv6_2_mbox_loc_flat"
1148 | type: "Flatten"
1149 | bottom: "conv6_2_mbox_loc_perm"
1150 | top: "conv6_2_mbox_loc_flat"
1151 | flatten_param {
1152 | axis: 1
1153 | }
1154 | }
1155 | layer {  # conv6_2_mbox_conf: 3x3 convolution on "conv6_2" (pad 1, stride 1) producing 12 channels.
1156 | name: "conv6_2_mbox_conf"
1157 | type: "Convolution"
1158 | bottom: "conv6_2"
1159 | top: "conv6_2_mbox_conf"
1160 | param {  # First param block: lr_mult 1, decay_mult 1.
1161 | lr_mult: 1
1162 | decay_mult: 1
1163 | }
1164 | param {  # Second param block: lr_mult 2, decay_mult 0.
1165 | lr_mult: 2
1166 | decay_mult: 0
1167 | }
1168 | convolution_param {
1169 | num_output: 12  # NOTE(review): presumably 6 priors per location x 2 classes - confirm against the label map.
1170 | pad: 1
1171 | kernel_size: 3
1172 | stride: 1
1173 | weight_filler {
1174 | type: "xavier"
1175 | }
1176 | bias_filler {
1177 | type: "constant"
1178 | value: 0
1179 | }
1180 | }
1181 | }
1182 | layer {  # Permute with axis order 0, 2, 3, 1: the original axis 1 becomes the last axis.
1183 | name: "conv6_2_mbox_conf_perm"
1184 | type: "Permute"
1185 | bottom: "conv6_2_mbox_conf"
1186 | top: "conv6_2_mbox_conf_perm"
1187 | permute_param {
1188 | order: 0
1189 | order: 2
1190 | order: 3
1191 | order: 1
1192 | }
1193 | }
1194 | layer {  # Flatten the permuted blob starting at axis 1.
1195 | name: "conv6_2_mbox_conf_flat"
1196 | type: "Flatten"
1197 | bottom: "conv6_2_mbox_conf_perm"
1198 | top: "conv6_2_mbox_conf_flat"
1199 | flatten_param {
1200 | axis: 1
1201 | }
1202 | }
1203 | layer {  # PriorBox for the conv6_2 head; takes the feature map and the "data" blob as inputs.
1204 | name: "conv6_2_mbox_priorbox"
1205 | type: "PriorBox"
1206 | bottom: "conv6_2"
1207 | bottom: "data"
1208 | top: "conv6_2_mbox_priorbox"
1209 | prior_box_param {
1210 | min_size: 114.0  # Box size range for this head: 114 to 168 (starts where the fc7 head's range ends).
1211 | max_size: 168.0
1212 | aspect_ratio: 2  # Ratios 2 and 3; with flip: true the reciprocals are presumably added as well - confirm.
1213 | aspect_ratio: 3
1214 | flip: true
1215 | clip: true
1216 | variance: 0.1  # Four variance values: 0.1, 0.1, 0.2, 0.2.
1217 | variance: 0.1
1218 | variance: 0.2
1219 | variance: 0.2
1220 | }
1221 | }
1222 | layer {  # conv7_2_mbox_loc: 3x3 convolution on "conv7_2" (pad 1, stride 1) producing 24 channels.
1223 | name: "conv7_2_mbox_loc"
1224 | type: "Convolution"
1225 | bottom: "conv7_2"
1226 | top: "conv7_2_mbox_loc"
1227 | param {  # First param block: lr_mult 1, decay_mult 1.
1228 | lr_mult: 1
1229 | decay_mult: 1
1230 | }
1231 | param {  # Second param block: lr_mult 2, decay_mult 0.
1232 | lr_mult: 2
1233 | decay_mult: 0
1234 | }
1235 | convolution_param {
1236 | num_output: 24  # NOTE(review): presumably 6 priors per location x 4 box offsets; keep in sync with conv7_2_mbox_priorbox - confirm.
1237 | pad: 1
1238 | kernel_size: 3
1239 | stride: 1
1240 | weight_filler {
1241 | type: "xavier"
1242 | }
1243 | bias_filler {
1244 | type: "constant"
1245 | value: 0
1246 | }
1247 | }
1248 | }
1249 | layer {  # Permute with axis order 0, 2, 3, 1: the original axis 1 becomes the last axis.
1250 | name: "conv7_2_mbox_loc_perm"
1251 | type: "Permute"
1252 | bottom: "conv7_2_mbox_loc"
1253 | top: "conv7_2_mbox_loc_perm"
1254 | permute_param {
1255 | order: 0
1256 | order: 2
1257 | order: 3
1258 | order: 1
1259 | }
1260 | }
1261 | layer {  # Flatten the permuted blob starting at axis 1.
1262 | name: "conv7_2_mbox_loc_flat"
1263 | type: "Flatten"
1264 | bottom: "conv7_2_mbox_loc_perm"
1265 | top: "conv7_2_mbox_loc_flat"
1266 | flatten_param {
1267 | axis: 1
1268 | }
1269 | }
1270 | layer {  # conv7_2_mbox_conf: 3x3 convolution on "conv7_2" (pad 1, stride 1) producing 12 channels.
1271 | name: "conv7_2_mbox_conf"
1272 | type: "Convolution"
1273 | bottom: "conv7_2"
1274 | top: "conv7_2_mbox_conf"
1275 | param {  # First param block: lr_mult 1, decay_mult 1.
1276 | lr_mult: 1
1277 | decay_mult: 1
1278 | }
1279 | param {  # Second param block: lr_mult 2, decay_mult 0.
1280 | lr_mult: 2
1281 | decay_mult: 0
1282 | }
1283 | convolution_param {
1284 | num_output: 12  # NOTE(review): presumably 6 priors per location x 2 classes - confirm against the label map.
1285 | pad: 1
1286 | kernel_size: 3
1287 | stride: 1
1288 | weight_filler {
1289 | type: "xavier"
1290 | }
1291 | bias_filler {
1292 | type: "constant"
1293 | value: 0
1294 | }
1295 | }
1296 | }
1297 | layer {  # Permute with axis order 0, 2, 3, 1: the original axis 1 becomes the last axis.
1298 | name: "conv7_2_mbox_conf_perm"
1299 | type: "Permute"
1300 | bottom: "conv7_2_mbox_conf"
1301 | top: "conv7_2_mbox_conf_perm"
1302 | permute_param {
1303 | order: 0
1304 | order: 2
1305 | order: 3
1306 | order: 1
1307 | }
1308 | }
1309 | layer {  # Flatten the permuted blob starting at axis 1.
1310 | name: "conv7_2_mbox_conf_flat"
1311 | type: "Flatten"
1312 | bottom: "conv7_2_mbox_conf_perm"
1313 | top: "conv7_2_mbox_conf_flat"
1314 | flatten_param {
1315 | axis: 1
1316 | }
1317 | }
1318 | layer {  # PriorBox for the conv7_2 head; takes the feature map and the "data" blob as inputs.
1319 | name: "conv7_2_mbox_priorbox"
1320 | type: "PriorBox"
1321 | bottom: "conv7_2"
1322 | bottom: "data"
1323 | top: "conv7_2_mbox_priorbox"
1324 | prior_box_param {
1325 | min_size: 168.0  # Box size range for this head: 168 to 222 (starts where the conv6_2 head's range ends).
1326 | max_size: 222.0
1327 | aspect_ratio: 2  # Ratios 2 and 3; with flip: true the reciprocals are presumably added as well - confirm.
1328 | aspect_ratio: 3
1329 | flip: true
1330 | clip: true
1331 | variance: 0.1  # Four variance values: 0.1, 0.1, 0.2, 0.2.
1332 | variance: 0.1
1333 | variance: 0.2
1334 | variance: 0.2
1335 | }
1336 | }
1337 | layer {  # conv8_2_mbox_loc: 3x3 convolution on "conv8_2" (pad 1, stride 1) producing 24 channels.
1338 | name: "conv8_2_mbox_loc"
1339 | type: "Convolution"
1340 | bottom: "conv8_2"
1341 | top: "conv8_2_mbox_loc"
1342 | param {  # First param block: lr_mult 1, decay_mult 1.
1343 | lr_mult: 1
1344 | decay_mult: 1
1345 | }
1346 | param {  # Second param block: lr_mult 2, decay_mult 0.
1347 | lr_mult: 2
1348 | decay_mult: 0
1349 | }
1350 | convolution_param {
1351 | num_output: 24  # NOTE(review): presumably 6 priors per location x 4 box offsets; keep in sync with conv8_2_mbox_priorbox - confirm.
1352 | pad: 1
1353 | kernel_size: 3
1354 | stride: 1
1355 | weight_filler {
1356 | type: "xavier"
1357 | }
1358 | bias_filler {
1359 | type: "constant"
1360 | value: 0
1361 | }
1362 | }
1363 | }
1364 | layer {  # Permute with axis order 0, 2, 3, 1: the original axis 1 becomes the last axis.
1365 | name: "conv8_2_mbox_loc_perm"
1366 | type: "Permute"
1367 | bottom: "conv8_2_mbox_loc"
1368 | top: "conv8_2_mbox_loc_perm"
1369 | permute_param {
1370 | order: 0
1371 | order: 2
1372 | order: 3
1373 | order: 1
1374 | }
1375 | }
1376 | layer {  # Flatten the permuted blob starting at axis 1.
1377 | name: "conv8_2_mbox_loc_flat"
1378 | type: "Flatten"
1379 | bottom: "conv8_2_mbox_loc_perm"
1380 | top: "conv8_2_mbox_loc_flat"
1381 | flatten_param {
1382 | axis: 1
1383 | }
1384 | }
1385 | layer {  # conv8_2_mbox_conf: 3x3 convolution on "conv8_2" (pad 1, stride 1) producing 12 channels.
1386 | name: "conv8_2_mbox_conf"
1387 | type: "Convolution"
1388 | bottom: "conv8_2"
1389 | top: "conv8_2_mbox_conf"
1390 | param {  # First param block: lr_mult 1, decay_mult 1.
1391 | lr_mult: 1
1392 | decay_mult: 1
1393 | }
1394 | param {  # Second param block: lr_mult 2, decay_mult 0.
1395 | lr_mult: 2
1396 | decay_mult: 0
1397 | }
1398 | convolution_param {
1399 | num_output: 12  # NOTE(review): presumably 6 priors per location x 2 classes - confirm against the label map.
1400 | pad: 1
1401 | kernel_size: 3
1402 | stride: 1
1403 | weight_filler {
1404 | type: "xavier"
1405 | }
1406 | bias_filler {
1407 | type: "constant"
1408 | value: 0
1409 | }
1410 | }
1411 | }
1412 | layer {  # Permute with axis order 0, 2, 3, 1: the original axis 1 becomes the last axis.
1413 | name: "conv8_2_mbox_conf_perm"
1414 | type: "Permute"
1415 | bottom: "conv8_2_mbox_conf"
1416 | top: "conv8_2_mbox_conf_perm"
1417 | permute_param {
1418 | order: 0
1419 | order: 2
1420 | order: 3
1421 | order: 1
1422 | }
1423 | }
1424 | layer {  # Flatten the permuted blob starting at axis 1.
1425 | name: "conv8_2_mbox_conf_flat"
1426 | type: "Flatten"
1427 | bottom: "conv8_2_mbox_conf_perm"
1428 | top: "conv8_2_mbox_conf_flat"
1429 | flatten_param {
1430 | axis: 1
1431 | }
1432 | }
1433 | layer {  # PriorBox for the conv8_2 head; takes the feature map and the "data" blob as inputs.
1434 | name: "conv8_2_mbox_priorbox"
1435 | type: "PriorBox"
1436 | bottom: "conv8_2"
1437 | bottom: "data"
1438 | top: "conv8_2_mbox_priorbox"
1439 | prior_box_param {
1440 | min_size: 222.0  # Box size range for this head: 222 to 276 (starts where the conv7_2 head's range ends).
1441 | max_size: 276.0
1442 | aspect_ratio: 2  # Ratios 2 and 3; with flip: true the reciprocals are presumably added as well - confirm.
1443 | aspect_ratio: 3
1444 | flip: true
1445 | clip: true
1446 | variance: 0.1  # Four variance values: 0.1, 0.1, 0.2, 0.2.
1447 | variance: 0.1
1448 | variance: 0.2
1449 | variance: 0.2
1450 | }
1451 | }
1452 | layer {  # 3x3 convolution on "pool6" whose output feeds (via permute + flatten) the "mbox_loc" Concat layer.
1453 | name: "pool6_mbox_loc"
1454 | type: "Convolution"
1455 | bottom: "pool6"
1456 | top: "pool6_mbox_loc"
1457 | param {  # first param block; presumably applies to the weights blob (Caffe lists weights before bias)
1458 | lr_mult: 1
1459 | decay_mult: 1
1460 | }
1461 | param {  # second param block; presumably the bias blob: 2x learning-rate multiplier, weight decay disabled
1462 | lr_mult: 2
1463 | decay_mult: 0
1464 | }
1465 | convolution_param {
1466 | num_output: 24  # NOTE(review): looks like 6 priors per location x 4 box coordinates - confirm
1467 | pad: 1  # pad 1 with kernel_size 3 and stride 1 keeps the spatial size of the input
1468 | kernel_size: 3
1469 | stride: 1
1470 | weight_filler {
1471 | type: "xavier"
1472 | }
1473 | bias_filler {  # bias initialised to the constant 0
1474 | type: "constant"
1475 | value: 0
1476 | }
1477 | }
1478 | }
1479 | layer {  # Reorders the axes of "pool6_mbox_loc" before it is flattened.
1480 | name: "pool6_mbox_loc_perm"
1481 | type: "Permute"
1482 | bottom: "pool6_mbox_loc"
1483 | top: "pool6_mbox_loc_perm"
1484 | permute_param {  # new axis order 0,2,3,1: old axis 1 moves to the last position (presumably N,C,H,W -> N,H,W,C; confirm blob layout)
1485 | order: 0
1486 | order: 2
1487 | order: 3
1488 | order: 1
1489 | }
1490 | }
1491 | layer {  # Flattens the permuted pool6 loc output; the result is one of the bottoms of the "mbox_loc" Concat layer.
1492 | name: "pool6_mbox_loc_flat"
1493 | type: "Flatten"
1494 | bottom: "pool6_mbox_loc_perm"
1495 | top: "pool6_mbox_loc_flat"
1496 | flatten_param {
1497 | axis: 1  # flattening starts at axis 1, so axis 0 is left untouched
1498 | }
1499 | }
1500 | layer {  # 3x3 convolution on "pool6" whose output feeds (via permute + flatten) the "mbox_conf" Concat layer.
1501 | name: "pool6_mbox_conf"
1502 | type: "Convolution"
1503 | bottom: "pool6"
1504 | top: "pool6_mbox_conf"
1505 | param {  # first param block; presumably applies to the weights blob (Caffe lists weights before bias)
1506 | lr_mult: 1
1507 | decay_mult: 1
1508 | }
1509 | param {  # second param block; presumably the bias blob: 2x learning-rate multiplier, weight decay disabled
1510 | lr_mult: 2
1511 | decay_mult: 0
1512 | }
1513 | convolution_param {
1514 | num_output: 12  # NOTE(review): looks like 6 priors per location x 2 classes (num_classes: 2 in "mbox_loss") - confirm
1515 | pad: 1  # pad 1 with kernel_size 3 and stride 1 keeps the spatial size of the input
1516 | kernel_size: 3
1517 | stride: 1
1518 | weight_filler {
1519 | type: "xavier"
1520 | }
1521 | bias_filler {  # bias initialised to the constant 0
1522 | type: "constant"
1523 | value: 0
1524 | }
1525 | }
1526 | }
1527 | layer {  # Reorders the axes of "pool6_mbox_conf" before it is flattened.
1528 | name: "pool6_mbox_conf_perm"
1529 | type: "Permute"
1530 | bottom: "pool6_mbox_conf"
1531 | top: "pool6_mbox_conf_perm"
1532 | permute_param {  # new axis order 0,2,3,1: old axis 1 moves to the last position (presumably N,C,H,W -> N,H,W,C; confirm blob layout)
1533 | order: 0
1534 | order: 2
1535 | order: 3
1536 | order: 1
1537 | }
1538 | }
1539 | layer {  # Flattens the permuted pool6 conf output; the result is one of the bottoms of the "mbox_conf" Concat layer.
1540 | name: "pool6_mbox_conf_flat"
1541 | type: "Flatten"
1542 | bottom: "pool6_mbox_conf_perm"
1543 | top: "pool6_mbox_conf_flat"
1544 | flatten_param {
1545 | axis: 1  # flattening starts at axis 1, so axis 0 is left untouched
1546 | }
1547 | }
1548 | layer {  # PriorBox layer for the "pool6" feature map; its output is one of the bottoms of the "mbox_priorbox" Concat layer.
1549 | name: "pool6_mbox_priorbox"
1550 | type: "PriorBox"
1551 | bottom: "pool6"
1552 | bottom: "data"
1553 | top: "pool6_mbox_priorbox"
1554 | prior_box_param {
1555 | min_size: 276.0  # same value as max_size of "conv8_2_mbox_priorbox"; presumably in input-image pixels - confirm
1556 | max_size: 330.0  # NOTE(review): exceeds 300; if the network input is 300x300 this relies on clip: true below - confirm
1557 | aspect_ratio: 2
1558 | aspect_ratio: 3
1559 | flip: true  # per SSD's PriorBoxParameter: also generate the reciprocal aspect ratios (1/2, 1/3)
1560 | clip: true  # per SSD's PriorBoxParameter: clip prior coordinates to stay within [0, 1]
1561 | variance: 0.1  # four variance values in total (0.1, 0.1, 0.2, 0.2); presumably one per encoded box coordinate
1562 | variance: 0.1
1563 | variance: 0.2
1564 | variance: 0.2
1565 | }
1566 | }
1567 | layer {  # Joins the six flattened *_mbox_loc_flat blobs into the single "mbox_loc" blob consumed by "mbox_loss".
1568 | name: "mbox_loc"
1569 | type: "Concat"
1570 | bottom: "conv4_3_norm_mbox_loc_flat"
1571 | bottom: "fc7_mbox_loc_flat"
1572 | bottom: "conv6_2_mbox_loc_flat"
1573 | bottom: "conv7_2_mbox_loc_flat"
1574 | bottom: "conv8_2_mbox_loc_flat"
1575 | bottom: "pool6_mbox_loc_flat"
1576 | top: "mbox_loc"
1577 | concat_param {
1578 | axis: 1  # concatenate along axis 1; bottom order here matches the order used in "mbox_conf" and "mbox_priorbox"
1579 | }
1580 | }
1581 | layer {  # Joins the six flattened *_mbox_conf_flat blobs into the single "mbox_conf" blob consumed by "mbox_loss".
1582 | name: "mbox_conf"
1583 | type: "Concat"
1584 | bottom: "conv4_3_norm_mbox_conf_flat"
1585 | bottom: "fc7_mbox_conf_flat"
1586 | bottom: "conv6_2_mbox_conf_flat"
1587 | bottom: "conv7_2_mbox_conf_flat"
1588 | bottom: "conv8_2_mbox_conf_flat"
1589 | bottom: "pool6_mbox_conf_flat"
1590 | top: "mbox_conf"
1591 | concat_param {
1592 | axis: 1  # concatenate along axis 1; bottom order here matches the order used in "mbox_loc" and "mbox_priorbox"
1593 | }
1594 | }
1595 | layer {  # Joins the six *_mbox_priorbox blobs into the single "mbox_priorbox" blob consumed by "mbox_loss".
1596 | name: "mbox_priorbox"
1597 | type: "Concat"
1598 | bottom: "conv4_3_norm_mbox_priorbox"
1599 | bottom: "fc7_mbox_priorbox"
1600 | bottom: "conv6_2_mbox_priorbox"
1601 | bottom: "conv7_2_mbox_priorbox"
1602 | bottom: "conv8_2_mbox_priorbox"
1603 | bottom: "pool6_mbox_priorbox"
1604 | top: "mbox_priorbox"
1605 | concat_param {
1606 | axis: 2  # unlike "mbox_loc"/"mbox_conf" (axis 1), prior boxes are concatenated along axis 2
1607 | }
1608 | }
1609 | layer {  # Training loss: MultiBoxLoss over the concatenated loc/conf predictions, the prior boxes and the labels.
1610 | name: "mbox_loss"
1611 | type: "MultiBoxLoss"
1612 | bottom: "mbox_loc"
1613 | bottom: "mbox_conf"
1614 | bottom: "mbox_priorbox"
1615 | bottom: "label"
1616 | top: "mbox_loss"
1617 | include {  # layer is only instantiated in the TRAIN phase
1618 | phase: TRAIN
1619 | }
1620 | propagate_down: true  # one flag per bottom, in order: "mbox_loc" receives gradients
1621 | propagate_down: true  # "mbox_conf" receives gradients
1622 | propagate_down: false  # "mbox_priorbox": no backpropagation
1623 | propagate_down: false  # "label": no backpropagation
1624 | loss_param {
1625 | normalization: VALID  # NOTE(review): in SSD's MultiBoxLoss this presumably normalizes by the number of matched priors - confirm
1626 | }
1627 | multibox_loss_param {
1628 | loc_loss_type: SMOOTH_L1
1629 | conf_loss_type: SOFTMAX
1630 | loc_weight: 1.0
1631 | num_classes: 2  # class count including the background class (background_label_id: 0 below)
1632 | share_location: true
1633 | match_type: PER_PREDICTION
1634 | overlap_threshold: 0.5
1635 | use_prior_for_matching: true
1636 | background_label_id: 0
1637 | use_difficult_gt: true
1638 | do_neg_mining: true
1639 | neg_pos_ratio: 3.0  # presumably caps mined negatives at 3 per positive match (only relevant with do_neg_mining: true) - confirm
1640 | neg_overlap: 0.5
1641 | code_type: CENTER_SIZE
1642 | }
1643 | }
1644 |
--------------------------------------------------------------------------------
/training_file/trainval.txt:
--------------------------------------------------------------------------------
1 | scenetext/JPEGImages/106.jpg scenetext/Annotations/106.xml
2 | scenetext/JPEGImages/203.jpg scenetext/Annotations/203.xml
3 | scenetext/JPEGImages/258.jpg scenetext/Annotations/258.xml
4 | scenetext/JPEGImages/122.jpg scenetext/Annotations/122.xml
5 | scenetext/JPEGImages/103.jpg scenetext/Annotations/103.xml
6 | scenetext/JPEGImages/213.jpg scenetext/Annotations/213.xml
7 | scenetext/JPEGImages/149.jpg scenetext/Annotations/149.xml
8 | scenetext/JPEGImages/161.jpg scenetext/Annotations/161.xml
9 | scenetext/JPEGImages/178.jpg scenetext/Annotations/178.xml
10 | scenetext/JPEGImages/163.jpg scenetext/Annotations/163.xml
11 | scenetext/JPEGImages/168.jpg scenetext/Annotations/168.xml
12 | scenetext/JPEGImages/132.jpg scenetext/Annotations/132.xml
13 | scenetext/JPEGImages/252.jpg scenetext/Annotations/252.xml
14 | scenetext/JPEGImages/198.jpg scenetext/Annotations/198.xml
15 | scenetext/JPEGImages/194.jpg scenetext/Annotations/194.xml
16 | scenetext/JPEGImages/256.jpg scenetext/Annotations/256.xml
17 | scenetext/JPEGImages/231.jpg scenetext/Annotations/231.xml
18 | scenetext/JPEGImages/181.jpg scenetext/Annotations/181.xml
19 | scenetext/JPEGImages/180.jpg scenetext/Annotations/180.xml
20 | scenetext/JPEGImages/222.jpg scenetext/Annotations/222.xml
21 | scenetext/JPEGImages/125.jpg scenetext/Annotations/125.xml
22 | scenetext/JPEGImages/124.jpg scenetext/Annotations/124.xml
23 | scenetext/JPEGImages/177.jpg scenetext/Annotations/177.xml
24 | scenetext/JPEGImages/201.jpg scenetext/Annotations/201.xml
25 | scenetext/JPEGImages/204.jpg scenetext/Annotations/204.xml
26 | scenetext/JPEGImages/228.jpg scenetext/Annotations/228.xml
27 | scenetext/JPEGImages/219.jpg scenetext/Annotations/219.xml
28 | scenetext/JPEGImages/241.jpg scenetext/Annotations/241.xml
29 | scenetext/JPEGImages/167.jpg scenetext/Annotations/167.xml
30 | scenetext/JPEGImages/130.jpg scenetext/Annotations/130.xml
31 | scenetext/JPEGImages/101.jpg scenetext/Annotations/101.xml
32 | scenetext/JPEGImages/221.jpg scenetext/Annotations/221.xml
33 | scenetext/JPEGImages/243.jpg scenetext/Annotations/243.xml
34 | scenetext/JPEGImages/195.jpg scenetext/Annotations/195.xml
35 | scenetext/JPEGImages/208.jpg scenetext/Annotations/208.xml
36 | scenetext/JPEGImages/138.jpg scenetext/Annotations/138.xml
37 | scenetext/JPEGImages/127.jpg scenetext/Annotations/127.xml
38 | scenetext/JPEGImages/154.jpg scenetext/Annotations/154.xml
39 | scenetext/JPEGImages/242.jpg scenetext/Annotations/242.xml
40 | scenetext/JPEGImages/109.jpg scenetext/Annotations/109.xml
41 | scenetext/JPEGImages/239.jpg scenetext/Annotations/239.xml
42 | scenetext/JPEGImages/144.jpg scenetext/Annotations/144.xml
43 | scenetext/JPEGImages/193.jpg scenetext/Annotations/193.xml
44 | scenetext/JPEGImages/253.jpg scenetext/Annotations/253.xml
45 | scenetext/JPEGImages/216.jpg scenetext/Annotations/216.xml
46 | scenetext/JPEGImages/234.jpg scenetext/Annotations/234.xml
47 | scenetext/JPEGImages/179.jpg scenetext/Annotations/179.xml
48 | scenetext/JPEGImages/137.jpg scenetext/Annotations/137.xml
49 | scenetext/JPEGImages/189.jpg scenetext/Annotations/189.xml
50 | scenetext/JPEGImages/209.jpg scenetext/Annotations/209.xml
51 | scenetext/JPEGImages/116.jpg scenetext/Annotations/116.xml
52 | scenetext/JPEGImages/196.jpg scenetext/Annotations/196.xml
53 | scenetext/JPEGImages/162.jpg scenetext/Annotations/162.xml
54 | scenetext/JPEGImages/174.jpg scenetext/Annotations/174.xml
55 | scenetext/JPEGImages/226.jpg scenetext/Annotations/226.xml
56 | scenetext/JPEGImages/245.jpg scenetext/Annotations/245.xml
57 | scenetext/JPEGImages/120.jpg scenetext/Annotations/120.xml
58 | scenetext/JPEGImages/155.jpg scenetext/Annotations/155.xml
59 | scenetext/JPEGImages/148.jpg scenetext/Annotations/148.xml
60 | scenetext/JPEGImages/248.jpg scenetext/Annotations/248.xml
61 | scenetext/JPEGImages/255.jpg scenetext/Annotations/255.xml
62 | scenetext/JPEGImages/224.jpg scenetext/Annotations/224.xml
63 | scenetext/JPEGImages/118.jpg scenetext/Annotations/118.xml
64 | scenetext/JPEGImages/107.jpg scenetext/Annotations/107.xml
65 | scenetext/JPEGImages/230.jpg scenetext/Annotations/230.xml
66 | scenetext/JPEGImages/172.jpg scenetext/Annotations/172.xml
67 | scenetext/JPEGImages/169.jpg scenetext/Annotations/169.xml
68 | scenetext/JPEGImages/218.jpg scenetext/Annotations/218.xml
69 | scenetext/JPEGImages/112.jpg scenetext/Annotations/112.xml
70 | scenetext/JPEGImages/229.jpg scenetext/Annotations/229.xml
71 | scenetext/JPEGImages/133.jpg scenetext/Annotations/133.xml
72 | scenetext/JPEGImages/176.jpg scenetext/Annotations/176.xml
73 | scenetext/JPEGImages/200.jpg scenetext/Annotations/200.xml
74 | scenetext/JPEGImages/146.jpg scenetext/Annotations/146.xml
75 | scenetext/JPEGImages/119.jpg scenetext/Annotations/119.xml
76 | scenetext/JPEGImages/182.jpg scenetext/Annotations/182.xml
77 | scenetext/JPEGImages/197.jpg scenetext/Annotations/197.xml
78 | scenetext/JPEGImages/142.jpg scenetext/Annotations/142.xml
79 | scenetext/JPEGImages/190.jpg scenetext/Annotations/190.xml
80 | scenetext/JPEGImages/184.jpg scenetext/Annotations/184.xml
81 | scenetext/JPEGImages/117.jpg scenetext/Annotations/117.xml
82 | scenetext/JPEGImages/114.jpg scenetext/Annotations/114.xml
83 | scenetext/JPEGImages/152.jpg scenetext/Annotations/152.xml
84 | scenetext/JPEGImages/140.jpg scenetext/Annotations/140.xml
85 | scenetext/JPEGImages/170.jpg scenetext/Annotations/170.xml
86 | scenetext/JPEGImages/166.jpg scenetext/Annotations/166.xml
87 | scenetext/JPEGImages/111.jpg scenetext/Annotations/111.xml
88 | scenetext/JPEGImages/157.jpg scenetext/Annotations/157.xml
89 | scenetext/JPEGImages/235.jpg scenetext/Annotations/235.xml
90 | scenetext/JPEGImages/223.jpg scenetext/Annotations/223.xml
91 | scenetext/JPEGImages/145.jpg scenetext/Annotations/145.xml
92 | scenetext/JPEGImages/187.jpg scenetext/Annotations/187.xml
93 | scenetext/JPEGImages/108.jpg scenetext/Annotations/108.xml
94 | scenetext/JPEGImages/110.jpg scenetext/Annotations/110.xml
95 | scenetext/JPEGImages/212.jpg scenetext/Annotations/212.xml
96 | scenetext/JPEGImages/247.jpg scenetext/Annotations/247.xml
97 | scenetext/JPEGImages/192.jpg scenetext/Annotations/192.xml
98 | scenetext/JPEGImages/175.jpg scenetext/Annotations/175.xml
99 | scenetext/JPEGImages/257.jpg scenetext/Annotations/257.xml
100 | scenetext/JPEGImages/100.jpg scenetext/Annotations/100.xml
101 | scenetext/JPEGImages/151.jpg scenetext/Annotations/151.xml
102 | scenetext/JPEGImages/171.jpg scenetext/Annotations/171.xml
103 | scenetext/JPEGImages/227.jpg scenetext/Annotations/227.xml
104 | scenetext/JPEGImages/237.jpg scenetext/Annotations/237.xml
105 | scenetext/JPEGImages/210.jpg scenetext/Annotations/210.xml
106 | scenetext/JPEGImages/250.jpg scenetext/Annotations/250.xml
107 | scenetext/JPEGImages/104.jpg scenetext/Annotations/104.xml
108 | scenetext/JPEGImages/160.jpg scenetext/Annotations/160.xml
109 | scenetext/JPEGImages/147.jpg scenetext/Annotations/147.xml
110 | scenetext/JPEGImages/136.jpg scenetext/Annotations/136.xml
111 | scenetext/JPEGImages/150.jpg scenetext/Annotations/150.xml
112 | scenetext/JPEGImages/158.jpg scenetext/Annotations/158.xml
113 | scenetext/JPEGImages/123.jpg scenetext/Annotations/123.xml
114 | scenetext/JPEGImages/202.jpg scenetext/Annotations/202.xml
115 | scenetext/JPEGImages/113.jpg scenetext/Annotations/113.xml
116 | scenetext/JPEGImages/236.jpg scenetext/Annotations/236.xml
117 | scenetext/JPEGImages/165.jpg scenetext/Annotations/165.xml
118 | scenetext/JPEGImages/139.jpg scenetext/Annotations/139.xml
119 | scenetext/JPEGImages/135.jpg scenetext/Annotations/135.xml
120 | scenetext/JPEGImages/215.jpg scenetext/Annotations/215.xml
121 | scenetext/JPEGImages/173.jpg scenetext/Annotations/173.xml
122 | scenetext/JPEGImages/164.jpg scenetext/Annotations/164.xml
123 | scenetext/JPEGImages/186.jpg scenetext/Annotations/186.xml
124 | scenetext/JPEGImages/131.jpg scenetext/Annotations/131.xml
125 | scenetext/JPEGImages/134.jpg scenetext/Annotations/134.xml
126 | scenetext/JPEGImages/188.jpg scenetext/Annotations/188.xml
127 | scenetext/JPEGImages/217.jpg scenetext/Annotations/217.xml
128 | scenetext/JPEGImages/115.jpg scenetext/Annotations/115.xml
129 | scenetext/JPEGImages/205.jpg scenetext/Annotations/205.xml
130 | scenetext/JPEGImages/191.jpg scenetext/Annotations/191.xml
131 | scenetext/JPEGImages/246.jpg scenetext/Annotations/246.xml
132 | scenetext/JPEGImages/254.jpg scenetext/Annotations/254.xml
133 | scenetext/JPEGImages/105.jpg scenetext/Annotations/105.xml
134 | scenetext/JPEGImages/126.jpg scenetext/Annotations/126.xml
135 | scenetext/JPEGImages/153.jpg scenetext/Annotations/153.xml
136 | scenetext/JPEGImages/185.jpg scenetext/Annotations/185.xml
137 | scenetext/JPEGImages/225.jpg scenetext/Annotations/225.xml
138 | scenetext/JPEGImages/220.jpg scenetext/Annotations/220.xml
139 | scenetext/JPEGImages/199.jpg scenetext/Annotations/199.xml
140 | scenetext/JPEGImages/129.jpg scenetext/Annotations/129.xml
141 | scenetext/JPEGImages/156.jpg scenetext/Annotations/156.xml
142 | scenetext/JPEGImages/233.jpg scenetext/Annotations/233.xml
143 | scenetext/JPEGImages/143.jpg scenetext/Annotations/143.xml
144 | scenetext/JPEGImages/141.jpg scenetext/Annotations/141.xml
145 | scenetext/JPEGImages/206.jpg scenetext/Annotations/206.xml
146 | scenetext/JPEGImages/244.jpg scenetext/Annotations/244.xml
147 | scenetext/JPEGImages/128.jpg scenetext/Annotations/128.xml
148 | scenetext/JPEGImages/214.jpg scenetext/Annotations/214.xml
149 | scenetext/JPEGImages/102.jpg scenetext/Annotations/102.xml
150 | scenetext/JPEGImages/238.jpg scenetext/Annotations/238.xml
151 | scenetext/JPEGImages/211.jpg scenetext/Annotations/211.xml
152 | scenetext/JPEGImages/232.jpg scenetext/Annotations/232.xml
153 | scenetext/JPEGImages/183.jpg scenetext/Annotations/183.xml
154 | scenetext/JPEGImages/159.jpg scenetext/Annotations/159.xml
155 | scenetext/JPEGImages/249.jpg scenetext/Annotations/249.xml
156 | scenetext/JPEGImages/251.jpg scenetext/Annotations/251.xml
157 | scenetext/JPEGImages/207.jpg scenetext/Annotations/207.xml
158 | scenetext/JPEGImages/121.jpg scenetext/Annotations/121.xml
159 | scenetext/JPEGImages/240.jpg scenetext/Annotations/240.xml
160 |
--------------------------------------------------------------------------------