├── py_examples ├── utils │ ├── __init__.py │ └── app_utils.py ├── yolo_example.py └── object_detection_app.py ├── images ├── dog.jpg ├── camera.png └── yolo_dog.png ├── LICENSE ├── README.md ├── create_yolo_caffemodel.py ├── prototxt ├── yolo_tiny_deploy.prototxt ├── yolo_deploy.prototxt ├── yolo_small_deploy.prototxt ├── yolo_tiny_train_val.prototxt ├── yolo_train_val.prototxt └── yolo_small_train_val.prototxt ├── yolo_main.py └── create_yolo_prototxt.py /py_examples/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudovskiy/yoloNCS/HEAD/images/dog.jpg -------------------------------------------------------------------------------- /images/camera.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudovskiy/yoloNCS/HEAD/images/camera.png -------------------------------------------------------------------------------- /images/yolo_dog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gudovskiy/yoloNCS/HEAD/images/yolo_dog.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 gudovskiy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, 
subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # YOLO for Intel/Movidius Neural Compute Stick (NCS) 2 | 3 | ## News 4 | 5 | * Camera App is working. 6 | * YOLOv1 Tiny is working. 7 | 8 | ## Protobuf Model files 9 | 10 | ./prototxt/ 11 | 12 | ## Download Pretrained Caffe Models to ./weights/ 13 | 14 | * YOLO_tiny: https://drive.google.com/file/d/0Bzy9LxvTYIgKNFEzOEdaZ3U0Nms/view?usp=sharing 15 | 16 | ## Compilation 17 | 18 | * Compile the .prototxt and the corresponding .caffemodel (with the same name) to get the NCS graph file. For example: "mvNCCompile prototxt/yolo_tiny_deploy.prototxt -w weights/yolo_tiny_deploy.caffemodel -s 12" 19 | * The compiled binary file "graph" has to be in the main folder after this step. 20 | 21 | ## Single Image Script 22 | 23 | * Run "yolo_example.py" to process a single image. For example: "python3 py_examples/yolo_example.py images/dog.jpg" to get detections as below. 24 | 25 | ![](/images/yolo_dog.png) 26 | 27 | ## Camera Input Script 28 | 29 | * Run "object_detection_app.py" to process video from your camera. For example: "python3 py_examples/object_detection_app.py" to get camera detections as below. 30 | * Modify script arguments if needed. 
31 | * Press "q" to exit app. 32 | 33 | ![](/images/camera.png) 34 | -------------------------------------------------------------------------------- /create_yolo_caffemodel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 29 16:10:21 2016 4 | 5 | @author: xingw 6 | """ 7 | import sys,os 8 | caffe_root = os.environ["CAFFE_ROOT"] 9 | os.chdir(caffe_root) 10 | print caffe_root 11 | sys.path.insert(0, caffe_root + '/python') 12 | import caffe 13 | import numpy as np 14 | import sys, getopt 15 | 16 | def main(argv): 17 | model_filename = '' 18 | yoloweight_filename = '' 19 | caffemodel_filename = '' 20 | try: 21 | opts, args = getopt.getopt(argv, "hm:w:o:") 22 | print opts 23 | except getopt.GetoptError: 24 | print 'create_yolo_caffemodel.py -m -w -o ' 25 | sys.exit(2) 26 | for opt, arg in opts: 27 | if opt == '-h': 28 | print 'create_yolo_caffemodel.py -m -w -o ' 29 | sys.exit() 30 | elif opt == "-m": 31 | model_filename = arg 32 | elif opt == "-w": 33 | yoloweight_filename = arg 34 | elif opt == "-o": 35 | caffemodel_filename = arg 36 | 37 | print 'model file is ', model_filename 38 | print 'weight file is ', yoloweight_filename 39 | print 'output caffemodel file is ', caffemodel_filename 40 | net = caffe.Net(model_filename, caffe.TEST) 41 | params = net.params.keys() 42 | 43 | # read weights from file and assign to the network 44 | netWeightsInt = np.fromfile(yoloweight_filename, dtype=np.int32) 45 | transFlag = (netWeightsInt[0]>1000 or netWeightsInt[1]>1000) # transpose flag, the first 4 entries are major, minor, revision and net.seen 46 | print transFlag 47 | 48 | netWeightsFloat = np.fromfile(yoloweight_filename, dtype=np.float32) 49 | netWeights = netWeightsFloat[4:] # start from the 5th entry, the first 4 entries are major, minor, revision and net.seen 50 | print netWeights.shape 51 | count = 0 52 | for pr in params: 53 | lidx = list(net._layer_names).index(pr) 54 | 
layer = net.layers[lidx] 55 | print layer,transFlag 56 | if count == netWeights.shape[0] and (layer.type != 'BatchNorm' and layer.type != 'Scale'): 57 | print "WARNING: no weights left for %s" % pr 58 | break 59 | if layer.type == 'Convolution': 60 | print pr+"(conv)" 61 | # bias 62 | if len(net.params[pr]) > 1: 63 | bias_dim = net.params[pr][1].data.shape 64 | else: 65 | bias_dim = (net.params[pr][0].data.shape[0], ) 66 | biasSize = np.prod(bias_dim) 67 | conv_bias = np.reshape(netWeights[count:count+biasSize], bias_dim) 68 | if len(net.params[pr]) > 1: 69 | assert(bias_dim == net.params[pr][1].data.shape) 70 | net.params[pr][1].data[...] = conv_bias 71 | conv_bias = None 72 | count = count + biasSize 73 | # batch_norm 74 | next_layer = net.layers[lidx+1] 75 | if next_layer.type == 'BatchNorm': 76 | bn_dims = (3, net.params[pr][0].data.shape[0]) 77 | bnSize = np.prod(bn_dims) 78 | batch_norm = np.reshape(netWeights[count:count+bnSize], bn_dims) 79 | count = count + bnSize 80 | # weights 81 | dims = net.params[pr][0].data.shape 82 | weightSize = np.prod(dims) 83 | net.params[pr][0].data[...] = np.reshape(netWeights[count:count+weightSize], dims) 84 | count = count + weightSize 85 | elif layer.type == 'InnerProduct': 86 | print pr+"(fc)" 87 | # bias 88 | biasSize = np.prod(net.params[pr][1].data.shape) 89 | net.params[pr][1].data[...] = np.reshape(netWeights[count:count+biasSize], net.params[pr][1].data.shape) 90 | count = count + biasSize 91 | # weights 92 | dims = net.params[pr][0].data.shape 93 | weightSize = np.prod(dims) 94 | if transFlag: 95 | net.params[pr][0].data[...] = np.reshape(netWeights[count:count+weightSize], (dims[1], dims[0])).transpose() 96 | else: 97 | print dims, count, weightSize, netWeights.shape 98 | net.params[pr][0].data[...] = np.reshape(netWeights[count:count+weightSize], dims) 99 | count = count + weightSize 100 | elif layer.type == 'BatchNorm': 101 | print pr+"(batchnorm)" 102 | net.params[pr][0].data[...] 
= batch_norm[1] # mean 103 | net.params[pr][1].data[...] = batch_norm[2] # variance 104 | net.params[pr][2].data[...] = 1.0 # scale factor 105 | elif layer.type == 'Scale': 106 | print pr+"(scale)" 107 | net.params[pr][0].data[...] = batch_norm[0] # scale 108 | batch_norm = None 109 | if len(net.params[pr]) > 1: 110 | net.params[pr][1].data[...] = conv_bias # bias 111 | conv_bias = None 112 | else: 113 | print "WARNING: unsupported layer, "+pr 114 | if np.prod(netWeights.shape) != count: 115 | print "ERROR: size mismatch: %d" % count 116 | net.save(caffemodel_filename) 117 | 118 | if __name__=='__main__': 119 | main(sys.argv[1:]) 120 | -------------------------------------------------------------------------------- /prototxt/yolo_tiny_deploy.prototxt: -------------------------------------------------------------------------------- 1 | name: "YOLONet" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 448 7 | dim: 448 8 | } 9 | 10 | layer { 11 | name: "conv1" 12 | type: "Convolution" 13 | bottom: "data" 14 | top: "conv1" 15 | convolution_param { 16 | num_output: 16 17 | kernel_size: 3 18 | pad: 1 19 | stride: 1 20 | } 21 | } 22 | layer { 23 | name: "relu1" 24 | type: "ReLU" 25 | bottom: "conv1" 26 | top: "conv1" 27 | relu_param{ 28 | negative_slope: 0.1 29 | } 30 | } 31 | layer { 32 | name: "pool1" 33 | type: "Pooling" 34 | bottom: "conv1" 35 | top: "pool1" 36 | pooling_param { 37 | pool: MAX 38 | kernel_size: 2 39 | stride: 2 40 | } 41 | } 42 | layer{ 43 | name: "conv2" 44 | type: "Convolution" 45 | bottom: "pool1" 46 | top: "conv2" 47 | convolution_param { 48 | num_output: 32 49 | kernel_size: 3 50 | pad: 1 51 | stride: 1 52 | } 53 | } 54 | layer { 55 | name: "relu2" 56 | type: "ReLU" 57 | bottom: "conv2" 58 | top: "conv2" 59 | relu_param{ 60 | negative_slope: 0.1 61 | } 62 | } 63 | layer { 64 | name: "pool2" 65 | type: "Pooling" 66 | bottom: "conv2" 67 | top: "pool2" 68 | pooling_param { 69 | pool: MAX 70 | kernel_size: 2 71 | stride: 2 72 | } 73 
| } 74 | layer{ 75 | name: "conv3" 76 | type: "Convolution" 77 | bottom: "pool2" 78 | top: "conv3" 79 | convolution_param { 80 | num_output: 64 81 | kernel_size: 3 82 | pad: 1 83 | stride: 1 84 | } 85 | } 86 | layer { 87 | name: "relu3" 88 | type: "ReLU" 89 | bottom: "conv3" 90 | top: "conv3" 91 | relu_param{ 92 | negative_slope: 0.1 93 | } 94 | } 95 | layer{ 96 | name: "pool3" 97 | type: "Pooling" 98 | bottom: "conv3" 99 | top: "pool3" 100 | pooling_param { 101 | pool: MAX 102 | kernel_size: 2 103 | stride: 2 104 | } 105 | } 106 | layer{ 107 | name: "conv4" 108 | type: "Convolution" 109 | bottom: "pool3" 110 | top: "conv4" 111 | convolution_param { 112 | num_output: 128 113 | kernel_size: 3 114 | pad: 1 115 | stride: 1 116 | } 117 | } 118 | layer { 119 | name: "relu4" 120 | type: "ReLU" 121 | bottom: "conv4" 122 | top: "conv4" 123 | relu_param{ 124 | negative_slope: 0.1 125 | } 126 | } 127 | layer { 128 | name: "pool4" 129 | type: "Pooling" 130 | bottom: "conv4" 131 | top: "pool4" 132 | pooling_param { 133 | pool: MAX 134 | kernel_size: 2 135 | stride: 2 136 | } 137 | } 138 | layer{ 139 | name: "conv5" 140 | type: "Convolution" 141 | bottom: "pool4" 142 | top: "conv5" 143 | convolution_param { 144 | num_output: 256 145 | kernel_size: 3 146 | pad: 1 147 | stride: 1 148 | } 149 | } 150 | layer { 151 | name: "relu5" 152 | type: "ReLU" 153 | bottom: "conv5" 154 | top: "conv5" 155 | relu_param{ 156 | negative_slope: 0.1 157 | } 158 | } 159 | layer { 160 | name: "pool5" 161 | type: "Pooling" 162 | bottom: "conv5" 163 | top: "pool5" 164 | pooling_param { 165 | pool: MAX 166 | kernel_size: 2 167 | stride: 2 168 | } 169 | } 170 | layer{ 171 | name: "conv6" 172 | type: "Convolution" 173 | bottom: "pool5" 174 | top: "conv6" 175 | convolution_param { 176 | num_output: 512 177 | kernel_size: 3 178 | pad: 1 179 | stride: 1 180 | } 181 | } 182 | layer { 183 | name: "relu6" 184 | type: "ReLU" 185 | bottom: "conv6" 186 | top: "conv6" 187 | relu_param{ 188 | negative_slope: 0.1 189 
| } 190 | } 191 | layer { 192 | name: "pool6" 193 | type: "Pooling" 194 | bottom: "conv6" 195 | top: "pool6" 196 | pooling_param { 197 | pool: MAX 198 | kernel_size: 2 199 | stride: 2 200 | } 201 | } 202 | layer{ 203 | name: "conv7" 204 | type: "Convolution" 205 | bottom: "pool6" 206 | top: "conv7" 207 | convolution_param { 208 | num_output: 1024 209 | pad: 1 210 | kernel_size: 3 211 | stride: 1 212 | } 213 | } 214 | layer { 215 | name: "relu7" 216 | type: "ReLU" 217 | bottom: "conv7" 218 | top: "conv7" 219 | relu_param{ 220 | negative_slope: 0.1 221 | } 222 | } 223 | layer{ 224 | name: "conv8" 225 | type: "Convolution" 226 | bottom: "conv7" 227 | top: "conv8" 228 | convolution_param { 229 | num_output: 1024 230 | kernel_size: 3 231 | pad: 1 232 | stride: 1 233 | } 234 | } 235 | layer { 236 | name: "relu8" 237 | type: "ReLU" 238 | bottom: "conv8" 239 | top: "conv8" 240 | relu_param{ 241 | negative_slope: 0.1 242 | } 243 | } 244 | layer{ 245 | name: "conv9" 246 | type: "Convolution" 247 | bottom: "conv8" 248 | top: "conv9" 249 | convolution_param { 250 | num_output: 1024 251 | kernel_size: 3 252 | pad: 1 253 | stride: 1 254 | } 255 | } 256 | layer { 257 | name: "relu9" 258 | type: "ReLU" 259 | bottom: "conv9" 260 | top: "conv9" 261 | relu_param{ 262 | negative_slope: 0.1 263 | } 264 | } 265 | layer{ 266 | name: "fc10" 267 | type: "InnerProduct" 268 | bottom: "conv9" 269 | top: "fc10" 270 | inner_product_param { 271 | num_output: 256 272 | } 273 | } 274 | layer { 275 | name: "fc11" 276 | type: "InnerProduct" 277 | bottom: "fc10" 278 | top: "fc11" 279 | inner_product_param { 280 | num_output: 4096 281 | } 282 | } 283 | layer { 284 | name: "relu11" 285 | type: "ReLU" 286 | bottom: "fc11" 287 | top: "fc11" 288 | relu_param{ 289 | negative_slope: 0.1 290 | } 291 | } 292 | 293 | layer { 294 | name: "fc12" 295 | type: "InnerProduct" 296 | bottom: "fc11" 297 | top: "fc12" 298 | inner_product_param { 299 | num_output: 1470 300 | } 301 | } 302 | 
-------------------------------------------------------------------------------- /py_examples/yolo_example.py: -------------------------------------------------------------------------------- 1 | from mvnc import mvncapi as mvnc 2 | import sys,os,time,csv,getopt,cv2 3 | import numpy as np 4 | from datetime import datetime 5 | from skimage.transform import resize 6 | 7 | def interpret_output(output, img_width, img_height): 8 | classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train","tvmonitor"] 9 | w_img = img_width 10 | h_img = img_height 11 | print ((w_img, h_img)) 12 | threshold = 0.2 13 | iou_threshold = 0.5 14 | num_class = 20 15 | num_box = 2 16 | grid_size = 7 17 | probs = np.zeros((7,7,2,20)) 18 | class_probs = (np.reshape(output[0:980],(7,7,20)))#.copy() 19 | #print(class_probs) 20 | scales = (np.reshape(output[980:1078],(7,7,2)))#.copy() 21 | #print(scales) 22 | boxes = (np.reshape(output[1078:],(7,7,2,4)))#.copy() 23 | offset = np.transpose(np.reshape(np.array([np.arange(7)]*14),(2,7,7)),(1,2,0)) 24 | #boxes.setflags(write=1) 25 | boxes[:,:,:,0] += offset 26 | boxes[:,:,:,1] += np.transpose(offset,(1,0,2)) 27 | boxes[:,:,:,0:2] = boxes[:,:,:,0:2] / 7.0 28 | boxes[:,:,:,2] = np.multiply(boxes[:,:,:,2],boxes[:,:,:,2]) 29 | boxes[:,:,:,3] = np.multiply(boxes[:,:,:,3],boxes[:,:,:,3]) 30 | 31 | boxes[:,:,:,0] *= w_img 32 | boxes[:,:,:,1] *= h_img 33 | boxes[:,:,:,2] *= w_img 34 | boxes[:,:,:,3] *= h_img 35 | 36 | for i in range(2): 37 | for j in range(20): 38 | probs[:,:,i,j] = np.multiply(class_probs[:,:,j],scales[:,:,i]) 39 | #print (probs) 40 | filter_mat_probs = np.array(probs>=threshold,dtype='bool') 41 | filter_mat_boxes = np.nonzero(filter_mat_probs) 42 | boxes_filtered = boxes[filter_mat_boxes[0],filter_mat_boxes[1],filter_mat_boxes[2]] 43 | probs_filtered = probs[filter_mat_probs] 44 | classes_num_filtered = 
np.argmax(probs,axis=3)[filter_mat_boxes[0],filter_mat_boxes[1],filter_mat_boxes[2]] 45 | 46 | argsort = np.array(np.argsort(probs_filtered))[::-1] 47 | boxes_filtered = boxes_filtered[argsort] 48 | probs_filtered = probs_filtered[argsort] 49 | classes_num_filtered = classes_num_filtered[argsort] 50 | 51 | for i in range(len(boxes_filtered)): 52 | if probs_filtered[i] == 0 : continue 53 | for j in range(i+1,len(boxes_filtered)): 54 | if iou(boxes_filtered[i],boxes_filtered[j]) > iou_threshold : 55 | probs_filtered[j] = 0.0 56 | 57 | filter_iou = np.array(probs_filtered>0.0,dtype='bool') 58 | boxes_filtered = boxes_filtered[filter_iou] 59 | probs_filtered = probs_filtered[filter_iou] 60 | classes_num_filtered = classes_num_filtered[filter_iou] 61 | 62 | result = [] 63 | for i in range(len(boxes_filtered)): 64 | result.append([classes[classes_num_filtered[i]],boxes_filtered[i][0],boxes_filtered[i][1],boxes_filtered[i][2],boxes_filtered[i][3],probs_filtered[i]]) 65 | 66 | return result 67 | 68 | def iou(box1,box2): 69 | tb = min(box1[0]+0.5*box1[2],box2[0]+0.5*box2[2])-max(box1[0]-0.5*box1[2],box2[0]-0.5*box2[2]) 70 | lr = min(box1[1]+0.5*box1[3],box2[1]+0.5*box2[3])-max(box1[1]-0.5*box1[3],box2[1]-0.5*box2[3]) 71 | if tb < 0 or lr < 0 : intersection = 0 72 | else : intersection = tb*lr 73 | return intersection / (box1[2]*box1[3] + box2[2]*box2[3] - intersection) 74 | 75 | 76 | def show_results(img, results, img_width, img_height): 77 | img_cp = img.copy() 78 | disp_console = True 79 | imshow = True 80 | # if self.filewrite_txt : 81 | # ftxt = open(self.tofile_txt,'w') 82 | for i in range(len(results)): 83 | x = int(results[i][1]) 84 | y = int(results[i][2]) 85 | w = int(results[i][3])//2 86 | h = int(results[i][4])//2 87 | if disp_console : print (' class : ' + results[i][0] + ' , [x,y,w,h]=[' + str(x) + ',' + str(y) + ',' + str(int(results[i][3])) + ',' + str(int(results[i][4]))+'], Confidence = ' + str(results[i][5]) ) 88 | xmin = x-w 89 | xmax = x+w 90 | ymin = 
y-h 91 | ymax = y+h 92 | if xmin<0: 93 | xmin = 0 94 | if ymin<0: 95 | ymin = 0 96 | if xmax>img_width: 97 | xmax = img_width 98 | if ymax>img_height: 99 | ymax = img_height 100 | if imshow: 101 | cv2.rectangle(img_cp,(xmin,ymin),(xmax,ymax),(0,255,0),2) 102 | #print ((xmin, ymin, xmax, ymax)) 103 | cv2.rectangle(img_cp,(xmin,ymin-20),(xmax,ymin),(125,125,125),-1) 104 | cv2.putText(img_cp,results[i][0] + ' : %.2f' % results[i][5],(xmin+5,ymin-7),cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,0),1) 105 | if imshow : 106 | cv2.imshow('YOLO detection',img_cp) 107 | cv2.waitKey(1000) 108 | 109 | if len(sys.argv) != 2: 110 | print ("YOLOv1 Tiny example: python3 py_examples/yolo_example.py images/dog.jpg") 111 | sys.exit() 112 | 113 | network_blob='graph' 114 | # configuration NCS 115 | mvnc.SetGlobalOption(mvnc.GlobalOption.LOG_LEVEL, 2) 116 | devices = mvnc.EnumerateDevices() 117 | if len(devices) == 0: 118 | print('No devices found') 119 | quit() 120 | device = mvnc.Device(devices[0]) 121 | device.OpenDevice() 122 | opt = device.GetDeviceOption(mvnc.DeviceOption.OPTIMISATION_LIST) 123 | # load blob 124 | with open(network_blob, mode='rb') as f: 125 | blob = f.read() 126 | graph = device.AllocateGraph(blob) 127 | graph.SetGraphOption(mvnc.GraphOption.ITERATIONS, 1) 128 | iterations = graph.GetGraphOption(mvnc.GraphOption.ITERATIONS) 129 | # image preprocess 130 | dim=(448,448) 131 | img = cv2.imread(sys.argv[1]) 132 | im = resize(img.copy()/255.0,dim,1) 133 | #im = cv2.cvtColor(im, cv2.COLOR_RGB2BGR) 134 | im = im[:,:,(2,1,0)] 135 | #print('NEW shape:',im.shape) 136 | #print(img[0,0,:],im[0,0,:]) 137 | start = datetime.now() 138 | # start MOD 139 | graph.LoadTensor(im.astype(np.float16), 'user object') 140 | out, userobj = graph.GetResult() 141 | # 142 | end = datetime.now() 143 | elapsedTime = end-start 144 | print ('total time is " milliseconds', elapsedTime.total_seconds()*1000) 145 | results = interpret_output(out.astype(np.float32), img.shape[1], img.shape[0]) # fc27 instead 
of fc12 for yolo_small 146 | #print (results) 147 | #cv2.imshow('YOLO detection',img_cv) 148 | show_results(img, results, img.shape[1], img.shape[0]) 149 | cv2.waitKey(10000) 150 | # 151 | graph.DeallocateGraph() 152 | device.CloseDevice() 153 | -------------------------------------------------------------------------------- /yolo_main.py: -------------------------------------------------------------------------------- 1 | import sys,os 2 | caffe_root = os.environ["CAFFE_ROOT"] 3 | os.chdir(caffe_root) 4 | print caffe_root 5 | sys.path.insert(0, caffe_root + '/python') 6 | import caffe 7 | GPU_ID = 0 # Switch between 0 and 1 depending on the GPU you want to use. 8 | caffe.set_mode_gpu() 9 | caffe.set_device(GPU_ID) 10 | # caffe.set_mode_cpu() 11 | from datetime import datetime 12 | import numpy as np 13 | import sys, getopt 14 | import cv2 15 | 16 | def interpret_output(output, img_width, img_height): 17 | classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train","tvmonitor"] 18 | w_img = img_width 19 | h_img = img_height 20 | print w_img, h_img 21 | threshold = 0.2 22 | iou_threshold = 0.5 23 | num_class = 20 24 | num_box = 2 25 | grid_size = 7 26 | probs = np.zeros((7,7,2,20)) 27 | class_probs = np.reshape(output[0:980],(7,7,20)) 28 | # print class_probs 29 | scales = np.reshape(output[980:1078],(7,7,2)) 30 | # print scales 31 | boxes = np.reshape(output[1078:],(7,7,2,4)) 32 | offset = np.transpose(np.reshape(np.array([np.arange(7)]*14),(2,7,7)),(1,2,0)) 33 | 34 | boxes[:,:,:,0] += offset 35 | boxes[:,:,:,1] += np.transpose(offset,(1,0,2)) 36 | boxes[:,:,:,0:2] = boxes[:,:,:,0:2] / 7.0 37 | boxes[:,:,:,2] = np.multiply(boxes[:,:,:,2],boxes[:,:,:,2]) 38 | boxes[:,:,:,3] = np.multiply(boxes[:,:,:,3],boxes[:,:,:,3]) 39 | 40 | boxes[:,:,:,0] *= w_img 41 | boxes[:,:,:,1] *= h_img 42 | boxes[:,:,:,2] *= w_img 43 | boxes[:,:,:,3] *= h_img 
44 | 45 | for i in range(2): 46 | for j in range(20): 47 | probs[:,:,i,j] = np.multiply(class_probs[:,:,j],scales[:,:,i]) 48 | filter_mat_probs = np.array(probs>=threshold,dtype='bool') 49 | filter_mat_boxes = np.nonzero(filter_mat_probs) 50 | boxes_filtered = boxes[filter_mat_boxes[0],filter_mat_boxes[1],filter_mat_boxes[2]] 51 | probs_filtered = probs[filter_mat_probs] 52 | classes_num_filtered = np.argmax(probs,axis=3)[filter_mat_boxes[0],filter_mat_boxes[1],filter_mat_boxes[2]] 53 | 54 | argsort = np.array(np.argsort(probs_filtered))[::-1] 55 | boxes_filtered = boxes_filtered[argsort] 56 | probs_filtered = probs_filtered[argsort] 57 | classes_num_filtered = classes_num_filtered[argsort] 58 | 59 | for i in range(len(boxes_filtered)): 60 | if probs_filtered[i] == 0 : continue 61 | for j in range(i+1,len(boxes_filtered)): 62 | if iou(boxes_filtered[i],boxes_filtered[j]) > iou_threshold : 63 | probs_filtered[j] = 0.0 64 | 65 | filter_iou = np.array(probs_filtered>0.0,dtype='bool') 66 | boxes_filtered = boxes_filtered[filter_iou] 67 | probs_filtered = probs_filtered[filter_iou] 68 | classes_num_filtered = classes_num_filtered[filter_iou] 69 | 70 | result = [] 71 | for i in range(len(boxes_filtered)): 72 | result.append([classes[classes_num_filtered[i]],boxes_filtered[i][0],boxes_filtered[i][1],boxes_filtered[i][2],boxes_filtered[i][3],probs_filtered[i]]) 73 | 74 | return result 75 | 76 | def iou(box1,box2): 77 | tb = min(box1[0]+0.5*box1[2],box2[0]+0.5*box2[2])-max(box1[0]-0.5*box1[2],box2[0]-0.5*box2[2]) 78 | lr = min(box1[1]+0.5*box1[3],box2[1]+0.5*box2[3])-max(box1[1]-0.5*box1[3],box2[1]-0.5*box2[3]) 79 | if tb < 0 or lr < 0 : intersection = 0 80 | else : intersection = tb*lr 81 | return intersection / (box1[2]*box1[3] + box2[2]*box2[3] - intersection) 82 | 83 | 84 | def show_results(img,results, img_width, img_height): 85 | img_cp = img.copy() 86 | disp_console = True 87 | imshow = True 88 | # if self.filewrite_txt : 89 | # ftxt = open(self.tofile_txt,'w') 90 | 
for i in range(len(results)): 91 | x = int(results[i][1]) 92 | y = int(results[i][2]) 93 | w = int(results[i][3])//2 94 | h = int(results[i][4])//2 95 | if disp_console : print ' class : ' + results[i][0] + ' , [x,y,w,h]=[' + str(x) + ',' + str(y) + ',' + str(int(results[i][3])) + ',' + str(int(results[i][4]))+'], Confidence = ' + str(results[i][5]) 96 | xmin = x-w 97 | xmax = x+w 98 | ymin = y-h 99 | ymax = y+h 100 | if xmin<0: 101 | xmin = 0 102 | if ymin<0: 103 | ymin = 0 104 | if xmax>img_width: 105 | xmax = img_width 106 | if ymax>img_height: 107 | ymax = img_height 108 | if imshow: 109 | cv2.rectangle(img_cp,(xmin,ymin),(xmax,ymax),(0,255,0),2) 110 | print xmin, ymin, xmax, ymax 111 | cv2.rectangle(img_cp,(xmin,ymin-20),(xmax,ymin),(125,125,125),-1) 112 | cv2.putText(img_cp,results[i][0] + ' : %.2f' % results[i][5],(xmin+5,ymin-7),cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,0),1) 113 | if imshow : 114 | cv2.imshow('YOLO detection',img_cp) 115 | cv2.waitKey(1000) 116 | 117 | 118 | 119 | def main(argv): 120 | model_filename = '' 121 | weight_filename = '' 122 | img_filename = '' 123 | try: 124 | opts, args = getopt.getopt(argv, "hm:w:i:") 125 | print opts 126 | except getopt.GetoptError: 127 | print 'yolo_main.py -m -w -i ' 128 | sys.exit(2) 129 | for opt, arg in opts: 130 | if opt == '-h': 131 | print 'yolo_main.py -m -w -i ' 132 | sys.exit() 133 | elif opt == "-m": 134 | model_filename = arg 135 | elif opt == "-w": 136 | weight_filename = arg 137 | elif opt == "-i": 138 | img_filename = arg 139 | print 'model file is "', model_filename 140 | print 'weight file is "', weight_filename 141 | print 'image file is "', img_filename 142 | net = caffe.Net(model_filename, weight_filename, caffe.TEST) 143 | img = caffe.io.load_image(img_filename) # load the image using caffe io 144 | inputs = img 145 | transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape}) 146 | transformer.set_transpose('data', (2,0,1)) 147 | start = datetime.now() 148 | out = 
net.forward_all(data=np.asarray([transformer.preprocess('data', inputs)])) 149 | end = datetime.now() 150 | elapsedTime = end-start 151 | print 'total time is " milliseconds', elapsedTime.total_seconds()*1000 152 | print out.iteritems() 153 | img_cv = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) 154 | results = interpret_output(out['result'][0], img.shape[1], img.shape[0]) # fc27 instead of fc12 for yolo_small 155 | show_results(img_cv,results, img.shape[1], img.shape[0]) 156 | cv2.waitKey(10000) 157 | 158 | 159 | 160 | if __name__=='__main__': 161 | main(sys.argv[1:]) 162 | -------------------------------------------------------------------------------- /py_examples/object_detection_app.py: -------------------------------------------------------------------------------- 1 | import os,time,cv2,argparse,multiprocessing 2 | import numpy as np 3 | from mvnc import mvncapi as mvnc 4 | from skimage.transform import resize 5 | from utils.app_utils import FPS, WebcamVideoStream 6 | from multiprocessing import Queue, Pool 7 | 8 | classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train","tvmonitor"] 9 | dim=(448,448) 10 | threshold = 0.2 11 | iou_threshold = 0.5 12 | num_class = 20 13 | num_box = 2 14 | grid_size = 7 15 | 16 | def show_results(img, results, img_width, img_height): 17 | img_cp = img 18 | disp_console = False 19 | imshow = True 20 | for i in range(len(results)): 21 | x = int(results[i][1]) 22 | y = int(results[i][2]) 23 | w = int(results[i][3])//2 24 | h = int(results[i][4])//2 25 | if disp_console : print (' class : ' + results[i][0] + ' , [x,y,w,h]=[' + str(x) + ',' + str(y) + ',' + str(int(results[i][3])) + ',' + str(int(results[i][4]))+'], Confidence = ' + str(results[i][5]) ) 26 | xmin = x-w 27 | xmax = x+w 28 | ymin = y-h 29 | ymax = y+h 30 | if xmin<0: 31 | xmin = 0 32 | if ymin<0: 33 | ymin = 0 34 | if xmax>img_width: 35 | 
xmax = img_width 36 | if ymax>img_height: 37 | ymax = img_height 38 | if imshow: 39 | cv2.rectangle(img_cp,(xmin,ymin),(xmax,ymax),(0,255,0),2) 40 | #print ((xmin, ymin, xmax, ymax)) 41 | cv2.rectangle(img_cp,(xmin,ymin-20),(xmax,ymin),(125,125,125),-1) 42 | cv2.putText(img_cp,results[i][0] + ' : %.2f' % results[i][5],(xmin+5,ymin-7),cv2.FONT_HERSHEY_SIMPLEX,0.5,(0,0,0),1) 43 | # 44 | cv2.imshow('YOLO detection',img_cp) 45 | 46 | 47 | def interpret_output(output, img_width, img_height): 48 | w_img = img_width 49 | h_img = img_height 50 | probs = np.zeros((7,7,2,20)) 51 | class_probs = (np.reshape(output[0:980],(7,7,20))) 52 | #print(class_probs) 53 | scales = (np.reshape(output[980:1078],(7,7,2))) 54 | #print(scales) 55 | boxes = (np.reshape(output[1078:],(7,7,2,4))) 56 | offset = np.transpose(np.reshape(np.array([np.arange(7)]*14),(2,7,7)),(1,2,0)) 57 | #boxes.setflags(write=1) 58 | boxes[:,:,:,0] += offset 59 | boxes[:,:,:,1] += np.transpose(offset,(1,0,2)) 60 | boxes[:,:,:,0:2] = boxes[:,:,:,0:2] / 7.0 61 | boxes[:,:,:,2] = np.multiply(boxes[:,:,:,2],boxes[:,:,:,2]) 62 | boxes[:,:,:,3] = np.multiply(boxes[:,:,:,3],boxes[:,:,:,3]) 63 | 64 | boxes[:,:,:,0] *= w_img 65 | boxes[:,:,:,1] *= h_img 66 | boxes[:,:,:,2] *= w_img 67 | boxes[:,:,:,3] *= h_img 68 | 69 | for i in range(2): 70 | for j in range(20): 71 | probs[:,:,i,j] = np.multiply(class_probs[:,:,j],scales[:,:,i]) 72 | #print (probs) 73 | filter_mat_probs = np.array(probs>=threshold,dtype='bool') 74 | filter_mat_boxes = np.nonzero(filter_mat_probs) 75 | boxes_filtered = boxes[filter_mat_boxes[0],filter_mat_boxes[1],filter_mat_boxes[2]] 76 | probs_filtered = probs[filter_mat_probs] 77 | classes_num_filtered = np.argmax(probs,axis=3)[filter_mat_boxes[0],filter_mat_boxes[1],filter_mat_boxes[2]] 78 | 79 | argsort = np.array(np.argsort(probs_filtered))[::-1] 80 | boxes_filtered = boxes_filtered[argsort] 81 | probs_filtered = probs_filtered[argsort] 82 | classes_num_filtered = classes_num_filtered[argsort] 83 | 84 
| for i in range(len(boxes_filtered)): 85 | if probs_filtered[i] == 0 : continue 86 | for j in range(i+1,len(boxes_filtered)): 87 | if iou(boxes_filtered[i],boxes_filtered[j]) > iou_threshold : 88 | probs_filtered[j] = 0.0 89 | 90 | filter_iou = np.array(probs_filtered>0.0,dtype='bool') 91 | boxes_filtered = boxes_filtered[filter_iou] 92 | probs_filtered = probs_filtered[filter_iou] 93 | classes_num_filtered = classes_num_filtered[filter_iou] 94 | 95 | result = [] 96 | for i in range(len(boxes_filtered)): 97 | result.append([classes[classes_num_filtered[i]],boxes_filtered[i][0],boxes_filtered[i][1],boxes_filtered[i][2],boxes_filtered[i][3],probs_filtered[i]]) 98 | 99 | return result 100 | 101 | def iou(box1,box2): 102 | tb = min(box1[0]+0.5*box1[2],box2[0]+0.5*box2[2])-max(box1[0]-0.5*box1[2],box2[0]-0.5*box2[2]) 103 | lr = min(box1[1]+0.5*box1[3],box2[1]+0.5*box2[3])-max(box1[1]-0.5*box1[3],box2[1]-0.5*box2[3]) 104 | if tb < 0 or lr < 0 : intersection = 0 105 | else : intersection = tb*lr 106 | return intersection / (box1[2]*box1[3] + box2[2]*box2[3] - intersection) 107 | 108 | def worker(graph, input_q, output_q): 109 | fps = FPS().start() 110 | while True: 111 | fps.update() 112 | frame = input_q.get() 113 | graph.LoadTensor(resize(frame/255.0,dim,1)[:,:,(2,1,0)].astype(np.float16), 'user object') 114 | out, userobj = graph.GetResult() 115 | results = interpret_output(out.astype(np.float32), frame.shape[1], frame.shape[0]) 116 | #print(results) 117 | output_q.put((frame, results, frame.shape[1], frame.shape[0])) 118 | #output_q.put((frame, [], frame.shape[1], frame.shape[0])) 119 | #output_q.put(frame) 120 | # 121 | fps.stop() 122 | 123 | if __name__ == '__main__': 124 | parser = argparse.ArgumentParser() 125 | parser.add_argument('-src', '--source', dest='video_source', type=int, 126 | default=0, help='Device index of the camera.') 127 | parser.add_argument('-wd', '--width', dest='width', type=int, 128 | default=800, help='Width of the frames in the video 
stream.') 129 | parser.add_argument('-ht', '--height', dest='height', type=int, 130 | default=600, help='Height of the frames in the video stream.') 131 | parser.add_argument('-num-w', '--num-workers', dest='num_workers', type=int, 132 | default=2, help='Number of workers.') 133 | parser.add_argument('-q-size', '--queue-size', dest='queue_size', type=int, 134 | default=5, help='Size of the queue.') 135 | args = parser.parse_args() 136 | 137 | logger = multiprocessing.log_to_stderr() 138 | logger.setLevel(multiprocessing.SUBDEBUG) 139 | 140 | input_q = Queue(maxsize=args.queue_size) 141 | output_q = Queue(maxsize=args.queue_size) 142 | # configuration NCS 143 | network_blob = 'graph' 144 | mvnc.SetGlobalOption(mvnc.GlobalOption.LOG_LEVEL, 2) 145 | devices = mvnc.EnumerateDevices() 146 | if len(devices) == 0: 147 | print('No devices found') 148 | quit() 149 | device = mvnc.Device(devices[0]) 150 | device.OpenDevice() 151 | opt = device.GetDeviceOption(mvnc.DeviceOption.OPTIMISATION_LIST) 152 | # load blob 153 | with open(network_blob, mode='rb') as f: 154 | blob = f.read() 155 | graph = device.AllocateGraph(blob) 156 | graph.SetGraphOption(mvnc.GraphOption.ITERATIONS, 1) 157 | iterations = graph.GetGraphOption(mvnc.GraphOption.ITERATIONS) 158 | # 159 | pool = Pool(args.num_workers, worker, (graph, input_q, output_q)) 160 | # 161 | video_capture = WebcamVideoStream(src=args.video_source, 162 | width=args.width, 163 | height=args.height).start() 164 | fps = FPS().start() 165 | # 166 | while True: # fps._numFrames < 120 167 | frame = video_capture.read() 168 | input_q.put(frame) 169 | t = time.time() 170 | (img, results, img_width, img_height) = output_q.get() 171 | show_results(img, results, img_width, img_height) 172 | #cv2.imshow('Video', output_q.get()) 173 | #cv2.imshow('Video', output_q.get()) 174 | fps.update() 175 | print('[INFO] elapsed time: {:.2f}'.format(time.time() - t)) 176 | if cv2.waitKey(1) & 0xFF == ord('q'): 177 | break 178 | 179 | fps.stop() 180 | 
class FPS:
    """Count frames and measure the wall-clock interval they span."""

    def __init__(self):
        # Start time, end time, and number of frames seen in between.
        self._start = None
        self._end = None
        self._numFrames = 0

    def start(self):
        """Start (or restart) the timer and return ``self`` for chaining."""
        self._start = datetime.datetime.now()
        # Drop any end time from an earlier run so a restarted timer does
        # not report a negative interval.
        self._end = None
        return self

    def stop(self):
        """Record the end of the measured interval."""
        self._end = datetime.datetime.now()

    def update(self):
        """Count one more frame."""
        self._numFrames += 1

    def elapsed(self):
        """Return the measured interval in seconds.

        Returns 0.0 if the timer was never started. If it was started but
        not yet stopped, the time elapsed so far is returned instead of
        failing on the missing end time.
        """
        if self._start is None:
            return 0.0
        end = self._end if self._end is not None else datetime.datetime.now()
        return (end - self._start).total_seconds()

    def fps(self):
        """Return the approximate frames per second (0.0 for an empty interval)."""
        seconds = self.elapsed()
        if seconds <= 0:
            return 0.0
        return self._numFrames / seconds


class WebcamVideoStream:
    """Read frames from a ``cv2.VideoCapture`` on a background thread.

    ``read()`` returns whatever frame the reader thread stored last, so the
    caller does not wait on the capture device.
    """

    def __init__(self, src, width, height):
        # Open the capture, request the frame size, and read a first frame
        # so that read() has something to return before the thread starts.
        self.stream = cv2.VideoCapture(src)
        self.stream.set(cv2.CAP_PROP_FRAME_WIDTH, width)
        self.stream.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
        (self.grabbed, self.frame) = self.stream.read()

        # Flag polled by the reader thread; set by stop().
        self.stopped = False

    def start(self):
        """Start the reader thread and return ``self`` for chaining."""
        Thread(target=self.update, args=()).start()
        return self

    def update(self):
        """Reader-thread body: keep grabbing frames until stop() is called."""
        while not self.stopped:
            (self.grabbed, self.frame) = self.stream.read()
        # Release here, on the thread that uses the capture, so the device
        # is never closed while a read is in progress.
        self.stream.release()

    def read(self):
        """Return the most recently stored frame."""
        return self.frame

    def stop(self):
        """Ask the reader thread to finish; it releases the capture on exit."""
        self.stopped = True
def color_name_to_rgb():
    """Return a dict mapping each name in ``colors.cnames`` to an RGB tuple.

    Every value of ``colors.cnames`` is treated as a ``#rrggbb`` hex string
    and unpacked into three unsigned bytes.
    """
    return {
        name: struct.unpack('BBB', bytes.fromhex(hex_code.replace('#', '')))
        for name, hex_code in colors.cnames.items()
    }


def draw_boxes_and_labels(
        boxes,
        classes,
        scores,
        category_index,
        instance_masks=None,
        keypoints=None,
        max_boxes_to_draw=20,
        min_score_thresh=.5,
        agnostic_mode=False):
    """Returns boxes coordinates, class names and colors

    Args:
      boxes: a numpy array of shape [N, 4]
      classes: a numpy array of shape [N]
      scores: a numpy array of shape [N] or None.  If scores=None, then
        this function assumes that the boxes to be plotted are groundtruth
        boxes and plot all boxes as black with no classes or scores.
      category_index: a dict containing category dictionaries (each holding
        category index `id` and category name `name`) keyed by category indices.
      instance_masks: accepted for interface compatibility; this function
        does not use it.
      keypoints: accepted for interface compatibility; this function does
        not use it.
      max_boxes_to_draw: maximum number of boxes to visualize.  If None, draw
        all boxes.
      min_score_thresh: minimum score threshold for a box to be visualized
      agnostic_mode: boolean (default: False) controlling whether to evaluate in
        class-agnostic mode or not.  This mode will display scores but ignore
        classes.

    Returns:
      A tuple ``(rect_points, class_names, class_colors)`` of three parallel
      lists: dicts with keys ``ymin``, ``xmin``, ``ymax``, ``xmax``; lists of
      display strings for each box; and RGB tuples.
    """
    # Display strings and colour per box; boxes with identical coordinates
    # share one entry.
    box_to_display_str_map = collections.defaultdict(list)
    box_to_color_map = collections.defaultdict(str)
    # Build the palette once instead of twice per drawn box.
    palette = standard_colors()

    if not max_boxes_to_draw:
        max_boxes_to_draw = boxes.shape[0]
    for i in range(min(max_boxes_to_draw, boxes.shape[0])):
        if scores is not None and not scores[i] > min_score_thresh:
            continue
        box = tuple(boxes[i].tolist())
        if scores is None:
            # Ground-truth mode: no label, fixed colour.
            box_to_color_map[box] = 'black'
            continue

        if agnostic_mode:
            display_str = 'score: {}%'.format(int(100 * scores[i]))
        else:
            if classes[i] in category_index:
                class_name = category_index[classes[i]]['name']
            else:
                class_name = 'N/A'
            display_str = '{}: {}%'.format(class_name, int(100 * scores[i]))
        box_to_display_str_map[box].append(display_str)

        if agnostic_mode:
            box_to_color_map[box] = 'DarkOrange'
        else:
            box_to_color_map[box] = palette[classes[i] % len(palette)]

    # Store all the coordinates of the boxes, class names and colors.
    color_rgb = color_name_to_rgb()
    rect_points = []
    class_names = []
    class_colors = []
    for box, color in box_to_color_map.items():
        ymin, xmin, ymax, xmax = box
        rect_points.append(dict(ymin=ymin, xmin=xmin, ymax=ymax, xmax=xmax))
        class_names.append(box_to_display_str_map[box])
        # Palette names are CamelCase; the RGB table is looked up by the
        # lower-cased name.
        class_colors.append(color_rgb[color.lower()])
    return rect_points, class_names, class_colors
convolution_param { 105 | num_output: 256 106 | kernel_size: 3 107 | pad: 1 108 | stride: 1 109 | } 110 | } 111 | layer { 112 | name: "relu4" 113 | type: "ReLU" 114 | bottom: "conv4" 115 | top: "conv4" 116 | relu_param{ 117 | negative_slope: 0.1 118 | } 119 | } 120 | 121 | layer{ 122 | name: "conv5" 123 | type: "Convolution" 124 | bottom: "conv4" 125 | top: "conv5" 126 | convolution_param { 127 | num_output: 256 128 | kernel_size: 1 129 | pad: 0 130 | stride: 1 131 | } 132 | } 133 | layer { 134 | name: "relu5" 135 | type: "ReLU" 136 | bottom: "conv5" 137 | top: "conv5" 138 | relu_param{ 139 | negative_slope: 0.1 140 | } 141 | } 142 | 143 | layer{ 144 | name: "conv6" 145 | type: "Convolution" 146 | bottom: "conv5" 147 | top: "conv6" 148 | convolution_param { 149 | num_output: 512 150 | kernel_size: 3 151 | pad: 1 152 | stride: 1 153 | } 154 | } 155 | layer { 156 | name: "relu6" 157 | type: "ReLU" 158 | bottom: "conv6" 159 | top: "conv6" 160 | relu_param{ 161 | negative_slope: 0.1 162 | } 163 | } 164 | layer { 165 | name: "pool6" 166 | type: "Pooling" 167 | bottom: "conv6" 168 | top: "pool6" 169 | pooling_param { 170 | pool: MAX 171 | kernel_size: 2 172 | stride: 2 173 | } 174 | } 175 | 176 | layer{ 177 | name: "conv7" 178 | type: "Convolution" 179 | bottom: "pool6" 180 | top: "conv7" 181 | convolution_param { 182 | num_output: 256 183 | kernel_size: 1 184 | pad: 0 185 | stride: 1 186 | } 187 | } 188 | layer { 189 | name: "relu7" 190 | type: "ReLU" 191 | bottom: "conv7" 192 | top: "conv7" 193 | relu_param{ 194 | negative_slope: 0.1 195 | } 196 | } 197 | 198 | layer{ 199 | name: "conv8" 200 | type: "Convolution" 201 | bottom: "conv7" 202 | top: "conv8" 203 | convolution_param { 204 | num_output: 512 205 | kernel_size: 3 206 | pad: 1 207 | stride: 1 208 | } 209 | } 210 | layer { 211 | name: "relu8" 212 | type: "ReLU" 213 | bottom: "conv8" 214 | top: "conv8" 215 | relu_param{ 216 | negative_slope: 0.1 217 | } 218 | } 219 | 220 | layer{ 221 | name: "conv9" 222 | type: 
"Convolution" 223 | bottom: "conv8" 224 | top: "conv9" 225 | convolution_param { 226 | num_output: 256 227 | kernel_size: 1 228 | pad: 0 229 | stride: 1 230 | } 231 | } 232 | layer { 233 | name: "relu9" 234 | type: "ReLU" 235 | bottom: "conv9" 236 | top: "conv9" 237 | relu_param{ 238 | negative_slope: 0.1 239 | } 240 | } 241 | 242 | layer{ 243 | name: "conv10" 244 | type: "Convolution" 245 | bottom: "conv9" 246 | top: "conv10" 247 | convolution_param { 248 | num_output: 512 249 | kernel_size: 3 250 | pad: 1 251 | stride: 1 252 | } 253 | } 254 | layer { 255 | name: "relu10" 256 | type: "ReLU" 257 | bottom: "conv10" 258 | top: "conv10" 259 | relu_param{ 260 | negative_slope: 0.1 261 | } 262 | } 263 | 264 | layer{ 265 | name: "conv11" 266 | type: "Convolution" 267 | bottom: "conv10" 268 | top: "conv11" 269 | convolution_param { 270 | num_output: 256 271 | kernel_size: 1 272 | pad: 0 273 | stride: 1 274 | } 275 | } 276 | layer { 277 | name: "relu11" 278 | type: "ReLU" 279 | bottom: "conv11" 280 | top: "conv11" 281 | relu_param{ 282 | negative_slope: 0.1 283 | } 284 | } 285 | 286 | 287 | layer{ 288 | name: "conv12" 289 | type: "Convolution" 290 | bottom: "conv11" 291 | top: "conv12" 292 | convolution_param { 293 | num_output: 512 294 | kernel_size: 3 295 | pad: 1 296 | stride: 1 297 | } 298 | } 299 | layer { 300 | name: "relu12" 301 | type: "ReLU" 302 | bottom: "conv12" 303 | top: "conv12" 304 | relu_param{ 305 | negative_slope: 0.1 306 | } 307 | } 308 | 309 | 310 | layer{ 311 | name: "conv13" 312 | type: "Convolution" 313 | bottom: "conv12" 314 | top: "conv13" 315 | convolution_param { 316 | num_output: 256 317 | kernel_size: 1 318 | pad: 0 319 | stride: 1 320 | } 321 | } 322 | layer { 323 | name: "relu13" 324 | type: "ReLU" 325 | bottom: "conv13" 326 | top: "conv13" 327 | relu_param{ 328 | negative_slope: 0.1 329 | } 330 | } 331 | 332 | layer{ 333 | name: "conv14" 334 | type: "Convolution" 335 | bottom: "conv13" 336 | top: "conv14" 337 | convolution_param { 338 | 
num_output: 512 339 | kernel_size: 3 340 | pad: 1 341 | stride: 1 342 | } 343 | } 344 | layer { 345 | name: "relu14" 346 | type: "ReLU" 347 | bottom: "conv14" 348 | top: "conv14" 349 | relu_param{ 350 | negative_slope: 0.1 351 | } 352 | } 353 | 354 | layer{ 355 | name: "conv15" 356 | type: "Convolution" 357 | bottom: "conv14" 358 | top: "conv15" 359 | convolution_param { 360 | num_output: 512 361 | kernel_size: 1 362 | pad: 0 363 | stride: 1 364 | } 365 | } 366 | layer { 367 | name: "relu15" 368 | type: "ReLU" 369 | bottom: "conv15" 370 | top: "conv15" 371 | relu_param{ 372 | negative_slope: 0.1 373 | } 374 | } 375 | 376 | 377 | layer{ 378 | name: "conv16" 379 | type: "Convolution" 380 | bottom: "conv15" 381 | top: "conv16" 382 | convolution_param { 383 | num_output: 1024 384 | kernel_size: 3 385 | pad: 1 386 | stride: 1 387 | } 388 | } 389 | layer { 390 | name: "relu16" 391 | type: "ReLU" 392 | bottom: "conv16" 393 | top: "conv16" 394 | relu_param{ 395 | negative_slope: 0.1 396 | } 397 | } 398 | 399 | layer { 400 | name: "pool16" 401 | type: "Pooling" 402 | bottom: "conv16" 403 | top: "pool16" 404 | pooling_param { 405 | pool: MAX 406 | kernel_size: 2 407 | stride: 2 408 | } 409 | } 410 | 411 | 412 | layer{ 413 | name: "conv17" 414 | type: "Convolution" 415 | bottom: "pool16" 416 | top: "conv17" 417 | convolution_param { 418 | num_output: 512 419 | kernel_size: 1 420 | pad: 0 421 | stride: 1 422 | } 423 | } 424 | layer { 425 | name: "relu17" 426 | type: "ReLU" 427 | bottom: "conv17" 428 | top: "conv17" 429 | relu_param{ 430 | negative_slope: 0.1 431 | } 432 | } 433 | 434 | 435 | layer{ 436 | name: "conv18" 437 | type: "Convolution" 438 | bottom: "conv17" 439 | top: "conv18" 440 | convolution_param { 441 | num_output: 1024 442 | kernel_size: 3 443 | pad: 1 444 | stride: 1 445 | } 446 | } 447 | layer { 448 | name: "relu18" 449 | type: "ReLU" 450 | bottom: "conv18" 451 | top: "conv18" 452 | relu_param{ 453 | negative_slope: 0.1 454 | } 455 | } 456 | 457 | 458 | 459 | 
layer{ 460 | name: "conv19" 461 | type: "Convolution" 462 | bottom: "conv18" 463 | top: "conv19" 464 | convolution_param { 465 | num_output: 512 466 | kernel_size: 1 467 | pad: 0 468 | stride: 1 469 | } 470 | } 471 | layer { 472 | name: "relu19" 473 | type: "ReLU" 474 | bottom: "conv19" 475 | top: "conv19" 476 | relu_param{ 477 | negative_slope: 0.1 478 | } 479 | } 480 | 481 | 482 | 483 | layer{ 484 | name: "conv20" 485 | type: "Convolution" 486 | bottom: "conv19" 487 | top: "conv20" 488 | convolution_param { 489 | num_output: 1024 490 | kernel_size: 3 491 | pad: 1 492 | stride: 1 493 | } 494 | } 495 | layer { 496 | name: "relu20" 497 | type: "ReLU" 498 | bottom: "conv20" 499 | top: "conv20" 500 | relu_param{ 501 | negative_slope: 0.1 502 | } 503 | } 504 | 505 | 506 | 507 | layer{ 508 | name: "conv21" 509 | type: "Convolution" 510 | bottom: "conv20" 511 | top: "conv21" 512 | convolution_param { 513 | num_output: 1024 514 | kernel_size: 3 515 | pad: 1 516 | stride: 1 517 | } 518 | } 519 | layer { 520 | name: "relu21" 521 | type: "ReLU" 522 | bottom: "conv21" 523 | top: "conv21" 524 | relu_param{ 525 | negative_slope: 0.1 526 | } 527 | } 528 | 529 | 530 | layer{ 531 | name: "conv22" 532 | type: "Convolution" 533 | bottom: "conv21" 534 | top: "conv22" 535 | convolution_param { 536 | num_output: 1024 537 | kernel_size: 3 538 | pad: 1 539 | stride: 2 540 | } 541 | } 542 | layer { 543 | name: "relu22" 544 | type: "ReLU" 545 | bottom: "conv22" 546 | top: "conv22" 547 | relu_param{ 548 | negative_slope: 0.1 549 | } 550 | } 551 | 552 | 553 | 554 | layer{ 555 | name: "conv23" 556 | type: "Convolution" 557 | bottom: "conv22" 558 | top: "conv23" 559 | convolution_param { 560 | num_output: 1024 561 | kernel_size: 3 562 | pad: 1 563 | stride: 1 564 | } 565 | } 566 | layer { 567 | name: "relu23" 568 | type: "ReLU" 569 | bottom: "conv23" 570 | top: "conv23" 571 | relu_param{ 572 | negative_slope: 0.1 573 | } 574 | } 575 | 576 | 577 | layer{ 578 | name: "conv24" 579 | type: 
"Convolution" 580 | bottom: "conv23" 581 | top: "conv24" 582 | convolution_param { 583 | num_output: 1024 584 | kernel_size: 3 585 | pad: 1 586 | stride: 1 587 | } 588 | } 589 | layer { 590 | name: "relu24" 591 | type: "ReLU" 592 | bottom: "conv24" 593 | top: "conv24" 594 | relu_param{ 595 | negative_slope: 0.1 596 | } 597 | } 598 | 599 | 600 | 601 | 602 | layer{ 603 | name: "fc25" 604 | type: "InnerProduct" 605 | bottom: "conv24" 606 | top: "fc25" 607 | inner_product_param { 608 | num_output: 4096 609 | } 610 | } 611 | layer { 612 | name: "relu25" 613 | type: "ReLU" 614 | bottom: "fc25" 615 | top: "fc25" 616 | relu_param{ 617 | negative_slope: 0.1 618 | } 619 | } 620 | 621 | 622 | layer{ 623 | name: "fc26" 624 | type: "InnerProduct" 625 | bottom: "fc25" 626 | top: "fc26" 627 | inner_product_param { 628 | num_output: 1470 629 | } 630 | } 631 | -------------------------------------------------------------------------------- /prototxt/yolo_small_deploy.prototxt: -------------------------------------------------------------------------------- 1 | name: "YOLONet" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 448 7 | dim: 448 8 | } 9 | 10 | layer { 11 | name: "conv1" 12 | type: "Convolution" 13 | bottom: "data" 14 | top: "conv1" 15 | convolution_param { 16 | num_output: 64 17 | kernel_size: 7 18 | pad: 3 19 | stride: 2 20 | } 21 | } 22 | layer { 23 | name: "relu1" 24 | type: "ReLU" 25 | bottom: "conv1" 26 | top: "conv1" 27 | relu_param{ 28 | negative_slope: 0.1 29 | } 30 | } 31 | layer { 32 | name: "pool1" 33 | type: "Pooling" 34 | bottom: "conv1" 35 | top: "pool1" 36 | pooling_param { 37 | pool: MAX 38 | kernel_size: 2 39 | stride: 2 40 | } 41 | } 42 | 43 | layer{ 44 | name: "conv2" 45 | type: "Convolution" 46 | bottom: "pool1" 47 | top: "conv2" 48 | convolution_param { 49 | num_output: 192 50 | kernel_size: 3 51 | pad: 1 52 | stride: 1 53 | } 54 | } 55 | layer { 56 | name: "relu2" 57 | type: "ReLU" 58 | bottom: "conv2" 59 | top: "conv2" 60 | 
relu_param{ 61 | negative_slope: 0.1 62 | } 63 | } 64 | layer { 65 | name: "pool2" 66 | type: "Pooling" 67 | bottom: "conv2" 68 | top: "pool2" 69 | pooling_param { 70 | pool: MAX 71 | kernel_size: 2 72 | stride: 2 73 | } 74 | } 75 | 76 | layer{ 77 | name: "conv3" 78 | type: "Convolution" 79 | bottom: "pool2" 80 | top: "conv3" 81 | convolution_param { 82 | num_output: 128 83 | kernel_size: 1 84 | pad: 0 85 | stride: 1 86 | } 87 | } 88 | layer { 89 | name: "relu3" 90 | type: "ReLU" 91 | bottom: "conv3" 92 | top: "conv3" 93 | relu_param{ 94 | negative_slope: 0.1 95 | } 96 | } 97 | 98 | 99 | layer{ 100 | name: "conv4" 101 | type: "Convolution" 102 | bottom: "conv3" 103 | top: "conv4" 104 | convolution_param { 105 | num_output: 256 106 | kernel_size: 3 107 | pad: 1 108 | stride: 1 109 | } 110 | } 111 | layer { 112 | name: "relu4" 113 | type: "ReLU" 114 | bottom: "conv4" 115 | top: "conv4" 116 | relu_param{ 117 | negative_slope: 0.1 118 | } 119 | } 120 | 121 | layer{ 122 | name: "conv5" 123 | type: "Convolution" 124 | bottom: "conv4" 125 | top: "conv5" 126 | convolution_param { 127 | num_output: 256 128 | kernel_size: 1 129 | pad: 0 130 | stride: 1 131 | } 132 | } 133 | layer { 134 | name: "relu5" 135 | type: "ReLU" 136 | bottom: "conv5" 137 | top: "conv5" 138 | relu_param{ 139 | negative_slope: 0.1 140 | } 141 | } 142 | 143 | layer{ 144 | name: "conv6" 145 | type: "Convolution" 146 | bottom: "conv5" 147 | top: "conv6" 148 | convolution_param { 149 | num_output: 512 150 | kernel_size: 3 151 | pad: 1 152 | stride: 1 153 | } 154 | } 155 | layer { 156 | name: "relu6" 157 | type: "ReLU" 158 | bottom: "conv6" 159 | top: "conv6" 160 | relu_param{ 161 | negative_slope: 0.1 162 | } 163 | } 164 | layer { 165 | name: "pool6" 166 | type: "Pooling" 167 | bottom: "conv6" 168 | top: "pool6" 169 | pooling_param { 170 | pool: MAX 171 | kernel_size: 2 172 | stride: 2 173 | } 174 | } 175 | 176 | layer{ 177 | name: "conv7" 178 | type: "Convolution" 179 | bottom: "pool6" 180 | top: "conv7" 
181 | convolution_param { 182 | num_output: 256 183 | kernel_size: 1 184 | pad: 0 185 | stride: 1 186 | } 187 | } 188 | layer { 189 | name: "relu7" 190 | type: "ReLU" 191 | bottom: "conv7" 192 | top: "conv7" 193 | relu_param{ 194 | negative_slope: 0.1 195 | } 196 | } 197 | 198 | layer{ 199 | name: "conv8" 200 | type: "Convolution" 201 | bottom: "conv7" 202 | top: "conv8" 203 | convolution_param { 204 | num_output: 512 205 | kernel_size: 3 206 | pad: 1 207 | stride: 1 208 | } 209 | } 210 | layer { 211 | name: "relu8" 212 | type: "ReLU" 213 | bottom: "conv8" 214 | top: "conv8" 215 | relu_param{ 216 | negative_slope: 0.1 217 | } 218 | } 219 | 220 | layer{ 221 | name: "conv9" 222 | type: "Convolution" 223 | bottom: "conv8" 224 | top: "conv9" 225 | convolution_param { 226 | num_output: 256 227 | kernel_size: 1 228 | pad: 0 229 | stride: 1 230 | } 231 | } 232 | layer { 233 | name: "relu9" 234 | type: "ReLU" 235 | bottom: "conv9" 236 | top: "conv9" 237 | relu_param{ 238 | negative_slope: 0.1 239 | } 240 | } 241 | 242 | layer{ 243 | name: "conv10" 244 | type: "Convolution" 245 | bottom: "conv9" 246 | top: "conv10" 247 | convolution_param { 248 | num_output: 512 249 | kernel_size: 3 250 | pad: 1 251 | stride: 1 252 | } 253 | } 254 | layer { 255 | name: "relu10" 256 | type: "ReLU" 257 | bottom: "conv10" 258 | top: "conv10" 259 | relu_param{ 260 | negative_slope: 0.1 261 | } 262 | } 263 | 264 | layer{ 265 | name: "conv11" 266 | type: "Convolution" 267 | bottom: "conv10" 268 | top: "conv11" 269 | convolution_param { 270 | num_output: 256 271 | kernel_size: 1 272 | pad: 0 273 | stride: 1 274 | } 275 | } 276 | layer { 277 | name: "relu11" 278 | type: "ReLU" 279 | bottom: "conv11" 280 | top: "conv11" 281 | relu_param{ 282 | negative_slope: 0.1 283 | } 284 | } 285 | 286 | 287 | layer{ 288 | name: "conv12" 289 | type: "Convolution" 290 | bottom: "conv11" 291 | top: "conv12" 292 | convolution_param { 293 | num_output: 512 294 | kernel_size: 3 295 | pad: 1 296 | stride: 1 297 | } 298 
| } 299 | layer { 300 | name: "relu12" 301 | type: "ReLU" 302 | bottom: "conv12" 303 | top: "conv12" 304 | relu_param{ 305 | negative_slope: 0.1 306 | } 307 | } 308 | 309 | 310 | layer{ 311 | name: "conv13" 312 | type: "Convolution" 313 | bottom: "conv12" 314 | top: "conv13" 315 | convolution_param { 316 | num_output: 256 317 | kernel_size: 1 318 | pad: 0 319 | stride: 1 320 | } 321 | } 322 | layer { 323 | name: "relu13" 324 | type: "ReLU" 325 | bottom: "conv13" 326 | top: "conv13" 327 | relu_param{ 328 | negative_slope: 0.1 329 | } 330 | } 331 | 332 | layer{ 333 | name: "conv14" 334 | type: "Convolution" 335 | bottom: "conv13" 336 | top: "conv14" 337 | convolution_param { 338 | num_output: 512 339 | kernel_size: 3 340 | pad: 1 341 | stride: 1 342 | } 343 | } 344 | layer { 345 | name: "relu14" 346 | type: "ReLU" 347 | bottom: "conv14" 348 | top: "conv14" 349 | relu_param{ 350 | negative_slope: 0.1 351 | } 352 | } 353 | 354 | layer{ 355 | name: "conv15" 356 | type: "Convolution" 357 | bottom: "conv14" 358 | top: "conv15" 359 | convolution_param { 360 | num_output: 512 361 | kernel_size: 1 362 | pad: 0 363 | stride: 1 364 | } 365 | } 366 | layer { 367 | name: "relu15" 368 | type: "ReLU" 369 | bottom: "conv15" 370 | top: "conv15" 371 | relu_param{ 372 | negative_slope: 0.1 373 | } 374 | } 375 | 376 | 377 | layer{ 378 | name: "conv16" 379 | type: "Convolution" 380 | bottom: "conv15" 381 | top: "conv16" 382 | convolution_param { 383 | num_output: 1024 384 | kernel_size: 3 385 | pad: 1 386 | stride: 1 387 | } 388 | } 389 | layer { 390 | name: "relu16" 391 | type: "ReLU" 392 | bottom: "conv16" 393 | top: "conv16" 394 | relu_param{ 395 | negative_slope: 0.1 396 | } 397 | } 398 | 399 | layer { 400 | name: "pool16" 401 | type: "Pooling" 402 | bottom: "conv16" 403 | top: "pool16" 404 | pooling_param { 405 | pool: MAX 406 | kernel_size: 2 407 | stride: 2 408 | } 409 | } 410 | 411 | 412 | layer{ 413 | name: "conv17" 414 | type: "Convolution" 415 | bottom: "pool16" 416 | top: 
"conv17" 417 | convolution_param { 418 | num_output: 512 419 | kernel_size: 1 420 | pad: 0 421 | stride: 1 422 | } 423 | } 424 | layer { 425 | name: "relu17" 426 | type: "ReLU" 427 | bottom: "conv17" 428 | top: "conv17" 429 | relu_param{ 430 | negative_slope: 0.1 431 | } 432 | } 433 | 434 | 435 | layer{ 436 | name: "conv18" 437 | type: "Convolution" 438 | bottom: "conv17" 439 | top: "conv18" 440 | convolution_param { 441 | num_output: 1024 442 | kernel_size: 3 443 | pad: 1 444 | stride: 1 445 | } 446 | } 447 | layer { 448 | name: "relu18" 449 | type: "ReLU" 450 | bottom: "conv18" 451 | top: "conv18" 452 | relu_param{ 453 | negative_slope: 0.1 454 | } 455 | } 456 | 457 | 458 | 459 | layer{ 460 | name: "conv19" 461 | type: "Convolution" 462 | bottom: "conv18" 463 | top: "conv19" 464 | convolution_param { 465 | num_output: 512 466 | kernel_size: 1 467 | pad: 0 468 | stride: 1 469 | } 470 | } 471 | layer { 472 | name: "relu19" 473 | type: "ReLU" 474 | bottom: "conv19" 475 | top: "conv19" 476 | relu_param{ 477 | negative_slope: 0.1 478 | } 479 | } 480 | 481 | 482 | 483 | layer{ 484 | name: "conv20" 485 | type: "Convolution" 486 | bottom: "conv19" 487 | top: "conv20" 488 | convolution_param { 489 | num_output: 1024 490 | kernel_size: 3 491 | pad: 1 492 | stride: 1 493 | } 494 | } 495 | layer { 496 | name: "relu20" 497 | type: "ReLU" 498 | bottom: "conv20" 499 | top: "conv20" 500 | relu_param{ 501 | negative_slope: 0.1 502 | } 503 | } 504 | 505 | 506 | 507 | layer{ 508 | name: "conv21" 509 | type: "Convolution" 510 | bottom: "conv20" 511 | top: "conv21" 512 | convolution_param { 513 | num_output: 1024 514 | kernel_size: 3 515 | pad: 1 516 | stride: 1 517 | } 518 | } 519 | layer { 520 | name: "relu21" 521 | type: "ReLU" 522 | bottom: "conv21" 523 | top: "conv21" 524 | relu_param{ 525 | negative_slope: 0.1 526 | } 527 | } 528 | 529 | 530 | layer{ 531 | name: "conv22" 532 | type: "Convolution" 533 | bottom: "conv21" 534 | top: "conv22" 535 | convolution_param { 536 | 
num_output: 1024 537 | kernel_size: 3 538 | pad: 1 539 | stride: 2 540 | } 541 | } 542 | layer { 543 | name: "relu22" 544 | type: "ReLU" 545 | bottom: "conv22" 546 | top: "conv22" 547 | relu_param{ 548 | negative_slope: 0.1 549 | } 550 | } 551 | 552 | 553 | 554 | layer{ 555 | name: "conv23" 556 | type: "Convolution" 557 | bottom: "conv22" 558 | top: "conv23" 559 | convolution_param { 560 | num_output: 1024 561 | kernel_size: 3 562 | pad: 1 563 | stride: 1 564 | } 565 | } 566 | layer { 567 | name: "relu23" 568 | type: "ReLU" 569 | bottom: "conv23" 570 | top: "conv23" 571 | relu_param{ 572 | negative_slope: 0.1 573 | } 574 | } 575 | 576 | 577 | layer{ 578 | name: "conv24" 579 | type: "Convolution" 580 | bottom: "conv23" 581 | top: "conv24" 582 | convolution_param { 583 | num_output: 1024 584 | kernel_size: 3 585 | pad: 1 586 | stride: 1 587 | } 588 | } 589 | layer { 590 | name: "relu24" 591 | type: "ReLU" 592 | bottom: "conv24" 593 | top: "conv24" 594 | relu_param{ 595 | negative_slope: 0.1 596 | } 597 | } 598 | 599 | 600 | 601 | 602 | layer{ 603 | name: "fc25" 604 | type: "InnerProduct" 605 | bottom: "conv24" 606 | top: "fc25" 607 | inner_product_param { 608 | num_output: 512 609 | } 610 | } 611 | layer { 612 | name: "relu25" 613 | type: "ReLU" 614 | bottom: "fc25" 615 | top: "fc25" 616 | relu_param{ 617 | negative_slope: 0.1 618 | } 619 | } 620 | 621 | 622 | layer{ 623 | name: "fc26" 624 | type: "InnerProduct" 625 | bottom: "fc25" 626 | top: "fc26" 627 | inner_product_param { 628 | num_output: 4096 629 | } 630 | } 631 | layer { 632 | name: "relu26" 633 | type: "ReLU" 634 | bottom: "fc26" 635 | top: "fc26" 636 | relu_param{ 637 | negative_slope: 0.1 638 | } 639 | } 640 | 641 | 642 | layer{ 643 | name: "fc27" 644 | type: "InnerProduct" 645 | bottom: "fc26" 646 | top: "fc27" 647 | inner_product_param { 648 | num_output: 1470 649 | } 650 | } 651 | -------------------------------------------------------------------------------- /create_yolo_prototxt.py: 
def _kernel_opts(ksize, stride, pad):
    """Return the option lines for whichever of kernel/stride/pad are set."""
    opts = ['']
    if ksize is not None:
        opts.append('kernel_size: {}'.format(ksize))
    if stride is not None:
        opts.append('stride: {}'.format(stride))
    if pad is not None:
        opts.append('pad: {}'.format(pad))
    return opts


class CaffeLayerGenerator(object):
    """Base class for one ``layer { ... }`` entry of a Caffe prototxt file.

    Callers fill ``bottom`` and ``top`` with blob names after construction;
    subclasses implement ``write(f)`` to emit the layer text to ``f``.
    """

    def __init__(self, name, ltype):
        self.name = name
        self.bottom = []
        self.top = []
        self.type = ltype

    def get_template(self):
        """Return the layer text with one ``{}`` slot left for the parameters.

        The quadruple braces survive this ``format`` call as double braces,
        so the result can be formatted a second time with the parameter
        block. Uses only the first entry of ``bottom`` and ``top``.
        """
        return """
layer {{{{
  name: "{}"
  type: "{}"
  bottom: "{}"
  top: "{}"{{}}
}}}}""".format(self.name, self.type, self.bottom[0], self.top[0])


class CaffeInputLayer(CaffeLayerGenerator):
    """Network input declaration (``input`` / ``input_shape``)."""

    def __init__(self, name, channels, width, height):
        super(CaffeInputLayer, self).__init__(name, 'Input')
        self.channels = channels
        self.width = width
        self.height = height

    def write(self, f):
        # Caffe blobs are ordered N, C, H, W, so height is written before
        # width (the two were previously swapped).
        f.write("""
input: "{}"
input_shape {{
  dim: 1
  dim: {}
  dim: {}
  dim: {}
}}""".format(self.name, self.channels, self.height, self.width))


class CaffeConvolutionLayer(CaffeLayerGenerator):
    """Convolution layer; options left as None are omitted from the output."""

    def __init__(self, name, filters, ksize=None, stride=None, pad=None, bias=True):
        super(CaffeConvolutionLayer, self).__init__(name, 'Convolution')
        self.filters = filters
        self.ksize = ksize
        self.stride = stride
        self.pad = pad
        self.bias = bias

    def write(self, f):
        opts = _kernel_opts(self.ksize, self.stride, self.pad)
        if not self.bias:
            opts.append('bias_term: false')
        param_str = """
  convolution_param {{
    num_output: {}{}
  }}""".format(self.filters, '\n    '.join(opts))
        f.write(self.get_template().format(param_str))


class CaffePoolingLayer(CaffeLayerGenerator):
    """Pooling layer; ``pooltype`` is written verbatim as the ``pool`` value."""

    def __init__(self, name, pooltype, ksize=None, stride=None, pad=None, global_pooling=None):
        super(CaffePoolingLayer, self).__init__(name, 'Pooling')
        self.pooltype = pooltype
        self.ksize = ksize
        self.stride = stride
        self.pad = pad
        self.global_pooling = global_pooling

    def write(self, f):
        opts = _kernel_opts(self.ksize, self.stride, self.pad)
        if self.global_pooling is not None:
            opts.append('global_pooling: {}'.format('True' if self.global_pooling else 'False'))
        param_str = """
  pooling_param {{
    pool: {}{}
  }}""".format(self.pooltype, '\n    '.join(opts))
        f.write(self.get_template().format(param_str))


class CaffeInnerProductLayer(CaffeLayerGenerator):
    """Fully connected layer with ``num_output`` outputs."""

    def __init__(self, name, num_output):
        super(CaffeInnerProductLayer, self).__init__(name, 'InnerProduct')
        self.num_output = num_output

    def write(self, f):
        param_str = """
  inner_product_param {{
    num_output: {}
  }}""".format(self.num_output)
        f.write(self.get_template().format(param_str))


class CaffeBatchNormLayer(CaffeLayerGenerator):
    """BatchNorm layer, always written with ``use_global_stats: true``."""

    def __init__(self, name):
        super(CaffeBatchNormLayer, self).__init__(name, 'BatchNorm')

    def write(self, f):
        # Not passed through format() itself, hence the single braces.
        param_str = """
  batch_norm_param {
    use_global_stats: true
  }"""
        f.write(self.get_template().format(param_str))


class CaffeScaleLayer(CaffeLayerGenerator):
    """Scale layer, always written with ``bias_term: true``."""

    def __init__(self, name):
        super(CaffeScaleLayer, self).__init__(name, 'Scale')

    def write(self, f):
        # Not passed through format() itself, hence the single braces.
        param_str = """
  scale_param {
    bias_term: true
  }"""
        f.write(self.get_template().format(param_str))


class CaffeReluLayer(CaffeLayerGenerator):
    """ReLU layer; ``negslope`` (if given) is written as ``negative_slope``."""

    def __init__(self, name, negslope=None):
        # Caffe's layer type string is "ReLU"; the previous spelling "Relu"
        # does not match it.
        super(CaffeReluLayer, self).__init__(name, 'ReLU')
        self.negslope = negslope

    def write(self, f):
        param_str = ""
        if self.negslope is not None:
            param_str = """
  relu_param {{
    negative_slope: {}
  }}""".format(self.negslope)
        f.write(self.get_template().format(param_str))


class CaffeDropoutLayer(CaffeLayerGenerator):
    """Dropout layer; ``prob`` is written as ``dropout_ratio``."""

    def __init__(self, name, prob):
        super(CaffeDropoutLayer, self).__init__(name, 'Dropout')
        self.prob = prob

    def write(self, f):
        param_str = """
  dropout_param {{
    dropout_ratio: {}
  }}""".format(self.prob)
        f.write(self.get_template().format(param_str))


class CaffeSoftmaxLayer(CaffeLayerGenerator):
    """Softmax layer; it has no parameter block."""

    def __init__(self, name):
        super(CaffeSoftmaxLayer, self).__init__(name, 'Softmax')

    def write(self, f):
        f.write(self.get_template().format(""))
add_innerproduct_layer(self, items): 169 | self.lnum += 1 170 | prev_blob = self.layer.top[0] 171 | lname = "fc"+str(self.lnum) 172 | num_output = items['output'] 173 | self.layer = CaffeInnerProductLayer( lname, num_output ) 174 | self.layer.bottom.append( prev_blob ) 175 | self.layer.top.append( lname ) 176 | self.add_layer( self.layer ) 177 | def add_pooling_layer(self, ltype, items, global_pooling=None): 178 | prev_blob = self.layer.top[0] 179 | lname = "pool"+str(self.lnum) 180 | ksize = items['size'] if 'size' in items else None 181 | stride = items['stride'] if 'stride' in items else None 182 | pad = items['pad'] if 'pad' in items else None 183 | self.layer = CaffePoolingLayer( lname, ltype, ksize=ksize, stride=stride, pad=pad, global_pooling=global_pooling ) 184 | self.layer.bottom.append( prev_blob ) 185 | self.layer.top.append( lname ) 186 | self.add_layer( self.layer ) 187 | def add_batchnorm_layer(self, items): 188 | prev_blob = self.layer.top[0] 189 | lname = "bn"+str(self.lnum) 190 | self.layer = CaffeBatchNormLayer( lname ) 191 | self.layer.bottom.append( prev_blob ) 192 | self.layer.top.append( lname ) 193 | self.add_layer( self.layer ) 194 | def add_scale_layer(self, items): 195 | prev_blob = self.layer.top[0] 196 | lname = "scale"+str(self.lnum) 197 | self.layer = CaffeScaleLayer( lname ) 198 | self.layer.bottom.append( prev_blob ) 199 | self.layer.top.append( lname ) 200 | self.add_layer( self.layer ) 201 | def add_relu_layer(self, items): 202 | prev_blob = self.layer.top[0] 203 | lname = "relu"+str(self.lnum) 204 | self.layer = CaffeReluLayer( lname ) 205 | self.layer.bottom.append( prev_blob ) 206 | self.layer.top.append( prev_blob ) # loopback 207 | self.add_layer( self.layer ) 208 | def add_dropout_layer(self, items): 209 | prev_blob = self.layer.top[0] 210 | lname = "drop"+str(self.lnum) 211 | self.layer = CaffeDropoutLayer( lname, items['probability'] ) 212 | self.layer.bottom.append( prev_blob ) 213 | self.layer.top.append( prev_blob ) # 
loopback 214 | self.add_layer( self.layer ) 215 | def add_softmax_layer(self, items): 216 | prev_blob = self.layer.top[0] 217 | lname = "prob" 218 | self.layer = CaffeSoftmaxLayer( lname ) 219 | self.layer.bottom.append( prev_blob ) 220 | self.layer.top.append( lname ) 221 | self.add_layer( self.layer ) 222 | def finalize(self, name): 223 | self.layer.top[0] = name # replace 224 | def write(self, fname): 225 | with open(fname, 'w') as f: 226 | f.write('name: "{}"'.format(self.name)) 227 | for sec in self.sections: 228 | sec.write(f) 229 | logging.info('{} is generated'.format(fname)) 230 | 231 | ###################################################################33 232 | class uniqdict(OrderedDict): 233 | _unique = 0 234 | def __setitem__(self, key, val): 235 | if isinstance(val, OrderedDict): 236 | self._unique += 1 237 | key += "_"+str(self._unique) 238 | OrderedDict.__setitem__(self, key, val) 239 | 240 | def convert(cfgfile, ptxtfile): 241 | # 242 | parser = ConfigParser(dict_type=uniqdict) 243 | parser.read(cfgfile) 244 | netname = os.path.basename(cfgfile).split('.')[0] 245 | #print netname 246 | gen = CaffeProtoGenerator(netname) 247 | for section in parser.sections(): 248 | _section = section.split('_')[0] 249 | if _section in ["crop", "cost"]: 250 | continue 251 | # 252 | batchnorm_followed = False 253 | relu_followed = False 254 | items = dict(parser.items(section)) 255 | if 'batch_normalize' in items and items['batch_normalize']: 256 | batchnorm_followed = True 257 | if 'activation' in items and items['activation'] != 'linear': 258 | relu_followed = True 259 | # 260 | if _section == 'net': 261 | gen.add_input_layer(items) 262 | elif _section == 'convolutional': 263 | gen.add_convolution_layer(items) 264 | if batchnorm_followed: 265 | gen.add_batchnorm_layer(items) 266 | gen.add_scale_layer(items) 267 | if relu_followed: 268 | gen.add_relu_layer(items) 269 | elif _section == 'connected': 270 | gen.add_innerproduct_layer(items) 271 | if relu_followed: 272 | 
gen.add_relu_layer(items) 273 | elif _section == 'maxpool': 274 | gen.add_pooling_layer('MAX', items) 275 | elif _section == 'avgpool': 276 | gen.add_pooling_layer('AVE', items, global_pooling=True) 277 | elif _section == 'dropout': 278 | gen.add_dropout_layer(items) 279 | elif _section == 'softmax': 280 | gen.add_softmax_layer(items) 281 | else: 282 | logging.error("{} layer is not supported".format(_section)) 283 | #gen.finalize('result') 284 | gen.write(ptxtfile) 285 | 286 | def main(): 287 | parser = argparse.ArgumentParser(description='Convert YOLO cfg to Caffe prototxt') 288 | parser.add_argument('cfg', type=str, help='YOLO cfg') 289 | parser.add_argument('prototxt', type=str, help='Caffe prototxt') 290 | args = parser.parse_args() 291 | 292 | convert(args.cfg, args.prototxt) 293 | 294 | if __name__ == "__main__": 295 | main() 296 | 297 | # vim:sw=4:ts=4:et 298 | -------------------------------------------------------------------------------- /prototxt/yolo_tiny_train_val.prototxt: -------------------------------------------------------------------------------- 1 | name: "YOLONet" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 448 7 | dim: 448 8 | } 9 | 10 | layer { 11 | name: "conv1" 12 | type: "Convolution" 13 | bottom: "data" 14 | top: "conv1" 15 | param { 16 | lr_mult: 1 17 | decay_mult: 1 18 | } 19 | convolution_param { 20 | num_output: 16 21 | kernel_size: 3 22 | pad: 1 23 | bias_term: false 24 | weight_filler { 25 | type: "xavier" 26 | } 27 | } 28 | } 29 | layer { 30 | name: "bn1" 31 | type: "BatchNorm" 32 | bottom: "conv1" 33 | top: "bn1" 34 | include { phase: TRAIN } 35 | param { 36 | lr_mult: 0 37 | decay_mult: 0 38 | } 39 | param { 40 | lr_mult: 0 41 | decay_mult: 0 42 | } 43 | param { 44 | lr_mult: 0 45 | decay_mult: 0 46 | } 47 | batch_norm_param { 48 | use_global_stats: false 49 | } 50 | } 51 | layer { 52 | name: "bn1" 53 | type: "BatchNorm" 54 | bottom: "conv1" 55 | top: "bn1" 56 | include { phase: TEST } 57 | 
batch_norm_param { 58 | use_global_stats: true 59 | } 60 | } 61 | layer { 62 | name: "scale1" 63 | type: "Scale" 64 | bottom: "bn1" 65 | top: "scale1" 66 | param { 67 | lr_mult: 0 68 | decay_mult: 0 69 | } 70 | param { 71 | lr_mult: 0 72 | decay_mult: 0 73 | } 74 | scale_param { 75 | bias_term: true 76 | } 77 | } 78 | layer { 79 | name: "relu1" 80 | type: "ReLU" 81 | bottom: "scale1" 82 | top: "scale1" 83 | relu_param { 84 | negative_slope: 0.1 85 | } 86 | } 87 | layer { 88 | name: "pool1" 89 | type: "Pooling" 90 | bottom: "scale1" 91 | top: "pool1" 92 | pooling_param { 93 | pool: MAX 94 | kernel_size: 2 95 | stride: 2 96 | } 97 | } 98 | 99 | layer { 100 | name: "conv2" 101 | type: "Convolution" 102 | bottom: "pool1" 103 | top: "conv2" 104 | param { 105 | lr_mult: 1 106 | decay_mult: 1 107 | } 108 | convolution_param { 109 | num_output: 32 110 | kernel_size: 3 111 | pad: 1 112 | bias_term: false 113 | weight_filler { 114 | type: "xavier" 115 | } 116 | } 117 | } 118 | layer { 119 | name: "bn2" 120 | type: "BatchNorm" 121 | bottom: "conv2" 122 | top: "bn2" 123 | include { phase: TRAIN } 124 | param { 125 | lr_mult: 0 126 | decay_mult: 0 127 | } 128 | param { 129 | lr_mult: 0 130 | decay_mult: 0 131 | } 132 | param { 133 | lr_mult: 0 134 | decay_mult: 0 135 | } 136 | batch_norm_param { 137 | use_global_stats: false 138 | } 139 | } 140 | layer { 141 | name: "bn2" 142 | type: "BatchNorm" 143 | bottom: "conv2" 144 | top: "bn2" 145 | include { phase: TEST } 146 | batch_norm_param { 147 | use_global_stats: true 148 | } 149 | } 150 | layer { 151 | name: "scale2" 152 | type: "Scale" 153 | bottom: "bn2" 154 | top: "scale2" 155 | param { 156 | lr_mult: 0 157 | decay_mult: 0 158 | } 159 | param { 160 | lr_mult: 0 161 | decay_mult: 0 162 | } 163 | scale_param { 164 | bias_term: true 165 | } 166 | } 167 | layer { 168 | name: "relu2" 169 | type: "ReLU" 170 | bottom: "scale2" 171 | top: "scale2" 172 | relu_param { 173 | negative_slope: 0.1 174 | } 175 | } 176 | layer { 177 | name: 
"pool2" 178 | type: "Pooling" 179 | bottom: "scale2" 180 | top: "pool2" 181 | pooling_param { 182 | pool: MAX 183 | kernel_size: 2 184 | stride: 2 185 | } 186 | } 187 | 188 | layer { 189 | name: "conv3" 190 | type: "Convolution" 191 | bottom: "pool2" 192 | top: "conv3" 193 | param { 194 | lr_mult: 1 195 | decay_mult: 1 196 | } 197 | convolution_param { 198 | num_output: 64 199 | kernel_size: 3 200 | pad: 1 201 | bias_term: false 202 | weight_filler { 203 | type: "xavier" 204 | } 205 | } 206 | } 207 | layer { 208 | name: "bn3" 209 | type: "BatchNorm" 210 | bottom: "conv3" 211 | top: "bn3" 212 | include { phase: TRAIN } 213 | param { 214 | lr_mult: 0 215 | decay_mult: 0 216 | } 217 | param { 218 | lr_mult: 0 219 | decay_mult: 0 220 | } 221 | param { 222 | lr_mult: 0 223 | decay_mult: 0 224 | } 225 | batch_norm_param { 226 | use_global_stats: false 227 | } 228 | } 229 | layer { 230 | name: "bn3" 231 | type: "BatchNorm" 232 | bottom: "conv3" 233 | top: "bn3" 234 | include { phase: TEST } 235 | batch_norm_param { 236 | use_global_stats: true 237 | } 238 | } 239 | layer { 240 | name: "scale3" 241 | type: "Scale" 242 | bottom: "bn3" 243 | top: "scale3" 244 | param { 245 | lr_mult: 0 246 | decay_mult: 0 247 | } 248 | param { 249 | lr_mult: 0 250 | decay_mult: 0 251 | } 252 | scale_param { 253 | bias_term: true 254 | } 255 | } 256 | layer { 257 | name: "relu3" 258 | type: "ReLU" 259 | bottom: "scale3" 260 | top: "scale3" 261 | relu_param { 262 | negative_slope: 0.1 263 | } 264 | } 265 | layer { 266 | name: "pool3" 267 | type: "Pooling" 268 | bottom: "scale3" 269 | top: "pool3" 270 | pooling_param { 271 | pool: MAX 272 | kernel_size: 2 273 | stride: 2 274 | } 275 | } 276 | 277 | layer { 278 | name: "conv4" 279 | type: "Convolution" 280 | bottom: "pool3" 281 | top: "conv4" 282 | param { 283 | lr_mult: 1 284 | decay_mult: 1 285 | } 286 | convolution_param { 287 | num_output: 128 288 | kernel_size: 3 289 | pad: 1 290 | bias_term: false 291 | weight_filler { 292 | type: "xavier" 
293 | } 294 | } 295 | } 296 | layer { 297 | name: "bn4" 298 | type: "BatchNorm" 299 | bottom: "conv4" 300 | top: "bn4" 301 | include { phase: TRAIN } 302 | param { 303 | lr_mult: 0 304 | decay_mult: 0 305 | } 306 | param { 307 | lr_mult: 0 308 | decay_mult: 0 309 | } 310 | param { 311 | lr_mult: 0 312 | decay_mult: 0 313 | } 314 | batch_norm_param { 315 | use_global_stats: false 316 | } 317 | } 318 | layer { 319 | name: "bn4" 320 | type: "BatchNorm" 321 | bottom: "conv4" 322 | top: "bn4" 323 | include { phase: TEST } 324 | batch_norm_param { 325 | use_global_stats: true 326 | } 327 | } 328 | layer { 329 | name: "scale4" 330 | type: "Scale" 331 | bottom: "bn4" 332 | top: "scale4" 333 | param { 334 | lr_mult: 0 335 | decay_mult: 0 336 | } 337 | param { 338 | lr_mult: 0 339 | decay_mult: 0 340 | } 341 | scale_param { 342 | bias_term: true 343 | } 344 | } 345 | layer { 346 | name: "relu4" 347 | type: "ReLU" 348 | bottom: "scale4" 349 | top: "scale4" 350 | relu_param { 351 | negative_slope: 0.1 352 | } 353 | } 354 | layer { 355 | name: "pool4" 356 | type: "Pooling" 357 | bottom: "scale4" 358 | top: "pool4" 359 | pooling_param { 360 | pool: MAX 361 | kernel_size: 2 362 | stride: 2 363 | } 364 | } 365 | 366 | layer { 367 | name: "conv5" 368 | type: "Convolution" 369 | bottom: "pool4" 370 | top: "conv5" 371 | param { 372 | lr_mult: 1 373 | decay_mult: 1 374 | } 375 | convolution_param { 376 | num_output: 256 377 | kernel_size: 3 378 | pad: 1 379 | bias_term: false 380 | weight_filler { 381 | type: "xavier" 382 | } 383 | } 384 | } 385 | layer { 386 | name: "bn5" 387 | type: "BatchNorm" 388 | bottom: "conv5" 389 | top: "bn5" 390 | include { phase: TRAIN } 391 | param { 392 | lr_mult: 0 393 | decay_mult: 0 394 | } 395 | param { 396 | lr_mult: 0 397 | decay_mult: 0 398 | } 399 | param { 400 | lr_mult: 0 401 | decay_mult: 0 402 | } 403 | batch_norm_param { 404 | use_global_stats: false 405 | } 406 | } 407 | layer { 408 | name: "bn5" 409 | type: "BatchNorm" 410 | bottom: "conv5" 
411 | top: "bn5" 412 | include { phase: TEST } 413 | batch_norm_param { 414 | use_global_stats: true 415 | } 416 | } 417 | layer { 418 | name: "scale5" 419 | type: "Scale" 420 | bottom: "bn5" 421 | top: "scale5" 422 | param { 423 | lr_mult: 0 424 | decay_mult: 0 425 | } 426 | param { 427 | lr_mult: 0 428 | decay_mult: 0 429 | } 430 | scale_param { 431 | bias_term: true 432 | } 433 | } 434 | layer { 435 | name: "relu5" 436 | type: "ReLU" 437 | bottom: "scale5" 438 | top: "scale5" 439 | relu_param { 440 | negative_slope: 0.1 441 | } 442 | } 443 | layer { 444 | name: "pool5" 445 | type: "Pooling" 446 | bottom: "scale5" 447 | top: "pool5" 448 | pooling_param { 449 | pool: MAX 450 | kernel_size: 2 451 | stride: 2 452 | } 453 | } 454 | 455 | layer { 456 | name: "conv6" 457 | type: "Convolution" 458 | bottom: "pool5" 459 | top: "conv6" 460 | param { 461 | lr_mult: 1 462 | decay_mult: 1 463 | } 464 | convolution_param { 465 | num_output: 512 466 | kernel_size: 3 467 | pad: 1 468 | bias_term: false 469 | weight_filler { 470 | type: "xavier" 471 | } 472 | } 473 | } 474 | layer { 475 | name: "bn6" 476 | type: "BatchNorm" 477 | bottom: "conv6" 478 | top: "bn6" 479 | include { phase: TRAIN } 480 | param { 481 | lr_mult: 0 482 | decay_mult: 0 483 | } 484 | param { 485 | lr_mult: 0 486 | decay_mult: 0 487 | } 488 | param { 489 | lr_mult: 0 490 | decay_mult: 0 491 | } 492 | batch_norm_param { 493 | use_global_stats: false 494 | } 495 | } 496 | layer { 497 | name: "bn6" 498 | type: "BatchNorm" 499 | bottom: "conv6" 500 | top: "bn6" 501 | include { phase: TEST } 502 | batch_norm_param { 503 | use_global_stats: true 504 | } 505 | } 506 | layer { 507 | name: "scale6" 508 | type: "Scale" 509 | bottom: "bn6" 510 | top: "scale6" 511 | param { 512 | lr_mult: 0 513 | decay_mult: 0 514 | } 515 | param { 516 | lr_mult: 0 517 | decay_mult: 0 518 | } 519 | scale_param { 520 | bias_term: true 521 | } 522 | } 523 | layer { 524 | name: "relu6" 525 | type: "ReLU" 526 | bottom: "scale6" 527 | top: 
"scale6" 528 | relu_param { 529 | negative_slope: 0.1 530 | } 531 | } 532 | layer { 533 | name: "pool6" 534 | type: "Pooling" 535 | bottom: "scale6" 536 | top: "pool6" 537 | pooling_param { 538 | pool: MAX 539 | kernel_size: 2 540 | stride: 2 541 | } 542 | } 543 | 544 | layer { 545 | name: "conv7" 546 | type: "Convolution" 547 | bottom: "pool6" 548 | top: "conv7" 549 | param { 550 | lr_mult: 1 551 | decay_mult: 1 552 | } 553 | convolution_param { 554 | num_output: 1024 555 | kernel_size: 3 556 | pad: 1 557 | bias_term: false 558 | weight_filler { 559 | type: "xavier" 560 | } 561 | } 562 | } 563 | layer { 564 | name: "bn7" 565 | type: "BatchNorm" 566 | bottom: "conv7" 567 | top: "bn7" 568 | include { phase: TRAIN } 569 | param { 570 | lr_mult: 0 571 | decay_mult: 0 572 | } 573 | param { 574 | lr_mult: 0 575 | decay_mult: 0 576 | } 577 | param { 578 | lr_mult: 0 579 | decay_mult: 0 580 | } 581 | batch_norm_param { 582 | use_global_stats: false 583 | } 584 | } 585 | layer { 586 | name: "bn7" 587 | type: "BatchNorm" 588 | bottom: "conv7" 589 | top: "bn7" 590 | include { phase: TEST } 591 | batch_norm_param { 592 | use_global_stats: true 593 | } 594 | } 595 | layer { 596 | name: "scale7" 597 | type: "Scale" 598 | bottom: "bn7" 599 | top: "scale7" 600 | param { 601 | lr_mult: 0 602 | decay_mult: 0 603 | } 604 | param { 605 | lr_mult: 0 606 | decay_mult: 0 607 | } 608 | scale_param { 609 | bias_term: true 610 | } 611 | } 612 | layer { 613 | name: "relu7" 614 | type: "ReLU" 615 | bottom: "scale7" 616 | top: "scale7" 617 | relu_param { 618 | negative_slope: 0.1 619 | } 620 | } 621 | #layer { 622 | # name: "drop7" 623 | # type: "Dropout" 624 | # bottom: "scale7" 625 | # top: "scale7" 626 | # dropout_param { 627 | # dropout_ratio: 0.5 628 | # } 629 | #} 630 | 631 | layer { 632 | name: "conv8_y" 633 | type: "Convolution" 634 | bottom: "scale7" 635 | top: "conv8" 636 | param { 637 | lr_mult: 1 638 | decay_mult: 1 639 | } 640 | convolution_param { 641 | num_output: 256 642 | 
kernel_size: 3 643 | pad: 1 644 | bias_term: false 645 | weight_filler { 646 | type: "xavier" 647 | } 648 | } 649 | } 650 | layer { 651 | name: "bn8" 652 | type: "BatchNorm" 653 | bottom: "conv8" 654 | top: "bn8" 655 | include { phase: TRAIN } 656 | param { 657 | lr_mult: 0 658 | decay_mult: 0 659 | } 660 | param { 661 | lr_mult: 0 662 | decay_mult: 0 663 | } 664 | param { 665 | lr_mult: 0 666 | decay_mult: 0 667 | } 668 | batch_norm_param { 669 | use_global_stats: false 670 | } 671 | } 672 | layer { 673 | name: "bn8" 674 | type: "BatchNorm" 675 | bottom: "conv8" 676 | top: "bn8" 677 | include { phase: TEST } 678 | batch_norm_param { 679 | use_global_stats: true 680 | } 681 | } 682 | layer { 683 | name: "scale8" 684 | type: "Scale" 685 | bottom: "bn8" 686 | top: "scale8" 687 | param { 688 | lr_mult: 0 689 | decay_mult: 0 690 | } 691 | param { 692 | lr_mult: 0 693 | decay_mult: 0 694 | } 695 | scale_param { 696 | bias_term: true 697 | } 698 | } 699 | layer { 700 | name: "relu8" 701 | type: "ReLU" 702 | bottom: "scale8" 703 | top: "scale8" 704 | relu_param { 705 | negative_slope: 0.1 706 | } 707 | } 708 | 709 | layer { 710 | name: "fc9" 711 | type: "InnerProduct" 712 | bottom: "scale8" 713 | top: "result" 714 | param { 715 | lr_mult: 1 716 | decay_mult: 1 717 | } 718 | param { 719 | lr_mult: 2 720 | decay_mult: 0 721 | } 722 | inner_product_param { 723 | num_output: 1470 724 | weight_filler { 725 | type: "gaussian" 726 | std: 0.01 727 | } 728 | bias_filler { 729 | type: "constant" 730 | value: 0 731 | } 732 | } 733 | } 734 | -------------------------------------------------------------------------------- /prototxt/yolo_train_val.prototxt: -------------------------------------------------------------------------------- 1 | name: "YOLONet" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 448 7 | dim: 448 8 | } 9 | 10 | layer { 11 | name: "conv1" 12 | type: "Convolution" 13 | bottom: "data" 14 | top: "conv1" 15 | convolution_param { 16 | num_output: 64 
17 | kernel_size: 7 18 | pad: 3 19 | stride: 2 20 | weight_filler { 21 | type: "gaussian" 22 | std: 0.01 23 | } 24 | bias_filler { 25 | type: "constant" 26 | value: 0 27 | } 28 | } 29 | } 30 | layer { 31 | name: "relu1" 32 | type: "ReLU" 33 | bottom: "conv1" 34 | top: "conv1" 35 | relu_param{ 36 | negative_slope: 0.1 37 | } 38 | } 39 | layer { 40 | name: "pool1" 41 | type: "Pooling" 42 | bottom: "conv1" 43 | top: "pool1" 44 | pooling_param { 45 | pool: MAX 46 | kernel_size: 2 47 | stride: 2 48 | } 49 | } 50 | 51 | layer{ 52 | name: "conv2" 53 | type: "Convolution" 54 | bottom: "pool1" 55 | top: "conv2" 56 | convolution_param { 57 | num_output: 192 58 | kernel_size: 3 59 | pad: 1 60 | stride: 1 61 | weight_filler { 62 | type: "gaussian" 63 | std: 0.01 64 | } 65 | bias_filler { 66 | type: "constant" 67 | value: 0 68 | } 69 | } 70 | } 71 | layer { 72 | name: "relu2" 73 | type: "ReLU" 74 | bottom: "conv2" 75 | top: "conv2" 76 | relu_param{ 77 | negative_slope: 0.1 78 | } 79 | } 80 | layer { 81 | name: "pool2" 82 | type: "Pooling" 83 | bottom: "conv2" 84 | top: "pool2" 85 | pooling_param { 86 | pool: MAX 87 | kernel_size: 2 88 | stride: 2 89 | } 90 | } 91 | 92 | layer{ 93 | name: "conv3" 94 | type: "Convolution" 95 | bottom: "pool2" 96 | top: "conv3" 97 | convolution_param { 98 | num_output: 128 99 | kernel_size: 1 100 | pad: 0 101 | stride: 1 102 | weight_filler { 103 | type: "gaussian" 104 | std: 0.01 105 | } 106 | bias_filler { 107 | type: "constant" 108 | value: 0 109 | } 110 | } 111 | } 112 | layer { 113 | name: "relu3" 114 | type: "ReLU" 115 | bottom: "conv3" 116 | top: "conv3" 117 | relu_param{ 118 | negative_slope: 0.1 119 | } 120 | } 121 | 122 | 123 | layer{ 124 | name: "conv4" 125 | type: "Convolution" 126 | bottom: "conv3" 127 | top: "conv4" 128 | convolution_param { 129 | num_output: 256 130 | kernel_size: 3 131 | pad: 1 132 | stride: 1 133 | weight_filler { 134 | type: "gaussian" 135 | std: 0.01 136 | } 137 | bias_filler { 138 | type: "constant" 139 | 
value: 0 140 | } 141 | } 142 | } 143 | layer { 144 | name: "relu4" 145 | type: "ReLU" 146 | bottom: "conv4" 147 | top: "conv4" 148 | relu_param{ 149 | negative_slope: 0.1 150 | } 151 | } 152 | 153 | layer{ 154 | name: "conv5" 155 | type: "Convolution" 156 | bottom: "conv4" 157 | top: "conv5" 158 | convolution_param { 159 | num_output: 256 160 | kernel_size: 1 161 | pad: 0 162 | stride: 1 163 | weight_filler { 164 | type: "gaussian" 165 | std: 0.01 166 | } 167 | bias_filler { 168 | type: "constant" 169 | value: 0 170 | } 171 | } 172 | } 173 | layer { 174 | name: "relu5" 175 | type: "ReLU" 176 | bottom: "conv5" 177 | top: "conv5" 178 | relu_param{ 179 | negative_slope: 0.1 180 | } 181 | } 182 | 183 | layer{ 184 | name: "conv6" 185 | type: "Convolution" 186 | bottom: "conv5" 187 | top: "conv6" 188 | convolution_param { 189 | num_output: 512 190 | kernel_size: 3 191 | pad: 1 192 | stride: 1 193 | weight_filler { 194 | type: "gaussian" 195 | std: 0.01 196 | } 197 | bias_filler { 198 | type: "constant" 199 | value: 0 200 | } 201 | } 202 | } 203 | layer { 204 | name: "relu6" 205 | type: "ReLU" 206 | bottom: "conv6" 207 | top: "conv6" 208 | relu_param{ 209 | negative_slope: 0.1 210 | } 211 | } 212 | layer { 213 | name: "pool6" 214 | type: "Pooling" 215 | bottom: "conv6" 216 | top: "pool6" 217 | pooling_param { 218 | pool: MAX 219 | kernel_size: 2 220 | stride: 2 221 | } 222 | } 223 | 224 | layer{ 225 | name: "conv7" 226 | type: "Convolution" 227 | bottom: "pool6" 228 | top: "conv7" 229 | convolution_param { 230 | num_output: 256 231 | kernel_size: 1 232 | pad: 0 233 | stride: 1 234 | weight_filler { 235 | type: "gaussian" 236 | std: 0.01 237 | } 238 | bias_filler { 239 | type: "constant" 240 | value: 0 241 | } 242 | } 243 | } 244 | layer { 245 | name: "relu7" 246 | type: "ReLU" 247 | bottom: "conv7" 248 | top: "conv7" 249 | relu_param{ 250 | negative_slope: 0.1 251 | } 252 | } 253 | 254 | layer{ 255 | name: "conv8" 256 | type: "Convolution" 257 | bottom: "conv7" 258 | top: 
"conv8" 259 | convolution_param { 260 | num_output: 512 261 | kernel_size: 3 262 | pad: 1 263 | stride: 1 264 | weight_filler { 265 | type: "gaussian" 266 | std: 0.01 267 | } 268 | bias_filler { 269 | type: "constant" 270 | value: 0 271 | } 272 | } 273 | } 274 | layer { 275 | name: "relu8" 276 | type: "ReLU" 277 | bottom: "conv8" 278 | top: "conv8" 279 | relu_param{ 280 | negative_slope: 0.1 281 | } 282 | } 283 | 284 | layer{ 285 | name: "conv9" 286 | type: "Convolution" 287 | bottom: "conv8" 288 | top: "conv9" 289 | convolution_param { 290 | num_output: 256 291 | kernel_size: 1 292 | pad: 0 293 | stride: 1 294 | weight_filler { 295 | type: "gaussian" 296 | std: 0.01 297 | } 298 | bias_filler { 299 | type: "constant" 300 | value: 0 301 | } 302 | } 303 | } 304 | layer { 305 | name: "relu9" 306 | type: "ReLU" 307 | bottom: "conv9" 308 | top: "conv9" 309 | relu_param{ 310 | negative_slope: 0.1 311 | } 312 | } 313 | 314 | layer{ 315 | name: "conv10" 316 | type: "Convolution" 317 | bottom: "conv9" 318 | top: "conv10" 319 | convolution_param { 320 | num_output: 512 321 | kernel_size: 3 322 | pad: 1 323 | stride: 1 324 | weight_filler { 325 | type: "gaussian" 326 | std: 0.01 327 | } 328 | bias_filler { 329 | type: "constant" 330 | value: 0 331 | } 332 | } 333 | } 334 | layer { 335 | name: "relu10" 336 | type: "ReLU" 337 | bottom: "conv10" 338 | top: "conv10" 339 | relu_param{ 340 | negative_slope: 0.1 341 | } 342 | } 343 | 344 | layer{ 345 | name: "conv11" 346 | type: "Convolution" 347 | bottom: "conv10" 348 | top: "conv11" 349 | convolution_param { 350 | num_output: 256 351 | kernel_size: 1 352 | pad: 0 353 | stride: 1 354 | weight_filler { 355 | type: "gaussian" 356 | std: 0.01 357 | } 358 | bias_filler { 359 | type: "constant" 360 | value: 0 361 | } 362 | } 363 | } 364 | layer { 365 | name: "relu11" 366 | type: "ReLU" 367 | bottom: "conv11" 368 | top: "conv11" 369 | relu_param{ 370 | negative_slope: 0.1 371 | } 372 | } 373 | 374 | 375 | layer{ 376 | name: "conv12" 377 
| type: "Convolution" 378 | bottom: "conv11" 379 | top: "conv12" 380 | convolution_param { 381 | num_output: 512 382 | kernel_size: 3 383 | pad: 1 384 | stride: 1 385 | weight_filler { 386 | type: "gaussian" 387 | std: 0.01 388 | } 389 | bias_filler { 390 | type: "constant" 391 | value: 0 392 | } 393 | } 394 | } 395 | layer { 396 | name: "relu12" 397 | type: "ReLU" 398 | bottom: "conv12" 399 | top: "conv12" 400 | relu_param{ 401 | negative_slope: 0.1 402 | } 403 | } 404 | 405 | 406 | layer{ 407 | name: "conv13" 408 | type: "Convolution" 409 | bottom: "conv12" 410 | top: "conv13" 411 | convolution_param { 412 | num_output: 256 413 | kernel_size: 1 414 | pad: 0 415 | stride: 1 416 | weight_filler { 417 | type: "gaussian" 418 | std: 0.01 419 | } 420 | bias_filler { 421 | type: "constant" 422 | value: 0 423 | } 424 | } 425 | } 426 | layer { 427 | name: "relu13" 428 | type: "ReLU" 429 | bottom: "conv13" 430 | top: "conv13" 431 | relu_param{ 432 | negative_slope: 0.1 433 | } 434 | } 435 | 436 | layer{ 437 | name: "conv14" 438 | type: "Convolution" 439 | bottom: "conv13" 440 | top: "conv14" 441 | convolution_param { 442 | num_output: 512 443 | kernel_size: 3 444 | pad: 1 445 | stride: 1 446 | weight_filler { 447 | type: "gaussian" 448 | std: 0.01 449 | } 450 | bias_filler { 451 | type: "constant" 452 | value: 0 453 | } 454 | } 455 | } 456 | layer { 457 | name: "relu14" 458 | type: "ReLU" 459 | bottom: "conv14" 460 | top: "conv14" 461 | relu_param{ 462 | negative_slope: 0.1 463 | } 464 | } 465 | 466 | layer{ 467 | name: "conv15" 468 | type: "Convolution" 469 | bottom: "conv14" 470 | top: "conv15" 471 | convolution_param { 472 | num_output: 512 473 | kernel_size: 1 474 | pad: 0 475 | stride: 1 476 | weight_filler { 477 | type: "gaussian" 478 | std: 0.01 479 | } 480 | bias_filler { 481 | type: "constant" 482 | value: 0 483 | } 484 | } 485 | } 486 | layer { 487 | name: "relu15" 488 | type: "ReLU" 489 | bottom: "conv15" 490 | top: "conv15" 491 | relu_param{ 492 | 
negative_slope: 0.1 493 | } 494 | } 495 | 496 | 497 | layer{ 498 | name: "conv16" 499 | type: "Convolution" 500 | bottom: "conv15" 501 | top: "conv16" 502 | convolution_param { 503 | num_output: 1024 504 | kernel_size: 3 505 | pad: 1 506 | stride: 1 507 | weight_filler { 508 | type: "gaussian" 509 | std: 0.01 510 | } 511 | bias_filler { 512 | type: "constant" 513 | value: 0 514 | } 515 | } 516 | } 517 | layer { 518 | name: "relu16" 519 | type: "ReLU" 520 | bottom: "conv16" 521 | top: "conv16" 522 | relu_param{ 523 | negative_slope: 0.1 524 | } 525 | } 526 | 527 | layer { 528 | name: "pool16" 529 | type: "Pooling" 530 | bottom: "conv16" 531 | top: "pool16" 532 | pooling_param { 533 | pool: MAX 534 | kernel_size: 2 535 | stride: 2 536 | } 537 | } 538 | 539 | 540 | layer{ 541 | name: "conv17" 542 | type: "Convolution" 543 | bottom: "pool16" 544 | top: "conv17" 545 | convolution_param { 546 | num_output: 512 547 | kernel_size: 1 548 | pad: 0 549 | stride: 1 550 | weight_filler { 551 | type: "gaussian" 552 | std: 0.01 553 | } 554 | bias_filler { 555 | type: "constant" 556 | value: 0 557 | } 558 | } 559 | } 560 | layer { 561 | name: "relu17" 562 | type: "ReLU" 563 | bottom: "conv17" 564 | top: "conv17" 565 | relu_param{ 566 | negative_slope: 0.1 567 | } 568 | } 569 | 570 | 571 | layer{ 572 | name: "conv18" 573 | type: "Convolution" 574 | bottom: "conv17" 575 | top: "conv18" 576 | convolution_param { 577 | num_output: 1024 578 | kernel_size: 3 579 | pad: 1 580 | stride: 1 581 | weight_filler { 582 | type: "gaussian" 583 | std: 0.01 584 | } 585 | bias_filler { 586 | type: "constant" 587 | value: 0 588 | } 589 | } 590 | } 591 | layer { 592 | name: "relu18" 593 | type: "ReLU" 594 | bottom: "conv18" 595 | top: "conv18" 596 | relu_param{ 597 | negative_slope: 0.1 598 | } 599 | } 600 | 601 | 602 | 603 | layer{ 604 | name: "conv19" 605 | type: "Convolution" 606 | bottom: "conv18" 607 | top: "conv19" 608 | convolution_param { 609 | num_output: 512 610 | kernel_size: 1 611 | pad: 
0 612 | stride: 1 613 | weight_filler { 614 | type: "gaussian" 615 | std: 0.01 616 | } 617 | bias_filler { 618 | type: "constant" 619 | value: 0 620 | } 621 | } 622 | } 623 | layer { 624 | name: "relu19" 625 | type: "ReLU" 626 | bottom: "conv19" 627 | top: "conv19" 628 | relu_param{ 629 | negative_slope: 0.1 630 | } 631 | } 632 | 633 | 634 | 635 | layer{ 636 | name: "conv20" 637 | type: "Convolution" 638 | bottom: "conv19" 639 | top: "conv20" 640 | convolution_param { 641 | num_output: 1024 642 | kernel_size: 3 643 | pad: 1 644 | stride: 1 645 | weight_filler { 646 | type: "gaussian" 647 | std: 0.01 648 | } 649 | bias_filler { 650 | type: "constant" 651 | value: 0 652 | } 653 | } 654 | } 655 | layer { 656 | name: "relu20" 657 | type: "ReLU" 658 | bottom: "conv20" 659 | top: "conv20" 660 | relu_param{ 661 | negative_slope: 0.1 662 | } 663 | } 664 | 665 | 666 | 667 | layer{ 668 | name: "conv21" 669 | type: "Convolution" 670 | bottom: "conv20" 671 | top: "conv21" 672 | convolution_param { 673 | num_output: 1024 674 | kernel_size: 3 675 | pad: 1 676 | stride: 1 677 | weight_filler { 678 | type: "gaussian" 679 | std: 0.01 680 | } 681 | bias_filler { 682 | type: "constant" 683 | value: 0 684 | } 685 | } 686 | } 687 | layer { 688 | name: "relu21" 689 | type: "ReLU" 690 | bottom: "conv21" 691 | top: "conv21" 692 | relu_param{ 693 | negative_slope: 0.1 694 | } 695 | } 696 | 697 | 698 | layer{ 699 | name: "conv22" 700 | type: "Convolution" 701 | bottom: "conv21" 702 | top: "conv22" 703 | convolution_param { 704 | num_output: 1024 705 | kernel_size: 3 706 | pad: 1 707 | stride: 2 708 | weight_filler { 709 | type: "gaussian" 710 | std: 0.01 711 | } 712 | bias_filler { 713 | type: "constant" 714 | value: 0 715 | } 716 | } 717 | } 718 | layer { 719 | name: "relu22" 720 | type: "ReLU" 721 | bottom: "conv22" 722 | top: "conv22" 723 | relu_param{ 724 | negative_slope: 0.1 725 | } 726 | } 727 | 728 | 729 | 730 | layer{ 731 | name: "conv23" 732 | type: "Convolution" 733 | bottom: 
"conv22" 734 | top: "conv23" 735 | convolution_param { 736 | num_output: 1024 737 | kernel_size: 3 738 | pad: 1 739 | stride: 1 740 | weight_filler { 741 | type: "gaussian" 742 | std: 0.01 743 | } 744 | bias_filler { 745 | type: "constant" 746 | value: 0 747 | } 748 | } 749 | } 750 | layer { 751 | name: "relu23" 752 | type: "ReLU" 753 | bottom: "conv23" 754 | top: "conv23" 755 | relu_param{ 756 | negative_slope: 0.1 757 | } 758 | } 759 | 760 | 761 | layer{ 762 | name: "conv24" 763 | type: "Convolution" 764 | bottom: "conv23" 765 | top: "conv24" 766 | convolution_param { 767 | num_output: 1024 768 | kernel_size: 3 769 | pad: 1 770 | stride: 1 771 | weight_filler { 772 | type: "gaussian" 773 | std: 0.01 774 | } 775 | bias_filler { 776 | type: "constant" 777 | value: 0 778 | } 779 | } 780 | } 781 | layer { 782 | name: "relu24" 783 | type: "ReLU" 784 | bottom: "conv24" 785 | top: "conv24" 786 | relu_param{ 787 | negative_slope: 0.1 788 | } 789 | } 790 | 791 | 792 | 793 | 794 | layer{ 795 | name: "fc25" 796 | type: "InnerProduct" 797 | bottom: "conv24" 798 | top: "fc25" 799 | inner_product_param { 800 | num_output: 4096 801 | weight_filler { 802 | type: "gaussian" 803 | std: 0.01 804 | } 805 | bias_filler { 806 | type: "constant" 807 | value: 0 808 | } 809 | } 810 | } 811 | layer { 812 | name: "relu25" 813 | type: "ReLU" 814 | bottom: "fc25" 815 | top: "fc25" 816 | relu_param{ 817 | negative_slope: 0.1 818 | } 819 | } 820 | 821 | 822 | layer{ 823 | name: "fc26" 824 | type: "InnerProduct" 825 | bottom: "fc25" 826 | top: "result" 827 | inner_product_param { 828 | num_output: 1470 829 | weight_filler { 830 | type: "gaussian" 831 | std: 0.01 832 | } 833 | bias_filler { 834 | type: "constant" 835 | value: 0 836 | } 837 | } 838 | } 839 | 840 | -------------------------------------------------------------------------------- /prototxt/yolo_small_train_val.prototxt: -------------------------------------------------------------------------------- 1 | name: "YOLONet" 2 | input: 
"data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 448 7 | dim: 448 8 | } 9 | 10 | layer { 11 | name: "conv1" 12 | type: "Convolution" 13 | bottom: "data" 14 | top: "conv1" 15 | convolution_param { 16 | num_output: 64 17 | kernel_size: 7 18 | pad: 3 19 | stride: 2 20 | weight_filler { 21 | type: "gaussian" 22 | std: 0.01 23 | } 24 | bias_filler { 25 | type: "constant" 26 | value: 0 27 | } 28 | } 29 | } 30 | layer { 31 | name: "relu1" 32 | type: "ReLU" 33 | bottom: "conv1" 34 | top: "conv1" 35 | relu_param{ 36 | negative_slope: 0.1 37 | } 38 | } 39 | layer { 40 | name: "pool1" 41 | type: "Pooling" 42 | bottom: "conv1" 43 | top: "pool1" 44 | pooling_param { 45 | pool: MAX 46 | kernel_size: 2 47 | stride: 2 48 | } 49 | } 50 | 51 | layer{ 52 | name: "conv2" 53 | type: "Convolution" 54 | bottom: "pool1" 55 | top: "conv2" 56 | convolution_param { 57 | num_output: 192 58 | kernel_size: 3 59 | pad: 1 60 | stride: 1 61 | weight_filler { 62 | type: "gaussian" 63 | std: 0.01 64 | } 65 | bias_filler { 66 | type: "constant" 67 | value: 0 68 | } 69 | } 70 | } 71 | layer { 72 | name: "relu2" 73 | type: "ReLU" 74 | bottom: "conv2" 75 | top: "conv2" 76 | relu_param{ 77 | negative_slope: 0.1 78 | } 79 | } 80 | layer { 81 | name: "pool2" 82 | type: "Pooling" 83 | bottom: "conv2" 84 | top: "pool2" 85 | pooling_param { 86 | pool: MAX 87 | kernel_size: 2 88 | stride: 2 89 | } 90 | } 91 | 92 | layer{ 93 | name: "conv3" 94 | type: "Convolution" 95 | bottom: "pool2" 96 | top: "conv3" 97 | convolution_param { 98 | num_output: 128 99 | kernel_size: 1 100 | pad: 0 101 | stride: 1 102 | weight_filler { 103 | type: "gaussian" 104 | std: 0.01 105 | } 106 | bias_filler { 107 | type: "constant" 108 | value: 0 109 | } 110 | } 111 | } 112 | layer { 113 | name: "relu3" 114 | type: "ReLU" 115 | bottom: "conv3" 116 | top: "conv3" 117 | relu_param{ 118 | negative_slope: 0.1 119 | } 120 | } 121 | 122 | 123 | layer{ 124 | name: "conv4" 125 | type: "Convolution" 126 | bottom: "conv3" 127 | top: 
"conv4" 128 | convolution_param { 129 | num_output: 256 130 | kernel_size: 3 131 | pad: 1 132 | stride: 1 133 | weight_filler { 134 | type: "gaussian" 135 | std: 0.01 136 | } 137 | bias_filler { 138 | type: "constant" 139 | value: 0 140 | } 141 | } 142 | } 143 | layer { 144 | name: "relu4" 145 | type: "ReLU" 146 | bottom: "conv4" 147 | top: "conv4" 148 | relu_param{ 149 | negative_slope: 0.1 150 | } 151 | } 152 | 153 | layer{ 154 | name: "conv5" 155 | type: "Convolution" 156 | bottom: "conv4" 157 | top: "conv5" 158 | convolution_param { 159 | num_output: 256 160 | kernel_size: 1 161 | pad: 0 162 | stride: 1 163 | weight_filler { 164 | type: "gaussian" 165 | std: 0.01 166 | } 167 | bias_filler { 168 | type: "constant" 169 | value: 0 170 | } 171 | } 172 | } 173 | layer { 174 | name: "relu5" 175 | type: "ReLU" 176 | bottom: "conv5" 177 | top: "conv5" 178 | relu_param{ 179 | negative_slope: 0.1 180 | } 181 | } 182 | 183 | layer{ 184 | name: "conv6" 185 | type: "Convolution" 186 | bottom: "conv5" 187 | top: "conv6" 188 | convolution_param { 189 | num_output: 512 190 | kernel_size: 3 191 | pad: 1 192 | stride: 1 193 | weight_filler { 194 | type: "gaussian" 195 | std: 0.01 196 | } 197 | bias_filler { 198 | type: "constant" 199 | value: 0 200 | } 201 | } 202 | } 203 | layer { 204 | name: "relu6" 205 | type: "ReLU" 206 | bottom: "conv6" 207 | top: "conv6" 208 | relu_param{ 209 | negative_slope: 0.1 210 | } 211 | } 212 | layer { 213 | name: "pool6" 214 | type: "Pooling" 215 | bottom: "conv6" 216 | top: "pool6" 217 | pooling_param { 218 | pool: MAX 219 | kernel_size: 2 220 | stride: 2 221 | } 222 | } 223 | 224 | layer{ 225 | name: "conv7" 226 | type: "Convolution" 227 | bottom: "pool6" 228 | top: "conv7" 229 | convolution_param { 230 | num_output: 256 231 | kernel_size: 1 232 | pad: 0 233 | stride: 1 234 | weight_filler { 235 | type: "gaussian" 236 | std: 0.01 237 | } 238 | bias_filler { 239 | type: "constant" 240 | value: 0 241 | } 242 | } 243 | } 244 | layer { 245 | name: 
"relu7" 246 | type: "ReLU" 247 | bottom: "conv7" 248 | top: "conv7" 249 | relu_param{ 250 | negative_slope: 0.1 251 | } 252 | } 253 | 254 | layer{ 255 | name: "conv8" 256 | type: "Convolution" 257 | bottom: "conv7" 258 | top: "conv8" 259 | convolution_param { 260 | num_output: 512 261 | kernel_size: 3 262 | pad: 1 263 | stride: 1 264 | weight_filler { 265 | type: "gaussian" 266 | std: 0.01 267 | } 268 | bias_filler { 269 | type: "constant" 270 | value: 0 271 | } 272 | } 273 | } 274 | layer { 275 | name: "relu8" 276 | type: "ReLU" 277 | bottom: "conv8" 278 | top: "conv8" 279 | relu_param{ 280 | negative_slope: 0.1 281 | } 282 | } 283 | 284 | layer{ 285 | name: "conv9" 286 | type: "Convolution" 287 | bottom: "conv8" 288 | top: "conv9" 289 | convolution_param { 290 | num_output: 256 291 | kernel_size: 1 292 | pad: 0 293 | stride: 1 294 | weight_filler { 295 | type: "gaussian" 296 | std: 0.01 297 | } 298 | bias_filler { 299 | type: "constant" 300 | value: 0 301 | } 302 | } 303 | } 304 | layer { 305 | name: "relu9" 306 | type: "ReLU" 307 | bottom: "conv9" 308 | top: "conv9" 309 | relu_param{ 310 | negative_slope: 0.1 311 | } 312 | } 313 | 314 | layer{ 315 | name: "conv10" 316 | type: "Convolution" 317 | bottom: "conv9" 318 | top: "conv10" 319 | convolution_param { 320 | num_output: 512 321 | kernel_size: 3 322 | pad: 1 323 | stride: 1 324 | weight_filler { 325 | type: "gaussian" 326 | std: 0.01 327 | } 328 | bias_filler { 329 | type: "constant" 330 | value: 0 331 | } 332 | } 333 | } 334 | layer { 335 | name: "relu10" 336 | type: "ReLU" 337 | bottom: "conv10" 338 | top: "conv10" 339 | relu_param{ 340 | negative_slope: 0.1 341 | } 342 | } 343 | 344 | layer{ 345 | name: "conv11" 346 | type: "Convolution" 347 | bottom: "conv10" 348 | top: "conv11" 349 | convolution_param { 350 | num_output: 256 351 | kernel_size: 1 352 | pad: 0 353 | stride: 1 354 | weight_filler { 355 | type: "gaussian" 356 | std: 0.01 357 | } 358 | bias_filler { 359 | type: "constant" 360 | value: 0 361 | 
} 362 | } 363 | } 364 | layer { 365 | name: "relu11" 366 | type: "ReLU" 367 | bottom: "conv11" 368 | top: "conv11" 369 | relu_param{ 370 | negative_slope: 0.1 371 | } 372 | } 373 | 374 | 375 | layer{ 376 | name: "conv12" 377 | type: "Convolution" 378 | bottom: "conv11" 379 | top: "conv12" 380 | convolution_param { 381 | num_output: 512 382 | kernel_size: 3 383 | pad: 1 384 | stride: 1 385 | weight_filler { 386 | type: "gaussian" 387 | std: 0.01 388 | } 389 | bias_filler { 390 | type: "constant" 391 | value: 0 392 | } 393 | } 394 | } 395 | layer { 396 | name: "relu12" 397 | type: "ReLU" 398 | bottom: "conv12" 399 | top: "conv12" 400 | relu_param{ 401 | negative_slope: 0.1 402 | } 403 | } 404 | 405 | 406 | layer{ 407 | name: "conv13" 408 | type: "Convolution" 409 | bottom: "conv12" 410 | top: "conv13" 411 | convolution_param { 412 | num_output: 256 413 | kernel_size: 1 414 | pad: 0 415 | stride: 1 416 | weight_filler { 417 | type: "gaussian" 418 | std: 0.01 419 | } 420 | bias_filler { 421 | type: "constant" 422 | value: 0 423 | } 424 | } 425 | } 426 | layer { 427 | name: "relu13" 428 | type: "ReLU" 429 | bottom: "conv13" 430 | top: "conv13" 431 | relu_param{ 432 | negative_slope: 0.1 433 | } 434 | } 435 | 436 | layer{ 437 | name: "conv14" 438 | type: "Convolution" 439 | bottom: "conv13" 440 | top: "conv14" 441 | convolution_param { 442 | num_output: 512 443 | kernel_size: 3 444 | pad: 1 445 | stride: 1 446 | weight_filler { 447 | type: "gaussian" 448 | std: 0.01 449 | } 450 | bias_filler { 451 | type: "constant" 452 | value: 0 453 | } 454 | } 455 | } 456 | layer { 457 | name: "relu14" 458 | type: "ReLU" 459 | bottom: "conv14" 460 | top: "conv14" 461 | relu_param{ 462 | negative_slope: 0.1 463 | } 464 | } 465 | 466 | layer{ 467 | name: "conv15" 468 | type: "Convolution" 469 | bottom: "conv14" 470 | top: "conv15" 471 | convolution_param { 472 | num_output: 512 473 | kernel_size: 1 474 | pad: 0 475 | stride: 1 476 | weight_filler { 477 | type: "gaussian" 478 | std: 0.01 
479 | } 480 | bias_filler { 481 | type: "constant" 482 | value: 0 483 | } 484 | } 485 | } 486 | layer { 487 | name: "relu15" 488 | type: "ReLU" 489 | bottom: "conv15" 490 | top: "conv15" 491 | relu_param{ 492 | negative_slope: 0.1 493 | } 494 | } 495 | 496 | 497 | layer{ 498 | name: "conv16" 499 | type: "Convolution" 500 | bottom: "conv15" 501 | top: "conv16" 502 | convolution_param { 503 | num_output: 1024 504 | kernel_size: 3 505 | pad: 1 506 | stride: 1 507 | weight_filler { 508 | type: "gaussian" 509 | std: 0.01 510 | } 511 | bias_filler { 512 | type: "constant" 513 | value: 0 514 | } 515 | } 516 | } 517 | layer { 518 | name: "relu16" 519 | type: "ReLU" 520 | bottom: "conv16" 521 | top: "conv16" 522 | relu_param{ 523 | negative_slope: 0.1 524 | } 525 | } 526 | 527 | layer { 528 | name: "pool16" 529 | type: "Pooling" 530 | bottom: "conv16" 531 | top: "pool16" 532 | pooling_param { 533 | pool: MAX 534 | kernel_size: 2 535 | stride: 2 536 | } 537 | } 538 | 539 | 540 | layer{ 541 | name: "conv17" 542 | type: "Convolution" 543 | bottom: "pool16" 544 | top: "conv17" 545 | convolution_param { 546 | num_output: 512 547 | kernel_size: 1 548 | pad: 0 549 | stride: 1 550 | weight_filler { 551 | type: "gaussian" 552 | std: 0.01 553 | } 554 | bias_filler { 555 | type: "constant" 556 | value: 0 557 | } 558 | } 559 | } 560 | layer { 561 | name: "relu17" 562 | type: "ReLU" 563 | bottom: "conv17" 564 | top: "conv17" 565 | relu_param{ 566 | negative_slope: 0.1 567 | } 568 | } 569 | 570 | 571 | layer{ 572 | name: "conv18" 573 | type: "Convolution" 574 | bottom: "conv17" 575 | top: "conv18" 576 | convolution_param { 577 | num_output: 1024 578 | kernel_size: 3 579 | pad: 1 580 | stride: 1 581 | weight_filler { 582 | type: "gaussian" 583 | std: 0.01 584 | } 585 | bias_filler { 586 | type: "constant" 587 | value: 0 588 | } 589 | } 590 | } 591 | layer { 592 | name: "relu18" 593 | type: "ReLU" 594 | bottom: "conv18" 595 | top: "conv18" 596 | relu_param{ 597 | negative_slope: 0.1 598 | 
} 599 | } 600 | 601 | 602 | 603 | layer{ 604 | name: "conv19" 605 | type: "Convolution" 606 | bottom: "conv18" 607 | top: "conv19" 608 | convolution_param { 609 | num_output: 512 610 | kernel_size: 1 611 | pad: 0 612 | stride: 1 613 | weight_filler { 614 | type: "gaussian" 615 | std: 0.01 616 | } 617 | bias_filler { 618 | type: "constant" 619 | value: 0 620 | } 621 | } 622 | } 623 | layer { 624 | name: "relu19" 625 | type: "ReLU" 626 | bottom: "conv19" 627 | top: "conv19" 628 | relu_param{ 629 | negative_slope: 0.1 630 | } 631 | } 632 | 633 | 634 | 635 | layer{ 636 | name: "conv20" 637 | type: "Convolution" 638 | bottom: "conv19" 639 | top: "conv20" 640 | convolution_param { 641 | num_output: 1024 642 | kernel_size: 3 643 | pad: 1 644 | stride: 1 645 | weight_filler { 646 | type: "gaussian" 647 | std: 0.01 648 | } 649 | bias_filler { 650 | type: "constant" 651 | value: 0 652 | } 653 | } 654 | } 655 | layer { 656 | name: "relu20" 657 | type: "ReLU" 658 | bottom: "conv20" 659 | top: "conv20" 660 | relu_param{ 661 | negative_slope: 0.1 662 | } 663 | } 664 | 665 | 666 | 667 | layer{ 668 | name: "conv21" 669 | type: "Convolution" 670 | bottom: "conv20" 671 | top: "conv21" 672 | convolution_param { 673 | num_output: 1024 674 | kernel_size: 3 675 | pad: 1 676 | stride: 1 677 | weight_filler { 678 | type: "gaussian" 679 | std: 0.01 680 | } 681 | bias_filler { 682 | type: "constant" 683 | value: 0 684 | } 685 | } 686 | } 687 | layer { 688 | name: "relu21" 689 | type: "ReLU" 690 | bottom: "conv21" 691 | top: "conv21" 692 | relu_param{ 693 | negative_slope: 0.1 694 | } 695 | } 696 | 697 | 698 | layer{ 699 | name: "conv22" 700 | type: "Convolution" 701 | bottom: "conv21" 702 | top: "conv22" 703 | convolution_param { 704 | num_output: 1024 705 | kernel_size: 3 706 | pad: 1 707 | stride: 2 708 | weight_filler { 709 | type: "gaussian" 710 | std: 0.01 711 | } 712 | bias_filler { 713 | type: "constant" 714 | value: 0 715 | } 716 | } 717 | } 718 | layer { 719 | name: "relu22" 720 | 
type: "ReLU" 721 | bottom: "conv22" 722 | top: "conv22" 723 | relu_param{ 724 | negative_slope: 0.1 725 | } 726 | } 727 | 728 | 729 | 730 | layer{ 731 | name: "conv23" 732 | type: "Convolution" 733 | bottom: "conv22" 734 | top: "conv23" 735 | convolution_param { 736 | num_output: 1024 737 | kernel_size: 3 738 | pad: 1 739 | stride: 1 740 | weight_filler { 741 | type: "gaussian" 742 | std: 0.01 743 | } 744 | bias_filler { 745 | type: "constant" 746 | value: 0 747 | } 748 | } 749 | } 750 | layer { 751 | name: "relu23" 752 | type: "ReLU" 753 | bottom: "conv23" 754 | top: "conv23" 755 | relu_param{ 756 | negative_slope: 0.1 757 | } 758 | } 759 | 760 | 761 | layer{ 762 | name: "conv24" 763 | type: "Convolution" 764 | bottom: "conv23" 765 | top: "conv24" 766 | convolution_param { 767 | num_output: 1024 768 | kernel_size: 3 769 | pad: 1 770 | stride: 1 771 | weight_filler { 772 | type: "gaussian" 773 | std: 0.01 774 | } 775 | bias_filler { 776 | type: "constant" 777 | value: 0 778 | } 779 | } 780 | } 781 | layer { 782 | name: "relu24" 783 | type: "ReLU" 784 | bottom: "conv24" 785 | top: "conv24" 786 | relu_param{ 787 | negative_slope: 0.1 788 | } 789 | } 790 | 791 | 792 | 793 | 794 | layer{ 795 | name: "fc25" 796 | type: "InnerProduct" 797 | bottom: "conv24" 798 | top: "fc25" 799 | inner_product_param { 800 | num_output: 512 801 | weight_filler { 802 | type: "gaussian" 803 | std: 0.01 804 | } 805 | bias_filler { 806 | type: "constant" 807 | value: 0 808 | } 809 | } 810 | } 811 | layer { 812 | name: "relu25" 813 | type: "ReLU" 814 | bottom: "fc25" 815 | top: "fc25" 816 | relu_param{ 817 | negative_slope: 0.1 818 | } 819 | } 820 | 821 | 822 | layer{ 823 | name: "fc26" 824 | type: "InnerProduct" 825 | bottom: "fc25" 826 | top: "fc26" 827 | inner_product_param { 828 | num_output: 4096 829 | weight_filler { 830 | type: "gaussian" 831 | std: 0.01 832 | } 833 | bias_filler { 834 | type: "constant" 835 | value: 0 836 | } 837 | } 838 | } 839 | layer { 840 | name: "relu26" 841 | 
type: "ReLU" 842 | bottom: "fc26" 843 | top: "fc26" 844 | relu_param{ 845 | negative_slope: 0.1 846 | } 847 | } 848 | 849 | 850 | layer{ 851 | name: "fc27" 852 | type: "InnerProduct" 853 | bottom: "fc26" 854 | top: "result" 855 | inner_product_param { 856 | num_output: 1470 857 | weight_filler { 858 | type: "gaussian" 859 | std: 0.01 860 | } 861 | bias_filler { 862 | type: "constant" 863 | value: 0 864 | } 865 | } 866 | } 867 | 868 | --------------------------------------------------------------------------------