├── LICENSE
├── README.md
├── cfg.py
├── cfg
│   ├── coco.data
│   ├── darknet19_448.cfg
│   ├── tiny-yolo-voc.cfg
│   ├── tiny-yolo.cfg
│   ├── voc.data
│   ├── yolo-voc.cfg
│   ├── yolo.cfg
│   └── yolo_v3.cfg
├── darknet.py
├── data
│   ├── coco.names
│   ├── dog.jpg
│   ├── eagle.jpg
│   ├── giraffe.jpg
│   ├── horses.jpg
│   ├── person.jpg
│   ├── predictions-yolov2.jpg
│   ├── predictions-yolov3.jpg
│   ├── scream.jpg
│   ├── voc.names
│   └── voc_label.py
├── dataset.py
├── debug.py
├── demo.py
├── detect.py
├── eval.py
├── focal_loss.py
├── image.py
├── layers
│   └── batchnorm
│       ├── Makefile
│       ├── bn.py
│       ├── bn_lib
│       │   └── __init__.py
│       ├── build.py
│       └── src
│           ├── batchnorm.c
│           ├── batchnorm.h
│           ├── blas.c
│           ├── blas.h
│           ├── blas_kernels.cu
│           ├── cuda.c
│           └── cuda.h
├── models
│   ├── caffe_net.py
│   ├── resnet.py
│   └── tiny_yolo.py
├── outputs.py
├── partial.py
├── recall.py
├── region_layer.py
├── scripts
│   ├── coco_eval.py
│   ├── eval_all.py
│   ├── eval_ap.py
│   ├── eval_widerface.py
│   ├── my_eval.py
│   ├── voc_eval.py_old_version_
│   └── voc_label.py
├── tools
│   └── lmdb
│       ├── create_dataset.py
│       ├── lmdb_utils.py
│       ├── plot_lmdb.py
│       └── train_lmdb.py
├── train.py
├── utils.py
├── valid.py
└── yolo_layer.py

/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2018 Young-Sun (Andy) Yun
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pytorch-0.4-yolov3 : Yet Another Implementation of PyTorch 0.4.1 (or later) and YOLOv3
2 | ## This repository implements yolov3 with pytorch 0.4 and is derived from marvis/pytorch-yolo2.
3 | This repository is forked from the great pytorch-yolo2 work of @github/marvis,
4 | but I could not modify the marvis source files directly because many files were changed, even the filenames.
5 | 
6 | ### Differences between this repository and the original marvis version
7 | * Some programs are restructured for Windows environments
8 | (for example, \_\_name\_\_ == '\_\_main\_\_' is checked so that multiple worker processes start safely).
9 | * Weight loading and saving are modified to be compatible with both yolov2 and yolov3
10 | (that is, this repository works with yolov2 and yolov3 configurations without source modification).
11 | * yolov3 detection and training are fully supported.
12 | * region_loss.py is renamed to region_layer.py.
13 | * The outputs of region_layer.py and yolo_layer.py are enclosed in dictionary variables.
14 | * The code is modified to work with pytorch 0.4 and python3.
15 | * Some code is modified for speed and easier reading. (I'm not sure.. T_T)
16 | * In training mode, NaN values are checked and gradient clipping is used.
17 | 
18 | #### If you want to know the training and detection procedures, please refer to https://github.com/marvis/pytorch-yolo2 for the details.
19 | 
20 | ### Train your own data, or the coco or voc data, as follows:
21 | ```
22 | python train.py -d cfg/coco.data -c cfg/yolo_v3.cfg -w yolov3.weights
23 | ```
24 | 
25 | * New weights are saved in the backup directory by epoch number (the last 5 weights are kept; you can control the number of backups in train.py).
26 | 
27 | * The above command shows an example of the training process. I did not execute this exact command, but I did successfully train my own data with the pretrained yolov3.weights.
28 | 
29 | * You __should__ note that the anchor information differs between the yolov2 and yolov3 models.
30 | 
31 | * If you want to use the pretrained weights as the initial weights, add the -r option to the training command:
32 | 
33 | ```
34 | python train.py -d cfg/my.data -c cfg/my.cfg -w yolov3.weights -r
35 | ```
36 | 
37 | * The maximum-epochs option, which is calculated automatically, is sometimes too small; in that case, set max_epochs in your configuration.
38 | 
39 | #### Recorded yolov2 and yolov3 training for my own data
40 | * When you click the images, the videos will play on youtube.com.
41 | 
42 | * yolov2 training recorded:
43 | [![yolov2 training](https://img.youtube.com/vi/jhoaVeqtOQw/0.jpg)](https://www.youtube.com/watch?v=jhoaVeqtOQw)
44 | 
45 | * yolov3 training recorded:
46 | [![yolov3 training](https://img.youtube.com/vi/zazKAm9FClc/0.jpg)](https://www.youtube.com/watch?v=zazKAm9FClc)
47 | 
48 | * In the recorded videos above, if you use the pretrained weights as a base, you can see a large number of proposals during roughly the first 10 to 20 epochs. As training progresses, nPP decreases to zero and then increases again as the model improves.
49 | 
50 | * The convergence of yolov2 and yolov3 differs because yolov2 updates all boxes during the first 12800 exposures.
51 | 
52 | ### Detect the objects in the dog image using pretrained weights
53 | 
54 | #### yolov2 models
55 | ```
56 | wget http://pjreddie.com/media/files/yolo.weights
57 | python detect.py cfg/yolo.cfg yolo.weights data/dog.jpg data/coco.names
58 | ```
59 | 
60 | ![predictions](data/predictions-yolov2.jpg)
61 | 
62 | Loading weights from yolo.weights... Done!
63 | data\dog.jpg: Predicted in 0.832918 seconds.
64 | 3 box(es) is(are) found
65 | truck: 0.934710
66 | bicycle: 0.998012
67 | dog: 0.990524
68 | save plot results to predictions.jpg
69 | 
70 | #### yolov3 models
71 | ```
72 | wget https://pjreddie.com/media/files/yolov3.weights
73 | python detect.py cfg/yolo_v3.cfg yolov3.weights data/dog.jpg data/coco.names
74 | ```
75 | 
76 | ![predictions](data/predictions-yolov3.jpg)
77 | 
78 | Loading weights from yolov3.weights... Done!
79 | 
80 | data\dog.jpg: Predicted in 0.837523 seconds.
81 | 3 box(es) is(are) found
82 | dog: 0.999996
83 | truck: 0.995232
84 | bicycle: 0.999973
85 | save plot results to predictions.jpg
86 | 
87 | ### Validation and evaluation results
88 | 
89 | ```
90 | python valid.py data/yourown.data cfg/yourown.cfg yourown_weights
91 | ```
92 | 
93 | ### Performance on the voc dataset using yolov2 (with 100 epochs of training)
94 | - CrossEntropyLoss is used to compare classes.
95 | - Performance varies with the weighting factors, for example:
96 | ```
97 | coord_scale=1, object_scale=5, class_scale=1 mAP = 73.1
98 | coord_scale=1, object_scale=5, class_scale=2 mAP = 72.7
99 | coord_scale=1, object_scale=3, class_scale=1 mAP = 73.4
100 | coord_scale=1, object_scale=3, class_scale=2 mAP = 72.8
101 | coord_scale=1, object_scale=1, class_scale=1 mAP = 50.4
102 | ```
103 | 
104 | - After modifying the anchor information in yolo-voc.cfg and applying the new coord_mask,
105 | I finally got
106 | ```
107 | anchors = 1.1468, 1.5021, 2.7780, 3.4751, 4.3845, 7.0162, 8.2523, 4.2100, 9.7340, 8.682
108 | coord_scale=1, object_scale=3, class_scale=1 mAP = 74.4
109 | ```
110 | 
111 | - Using yolov3 with self.rescore = 1 and the latest code, ___mAP = 74.9___ (with 170 epochs of training).
112 | 
113 | Therefore, you may need to run many experiments to get the best performance.
114 | 
115 | ### License
116 | 
117 | MIT License (see LICENSE file).
118 | 
119 | 
--------------------------------------------------------------------------------
/cfg.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from utils import convert2cpu
3 | 
4 | def parse_cfg(cfgfile):
5 |     blocks = []
6 |     fp = open(cfgfile, 'r')
7 |     block = None
8 |     line = fp.readline()
9 |     while line != '':
10 |         line = line.rstrip()
11 |         if line == '' or line[0] == '#':
12 |             line = fp.readline()
13 |             continue
14 |         elif line[0] == '[':
15 |             if block:
16 |                 blocks.append(block)
17 |             block = dict()
18 |             block['type'] = line.lstrip('[').rstrip(']')
19 |             # set default value
20 |             if block['type'] == 'convolutional':
21 |                 block['batch_normalize'] = 0
22 |         else:
23 |             key,value = line.split('=')
24 |             key = key.strip()
25 |             if key == 'type':
26 |                 key = '_type'
27 |             value = value.strip()
28 |             block[key] = value
29 |         line = fp.readline()
30 | 
31 |     if block:
32 |         blocks.append(block)
33 |     fp.close()
34 |     return blocks
35 | 
36 | def print_cfg(blocks):
37 |     print('layer     filters    size              input                output')
38 |     prev_width = 416
39 |     prev_height = 416
40 |     prev_filters = 3
41 |     out_filters = []
42 |     out_widths = []
43 |     out_heights = []
44 |     ind = -2
45 |     for block in blocks:
46 |         ind = ind + 1
47 |         if block['type'] == 'net':
48 |             prev_width = int(block['width'])
49 |             prev_height = int(block['height'])
50 |             continue
51 |         elif block['type'] == 'convolutional':
52 |             filters = int(block['filters'])
53 |             kernel_size = int(block['size'])
54 |             stride = int(block['stride'])
55 |             is_pad = int(block['pad'])
56 |             pad = (kernel_size-1)//2 if is_pad else 0
57 |             width = (prev_width + 2*pad - kernel_size)//stride + 1
58 |             height = (prev_height + 2*pad - kernel_size)//stride + 1
59 |             print('%5d %-6s %4d  %d x %d / %d   %3d x %3d x%4d   ->   %3d x %3d x%4d' % (ind, 'conv', filters, kernel_size, kernel_size, stride, prev_width, prev_height, prev_filters, width, height, filters))
60 |             prev_width = width
61 |             prev_height = height
62 |             prev_filters = filters
63 |             out_widths.append(prev_width)
64 |             out_heights.append(prev_height)
65 |             out_filters.append(prev_filters)
66 |         elif block['type'] == 'maxpool':
67 |             pool_size = int(block['size'])
68 |             stride = int(block['stride'])
69 |             width = prev_width//stride
70 |             height = prev_height//stride
71 |             print('%5d %-6s       %d x %d / %d   %3d x %3d x%4d   ->   %3d x %3d x%4d' % (ind, 'max', pool_size, pool_size, stride, prev_width, prev_height, prev_filters, width, height, prev_filters))
72 |             prev_width = width
73 |             prev_height = height
74 |             # prev_filters is unchanged: pooling keeps the channel count
75 |             out_widths.append(prev_width)
76 |             out_heights.append(prev_height)
77 |             out_filters.append(prev_filters)
78 |         elif block['type'] == 'avgpool':
79 |             width = 1
80 |             height = 1
81 |             print('%5d %-6s                   %3d x %3d x%4d   ->  %3d' % (ind, 'avg', prev_width, prev_height, prev_filters, prev_filters))
82 |             prev_width = width
83 |             prev_height = height
84 |             # prev_filters is unchanged: global average pooling keeps the channel count
85 |             out_widths.append(prev_width)
86 |             out_heights.append(prev_height)
87 |             out_filters.append(prev_filters)
88 |         elif block['type'] == 'softmax':
89 |             print('%5d %-6s                                    ->  %3d' % (ind, 'softmax', prev_filters))
90 |             out_widths.append(prev_width)
91 |             out_heights.append(prev_height)
92 |             out_filters.append(prev_filters)
93 |         elif block['type'] == 'cost':
94 |             print('%5d %-6s                                    ->  %3d' % (ind, 'cost', prev_filters))
95 |             out_widths.append(prev_width)
96 |             out_heights.append(prev_height)
97 |             out_filters.append(prev_filters)
98 |         elif block['type'] == 'reorg':
99 |             stride = int(block['stride'])
100 |             filters = stride * stride * prev_filters
101 |             width = prev_width//stride
102 |             height = prev_height//stride
103 |             print('%5d %-6s             / %d   %3d x %3d x%4d   ->   %3d x %3d x%4d' % (ind, 'reorg', stride, prev_width, prev_height, prev_filters, width, height, filters))
104 |             prev_width = width
105 |             prev_height = height
106 |             prev_filters = filters
107 |             out_widths.append(prev_width)
108 |             out_heights.append(prev_height)
109 |             out_filters.append(prev_filters)
110 |         elif block['type'] == 'upsample':
111 |             stride = int(block['stride'])
112 |             filters = prev_filters
113 |             width = prev_width*stride
114 |             height = prev_height*stride
115 |             print('%5d %-6s           * %d   %3d x %3d x%4d   ->   %3d x %3d x%4d' % (ind, 'upsample', stride, prev_width, prev_height, prev_filters, width, height, filters))
116 |             prev_width = width
117 |             prev_height = height
118 |             prev_filters = filters
119 |             out_widths.append(prev_width)
120 |             out_heights.append(prev_height)
121 |             out_filters.append(prev_filters)
122 |         elif block['type'] == 'route':
123 |             layers = block['layers'].split(',')
124 |             layers = [int(i) if int(i) > 0 else int(i)+ind for i in layers]
125 |             if len(layers) == 1:
126 |                 print('%5d %-6s %d' % (ind, 'route', layers[0]))
127 |                 prev_width = out_widths[layers[0]]
128 |                 prev_height = out_heights[layers[0]]
129 |                 prev_filters = out_filters[layers[0]]
130 |             elif len(layers) == 2:
131 |                 print('%5d %-6s %d %d' % (ind, 'route', layers[0], layers[1]))
132 |                 prev_width = out_widths[layers[0]]
133 |                 prev_height = out_heights[layers[0]]
134 |                 assert(prev_width == out_widths[layers[1]])
135 |                 assert(prev_height == out_heights[layers[1]])
136 |                 prev_filters = out_filters[layers[0]] + out_filters[layers[1]]
137 |             out_widths.append(prev_width)
138 |             out_heights.append(prev_height)
139 |             out_filters.append(prev_filters)
140 |         elif block['type'] in ['region', 'yolo']:
141 |             print('%5d %-6s' % (ind, 'detection'))
142 |             out_widths.append(prev_width)
143 |             out_heights.append(prev_height)
144 |             out_filters.append(prev_filters)
145 |         elif block['type'] == 'shortcut':
146 |             from_id = int(block['from'])
147 |             from_id = from_id if from_id > 0 else from_id+ind
148 |             print('%5d %-6s %d' % (ind, 'shortcut', from_id))
149 |             prev_width = out_widths[from_id]
150 |             prev_height = out_heights[from_id]
151 |             prev_filters = out_filters[from_id]
152 |             out_widths.append(prev_width)
153 |             out_heights.append(prev_height)
154 |             out_filters.append(prev_filters)
155 |         elif block['type'] == 'connected':
156 |             filters = int(block['output'])
157 |             print('%5d %-6s                            %d  ->  %3d' % (ind, 'connected', prev_filters, filters))
158 |             prev_filters = filters
159 |             out_widths.append(1)
160 |             out_heights.append(1)
161 |             out_filters.append(prev_filters)
162 |         else:
163 |             print('unknown type %s' % (block['type']))
164 | 
165 | def load_conv(buf, start, conv_model):
166 | 
167 |     num_w = conv_model.weight.numel()
168 |     num_b = conv_model.bias.numel()
169 |     #print("start: {}, num_w: {}, num_b: {}".format(start, num_w, num_b))
170 |     # by ysyun, use .view_as()
171 |     conv_model.bias.data.copy_(torch.from_numpy(buf[start:start+num_b]).view_as(conv_model.bias.data)); start = start + num_b
172 |     conv_model.weight.data.copy_(torch.from_numpy(buf[start:start+num_w]).view_as(conv_model.weight.data)); start = start + num_w
173 |     return start
174 | 
175 | def save_conv(fp, conv_model):
176 |     if conv_model.bias.is_cuda:
177 |         convert2cpu(conv_model.bias.data).numpy().tofile(fp)
178 |         convert2cpu(conv_model.weight.data).numpy().tofile(fp)
179 |     else:
180 |         conv_model.bias.data.numpy().tofile(fp)
181 |         conv_model.weight.data.numpy().tofile(fp)
182 | 
183 | def load_conv_bn(buf, start, conv_model, bn_model):
184 |     num_w = conv_model.weight.numel()
185 |     num_b = bn_model.bias.numel()
186 |     bn_model.bias.data.copy_(torch.from_numpy(buf[start:start+num_b])); start = start + num_b
187 |     bn_model.weight.data.copy_(torch.from_numpy(buf[start:start+num_b])); start = start + num_b
188 |     bn_model.running_mean.copy_(torch.from_numpy(buf[start:start+num_b])); start = start + num_b
189 |     bn_model.running_var.copy_(torch.from_numpy(buf[start:start+num_b])); start = start + num_b
190 |     #conv_model.weight.data.copy_(torch.from_numpy(buf[start:start+num_w])); start = start + num_w
191 |     conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).view_as(conv_model.weight.data)); start = start + num_w
192 |     return start
193 | 
194 | def save_conv_bn(fp, conv_model, bn_model):
195 |     if bn_model.bias.is_cuda:
196 |         convert2cpu(bn_model.bias.data).numpy().tofile(fp)
197 |         convert2cpu(bn_model.weight.data).numpy().tofile(fp)
198 |         convert2cpu(bn_model.running_mean).numpy().tofile(fp)
199 |         convert2cpu(bn_model.running_var).numpy().tofile(fp)
200 |         convert2cpu(conv_model.weight.data).numpy().tofile(fp)
201 |     else:
202 |         bn_model.bias.data.numpy().tofile(fp)
203 |         bn_model.weight.data.numpy().tofile(fp)
204 |         bn_model.running_mean.numpy().tofile(fp)
205 |         bn_model.running_var.numpy().tofile(fp)
206 |         conv_model.weight.data.numpy().tofile(fp)
207 | 
208 | def load_fc(buf, start, fc_model):
209 |     num_w = fc_model.weight.numel()
210 |     num_b = fc_model.bias.numel()
211 |     fc_model.bias.data.copy_(torch.from_numpy(buf[start:start+num_b])); start = start + num_b
212 |     fc_model.weight.data.copy_(torch.from_numpy(buf[start:start+num_w])); start = start + num_w
213 |     return start
214 | 
215 | def save_fc(fp, fc_model):
216 |     fc_model.bias.data.numpy().tofile(fp)
217 |     fc_model.weight.data.numpy().tofile(fp)
218 | 
219 | if __name__ == '__main__':
220 |     import sys
221 |     blocks = parse_cfg('cfg/yolo.cfg')
222 |     if len(sys.argv) == 2:
223 |         blocks = parse_cfg(sys.argv[1])
224 |     print_cfg(blocks)
225 | 
--------------------------------------------------------------------------------
/cfg/coco.data:
-------------------------------------------------------------------------------- 1 | train = coco_train.txt 2 | valid = coco_test.txt 3 | names = data/coco.names 4 | backup = backup 5 | gpus = 0,1,2,3 6 | -------------------------------------------------------------------------------- /cfg/darknet19_448.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | batch=128 3 | subdivisions=4 4 | height=448 5 | width=448 6 | max_crop=512 7 | channels=3 8 | momentum=0.9 9 | decay=0.0005 10 | 11 | learning_rate=0.001 12 | policy=poly 13 | power=4 14 | max_batches=100000 15 | 16 | angle=7 17 | hue = .1 18 | saturation=.75 19 | exposure=.75 20 | aspect=.75 21 | 22 | [convolutional] 23 | batch_normalize=1 24 | filters=32 25 | size=3 26 | stride=1 27 | pad=1 28 | activation=leaky 29 | 30 | [maxpool] 31 | size=2 32 | stride=2 33 | 34 | [convolutional] 35 | batch_normalize=1 36 | filters=64 37 | size=3 38 | stride=1 39 | pad=1 40 | activation=leaky 41 | 42 | [maxpool] 43 | size=2 44 | stride=2 45 | 46 | [convolutional] 47 | batch_normalize=1 48 | filters=128 49 | size=3 50 | stride=1 51 | pad=1 52 | activation=leaky 53 | 54 | [convolutional] 55 | batch_normalize=1 56 | filters=64 57 | size=1 58 | stride=1 59 | pad=1 60 | activation=leaky 61 | 62 | [convolutional] 63 | batch_normalize=1 64 | filters=128 65 | size=3 66 | stride=1 67 | pad=1 68 | activation=leaky 69 | 70 | [maxpool] 71 | size=2 72 | stride=2 73 | 74 | [convolutional] 75 | batch_normalize=1 76 | filters=256 77 | size=3 78 | stride=1 79 | pad=1 80 | activation=leaky 81 | 82 | [convolutional] 83 | batch_normalize=1 84 | filters=128 85 | size=1 86 | stride=1 87 | pad=1 88 | activation=leaky 89 | 90 | [convolutional] 91 | batch_normalize=1 92 | filters=256 93 | size=3 94 | stride=1 95 | pad=1 96 | activation=leaky 97 | 98 | [maxpool] 99 | size=2 100 | stride=2 101 | 102 | [convolutional] 103 | batch_normalize=1 104 | filters=512 105 | size=3 106 | stride=1 107 | pad=1 108 | activation=leaky 109 | 110 | [convolutional] 111 | batch_normalize=1 112 | filters=256 113 | size=1 114 | stride=1 115 | pad=1 116 | activation=leaky 117 | 118 | [convolutional] 119 | batch_normalize=1 120 | filters=512 121 | size=3 122 | stride=1 123 | pad=1 124 | activation=leaky 125 | 126 | [convolutional] 127 | batch_normalize=1 128 | filters=256 129 | size=1 130 | stride=1 131 | pad=1 132 | activation=leaky 133 | 134 | [convolutional] 135 | batch_normalize=1 136 | filters=512 137 | size=3 138 | stride=1 139 | pad=1 140 | activation=leaky 141 | 142 | [maxpool] 143 | size=2 144 | stride=2 145 | 146 | [convolutional] 147 | batch_normalize=1 148 | filters=1024 149 | size=3 150 | stride=1 151 | pad=1 152 | activation=leaky 153 | 154 | [convolutional] 155 | batch_normalize=1 156 | filters=512 157 | size=1 158 | stride=1 159 | pad=1 160 | activation=leaky 161 | 162 | [convolutional] 163 | batch_normalize=1 164 | filters=1024 165 | size=3 166 | stride=1 167 | pad=1 168 | activation=leaky 169 | 170 | [convolutional] 171 | batch_normalize=1 172 | filters=512 173 | size=1 174 | stride=1 175 | pad=1 176 | activation=leaky 177 | 178 | [convolutional] 179 | batch_normalize=1 180 | filters=1024 181 | size=3 182 | stride=1 183 | pad=1 184 | activation=leaky 185 | 186 | [convolutional] 187 | filters=1000 188 | size=1 189 | stride=1 190 | pad=1 191 | activation=linear 192 | 193 | [avgpool] 194 | 195 | [softmax] 196 | groups=1 197 | 198 | [cost] 199 | type=sse 200 | 201 | 
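
The Darknet cfg files in this directory (darknet19_448.cfg above and the yolo variants that follow) are the block lists consumed by parse_cfg and print_cfg in cfg.py. A minimal sketch of inspecting one of them with those helpers is shown below; it assumes the snippet is run from the repository root, and the chosen cfg file is only an example.

```
# Minimal sketch: parse a Darknet cfg file with the helpers from cfg.py (run from the repo root).
from cfg import parse_cfg, print_cfg

blocks = parse_cfg('cfg/yolo_v3.cfg')            # a list of dicts, one per [section]
net_block = blocks[0]                            # the [net] block holds global options as strings
print(net_block['width'], net_block['height'])   # e.g. 416 416
print(len(blocks) - 1, 'layer blocks follow [net]')
print_cfg(blocks)                                # layer-by-layer summary of the network
```
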
-------------------------------------------------------------------------------- /cfg/tiny-yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | batch=64 3 | subdivisions=8 4 | width=416 5 | height=416 6 | channels=3 7 | momentum=0.9 8 | decay=0.0005 9 | angle=0 10 | saturation = 1.5 11 | exposure = 1.5 12 | hue=.1 13 | 14 | learning_rate=0.001 15 | max_batches = 40200 16 | policy=steps 17 | steps=-1,100,20000,30000 18 | scales=.1,10,.1,.1 19 | 20 | [convolutional] 21 | batch_normalize=1 22 | filters=16 23 | size=3 24 | stride=1 25 | pad=1 26 | activation=leaky 27 | 28 | [maxpool] 29 | size=2 30 | stride=2 31 | 32 | [convolutional] 33 | batch_normalize=1 34 | filters=32 35 | size=3 36 | stride=1 37 | pad=1 38 | activation=leaky 39 | 40 | [maxpool] 41 | size=2 42 | stride=2 43 | 44 | [convolutional] 45 | batch_normalize=1 46 | filters=64 47 | size=3 48 | stride=1 49 | pad=1 50 | activation=leaky 51 | 52 | [maxpool] 53 | size=2 54 | stride=2 55 | 56 | [convolutional] 57 | batch_normalize=1 58 | filters=128 59 | size=3 60 | stride=1 61 | pad=1 62 | activation=leaky 63 | 64 | [maxpool] 65 | size=2 66 | stride=2 67 | 68 | [convolutional] 69 | batch_normalize=1 70 | filters=256 71 | size=3 72 | stride=1 73 | pad=1 74 | activation=leaky 75 | 76 | [maxpool] 77 | size=2 78 | stride=2 79 | 80 | [convolutional] 81 | batch_normalize=1 82 | filters=512 83 | size=3 84 | stride=1 85 | pad=1 86 | activation=leaky 87 | 88 | [maxpool] 89 | size=2 90 | stride=1 91 | 92 | [convolutional] 93 | batch_normalize=1 94 | filters=1024 95 | size=3 96 | stride=1 97 | pad=1 98 | activation=leaky 99 | 100 | ########### 101 | 102 | [convolutional] 103 | batch_normalize=1 104 | size=3 105 | stride=1 106 | pad=1 107 | filters=1024 108 | activation=leaky 109 | 110 | [convolutional] 111 | size=1 112 | stride=1 113 | pad=1 114 | filters=125 115 | activation=linear 116 | 117 | [region] 118 | anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 119 | bias_match=1 120 | classes=20 121 | coords=4 122 | num=5 123 | softmax=1 124 | jitter=.2 125 | rescore=1 126 | 127 | object_scale=5 128 | noobject_scale=1 129 | class_scale=1 130 | coord_scale=1 131 | 132 | absolute=1 133 | thresh = .6 134 | random=1 135 | -------------------------------------------------------------------------------- /cfg/tiny-yolo.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Training 3 | # batch=64 4 | # subdivisions=2 5 | # Testing 6 | batch=1 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | 
batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | size=3 110 | stride=1 111 | pad=1 112 | filters=512 113 | activation=leaky 114 | 115 | [convolutional] 116 | size=1 117 | stride=1 118 | pad=1 119 | filters=425 120 | activation=linear 121 | 122 | [region] 123 | anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 124 | bias_match=1 125 | classes=80 126 | coords=4 127 | num=5 128 | softmax=1 129 | jitter=.2 130 | rescore=0 131 | 132 | object_scale=5 133 | noobject_scale=1 134 | class_scale=1 135 | coord_scale=1 136 | 137 | absolute=1 138 | thresh = .6 139 | random=1 140 | 141 | -------------------------------------------------------------------------------- /cfg/voc.data: -------------------------------------------------------------------------------- 1 | train = voc_train.txt 2 | valid = 2007_test.txt 3 | names = data/voc.names 4 | backup = backup 5 | gpus = 0,1,2,3 6 | -------------------------------------------------------------------------------- /cfg/yolo-voc.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=64 4 | subdivisions=8 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | height=416 9 | width=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 80200 21 | policy=steps 22 | steps=-1,500,40000,60000 23 | scales=0.1,10,.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | 
activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=125 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 243 | bias_match=1 244 | classes=20 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /cfg/yolo.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=8 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=64 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=128 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [convolutional] 58 | batch_normalize=1 59 | filters=64 60 | size=1 61 | stride=1 62 | pad=1 63 | activation=leaky 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=1 70 | pad=1 71 | activation=leaky 72 | 73 | [maxpool] 74 | size=2 75 | stride=2 76 | 77 | [convolutional] 78 | batch_normalize=1 79 | 
filters=256 80 | size=3 81 | stride=1 82 | pad=1 83 | activation=leaky 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=128 88 | size=1 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=256 96 | size=3 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [maxpool] 102 | size=2 103 | stride=2 104 | 105 | [convolutional] 106 | batch_normalize=1 107 | filters=512 108 | size=3 109 | stride=1 110 | pad=1 111 | activation=leaky 112 | 113 | [convolutional] 114 | batch_normalize=1 115 | filters=256 116 | size=1 117 | stride=1 118 | pad=1 119 | activation=leaky 120 | 121 | [convolutional] 122 | batch_normalize=1 123 | filters=512 124 | size=3 125 | stride=1 126 | pad=1 127 | activation=leaky 128 | 129 | [convolutional] 130 | batch_normalize=1 131 | filters=256 132 | size=1 133 | stride=1 134 | pad=1 135 | activation=leaky 136 | 137 | [convolutional] 138 | batch_normalize=1 139 | filters=512 140 | size=3 141 | stride=1 142 | pad=1 143 | activation=leaky 144 | 145 | [maxpool] 146 | size=2 147 | stride=2 148 | 149 | [convolutional] 150 | batch_normalize=1 151 | filters=1024 152 | size=3 153 | stride=1 154 | pad=1 155 | activation=leaky 156 | 157 | [convolutional] 158 | batch_normalize=1 159 | filters=512 160 | size=1 161 | stride=1 162 | pad=1 163 | activation=leaky 164 | 165 | [convolutional] 166 | batch_normalize=1 167 | filters=1024 168 | size=3 169 | stride=1 170 | pad=1 171 | activation=leaky 172 | 173 | [convolutional] 174 | batch_normalize=1 175 | filters=512 176 | size=1 177 | stride=1 178 | pad=1 179 | activation=leaky 180 | 181 | [convolutional] 182 | batch_normalize=1 183 | filters=1024 184 | size=3 185 | stride=1 186 | pad=1 187 | activation=leaky 188 | 189 | 190 | ####### 191 | 192 | [convolutional] 193 | batch_normalize=1 194 | size=3 195 | stride=1 196 | pad=1 197 | filters=1024 198 | activation=leaky 199 | 200 | [convolutional] 201 | batch_normalize=1 202 | size=3 203 | stride=1 204 | pad=1 205 | filters=1024 206 | activation=leaky 207 | 208 | [route] 209 | layers=-9 210 | 211 | [convolutional] 212 | batch_normalize=1 213 | size=1 214 | stride=1 215 | pad=1 216 | filters=64 217 | activation=leaky 218 | 219 | [reorg] 220 | stride=2 221 | 222 | [route] 223 | layers=-1,-4 224 | 225 | [convolutional] 226 | batch_normalize=1 227 | size=3 228 | stride=1 229 | pad=1 230 | filters=1024 231 | activation=leaky 232 | 233 | [convolutional] 234 | size=1 235 | stride=1 236 | pad=1 237 | filters=425 238 | activation=linear 239 | 240 | 241 | [region] 242 | anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 243 | bias_match=1 244 | classes=80 245 | coords=4 246 | num=5 247 | softmax=1 248 | jitter=.3 249 | rescore=1 250 | 251 | object_scale=5 252 | noobject_scale=1 253 | class_scale=1 254 | coord_scale=1 255 | 256 | absolute=1 257 | thresh = .6 258 | random=1 259 | -------------------------------------------------------------------------------- /cfg/yolo_v3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | 
filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 
251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 
472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .5 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | 
pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .5 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .5 787 | truth_thresh = 1 788 | random=1 789 | 790 | -------------------------------------------------------------------------------- /data/coco.names: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /data/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andy-yun/pytorch-0.4-yolov3/7928dc9820dd7cae8e9b0c56d1d43a69976996af/data/dog.jpg -------------------------------------------------------------------------------- /data/eagle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andy-yun/pytorch-0.4-yolov3/7928dc9820dd7cae8e9b0c56d1d43a69976996af/data/eagle.jpg 
-------------------------------------------------------------------------------- /data/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andy-yun/pytorch-0.4-yolov3/7928dc9820dd7cae8e9b0c56d1d43a69976996af/data/giraffe.jpg -------------------------------------------------------------------------------- /data/horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andy-yun/pytorch-0.4-yolov3/7928dc9820dd7cae8e9b0c56d1d43a69976996af/data/horses.jpg -------------------------------------------------------------------------------- /data/person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andy-yun/pytorch-0.4-yolov3/7928dc9820dd7cae8e9b0c56d1d43a69976996af/data/person.jpg -------------------------------------------------------------------------------- /data/predictions-yolov2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andy-yun/pytorch-0.4-yolov3/7928dc9820dd7cae8e9b0c56d1d43a69976996af/data/predictions-yolov2.jpg -------------------------------------------------------------------------------- /data/predictions-yolov3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andy-yun/pytorch-0.4-yolov3/7928dc9820dd7cae8e9b0c56d1d43a69976996af/data/predictions-yolov3.jpg -------------------------------------------------------------------------------- /data/scream.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andy-yun/pytorch-0.4-yolov3/7928dc9820dd7cae8e9b0c56d1d43a69976996af/data/scream.jpg -------------------------------------------------------------------------------- /data/voc.names: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | boat 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | diningtable 12 | dog 13 | horse 14 | motorbike 15 | person 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor 21 | -------------------------------------------------------------------------------- /data/voc_label.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import pickle 3 | import os 4 | from os import listdir, getcwd 5 | from os.path import join 6 | 7 | sets=[('2012', 'train'), ('2012', 'val'), ('2007', 'train'), ('2007', 'val'), ('2007', 'test')] 8 | 9 | classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"] 10 | 11 | 12 | def convert(size, box): 13 | dw = 1./size[0] 14 | dh = 1./size[1] 15 | x = (box[0] + box[1])/2.0 16 | y = (box[2] + box[3])/2.0 17 | w = box[1] - box[0] 18 | h = box[3] - box[2] 19 | x = x*dw 20 | w = w*dw 21 | y = y*dh 22 | h = h*dh 23 | return (x,y,w,h) 24 | 25 | def convert_annotation(year, image_id): 26 | in_file = open('VOCdevkit/VOC%s/Annotations/%s.xml'%(year, image_id)) 27 | out_file = open('VOCdevkit/VOC%s/labels/%s.txt'%(year, image_id), 'w') 28 | tree=ET.parse(in_file) 29 | root = tree.getroot() 30 | size = root.find('size') 31 | w = int(size.find('width').text) 32 | h = int(size.find('height').text) 33 | 34 | for obj in 
root.iter('object'): 35 | difficult = obj.find('difficult').text 36 | cls = obj.find('name').text 37 | if cls not in classes or int(difficult) == 1: 38 | continue 39 | cls_id = classes.index(cls) 40 | xmlbox = obj.find('bndbox') 41 | b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text)) 42 | bb = convert((w,h), b) 43 | out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n') 44 | 45 | wd = getcwd() 46 | 47 | for year, image_set in sets: 48 | if not os.path.exists('VOCdevkit/VOC%s/labels/'%(year)): 49 | os.makedirs('VOCdevkit/VOC%s/labels/'%(year)) 50 | image_ids = open('VOCdevkit/VOC%s/ImageSets/Main/%s.txt'%(year, image_set)).read().strip().split() 51 | list_file = open('%s_%s.txt'%(year, image_set), 'w') 52 | for image_id in image_ids: 53 | list_file.write('%s/VOCdevkit/VOC%s/JPEGImages/%s.jpg\n'%(wd, year, image_id)) 54 | convert_annotation(year, image_id) 55 | list_file.close() 56 | 57 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # encoding: utf-8 3 | 4 | import os 5 | import random 6 | import torch 7 | import numpy as np 8 | from torch.utils.data import Dataset 9 | from PIL import Image 10 | from utils import read_truths_args, read_truths 11 | from image import * 12 | 13 | def custom_collate(batch): 14 | data = torch.stack([item[0] for item in batch], 0) 15 | targets = torch.stack([item[1] for item in batch], 0) 16 | return data, targets 17 | 18 | class listDataset(Dataset): 19 | def __init__(self, root, shape=None, shuffle=True, crop=False, jitter=0.3, hue=0.1, saturation=1.5, exposure=1.5, transform=None, target_transform=None, train=False, seen=0, batch_size=64, num_workers=4): 20 | with open(root, 'r') as file: 21 | self.lines = file.readlines() 22 | 23 | if shuffle: 24 | random.shuffle(self.lines) 25 | 26 | self.nSamples = len(self.lines) 27 | self.transform = transform 28 | self.target_transform = target_transform 29 | self.train = train 30 | self.shape = shape 31 | self.seen = seen 32 | self.batch_size = batch_size 33 | self.num_workers = num_workers 34 | 35 | self.crop = crop 36 | self.jitter = jitter 37 | self.hue = hue 38 | self.saturation = saturation 39 | self.exposure = exposure 40 | 41 | def __len__(self): 42 | return self.nSamples 43 | 44 | def get_different_scale(self): 45 | if self.seen < 4000*self.batch_size: 46 | wh = 13*32 # 416 47 | elif self.seen < 8000*self.batch_size: 48 | wh = (random.randint(0,3) + 13)*32 # 416, 480 49 | elif self.seen < 12000*self.batch_size: 50 | wh = (random.randint(0,5) + 12)*32 # 384, ..., 544 51 | elif self.seen < 16000*self.batch_size: 52 | wh = (random.randint(0,7) + 11)*32 # 352, ..., 576 53 | else: # self.seen < 20000*self.batch_size: 54 | wh = (random.randint(0,9) + 10)*32 # 320, ..., 608 55 | return (wh, wh) 56 | 57 | def __getitem__(self, index): 58 | assert index <= len(self), 'index range error' 59 | imgpath = self.lines[index].rstrip() 60 | 61 | if self.train: 62 | if self.seen % (self.batch_size * 10) == 0: # in paper, every 10 batches, but we did every 64 images 63 | self.shape = self.get_different_scale() 64 | img, label = load_data_detection(imgpath, self.shape, self.crop, self.jitter, self.hue, self.saturation, self.exposure) 65 | label = torch.from_numpy(label) 66 | else: 67 | img = Image.open(imgpath).convert('RGB') 68 | if self.shape: 69 | img, org_w, org_h = 
letterbox_image(img, self.shape[0], self.shape[1]), img.width, img.height 70 | 71 | labpath = imgpath.replace('images', 'labels').replace('JPEGImages', 'labels').replace('.jpg', '.txt').replace('.png','.txt') 72 | label = torch.zeros(50*5) 73 | #if os.path.getsize(labpath): 74 | #tmp = torch.from_numpy(np.loadtxt(labpath)) 75 | try: 76 | tmp = torch.from_numpy(read_truths_args(labpath, 8.0/img.width).astype('float32')) 77 | except Exception: 78 | tmp = torch.zeros(1,5) 79 | #tmp = torch.from_numpy(read_truths(labpath)) 80 | tmp = tmp.view(-1) 81 | tsz = tmp.numel() 82 | #print('labpath = %s , tsz = %d' % (labpath, tsz)) 83 | if tsz > 50*5: 84 | label = tmp[0:50*5] 85 | elif tsz > 0: 86 | label[0:tsz] = tmp 87 | 88 | if self.transform is not None: 89 | img = self.transform(img) 90 | 91 | if self.target_transform is not None: 92 | label = self.target_transform(label) 93 | 94 | self.seen = self.seen + self.num_workers 95 | if self.train: 96 | return (img, label) 97 | else: 98 | return (img, label, org_w, org_h) 99 | -------------------------------------------------------------------------------- /debug.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch.optim as optim 3 | import os 4 | import torch 5 | import numpy as np 6 | from darknet import Darknet 7 | from PIL import Image 8 | from utils import image2torch, convert2cpu 9 | 10 | cfgfile = 'face4.1re_95.91.cfg' 11 | weightfile = 'face4.1re_95.91.conv.15' 12 | imgpath = 'data/train/images/10002.png' 13 | labpath = imgpath.replace('images', 'labels').replace('JPEGImages', 'labels').replace('.jpg', '.txt').replace('.png','.txt') 14 | label = torch.zeros(50*5) 15 | if os.path.getsize(labpath): 16 | tmp = torch.from_numpy(np.loadtxt(labpath)) 17 | #tmp = torch.from_numpy(read_truths_args(labpath, 8.0/img.width)) 18 | #tmp = torch.from_numpy(read_truths(labpath)) 19 | tmp = tmp.view(-1) 20 | tsz = tmp.numel() 21 | #print('labpath = %s , tsz = %d' % (labpath, tsz)) 22 | if tsz > 50*5: 23 | label = tmp[0:50*5] 24 | elif tsz > 0: 25 | label[0:tsz] = tmp 26 | label = label.view(1, 50*5) 27 | 28 | m = Darknet(cfgfile) 29 | region_loss = m.loss 30 | m.load_weights(weightfile) 31 | 32 | print('--- bn weight ---') 33 | print(m.models[0][1].weight) 34 | print('--- bn bias ---') 35 | print(m.models[0][1].bias) 36 | print('--- bn running_mean ---') 37 | print(m.models[0][1].running_mean) 38 | print('--- bn running_var ---') 39 | print(m.models[0][1].running_var) 40 | 41 | m.train() 42 | m = m.cuda() 43 | 44 | optimizer = optim.SGD(m.parameters(), lr=1e-2, momentum=0.9, weight_decay=0.1) 45 | 46 | img = Image.open(imgpath) 47 | img = image2torch(img).cuda() 48 | 49 | target = label 50 | 51 | print('----- img ---------------------') 52 | print(img.data.storage()[0:100]) 53 | print('----- target -----------------') 54 | print(target.data.storage()[0:100]) 55 | 56 | optimizer.zero_grad() 57 | output = m(img) 58 | print('----- output ------------------') 59 | print(output.data.storage()[0:100]) 60 | exit() 61 | 62 | loss = region_loss(output, target) 63 | print('----- loss --------------------') 64 | print(loss) 65 | 66 | save_grad = None 67 | def extract(grad): 68 | global saved_grad 69 | saved_grad = convert2cpu(grad.data) 70 | 71 | output.register_hook(extract) 72 | loss.backward() 73 | 74 | saved_grad = saved_grad.view(-1) 75 | for i in xrange(saved_grad.size(0)): 76 | if abs(saved_grad[i]) >= 0.001: 77 | print('%d : %f' % (i, saved_grad[i])) 78 | 79 | 
print(m.state_dict().keys()) 80 | #print(m.models[0][0].weight.grad.data.storage()[0:100]) 81 | #print(m.models[14][0].weight.data.storage()[0:100]) 82 | weight = m.models[13][0].weight.data 83 | grad = m.models[13][0].weight.grad.data 84 | mask = torch.abs(grad) >= 0.1 85 | print(weight[mask]) 86 | print(grad[mask]) 87 | 88 | optimizer.step() 89 | weight2 = m.models[13][0].weight.data 90 | print(weight2[mask]) 91 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | from darknet import Darknet 3 | import cv2 4 | 5 | def demo(cfgfile, weightfile): 6 | m = Darknet(cfgfile) 7 | m.print_network() 8 | m.load_weights(weightfile) 9 | print('Loading weights from %s... Done!' % (weightfile)) 10 | 11 | if m.num_classes == 20: 12 | namesfile = 'data/voc.names' 13 | elif m.num_classes == 80: 14 | namesfile = 'data/coco.names' 15 | else: 16 | namesfile = 'data/names' 17 | print("{} is used for classification".format(namesfile)) 18 | class_names = load_class_names(namesfile) 19 | 20 | use_cuda = True 21 | if use_cuda: 22 | m.cuda() 23 | 24 | cap = cv2.VideoCapture(1) 25 | if not cap.isOpened(): 26 | print("Unable to open camera") 27 | exit(-1) 28 | 29 | while True: 30 | res, img = cap.read() 31 | if res: 32 | sized = cv2.resize(img, (m.width, m.height)) 33 | bboxes = do_detect(m, sized, 0.5, 0.4, use_cuda) 34 | print('------') 35 | draw_img = plot_boxes_cv2(img, bboxes, None, class_names) 36 | cv2.imshow(cfgfile, draw_img) 37 | cv2.waitKey(1) 38 | else: 39 | print("Unable to read image") 40 | exit(-1) 41 | 42 | ############################################ 43 | if __name__ == '__main__': 44 | if len(sys.argv) == 3: 45 | cfgfile = sys.argv[1] 46 | weightfile = sys.argv[2] 47 | demo(cfgfile, weightfile) 48 | #demo('cfg/tiny-yolo-voc.cfg', 'tiny-yolo-voc.weights') 49 | else: 50 | print('Usage:') 51 | print(' python demo.py cfgfile weightfile') 52 | print('') 53 | print(' perform detection on camera') 54 | -------------------------------------------------------------------------------- /detect.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | from PIL import Image, ImageDraw 4 | #from models.tiny_yolo import TinyYoloNet 5 | from utils import * 6 | from image import letterbox_image, correct_yolo_boxes 7 | from darknet import Darknet 8 | 9 | namesfile=None 10 | def detect(cfgfile, weightfile, imgfile): 11 | m = Darknet(cfgfile) 12 | 13 | m.print_network() 14 | m.load_weights(weightfile) 15 | print('Loading weights from %s... Done!' % (weightfile)) 16 | 17 | # if m.num_classes == 20: 18 | # namesfile = 'data/voc.names' 19 | # elif m.num_classes == 80: 20 | # namesfile = 'data/coco.names' 21 | # else: 22 | # namesfile = 'data/names' 23 | 24 | use_cuda = torch.cuda.is_available() 25 | if use_cuda: 26 | m.cuda() 27 | 28 | img = Image.open(imgfile).convert('RGB') 29 | sized = letterbox_image(img, m.width, m.height) 30 | 31 | start = time.time() 32 | boxes = do_detect(m, sized, 0.5, 0.4, use_cuda) 33 | correct_yolo_boxes(boxes, img.width, img.height, m.width, m.height) 34 | 35 | finish = time.time() 36 | print('%s: Predicted in %f seconds.' 
% (imgfile, (finish-start))) 37 | 38 | class_names = load_class_names(namesfile) 39 | plot_boxes(img, boxes, 'predictions.jpg', class_names) 40 | 41 | def detect_cv2(cfgfile, weightfile, imgfile): 42 | import cv2 43 | m = Darknet(cfgfile) 44 | 45 | m.print_network() 46 | m.load_weights(weightfile) 47 | print('Loading weights from %s... Done!' % (weightfile)) 48 | 49 | if m.num_classes == 20: 50 | namesfile = 'data/voc.names' 51 | elif m.num_classes == 80: 52 | namesfile = 'data/coco.names' 53 | else: 54 | namesfile = 'data/names' 55 | 56 | use_cuda = True 57 | if use_cuda: 58 | m.cuda() 59 | 60 | img = cv2.imread(imgfile) 61 | sized = cv2.resize(img, (m.width, m.height)) 62 | sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) 63 | 64 | for i in range(2): 65 | start = time.time() 66 | boxes = do_detect(m, sized, 0.5, 0.4, use_cuda) 67 | finish = time.time() 68 | if i == 1: 69 | print('%s: Predicted in %f seconds.' % (imgfile, (finish-start))) 70 | 71 | class_names = load_class_names(namesfile) 72 | plot_boxes_cv2(img, boxes, savename='predictions.jpg', class_names=class_names) 73 | 74 | def detect_skimage(cfgfile, weightfile, imgfile): 75 | from skimage import io 76 | from skimage.transform import resize 77 | m = Darknet(cfgfile) 78 | 79 | m.print_network() 80 | m.load_weights(weightfile) 81 | print('Loading weights from %s... Done!' % (weightfile)) 82 | 83 | if m.num_classes == 20: 84 | namesfile = 'data/voc.names' 85 | elif m.num_classes == 80: 86 | namesfile = 'data/coco.names' 87 | else: 88 | namesfile = 'data/names' 89 | 90 | use_cuda = True 91 | if use_cuda: 92 | m.cuda() 93 | 94 | img = io.imread(imgfile) 95 | sized = resize(img, (m.width, m.height)) * 255 96 | 97 | for i in range(2): 98 | start = time.time() 99 | boxes = do_detect(m, sized, 0.5, 0.4, use_cuda) 100 | finish = time.time() 101 | if i == 1: 102 | print('%s: Predicted in %f seconds.' 
% (imgfile, (finish-start))) 103 | 104 | class_names = load_class_names(namesfile) 105 | plot_boxes_cv2(img, boxes, savename='predictions.jpg', class_names=class_names) 106 | 107 | if __name__ == '__main__': 108 | if len(sys.argv) == 5: 109 | cfgfile = sys.argv[1] 110 | weightfile = sys.argv[2] 111 | imgfile = sys.argv[3] 112 | globals()["namesfile"] = sys.argv[4] 113 | detect(cfgfile, weightfile, imgfile) 114 | #detect_cv2(cfgfile, weightfile, imgfile) 115 | #detect_skimage(cfgfile, weightfile, imgfile) 116 | else: 117 | print('Usage: ') 118 | print(' python detect.py cfgfile weightfile imgfile names') 119 | #detect('cfg/tiny-yolo-voc.cfg', 'tiny-yolo-voc.weights', 'data/person.jpg', version=1) 120 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import time 4 | import torch 5 | from torchvision import datasets, transforms 6 | import os 7 | import dataset 8 | import random 9 | import math 10 | import numpy as np 11 | from utils import get_all_boxes, multi_bbox_ious, nms, read_data_cfg, logging 12 | from cfg import parse_cfg 13 | from darknet import Darknet 14 | import argparse 15 | from image import correct_yolo_boxes 16 | 17 | # etc parameters 18 | use_cuda = True 19 | seed = 22222 20 | eps = 1e-5 21 | 22 | # Test parameters 23 | conf_thresh = 0.25 24 | nms_thresh = 0.4 25 | iou_thresh = 0.5 26 | 27 | FLAGS = None 28 | 29 | def main(): 30 | # Training settings 31 | datacfg = FLAGS.data 32 | cfgfile = FLAGS.config 33 | 34 | data_options = read_data_cfg(datacfg) 35 | testlist = data_options['valid'] 36 | gpus = data_options['gpus'] # e.g. 0,1,2,3 37 | ngpus = len(gpus.split(',')) 38 | 39 | num_workers = int(data_options['num_workers']) 40 | # for testing, batch_size is setted to 1 (one) 41 | batch_size = 1 # int(net_options['batch']) 42 | 43 | global use_cuda 44 | use_cuda = torch.cuda.is_available() and (True if use_cuda is None else use_cuda) 45 | 46 | ############### 47 | torch.manual_seed(seed) 48 | if use_cuda: 49 | os.environ['CUDA_VISIBLE_DEVICES'] = gpus 50 | torch.cuda.manual_seed(seed) 51 | 52 | global model 53 | model = Darknet(cfgfile) 54 | #model.print_network() 55 | 56 | init_width = model.width 57 | init_height = model.height 58 | 59 | kwargs = {'num_workers': num_workers, 'pin_memory': True} if use_cuda else {} 60 | 61 | global test_loader 62 | test_loader = torch.utils.data.DataLoader( 63 | dataset.listDataset(testlist, shape=(init_width, init_height), 64 | shuffle=False, 65 | transform=transforms.Compose([ 66 | transforms.ToTensor(), 67 | ]), train=False), 68 | batch_size=batch_size, shuffle=False, **kwargs) 69 | 70 | if use_cuda: 71 | if ngpus > 1: 72 | model = torch.nn.DataParallel(model) 73 | model = model.module 74 | model = model.to(torch.device("cuda" if use_cuda else "cpu")) 75 | for w in FLAGS.weights: 76 | model.load_weights(w) 77 | logging('evaluating ... 
%s' % (w)) 78 | test() 79 | 80 | def test(): 81 | def truths_length(truths): 82 | for i in range(50): 83 | if truths[i][1] == 0: 84 | return i 85 | return 50 86 | 87 | model.eval() 88 | num_classes = model.num_classes 89 | total = 0.0 90 | proposals = 0.0 91 | correct = 0.0 92 | device = torch.device("cuda" if use_cuda else "cpu") 93 | 94 | if model.net_name() == 'region': # region_layer 95 | shape=(0,0) 96 | else: 97 | shape=(model.width, model.height) 98 | for data, target, org_w, org_h in test_loader: 99 | data = data.to(device) 100 | output = model(data) 101 | all_boxes = get_all_boxes(output, shape, conf_thresh, num_classes, use_cuda=use_cuda) 102 | 103 | for k in range(len(all_boxes)): 104 | boxes = all_boxes[k] 105 | correct_yolo_boxes(boxes, org_w[k], org_h[k], model.width, model.height) 106 | boxes = np.array(nms(boxes, nms_thresh)) 107 | truths = target[k].view(-1, 5) 108 | num_gts = truths_length(truths) 109 | total = total + num_gts 110 | num_pred = len(boxes) 111 | if num_pred == 0: 112 | continue 113 | 114 | proposals += int((boxes[:,4]>conf_thresh).sum()) 115 | for i in range(num_gts): 116 | gt_boxes = torch.FloatTensor([truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0, 1.0, truths[i][0]]) 117 | gt_boxes = gt_boxes.repeat(num_pred,1).t() 118 | pred_boxes = torch.FloatTensor(boxes).t() 119 | best_iou, best_j = torch.max(multi_bbox_ious(gt_boxes, pred_boxes, x1y1x2y2=False),0) 120 | # pred_boxes and gt_boxes are transposed for torch.max 121 | if best_iou > iou_thresh and pred_boxes[6][best_j] == gt_boxes[6][0]: 122 | correct += 1 123 | 124 | precision = 1.0*correct/(proposals+eps) 125 | recall = 1.0*correct/(total+eps) 126 | fscore = 2.0*precision*recall/(precision+recall+eps) 127 | logging("correct: %d, precision: %f, recall: %f, fscore: %f" % (correct, precision, recall, fscore)) 128 | 129 | if __name__ == '__main__': 130 | parser = argparse.ArgumentParser() 131 | parser.add_argument('--data', '-d', type=str, 132 | default='cfg/sketch.data', help='data definition file') 133 | parser.add_argument('--config', '-c', type=str, 134 | default='cfg/sketch.cfg', help='network configuration file') 135 | parser.add_argument('--weights', '-w', type=str, nargs='+', 136 | default=['weights/yolov3.weights'], help='initial weights file') 137 | FLAGS, _ = parser.parse_known_args() 138 | 139 | main() 140 | -------------------------------------------------------------------------------- /focal_loss.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # -------------------------------------------------------- 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Chao CHEN (chaochancs@gmail.com) 6 | # Created On: 2017-08-11 7 | # -------------------------------------------------------- 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | class FocalLoss(nn.Module): 13 | r""" 14 | This criterion is a implemenation of Focal Loss, which is proposed in 15 | Focal Loss for Dense Object Detection. 16 | 17 | Loss(x, class) = - \alpha (1-softmax(x)[class])^gamma \log(softmax(x)[class]) 18 | 19 | The losses are averaged across observations for each minibatch. 
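(Worked example from the formula above: with the default all-ones alpha and gamma = 0 the criterion reduces to the usual cross entropy -log(softmax(x)[class]); with gamma = 2, an already well-classified sample with softmax(x)[class] = 0.9 contributes only (1 - 0.9)^2 * -log(0.9) ≈ 0.001 instead of -log(0.9) ≈ 0.105, so the loss concentrates on hard, misclassified examples.)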
20 | 21 | Args: 22 | alpha(1D Tensor) : the scalar factor for this criterion 23 | gamma(float, double) : gamma > 0; reduces the relative loss for well-classified examples (p > .5), 24 | putting more focus on hard, misclassified examples 25 | size_average(bool): By default, the losses are averaged over observations for each minibatch. 26 | However, if the field size_average is set to False, the losses are 27 | instead summed for each minibatch. 28 | 29 | """ 30 | def __init__(self, class_num, alpha=None, gamma=2, size_average=True): 31 | super(FocalLoss, self).__init__() 32 | if alpha is None: 33 | self.alpha = torch.ones(class_num, 1) 34 | else: 35 | self.alpha = alpha 36 | self.gamma = gamma 37 | self.class_num = class_num 38 | self.size_average = size_average 39 | 40 | def forward(self, inputs, targets): 41 | N = inputs.size(0) 42 | print(N) 43 | C = inputs.size(1) 44 | P = F.softmax(inputs, dim=1) 45 | 46 | class_mask = inputs.data.new(N, C).fill_(0) 47 | ids = targets.view(-1, 1) 48 | class_mask.scatter_(1, ids.data, 1.) 49 | #print(class_mask) 50 | 51 | 52 | if inputs.is_cuda and not self.alpha.is_cuda: 53 | self.alpha = self.alpha.cuda() 54 | alpha = self.alpha[ids.data.view(-1)] 55 | 56 | probs = (P*class_mask).sum(1).view(-1,1) 57 | 58 | log_p = probs.log() 59 | #print('probs size= {}'.format(probs.size())) 60 | #print(probs) 61 | 62 | batch_loss = -alpha*(torch.pow((1-probs), self.gamma))*log_p 63 | #print('-----batch_loss------') 64 | #print(batch_loss) 65 | 66 | 67 | if self.size_average: 68 | loss = batch_loss.mean() 69 | else: 70 | loss = batch_loss.sum() 71 | return loss 72 | 73 | 74 | 75 | if __name__ == "__main__": 76 | alpha = torch.rand(21, 1) 77 | print(alpha) 78 | FL = FocalLoss(class_num=5, gamma=0 ) 79 | CE = nn.CrossEntropyLoss() 80 | N = 4 81 | C = 5 82 | inputs = torch.rand(N, C, requires_grad=True) 83 | targets = torch.LongTensor(N).random_(C) 84 | inputs_fl = inputs.clone() 85 | targets_fl = targets.clone() 86 | 87 | inputs_ce = inputs.clone() 88 | targets_ce = targets.clone() 89 | print('----inputs----') 90 | print(inputs) 91 | print('---target-----') 92 | print(targets) 93 | 94 | fl_loss = FL(inputs_fl, targets_fl) 95 | ce_loss = CE(inputs_ce, targets_ce) 96 | print('ce = {}, fl ={}'.format(ce_loss.item(), fl_loss.item())) 97 | fl_loss.backward() 98 | ce_loss.backward() 99 | #print(inputs_fl.grad.data) 100 | print(inputs_ce.grad.data) 101 | -------------------------------------------------------------------------------- /image.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # encoding: utf-8 3 | import os 4 | from PIL import Image, ImageFile 5 | import numpy as np 6 | 7 | # to avoid image file truncation error 8 | ImageFile.LOAD_TRUNCATED_IMAGES = True 9 | 10 | def scale_image_channel(im, c, v): 11 | cs = list(im.split()) 12 | cs[c] = cs[c].point(lambda i: i * v) 13 | out = Image.merge(im.mode, tuple(cs)) 14 | return out 15 | 16 | def image_scale_and_shift(img, new_w, new_h, net_w, net_h, dx, dy): 17 | scaled = img.resize((new_w, new_h)) 18 | # find to be cropped area 19 | sx, sy = -dx if dx < 0 else 0, -dy if dy < 0 else 0 20 | ex, ey = new_w if sx+new_w<=net_w else net_w-sx, new_h if sy+new_h<=net_h else net_h-sy 21 | scaled = scaled.crop((sx, sy, ex, ey)) 22 | 23 | # find the paste position 24 | sx, sy = dx if dx > 0 else 0, dy if dy > 0 else 0 25 | assert sx+scaled.width<=net_w and sy+scaled.height<=net_h 26 | new_img = Image.new("RGB", (net_w, net_h), (127, 127, 127)) 27 | 
new_img.paste(scaled, (sx, sy)) 28 | del scaled 29 | return new_img 30 | 31 | def image_scale_and_shift_nosafe(img, new_w, new_h, net_w, net_h, dx, dy): 32 | scaled = img.resize((new_w, new_h)) 33 | new_img = Image.new("RGB", (net_w, net_h), (127, 127, 127)) 34 | new_img.paste(scaled, (dx, dy)) 35 | del scaled 36 | return new_img 37 | 38 | def image_scale_and_shift_slow(img, new_w, new_h, net_w, net_h, dx, dy): 39 | scaled = np.array(img.resize((new_w, new_h))) 40 | # scaled.size : [height, width, channel] 41 | 42 | if dx > 0: 43 | shifted = np.pad(scaled, ((0,0), (dx,0), (0,0)), mode='constant', constant_values=127) 44 | else: 45 | shifted = scaled[:,-dx:,:] 46 | 47 | if (new_w + dx) < net_w: 48 | shifted = np.pad(shifted, ((0,0), (0, net_w - (new_w+dx)), (0,0)), mode='constant', constant_values=127) 49 | 50 | if dy > 0: 51 | shifted = np.pad(shifted, ((dy,0), (0,0), (0,0)), mode='constant', constant_values=127) 52 | else: 53 | shifted = shifted[-dy:,:,:] 54 | 55 | if (new_h + dy) < net_h: 56 | shifted = np.pad(shifted, ((0, net_h - (new_h+dy)), (0,0), (0,0)), mode='constant', constant_values=127) 57 | #print("scaled: {} ==> dx {} dy {} for shifted: {}".format(scaled.shape, dx, dy, shifted.shape)) 58 | return Image.fromarray(shifted[:net_h, :net_w,:]) 59 | 60 | def distort_image(im, hue, sat, val): 61 | im = im.convert('HSV') 62 | cs = list(im.split()) 63 | cs[1] = cs[1].point(lambda i: i * sat) 64 | cs[2] = cs[2].point(lambda i: i * val) 65 | 66 | def change_hue(x): 67 | x += hue*255 68 | if x > 255: 69 | x -= 255 70 | if x < 0: 71 | x += 255 72 | return x 73 | cs[0] = cs[0].point(change_hue) 74 | im = Image.merge(im.mode, tuple(cs)) 75 | 76 | im = im.convert('RGB') 77 | #constrain_image(im) 78 | return im 79 | 80 | def rand_scale(s): 81 | scale = np.random.uniform(1, s) 82 | if np.random.randint(2): 83 | return scale 84 | return 1./scale 85 | 86 | def random_distort_image(im, hue, saturation, exposure): 87 | dhue = np.random.uniform(-hue, hue) 88 | dsat = rand_scale(saturation) 89 | dexp = rand_scale(exposure) 90 | res = distort_image(im, dhue, dsat, dexp) 91 | return res 92 | 93 | def data_augmentation_crop(img, shape, jitter, hue, saturation, exposure): 94 | oh = img.height 95 | ow = img.width 96 | 97 | dw =int(ow*jitter) 98 | dh =int(oh*jitter) 99 | 100 | pleft = np.random.randint(-dw, dw) 101 | pright = np.random.randint(-dw, dw) 102 | ptop = np.random.randint(-dh, dh) 103 | pbot = np.random.randint(-dh, dh) 104 | 105 | swidth = ow - pleft - pright 106 | sheight = oh - ptop - pbot 107 | 108 | sx = ow / float(swidth) 109 | sy = oh / float(sheight) 110 | 111 | flip = np.random.randint(2) 112 | 113 | cropbb = np.array([pleft, ptop, pleft + swidth - 1, ptop + sheight - 1]) 114 | # following two lines are old method. 
out of image boundary is filled with black (0,0,0) 115 | #cropped = img.crop( cropbb ) 116 | #sized = cropped.resize(shape) 117 | 118 | nw, nh = cropbb[2]-cropbb[0], cropbb[3]-cropbb[1] 119 | # get the real image part 120 | cropbb[0] = -min(cropbb[0], 0) 121 | cropbb[1] = -min(cropbb[1], 0) 122 | cropbb[2] = min(cropbb[2], ow) 123 | cropbb[3] = min(cropbb[3], oh) 124 | cropped = img.crop( cropbb ) 125 | 126 | # calculate the position to paste 127 | bb = (pleft if pleft > 0 else 0, ptop if ptop > 0 else 0) 128 | new_img = Image.new("RGB", (nw, nh), (127,127,127)) 129 | new_img.paste(cropped, bb) 130 | 131 | sized = new_img.resize(shape) 132 | del cropped, new_img 133 | 134 | dx = (float(pleft)/ow) * sx 135 | dy = (float(ptop) /oh) * sy 136 | 137 | if flip: 138 | sized = sized.transpose(Image.FLIP_LEFT_RIGHT) 139 | img = random_distort_image(sized, hue, saturation, exposure) 140 | # for compatibility to nocrop version (like original version) 141 | return img, flip, dx, dy, sx, sy 142 | 143 | def data_augmentation_nocrop(img, shape, jitter, hue, sat, exp): 144 | net_w, net_h = shape 145 | img_w, img_h = img.width, img.height 146 | 147 | # determine the amount of scaling and cropping 148 | dw = jitter * img_w 149 | dh = jitter * img_h 150 | 151 | new_ar = (img_w + np.random.uniform(-dw, dw)) / (img_h + np.random.uniform(-dh, dh)) 152 | # scale = np.random.uniform(0.25, 2) 153 | scale = 1. 154 | 155 | if (new_ar < 1): 156 | new_h = int(scale * net_h) 157 | new_w = int(net_h * new_ar) 158 | else: 159 | new_w = int(scale * net_w) 160 | new_h = int(net_w / new_ar) 161 | 162 | dx = int(np.random.uniform(0, net_w - new_w)) 163 | dy = int(np.random.uniform(0, net_h - new_h)) 164 | sx, sy = new_w / net_w, new_h / net_h 165 | 166 | # apply scaling and shifting 167 | new_img = image_scale_and_shift(img, new_w, new_h, net_w, net_h, dx, dy) 168 | 169 | # randomly distort hsv space 170 | new_img = random_distort_image(new_img, hue, sat, exp) 171 | 172 | # randomly flip 173 | flip = np.random.randint(2) 174 | if flip: 175 | new_img = new_img.transpose(Image.FLIP_LEFT_RIGHT) 176 | 177 | dx, dy = dx/net_w, dy/net_h 178 | return new_img, flip, dx, dy, sx, sy 179 | 180 | def fill_truth_detection(labpath, crop, flip, dx, dy, sx, sy): 181 | max_boxes = 50 182 | label = np.zeros((max_boxes,5)) 183 | if os.path.getsize(labpath): 184 | bs = np.loadtxt(labpath) 185 | if bs is None: 186 | return label 187 | bs = np.reshape(bs, (-1, 5)) 188 | cc = 0 189 | for i in range(bs.shape[0]): 190 | x1 = bs[i][1] - bs[i][3]/2 191 | y1 = bs[i][2] - bs[i][4]/2 192 | x2 = bs[i][1] + bs[i][3]/2 193 | y2 = bs[i][2] + bs[i][4]/2 194 | 195 | x1 = min(0.999, max(0, x1 * sx - dx)) 196 | y1 = min(0.999, max(0, y1 * sy - dy)) 197 | x2 = min(0.999, max(0, x2 * sx - dx)) 198 | y2 = min(0.999, max(0, y2 * sy - dy)) 199 | 200 | bs[i][1] = (x1 + x2)/2 # center x 201 | bs[i][2] = (y1 + y2)/2 # center y 202 | bs[i][3] = (x2 - x1) # width 203 | bs[i][4] = (y2 - y1) # height 204 | 205 | if flip: 206 | bs[i][1] = 0.999 - bs[i][1] 207 | 208 | # when crop is applied, we should check the cropped width/height ratio 209 | if bs[i][3] < 0.002 or bs[i][4] < 0.002 or \ 210 | (crop and (bs[i][3]/bs[i][4] > 20 or bs[i][4]/bs[i][3] > 20)): 211 | continue 212 | label[cc] = bs[i] 213 | cc += 1 214 | if cc >= 50: 215 | break 216 | 217 | label = np.reshape(label, (-1)) 218 | return label 219 | 220 | def letterbox_image(img, net_w, net_h): 221 | im_w, im_h = img.size 222 | if float(net_w)/float(im_w) < float(net_h)/float(im_h): 223 | new_w = net_w 224 | new_h = 
(im_h * net_w)//im_w 225 | else: 226 | new_w = (im_w * net_h)//im_h 227 | new_h = net_h 228 | resized = img.resize((new_w, new_h), Image.ANTIALIAS) 229 | lbImage = Image.new("RGB", (net_w, net_h), (127,127,127)) 230 | lbImage.paste(resized, \ 231 | ((net_w-new_w)//2, (net_h-new_h)//2, \ 232 | (net_w+new_w)//2, (net_h+new_h)//2)) 233 | return lbImage 234 | 235 | def correct_yolo_boxes(boxes, im_w, im_h, net_w, net_h): 236 | im_w, im_h = float(im_w), float(im_h) 237 | net_w, net_h = float(net_w), float(net_h) 238 | if net_w/im_w < net_h/im_h: 239 | new_w = net_w 240 | new_h = (im_h * net_w)/im_w 241 | else: 242 | new_w = (im_w * net_h)/im_h 243 | new_h = net_h 244 | 245 | xo, xs = (net_w - new_w)/(2*net_w), net_w/new_w 246 | yo, ys = (net_h - new_h)/(2*net_h), net_h/new_h 247 | for i in range(len(boxes)): 248 | b = boxes[i] 249 | b[0] = (b[0] - xo) * xs 250 | b[1] = (b[1] - yo) * ys 251 | b[2] *= xs 252 | b[3] *= ys 253 | return 254 | 255 | def load_data_detection(imgpath, shape, crop, jitter, hue, saturation, exposure): 256 | labpath = imgpath.replace('images', 'labels').replace('JPEGImages', 'labels').replace('.jpg', '.txt').replace('.png','.txt') 257 | 258 | ## data augmentation 259 | img = Image.open(imgpath).convert('RGB') 260 | if crop: # marvis version 261 | img,flip,dx,dy,sx,sy = data_augmentation_crop(img, shape, jitter, hue, saturation, exposure) 262 | else: # original version 263 | img,flip,dx,dy,sx,sy = data_augmentation_nocrop(img, shape, jitter, hue, saturation, exposure) 264 | label = fill_truth_detection(labpath, crop, flip, -dx, -dy, sx, sy) 265 | return img, label 266 | -------------------------------------------------------------------------------- /layers/batchnorm/Makefile: -------------------------------------------------------------------------------- 1 | GPU=1 2 | CUDNN=0 3 | 4 | ARCH= -gencode arch=compute_50,code=[sm_50,compute_50] \ 5 | -gencode arch=compute_52,code=[sm_52,compute_52] 6 | 7 | # This is what I use, uncomment if you know your arch and want to specify 8 | # ARCH= -gencode arch=compute_52,code=compute_52 9 | VPATH=./src/ 10 | OBJDIR=./obj/ 11 | 12 | CC=gcc 13 | NVCC=nvcc 14 | OPTS=-Ofast 15 | LDFLAGS= -lm -pthread 16 | COMMON= 17 | CFLAGS=-Wall -Wfatal-errors 18 | 19 | CFLAGS+=$(OPTS) 20 | 21 | ifeq ($(GPU), 1) 22 | COMMON+= -DGPU -I/usr/local/cuda/include/ 23 | CFLAGS+= -DGPU 24 | LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand 25 | endif 26 | 27 | ifeq ($(CUDNN), 1) 28 | COMMON+= -DCUDNN 29 | CFLAGS+= -DCUDNN 30 | LDFLAGS+= -lcudnn 31 | endif 32 | 33 | OBJ=blas.o cuda.o 34 | ifeq ($(GPU), 1) 35 | LDFLAGS+= -lstdc++ 36 | OBJ+=blas_kernels.o 37 | endif 38 | 39 | OBJS = $(addprefix $(OBJDIR), $(OBJ)) 40 | DEPS = $(wildcard src/*.h) Makefile 41 | 42 | all: obj $(OBJS) 43 | 44 | $(OBJDIR)%.o: %.c $(DEPS) 45 | $(CC) $(COMMON) $(CFLAGS) -fPIC -c $< -o $@ 46 | 47 | $(OBJDIR)%.o: %.cu $(DEPS) 48 | $(NVCC) $(ARCH) $(COMMON) -Xcompiler -fPIC --compiler-options "$(CFLAGS)" -c $< -o $@ 49 | 50 | obj: 51 | mkdir -p obj 52 | 53 | .PHONY: clean 54 | 55 | clean: 56 | rm -rf $(OBJS) $(EXEC) 57 | 58 | -------------------------------------------------------------------------------- /layers/batchnorm/bn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | from torch.nn.parameter import Parameter 5 | from torch.autograd import Function 6 | import bn_lib 7 | 8 | class BN2dFunc(Function): 9 | def __init__(self, running_mean, running_var, 
training, momentum, eps): 10 | self.running_mean = running_mean 11 | self.running_var = running_var 12 | self.training = training 13 | self.momentum = momentum 14 | self.eps = eps 15 | 16 | def forward(self, input, weight, bias): 17 | nB = input.size(0) 18 | nC = input.size(1) 19 | nH = input.size(2) 20 | nW = input.size(3) 21 | 22 | output = input.new(nB, nC, nH, nW) 23 | self.input = input 24 | self.weight = weight 25 | self.bias = bias 26 | self.x = input.new(nB, nC, nH, nW) 27 | self.x_norm = input.new(nB, nC, nH, nW) 28 | self.mean = input.new(nB, nC) 29 | self.var = input.new(nB, nC) 30 | 31 | if input.is_cuda: 32 | bn_lib.bn_forward_gpu(input, self.x, self.x_norm, self.mean, self.running_mean, self.var, self.running_var, weight, bias, self.training, output) 33 | else: 34 | bn_lib.bn_forward(input, self.x, self.x_norm, self.mean, self.running_mean, self.var, self.running_var, weight, bias, self.training, output) 35 | return output 36 | 37 | def backward(self, grad_output): 38 | nB = grad_output.size(0) 39 | nC = grad_output.size(1) 40 | nH = grad_output.size(2) 41 | nW = grad_output.size(3) 42 | grad_input = grad_output.new(nB, nC, nH, nW) 43 | grad_mean = grad_output.new(nC) 44 | grad_var = grad_output.new(nC) 45 | grad_weight = grad_output.new(nC) 46 | grad_bias = grad_output.new(nC) 47 | 48 | if grad_output.is_cuda: 49 | bn_lib.bn_backward_gpu(grad_output, self.input, self.x_norm, self.mean, grad_mean, self.var, grad_var, self.weight, grad_weight, self.bias, grad_bias, self.training, grad_input) 50 | else: 51 | bn_lib.bn_backward(grad_output, self.input, self.x_norm, self.mean, grad_mean, self.var, grad_var, self.weight, grad_weight, self.bias, grad_bias, self.training, grad_input) 52 | 53 | return grad_input, grad_weight, grad_bias 54 | 55 | class BN2d(nn.Module): 56 | def __init__(self, num_features, momentum=0.01, eps=1e-5): 57 | super(BN2d, self).__init__() 58 | self.num_features = num_features 59 | self.weight = Parameter(torch.Tensor(num_features)) 60 | self.bias = Parameter(torch.Tensor(num_features)) 61 | self.register_buffer('running_mean', torch.zeros(num_features)) 62 | self.register_buffer('running_var', torch.zeros(num_features)) 63 | self.momentum = momentum 64 | self.eps = eps 65 | 66 | self.running_mean.zero_() 67 | self.running_var.fill_(1) 68 | self.weight.data.uniform_() 69 | self.bias.data.zero_() 70 | 71 | def forward(self, input): 72 | #print('------------ BN2d input -------------') 73 | #print(input.data.storage()[0:10]) 74 | return BN2dFunc(self.running_mean, self.running_var, self.training, self.momentum, self.eps)(input, self.weight, self.bias) 75 | 76 | class BN2d_slow(nn.Module): 77 | def __init__(self, num_features, momentum=0.01): 78 | super(BN2d_slow, self).__init__() 79 | self.num_features = num_features 80 | self.weight = Parameter(torch.Tensor(num_features)) 81 | self.bias = Parameter(torch.Tensor(num_features)) 82 | self.register_buffer('running_mean', torch.zeros(num_features)) 83 | self.register_buffer('running_var', torch.zeros(num_features)) 84 | self.eps = 1e-5 85 | self.momentum = momentum 86 | 87 | self.running_mean.zero_() 88 | self.running_var.fill_(1) 89 | self.weight.data.uniform_() 90 | self.bias.data.zero_() 91 | def forward(self, x): 92 | nB = x.data.size(0) 93 | nC = x.data.size(1) 94 | nH = x.data.size(2) 95 | nW = x.data.size(3) 96 | samples = nB*nH*nW 97 | y = x.view(nB, nC, nH*nW).transpose(1,2).contiguous().view(-1,nC) 98 | if self.training: 99 | print('forward in training mode on autograd') 100 | m = Variable(y.mean(0).data, 
requires_grad=False) 101 | v = Variable(y.var(0).data, requires_grad=False) 102 | self.running_mean = (1-self.momentum)*self.running_mean + self.momentum * m.data.view(-1) 103 | self.running_var = (1-self.momentum)*self.running_var + self.momentum * v.data.view(-1) 104 | m = m.repeat(samples, 1) 105 | v = v.repeat(samples, 1)*(samples-1.0)/samples 106 | else: 107 | m = Variable(self.running_mean.repeat(samples, 1), requires_grad=False) 108 | v = Variable(self.running_var.repeat(samples, 1), requires_grad=False) 109 | w = self.weight.repeat(samples, 1) 110 | b = self.bias.repeat(samples, 1) 111 | y = (y - m)/(v+self.eps).sqrt() * w + b 112 | y = y.view(nB, nH*nW, nC).transpose(1,2).contiguous().view(nB, nC, nH, nW) 113 | return y 114 | 115 | 116 | if __name__ == '__main__': 117 | nB = 64 118 | nC = 3 119 | nH = 4 120 | nW = 4 121 | samples = nB*nH*nW 122 | a = torch.rand(nB,nC,nH,nW) 123 | a = Variable(a) 124 | nn_model = nn.BatchNorm2d(nC) 125 | dkn_model = BN2d(nC) 126 | atg_model = BN2d_slow(nC) 127 | 128 | nn_model.weight.data.fill_(1.0) 129 | nn_model.bias.data.zero_() 130 | dkn_model.weight.data.fill_(1.0) 131 | dkn_model.bias.data.zero_() 132 | atg_model.weight.data.fill_(1.0) 133 | atg_model.bias.data.zero_() 134 | nn_out_cpu = nn_model(a) 135 | dkn_out_cpu = dkn_model(a) 136 | atg_out_cpu = atg_model(a) 137 | 138 | 139 | 140 | a = a.cuda() 141 | nn_model.cuda() 142 | dkn_model.cuda() 143 | atg_model.cuda() 144 | 145 | nn_out_gpu = nn_model(a) 146 | dkn_out_gpu = dkn_model(a) 147 | atg_out_gpu = atg_model(a) 148 | 149 | print('--- nn cpu out ---') 150 | print(nn_out_cpu.data.storage()[0:10]) 151 | print('--- dkn cpu out ---') 152 | print(dkn_out_cpu.data.storage()[0:10]) 153 | print('--- atg cpu out ---') 154 | print(atg_out_cpu.data.storage()[0:10]) 155 | 156 | 157 | print('--- nn gpu out ---') 158 | print(nn_out_gpu.data.storage()[0:10]) 159 | print('--- dkn gpu out ---') 160 | print(dkn_out_gpu.data.storage()[0:10]) 161 | print('--- atg gpu out ---') 162 | print(atg_out_gpu.data.storage()[0:10]) 163 | -------------------------------------------------------------------------------- /layers/batchnorm/bn_lib/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._bn_lib import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | locals[symbol] = _wrap_function(fn, _ffi) 10 | __all__.append(symbol) 11 | 12 | _import_symbols(locals()) 13 | -------------------------------------------------------------------------------- /layers/batchnorm/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.ffi import create_extension 4 | 5 | 6 | sources = ['src/batchnorm.c'] 7 | headers = ['src/batchnorm.h'] 8 | defines = [] 9 | with_cuda = False 10 | 11 | if torch.cuda.is_available(): 12 | print('Including CUDA code.') 13 | #sources += ['src/cuda.c'] 14 | #headers += ['src/cuda.h'] 15 | defines += [('WITH_CUDA', None)] 16 | with_cuda = True 17 | 18 | this_file = os.path.dirname(os.path.realpath(__file__)) 19 | print(this_file) 20 | #extra_objects=[] 21 | extra_objects = ['obj/blas_kernels.o', 'obj/cuda.o', 'obj/blas.o'] 22 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 23 | 24 | ffi = create_extension( 25 | 'bn_lib', 26 | headers=headers, 27 | sources=sources, 28 | define_macros=defines, 29 | 
relative_to=__file__, 30 | with_cuda=with_cuda, 31 | extra_objects=extra_objects 32 | ) 33 | 34 | if __name__ == '__main__': 35 | ffi.build() 36 | -------------------------------------------------------------------------------- /layers/batchnorm/src/batchnorm.h: -------------------------------------------------------------------------------- 1 | 2 | void bn_forward(THFloatTensor* in_t, THFloatTensor* x_t, THFloatTensor* x_norm_t, THFloatTensor* mean_t, THFloatTensor* rolling_mean_t, THFloatTensor* variance_t, THFloatTensor* rolling_variance_t, THFloatTensor* scales_t, THFloatTensor* biases_t, int train, THFloatTensor* out_t); 3 | 4 | void bn_backward(THFloatTensor* grad_out_t, THFloatTensor* x_t, THFloatTensor* x_norm_t, THFloatTensor* mean_t, THFloatTensor* mean_delta_t, THFloatTensor* variance_t, THFloatTensor* variance_delta_t, THFloatTensor* scales_t,THFloatTensor* scale_delta_t, THFloatTensor* biases_t,THFloatTensor* bias_delta_t, int train, THFloatTensor* grad_in_t); 5 | 6 | void bn_forward_gpu(THCudaTensor* in_t, THCudaTensor* x_t, THCudaTensor* x_norm_t, THCudaTensor* mean_t, THCudaTensor* rolling_mean_t, THCudaTensor* variance_t, THCudaTensor* rolling_variance_t, THCudaTensor* scales_t, THCudaTensor* biases_t, int train, THCudaTensor* out_t); 7 | void bn_backward_gpu(THCudaTensor* grad_out_t, THCudaTensor* x_t, THCudaTensor* x_norm_t, THCudaTensor* mean_t, THCudaTensor* mean_delta_t, THCudaTensor* variance_t, THCudaTensor* variance_delta_t, THCudaTensor* scales_t,THCudaTensor* scale_delta_t, THCudaTensor* biases_t,THCudaTensor* bias_delta_t, int train, THCudaTensor* grad_in_t); 8 | -------------------------------------------------------------------------------- /layers/batchnorm/src/blas.c: -------------------------------------------------------------------------------- 1 | #include "blas.h" 2 | #include "math.h" 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out) 9 | { 10 | int b,i,j,k; 11 | int out_c = c/(stride*stride); 12 | 13 | for(b = 0; b < batch; ++b){ 14 | for(k = 0; k < c; ++k){ 15 | for(j = 0; j < h; ++j){ 16 | for(i = 0; i < w; ++i){ 17 | int in_index = i + w*(j + h*(k + c*b)); 18 | int c2 = k % out_c; 19 | int offset = k / out_c; 20 | int w2 = i*stride + offset % stride; 21 | int h2 = j*stride + offset / stride; 22 | int out_index = w2 + w*stride*(h2 + h*stride*(c2 + out_c*b)); 23 | if(forward) out[out_index] = x[in_index]; 24 | else out[in_index] = x[out_index]; 25 | } 26 | } 27 | } 28 | } 29 | } 30 | 31 | void flatten(float *x, int size, int layers, int batch, int forward) 32 | { 33 | float *swap = calloc(size*layers*batch, sizeof(float)); 34 | int i,c,b; 35 | for(b = 0; b < batch; ++b){ 36 | for(c = 0; c < layers; ++c){ 37 | for(i = 0; i < size; ++i){ 38 | int i1 = b*layers*size + c*size + i; 39 | int i2 = b*layers*size + i*layers + c; 40 | if (forward) swap[i2] = x[i1]; 41 | else swap[i1] = x[i2]; 42 | } 43 | } 44 | } 45 | memcpy(x, swap, size*layers*batch*sizeof(float)); 46 | free(swap); 47 | } 48 | 49 | void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c) 50 | { 51 | int i; 52 | for(i = 0; i < n; ++i){ 53 | c[i] = s[i]*a[i] + (1-s[i])*(b ? 
b[i] : 0); 54 | } 55 | } 56 | 57 | void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out) 58 | { 59 | int stride = w1/w2; 60 | int sample = w2/w1; 61 | assert(stride == h1/h2); 62 | assert(sample == h2/h1); 63 | if(stride < 1) stride = 1; 64 | if(sample < 1) sample = 1; 65 | int minw = (w1 < w2) ? w1 : w2; 66 | int minh = (h1 < h2) ? h1 : h2; 67 | int minc = (c1 < c2) ? c1 : c2; 68 | 69 | int i,j,k,b; 70 | for(b = 0; b < batch; ++b){ 71 | for(k = 0; k < minc; ++k){ 72 | for(j = 0; j < minh; ++j){ 73 | for(i = 0; i < minw; ++i){ 74 | int out_index = i*sample + w2*(j*sample + h2*(k + c2*b)); 75 | int add_index = i*stride + w1*(j*stride + h1*(k + c1*b)); 76 | out[out_index] += add[add_index]; 77 | } 78 | } 79 | } 80 | } 81 | } 82 | 83 | void mean_cpu(float *x, int batch, int filters, int spatial, float *mean) 84 | { 85 | float scale = 1./(batch * spatial); 86 | int i,j,k; 87 | for(i = 0; i < filters; ++i){ 88 | mean[i] = 0; 89 | for(j = 0; j < batch; ++j){ 90 | for(k = 0; k < spatial; ++k){ 91 | int index = j*filters*spatial + i*spatial + k; 92 | mean[i] += x[index]; 93 | } 94 | } 95 | mean[i] *= scale; 96 | } 97 | } 98 | 99 | void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance) 100 | { 101 | float scale = 1./(batch * spatial - 1); 102 | int i,j,k; 103 | for(i = 0; i < filters; ++i){ 104 | variance[i] = 0; 105 | for(j = 0; j < batch; ++j){ 106 | for(k = 0; k < spatial; ++k){ 107 | int index = j*filters*spatial + i*spatial + k; 108 | variance[i] += pow((x[index] - mean[i]), 2); 109 | } 110 | } 111 | variance[i] *= scale; 112 | } 113 | } 114 | 115 | void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial) 116 | { 117 | int b, f, i; 118 | for(b = 0; b < batch; ++b){ 119 | for(f = 0; f < filters; ++f){ 120 | for(i = 0; i < spatial; ++i){ 121 | int index = b*filters*spatial + f*spatial + i; 122 | x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f); 123 | } 124 | } 125 | } 126 | } 127 | 128 | void const_cpu(int N, float ALPHA, float *X, int INCX) 129 | { 130 | int i; 131 | for(i = 0; i < N; ++i) X[i*INCX] = ALPHA; 132 | } 133 | 134 | void mul_cpu(int N, float *X, int INCX, float *Y, int INCY) 135 | { 136 | int i; 137 | for(i = 0; i < N; ++i) Y[i*INCY] *= X[i*INCX]; 138 | } 139 | 140 | void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY) 141 | { 142 | int i; 143 | for(i = 0; i < N; ++i) Y[i*INCY] = pow(X[i*INCX], ALPHA); 144 | } 145 | 146 | void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY) 147 | { 148 | int i; 149 | for(i = 0; i < N; ++i) Y[i*INCY] += ALPHA*X[i*INCX]; 150 | } 151 | 152 | void scal_cpu(int N, float ALPHA, float *X, int INCX) 153 | { 154 | int i; 155 | for(i = 0; i < N; ++i) X[i*INCX] *= ALPHA; 156 | } 157 | 158 | void fill_cpu(int N, float ALPHA, float *X, int INCX) 159 | { 160 | int i; 161 | for(i = 0; i < N; ++i) X[i*INCX] = ALPHA; 162 | } 163 | 164 | void copy_cpu(int N, float *X, int INCX, float *Y, int INCY) 165 | { 166 | int i; 167 | for(i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX]; 168 | } 169 | 170 | void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error) 171 | { 172 | int i; 173 | for(i = 0; i < n; ++i){ 174 | float diff = truth[i] - pred[i]; 175 | float abs_val = fabs(diff); 176 | if(abs_val < 1) { 177 | error[i] = diff * diff; 178 | delta[i] = diff; 179 | } 180 | else { 181 | error[i] = 2*abs_val - 1; 182 | delta[i] = (diff < 0) ? 
-1 : 1; 183 | } 184 | } 185 | } 186 | 187 | void l2_cpu(int n, float *pred, float *truth, float *delta, float *error) 188 | { 189 | int i; 190 | for(i = 0; i < n; ++i){ 191 | float diff = truth[i] - pred[i]; 192 | error[i] = diff * diff; 193 | delta[i] = diff; 194 | } 195 | } 196 | 197 | float dot_cpu(int N, float *X, int INCX, float *Y, int INCY) 198 | { 199 | int i; 200 | float dot = 0; 201 | for(i = 0; i < N; ++i) dot += X[i*INCX] * Y[i*INCY]; 202 | return dot; 203 | } 204 | 205 | void softmax(float *input, int n, float temp, float *output) 206 | { 207 | int i; 208 | float sum = 0; 209 | float largest = -FLT_MAX; 210 | for(i = 0; i < n; ++i){ 211 | if(input[i] > largest) largest = input[i]; 212 | } 213 | for(i = 0; i < n; ++i){ 214 | float e = exp(input[i]/temp - largest/temp); 215 | sum += e; 216 | output[i] = e; 217 | } 218 | for(i = 0; i < n; ++i){ 219 | output[i] /= sum; 220 | } 221 | } 222 | 223 | -------------------------------------------------------------------------------- /layers/batchnorm/src/blas.h: -------------------------------------------------------------------------------- 1 | #ifndef BLAS_H 2 | #define BLAS_H 3 | void flatten(float *x, int size, int layers, int batch, int forward); 4 | void pm(int M, int N, float *A); 5 | float *random_matrix(int rows, int cols); 6 | void time_random_matrix(int TA, int TB, int m, int k, int n); 7 | void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out); 8 | 9 | void test_blas(); 10 | 11 | void const_cpu(int N, float ALPHA, float *X, int INCX); 12 | void constrain_ongpu(int N, float ALPHA, float * X, int INCX); 13 | void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY); 14 | void mul_cpu(int N, float *X, int INCX, float *Y, int INCY); 15 | 16 | void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY); 17 | void copy_cpu(int N, float *X, int INCX, float *Y, int INCY); 18 | void scal_cpu(int N, float ALPHA, float *X, int INCX); 19 | void fill_cpu(int N, float ALPHA, float * X, int INCX); 20 | float dot_cpu(int N, float *X, int INCX, float *Y, int INCY); 21 | void test_gpu_blas(); 22 | void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out); 23 | 24 | void mean_cpu(float *x, int batch, int filters, int spatial, float *mean); 25 | void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance); 26 | void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial); 27 | 28 | void scale_bias(float *output, float *scales, int batch, int n, int size); 29 | void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates); 30 | void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta); 31 | void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta); 32 | void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta); 33 | 34 | void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error); 35 | void l2_cpu(int n, float *pred, float *truth, float *delta, float *error); 36 | void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c); 37 | 38 | void softmax(float *input, int n, float temp, float *output); 39 | 40 | #ifdef GPU 41 | #include "cuda.h" 42 | 43 | void axpy_ongpu(int N, 
float ALPHA, float * X, int INCX, float * Y, int INCY); 44 | void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY); 45 | void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY); 46 | void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY); 47 | void scal_ongpu(int N, float ALPHA, float * X, int INCX); 48 | void supp_ongpu(int N, float ALPHA, float * X, int INCX); 49 | void mask_ongpu(int N, float * X, float mask_num, float * mask); 50 | void const_ongpu(int N, float ALPHA, float *X, int INCX); 51 | void pow_ongpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY); 52 | void mul_ongpu(int N, float *X, int INCX, float *Y, int INCY); 53 | void fill_ongpu(int N, float ALPHA, float * X, int INCX); 54 | 55 | void mean_gpu(float *x, int batch, int filters, int spatial, float *mean); 56 | void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance); 57 | void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial); 58 | 59 | void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta); 60 | 61 | void fast_mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta); 62 | void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta); 63 | 64 | void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance); 65 | void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean); 66 | void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out); 67 | void scale_bias_gpu(float *output, float *biases, int batch, int n, int size); 68 | void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates); 69 | void scale_bias_gpu(float *output, float *biases, int batch, int n, int size); 70 | void add_bias_gpu(float *output, float *biases, int batch, int n, int size); 71 | void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size); 72 | 73 | void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error); 74 | void l2_gpu(int n, float *pred, float *truth, float *delta, float *error); 75 | void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc); 76 | void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c); 77 | void mult_add_into_gpu(int num, float *a, float *b, float *c); 78 | 79 | void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out); 80 | 81 | void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output); 82 | void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t); 83 | 84 | void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out); 85 | 86 | #endif 87 | #endif 88 | -------------------------------------------------------------------------------- /layers/batchnorm/src/cuda.c: -------------------------------------------------------------------------------- 1 | int gpu_index = 0; 2 | 3 | #ifdef GPU 4 | 5 | #include "cuda.h" 6 | //#include "utils.h" 7 | #include "blas.h" 8 | #include "assert.h" 9 | #include 10 | #include 11 | 12 | void 
cuda_set_device(int n) 13 | { 14 | gpu_index = n; 15 | cudaError_t status = cudaSetDevice(n); 16 | check_error(status); 17 | } 18 | 19 | int cuda_get_device() 20 | { 21 | int n = 0; 22 | cudaError_t status = cudaGetDevice(&n); 23 | check_error(status); 24 | return n; 25 | } 26 | 27 | void check_error(cudaError_t status) 28 | { 29 | //cudaDeviceSynchronize(); 30 | cudaError_t status2 = cudaGetLastError(); 31 | if (status != cudaSuccess) 32 | { 33 | const char *s = cudaGetErrorString(status); 34 | char buffer[256]; 35 | printf("CUDA Error: %s\n", s); 36 | assert(0); 37 | snprintf(buffer, 256, "CUDA Error: %s", s); 38 | error(buffer); 39 | } 40 | if (status2 != cudaSuccess) 41 | { 42 | const char *s = cudaGetErrorString(status); 43 | char buffer[256]; 44 | printf("CUDA Error Prev: %s\n", s); 45 | assert(0); 46 | snprintf(buffer, 256, "CUDA Error Prev: %s", s); 47 | error(buffer); 48 | } 49 | } 50 | 51 | dim3 cuda_gridsize(size_t n){ 52 | size_t k = (n-1) / BLOCK + 1; 53 | size_t x = k; 54 | size_t y = 1; 55 | if(x > 65535){ 56 | x = ceil(sqrt(k)); 57 | y = (n-1)/(x*BLOCK) + 1; 58 | } 59 | dim3 d = {x, y, 1}; 60 | //printf("%ld %ld %ld %ld\n", n, x, y, x*y*BLOCK); 61 | return d; 62 | } 63 | 64 | #ifdef CUDNN 65 | cudnnHandle_t cudnn_handle() 66 | { 67 | static int init[16] = {0}; 68 | static cudnnHandle_t handle[16]; 69 | int i = cuda_get_device(); 70 | if(!init[i]) { 71 | cudnnCreate(&handle[i]); 72 | init[i] = 1; 73 | } 74 | return handle[i]; 75 | } 76 | #endif 77 | 78 | cublasHandle_t blas_handle() 79 | { 80 | static int init[16] = {0}; 81 | static cublasHandle_t handle[16]; 82 | int i = cuda_get_device(); 83 | if(!init[i]) { 84 | cublasCreate(&handle[i]); 85 | init[i] = 1; 86 | } 87 | return handle[i]; 88 | } 89 | 90 | float *cuda_make_array(float *x, size_t n) 91 | { 92 | float *x_gpu; 93 | size_t size = sizeof(float)*n; 94 | cudaError_t status = cudaMalloc((void **)&x_gpu, size); 95 | check_error(status); 96 | if(x){ 97 | status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice); 98 | check_error(status); 99 | } 100 | if(!x_gpu) error("Cuda malloc failed\n"); 101 | return x_gpu; 102 | } 103 | 104 | void cuda_random(float *x_gpu, size_t n) 105 | { 106 | static curandGenerator_t gen[16]; 107 | static int init[16] = {0}; 108 | int i = cuda_get_device(); 109 | if(!init[i]){ 110 | curandCreateGenerator(&gen[i], CURAND_RNG_PSEUDO_DEFAULT); 111 | curandSetPseudoRandomGeneratorSeed(gen[i], time(0)); 112 | init[i] = 1; 113 | } 114 | curandGenerateUniform(gen[i], x_gpu, n); 115 | check_error(cudaPeekAtLastError()); 116 | } 117 | 118 | float cuda_compare(float *x_gpu, float *x, size_t n, char *s) 119 | { 120 | float *tmp = calloc(n, sizeof(float)); 121 | cuda_pull_array(x_gpu, tmp, n); 122 | //int i; 123 | //for(i = 0; i < n; ++i) printf("%f %f\n", tmp[i], x[i]); 124 | axpy_cpu(n, -1, x, 1, tmp, 1); 125 | float err = dot_cpu(n, tmp, 1, tmp, 1); 126 | printf("Error %s: %f\n", s, sqrt(err/n)); 127 | free(tmp); 128 | return err; 129 | } 130 | 131 | int *cuda_make_int_array(size_t n) 132 | { 133 | int *x_gpu; 134 | size_t size = sizeof(int)*n; 135 | cudaError_t status = cudaMalloc((void **)&x_gpu, size); 136 | check_error(status); 137 | return x_gpu; 138 | } 139 | 140 | void cuda_free(float *x_gpu) 141 | { 142 | cudaError_t status = cudaFree(x_gpu); 143 | check_error(status); 144 | } 145 | 146 | void cuda_push_array(float *x_gpu, float *x, size_t n) 147 | { 148 | size_t size = sizeof(float)*n; 149 | cudaError_t status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice); 150 | check_error(status); 
151 | } 152 | 153 | void cuda_pull_array(float *x_gpu, float *x, size_t n) 154 | { 155 | size_t size = sizeof(float)*n; 156 | cudaError_t status = cudaMemcpy(x, x_gpu, size, cudaMemcpyDeviceToHost); 157 | check_error(status); 158 | } 159 | 160 | #endif 161 | -------------------------------------------------------------------------------- /layers/batchnorm/src/cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef CUDA_H 2 | #define CUDA_H 3 | 4 | extern int gpu_index; 5 | 6 | #ifdef GPU 7 | 8 | #define BLOCK 512 9 | 10 | #include "cuda_runtime.h" 11 | #include "curand.h" 12 | #include "cublas_v2.h" 13 | 14 | #ifdef CUDNN 15 | #include "cudnn.h" 16 | #endif 17 | 18 | void check_error(cudaError_t status); 19 | cublasHandle_t blas_handle(); 20 | float *cuda_make_array(float *x, size_t n); 21 | int *cuda_make_int_array(size_t n); 22 | void cuda_push_array(float *x_gpu, float *x, size_t n); 23 | void cuda_pull_array(float *x_gpu, float *x, size_t n); 24 | void cuda_set_device(int n); 25 | void cuda_free(float *x_gpu); 26 | void cuda_random(float *x_gpu, size_t n); 27 | float cuda_compare(float *x_gpu, float *x, size_t n, char *s); 28 | dim3 cuda_gridsize(size_t n); 29 | 30 | #ifdef CUDNN 31 | cudnnHandle_t cudnn_handle(); 32 | #endif 33 | 34 | #endif 35 | #endif 36 | -------------------------------------------------------------------------------- /models/caffe_net.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from torch.autograd import Variable 7 | from PIL import Image, ImageDraw 8 | import sys 9 | from collections import OrderedDict 10 | from utils import do_detect, plot_boxes, load_class_names 11 | sys.path.append('/home/xiaohang/caffe/python') 12 | sys.path.append('.') 13 | import caffe 14 | from region_loss import RegionLoss 15 | class Scale(nn.Module): 16 | def __init__(self): 17 | super(Scale, self).__init__() 18 | def forward(self, x): 19 | return x 20 | 21 | 22 | class Eltwise(nn.Module): 23 | def __init__(self, operation='+'): 24 | super(Eltwise, self).__init__() 25 | self.operation = operation 26 | 27 | def forward(self, input_feats): 28 | if isinstance(input_feats, tuple): 29 | print "error : The input of Eltwise layer must be a tuple" 30 | for i, feat in enumerate(input_feats): 31 | if x is None: 32 | x = feat 33 | continue 34 | if self.operation == '+': 35 | x += feat 36 | if self.operation == '*': 37 | x *= feat 38 | if self.operation == '/': 39 | x /= feat 40 | return x 41 | 42 | class Concat(nn.Module): 43 | def __init__(self): 44 | super(Concat, self).__init__() 45 | 46 | def forward(self, input_feats): 47 | if not isinstance(input_feats, tuple): 48 | print 'The input of Concat layer must be a tuple' 49 | self.length = len(input_feats) 50 | x = torch.cat(input_feats, 1) 51 | return x 52 | 53 | 54 | 55 | def parse_prototxt(protofile): 56 | def line_type(line): 57 | if line.find(':') >= 0: 58 | return 0 59 | elif line.find('{') >= 0: 60 | return 1 61 | return -1 62 | 63 | def parse_param_block(fp): 64 | block = dict() 65 | line = fp.readline().strip() 66 | while line != '}': 67 | ltype = line_type(line) 68 | if ltype == 0: # key: value 69 | key, value = line.split(':') 70 | key = key.strip() 71 | value = value.strip().strip('"') 72 | block[key] = value 73 | elif ltype == 1: # blockname { 74 | key = line.split('{')[0].strip() 75 | sub_block = parse_param_block(fp) 76 | 
block[key] = sub_block 77 | line = fp.readline().strip() 78 | return block 79 | 80 | def parse_layer_block(fp): 81 | block = dict() 82 | block['top'] = [] 83 | block['bottom'] = [] 84 | line = fp.readline().strip() 85 | while line != '}': 86 | ltype = line_type(line) 87 | if ltype == 0: # key: value 88 | key, value = line.split(':') 89 | key = key.strip() 90 | value = value.strip().strip('"') 91 | if key == 'top' or key == 'bottom': 92 | block[key].append(value) 93 | else: 94 | block[key] = value 95 | elif ltype == 1: # blockname { 96 | key = line.split('{')[0].strip() 97 | sub_block = parse_param_block(fp) 98 | block[key] = sub_block 99 | line = fp.readline().strip() 100 | return block 101 | 102 | fp = open(protofile, 'r') 103 | props = dict() 104 | layers = [] 105 | line = fp.readline() 106 | while line != '': 107 | ltype = line_type(line) 108 | if ltype == 0: # key: value 109 | key, value = line.split(':') 110 | key = key.strip() 111 | value = value.strip().strip('"') 112 | props[key] = value 113 | elif ltype == 1: # blockname { 114 | key = line.split('{')[0].strip() 115 | assert(key == 'layer' or key == 'input_shape') 116 | layer = parse_layer_block(fp) 117 | layers.append(layer) 118 | #print layer 119 | line = fp.readline() 120 | net_info = dict() 121 | net_info['props'] = props 122 | net_info['layers'] = layers 123 | #print net_info 124 | 125 | return net_info 126 | 127 | 128 | class CaffeNet(nn.Module): 129 | def __init__(self, protofile, caffemodel): 130 | super(CaffeNet, self).__init__() 131 | self.seen = 0 132 | self.num_classes = 1 133 | self.is_pretrained = True 134 | if not caffemodel is None: 135 | self.is_pretrained = True 136 | self.anchors = [0.625,0.750, 0.625,0.750, 0.625,0.750, \ 137 | 0.625,0.750, 0.625,0.750, 1.000,1.200, \ 138 | 1.000,1.200, 1.000,1.200, 1.000,1.200, \ 139 | 1.600,1.920, 2.560,3.072, 4.096,4.915, \ 140 | 6.554,7.864, 10.486,12.583] 141 | #self.anchors = [1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071] 142 | self.num_anchors = len(self.anchors)/2 143 | self.width = 480 144 | self.height = 320 145 | 146 | self.loss = RegionLoss(self.num_classes, self.anchors, self.num_anchors) 147 | 148 | self.net_info = parse_prototxt(protofile) 149 | self.models = self.create_network(self.net_info) 150 | self.modelList = nn.ModuleList() 151 | if self.is_pretrained: 152 | self.load_weigths_from_caffe(protofile, caffemodel) 153 | for name,model in self.models.items(): 154 | self.modelList.append(model) 155 | 156 | 157 | def load_weigths_from_caffe(self, protofile, caffemodel): 158 | caffe.set_mode_cpu() 159 | net = caffe.Net(protofile, caffemodel, caffe.TEST) 160 | for name, layer in self.models.items(): 161 | if isinstance(layer, nn.Conv2d): 162 | caffe_weight = net.params[name][0].data 163 | layer.weight.data = torch.from_numpy(caffe_weight) 164 | if len(net.params[name]) > 1: 165 | caffe_bias = net.params[name][1].data 166 | layer.bias.data = torch.from_numpy(caffe_bias) 167 | continue 168 | if isinstance(layer, nn.BatchNorm2d): 169 | caffe_means = net.params[name][0].data 170 | caffe_var = net.params[name][1].data 171 | layer.running_mean = torch.from_numpy(caffe_means) 172 | layer.running_var = torch.from_numpy(caffe_var) 173 | # find the scale layer 174 | top_name_of_bn = self.layer_map_to_top[name][0] 175 | scale_name = '' 176 | for caffe_layer in self.net_info['layers']: 177 | if caffe_layer['type'] == 'Scale' and caffe_layer['bottom'][0] == top_name_of_bn: 178 | scale_name = caffe_layer['name'] 179 | break 180 | if 
scale_name != '': 181 | caffe_weight = net.params[scale_name][0].data 182 | layer.weight.data = torch.from_numpy(caffe_weight) 183 | if len(net.params[name]) > 1: 184 | caffe_bias = net.params[scale_name][1].data 185 | layer.bias.data = torch.from_numpy(caffe_bias) 186 | 187 | 188 | 189 | def print_network(self): 190 | print(self.net_info) 191 | 192 | def create_network(self, net_info): 193 | #print net_info 194 | models = OrderedDict() 195 | top_dim = {'data': 3} 196 | self.layer_map_to_bottom = dict() 197 | self.layer_map_to_top = dict() 198 | 199 | for layer in net_info['layers']: 200 | name = layer['name'] 201 | ltype = layer['type'] 202 | 203 | if ltype == 'Data': 204 | continue 205 | if ltype == 'ImageData': 206 | continue 207 | if layer.has_key('top'): 208 | tops = layer['top'] 209 | self.layer_map_to_top[name] = tops 210 | if layer.has_key('bottom'): 211 | bottoms = layer['bottom'] 212 | self.layer_map_to_bottom[name] = bottoms 213 | if ltype == 'Convolution': 214 | filters = int(layer['convolution_param']['num_output']) 215 | kernel_size = int(layer['convolution_param']['kernel_size']) 216 | stride = 1 217 | group = 1 218 | pad = 0 219 | bias = True 220 | dilation = 1 221 | if layer['convolution_param'].has_key('stride'): 222 | stride = int(layer['convolution_param']['stride']) 223 | if layer['convolution_param'].has_key('pad'): 224 | pad = int(layer['convolution_param']['pad']) 225 | if layer['convolution_param'].has_key('group'): 226 | group = int(layer['convolution_param']['group']) 227 | if layer['convolution_param'].has_key('bias_term'): 228 | bias = True if layer['convolution_param']\ 229 | ['bias_term'].lower() == 'false' else False 230 | if layer['convolution_param'].has_key('dilation'): 231 | dilation = int(layer['convolution_param']['dilation']) 232 | num_output = int(layer['convolution_param']['num_output']) 233 | top_dim[tops[0]]=num_output 234 | num_input = top_dim[bottoms[0]] 235 | models[name] = nn.Conv2d(num_input, num_output, kernel_size, 236 | stride,pad,groups=group, bias=bias, dilation=dilation) 237 | elif ltype == 'ReLU': 238 | inplace = (bottoms == tops) 239 | top_dim[tops[0]] = top_dim[bottoms[0]] 240 | models[name] = nn.ReLU(inplace=False) 241 | elif ltype == 'Pooling': 242 | kernel_size = int(layer['pooling_param']['kernel_size']) 243 | stride = 1 244 | if layer['pooling_param'].has_key('stride'): 245 | stride = int(layer['pooling_param']['stride']) 246 | top_dim[tops[0]] = top_dim[bottoms[0]] 247 | models[name] = nn.MaxPool2d(kernel_size, stride) 248 | elif ltype == 'BatchNorm': 249 | if layer['batch_norm_param'].has_key('use_global_stats'): 250 | use_global_stats = True if layer['batch_norm_param']\ 251 | ['use_global_stats'].lower() == 'true' else False 252 | top_dim[tops[0]] = top_dim[bottoms[0]] 253 | 254 | models[name] = nn.BatchNorm2d(top_dim[bottoms[0]]) 255 | 256 | elif ltype == 'Scale': 257 | top_dim[tops[0]] = top_dim[bottoms[0]] 258 | models[name] = Scale() 259 | elif ltype == 'Eltwise': 260 | top_dim[tops[0]] = top_dim[bottoms[0]] 261 | models[name] = Eltwise('+') 262 | elif ltype == 'Concat': 263 | top_dim[tops[0]] = 0 264 | for i, x in enumerate(bottoms): 265 | top_dim[tops[0]] += top_dim[x] 266 | models[name] = Concat() 267 | elif ltype == 'Dropout': 268 | if layer['top'][0] == layer['bottom'][0]: 269 | inplace = True 270 | else: 271 | inplace = False 272 | top_dim[tops[0]] = top_dim[bottoms[0]] 273 | models[name] = nn.Dropout2d(inplace=inplace) 274 | else: 275 | print '%s is not NotImplemented'%ltype 276 | 277 | return models 278 | 279 | 
def forward(self, x, target=None): 280 | blobs = OrderedDict() 281 | for name, layer in self.models.items(): 282 | output_names = self.layer_map_to_top[name] 283 | input_names = self.layer_map_to_bottom[name] 284 | print "-----------------------------------------" 285 | print 'input_names: ',input_names 286 | print 'output_names:',output_names 287 | print layer 288 | # frist layer 289 | if input_names[0] == 'data': 290 | top_blobs = layer(x) 291 | else: 292 | input_blobs = [blobs[i] for i in input_names ] 293 | if isinstance(layer, Concat) or isinstance(layer, Eltwise): 294 | top_blobs = layer(input_blobs) 295 | else: 296 | top_blobs = layer(input_blobs[0]) 297 | if not isinstance(top_blobs, tuple): 298 | top_blobs = (top_blobs,) 299 | 300 | for k, v in zip(output_names, top_blobs): 301 | blobs[k] = v 302 | output_name = blobs.keys()[-1] 303 | print 'output_name',output_name 304 | return blobs[output_name] 305 | 306 | 307 | 308 | if __name__ == '__main__': 309 | prototxt = 'tiny_yolo_nbn_reluface.prototxt' 310 | caffemodel = '/nfs/xiaohang/for_chenchao/tiny_yolo_nbn_reluface.caffemodel' 311 | imgfile = 'data/face.jpg' 312 | 313 | m = CaffeNet(prototxt, caffemodel) 314 | use_cuda = 1 315 | if use_cuda: 316 | m.cuda() 317 | 318 | img = Image.open(imgfile).convert('RGB') 319 | sized = img.resize((m.width, m.height)) 320 | #if m.num_classes == 20: 321 | # namesfile = '../data/voc.names' 322 | #class_names = load_class_names(namesfile) 323 | class_names = ['face'] 324 | for i in range(1): 325 | start = time.time() 326 | boxes = do_detect(m, sized, 0.5, 0.4, use_cuda) 327 | finish = time.time() 328 | if i == 1: 329 | print('%s: Predicted in %f seconds.' % (imgfile, (finish-start))) 330 | 331 | plot_boxes(img, boxes, 'predictions.jpg', class_names) 332 | -------------------------------------------------------------------------------- /models/resnet.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import OrderedDict 5 | from region_loss import RegionLoss 6 | 7 | def conv3x3(in_planes, out_planes, stride=1): 8 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, 9 | stride=stride, padding=1, bias=False) 10 | 11 | class BasicBlock(nn.Module): 12 | expansion = 1 13 | 14 | def __init__(self, inplanes, planes, stride=1, downsample=None): 15 | super(BasicBlock, self).__init__() 16 | self.conv1 = conv3x3(inplanes, planes, stride) 17 | self.bn1 = nn.BatchNorm2d(planes) 18 | self.relu = nn.ReLU(inplace=True) 19 | self.conv2 = conv3x3(planes, planes) 20 | self.bn2 = nn.BatchNorm2d(planes) 21 | self.downsample = downsample 22 | self.stride = stride 23 | 24 | def forward(self, x): 25 | residual = x 26 | 27 | out = self.conv1(x) 28 | out = self.bn1(out) 29 | out = self.relu(out) 30 | 31 | out = self.conv2(out) 32 | out = self.bn2(out) 33 | 34 | if self.downsample is not None: 35 | residual = self.downsample(x) 36 | 37 | out += residual 38 | out = self.relu(out) 39 | 40 | return out 41 | 42 | class Bottleneck(nn.Module): 43 | expansion = 4 44 | 45 | def __init__(self, inplanes, planes, stride=1, downsample=None): 46 | super(Bottleneck, self).__init__() 47 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 48 | self.bn1 = nn.BatchNorm2d(planes) 49 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 50 | padding=1, bias=False) 51 | self.bn2 = nn.BatchNorm2d(planes) 52 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, 
bias=False) 53 | self.bn3 = nn.BatchNorm2d(planes * 4) 54 | self.relu = nn.ReLU(inplace=True) 55 | self.downsample = downsample 56 | self.stride = stride 57 | 58 | def forward(self, x): 59 | residual = x 60 | 61 | out = self.conv1(x) 62 | out = self.bn1(out) 63 | out = self.relu(out) 64 | 65 | out = self.conv2(out) 66 | out = self.bn2(out) 67 | out = self.relu(out) 68 | 69 | out = self.conv3(out) 70 | out = self.bn3(out) 71 | 72 | if self.downsample is not None: 73 | residual = self.downsample(x) 74 | 75 | out += residual 76 | out = self.relu(out) 77 | 78 | return out 79 | 80 | class ResNet(nn.Module): 81 | def __init__(self, block, layers, num_classes=1000): 82 | self.inplanes = 64 83 | super(ResNet, self).__init__() 84 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 85 | bias=False) 86 | self.bn1 = nn.BatchNorm2d(64) 87 | self.relu = nn.ReLU(inplace=True) 88 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 89 | self.layer1 = self._make_layer(block, 64, layers[0]) 90 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 91 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 92 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 93 | self.avgpool = nn.AvgPool2d(7) 94 | self.fc = nn.Linear(512 * block.expansion, num_classes) 95 | 96 | for m in self.modules(): 97 | if isinstance(m, nn.Conv2d): 98 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 99 | m.weight.data.normal_(0, math.sqrt(2. / n)) 100 | elif isinstance(m, nn.BatchNorm2d): 101 | m.weight.data.fill_(1) 102 | m.bias.data.zero_() 103 | 104 | def _make_layer(self, block, planes, blocks, stride=1): 105 | downsample = None 106 | if stride != 1 or self.inplanes != planes * block.expansion: 107 | downsample = nn.Sequential( 108 | nn.Conv2d(self.inplanes, planes * block.expansion, 109 | kernel_size=1, stride=stride, bias=False), 110 | nn.BatchNorm2d(planes * block.expansion), 111 | ) 112 | 113 | layers = [] 114 | layers.append(block(self.inplanes, planes, stride, downsample)) 115 | self.inplanes = planes * block.expansion 116 | for i in range(1, blocks): 117 | layers.append(block(self.inplanes, planes)) 118 | 119 | return nn.Sequential(*layers) 120 | 121 | def forward(self, x): 122 | x = self.conv1(x) 123 | x = self.bn1(x) 124 | x = self.relu(x) 125 | x = self.maxpool(x) 126 | 127 | x = self.layer1(x) 128 | x = self.layer2(x) 129 | x = self.layer3(x) 130 | x = self.layer4(x) 131 | 132 | x = self.avgpool(x) 133 | x = x.view(x.size(0), -1) 134 | x = self.fc(x) 135 | 136 | return x 137 | 138 | class Resnet101(nn.Module): 139 | def __init__(self): 140 | super(Resnet, self).__init__() 141 | self.seen = 0 142 | self.num_classes = 20 143 | self.anchors = [1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52] 144 | self.num_anchors = len(self.anchors)/2 145 | num_output = (5+self.num_classes)*self.num_anchors 146 | self.width = 160 147 | self.height = 160 148 | 149 | self.loss = RegionLoss(self.num_classes, self.anchors, self.num_anchors) 150 | self.model = ResNet(Bottleneck, [3, 4, 6, 3]) 151 | 152 | def forward(self, x): 153 | x = self.model(x) 154 | return x 155 | 156 | def print_network(self): 157 | print(self) 158 | 159 | -------------------------------------------------------------------------------- /models/tiny_yolo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from collections import OrderedDict 5 | from cfg import * 6 | 
from darknet import MaxPoolStride1 7 | from region_loss import RegionLoss 8 | 9 | class TinyYoloNet(nn.Module): 10 | def __init__(self): 11 | super(TinyYoloNet, self).__init__() 12 | self.seen = 0 13 | self.num_classes = 20 14 | self.anchors = [1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52] 15 | self.num_anchors = len(self.anchors)/2 16 | num_output = (5+self.num_classes)*self.num_anchors 17 | self.width = 160 18 | self.height = 160 19 | 20 | self.loss = RegionLoss(self.num_classes, self.anchors, self.num_anchors) 21 | self.cnn = nn.Sequential(OrderedDict([ 22 | # conv1 23 | ('conv1', nn.Conv2d( 3, 16, 3, 1, 1, bias=False)), 24 | ('bn1', nn.BatchNorm2d(16)), 25 | ('leaky1', nn.LeakyReLU(0.1, inplace=True)), 26 | ('pool1', nn.MaxPool2d(2, 2)), 27 | 28 | # conv2 29 | ('conv2', nn.Conv2d(16, 32, 3, 1, 1, bias=False)), 30 | ('bn2', nn.BatchNorm2d(32)), 31 | ('leaky2', nn.LeakyReLU(0.1, inplace=True)), 32 | ('pool2', nn.MaxPool2d(2, 2)), 33 | 34 | # conv3 35 | ('conv3', nn.Conv2d(32, 64, 3, 1, 1, bias=False)), 36 | ('bn3', nn.BatchNorm2d(64)), 37 | ('leaky3', nn.LeakyReLU(0.1, inplace=True)), 38 | ('pool3', nn.MaxPool2d(2, 2)), 39 | 40 | # conv4 41 | ('conv4', nn.Conv2d(64, 128, 3, 1, 1, bias=False)), 42 | ('bn4', nn.BatchNorm2d(128)), 43 | ('leaky4', nn.LeakyReLU(0.1, inplace=True)), 44 | ('pool4', nn.MaxPool2d(2, 2)), 45 | 46 | # conv5 47 | ('conv5', nn.Conv2d(128, 256, 3, 1, 1, bias=False)), 48 | ('bn5', nn.BatchNorm2d(256)), 49 | ('leaky5', nn.LeakyReLU(0.1, inplace=True)), 50 | ('pool5', nn.MaxPool2d(2, 2)), 51 | 52 | # conv6 53 | ('conv6', nn.Conv2d(256, 512, 3, 1, 1, bias=False)), 54 | ('bn6', nn.BatchNorm2d(512)), 55 | ('leaky6', nn.LeakyReLU(0.1, inplace=True)), 56 | ('pool6', MaxPoolStride1()), 57 | 58 | # conv7 59 | ('conv7', nn.Conv2d(512, 1024, 3, 1, 1, bias=False)), 60 | ('bn7', nn.BatchNorm2d(1024)), 61 | ('leaky7', nn.LeakyReLU(0.1, inplace=True)), 62 | 63 | # conv8 64 | ('conv8', nn.Conv2d(1024, 1024, 3, 1, 1, bias=False)), 65 | ('bn8', nn.BatchNorm2d(1024)), 66 | ('leaky8', nn.LeakyReLU(0.1, inplace=True)), 67 | 68 | # output 69 | ('output', nn.Conv2d(1024, num_output, 1, 1, 0)), 70 | ])) 71 | 72 | def forward(self, x): 73 | x = self.cnn(x) 74 | return x 75 | 76 | def print_network(self): 77 | print(self) 78 | 79 | def load_weights(self, path): 80 | #buf = np.fromfile('tiny-yolo-voc.weights', dtype = np.float32) 81 | buf = np.fromfile(path, dtype = np.float32) 82 | start = 4 83 | 84 | start = load_conv_bn(buf, start, self.cnn[0], self.cnn[1]) 85 | start = load_conv_bn(buf, start, self.cnn[4], self.cnn[5]) 86 | start = load_conv_bn(buf, start, self.cnn[8], self.cnn[9]) 87 | start = load_conv_bn(buf, start, self.cnn[12], self.cnn[13]) 88 | start = load_conv_bn(buf, start, self.cnn[16], self.cnn[17]) 89 | start = load_conv_bn(buf, start, self.cnn[20], self.cnn[21]) 90 | 91 | start = load_conv_bn(buf, start, self.cnn[24], self.cnn[25]) 92 | start = load_conv_bn(buf, start, self.cnn[27], self.cnn[28]) 93 | start = load_conv(buf, start, self.cnn[30]) 94 | 95 | if __name__ == '__main__': 96 | from PIL import Image 97 | from utils import * 98 | m = TinyYoloNet() 99 | m.float() 100 | m.eval() 101 | m.load_darknet_weights('tiny-yolo-voc.weights') 102 | print(m) 103 | 104 | use_cuda = 1 105 | if use_cuda: 106 | m.cuda() 107 | 108 | img = Image.open('data/person.jpg').convert('RGB') 109 | sized = img.resize((416,416)) 110 | boxes = do_detect(m, sized, 0.5, 0.4, use_cuda) 111 | 112 | class_names = load_class_names('data/voc.names') 113 | plot_boxes(img, boxes, 'predict1.jpg', 
class_names) 114 | 115 | -------------------------------------------------------------------------------- /outputs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | class Outputs: 3 | def __init__(self): 4 | self.num_outputs = 0 5 | self.outputs = [] 6 | self.masks = [] 7 | self.num_masks = [] 8 | 9 | def __iter__(self): 10 | self.current = 0 11 | return self 12 | 13 | def __next__(self): 14 | if self.current >= self.num_outputs: # all stored outputs have been returned 15 | raise StopIteration 16 | else: 17 | self.current += 1 18 | return self.get(self.current-1) 19 | 20 | def num(self): 21 | return self.num_outputs 22 | 23 | def size(self): 24 | if self.num_outputs > 0: 25 | return self.outputs[0].data.size(0) 26 | else: 27 | return 0 28 | 29 | def get(self, index): 30 | if index < self.num_outputs: 31 | return [self.outputs[index].data, self.masks[index], self.num_masks[index]] 32 | else: 33 | return [None, None, None] 34 | 35 | def get_out(self, index): 36 | if index < self.num_outputs: 37 | return self.outputs[index] 38 | else: 39 | return None 40 | 41 | def add(self, outbox): 42 | if len(outbox) == 3: 43 | self.outputs.append(outbox[0]) 44 | self.masks.append(outbox[1]) 45 | self.num_masks.append(outbox[2]) 46 | self.num_outputs += 1
-------------------------------------------------------------------------------- /partial.py: -------------------------------------------------------------------------------- 1 | from darknet import Darknet 2 | 3 | def partial(cfgfile, weightfile, outfile, cutoff): 4 | m = Darknet(cfgfile) 5 | m.print_network() 6 | m.load_weights(weightfile) 7 | m.seen = 0 8 | m.save_weights(outfile, cutoff) 9 | print('save %s' % (outfile)) 10 | 11 | if __name__ == '__main__': 12 | import sys 13 | if len(sys.argv) == 5: 14 | cfgfile = sys.argv[1] 15 | weightfile = sys.argv[2] 16 | outfile = sys.argv[3] 17 | cutoff = int(sys.argv[4]) 18 | partial(cfgfile, weightfile, outfile, cutoff) 19 | else: 20 | print('Usage:') 21 | print('python partial.py cfgfile weightfile output cutoff') 22 | #partial('cfg/tiny-yolo-voc.cfg', 'tiny-yolo-voc.weights', 'tiny-yolo-voc.conv.15', 15) 23 | 24 | 
-------------------------------------------------------------------------------- /recall.py: -------------------------------------------------------------------------------- 1 | from PIL import Image, ImageDraw 2 | from utils import * 3 | from darknet import Darknet 4 | 5 | def eval_list(cfgfile, weightfile, imglist): 6 | #m = TinyYoloFace14Net() 7 | #m.eval() 8 | #m.load_darknet_weights(tiny_yolo_weight) 9 | 10 | m = Darknet(cfgfile) 11 | m.eval() 12 | m.load_weights(weightfile) 13 | eval_wid = m.width 14 | eval_hei = m.height 15 | 16 | use_cuda = True 17 | if use_cuda: 18 | m.cuda() 19 | 20 | conf_thresh = 0.25 21 | nms_thresh = 0.4 22 | iou_thresh = 0.5 23 | min_box_scale = 8. 
/ m.width 24 | 25 | with open(imglist) as fp: 26 | lines = fp.readlines() 27 | 28 | total = 0.0 29 | proposals = 0.0 30 | correct = 0.0 31 | lineId = 0 32 | avg_iou = 0.0 33 | for line in lines: 34 | img_path = line.rstrip() 35 | if img_path[0] == '#': 36 | continue 37 | lineId = lineId + 1 38 | lab_path = img_path.replace('images', 'labels') 39 | lab_path = lab_path.replace('JPEGImages', 'labels') 40 | lab_path = lab_path.replace('.jpg', '.txt').replace('.png', '.txt') 41 | #truths = read_truths(lab_path) 42 | truths = read_truths_args(lab_path, min_box_scale) 43 | #print(truths) 44 | 45 | img = Image.open(img_path).convert('RGB').resize((eval_wid, eval_hei)) 46 | boxes = do_detect(m, img, conf_thresh, nms_thresh, use_cuda) 47 | if False: 48 | savename = "tmp/%06d.jpg" % (lineId) 49 | print("save %s" % savename) 50 | plot_boxes(img, boxes, savename) 51 | 52 | total = total + truths.shape[0] 53 | 54 | for i in range(len(boxes)): 55 | if boxes[i][4] > conf_thresh: 56 | proposals = proposals+1 57 | 58 | for i in range(truths.shape[0]): 59 | box_gt = [truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0] 60 | best_iou = 0 61 | for j in range(len(boxes)): 62 | iou = bbox_iou(box_gt, boxes[j], x1y1x2y2=False) 63 | best_iou = max(iou, best_iou) 64 | if best_iou > iou_thresh: 65 | avg_iou += best_iou 66 | correct = correct+1 67 | 68 | precision = 1.0*correct/proposals 69 | recall = 1.0*correct/total 70 | fscore = 2.0*precision*recall/(precision+recall) 71 | print("%d IOU: %f, Recal: %f, Precision: %f, Fscore: %f\n" % (lineId-1, avg_iou/correct, recall, precision, fscore)) 72 | 73 | if __name__ == '__main__': 74 | import sys 75 | if len(sys.argv) == 4: 76 | cfgfile = sys.argv[1] 77 | weightfile = sys.argv[2] 78 | imglist = sys.argv[3] 79 | eval_list(cfgfile, weightfile, imglist) 80 | else: 81 | print('Usage:') 82 | print('python recall.py cfgfile weightfile imglist') 83 | #python recall.py test160.cfg backup/000022.weights face_test.txt 84 | -------------------------------------------------------------------------------- /region_layer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import sys 4 | import time 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from utils import bbox_iou, multi_bbox_ious, convert2cpu 9 | 10 | class RegionLayer(nn.Module): 11 | def __init__(self, num_classes=0, anchors=[1.0], num_anchors=1, use_cuda=None): 12 | super(RegionLayer, self).__init__() 13 | use_cuda = torch.cuda.is_available() and (True if use_cuda is None else use_cuda) 14 | self.device = torch.device("cuda" if use_cuda else "cpu") 15 | self.num_classes = num_classes 16 | self.num_anchors = num_anchors 17 | self.anchor_step = len(anchors)//num_anchors 18 | #self.anchors = torch.stack(torch.FloatTensor(anchors).split(self.anchor_step)).to(self.device) 19 | self.anchors = torch.FloatTensor(anchors).view(self.num_anchors, self.anchor_step).to(self.device) 20 | self.rescore = 1 21 | self.coord_scale = 1 22 | self.noobject_scale = 1 23 | self.object_scale = 5 24 | self.class_scale = 1 25 | self.thresh = 0.6 26 | self.seen = 0 27 | 28 | def build_targets(self, pred_boxes, target, nH, nW): 29 | nB = target.size(0) 30 | nA = self.num_anchors 31 | noobj_mask = torch.ones (nB, nA, nH, nW) 32 | obj_mask = torch.zeros(nB, nA, nH, nW) 33 | coord_mask = torch.zeros(nB, nA, nH, nW) 34 | tcoord = torch.zeros( 4, nB, nA, nH, nW) 35 | tconf = torch.zeros(nB, nA, nH, nW) 36 | tcls = torch.zeros(nB, nA, nH, 
nW) 37 | 38 | nAnchors = nA*nH*nW 39 | nPixels = nH*nW 40 | nGT = 0 # number of ground truth 41 | nRecall = 0 42 | # it works faster on CPU than on GPU. 43 | anchors = self.anchors.to("cpu") 44 | 45 | if self.seen < 12800: 46 | tcoord[0].fill_(0.5) 47 | tcoord[1].fill_(0.5) 48 | coord_mask.fill_(0.01) 49 | # initial w, h == 0 means log(1)==0, s.t, anchor is equal to ground truth. 50 | 51 | for b in range(nB): 52 | cur_pred_boxes = pred_boxes[b*nAnchors:(b+1)*nAnchors].t() 53 | cur_ious = torch.zeros(nAnchors) 54 | tbox = target[b].view(-1,5).to("cpu") 55 | for t in range(50): 56 | if tbox[t][1] == 0: 57 | break 58 | gx, gw = [ i * nW for i in (tbox[t][1], tbox[t][3]) ] 59 | gy, gh = [ i * nH for i in (tbox[t][2], tbox[t][4]) ] 60 | cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors,1).t() 61 | cur_ious = torch.max(cur_ious, multi_bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)) 62 | ignore_ix = (cur_ious>self.thresh).view(nA,nH,nW) 63 | noobj_mask[b][ignore_ix] = 0 64 | 65 | for t in range(50): 66 | if tbox[t][1] == 0: 67 | break 68 | nGT += 1 69 | gx, gw = [ i * nW for i in (tbox[t][1], tbox[t][3]) ] 70 | gy, gh = [ i * nH for i in (tbox[t][2], tbox[t][4]) ] 71 | gw, gh = gw.float(), gh.float() 72 | gi, gj = int(gx), int(gy) 73 | 74 | tmp_gt_boxes = torch.FloatTensor([0, 0, gw, gh]).repeat(nA,1).t() 75 | anchor_boxes = torch.cat((torch.zeros(nA, 2), anchors),1).t() 76 | tmp_ious = multi_bbox_ious(anchor_boxes, tmp_gt_boxes, x1y1x2y2=False) 77 | best_iou, best_n = torch.max(tmp_ious, 0) 78 | 79 | if self.anchor_step == 4: # this part is not tested. 80 | tmp_ious_mask = (tmp_ious==best_iou) 81 | if tmp_ious_mask.sum() > 0: 82 | gt_pos = torch.FloatTensor([gi, gj, gx, gy]).repeat(nA,1).t() 83 | an_pos = anchor_boxes[4:6] # anchor_boxes are consisted of [0 0 aw ah ax ay] 84 | dist = pow(((gt_pos[0]+an_pos[0])-gt_pos[2]),2) + pow(((gt_pos[1]+an_pos[1])-gt_pos[3]),2) 85 | dist[1-tmp_ious_mask]=10000 # set the large number for the small ious 86 | _, best_n = torch.min(dist,0) 87 | 88 | gt_box = torch.FloatTensor([gx, gy, gw, gh]) 89 | pred_box = pred_boxes[b*nAnchors+best_n*nPixels+gj*nW+gi] 90 | iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) 91 | 92 | obj_mask [b][best_n][gj][gi] = 1 93 | noobj_mask[b][best_n][gj][gi] = 0 94 | coord_mask[b][best_n][gj][gi] = 2. - tbox[t][3]*tbox[t][4] 95 | tcoord [0][b][best_n][gj][gi] = gx - gi 96 | tcoord [1][b][best_n][gj][gi] = gy - gj 97 | tcoord [2][b][best_n][gj][gi] = math.log(gw/anchors[best_n][0]) 98 | tcoord [3][b][best_n][gj][gi] = math.log(gh/anchors[best_n][1]) 99 | tcls [b][best_n][gj][gi] = tbox[t][0] 100 | tconf [b][best_n][gj][gi] = iou if self.rescore else 1. 
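# Added commentary on the assignments above: build_targets uses the standard YOLOv2 target encoding
# for the cell (gi, gj) and anchor best_n that own a ground-truth box:
#   tx = gx - gi, ty = gy - gj                        (offset inside the cell, in [0, 1))
#   tw = log(gw / anchor_w), th = log(gh / anchor_h)  (log-scale ratio w.r.t. the matched anchor)
#   coord_mask = 2 - w*h with normalized w, h         (small boxes get a larger coordinate weight)
#   tconf = IoU(pred, gt) when rescore is set, otherwise 1
# Illustrative numbers (made up): a ground truth at gx=7.3, gy=4.8 with gw=3.0, gh=2.0 matched to
# anchor (1.5, 1.0) gives tx=0.3, ty=0.8, tw=log(2.0)~0.69, th=log(2.0)~0.69.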
101 | if iou > 0.5: 102 | nRecall += 1 103 | 104 | return nGT, nRecall, obj_mask, noobj_mask, coord_mask, tcoord, tconf, tcls 105 | 106 | def get_mask_boxes(self, output): 107 | if not isinstance(self.anchors, torch.Tensor): 108 | self.anchors = torch.FloatTensor(self.anchors).view(self.num_anchors, self.anchor_step).to(self.device) 109 | masked_anchors = self.anchors.view(-1) 110 | num_anchors = torch.IntTensor([self.num_anchors]).to(self.device) 111 | return {'x':output, 'a':masked_anchors, 'n':num_anchors} 112 | 113 | def forward(self, output, target): 114 | #output : BxAs*(4+1+num_classes)*H*W 115 | t0 = time.time() 116 | nB = output.data.size(0) # batch size 117 | nA = self.num_anchors 118 | nC = self.num_classes 119 | nH = output.data.size(2) 120 | nW = output.data.size(3) 121 | cls_anchor_dim = nB*nA*nH*nW 122 | 123 | if not isinstance(self.anchors, torch.Tensor): 124 | self.anchors = torch.FloatTensor(self.anchors).view(self.num_anchors, self.anchor_step).to(self.device) 125 | 126 | output = output.view(nB, nA, (5+nC), nH, nW).to(self.device) 127 | cls_grid = torch.linspace(5,5+nC-1,nC).long().to(self.device) 128 | ix = torch.LongTensor(range(0,5)).to(self.device) 129 | pred_boxes = torch.FloatTensor(4, cls_anchor_dim).to(self.device) 130 | 131 | coord = output.index_select(2, ix[0:4]).view(nB*nA, -1, nH*nW).transpose(0,1).contiguous().view(-1,cls_anchor_dim) # x, y, w, h 132 | coord[0:2] = coord[0:2].sigmoid() 133 | conf = output.index_select(2, ix[4]).view(cls_anchor_dim).sigmoid() 134 | 135 | cls = output.index_select(2, cls_grid) 136 | cls = cls.view(nB*nA, nC, nH*nW).transpose(1,2).contiguous().view(cls_anchor_dim, nC) 137 | 138 | t1 = time.time() 139 | grid_x = torch.linspace(0, nW-1, nW).repeat(nB*nA, nH, 1).view(cls_anchor_dim).to(self.device) 140 | grid_y = torch.linspace(0, nH-1, nH).repeat(nW,1).t().repeat(nB*nA, 1, 1).view(cls_anchor_dim).to(self.device) 141 | anchor_w = self.anchors.index_select(1, ix[0]).repeat(nB, nH*nW).view(cls_anchor_dim) 142 | anchor_h = self.anchors.index_select(1, ix[1]).repeat(nB, nH*nW).view(cls_anchor_dim) 143 | 144 | pred_boxes[0] = coord[0] + grid_x 145 | pred_boxes[1] = coord[1] + grid_y 146 | pred_boxes[2] = coord[2].exp() * anchor_w 147 | pred_boxes[3] = coord[3].exp() * anchor_h 148 | # for build_targets. 
it works faster on CPU than on GPU 149 | pred_boxes = convert2cpu(pred_boxes.transpose(0,1).contiguous().view(-1,4)).detach() 150 | 151 | t2 = time.time() 152 | nGT, nRecall, obj_mask, noobj_mask, coord_mask, tcoord, tconf, tcls = \ 153 | self.build_targets(pred_boxes, target.detach(), nH, nW) 154 | 155 | cls_mask = (obj_mask == 1) 156 | tcls = tcls[cls_mask].long().view(-1).to(self.device) 157 | cls_mask = cls_mask.view(-1, 1).repeat(1,nC).to(self.device) 158 | cls = cls[cls_mask].view(-1, nC) 159 | 160 | nProposals = int((conf > 0.25).sum()) 161 | 162 | tcoord = tcoord.view(4, cls_anchor_dim).to(self.device) 163 | tconf = tconf.view(cls_anchor_dim).to(self.device) 164 | 165 | conf_mask = (self.object_scale * obj_mask + self.noobject_scale * noobj_mask).view(cls_anchor_dim).to(self.device) 166 | obj_mask = obj_mask.view(cls_anchor_dim).to(self.device) 167 | coord_mask = coord_mask.view(cls_anchor_dim).to(self.device) 168 | 169 | t3 = time.time() 170 | loss_coord = self.coord_scale * nn.MSELoss(reduction='sum')(coord*coord_mask, tcoord*coord_mask)/nB 171 | loss_conf = nn.MSELoss(reduction='sum')(conf*conf_mask, tconf*conf_mask)/nB 172 | loss_cls = self.class_scale * nn.CrossEntropyLoss(reduction='sum')(cls, tcls)/nB 173 | loss = loss_coord + loss_conf + loss_cls 174 | 175 | t4 = time.time() 176 | if False: 177 | print('-'*30) 178 | print(' activation : %f' % (t1 - t0)) 179 | print(' create pred_boxes : %f' % (t2 - t1)) 180 | print(' build targets : %f' % (t3 - t2)) 181 | print(' create loss : %f' % (t4 - t3)) 182 | print(' total : %f' % (t4 - t0)) 183 | print('%d: nGT %3d, nRC %3d, nPP %3d, loss: box %6.3f, conf %6.3f, class %6.3f, total %7.3f' 184 | % (self.seen, nGT, nRecall, nProposals, loss_coord, loss_conf, loss_cls, loss)) 185 | if math.isnan(loss.item()): 186 | print(conf, tconf) 187 | sys.exit(0) 188 | return loss 189 | -------------------------------------------------------------------------------- /scripts/coco_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import os,sys 8 | #import cPickle 9 | import _pickle as cPickle 10 | import numpy as np 11 | from scripts.eval_ap import parse_rec 12 | from scripts.eval_all import get_image_xml_name 13 | from utils import load_class_names 14 | from scripts.my_eval import compute_ap 15 | from PIL import Image, ImageFile 16 | ImageFile.LOAD_TRUNCATED_IMAGES = True 17 | 18 | _classes = None 19 | def convert_bb2lab(classname, imagepath): 20 | info_path = imagepath.replace('images', 'labels'). \ 21 | replace('JPEGImages', 'labels'). 
\ 22 | replace('.jpg', '.txt').replace('.png','.txt') 23 | img = Image.open(imagepath) 24 | w, h = img.size 25 | objs = [] 26 | try: 27 | gt_bbs = np.loadtxt(info_path) 28 | except: 29 | return objs 30 | 31 | gt_bbs = gt_bbs.reshape(gt_bbs.size//5, 5) 32 | for i in range(len(gt_bbs)): 33 | obj = {} 34 | gt_bb = gt_bbs[i] 35 | 36 | obj['name'] = classname[(int)(gt_bb[0])] 37 | obj['pose'] = 'Unspecified' 38 | obj['truncated'] = 0 39 | obj['difficult'] = 0 40 | 41 | bb = np.zeros(4); 42 | hbw = gt_bb[3]/2.0 # half bounding box width 43 | hbh = gt_bb[4]/2.0 # half bounding box height 44 | bb[0] = (int)((gt_bb[1] - hbw) * w) # xmin 45 | bb[1] = (int)((gt_bb[2] - hbh) * h) # ymin 46 | bb[2] = (int)((gt_bb[1] + hbw) * w) # xmax 47 | bb[3] = (int)((gt_bb[2] + hbh) * h) # ymax 48 | obj['bbox'] = bb 49 | objs.append(obj) 50 | return objs 51 | 52 | def coco_eval(detpath, imagesetfile, classname, cachedir, 53 | ovthresh=0.5, use_07_metric=False): 54 | """rec, prec, ap = coco_eval(detpath, 55 | imagesetfile, 56 | classname, 57 | [ovthresh], 58 | [use_07_metric]) 59 | 60 | Top level function that does the PASCAL VOC evaluation. 61 | 62 | detpath: Path to detections 63 | detpath.format(classname) should produce the detection results file. 64 | annopath: Path to annotations 65 | annopath.format(imagename) should be the xml annotations file. 66 | imagesetfile: Text file containing the list of images, one image per line. 67 | classname: Category name (duh) 68 | cachedir: Directory for caching the annotations 69 | [ovthresh]: Overlap threshold (default = 0.5) 70 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 71 | (default False) 72 | """ 73 | # assumes detections are in detpath.format(classname) 74 | # assumes annotations are in annopath.format(imagename) 75 | # assumes imagesetfile is a text file with each line an image name 76 | # cachedir caches the annotations in a pickle file 77 | 78 | # first load gt 79 | if not os.path.isdir(cachedir): 80 | os.mkdir(cachedir) 81 | cachefile = os.path.join(cachedir, 'annots.pkl') 82 | # read list of images 83 | with open(imagesetfile, 'r') as f: 84 | lines = f.readlines() 85 | imagenames = [x.strip() for x in lines] 86 | 87 | if not os.path.isfile(cachefile): 88 | # load annots 89 | recs = {} 90 | for i, imagename in enumerate(imagenames): 91 | imagekey = os.path.basename(imagename).split('.')[0] 92 | lab = convert_bb2lab(_classes, imagename) 93 | if len(lab) > 0: 94 | recs[imagekey] = lab 95 | else: 96 | print("skipped key: {}, path: {}".format(imagekey, imagename)) 97 | 98 | if i % 100 == 0: 99 | print ('Reading annotation for {:d}/{:d}'.format( 100 | i + 1, len(imagenames))) 101 | # save 102 | print ('Saving cached annotations to {:s}'.format(cachefile)) 103 | with open(cachefile, 'wb') as f: 104 | cPickle.dump(recs, f) 105 | else: 106 | # load 107 | with open(cachefile, 'rb') as f: 108 | recs = cPickle.load(f) 109 | 110 | # extract gt objects for this class 111 | class_recs = {} 112 | npos = 0 113 | for imagename in imagenames: 114 | imagekey = os.path.basename(imagename).split('.')[0] 115 | try: 116 | R = [obj for obj in recs[imagekey] if obj['name'] == classname] 117 | except KeyError: 118 | #print("skipped: %s %s" % (imagename, imagekey)) 119 | continue; 120 | except Exception as e: 121 | print(type(e)) 122 | print(e.args) 123 | print(e) 124 | print("%s %s" % (imagename, imagekey)) 125 | exit(0) 126 | 127 | bbox = np.array([x['bbox'] for x in R]) 128 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 129 | det = [False] * 
len(R) 130 | npos = npos + sum(~difficult) 131 | class_recs[imagekey] = {'bbox': bbox, 'difficult': difficult, 'det': det} 132 | 133 | # read dets 134 | detfile = detpath.format(classname) 135 | with open(detfile, 'r') as f: 136 | lines = f.readlines() 137 | 138 | splitlines = [x.strip().split(' ') for x in lines] 139 | image_ids = [x[0] for x in splitlines] 140 | confidence = np.array([float(x[1]) for x in splitlines]) 141 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 142 | 143 | # sort by confidence 144 | sorted_ind = np.argsort(-confidence) 145 | sorted_scores = np.sort(-confidence) 146 | BB = BB[sorted_ind, :] 147 | image_ids = [image_ids[x] for x in sorted_ind] 148 | 149 | # go down dets and mark TPs and FPs 150 | nd = len(image_ids) 151 | tp = np.zeros(nd) 152 | fp = np.zeros(nd) 153 | for d in range(nd): 154 | try: 155 | R = class_recs[image_ids[d]] 156 | except KeyError: 157 | #print("skipeed: {}".format(image_ids[d])) 158 | continue; 159 | 160 | bb = BB[d, :].astype(float) 161 | ovmax = -np.inf 162 | BBGT = R['bbox'].astype(float) 163 | 164 | if BBGT.size > 0: 165 | # compute overlaps 166 | # intersection 167 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 168 | iymin = np.maximum(BBGT[:, 1], bb[1]) 169 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 170 | iymax = np.minimum(BBGT[:, 3], bb[3]) 171 | iw = np.maximum(ixmax - ixmin + 1., 0.) 172 | ih = np.maximum(iymax - iymin + 1., 0.) 173 | inters = iw * ih 174 | 175 | # union 176 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 177 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 178 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 179 | 180 | overlaps = inters / uni 181 | ovmax = np.max(overlaps) 182 | jmax = np.argmax(overlaps) 183 | 184 | if ovmax > ovthresh: 185 | if not R['difficult'][jmax]: 186 | if not R['det'][jmax]: 187 | tp[d] = 1. 188 | R['det'][jmax] = 1 189 | else: 190 | fp[d] = 1. 191 | else: 192 | fp[d] = 1. 193 | 194 | # compute precision recall 195 | fp = np.cumsum(fp) 196 | tp = np.cumsum(tp) 197 | rec = tp / float(npos) 198 | # avoid divide by zero in case the first detection matches a difficult 199 | # ground truth 200 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 201 | ap = compute_ap(rec, prec, use_07_metric) 202 | 203 | #print('class: {:<10s} \t num occurrence: {:4d}'.format(classname, npos)) 204 | 205 | return rec, prec, ap, npos 206 | 207 | def _do_python_eval(res_prefix, imagesetfile, classesfile, output_dir = 'output'): 208 | 209 | filename = res_prefix + '{:s}.txt' 210 | 211 | cachedir = os.path.join(output_dir, 'annotations_cache') 212 | aps = [] 213 | # The PASCAL VOC metric changed in 2010 214 | use_07_metric = False 215 | #print ('VOC07 metric? 
' + ('Yes' if use_07_metric else 'No')) 216 | if not os.path.isdir(output_dir): 217 | os.mkdir(output_dir) 218 | 219 | global _classes 220 | _classes = load_class_names(classesfile) 221 | 222 | total = 0 223 | for i, cls in enumerate(_classes): 224 | if cls == '__background__': 225 | continue 226 | 227 | rec, prec, ap, noccur = coco_eval( 228 | filename, imagesetfile, cls, cachedir, ovthresh=0.5, 229 | use_07_metric=use_07_metric) 230 | aps += [ap] 231 | total += noccur 232 | print('AP for {:<10s} = {:.4f} with {:4d} views'.format(cls, ap, noccur)) 233 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 234 | cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 235 | 236 | print('Mean AP = {:.4f} with total {:4d} views'.format(np.mean(aps), total)) 237 | 238 | print('~'*30) 239 | print(' '*10, 'Results:') 240 | print('-'*30) 241 | for i, ap in enumerate(aps): 242 | print('{:<10s}\t{:.3f}'.format(_classes[i], ap)) 243 | print('='*30) 244 | print('{:^10s}\t{:.3f}'.format('Average', np.mean(aps))) 245 | print('~'*30) 246 | print('') 247 | print('--------------------------------------------------------------') 248 | print('Results computed with the **unofficial** Python eval code.') 249 | print('Results should be very close to the official MATLAB eval code.') 250 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 251 | print('-- Thanks, The Management') 252 | print('--------------------------------------------------------------') 253 | 254 | 255 | if __name__ == '__main__': 256 | #res_prefixc = '/data/hongji/darknet/results/comp4_det_test_' 257 | #res_prefix = 'results/comp4_det_test_' 258 | #test_file = 'data/sketch_test.txt' 259 | #class_names = 'data/sketch.names' 260 | res_prefix = sys.argv[1] 261 | test_file = sys.argv[2] 262 | class_names = sys.argv[3] 263 | _do_python_eval(res_prefix, test_file, class_names, output_dir = 'output') 264 | 265 | 266 | -------------------------------------------------------------------------------- /scripts/eval_all.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | from PIL import Image 4 | import sys 5 | from torch.autograd import Variable 6 | from darknet import Darknet 7 | from utils import get_all_boxes, do_detect, plot_boxes, load_class_names, image2torch, get_region_boxes, nms 8 | import numpy as np 9 | 10 | conf_thresh = 0.005 11 | #conf_thresh = 0.5 12 | nms_thresh = 0.45 13 | def save_boxes(imgfile, img, boxes, savename): 14 | fp = open(savename, 'w') 15 | filename = os.path.basename(savename) 16 | filename = os.path.splitext(filename)[0] 17 | fp.write('# imagepath = %s\n' % imgfile) 18 | fp.write('# basename = %s\n' % filename) 19 | fp.write('# nbbs = %d\n' % len(boxes)) 20 | width = img.width 21 | height = img.height 22 | # box[0], box[1] : center x, center y 23 | # box[2], box[3] : width, height 24 | # box[4] : confidence 25 | # box[5] : max confidence of the class 26 | # box[6] : max class id 27 | for box in boxes: 28 | x1 = (box[0] - box[2]/2.0) * width 29 | y1 = (box[1] - box[3]/2.0) * height 30 | x2 = (box[0] + box[2]/2.0) * width 31 | y2 = (box[1] + box[3]/2.0) * height 32 | 33 | det_conf = box[4] 34 | for j in range((len(box)-5)//2): 35 | cls_conf = box[5+2*j] 36 | cls_id = box[6+2*j] 37 | prob = det_conf * cls_conf 38 | fp.write('%d %f %f %f %f %f\n' % (cls_id, prob, x1, y1, x2, y2 )) 39 | fp.close() 40 | 41 | def get_det_image_name(imagefile): 42 | file, ext = os.path.splitext(imagefile) 43 | imgname = file + "_det" + ext 44 | 
return imgname 45 | 46 | def get_det_result_name(imagefile): 47 | return imagefile.replace('images', 'results').replace('JPEGImages', 'results').replace('.jpg', '.det').replace('.png','.det') 48 | 49 | def get_image_xml_name(imagefile): 50 | return imagefile.replace('images', 'Annotations').replace('JPEGImages', 'Annotations').replace('.jpg', '.xml').replace('.png','.xml') 51 | 52 | def eval_list(cfgfile, namefile, weightfile, testfile): 53 | m = Darknet(cfgfile) 54 | m.load_weights(weightfile) 55 | use_cuda = 1 56 | if use_cuda: 57 | m.cuda() 58 | 59 | class_names = load_class_names(namefile) 60 | 61 | file_list = [] 62 | with open(testfile, "r") as fin: 63 | for f in fin: 64 | file_list.append(f.strip()) 65 | 66 | for imgfile in file_list: 67 | img = Image.open(imgfile).convert('RGB') 68 | sized = img.resize((m.width, m.height)) 69 | filename = os.path.basename(imgfile) 70 | filename = os.path.splitext(filename)[0] 71 | #print(filename, img.width, img.height, sized_width, sized_height) 72 | 73 | if m.width * m.height > 1024 * 2560: 74 | print('omit %s' % filename) 75 | continue 76 | 77 | if False: 78 | boxes = do_detect(m, sized, conf_thresh, nms_thresh, use_cuda) 79 | else: 80 | m.eval() 81 | sized = image2torch(sized).cuda(); 82 | #output = m(Variable(sized, volatile=True)).data 83 | output = m(sized) 84 | #boxes = get_region_boxes(output, conf_thresh, m.num_classes, m.anchors, m.num_anchors, 0, 1)[0] 85 | boxes = get_all_boxes(output, conf_thresh, m.num_classes)[0] 86 | boxes = np.array(nms(boxes, nms_thresh)) 87 | 88 | if False: 89 | savename = get_det_image_name(imgfile) 90 | print('img: save to %s' % savename) 91 | plot_boxes(img, boxes, savename, class_names) 92 | 93 | if False: 94 | savename = get_det_result_name(imgfile) 95 | print('det: save to %s' % savename) 96 | save_boxes(imgfile, img, boxes, savename) 97 | 98 | if __name__ == '__main__': 99 | savedir = None 100 | if len(sys.argv) == 5: 101 | cfgfile = sys.argv[1] 102 | namefile = sys.argv[2] 103 | wgtfile = sys.argv[3] 104 | testlist = sys.argv[4] 105 | 106 | eval_list (cfgfile, namefile, wgtfile, testlist) 107 | else: 108 | print("Usage: %s cfgfile classname weight testlist" % sys.argv[0] ) 109 | 110 | -------------------------------------------------------------------------------- /scripts/eval_ap.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os,sys 9 | #import cPickle 10 | import _pickle as cPickle 11 | import numpy as np 12 | from scripts.eval_all import get_det_result_name, get_image_xml_name 13 | from utils import load_class_names 14 | 15 | def parse_rec(filename): 16 | """ Parse a PASCAL VOC xml file """ 17 | tree = ET.parse(filename) 18 | objects = [] 19 | for obj in tree.findall('object'): 20 | obj_struct = {} 21 | obj_struct['name'] = obj.find('name').text 22 | obj_struct['pose'] = obj.find('pose').text 23 | obj_struct['truncated'] = int(obj.find('truncated').text) 24 | obj_struct['difficult'] = int(obj.find('difficult').text) 25 | bbox = obj.find('bndbox') 26 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 27 | int(bbox.find('ymin').text), 28 | int(bbox.find('xmax').text), 29 | int(bbox.find('ymax').text)] 30 | objects.append(obj_struct) 31 | 32 | return objects 33 | 34 | def 
eval_ap(rec, prec): 35 | """ ap = eval_ap(rec, prec, [use_07_metric]) 36 | Compute VOC AP given precision and recall. 37 | """ 38 | 39 | # correct AP calculation 40 | # first append sentinel values at the end 41 | mrec = np.concatenate(([0.], rec, [1.])) 42 | mpre = np.concatenate(([0.], prec, [0.])) 43 | 44 | # compute the precision envelope 45 | for i in range(mpre.size - 1, 0, -1): 46 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 47 | 48 | # to calculate area under PR curve, look for points 49 | # where X axis (recall) changes value 50 | i = np.where(mrec[1:] != mrec[:-1])[0] 51 | 52 | # and sum (\Delta recall) * prec 53 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 54 | return ap 55 | 56 | def get_recs_from_cache(imagenames, cachedir, cachename): 57 | # first load gt 58 | if not os.path.isdir(cachedir): 59 | os.mkdir(cachedir) 60 | cachefile = os.path.join(cachedir, cachename) 61 | 62 | if not os.path.isfile(cachefile): 63 | # load annots 64 | recs = {} 65 | for i, imagename in enumerate(imagenames): 66 | recs[imagename] = parse_rec(get_image_xml_name(imagename)) 67 | #if i % 100 == 0: 68 | # print ('Reading annotation for {:d}/{:d}'.format( 69 | # i + 1, len(imagenames))) 70 | # save 71 | # print ('Saving cached annotations to {:s}'.format(cachefile)) 72 | with open(cachefile, 'wb') as f: 73 | cPickle.dump(recs, f) 74 | else: 75 | # load 76 | # print ('loaded cached annotations from {:s}'.format(cachefile)) 77 | with open(cachefile, 'rb') as f: 78 | recs = cPickle.load(f) 79 | try: 80 | for imagename in imagenames: 81 | recs[imagename] 82 | except Exception as e: 83 | print("Exception: {0}".format(e)) 84 | print ('\t{:s} is corrupted. retry!!'.format(cachefile)) 85 | os.remove(cachefile) 86 | recs = get_recs_from_cache(imagenames, cachedir, cachename) 87 | return recs 88 | 89 | def get_class_det_result(detpath, classname): 90 | lines = [] 91 | cls = classes.index(classname) 92 | imagename = None 93 | with open(detpath, 'r') as f: 94 | lines = f.readlines() 95 | splitlines = [x.strip().split(' ') for x in lines] 96 | lines = [] 97 | for i, l in enumerate(splitlines): 98 | if l[0] == '#' and l[2] == '=': 99 | if l[1] == 'imagepath': 100 | imagename = l[3] 101 | elif l[0] != '' and l[0] != '#': 102 | if int(l[0]) == cls: 103 | lines.append([imagename] + l[1:]) 104 | assert(imagename is not None) 105 | #print("{:s} {:s} {:d}".format(detpath, classname, len(lines))) 106 | return lines 107 | 108 | def get_class_detection(imagenames, classname ): 109 | # load annots 110 | classlines = [] 111 | for i, imagename in enumerate(imagenames): 112 | det = get_det_result_name(imagename) 113 | lines = get_class_det_result(det, classname) 114 | classlines.extend(lines) 115 | 116 | #print(classlines) 117 | ids = [x[0] for x in classlines] 118 | conf = np.array([float(x[1])for x in classlines]) 119 | bb = np.array([[float(z)for z in x[2:]] for x in classlines]) 120 | 121 | #print(ids) 122 | #print(bb) 123 | #print(conf) 124 | 125 | return ids, conf, bb 126 | 127 | def eval(imagelist, classname, cachedir, ovthresh=0.5): 128 | """rec, prec, ap = eval(imagelist, classname, [ovthresh]) 129 | 130 | Top level function that does the PASCAL VOC evaluation. 131 | 132 | imagelist: Text file containing the list of images, one image per line. 
133 | classname: Category name (duh) 134 | cachedir: Directory for caching the annotations 135 | [ovthresh]: Overlap threshold (default = 0.5) 136 | """ 137 | # read list of images 138 | with open(imagelist, 'r') as f: 139 | lines = f.readlines() 140 | imagenames = [x.strip() for x in lines] 141 | 142 | # cachedir caches the annotations in a pickle file 143 | recs = get_recs_from_cache(imagenames, cachedir, 'annots.pk') 144 | 145 | # extract gt objects for this class 146 | class_recs = {} 147 | npos = 0 148 | for imagename in imagenames: 149 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 150 | bbox = np.array([x['bbox'] for x in R]) 151 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 152 | det = [False] * len(R) 153 | npos = npos + sum(~difficult) 154 | class_recs[imagename] = {'bbox': bbox, 155 | 'difficult': difficult, 156 | 'det': det} 157 | 158 | image_ids, confidence, BB = \ 159 | get_class_detection(imagenames, classname ) 160 | 161 | # sort by confidence 162 | sorted_ind = np.argsort(-confidence) 163 | sorted_scores = np.sort(-confidence) 164 | BB = BB[sorted_ind, :] 165 | #print(image_ids) 166 | image_ids = [image_ids[x] for x in sorted_ind] 167 | 168 | # go down dets and mark TPs and FPs 169 | nd = len(image_ids) 170 | tp = np.zeros(nd) 171 | fp = np.zeros(nd) 172 | for d in range(nd): 173 | R = class_recs[image_ids[d]] 174 | #print("%s (%s) " % (image_ids[d],classname), end='') 175 | #print(R) 176 | bb = BB[d, :].astype(float) 177 | ovmax = -np.inf 178 | BBGT = R['bbox'].astype(float) 179 | 180 | if BBGT.size > 0: 181 | # compute overlaps 182 | # intersection 183 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 184 | iymin = np.maximum(BBGT[:, 1], bb[1]) 185 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 186 | iymax = np.minimum(BBGT[:, 3], bb[3]) 187 | iw = np.maximum(ixmax - ixmin + 1., 0.) 188 | ih = np.maximum(iymax - iymin + 1., 0.) 189 | inters = iw * ih 190 | 191 | # union 192 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 193 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 194 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 195 | 196 | overlaps = inters / uni 197 | ovmax = np.max(overlaps) 198 | jmax = np.argmax(overlaps) 199 | 200 | if ovmax > ovthresh: 201 | if not R['difficult'][jmax]: 202 | if not R['det'][jmax]: 203 | tp[d] = 1. 204 | R['det'][jmax] = 1 205 | else: 206 | fp[d] = 1. 207 | else: 208 | fp[d] = 1. 
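# Added commentary: detections were processed above in descending confidence order; one counts as a
# true positive only when its best IoU with a not-yet-matched, non-difficult ground-truth box exceeds
# ovthresh, otherwise it is a false positive (a duplicate hit on an already-matched box is also a
# false positive). The cumulative sums below then give, at every rank, recall = TP / npos and
# precision = TP / (TP + FP), and eval_ap() integrates the precision envelope over recall to obtain
# the average precision.
# Tiny worked example (illustrative): with npos = 2 and ranked outcomes [TP, FP, TP],
# rec = [0.5, 0.5, 1.0], prec = [1.0, 0.5, 0.667] and AP = 0.5*1.0 + 0.5*0.667 ~ 0.83.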
209 | 210 | # compute precision recall 211 | fp = np.cumsum(fp) 212 | tp = np.cumsum(tp) 213 | rec = tp / float(npos) 214 | # avoid divide by zero in case the first detection matches a difficult 215 | # ground truth 216 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 217 | ap = eval_ap(rec, prec) 218 | 219 | return rec, prec, ap 220 | 221 | 222 | def _do_python_eval(testlist, namelist, output_dir = 'output'): 223 | 224 | cachedir = os.path.join(output_dir, 'annotations_cache') 225 | aps = [] 226 | if not os.path.isdir(output_dir): 227 | os.mkdir(output_dir) 228 | 229 | global classes 230 | classes = load_class_names(namelist) 231 | 232 | for i, cls in enumerate(classes): 233 | rec, prec, ap = eval(testlist, cls, cachedir, ovthresh=0.5) 234 | aps += [ap] 235 | print('AP for {} = {:.4f}'.format(cls, ap)) 236 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 237 | cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 238 | 239 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 240 | print('~~~~~~~~~~~~~') 241 | print(' Results:') 242 | print('-------------') 243 | for i, ap in enumerate(aps): 244 | print('{:<10s}\t{:.3f}'.format(classes[i], ap)) 245 | print('=============') 246 | print('{:^10s}\t{:.3f}'.format('Average', np.mean(aps))) 247 | print('~~~~~~~~~~~~~') 248 | print('') 249 | print('--------------------------------------------------------------') 250 | print('Results computed with the **unofficial** Python eval code.') 251 | print('Results should be very close to the official MATLAB eval code.') 252 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 253 | print('-- Thanks, The Management') 254 | print('--------------------------------------------------------------') 255 | 256 | 257 | if __name__ == '__main__': 258 | if len(sys.argv) == 3: 259 | testlist = sys.argv[1] 260 | namelist = sys.argv[2] 261 | _do_python_eval(testlist, namelist, output_dir = 'output') 262 | else: 263 | print("Usage: %s testlist namelist" % sys.argv[0] ) 264 | 265 | 266 | -------------------------------------------------------------------------------- /scripts/eval_widerface.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | from PIL import Image 4 | import sys 5 | from darknet import Darknet 6 | from utils import do_detect, plot_boxes, load_class_names 7 | 8 | def save_boxes(img, boxes, savename): 9 | fp = open(savename, 'w') 10 | filename = os.path.basename(savename) 11 | filename = os.path.splitext(filename)[0] 12 | fp.write('%s\n' % filename) 13 | fp.write('%d\n' % len(boxes)) 14 | width = img.width 15 | height = img.height 16 | for box in boxes: 17 | x1 = round((box[0] - box[2]/2.0) * width) 18 | y1 = round((box[1] - box[3]/2.0) * height) 19 | x2 = round((box[0] + box[2]/2.0) * width) 20 | y2 = round((box[1] + box[3]/2.0) * height) 21 | w = x2 - x1 22 | h = y2 - y1 23 | conf = box[4] 24 | fp.write('%d %d %d %d %f\n' % (x1, y1, w, h, conf)) 25 | fp.close() 26 | 27 | def eval_widerface(cfgfile, weightfile, valdir, savedir): 28 | m = Darknet(cfgfile) 29 | m.load_weights(weightfile) 30 | use_cuda = 1 31 | if use_cuda: 32 | m.cuda() 33 | 34 | scale_size = 16 35 | class_names = load_class_names('data/names') 36 | for parent,dirnames,filenames in os.walk(valdir): 37 | if parent != valdir: 38 | targetdir = os.path.join(savedir, os.path.basename(parent)) 39 | if not os.path.isdir(targetdir): 40 | os.mkdir(targetdir) 41 | for filename in filenames: 42 | imgfile = os.path.join(parent,filename) 43 
| img = Image.open(imgfile).convert('RGB') 44 | sized_width = int(round(img.width*1.0/scale_size) * 16) 45 | sized_height = int(round(img.height*1.0/scale_size) * 16) 46 | sized = img.resize((sized_width, sized_height)) 47 | print(filename, img.width, img.height, sized_width, sized_height) 48 | if sized_width * sized_height > 1024 * 2560: 49 | print('omit %s' % filename) 50 | continue 51 | boxes = do_detect(m, sized, 0.05, 0.4, use_cuda) 52 | if True: 53 | savename = os.path.join(targetdir, filename) 54 | print('save to %s' % savename) 55 | plot_boxes(img, boxes, savename, class_names) 56 | if True: 57 | savename = os.path.join(targetdir, os.path.splitext(filename)[0]+".txt") 58 | print('save to %s' % savename) 59 | save_boxes(img, boxes, savename) 60 | 61 | if __name__ == '__main__': 62 | #eval_widerface('resnet50_test.cfg', 'resnet50_98000.weights', 'widerface/WIDER_val/images/', 'widerface/wider_val_pred/') 63 | #eval_widerface('resnet50_test.cfg', 'resnet50_148000.weights', 'widerface/WIDER_val/images/', 'widerface/wider_val_pred/') 64 | eval_widerface('resnet50_x32_test.cfg', 'resnet50_x32_288000.weights', 'widerface/WIDER_val/images/', 'widerface/wider_val_pred/') 65 | 66 | -------------------------------------------------------------------------------- /scripts/my_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import os,sys 8 | #import cPickle 9 | import _pickle as cPickle 10 | import numpy as np 11 | from scripts.eval_ap import parse_rec 12 | from scripts.eval_all import get_image_xml_name 13 | from utils import load_class_names 14 | 15 | 16 | def compute_ap(rec, prec, use_07_metric=False): 17 | """ ap = compute_ap(rec, prec, [use_07_metric]) 18 | Compute VOC AP given precision and recall. 19 | If use_07_metric is true, uses the 20 | VOC 07 11 point method (default:False). 21 | """ 22 | if use_07_metric: 23 | # 11 point metric 24 | ap = 0. 25 | for t in np.arange(0., 1.1, 0.1): 26 | if np.sum(rec >= t) == 0: 27 | p = 0 28 | else: 29 | p = np.max(prec[rec >= t]) 30 | ap = ap + p / 11. 31 | else: 32 | # correct AP calculation 33 | # first append sentinel values at the end 34 | mrec = np.concatenate(([0.], rec, [1.])) 35 | mpre = np.concatenate(([0.], prec, [0.])) 36 | 37 | # compute the precision envelope 38 | for i in range(mpre.size - 1, 0, -1): 39 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 40 | 41 | # to calculate area under PR curve, look for points 42 | # where X axis (recall) changes value 43 | i = np.where(mrec[1:] != mrec[:-1])[0] 44 | 45 | # and sum (\Delta recall) * prec 46 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 47 | return ap 48 | 49 | def my_eval(detpath, imagesetfile, classname, cachedir, 50 | ovthresh=0.5, use_07_metric=False): 51 | """rec, prec, ap = my_eval(detpath, 52 | imagesetfile, 53 | classname, 54 | [ovthresh], 55 | [use_07_metric]) 56 | 57 | Top level function that does the PASCAL VOC evaluation. 58 | 59 | detpath: Path to detections 60 | detpath.format(classname) should produce the detection results file. 61 | annopath: Path to annotations 62 | annopath.format(imagename) should be the xml annotations file. 63 | imagesetfile: Text file containing the list of images, one image per line. 
64 | classname: Category name (duh) 65 | cachedir: Directory for caching the annotations 66 | [ovthresh]: Overlap threshold (default = 0.5) 67 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 68 | (default False) 69 | """ 70 | # assumes detections are in detpath.format(classname) 71 | # assumes annotations are in annopath.format(imagename) 72 | # assumes imagesetfile is a text file with each line an image name 73 | # cachedir caches the annotations in a pickle file 74 | 75 | # first load gt 76 | if not os.path.isdir(cachedir): 77 | os.mkdir(cachedir) 78 | cachefile = os.path.join(cachedir, 'annots.pkl') 79 | # read list of images 80 | with open(imagesetfile, 'r') as f: 81 | lines = f.readlines() 82 | imagenames = [x.strip() for x in lines] 83 | 84 | if not os.path.isfile(cachefile): 85 | # load annots 86 | recs = {} 87 | for i, imagename in enumerate(imagenames): 88 | imagekey = os.path.basename(imagename).split('.')[0] 89 | recs[imagekey] = parse_rec(get_image_xml_name(imagename)) 90 | if i % 100 == 0: 91 | print ('Reading annotation for {:d}/{:d}'.format( 92 | i + 1, len(imagenames))) 93 | # save 94 | print ('Saving cached annotations to {:s}'.format(cachefile)) 95 | with open(cachefile, 'wb') as f: 96 | cPickle.dump(recs, f) 97 | else: 98 | # load 99 | with open(cachefile, 'rb') as f: 100 | recs = cPickle.load(f) 101 | 102 | # extract gt objects for this class 103 | class_recs = {} 104 | npos = 0 105 | for imagename in imagenames: 106 | imagekey = os.path.basename(imagename).split('.')[0] 107 | try: 108 | R = [obj for obj in recs[imagekey] if obj['name'] == classname] 109 | except: 110 | print("%s %s" % (imagename, imagekey)) 111 | exit(0) 112 | 113 | bbox = np.array([x['bbox'] for x in R]) 114 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 115 | det = [False] * len(R) 116 | npos = npos + sum(~difficult) 117 | class_recs[imagekey] = {'bbox': bbox, 118 | 'difficult': difficult, 119 | 'det': det} 120 | 121 | # read dets 122 | detfile = detpath.format(classname) 123 | with open(detfile, 'r') as f: 124 | lines = f.readlines() 125 | 126 | splitlines = [x.strip().split(' ') for x in lines] 127 | image_ids = [x[0] for x in splitlines] 128 | confidence = np.array([float(x[1]) for x in splitlines]) 129 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 130 | 131 | # sort by confidence 132 | sorted_ind = np.argsort(-confidence) 133 | sorted_scores = np.sort(-confidence) 134 | if len(sorted_ind) > 0: 135 | BB = BB[sorted_ind, :] 136 | image_ids = [image_ids[x] for x in sorted_ind] 137 | 138 | # go down dets and mark TPs and FPs 139 | nd = len(image_ids) 140 | tp = np.zeros(nd) 141 | fp = np.zeros(nd) 142 | for d in range(nd): 143 | R = class_recs[image_ids[d]] 144 | bb = BB[d, :].astype(float) 145 | ovmax = -np.inf 146 | BBGT = R['bbox'].astype(float) 147 | 148 | if BBGT.size > 0: 149 | # compute overlaps 150 | # intersection 151 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 152 | iymin = np.maximum(BBGT[:, 1], bb[1]) 153 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 154 | iymax = np.minimum(BBGT[:, 3], bb[3]) 155 | iw = np.maximum(ixmax - ixmin + 1., 0.) 156 | ih = np.maximum(iymax - iymin + 1., 0.) 157 | inters = iw * ih 158 | 159 | # union 160 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 161 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 162 | (BBGT[:, 3] - BBGT[:, 1] + 1.) 
- inters) 163 | 164 | overlaps = inters / uni 165 | ovmax = np.max(overlaps) 166 | jmax = np.argmax(overlaps) 167 | 168 | if ovmax > ovthresh: 169 | if not R['difficult'][jmax]: 170 | if not R['det'][jmax]: 171 | tp[d] = 1. 172 | R['det'][jmax] = 1 173 | else: 174 | fp[d] = 1. 175 | else: 176 | fp[d] = 1. 177 | 178 | # compute precision recall 179 | fp = np.cumsum(fp) 180 | tp = np.cumsum(tp) 181 | rec = tp / float(npos) 182 | # avoid divide by zero in case the first detection matches a difficult 183 | # ground truth 184 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 185 | ap = compute_ap(rec, prec, use_07_metric) 186 | 187 | #print('class: {:<10s} \t num occurrence: {:4d}'.format(classname, npos)) 188 | 189 | return rec, prec, ap, npos 190 | 191 | 192 | 193 | def _do_python_eval(res_prefix, imagesetfile, classesfile, output_dir = 'output'): 194 | 195 | filename = res_prefix + '{:s}.txt' 196 | 197 | cachedir = os.path.join(output_dir, 'annotations_cache') 198 | aps = [] 199 | # The PASCAL VOC metric changed in 2010 200 | use_07_metric = False 201 | #print ('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 202 | if not os.path.isdir(output_dir): 203 | os.mkdir(output_dir) 204 | 205 | _classes = load_class_names(classesfile) 206 | total = 0 207 | for i, cls in enumerate(_classes): 208 | if cls == '__background__': 209 | continue 210 | 211 | rec, prec, ap, noccur = my_eval( 212 | filename, imagesetfile, cls, cachedir, ovthresh=0.5, 213 | use_07_metric=use_07_metric) 214 | aps += [ap] 215 | total += noccur 216 | print('AP for {:<10s} = {:.4f} with {:4d} views'.format(cls, ap, noccur)) 217 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 218 | cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 219 | 220 | print('Mean AP = {:.4f} with total {:4d} views'.format(np.mean(aps), total)) 221 | 222 | print('~'*30) 223 | print(' '*10, 'Results:') 224 | print('-'*30) 225 | for i, ap in enumerate(aps): 226 | print('{:<10s}\t{:.3f}'.format(_classes[i], ap)) 227 | print('='*30) 228 | print('{:^10s}\t{:.3f}'.format('Average', np.mean(aps))) 229 | print('~'*30) 230 | print('') 231 | print('--------------------------------------------------------------') 232 | print('Results computed with the **unofficial** Python eval code.') 233 | print('Results should be very close to the official MATLAB eval code.') 234 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 235 | print('-- Thanks, The Management') 236 | print('--------------------------------------------------------------') 237 | 238 | 239 | if __name__ == '__main__': 240 | #res_prefixc = '/data/hongji/darknet/results/comp4_det_test_' 241 | #res_prefix = 'results/comp4_det_test_' 242 | #test_file = 'data/sketch_test.txt' 243 | #class_names = 'data/sketch.names' 244 | res_prefix = sys.argv[1] 245 | test_file = sys.argv[2] 246 | class_names = sys.argv[3] 247 | _do_python_eval(res_prefix, test_file, class_names, output_dir = 'output') 248 | 249 | 250 | -------------------------------------------------------------------------------- /scripts/voc_eval.py_old_version_: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os,sys 9 | import cPickle 10 | import numpy as np 11 | 12 | def 
parse_rec(filename): 13 | """ Parse a PASCAL VOC xml file """ 14 | tree = ET.parse(filename) 15 | objects = [] 16 | for obj in tree.findall('object'): 17 | obj_struct = {} 18 | obj_struct['name'] = obj.find('name').text 19 | obj_struct['pose'] = obj.find('pose').text 20 | obj_struct['truncated'] = int(obj.find('truncated').text) 21 | obj_struct['difficult'] = int(obj.find('difficult').text) 22 | bbox = obj.find('bndbox') 23 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 24 | int(bbox.find('ymin').text), 25 | int(bbox.find('xmax').text), 26 | int(bbox.find('ymax').text)] 27 | objects.append(obj_struct) 28 | 29 | return objects 30 | 31 | def voc_ap(rec, prec, use_07_metric=False): 32 | """ ap = voc_ap(rec, prec, [use_07_metric]) 33 | Compute VOC AP given precision and recall. 34 | If use_07_metric is true, uses the 35 | VOC 07 11 point method (default:False). 36 | """ 37 | if use_07_metric: 38 | # 11 point metric 39 | ap = 0. 40 | for t in np.arange(0., 1.1, 0.1): 41 | if np.sum(rec >= t) == 0: 42 | p = 0 43 | else: 44 | p = np.max(prec[rec >= t]) 45 | ap = ap + p / 11. 46 | else: 47 | # correct AP calculation 48 | # first append sentinel values at the end 49 | mrec = np.concatenate(([0.], rec, [1.])) 50 | mpre = np.concatenate(([0.], prec, [0.])) 51 | 52 | # compute the precision envelope 53 | for i in range(mpre.size - 1, 0, -1): 54 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 55 | 56 | # to calculate area under PR curve, look for points 57 | # where X axis (recall) changes value 58 | i = np.where(mrec[1:] != mrec[:-1])[0] 59 | 60 | # and sum (\Delta recall) * prec 61 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 62 | return ap 63 | 64 | def voc_eval(detpath, 65 | annopath, 66 | imagesetfile, 67 | classname, 68 | cachedir, 69 | ovthresh=0.5, 70 | use_07_metric=False): 71 | """rec, prec, ap = voc_eval(detpath, 72 | annopath, 73 | imagesetfile, 74 | classname, 75 | [ovthresh], 76 | [use_07_metric]) 77 | 78 | Top level function that does the PASCAL VOC evaluation. 79 | 80 | detpath: Path to detections 81 | detpath.format(classname) should produce the detection results file. 82 | annopath: Path to annotations 83 | annopath.format(imagename) should be the xml annotations file. 84 | imagesetfile: Text file containing the list of images, one image per line. 
85 | classname: Category name (duh) 86 | cachedir: Directory for caching the annotations 87 | [ovthresh]: Overlap threshold (default = 0.5) 88 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 89 | (default False) 90 | """ 91 | # assumes detections are in detpath.format(classname) 92 | # assumes annotations are in annopath.format(imagename) 93 | # assumes imagesetfile is a text file with each line an image name 94 | # cachedir caches the annotations in a pickle file 95 | 96 | # first load gt 97 | if not os.path.isdir(cachedir): 98 | os.mkdir(cachedir) 99 | cachefile = os.path.join(cachedir, 'annots.pkl') 100 | # read list of images 101 | with open(imagesetfile, 'r') as f: 102 | lines = f.readlines() 103 | imagenames = [x.strip() for x in lines] 104 | 105 | if not os.path.isfile(cachefile): 106 | # load annots 107 | recs = {} 108 | for i, imagename in enumerate(imagenames): 109 | recs[imagename] = parse_rec(annopath.format(imagename)) 110 | if i % 100 == 0: 111 | print 'Reading annotation for {:d}/{:d}'.format( 112 | i + 1, len(imagenames)) 113 | # save 114 | print 'Saving cached annotations to {:s}'.format(cachefile) 115 | with open(cachefile, 'w') as f: 116 | cPickle.dump(recs, f) 117 | else: 118 | # load 119 | with open(cachefile, 'r') as f: 120 | recs = cPickle.load(f) 121 | 122 | # extract gt objects for this class 123 | class_recs = {} 124 | npos = 0 125 | for imagename in imagenames: 126 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 127 | bbox = np.array([x['bbox'] for x in R]) 128 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 129 | det = [False] * len(R) 130 | npos = npos + sum(~difficult) 131 | class_recs[imagename] = {'bbox': bbox, 132 | 'difficult': difficult, 133 | 'det': det} 134 | 135 | # read dets 136 | detfile = detpath.format(classname) 137 | with open(detfile, 'r') as f: 138 | lines = f.readlines() 139 | 140 | splitlines = [x.strip().split(' ') for x in lines] 141 | image_ids = [x[0] for x in splitlines] 142 | confidence = np.array([float(x[1]) for x in splitlines]) 143 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 144 | 145 | # sort by confidence 146 | sorted_ind = np.argsort(-confidence) 147 | sorted_scores = np.sort(-confidence) 148 | BB = BB[sorted_ind, :] 149 | image_ids = [image_ids[x] for x in sorted_ind] 150 | 151 | # go down dets and mark TPs and FPs 152 | nd = len(image_ids) 153 | tp = np.zeros(nd) 154 | fp = np.zeros(nd) 155 | for d in range(nd): 156 | R = class_recs[image_ids[d]] 157 | bb = BB[d, :].astype(float) 158 | ovmax = -np.inf 159 | BBGT = R['bbox'].astype(float) 160 | 161 | if BBGT.size > 0: 162 | # compute overlaps 163 | # intersection 164 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 165 | iymin = np.maximum(BBGT[:, 1], bb[1]) 166 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 167 | iymax = np.minimum(BBGT[:, 3], bb[3]) 168 | iw = np.maximum(ixmax - ixmin + 1., 0.) 169 | ih = np.maximum(iymax - iymin + 1., 0.) 170 | inters = iw * ih 171 | 172 | # union 173 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 174 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 175 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 176 | 177 | overlaps = inters / uni 178 | ovmax = np.max(overlaps) 179 | jmax = np.argmax(overlaps) 180 | 181 | if ovmax > ovthresh: 182 | if not R['difficult'][jmax]: 183 | if not R['det'][jmax]: 184 | tp[d] = 1. 185 | R['det'][jmax] = 1 186 | else: 187 | fp[d] = 1. 188 | else: 189 | fp[d] = 1. 
190 | 191 | # compute precision recall 192 | fp = np.cumsum(fp) 193 | tp = np.cumsum(tp) 194 | rec = tp / float(npos) 195 | # avoid divide by zero in case the first detection matches a difficult 196 | # ground truth 197 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 198 | ap = voc_ap(rec, prec, use_07_metric) 199 | 200 | return rec, prec, ap 201 | 202 | 203 | 204 | def _do_python_eval(res_prefix, output_dir = 'output'): 205 | _devkit_path = '/data/xiaohang/pytorch-yolo2/VOCdevkit' 206 | _year = '2007' 207 | _classes = ('__background__', # always index 0 208 | 'aeroplane', 'bicycle', 'bird', 'boat', 209 | 'bottle', 'bus', 'car', 'cat', 'chair', 210 | 'cow', 'diningtable', 'dog', 'horse', 211 | 'motorbike', 'person', 'pottedplant', 212 | 'sheep', 'sofa', 'train', 'tvmonitor') 213 | 214 | #filename = '/data/hongji/darknet/results/comp4_det_test_{:s}.txt' 215 | filename = res_prefix + '{:s}.txt' 216 | annopath = os.path.join( 217 | _devkit_path, 218 | 'VOC' + _year, 219 | 'Annotations', 220 | '{:s}.xml') 221 | imagesetfile = os.path.join( 222 | _devkit_path, 223 | 'VOC' + _year, 224 | 'ImageSets', 225 | 'Main', 226 | 'test.txt') 227 | cachedir = os.path.join(_devkit_path, 'annotations_cache') 228 | aps = [] 229 | # The PASCAL VOC metric changed in 2010 230 | use_07_metric = True if int(_year) < 2010 else False 231 | print 'VOC07 metric? ' + ('Yes' if use_07_metric else 'No') 232 | if not os.path.isdir(output_dir): 233 | os.mkdir(output_dir) 234 | for i, cls in enumerate(_classes): 235 | if cls == '__background__': 236 | continue 237 | 238 | rec, prec, ap = voc_eval( 239 | filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, 240 | use_07_metric=use_07_metric) 241 | aps += [ap] 242 | print('AP for {} = {:.4f}'.format(cls, ap)) 243 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'w') as f: 244 | cPickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 245 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 246 | print('~~~~~~~~') 247 | print('Results:') 248 | for ap in aps: 249 | print('{:.3f}'.format(ap)) 250 | print('{:.3f}'.format(np.mean(aps))) 251 | print('~~~~~~~~') 252 | print('') 253 | print('--------------------------------------------------------------') 254 | print('Results computed with the **unofficial** Python eval code.') 255 | print('Results should be very close to the official MATLAB eval code.') 256 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 257 | print('-- Thanks, The Management') 258 | print('--------------------------------------------------------------') 259 | 260 | 261 | if __name__ == '__main__': 262 | #res_prefix = '/data/hongji/darknet/project/voc/results/comp4_det_test_' 263 | res_prefix = sys.argv[1] 264 | _do_python_eval(res_prefix, output_dir = 'output') 265 | 266 | 267 | -------------------------------------------------------------------------------- /scripts/voc_label.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | import pickle 3 | import os 4 | from os import listdir, getcwd 5 | from os.path import join 6 | 7 | sets=[('2012', 'train'), ('2012', 'val'), ('2007', 'train'), ('2007', 'val'), ('2007', 'test')] 8 | 9 | classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"] 10 | 11 | 12 | def convert(size, box): 13 | dw = 1./size[0] 14 | dh = 1./size[1] 15 | x = (box[0] + box[1])/2.0 16 | 
y = (box[2] + box[3])/2.0 17 | w = box[1] - box[0] 18 | h = box[3] - box[2] 19 | x = x*dw 20 | w = w*dw 21 | y = y*dh 22 | h = h*dh 23 | return (x,y,w,h) 24 | 25 | def convert_annotation(year, image_id): 26 | in_file = open('VOCdevkit/VOC%s/Annotations/%s.xml'%(year, image_id)) 27 | out_file = open('VOCdevkit/VOC%s/labels/%s.txt'%(year, image_id), 'w') 28 | tree=ET.parse(in_file) 29 | root = tree.getroot() 30 | size = root.find('size') 31 | w = int(size.find('width').text) 32 | h = int(size.find('height').text) 33 | 34 | for obj in root.iter('object'): 35 | difficult = obj.find('difficult').text 36 | cls = obj.find('name').text 37 | if cls not in classes or int(difficult) == 1: 38 | continue 39 | cls_id = classes.index(cls) 40 | xmlbox = obj.find('bndbox') 41 | b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text), float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text)) 42 | bb = convert((w,h), b) 43 | out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n') 44 | 45 | wd = getcwd() 46 | 47 | for year, image_set in sets: 48 | if not os.path.exists('VOCdevkit/VOC%s/labels/'%(year)): 49 | os.makedirs('VOCdevkit/VOC%s/labels/'%(year)) 50 | image_ids = open('VOCdevkit/VOC%s/ImageSets/Main/%s.txt'%(year, image_set)).read().strip().split() 51 | list_file = open('%s_%s.txt'%(year, image_set), 'w') 52 | for image_id in image_ids: 53 | list_file.write('%s/VOCdevkit/VOC%s/JPEGImages/%s.jpg\n'%(wd, year, image_id)) 54 | convert_annotation(year, image_id) 55 | list_file.close() 56 | 57 | -------------------------------------------------------------------------------- /tools/lmdb/create_dataset.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import lmdb # install lmdb by "pip install lmdb" 4 | import cv2 5 | import numpy as np 6 | 7 | def checkImageIsValid(imageBin): 8 | if imageBin is None: 9 | return False 10 | imageBuf = np.fromstring(imageBin, dtype=np.uint8) 11 | img = cv2.imdecode(imageBuf, cv2.IMREAD_COLOR) 12 | imgH, imgW = img.shape[0], img.shape[1] 13 | if imgH * imgW == 0: 14 | return False 15 | return True 16 | 17 | 18 | def writeCache(env, cache): 19 | with env.begin(write=True) as txn: 20 | for k, v in cache.iteritems(): 21 | txn.put(k, v) 22 | 23 | 24 | def createDataset(outputPath, imageListFile, checkValid=True): 25 | """ 26 | Create LMDB dataset for CRNN training. 
27 | 28 | ARGS: 29 | outputPath : LMDB output path 30 | imagePathList : list of image path 31 | checkValid : if true, check the validity of every image 32 | """ 33 | with open(imageListFile) as fp: 34 | imagePathList = fp.readlines() 35 | nSamples = len(imagePathList) 36 | env = lmdb.open(outputPath, map_size=1099511627776) 37 | cache = {} 38 | cnt = 1 39 | for i in xrange(nSamples): 40 | imagePath = imagePathList[i].rstrip() 41 | labelPath = imagePath.replace('images', 'labels').replace('JPEGImages', 'labels').replace('.jpg', '.txt').replace('.png','.txt') 42 | with open(labelPath) as f: 43 | label = f.readlines() 44 | label = ''.join(label) 45 | 46 | if not os.path.exists(imagePath): 47 | print('%s does not exist' % imagePath) 48 | continue 49 | with open(imagePath, 'r') as f: 50 | imageBin = f.read() 51 | if checkValid: 52 | if not checkImageIsValid(imageBin): 53 | print('%s is not a valid image' % imagePath) 54 | continue 55 | 56 | imageKey = 'image-%09d' % cnt 57 | labelKey = 'label-%09d' % cnt 58 | cache[imageKey] = imageBin 59 | cache[labelKey] = label 60 | if cnt % 1000 == 0: 61 | writeCache(env, cache) 62 | cache = {} 63 | print('Written %d / %d' % (cnt, nSamples)) 64 | cnt += 1 65 | nSamples = cnt-1 66 | cache['num-samples'] = str(nSamples) 67 | writeCache(env, cache) 68 | print('Created dataset with %d samples' % nSamples) 69 | 70 | if __name__ == '__main__': 71 | outputPath = sys.argv[1] 72 | imageListFile = sys.argv[2] 73 | createDataset(outputPath, imageListFile, checkValid=True) 74 | -------------------------------------------------------------------------------- /tools/lmdb/lmdb_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # encoding: utf-8 3 | 4 | import os 5 | import random 6 | import torch 7 | import numpy as np 8 | from torch.utils.data import Dataset 9 | from PIL import Image 10 | from utils import read_truths_args, read_truths 11 | import cv2 12 | import lmdb 13 | 14 | class lmdbDataset(Dataset): 15 | 16 | def __init__(self, lmdb_root, shape=None, shuffle=True, transform=None, target_transform=None, train=False, seen=0): 17 | self.env = lmdb.open(lmdb_root, 18 | max_readers=1, 19 | readonly=True, 20 | lock=False, 21 | readahead=False, 22 | meminit=False) 23 | self.txn = self.env.begin(write=False) 24 | self.nSamples = int(self.txn.get('num-samples')) 25 | self.indices = range(self.nSamples) 26 | if shuffle: 27 | random.shuffle(self.indices) 28 | 29 | self.transform = transform 30 | self.target_transform = target_transform 31 | self.train = train 32 | self.shape = shape 33 | self.seen = seen 34 | #if self.train: 35 | # print('init seen to %d' % (self.seen)) 36 | 37 | def __len__(self): 38 | return self.nSamples 39 | 40 | def __getitem__(self, index): 41 | assert index <= len(self), 'index range error' 42 | imgkey = 'image-%09d' % (self.indices[index]+1) 43 | labkey = 'label-%09d' % (self.indices[index]+1) 44 | label = torch.zeros(50*5) 45 | 46 | imageBin = self.txn.get(imgkey) 47 | imageBuf = np.fromstring(imageBin, dtype=np.uint8) 48 | img = cv2.imdecode(imageBuf, cv2.IMREAD_COLOR) 49 | if self.train and index % 64 == 0: 50 | if self.seen < 4000*64*4: 51 | width = (random.randint(0,2)*2 + 13)*32 52 | self.shape = (width, width) 53 | elif self.seen < 8000*64*4: 54 | width = (random.randint(0,4)*2 + 9)*32 55 | self.shape = (width, width) 56 | elif self.seen < 12000*64*4: 57 | width = (random.randint(0,6)*2 + 5)*32 58 | self.shape = (width, width) 59 | elif self.seen < 12000*64*4: 60 | width = 
(random.randint(0,12) + 5)*32 61 | self.shape = (width, width) 62 | else: # self.seen < 20000*64*4: 63 | width = (random.randint(0,16) + 3)*32 64 | self.shape = (width, width) 65 | 66 | if self.shape: 67 | img = cv2.resize(img, self.shape, interpolation = cv2.INTER_CUBIC) 68 | 69 | tid = 0 70 | truths = self.txn.get(labkey).rstrip().split('\n') 71 | for truth in truths: 72 | truth = truth.split() 73 | tmp = [float(t) for t in truth] 74 | if tmp[3] > 8.0/img.shape[0]: 75 | label[tid*5+0] = tmp[0] 76 | label[tid*5+1] = tmp[1] 77 | label[tid*5+2] = tmp[2] 78 | label[tid*5+3] = tmp[3] 79 | label[tid*5+4] = tmp[4] 80 | tid = tid + 1 81 | 82 | width = img.shape[0] 83 | height = img.shape[1] 84 | img = torch.from_numpy(img) 85 | img = img.view(height, width, 3).transpose(0,1).transpose(0,2).contiguous() 86 | img = img.view(1, 3, height, width) 87 | img = img.float().div(255.0) 88 | 89 | if self.transform is not None: 90 | img = self.transform(img) 91 | 92 | if self.target_transform is not None: 93 | label = self.target_transform(label) 94 | 95 | self.seen = self.seen + 4 96 | return (img, label) 97 | 98 | def lmdb_nsamples(db): 99 | env = lmdb.open(db, 100 | max_readers=1, 101 | readonly=True, 102 | lock=False, 103 | readahead=False, 104 | meminit=False) 105 | 106 | with env.begin(write=False) as txn: 107 | nSamples = int(txn.get('num-samples')) 108 | return nSamples 109 | 110 | -------------------------------------------------------------------------------- /tools/lmdb/plot_lmdb.py: -------------------------------------------------------------------------------- 1 | import lmdb 2 | import cv2 3 | import numpy as np 4 | 5 | env = lmdb.open('data/face_test_lmdb', 6 | max_readers=1, 7 | readonly=True, 8 | lock=False, 9 | readahead=False, 10 | meminit=False) 11 | 12 | with env.begin(write=False) as txn: 13 | nSamples = int(txn.get('num-samples')) 14 | #print nSamples 15 | for index in range(nSamples): 16 | image_key = 'image-%09d' % (index+1) 17 | label_key = 'label-%09d' % (index+1) 18 | imageBin = txn.get(image_key) 19 | imageBuf = np.fromstring(imageBin, dtype=np.uint8) 20 | img = cv2.imdecode(imageBuf, cv2.IMREAD_COLOR) 21 | imgH, imgW = img.shape[0], img.shape[1] 22 | labels = txn.get(label_key).rstrip().split('\n') 23 | for label in labels: 24 | label = label.split() 25 | box = [float(i) for i in label] 26 | x = box[1]*imgW 27 | y = box[2]*imgH 28 | w = box[3]*imgW 29 | h = box[4]*imgH 30 | x1 = int(x - w/2.0) 31 | x2 = int(x + w/2.0) 32 | y1 = int(y - h/2.0) 33 | y2 = int(y + h/2.0) 34 | cv2.rectangle(img, (x1,y1), (x2,y2), (255,0,0), 3) 35 | savename = 'tmp/%s.png'%(image_key) 36 | print('save %s' % (savename)) 37 | cv2.imwrite(savename, img) 38 | 39 | -------------------------------------------------------------------------------- /tools/lmdb/train_lmdb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | if len(sys.argv) != 4: 4 | print('Usage:') 5 | print('python train.py datacfg cfgfile weightfile') 6 | exit() 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | import torch.optim as optim 12 | import torch.backends.cudnn as cudnn 13 | from torchvision import datasets, transforms 14 | from torch.autograd import Variable 15 | 16 | import lmdb_utils 17 | import random 18 | import math 19 | from utils import * 20 | from cfg import parse_cfg 21 | from region_loss import RegionLoss 22 | from darknet import Darknet 23 | 24 | 25 | # Training settings 26 | datacfg = 
sys.argv[1] 27 | cfgfile = sys.argv[2] 28 | weightfile = sys.argv[3] 29 | 30 | data_options = read_data_cfg(datacfg) 31 | net_options = parse_cfg(cfgfile)[0] 32 | 33 | traindb = data_options['train'] 34 | testdb = data_options['valid'] 35 | backupdir = data_options['backup'] 36 | nsamples = lmdb_utils.lmdb_nsamples(traindb) 37 | 38 | batch_size = int(net_options['batch']) 39 | max_batches = int(net_options['max_batches']) 40 | learning_rate = float(net_options['learning_rate']) 41 | momentum = float(net_options['momentum']) 42 | 43 | max_epochs = max_batches*batch_size/nsamples+1 44 | use_cuda = True 45 | seed = 22222 46 | eps = 1e-5 47 | 48 | ############### 49 | torch.manual_seed(seed) 50 | if use_cuda: 51 | torch.cuda.manual_seed(seed) 52 | 53 | model = Darknet(cfgfile) 54 | region_loss = model.loss 55 | 56 | model.load_weights(weightfile) 57 | model.print_network() 58 | init_epoch = model.seen / nsamples 59 | 60 | kwargs = {'num_workers': 8, 'pin_memory': True} if use_cuda else {} 61 | test_loader = torch.utils.data.DataLoader( 62 | lmdb_utils.lmdbDataset(testdb, shape=(160, 160), 63 | shuffle=False, 64 | transform=None, 65 | train=False), 66 | batch_size=batch_size, shuffle=False, **kwargs) 67 | 68 | if use_cuda: 69 | model = torch.nn.DataParallel(model).cuda() 70 | 71 | optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum) 72 | 73 | def adjust_learning_rate(optimizer, epoch): 74 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 75 | lr = learning_rate * (0.1 ** (epoch // 50)) 76 | for param_group in optimizer.param_groups: 77 | param_group['lr'] = lr 78 | logging('set lr=%f' % (lr)) 79 | 80 | def train(epoch): 81 | train_loader = torch.utils.data.DataLoader( 82 | lmdb_utils.lmdbDataset(traindb, shape=(model.module.width, model.module.height), 83 | shuffle=True, 84 | train=True, seen=model.module.seen), 85 | batch_size=batch_size, shuffle=False, **kwargs) 86 | 87 | logging('epoch %d : processed %d samples' % (epoch, epoch * len(train_loader.dataset))) 88 | model.train() 89 | adjust_learning_rate(optimizer, epoch) 90 | for batch_idx, (data, target) in enumerate(train_loader): 91 | if (batch_idx+1) % 70 == 0: 92 | sys.stdout.write('.') 93 | 94 | if use_cuda: 95 | data = data.cuda() 96 | #target= target.cuda() 97 | data, target = Variable(data), Variable(target) 98 | optimizer.zero_grad() 99 | output = model(data) 100 | loss = region_loss(output, target) 101 | loss.backward() 102 | optimizer.step() 103 | print('') 104 | logging('save weights to %s/%06d.weights' % (backupdir, epoch+1)) 105 | model.module.seen = (epoch + 1) * len(train_loader.dataset) 106 | model.module.save_weights('%s/%06d.weights' % (backupdir, epoch+1)) 107 | 108 | def test(epoch): 109 | def truths_length(truths): 110 | for i in range(50): 111 | if truths[i][1] == 0: 112 | return i 113 | 114 | model.eval() 115 | num_classes = model.module.num_classes 116 | anchors = model.module.anchors 117 | num_anchors = model.module.num_anchors 118 | conf_thresh = 0.25 119 | nms_thresh = 0.4 120 | iou_thresh = 0.5 121 | total = 0.0 122 | proposals = 0.0 123 | correct = 0.0 124 | 125 | for batch_idx, (data, target) in enumerate(test_loader): 126 | if use_cuda: 127 | data = data.cuda() 128 | data = Variable(data, volatile=True) 129 | output = model(data).data 130 | all_boxes = get_region_boxes(output, conf_thresh, num_classes, anchors, num_anchors) 131 | if output.size(0) == 1: 132 | all_boxes = [all_boxes] 133 | for i in range(output.size(0)): 134 | boxes = all_boxes[i] 135 | 
boxes = nms(boxes, nms_thresh) 136 | truths = target[i].view(-1, 5) 137 | num_gts = truths_length(truths) 138 | 139 | total = total + num_gts 140 | 141 | for i in range(len(boxes)): 142 | if boxes[i][4] > conf_thresh: 143 | proposals = proposals+1 144 | 145 | for i in range(num_gts): 146 | box_gt = [truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0] 147 | best_iou = 0 148 | for j in range(len(boxes)): 149 | iou = bbox_iou(box_gt, boxes[j], x1y1x2y2=False) 150 | best_iou = max(iou, best_iou) 151 | if best_iou > iou_thresh: 152 | correct = correct+1 153 | 154 | precision = 1.0*correct/(proposals+eps) 155 | recall = 1.0*correct/(total+eps) 156 | fscore = 2.0*precision*recall/(precision+recall+eps) 157 | logging("precision: %f, recall: %f, fscore: %f" % (precision, recall, fscore)) 158 | 159 | evaluate = True 160 | if evaluate: 161 | print('evaluating ...') 162 | test(0) 163 | else: 164 | for epoch in range(init_epoch, max_epochs): 165 | train(epoch) 166 | test(epoch) 167 | -------------------------------------------------------------------------------- /valid.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from darknet import Darknet 3 | import dataset 4 | from torchvision import datasets, transforms 5 | from utils import get_all_boxes, bbox_iou, nms, read_data_cfg, load_class_names 6 | from image import correct_yolo_boxes 7 | import os 8 | 9 | def valid(datacfg, cfgfile, weightfile, outfile): 10 | options = read_data_cfg(datacfg) 11 | valid_images = options['valid'] 12 | name_list = options['names'] 13 | prefix = 'results' 14 | names = load_class_names(name_list) 15 | 16 | with open(valid_images) as fp: 17 | tmp_files = fp.readlines() 18 | valid_files = [item.rstrip() for item in tmp_files] 19 | 20 | m = Darknet(cfgfile) 21 | m.print_network() 22 | m.load_weights(weightfile) 23 | m.cuda() 24 | m.eval() 25 | 26 | valid_dataset = dataset.listDataset(valid_images, shape=(m.width, m.height), 27 | shuffle=False, 28 | transform=transforms.Compose([ 29 | transforms.ToTensor(), 30 | ])) 31 | valid_batchsize = 2 32 | assert(valid_batchsize > 1) 33 | 34 | kwargs = {'num_workers': 4, 'pin_memory': True} 35 | valid_loader = torch.utils.data.DataLoader( 36 | valid_dataset, batch_size=valid_batchsize, shuffle=False, **kwargs) 37 | 38 | fps = [0]*m.num_classes 39 | if not os.path.exists('results'): 40 | os.mkdir('results') 41 | for i in range(m.num_classes): 42 | buf = '%s/%s%s.txt' % (prefix, outfile, names[i]) 43 | fps[i] = open(buf, 'w') 44 | 45 | lineId = -1 46 | 47 | conf_thresh = 0.005 48 | nms_thresh = 0.45 49 | if m.net_name() == 'region': # region_layer 50 | shape=(0,0) 51 | else: 52 | shape=(m.width, m.height) 53 | for _, (data, target, org_w, org_h) in enumerate(valid_loader): 54 | data = data.cuda() 55 | output = m(data) 56 | batch_boxes = get_all_boxes(output, shape, conf_thresh, m.num_classes, only_objectness=0, validation=True) 57 | 58 | for i in range(len(batch_boxes)): 59 | lineId += 1 60 | fileId = os.path.basename(valid_files[lineId]).split('.')[0] 61 | #width, height = get_image_size(valid_files[lineId]) 62 | width, height = float(org_w[i]), float(org_h[i]) 63 | print(valid_files[lineId]) 64 | boxes = batch_boxes[i] 65 | correct_yolo_boxes(boxes, width, height, m.width, m.height) 66 | boxes = nms(boxes, nms_thresh) 67 | for box in boxes: 68 | x1 = (box[0] - box[2]/2.0) * width 69 | y1 = (box[1] - box[3]/2.0) * height 70 | x2 = (box[0] + box[2]/2.0) * width 71 | y2 = (box[1] + box[3]/2.0) * height 72 | 73 | det_conf = 
box[4] 74 | for j in range((len(box)-5)//2): 75 | cls_conf = box[5+2*j] 76 | cls_id = int(box[6+2*j]) 77 | prob = det_conf * cls_conf 78 | fps[cls_id].write('%s %f %f %f %f %f\n' % (fileId, prob, x1, y1, x2, y2)) 79 | 80 | for i in range(m.num_classes): 81 | fps[i].close() 82 | 83 | if __name__ == '__main__': 84 | import sys 85 | if len(sys.argv) == 4: 86 | datacfg = sys.argv[1] 87 | cfgfile = sys.argv[2] 88 | weightfile = sys.argv[3] 89 | outfile = 'comp4_det_test_' 90 | valid(datacfg, cfgfile, weightfile, outfile) 91 | else: 92 | print('Usage:') 93 | print(' python valid.py datacfg cfgfile weightfile') 94 | -------------------------------------------------------------------------------- /yolo_layer.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | import sys 4 | import time 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from utils import bbox_iou, multi_bbox_ious, convert2cpu 9 | 10 | class YoloLayer(nn.Module): 11 | def __init__(self, anchor_mask=[], num_classes=0, anchors=[1.0], num_anchors=1, use_cuda=None): 12 | super(YoloLayer, self).__init__() 13 | use_cuda = torch.cuda.is_available() and (True if use_cuda is None else use_cuda) 14 | self.device = torch.device("cuda" if use_cuda else "cpu") 15 | 16 | self.anchor_mask = anchor_mask 17 | self.num_classes = num_classes 18 | self.anchors = anchors 19 | self.num_anchors = num_anchors 20 | self.anchor_step = len(anchors)//num_anchors 21 | self.rescore = 1 22 | self.ignore_thresh = 0.5 23 | self.truth_thresh = 1. 24 | self.nth_layer = 0 25 | self.seen = 0 26 | self.net_width = 0 27 | self.net_height = 0 28 | 29 | def get_mask_boxes(self, output): 30 | masked_anchors = [] 31 | for m in self.anchor_mask: 32 | masked_anchors += self.anchors[m*self.anchor_step:(m+1)*self.anchor_step] 33 | 34 | masked_anchors = torch.FloatTensor(masked_anchors).to(self.device) 35 | num_anchors = torch.IntTensor([len(self.anchor_mask)]).to(self.device) 36 | return {'x':output, 'a':masked_anchors, 'n':num_anchors} 37 | 38 | def build_targets(self, pred_boxes, target, anchors, nA, nH, nW): 39 | nB = target.size(0) 40 | anchor_step = anchors.size(1) # anchors[nA][anchor_step] 41 | noobj_mask = torch.ones (nB, nA, nH, nW) 42 | obj_mask = torch.zeros(nB, nA, nH, nW) 43 | coord_mask = torch.zeros(nB, nA, nH, nW) 44 | tcoord = torch.zeros( 4, nB, nA, nH, nW) 45 | tconf = torch.zeros(nB, nA, nH, nW) 46 | tcls = torch.zeros(nB, nA, nH, nW, self.num_classes) 47 | 48 | nAnchors = nA*nH*nW 49 | nPixels = nH*nW 50 | nGT = 0 51 | nRecall = 0 52 | nRecall75 = 0 53 | 54 | # it works faster on CPU than on GPU. 
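        # Targets are built in two passes over the (up to 50) ground-truth boxes of each image:
        # first, every predicted box is compared against all ground truths and noobj_mask is
        # cleared where the best IoU exceeds ignore_thresh, so those predictions are not pushed
        # towards zero objectness; second, each ground truth is assigned to the best-matching
        # anchor at its grid cell and the coord/conf/class targets are filled for that single
        # responsible prediction.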
55 |         anchors = anchors.to("cpu")
56 | 
57 |         for b in range(nB):
58 |             cur_pred_boxes = pred_boxes[b*nAnchors:(b+1)*nAnchors].t()
59 |             cur_ious = torch.zeros(nAnchors)
60 |             tbox = target[b].view(-1,5).to("cpu")
61 | 
62 |             for t in range(50):
63 |                 if tbox[t][1] == 0:
64 |                     break
65 |                 gx, gy = tbox[t][1] * nW, tbox[t][2] * nH
66 |                 gw, gh = tbox[t][3] * self.net_width, tbox[t][4] * self.net_height
67 |                 cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors,1).t()
68 |                 cur_ious = torch.max(cur_ious, multi_bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False))
69 |             ignore_ix = (cur_ious>self.ignore_thresh).view(nA,nH,nW)
70 |             noobj_mask[b][ignore_ix] = 0
71 | 
72 |             for t in range(50):
73 |                 if tbox[t][1] == 0:
74 |                     break
75 |                 nGT += 1
76 |                 gx, gy = tbox[t][1] * nW, tbox[t][2] * nH
77 |                 gw, gh = tbox[t][3] * self.net_width, tbox[t][4] * self.net_height
78 |                 gw, gh = gw.float(), gh.float()
79 |                 gi, gj = int(gx), int(gy)
80 | 
81 |                 tmp_gt_boxes = torch.FloatTensor([0, 0, gw, gh]).repeat(nA,1).t()
82 |                 anchor_boxes = torch.cat((torch.zeros(nA, anchor_step), anchors),1).t()
83 |                 _, best_n = torch.max(multi_bbox_ious(anchor_boxes, tmp_gt_boxes, x1y1x2y2=False), 0)
84 | 
85 |                 gt_box = torch.FloatTensor([gx, gy, gw, gh])
86 |                 pred_box = pred_boxes[b*nAnchors+best_n*nPixels+gj*nW+gi]
87 |                 iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False)
88 | 
89 |                 obj_mask [b][best_n][gj][gi] = 1
90 |                 noobj_mask[b][best_n][gj][gi] = 0
91 |                 coord_mask[b][best_n][gj][gi] = 2. - tbox[t][3]*tbox[t][4]
92 |                 tcoord [0][b][best_n][gj][gi] = gx - gi
93 |                 tcoord [1][b][best_n][gj][gi] = gy - gj
94 |                 tcoord [2][b][best_n][gj][gi] = math.log(gw/anchors[best_n][0])
95 |                 tcoord [3][b][best_n][gj][gi] = math.log(gh/anchors[best_n][1])
96 |                 tcls [b][best_n][gj][gi][int(tbox[t][0])] = 1
97 |                 tconf [b][best_n][gj][gi] = iou if self.rescore else 1.
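                # count how many responsible predictions already overlap their ground truth at
                # IoU > 0.5 / 0.75; these are reported as nRC / nRC75 in the training log.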
98 | 
99 |                 if iou > 0.5:
100 |                     nRecall += 1
101 |                     if iou > 0.75:
102 |                         nRecall75 += 1
103 | 
104 |         return nGT, nRecall, nRecall75, obj_mask, noobj_mask, coord_mask, tcoord, tconf, tcls
105 | 
106 |     def forward(self, output, target):
107 |         #output : BxAs*(4+1+num_classes)*H*W
108 |         mask_tuple = self.get_mask_boxes(output)
109 |         t0 = time.time()
110 |         nB = output.data.size(0) # batch size
111 |         nA = mask_tuple['n'].item() # num_anchors
112 |         nC = self.num_classes
113 |         nH = output.data.size(2)
114 |         nW = output.data.size(3)
115 |         anchor_step = mask_tuple['a'].size(0)//nA
116 |         anchors = mask_tuple['a'].view(nA, anchor_step).to(self.device)
117 |         cls_anchor_dim = nB*nA*nH*nW
118 | 
119 |         output = output.view(nB, nA, (5+nC), nH, nW)
120 |         cls_grid = torch.linspace(5,5+nC-1,nC).long().to(self.device)
121 |         ix = torch.LongTensor(range(0,5)).to(self.device)
122 |         pred_boxes = torch.FloatTensor(4, cls_anchor_dim).to(self.device)
123 | 
124 |         coord = output.index_select(2, ix[0:4]).view(nB*nA, -1, nH*nW).transpose(0,1).contiguous().view(-1,cls_anchor_dim) # x, y, w, h
125 |         coord[0:2] = coord[0:2].sigmoid()
126 |         conf = output.index_select(2, ix[4]).view(cls_anchor_dim).sigmoid()
127 | 
128 |         cls = output.index_select(2, cls_grid)
129 |         cls = cls.view(nB*nA, nC, nH*nW).transpose(1,2).contiguous().view(cls_anchor_dim, nC).to(self.device)
130 | 
131 |         t1 = time.time()
132 |         grid_x = torch.linspace(0, nW-1, nW).repeat(nB*nA, nH, 1).view(cls_anchor_dim).to(self.device)
133 |         grid_y = torch.linspace(0, nH-1, nH).repeat(nW,1).t().repeat(nB*nA, 1, 1).view(cls_anchor_dim).to(self.device)
134 |         anchor_w = anchors.index_select(1, ix[0]).repeat(nB, nH*nW).view(cls_anchor_dim)
135 |         anchor_h = anchors.index_select(1, ix[1]).repeat(nB, nH*nW).view(cls_anchor_dim)
136 | 
137 |         pred_boxes[0] = coord[0] + grid_x
138 |         pred_boxes[1] = coord[1] + grid_y
139 |         pred_boxes[2] = coord[2].exp() * anchor_w
140 |         pred_boxes[3] = coord[3].exp() * anchor_h
141 |         # for build_targets. it works faster on CPU than on GPU
142 |         pred_boxes = convert2cpu(pred_boxes.transpose(0,1).contiguous().view(-1,4)).detach()
143 | 
144 |         t2 = time.time()
145 |         nGT, nRecall, nRecall75, obj_mask, noobj_mask, coord_mask, tcoord, tconf, tcls = \
146 |             self.build_targets(pred_boxes, target.detach(), anchors.detach(), nA, nH, nW)
147 | 
148 |         conf_mask = (obj_mask + noobj_mask).view(cls_anchor_dim).to(self.device)
149 |         obj_mask = (obj_mask==1).view(cls_anchor_dim)
150 | 
151 |         nProposals = int((conf > 0.25).sum())
152 | 
153 |         coord = coord[:,obj_mask]
154 |         tcoord = tcoord.view(4, cls_anchor_dim)[:,obj_mask].to(self.device)
155 | 
156 |         tconf = tconf.view(cls_anchor_dim).to(self.device)
157 | 
158 |         cls = cls[obj_mask,:].to(self.device)
159 |         tcls = tcls.view(cls_anchor_dim, nC)[obj_mask,:].to(self.device)
160 | 
161 |         t3 = time.time()
162 |         loss_coord = nn.BCELoss(reduction='sum')(coord[0:2], tcoord[0:2])/nB + \
163 |             nn.MSELoss(reduction='sum')(coord[2:4], tcoord[2:4])/nB
164 |         loss_conf = nn.BCELoss(reduction='sum')(conf*conf_mask, tconf*conf_mask)/nB
165 |         loss_cls = nn.BCEWithLogitsLoss(reduction='sum')(cls, tcls)/nB
166 | 
167 |         loss = loss_coord + loss_conf + loss_cls
168 | 
169 |         t4 = time.time()
170 |         if False:
171 |             print('-'*30)
172 |             print(' activation : %f' % (t1 - t0))
173 |             print(' create pred_boxes : %f' % (t2 - t1))
174 |             print(' build targets : %f' % (t3 - t2))
175 |             print(' create loss : %f' % (t4 - t3))
176 |             print(' total : %f' % (t4 - t0))
177 |         print('%d: Layer(%03d) nGT %3d, nRC %3d, nRC75 %3d, nPP %3d, loss: box %6.3f, conf %6.3f, class %6.3f, total %7.3f'
178 |             % (self.seen, self.nth_layer, nGT, nRecall, nRecall75, nProposals, loss_coord, loss_conf, loss_cls, loss))
179 |         if math.isnan(loss.item()):
180 |             print(coord, conf, tconf)
181 |             sys.exit(0)
182 |         return loss
183 | 
--------------------------------------------------------------------------------
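A usage note for the evaluation scripts shown above: `valid.py` writes one `results/comp4_det_test_<class>.txt` detection file per class, and `scripts/my_eval.py` then reads those files and reports per-class AP and the mean AP. The two-step flow looks roughly as follows, assuming a VOC-style setup in which the data file's `valid` entry points to a test list such as `2007_test.txt` (as produced by `scripts/voc_label.py`) and a matching weights file is available; the weights and list file names here are only illustrative:

```
python valid.py cfg/voc.data cfg/yolo-voc.cfg yolo-voc.weights
python scripts/my_eval.py results/comp4_det_test_ 2007_test.txt data/voc.names
```

Note that `my_eval.py` caches the parsed ground-truth annotations in `output/annotations_cache/annots.pkl`, so that cache should be deleted whenever the test set or its annotations change.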