├── CMakeLists.txt ├── Darknet.cpp ├── Darknet.h ├── README.md ├── imgs ├── dog.jpg ├── eagle.jpg ├── giraffe.jpg ├── herd_of_horses.jpg ├── img1.jpg ├── img2.jpg ├── img3.jpg ├── img4.jpg ├── messi.jpg ├── person.jpg └── scream.jpg ├── main.cpp └── models ├── yolov3-tiny.cfg └── yolov3.cfg /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0 FATAL_ERROR) 2 | project(yolo-app) 3 | 4 | find_package( OpenCV REQUIRED ) 5 | include_directories( ${OpenCV_INCLUDE_DIRS} ) 6 | 7 | find_package(Torch REQUIRED) 8 | 9 | aux_source_directory(. DIR_SRCS) 10 | 11 | add_executable(yolo-app ${DIR_SRCS}) 12 | target_link_libraries(yolo-app "${TORCH_LIBRARIES}" "${OpenCV_LIBS}") 13 | set_property(TARGET yolo-app PROPERTY CXX_STANDARD 11) 14 | -------------------------------------------------------------------------------- /Darknet.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * 3 | * Author : walktree 4 | * Email : walktree@gmail.com 5 | * 6 | * A Libtorch implementation of the YOLO v3 object detection algorithm, written with pure C++. 7 | * It's fast, easy to be integrated to your production, and supports CPU and GPU computation. Enjoy ~ 8 | * 9 | *******************************************************************************/ 10 | #include "Darknet.h" 11 | #include 12 | #include 13 | #include 14 | 15 | // trim from start (in place) 16 | static inline void ltrim(std::string &s) { 17 | s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](int ch) { 18 | return !std::isspace(ch); 19 | })); 20 | } 21 | 22 | // trim from end (in place) 23 | static inline void rtrim(std::string &s) { 24 | s.erase(std::find_if(s.rbegin(), s.rend(), [](int ch) { 25 | return !std::isspace(ch); 26 | }).base(), s.end()); 27 | } 28 | 29 | // trim from both ends (in place) 30 | static inline void trim(std::string &s) { 31 | ltrim(s); 32 | rtrim(s); 33 | } 34 | 35 | static inline int split(const string& str, std::vector& ret_, string sep = ",") 36 | { 37 | if (str.empty()) 38 | { 39 | return 0; 40 | } 41 | 42 | string tmp; 43 | string::size_type pos_begin = str.find_first_not_of(sep); 44 | string::size_type comma_pos = 0; 45 | 46 | while (pos_begin != string::npos) 47 | { 48 | comma_pos = str.find(sep, pos_begin); 49 | if (comma_pos != string::npos) 50 | { 51 | tmp = str.substr(pos_begin, comma_pos - pos_begin); 52 | pos_begin = comma_pos + sep.length(); 53 | } 54 | else 55 | { 56 | tmp = str.substr(pos_begin); 57 | pos_begin = comma_pos; 58 | } 59 | 60 | if (!tmp.empty()) 61 | { 62 | trim(tmp); 63 | ret_.push_back(tmp); 64 | tmp.clear(); 65 | } 66 | } 67 | return 0; 68 | } 69 | 70 | static inline int split(const string& str, std::vector& ret_, string sep = ",") 71 | { 72 | std::vector tmp; 73 | auto rc = split(str, tmp, sep); 74 | 75 | for(int i = 0; i < tmp.size(); i++) 76 | { 77 | ret_.push_back(std::stoi(tmp[i])); 78 | } 79 | return rc; 80 | } 81 | 82 | // returns the IoU of two bounding boxes 83 | static inline torch::Tensor get_bbox_iou(torch::Tensor box1, torch::Tensor box2) 84 | { 85 | // Get the coordinates of bounding boxes 86 | torch::Tensor b1_x1, b1_y1, b1_x2, b1_y2; 87 | b1_x1 = box1.select(1, 0); 88 | b1_y1 = box1.select(1, 1); 89 | b1_x2 = box1.select(1, 2); 90 | b1_y2 = box1.select(1, 3); 91 | torch::Tensor b2_x1, b2_y1, b2_x2, b2_y2; 92 | b2_x1 = box2.select(1, 0); 93 | b2_y1 = box2.select(1, 1); 94 | b2_x2 
= box2.select(1, 2); 95 | b2_y2 = box2.select(1, 3); 96 | 97 | // et the corrdinates of the intersection rectangle 98 | torch::Tensor inter_rect_x1 = torch::max(b1_x1, b2_x1); 99 | torch::Tensor inter_rect_y1 = torch::max(b1_y1, b2_y1); 100 | torch::Tensor inter_rect_x2 = torch::min(b1_x2, b2_x2); 101 | torch::Tensor inter_rect_y2 = torch::min(b1_y2, b2_y2); 102 | 103 | // Intersection area 104 | torch::Tensor inter_area = torch::max(inter_rect_x2 - inter_rect_x1 + 1,torch::zeros(inter_rect_x2.sizes()))*torch::max(inter_rect_y2 - inter_rect_y1 + 1, torch::zeros(inter_rect_x2.sizes())); 105 | 106 | // Union Area 107 | torch::Tensor b1_area = (b1_x2 - b1_x1 + 1)*(b1_y2 - b1_y1 + 1); 108 | torch::Tensor b2_area = (b2_x2 - b2_x1 + 1)*(b2_y2 - b2_y1 + 1); 109 | 110 | torch::Tensor iou = inter_area / (b1_area + b2_area - inter_area); 111 | 112 | return iou; 113 | } 114 | 115 | 116 | int Darknet::get_int_from_cfg(map block, string key, int default_value) 117 | { 118 | if ( block.find(key) != block.end() ) 119 | { 120 | return std::stoi(block.at(key)); 121 | } 122 | return default_value; 123 | } 124 | 125 | string Darknet::get_string_from_cfg(map block, string key, string default_value) 126 | { 127 | if ( block.find(key) != block.end() ) 128 | { 129 | return block.at(key); 130 | } 131 | return default_value; 132 | } 133 | 134 | torch::nn::Conv2dOptions conv_options(int64_t in_planes, int64_t out_planes, int64_t kerner_size, 135 | int64_t stride, int64_t padding, int64_t groups, bool with_bias=false){ 136 | torch::nn::Conv2dOptions conv_options = torch::nn::Conv2dOptions(in_planes, out_planes, kerner_size); 137 | conv_options.stride(stride); 138 | conv_options.padding(padding); 139 | conv_options.groups(groups); 140 | conv_options.bias(with_bias); //@ihmc3jn09hk Fix for PyTorch 1.6 141 | return conv_options; 142 | } 143 | 144 | torch::nn::BatchNormOptions bn_options(int64_t features){ 145 | torch::nn::BatchNormOptions bn_options = torch::nn::BatchNormOptions(features); 146 | bn_options.affine(true); 147 | bn_options.track_running_stats(true); //@ihmc3jn09hk Fix for PyTorch 1.6 148 | return bn_options; 149 | } 150 | 151 | struct EmptyLayer : torch::nn::Module 152 | { 153 | EmptyLayer(){ 154 | 155 | } 156 | 157 | torch::Tensor forward(torch::Tensor x) { 158 | return x; 159 | } 160 | }; 161 | 162 | struct UpsampleLayer : torch::nn::Module 163 | { 164 | int _stride; 165 | UpsampleLayer(int stride){ 166 | _stride = stride; 167 | } 168 | 169 | torch::Tensor forward(torch::Tensor x) { 170 | 171 | torch::IntArrayRef sizes = x.sizes(); 172 | 173 | int64_t w, h; 174 | 175 | if (sizes.size() == 4) 176 | { 177 | w = sizes[2] * _stride; 178 | h = sizes[3] * _stride; 179 | 180 | x = torch::upsample_nearest2d(x, {w, h}); 181 | } 182 | else if (sizes.size() == 3) 183 | { 184 | w = sizes[2] * _stride; 185 | x = torch::upsample_nearest1d(x, {w}); 186 | } 187 | return x; 188 | } 189 | }; 190 | 191 | struct MaxPoolLayer2D : torch::nn::Module 192 | { 193 | int _kernel_size; 194 | int _stride; 195 | MaxPoolLayer2D(int kernel_size, int stride){ 196 | _kernel_size = kernel_size; 197 | _stride = stride; 198 | } 199 | 200 | torch::Tensor forward(torch::Tensor x) { 201 | if (_stride != 1) 202 | { 203 | x = torch::max_pool2d(x, {_kernel_size, _kernel_size}, {_stride, _stride}); 204 | } 205 | else 206 | { 207 | int pad = _kernel_size - 1; 208 | 209 | torch::Tensor padded_x = torch::replication_pad2d(x, {0, pad, 0, pad}); 210 | x = torch::max_pool2d(padded_x, {_kernel_size, _kernel_size}, {_stride, _stride}); 211 | } 212 | 213 
| return x; 214 | } 215 | }; 216 | 217 | struct DetectionLayer : torch::nn::Module 218 | { 219 | vector _anchors; 220 | 221 | DetectionLayer(vector anchors) 222 | { 223 | _anchors = anchors; 224 | } 225 | 226 | torch::Tensor forward(torch::Tensor prediction, int inp_dim, int num_classes, torch::Device device) 227 | { 228 | return predict_transform(prediction, inp_dim, _anchors, num_classes, device); 229 | } 230 | 231 | torch::Tensor predict_transform(torch::Tensor prediction, int inp_dim, vector anchors, int num_classes, torch::Device device) 232 | { 233 | int batch_size = prediction.size(0); 234 | int stride = floor(inp_dim / prediction.size(2)); 235 | int grid_size = floor(inp_dim / stride); 236 | int bbox_attrs = 5 + num_classes; 237 | int num_anchors = anchors.size()/2; 238 | 239 | for (size_t i = 0; i < anchors.size(); i++) 240 | { 241 | anchors[i] = anchors[i]/stride; 242 | } 243 | torch::Tensor result = prediction.view({batch_size, bbox_attrs * num_anchors, grid_size * grid_size}); 244 | result = result.transpose(1,2).contiguous(); 245 | result = result.view({batch_size, grid_size*grid_size*num_anchors, bbox_attrs}); 246 | 247 | result.select(2, 0).sigmoid_(); 248 | result.select(2, 1).sigmoid_(); 249 | result.select(2, 4).sigmoid_(); 250 | 251 | auto grid_len = torch::arange(grid_size); 252 | 253 | std::vector args = torch::meshgrid({grid_len, grid_len}); 254 | 255 | torch::Tensor x_offset = args[1].contiguous().view({-1, 1}); 256 | torch::Tensor y_offset = args[0].contiguous().view({-1, 1}); 257 | 258 | // std::cout << "x_offset:" << x_offset << endl; 259 | // std::cout << "y_offset:" << y_offset << endl; 260 | 261 | x_offset = x_offset.to(device); 262 | y_offset = y_offset.to(device); 263 | 264 | auto x_y_offset = torch::cat({x_offset, y_offset}, 1).repeat({1, num_anchors}).view({-1, 2}).unsqueeze(0); 265 | result.slice(2, 0, 2).add_(x_y_offset); 266 | 267 | torch::Tensor anchors_tensor = torch::from_blob(anchors.data(), {num_anchors, 2}); 268 | //if (device != nullptr) 269 | anchors_tensor = anchors_tensor.to(device); 270 | anchors_tensor = anchors_tensor.repeat({grid_size*grid_size, 1}).unsqueeze(0); 271 | 272 | result.slice(2, 2, 4).exp_().mul_(anchors_tensor); 273 | result.slice(2, 5, 5 + num_classes).sigmoid_(); 274 | result.slice(2, 0, 4).mul_(stride); 275 | 276 | return result; 277 | } 278 | }; 279 | 280 | 281 | //--------------------------------------------------------------------------- 282 | // Darknet 283 | //--------------------------------------------------------------------------- 284 | Darknet::Darknet(const char *cfg_file, torch::Device *device) { 285 | 286 | load_cfg(cfg_file); 287 | 288 | _device = device; 289 | 290 | create_modules(); 291 | } 292 | 293 | void Darknet::load_cfg(const char *cfg_file) 294 | { 295 | ifstream fs(cfg_file); 296 | string line; 297 | 298 | if(!fs) 299 | { 300 | std::cout << "Fail to load cfg file: " << cfg_file << endl; 301 | std::cout << strerror(errno) << endl; 302 | exit(-1); 303 | } 304 | 305 | while (getline (fs, line)) 306 | { 307 | trim(line); 308 | 309 | if (line.empty()) 310 | { 311 | continue; 312 | } 313 | 314 | if ( line.substr (0,1) == "[") 315 | { 316 | map block; 317 | 318 | string key = line.substr(1, line.length() -2); 319 | block["type"] = key; 320 | 321 | blocks.push_back(block); 322 | } 323 | else 324 | { 325 | map *block = &blocks[blocks.size() -1]; 326 | 327 | vector op_info; 328 | 329 | split(line, op_info, "="); 330 | 331 | if (op_info.size() == 2) 332 | { 333 | string p_key = op_info[0]; 334 | string p_value = 
op_info[1]; 335 | block->operator[](p_key) = p_value; 336 | } 337 | } 338 | } 339 | fs.close(); 340 | } 341 | 342 | void Darknet::create_modules() 343 | { 344 | int prev_filters = 3; 345 | 346 | std::vector output_filters; 347 | 348 | int index = 0; 349 | 350 | int filters = 0; 351 | 352 | for (size_t i = 0, len = blocks.size(); i < len; i++) 353 | { 354 | map block = blocks[i]; 355 | 356 | string layer_type = block["type"]; 357 | 358 | // std::cout << index << "--" << layer_type << endl; 359 | 360 | torch::nn::Sequential module; 361 | 362 | if (layer_type == "net") 363 | continue; 364 | 365 | if (layer_type == "convolutional") 366 | { 367 | string activation = get_string_from_cfg(block, "activation", ""); 368 | int batch_normalize = get_int_from_cfg(block, "batch_normalize", 0); 369 | filters = get_int_from_cfg(block, "filters", 0); 370 | int padding = get_int_from_cfg(block, "pad", 0); 371 | int kernel_size = get_int_from_cfg(block, "size", 0); 372 | int stride = get_int_from_cfg(block, "stride", 1); 373 | 374 | int pad = padding > 0? (kernel_size -1)/2: 0; 375 | bool with_bias = batch_normalize > 0? false : true; 376 | 377 | torch::nn::Conv2d conv = torch::nn::Conv2d(conv_options(prev_filters, filters, kernel_size, stride, pad, 1, with_bias)); 378 | module->push_back(conv); 379 | 380 | if (batch_normalize > 0) 381 | { 382 | torch::nn::BatchNorm2dImpl bn = torch::nn::BatchNorm2dImpl(bn_options(filters)); //@ihmc3jn09hk Fix for PyTorch 1.6 383 | module->push_back(bn); 384 | } 385 | 386 | if (activation == "leaky") 387 | { 388 | module->push_back(torch::nn::Functional(torch::leaky_relu, /*slope=*/0.1)); 389 | } 390 | } 391 | else if (layer_type == "upsample") 392 | { 393 | int stride = get_int_from_cfg(block, "stride", 1); 394 | 395 | UpsampleLayer uplayer(stride); 396 | module->push_back(uplayer); 397 | } 398 | else if (layer_type == "maxpool") 399 | { 400 | int stride = get_int_from_cfg(block, "stride", 1); 401 | int size = get_int_from_cfg(block, "size", 1); 402 | 403 | MaxPoolLayer2D poolLayer(size, stride); 404 | module->push_back(poolLayer); 405 | } 406 | else if (layer_type == "shortcut") 407 | { 408 | // skip connection 409 | int from = get_int_from_cfg(block, "from", 0); 410 | block["from"] = std::to_string(from); 411 | 412 | blocks[i] = block; 413 | 414 | // placeholder 415 | EmptyLayer layer; 416 | module->push_back(layer); 417 | } 418 | else if (layer_type == "route") 419 | { 420 | // L 85: -1, 61 421 | string layers_info = get_string_from_cfg(block, "layers", ""); 422 | 423 | std::vector layers; 424 | split(layers_info, layers, ","); 425 | 426 | std::string::size_type sz; 427 | signed int start = std::stoi(layers[0], &sz); 428 | signed int end = 0; 429 | 430 | if (layers.size() > 1) 431 | { 432 | end = std::stoi(layers[1], &sz); 433 | } 434 | 435 | if (start > 0) start = start - index; 436 | 437 | if (end > 0) end = end - index; 438 | 439 | block["start"] = std::to_string(start); 440 | block["end"] = std::to_string(end); 441 | 442 | blocks[i] = block; 443 | 444 | // placeholder 445 | EmptyLayer layer; 446 | module->push_back(layer); 447 | 448 | if (end < 0) 449 | { 450 | filters = output_filters[index + start] + output_filters[index + end]; 451 | } 452 | else 453 | { 454 | filters = output_filters[index + start]; 455 | } 456 | } 457 | else if (layer_type == "yolo") 458 | { 459 | string mask_info = get_string_from_cfg(block, "mask", ""); 460 | std::vector masks; 461 | split(mask_info, masks, ","); 462 | 463 | string anchor_info = get_string_from_cfg(block, "anchors", ""); 464 | 
std::vector anchors; 465 | split(anchor_info, anchors, ","); 466 | 467 | std::vector anchor_points; 468 | int pos; 469 | for (size_t i = 0; i< masks.size(); i++) 470 | { 471 | pos = masks[i]; 472 | anchor_points.push_back(anchors[pos * 2]); 473 | anchor_points.push_back(anchors[pos * 2+1]); 474 | } 475 | 476 | DetectionLayer layer(anchor_points); 477 | module->push_back(layer); 478 | } 479 | else 480 | { 481 | cout << "unsupported operator:" << layer_type << endl; 482 | } 483 | 484 | prev_filters = filters; 485 | output_filters.push_back(filters); 486 | module_list.push_back(module); 487 | 488 | char *module_key = new char[strlen("layer_") + sizeof(index) + 1]; 489 | 490 | sprintf(module_key, "%s%d", "layer_", index); 491 | 492 | register_module(module_key, module); 493 | 494 | index += 1; 495 | } 496 | } 497 | 498 | map* Darknet::get_net_info() 499 | { 500 | if (blocks.size() > 0) 501 | { 502 | return &blocks[0]; 503 | } 504 | return nullptr; 505 | } 506 | 507 | void Darknet::load_weights(const char *weight_file) 508 | { 509 | ifstream fs(weight_file, ios::binary); 510 | 511 | if (!fs) { 512 | std::cout << "Fail to load weight file: " << weight_file << endl; 513 | std::cout << strerror(errno) << endl; 514 | exit(-1); 515 | } 516 | 517 | // header info: 5 * int32_t 518 | int32_t header_size = sizeof(int32_t)*5; 519 | 520 | int64_t index_weight = 0; 521 | 522 | fs.seekg (0, fs.end); 523 | int64_t length = fs.tellg(); 524 | // skip header 525 | length = length - header_size; 526 | 527 | fs.seekg (header_size, fs.beg); 528 | float *weights_src = (float *)malloc(length); 529 | fs.read(reinterpret_cast(weights_src), length); 530 | 531 | fs.close(); 532 | 533 | /*at::TensorOptions options= torch::TensorOptions() 534 | .dtype(torch::kFloat32) 535 | .is_variable(true);*/ //@ihmc3jn09hk Remove unused code 536 | at::Tensor weights = torch::from_blob(weights_src, {length/4}); 537 | 538 | for (size_t i = 0; i < module_list.size(); i++) 539 | { 540 | map module_info = blocks[i + 1]; 541 | 542 | string module_type = module_info["type"]; 543 | 544 | // only conv layer need to load weight 545 | if (module_type != "convolutional") continue; 546 | 547 | torch::nn::Sequential seq_module = module_list[i]; 548 | 549 | auto conv_module = seq_module.ptr()->ptr(0); 550 | torch::nn::Conv2dImpl *conv_imp = dynamic_cast(conv_module.get()); 551 | 552 | int batch_normalize = get_int_from_cfg(module_info, "batch_normalize", 0); 553 | 554 | if (batch_normalize > 0) 555 | { 556 | // second module 557 | auto bn_module = seq_module.ptr()->ptr(1); 558 | 559 | torch::nn::BatchNorm2dImpl *bn_imp = dynamic_cast(bn_module.get()); //@ihmc3jn09hk Fix for PyTorch 1.6 560 | 561 | int num_bn_biases = bn_imp->bias.numel(); 562 | 563 | at::Tensor bn_bias = weights.slice(0, index_weight, index_weight + num_bn_biases); 564 | index_weight += num_bn_biases; 565 | 566 | at::Tensor bn_weights = weights.slice(0, index_weight, index_weight + num_bn_biases); 567 | index_weight += num_bn_biases; 568 | 569 | at::Tensor bn_running_mean = weights.slice(0, index_weight, index_weight + num_bn_biases); 570 | index_weight += num_bn_biases; 571 | 572 | at::Tensor bn_running_var = weights.slice(0, index_weight, index_weight + num_bn_biases); 573 | index_weight += num_bn_biases; 574 | 575 | bn_bias = bn_bias.view_as(bn_imp->bias); 576 | bn_weights = bn_weights.view_as(bn_imp->weight); 577 | bn_running_mean = bn_running_mean.view_as(bn_imp->running_mean); 578 | bn_running_var = bn_running_var.view_as(bn_imp->running_var); 579 | 580 | 
bn_imp->bias.set_data(bn_bias); 581 | bn_imp->weight.set_data(bn_weights); 582 | bn_imp->running_mean.set_data(bn_running_mean); 583 | bn_imp->running_var.set_data(bn_running_var); 584 | } 585 | else 586 | { 587 | int num_conv_biases = conv_imp->bias.numel(); 588 | 589 | at::Tensor conv_bias = weights.slice(0, index_weight, index_weight + num_conv_biases); 590 | index_weight += num_conv_biases; 591 | 592 | conv_bias = conv_bias.view_as(conv_imp->bias); 593 | conv_imp->bias.set_data(conv_bias); 594 | } 595 | 596 | int num_weights = conv_imp->weight.numel(); 597 | 598 | at::Tensor conv_weights = weights.slice(0, index_weight, index_weight + num_weights); 599 | index_weight += num_weights; 600 | 601 | conv_weights = conv_weights.view_as(conv_imp->weight); 602 | conv_imp->weight.set_data(conv_weights); 603 | } 604 | } 605 | 606 | torch::Tensor Darknet::forward(torch::Tensor x) 607 | { 608 | size_t module_count = module_list.size(); 609 | 610 | std::vector outputs(module_count); 611 | 612 | torch::Tensor result; 613 | int write = 0; 614 | 615 | for (size_t i = 0; i < module_count; i++) 616 | { 617 | map block = blocks[i+1]; 618 | 619 | string layer_type = block["type"]; 620 | 621 | if (layer_type == "net") 622 | continue; 623 | 624 | if (layer_type == "convolutional" || layer_type == "upsample" || layer_type == "maxpool") 625 | { 626 | torch::nn::SequentialImpl *seq_imp = dynamic_cast(module_list[i].ptr().get()); 627 | 628 | x = seq_imp->forward(x); 629 | outputs[i] = x; 630 | } 631 | else if (layer_type == "route") 632 | { 633 | int start = std::stoi(block["start"]); 634 | int end = std::stoi(block["end"]); 635 | 636 | if (start > 0) start = start - i; 637 | 638 | if (end == 0) 639 | { 640 | x = outputs[i + start]; 641 | } 642 | else 643 | { 644 | if (end > 0) end = end - i; 645 | 646 | torch::Tensor map_1 = outputs[i + start]; 647 | torch::Tensor map_2 = outputs[i + end]; 648 | 649 | x = torch::cat({map_1, map_2}, 1); 650 | } 651 | 652 | outputs[i] = x; 653 | } 654 | else if (layer_type == "shortcut") 655 | { 656 | int from = std::stoi(block["from"]); 657 | x = outputs[i-1] + outputs[i+from]; 658 | outputs[i] = x; 659 | } 660 | else if (layer_type == "yolo") 661 | { 662 | torch::nn::SequentialImpl *seq_imp = dynamic_cast(module_list[i].ptr().get()); 663 | 664 | map net_info = blocks[0]; 665 | int inp_dim = get_int_from_cfg(net_info, "height", 0); 666 | int num_classes = get_int_from_cfg(block, "classes", 0); 667 | 668 | x = seq_imp->forward(x, inp_dim, num_classes, *_device); 669 | 670 | if (write == 0) 671 | { 672 | result = x; 673 | write = 1; 674 | } 675 | else 676 | { 677 | result = torch::cat({result,x}, 1); 678 | } 679 | 680 | outputs[i] = outputs[i-1]; 681 | } 682 | } 683 | return result; 684 | } 685 | 686 | torch::Tensor Darknet::write_results(torch::Tensor prediction, int num_classes, float confidence, float nms_conf) 687 | { 688 | // get result which object confidence > threshold 689 | auto conf_mask = (prediction.select(2,4) > confidence).to(torch::kFloat32).unsqueeze(2); 690 | 691 | prediction.mul_(conf_mask); 692 | auto ind_nz = torch::nonzero(prediction.select(2, 4)).transpose(0, 1).contiguous(); 693 | 694 | if (ind_nz.size(0) == 0) 695 | { 696 | return torch::zeros({0}); 697 | } 698 | 699 | torch::Tensor box_a = torch::ones(prediction.sizes(), prediction.options()); 700 | // top left x = centerX - w/2 701 | box_a.select(2, 0) = prediction.select(2, 0) - prediction.select(2, 2).div(2); 702 | box_a.select(2, 1) = prediction.select(2, 1) - prediction.select(2, 3).div(2); 703 | 
box_a.select(2, 2) = prediction.select(2, 0) + prediction.select(2, 2).div(2); 704 | box_a.select(2, 3) = prediction.select(2, 1) + prediction.select(2, 3).div(2); 705 | 706 | prediction.slice(2, 0, 4) = box_a.slice(2, 0, 4); 707 | 708 | int batch_size = prediction.size(0); 709 | int item_attr_size = 5; 710 | 711 | torch::Tensor output = torch::ones({1, prediction.size(2) + 1}); 712 | bool write = false; 713 | 714 | int num = 0; 715 | 716 | for (int i = 0; i < batch_size; i++) 717 | { 718 | auto image_prediction = prediction[i]; 719 | 720 | // get the max classes score at each result 721 | std::tuple max_classes = torch::max(image_prediction.slice(1, item_attr_size, item_attr_size + num_classes), 1); 722 | 723 | // class score 724 | auto max_conf = std::get<0>(max_classes); 725 | // index 726 | auto max_conf_score = std::get<1>(max_classes); 727 | max_conf = max_conf.to(torch::kFloat32).unsqueeze(1); 728 | max_conf_score = max_conf_score.to(torch::kFloat32).unsqueeze(1); 729 | 730 | // shape: n * 7, left x, left y, right x, right y, object confidence, class_score, class_id 731 | image_prediction = torch::cat({image_prediction.slice(1, 0, 5), max_conf, max_conf_score}, 1); 732 | 733 | // remove item which object confidence == 0 734 | auto non_zero_index = torch::nonzero(image_prediction.select(1,4)); 735 | auto image_prediction_data = image_prediction.index_select(0, non_zero_index.squeeze()).view({-1, 7}); 736 | 737 | // get unique classes 738 | std::vector img_classes; 739 | 740 | for (int m = 0, len = image_prediction_data.size(0); m < len; m++) 741 | { 742 | bool found = false; 743 | for (size_t n = 0; n < img_classes.size(); n++) 744 | { 745 | auto ret = (image_prediction_data[m][6] == img_classes[n]); 746 | if (torch::nonzero(ret).size(0) > 0) 747 | { 748 | found = true; 749 | break; 750 | } 751 | } 752 | if (!found) img_classes.push_back(image_prediction_data[m][6]); 753 | } 754 | 755 | for (size_t k = 0; k < img_classes.size(); k++) 756 | { 757 | auto cls = img_classes[k]; 758 | 759 | auto cls_mask = image_prediction_data * (image_prediction_data.select(1, 6) == cls).to(torch::kFloat32).unsqueeze(1); 760 | auto class_mask_index = torch::nonzero(cls_mask.select(1, 5)).squeeze(); 761 | 762 | auto image_pred_class = image_prediction_data.index_select(0, class_mask_index).view({-1,7}); 763 | // ascend by confidence 764 | // seems that inverse method not work 765 | std::tuple sort_ret = torch::sort(image_pred_class.select(1,4)); 766 | 767 | auto conf_sort_index = std::get<1>(sort_ret); 768 | 769 | // seems that there is something wrong with inverse method 770 | // conf_sort_index = conf_sort_index.inverse(); 771 | 772 | image_pred_class = image_pred_class.index_select(0, conf_sort_index.squeeze()).cpu(); 773 | 774 | for(int w = 0; w < image_pred_class.size(0)-1; w++) 775 | { 776 | int mi = image_pred_class.size(0) - 1 - w; 777 | 778 | if (mi <= 0) 779 | { 780 | break; 781 | } 782 | 783 | auto ious = get_bbox_iou(image_pred_class[mi].unsqueeze(0), image_pred_class.slice(0, 0, mi)); 784 | 785 | auto iou_mask = (ious < nms_conf).to(torch::kFloat32).unsqueeze(1); 786 | image_pred_class.slice(0, 0, mi) = image_pred_class.slice(0, 0, mi) * iou_mask; 787 | 788 | // remove from list 789 | auto non_zero_index = torch::nonzero(image_pred_class.select(1,4)).squeeze(); 790 | image_pred_class = image_pred_class.index_select(0, non_zero_index).view({-1,7}); 791 | } 792 | 793 | torch::Tensor batch_index = torch::ones({image_pred_class.size(0), 1}).fill_(i); 794 | 795 | if (!write) 796 | { 797 | output 
= torch::cat({batch_index, image_pred_class}, 1); 798 | write = true; 799 | } 800 | else 801 | { 802 | auto out = torch::cat({batch_index, image_pred_class}, 1); 803 | output = torch::cat({output,out}, 0); 804 | } 805 | 806 | num += 1; 807 | } 808 | } 809 | 810 | if (num == 0) 811 | { 812 | return torch::zeros({0}); 813 | } 814 | 815 | return output; 816 | } 817 | -------------------------------------------------------------------------------- /Darknet.h: -------------------------------------------------------------------------------- 1 | /******************************************************************************* 2 | * 3 | * Author : walktree 4 | * Email : walktree@gmail.com 5 | * 6 | * A Libtorch implementation of the YOLO v3 object detection algorithm, written in pure C++. 7 | * It's fast, easy to integrate into your production code, and supports both CPU and GPU computation. Enjoy ~ 8 | * 9 | *******************************************************************************/ 10 | 11 | #include <torch/torch.h> 12 | #include <string> 13 | #include <vector> 14 | #include <map> 15 | 16 | using namespace std; 17 | 18 | struct Darknet : torch::nn::Module { 19 | 20 | public: 21 | 22 | Darknet(const char *conf_file, torch::Device *device); 23 | 24 | map<string, string>* get_net_info(); 25 | 26 | void load_weights(const char *weight_file); 27 | 28 | torch::Tensor forward(torch::Tensor x); 29 | 30 | /** 31 | * Filter the predictions by confidence threshold and NMS. 32 | */ 33 | torch::Tensor write_results(torch::Tensor prediction, int num_classes, float confidence, float nms_conf = 0.4); 34 | 35 | private: 36 | 37 | torch::Device *_device; 38 | 39 | vector<map<string, string>> blocks; 40 | 41 | torch::nn::Sequential features; 42 | 43 | vector<torch::nn::Sequential> module_list; 44 | 45 | // load the YOLO v3 config 46 | void load_cfg(const char *cfg_file); 47 | 48 | void create_modules(); 49 | 50 | int get_int_from_cfg(map<string, string> block, string key, int default_value); 51 | 52 | string get_string_from_cfg(map<string, string> block, string key, string default_value); 53 | }; -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # libtorch-yolov3 2 | A Libtorch implementation of the YOLO v3 object detection algorithm, written in pure C++. It's fast, easy to integrate into your production code, and both CPU and GPU are supported. Enjoy ~ 3 | 4 | This project is inspired by the [pytorch version](https://github.com/ayooshkathuria/pytorch-yolo-v3); I rewrote it in C++. 5 | 6 | ## Requirements 7 | 1. LibTorch v1.0.0 8 | 2. CUDA 9 | 3. OpenCV (only used in the example) 10 | 11 | 12 | ## To compile 13 | 1. cmake3 14 | 2. gcc 5.4+ 15 | 16 | 17 | 18 | ``` 19 | mkdir build && cd build 20 | cmake3 -DCMAKE_PREFIX_PATH="your libtorch path" .. 21 | 22 | # if there are multiple versions of gcc, tell cmake which one you want to use, e.g.: 23 | cmake3 -DCMAKE_PREFIX_PATH="your libtorch path" -DCMAKE_C_COMPILER=/usr/local/bin/gcc -DCMAKE_CXX_COMPILER=/usr/local/bin/g++ .. 24 | ``` 25 | 26 | 27 | ## Running the detector 28 | 29 | The first thing you need to do is get the weights file for YOLO v3: 30 | 31 | ``` 32 | cd models 33 | wget https://pjreddie.com/media/files/yolov3.weights 34 | ``` 35 | 36 | On a single image: 37 | ``` 38 | ./yolo-app ../imgs/person.jpg 39 | ``` 40 | 41 | In my tests, inference takes about 25 ms on a GPU (1080 Ti). The first run is slower because of warm-up, so please run the inference job more than once and calculate the average cost (see the sketch below). 
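A minimal sketch of such a measurement is shown here. It is not part of this repository: `run_benchmark`, the iteration count, and the prepared `net`/`img_tensor` are illustrative assumptions; only `Darknet::forward` and `Darknet::write_results` (called with the same thresholds as `main.cpp`) come from the actual API.

```
// Hypothetical helper, not part of this repo: averages inference time over several runs
// after one warm-up pass. Assumes `net` and `img_tensor` were prepared as in main.cpp,
// and that "Darknet.h", <chrono> and <iostream> are included.
void run_benchmark(Darknet &net, torch::Tensor img_tensor, int iters = 10)
{
    torch::NoGradGuard no_grad;

    net.forward(img_tensor);   // warm-up: the first pass is much slower

    auto start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < iters; i++)
    {
        auto output = net.forward(img_tensor);
        net.write_results(output, 80, 0.6, 0.4);   // same class count / thresholds as main.cpp
    }
    auto end = std::chrono::high_resolution_clock::now();

    auto total_ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
    std::cout << "average inference time: " << total_ms / iters << " ms" << std::endl;
}
```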
42 | -------------------------------------------------------------------------------- /imgs/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walktree/libtorch-yolov3/bfae598fe31fc1063ac08d32413560c9ce703cbe/imgs/dog.jpg -------------------------------------------------------------------------------- /imgs/eagle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walktree/libtorch-yolov3/bfae598fe31fc1063ac08d32413560c9ce703cbe/imgs/eagle.jpg -------------------------------------------------------------------------------- /imgs/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walktree/libtorch-yolov3/bfae598fe31fc1063ac08d32413560c9ce703cbe/imgs/giraffe.jpg -------------------------------------------------------------------------------- /imgs/herd_of_horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walktree/libtorch-yolov3/bfae598fe31fc1063ac08d32413560c9ce703cbe/imgs/herd_of_horses.jpg -------------------------------------------------------------------------------- /imgs/img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walktree/libtorch-yolov3/bfae598fe31fc1063ac08d32413560c9ce703cbe/imgs/img1.jpg -------------------------------------------------------------------------------- /imgs/img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walktree/libtorch-yolov3/bfae598fe31fc1063ac08d32413560c9ce703cbe/imgs/img2.jpg -------------------------------------------------------------------------------- /imgs/img3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walktree/libtorch-yolov3/bfae598fe31fc1063ac08d32413560c9ce703cbe/imgs/img3.jpg -------------------------------------------------------------------------------- /imgs/img4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walktree/libtorch-yolov3/bfae598fe31fc1063ac08d32413560c9ce703cbe/imgs/img4.jpg -------------------------------------------------------------------------------- /imgs/messi.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walktree/libtorch-yolov3/bfae598fe31fc1063ac08d32413560c9ce703cbe/imgs/messi.jpg -------------------------------------------------------------------------------- /imgs/person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walktree/libtorch-yolov3/bfae598fe31fc1063ac08d32413560c9ce703cbe/imgs/person.jpg -------------------------------------------------------------------------------- /imgs/scream.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/walktree/libtorch-yolov3/bfae598fe31fc1063ac08d32413560c9ce703cbe/imgs/scream.jpg -------------------------------------------------------------------------------- /main.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "Darknet.h" 9 | 10 | using 
namespace std; 11 | using namespace std::chrono; 12 | 13 | int main(int argc, const char* argv[]) 14 | { 15 | if (argc != 2) { 16 | std::cerr << "usage: yolo-app <image path>\n"; 17 | return -1; 18 | } 19 | 20 | torch::DeviceType device_type; 21 | 22 | if (torch::cuda::is_available() ) { 23 | device_type = torch::kCUDA; 24 | } else { 25 | device_type = torch::kCPU; 26 | } 27 | torch::Device device(device_type); 28 | 29 | // input image size for YOLO v3 30 | int input_image_size = 416; 31 | 32 | Darknet net("../models/yolov3.cfg", &device); 33 | 34 | map<string, string> *info = net.get_net_info(); 35 | 36 | info->operator[]("height") = std::to_string(input_image_size); 37 | 38 | std::cout << "loading weight ..." << endl; 39 | net.load_weights("../models/yolov3.weights"); 40 | std::cout << "weight loaded ..." << endl; 41 | 42 | net.to(device); 43 | 44 | torch::NoGradGuard no_grad; 45 | net.eval(); 46 | 47 | std::cout << "start to inference ..." << endl; 48 | 49 | cv::Mat origin_image, resized_image; 50 | 51 | // origin_image = cv::imread("../139.jpg"); 52 | origin_image = cv::imread(argv[1]); 53 | 54 | cv::cvtColor(origin_image, resized_image, cv::COLOR_BGR2RGB); 55 | cv::resize(resized_image, resized_image, cv::Size(input_image_size, input_image_size)); 56 | 57 | cv::Mat img_float; 58 | resized_image.convertTo(img_float, CV_32F, 1.0/255); 59 | 60 | auto img_tensor = torch::from_blob(img_float.data, {1, input_image_size, input_image_size, 3}).to(device); 61 | img_tensor = img_tensor.permute({0,3,1,2}); 62 | 63 | auto start = std::chrono::high_resolution_clock::now(); 64 | 65 | auto output = net.forward(img_tensor); 66 | 67 | // filter result by NMS 68 | // class_num = 80 69 | // confidence = 0.6 70 | auto result = net.write_results(output, 80, 0.6, 0.4); 71 | 72 | auto end = std::chrono::high_resolution_clock::now(); 73 | 74 | auto duration = duration_cast<milliseconds>(end - start); 75 | 76 | // note: the first inference takes noticeably longer (warm-up) 77 | std::cout << "inference taken : " << duration.count() << " ms" << endl; 78 | 79 | if (result.dim() == 1) 80 | { 81 | std::cout << "no object found" << endl; 82 | } 83 | else 84 | { 85 | int obj_num = result.size(0); 86 | 87 | std::cout << obj_num << " objects found" << endl; 88 | 89 | float w_scale = float(origin_image.cols) / input_image_size; 90 | float h_scale = float(origin_image.rows) / input_image_size; 91 | 92 | result.select(1,1).mul_(w_scale); 93 | result.select(1,2).mul_(h_scale); 94 | result.select(1,3).mul_(w_scale); 95 | result.select(1,4).mul_(h_scale); 96 | 97 | auto result_data = result.accessor<float, 2>(); 98 | 99 | for (int i = 0; i < result.size(0) ; i++) 100 | { 101 | cv::rectangle(origin_image, cv::Point(result_data[i][1], result_data[i][2]), cv::Point(result_data[i][3], result_data[i][4]), cv::Scalar(0, 0, 255), 1, 1, 0); 102 | } 103 | 104 | cv::imwrite("out-det.jpg", origin_image); 105 | } 106 | 107 | std::cout << "Done" << endl; 108 | 109 | return 0; 110 | } 111 | -------------------------------------------------------------------------------- /models/yolov3-tiny.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=2 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | 
batch_normalize=1 27 | filters=16 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | [maxpool] 34 | size=2 35 | stride=2 36 | 37 | [convolutional] 38 | batch_normalize=1 39 | filters=32 40 | size=3 41 | stride=1 42 | pad=1 43 | activation=leaky 44 | 45 | [maxpool] 46 | size=2 47 | stride=2 48 | 49 | [convolutional] 50 | batch_normalize=1 51 | filters=64 52 | size=3 53 | stride=1 54 | pad=1 55 | activation=leaky 56 | 57 | [maxpool] 58 | size=2 59 | stride=2 60 | 61 | [convolutional] 62 | batch_normalize=1 63 | filters=128 64 | size=3 65 | stride=1 66 | pad=1 67 | activation=leaky 68 | 69 | [maxpool] 70 | size=2 71 | stride=2 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=256 76 | size=3 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [maxpool] 82 | size=2 83 | stride=2 84 | 85 | [convolutional] 86 | batch_normalize=1 87 | filters=512 88 | size=3 89 | stride=1 90 | pad=1 91 | activation=leaky 92 | 93 | [maxpool] 94 | size=2 95 | stride=1 96 | 97 | [convolutional] 98 | batch_normalize=1 99 | filters=1024 100 | size=3 101 | stride=1 102 | pad=1 103 | activation=leaky 104 | 105 | ########### 106 | 107 | [convolutional] 108 | batch_normalize=1 109 | filters=256 110 | size=1 111 | stride=1 112 | pad=1 113 | activation=leaky 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=512 118 | size=3 119 | stride=1 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | size=1 125 | stride=1 126 | pad=1 127 | filters=255 128 | activation=linear 129 | 130 | 131 | 132 | [yolo] 133 | mask = 3,4,5 134 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 135 | classes=80 136 | num=6 137 | jitter=.3 138 | ignore_thresh = .7 139 | truth_thresh = 1 140 | random=1 141 | 142 | [route] 143 | layers = -4 144 | 145 | [convolutional] 146 | batch_normalize=1 147 | filters=128 148 | size=1 149 | stride=1 150 | pad=1 151 | activation=leaky 152 | 153 | [upsample] 154 | stride=2 155 | 156 | [route] 157 | layers = -1, 8 158 | 159 | [convolutional] 160 | batch_normalize=1 161 | filters=256 162 | size=3 163 | stride=1 164 | pad=1 165 | activation=leaky 166 | 167 | [convolutional] 168 | size=1 169 | stride=1 170 | pad=1 171 | filters=255 172 | activation=linear 173 | 174 | [yolo] 175 | mask = 1,2,3 176 | anchors = 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 177 | classes=80 178 | num=6 179 | jitter=.3 180 | ignore_thresh = .7 181 | truth_thresh = 1 182 | random=1 183 | -------------------------------------------------------------------------------- /models/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width= 320 9 | height = 320 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | 
[shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | 
[shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | 
size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .5 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .5 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 
715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .5 787 | truth_thresh = 1 788 | random=1 789 | 790 | --------------------------------------------------------------------------------