├── .gitignore ├── CHANGELOG.rst ├── CMakeLists.txt ├── README.md ├── launch └── dnn_detect.launch ├── model ├── MobileNetSSD_deploy.caffemodel └── MobileNetSSD_deploy.prototxt.txt ├── msg ├── DetectedObject.msg └── DetectedObjectArray.msg ├── package.xml ├── src └── dnn_detect.cpp ├── srv └── Detect.srv └── test ├── dnn_images.test ├── dnn_images_test.cpp └── test_images └── cat.jpg /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.lo 7 | *.o 8 | *.obj 9 | 10 | # Precompiled Headers 11 | *.gch 12 | *.pch 13 | 14 | # Compiled Dynamic libraries 15 | *.so 16 | *.dylib 17 | *.dll 18 | 19 | # Fortran module files 20 | *.mod 21 | *.smod 22 | 23 | # Compiled Static libraries 24 | *.lai 25 | *.la 26 | *.a 27 | *.lib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2 | Changelog for package dnn_detect 3 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 4 | 5 | 0.1.0 (2020-09-21) 6 | ------------------ 7 | * dnn_images_test.cpp - support opencv version 4 8 | * Noetic support 9 | * Contributors: Jim Vaughan, Rohan Agrawal, Tim 10 | 11 | 0.0.3 (2018-02-16) 12 | ------------------ 13 | * Add optional rotation of image 14 | * Added one shot mode, which requires a service call to trigger detection. 15 | * Update README.md 16 | * Contributors: Jim Vaughan 17 | 18 | 0.0.2 (2017-12-03) 19 | ------------------ 20 | * Initial commit 21 | * Contributors: Jim Vaughan 22 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | cmake_minimum_required(VERSION 2.8.3) 3 | project(dnn_detect) 4 | 5 | find_package(catkin REQUIRED COMPONENTS 6 | roscpp 7 | tf2_geometry_msgs 8 | tf2_ros 9 | tf2 10 | visualization_msgs 11 | image_transport 12 | cv_bridge 13 | std_msgs 14 | ) 15 | 16 | find_package(OpenCV REQUIRED) 17 | 18 | 19 | add_message_files( 20 | FILES 21 | DetectedObject.msg 22 | DetectedObjectArray.msg 23 | ) 24 | 25 | 26 | add_service_files( 27 | FILES 28 | Detect.srv 29 | ) 30 | 31 | generate_messages( 32 | DEPENDENCIES 33 | std_msgs 34 | ) 35 | catkin_package(INCLUDE_DIRS DEPENDS OpenCV) 36 | 37 | ########### 38 | ## Build ## 39 | ########### 40 | 41 | 42 | add_definitions(-std=c++11) 43 | 44 | include_directories(${catkin_INCLUDE_DIRS}) 45 | include_directories(${OpenCV_INCLUDE_DIRS}) 46 | 47 | add_executable(dnn_detect src/dnn_detect.cpp) 48 | 49 | add_dependencies(dnn_detect ${${PROJECT_NAME}_EXPORTED_TARGETS} 50 | ${catkin_EXPORTED_TARGETS}) 51 | 52 | target_link_libraries(dnn_detect ${catkin_LIBRARIES} ${OpenCV_LIBS}) 53 | 54 | ############# 55 | ## Install ## 56 | ############# 57 | 58 | ## Mark executables and/or libraries for installation 59 | install(TARGETS dnn_detect 60 | ARCHIVE DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} 61 | LIBRARY DESTINATION ${CATKIN_PACKAGE_LIB_DESTINATION} 62 | RUNTIME DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION} 63 | ) 64 | 65 | install(DIRECTORY launch/ 66 | DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/launch 67 | ) 68 | 69 | install(DIRECTORY model/ 70 | DESTINATION ${CATKIN_PACKAGE_SHARE_DESTINATION}/model 71 | ) 72 | 73 | ########### 74 | ## Tests ## 75 | ########### 76 | 77 | if(CATKIN_ENABLE_TESTING) 78 | 
find_package(rostest REQUIRED) 79 | 80 | # Tests need c++11 81 | add_definitions(-std=c++11) 82 | 83 | add_rostest_gtest(dnn_images_test 84 | test/dnn_images.test 85 | test/dnn_images_test.cpp) 86 | add_dependencies(dnn_images_test ${PROJECT_NAME}_generate_messages) 87 | target_link_libraries(dnn_images_test ${catkin_LIBRARIES} ${OpenCV_LIBS}) 88 | endif() 89 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # dnn_detect 3 | 4 | This package provides object detection using OpenCV's Deep Neural Network module. 5 | 6 | Documentation is at [http://wiki.ros.org/dnn_detect](http://wiki.ros.org/dnn_detect). 7 | 8 | The model used by default is from [chuanqi305's MobileNet-SSD](https://github.com/chuanqi305/MobileNet-SSD). 9 | -------------------------------------------------------------------------------- /launch/dnn_detect.launch: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 10 | 11 | 12 | 13 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /model/MobileNetSSD_deploy.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UbiquityRobotics/dnn_detect/c23161c9c1c2a2bd15618b6b3450522ac8aad2cb/model/MobileNetSSD_deploy.caffemodel -------------------------------------------------------------------------------- /model/MobileNetSSD_deploy.prototxt.txt: -------------------------------------------------------------------------------- 1 | name: "MobileNet-SSD" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 300 7 | dim: 300 8 | } 9 | layer { 10 | name: "conv0" 11 | type: "Convolution" 12 | bottom: "data" 13 | top: "conv0" 14 | param { 15 | lr_mult: 1.0 16 | decay_mult: 1.0 17 | } 18 | param { 19 | lr_mult: 2.0 20 | decay_mult: 0.0 21 | } 22 | convolution_param { 23 | num_output: 32 24 | pad: 1 25 | kernel_size: 3 26 | stride: 2 27 | weight_filler { 28 | type: "msra" 29 | } 30 | bias_filler { 31 | type: "constant" 32 | value: 0.0 33 | } 34 | } 35 | } 36 | layer { 37 | name: "conv0/relu" 38 | type: "ReLU" 39 | bottom: "conv0" 40 | top: "conv0" 41 | } 42 | layer { 43 | name: "conv1/dw" 44 | type: "Convolution" 45 | bottom: "conv0" 46 | top: "conv1/dw" 47 | param { 48 | lr_mult: 1.0 49 | decay_mult: 1.0 50 | } 51 | param { 52 | lr_mult: 2.0 53 | decay_mult: 0.0 54 | } 55 | convolution_param { 56 | num_output: 32 57 | pad: 1 58 | kernel_size: 3 59 | group: 32 60 | engine: CAFFE 61 | weight_filler { 62 | type: "msra" 63 | } 64 | bias_filler { 65 | type: "constant" 66 | value: 0.0 67 | } 68 | } 69 | } 70 | layer { 71 | name: "conv1/dw/relu" 72 | type: "ReLU" 73 | bottom: "conv1/dw" 74 | top: "conv1/dw" 75 | } 76 | layer { 77 | name: "conv1" 78 | type: "Convolution" 79 | bottom: "conv1/dw" 80 | top: "conv1" 81 | param { 82 | lr_mult: 1.0 83 | decay_mult: 1.0 84 | } 85 | param { 86 | lr_mult: 2.0 87 | decay_mult: 0.0 88 | } 89 | convolution_param { 90 | num_output: 64 91 | kernel_size: 1 92 | weight_filler { 93 | type: "msra" 94 | } 95 | bias_filler { 96 | type: "constant" 97 | value: 0.0 98 | } 99 | } 100 | } 101 | layer { 102 | name: "conv1/relu" 103 | type: "ReLU" 104 | bottom: "conv1" 105 | top: "conv1" 106 | } 107 | layer { 108 | name: "conv2/dw" 109 | type: "Convolution" 110 | bottom: "conv1" 111 | top: "conv2/dw" 112 | param { 113 | lr_mult: 1.0 114 | decay_mult: 1.0 
115 | } 116 | param { 117 | lr_mult: 2.0 118 | decay_mult: 0.0 119 | } 120 | convolution_param { 121 | num_output: 64 122 | pad: 1 123 | kernel_size: 3 124 | stride: 2 125 | group: 64 126 | engine: CAFFE 127 | weight_filler { 128 | type: "msra" 129 | } 130 | bias_filler { 131 | type: "constant" 132 | value: 0.0 133 | } 134 | } 135 | } 136 | layer { 137 | name: "conv2/dw/relu" 138 | type: "ReLU" 139 | bottom: "conv2/dw" 140 | top: "conv2/dw" 141 | } 142 | layer { 143 | name: "conv2" 144 | type: "Convolution" 145 | bottom: "conv2/dw" 146 | top: "conv2" 147 | param { 148 | lr_mult: 1.0 149 | decay_mult: 1.0 150 | } 151 | param { 152 | lr_mult: 2.0 153 | decay_mult: 0.0 154 | } 155 | convolution_param { 156 | num_output: 128 157 | kernel_size: 1 158 | weight_filler { 159 | type: "msra" 160 | } 161 | bias_filler { 162 | type: "constant" 163 | value: 0.0 164 | } 165 | } 166 | } 167 | layer { 168 | name: "conv2/relu" 169 | type: "ReLU" 170 | bottom: "conv2" 171 | top: "conv2" 172 | } 173 | layer { 174 | name: "conv3/dw" 175 | type: "Convolution" 176 | bottom: "conv2" 177 | top: "conv3/dw" 178 | param { 179 | lr_mult: 1.0 180 | decay_mult: 1.0 181 | } 182 | param { 183 | lr_mult: 2.0 184 | decay_mult: 0.0 185 | } 186 | convolution_param { 187 | num_output: 128 188 | pad: 1 189 | kernel_size: 3 190 | group: 128 191 | engine: CAFFE 192 | weight_filler { 193 | type: "msra" 194 | } 195 | bias_filler { 196 | type: "constant" 197 | value: 0.0 198 | } 199 | } 200 | } 201 | layer { 202 | name: "conv3/dw/relu" 203 | type: "ReLU" 204 | bottom: "conv3/dw" 205 | top: "conv3/dw" 206 | } 207 | layer { 208 | name: "conv3" 209 | type: "Convolution" 210 | bottom: "conv3/dw" 211 | top: "conv3" 212 | param { 213 | lr_mult: 1.0 214 | decay_mult: 1.0 215 | } 216 | param { 217 | lr_mult: 2.0 218 | decay_mult: 0.0 219 | } 220 | convolution_param { 221 | num_output: 128 222 | kernel_size: 1 223 | weight_filler { 224 | type: "msra" 225 | } 226 | bias_filler { 227 | type: "constant" 228 | value: 0.0 229 | } 230 | } 231 | } 232 | layer { 233 | name: "conv3/relu" 234 | type: "ReLU" 235 | bottom: "conv3" 236 | top: "conv3" 237 | } 238 | layer { 239 | name: "conv4/dw" 240 | type: "Convolution" 241 | bottom: "conv3" 242 | top: "conv4/dw" 243 | param { 244 | lr_mult: 1.0 245 | decay_mult: 1.0 246 | } 247 | param { 248 | lr_mult: 2.0 249 | decay_mult: 0.0 250 | } 251 | convolution_param { 252 | num_output: 128 253 | pad: 1 254 | kernel_size: 3 255 | stride: 2 256 | group: 128 257 | engine: CAFFE 258 | weight_filler { 259 | type: "msra" 260 | } 261 | bias_filler { 262 | type: "constant" 263 | value: 0.0 264 | } 265 | } 266 | } 267 | layer { 268 | name: "conv4/dw/relu" 269 | type: "ReLU" 270 | bottom: "conv4/dw" 271 | top: "conv4/dw" 272 | } 273 | layer { 274 | name: "conv4" 275 | type: "Convolution" 276 | bottom: "conv4/dw" 277 | top: "conv4" 278 | param { 279 | lr_mult: 1.0 280 | decay_mult: 1.0 281 | } 282 | param { 283 | lr_mult: 2.0 284 | decay_mult: 0.0 285 | } 286 | convolution_param { 287 | num_output: 256 288 | kernel_size: 1 289 | weight_filler { 290 | type: "msra" 291 | } 292 | bias_filler { 293 | type: "constant" 294 | value: 0.0 295 | } 296 | } 297 | } 298 | layer { 299 | name: "conv4/relu" 300 | type: "ReLU" 301 | bottom: "conv4" 302 | top: "conv4" 303 | } 304 | layer { 305 | name: "conv5/dw" 306 | type: "Convolution" 307 | bottom: "conv4" 308 | top: "conv5/dw" 309 | param { 310 | lr_mult: 1.0 311 | decay_mult: 1.0 312 | } 313 | param { 314 | lr_mult: 2.0 315 | decay_mult: 0.0 316 | } 317 | convolution_param { 318 | 
num_output: 256 319 | pad: 1 320 | kernel_size: 3 321 | group: 256 322 | engine: CAFFE 323 | weight_filler { 324 | type: "msra" 325 | } 326 | bias_filler { 327 | type: "constant" 328 | value: 0.0 329 | } 330 | } 331 | } 332 | layer { 333 | name: "conv5/dw/relu" 334 | type: "ReLU" 335 | bottom: "conv5/dw" 336 | top: "conv5/dw" 337 | } 338 | layer { 339 | name: "conv5" 340 | type: "Convolution" 341 | bottom: "conv5/dw" 342 | top: "conv5" 343 | param { 344 | lr_mult: 1.0 345 | decay_mult: 1.0 346 | } 347 | param { 348 | lr_mult: 2.0 349 | decay_mult: 0.0 350 | } 351 | convolution_param { 352 | num_output: 256 353 | kernel_size: 1 354 | weight_filler { 355 | type: "msra" 356 | } 357 | bias_filler { 358 | type: "constant" 359 | value: 0.0 360 | } 361 | } 362 | } 363 | layer { 364 | name: "conv5/relu" 365 | type: "ReLU" 366 | bottom: "conv5" 367 | top: "conv5" 368 | } 369 | layer { 370 | name: "conv6/dw" 371 | type: "Convolution" 372 | bottom: "conv5" 373 | top: "conv6/dw" 374 | param { 375 | lr_mult: 1.0 376 | decay_mult: 1.0 377 | } 378 | param { 379 | lr_mult: 2.0 380 | decay_mult: 0.0 381 | } 382 | convolution_param { 383 | num_output: 256 384 | pad: 1 385 | kernel_size: 3 386 | stride: 2 387 | group: 256 388 | engine: CAFFE 389 | weight_filler { 390 | type: "msra" 391 | } 392 | bias_filler { 393 | type: "constant" 394 | value: 0.0 395 | } 396 | } 397 | } 398 | layer { 399 | name: "conv6/dw/relu" 400 | type: "ReLU" 401 | bottom: "conv6/dw" 402 | top: "conv6/dw" 403 | } 404 | layer { 405 | name: "conv6" 406 | type: "Convolution" 407 | bottom: "conv6/dw" 408 | top: "conv6" 409 | param { 410 | lr_mult: 1.0 411 | decay_mult: 1.0 412 | } 413 | param { 414 | lr_mult: 2.0 415 | decay_mult: 0.0 416 | } 417 | convolution_param { 418 | num_output: 512 419 | kernel_size: 1 420 | weight_filler { 421 | type: "msra" 422 | } 423 | bias_filler { 424 | type: "constant" 425 | value: 0.0 426 | } 427 | } 428 | } 429 | layer { 430 | name: "conv6/relu" 431 | type: "ReLU" 432 | bottom: "conv6" 433 | top: "conv6" 434 | } 435 | layer { 436 | name: "conv7/dw" 437 | type: "Convolution" 438 | bottom: "conv6" 439 | top: "conv7/dw" 440 | param { 441 | lr_mult: 1.0 442 | decay_mult: 1.0 443 | } 444 | param { 445 | lr_mult: 2.0 446 | decay_mult: 0.0 447 | } 448 | convolution_param { 449 | num_output: 512 450 | pad: 1 451 | kernel_size: 3 452 | group: 512 453 | engine: CAFFE 454 | weight_filler { 455 | type: "msra" 456 | } 457 | bias_filler { 458 | type: "constant" 459 | value: 0.0 460 | } 461 | } 462 | } 463 | layer { 464 | name: "conv7/dw/relu" 465 | type: "ReLU" 466 | bottom: "conv7/dw" 467 | top: "conv7/dw" 468 | } 469 | layer { 470 | name: "conv7" 471 | type: "Convolution" 472 | bottom: "conv7/dw" 473 | top: "conv7" 474 | param { 475 | lr_mult: 1.0 476 | decay_mult: 1.0 477 | } 478 | param { 479 | lr_mult: 2.0 480 | decay_mult: 0.0 481 | } 482 | convolution_param { 483 | num_output: 512 484 | kernel_size: 1 485 | weight_filler { 486 | type: "msra" 487 | } 488 | bias_filler { 489 | type: "constant" 490 | value: 0.0 491 | } 492 | } 493 | } 494 | layer { 495 | name: "conv7/relu" 496 | type: "ReLU" 497 | bottom: "conv7" 498 | top: "conv7" 499 | } 500 | layer { 501 | name: "conv8/dw" 502 | type: "Convolution" 503 | bottom: "conv7" 504 | top: "conv8/dw" 505 | param { 506 | lr_mult: 1.0 507 | decay_mult: 1.0 508 | } 509 | param { 510 | lr_mult: 2.0 511 | decay_mult: 0.0 512 | } 513 | convolution_param { 514 | num_output: 512 515 | pad: 1 516 | kernel_size: 3 517 | group: 512 518 | engine: CAFFE 519 | weight_filler { 520 | 
type: "msra" 521 | } 522 | bias_filler { 523 | type: "constant" 524 | value: 0.0 525 | } 526 | } 527 | } 528 | layer { 529 | name: "conv8/dw/relu" 530 | type: "ReLU" 531 | bottom: "conv8/dw" 532 | top: "conv8/dw" 533 | } 534 | layer { 535 | name: "conv8" 536 | type: "Convolution" 537 | bottom: "conv8/dw" 538 | top: "conv8" 539 | param { 540 | lr_mult: 1.0 541 | decay_mult: 1.0 542 | } 543 | param { 544 | lr_mult: 2.0 545 | decay_mult: 0.0 546 | } 547 | convolution_param { 548 | num_output: 512 549 | kernel_size: 1 550 | weight_filler { 551 | type: "msra" 552 | } 553 | bias_filler { 554 | type: "constant" 555 | value: 0.0 556 | } 557 | } 558 | } 559 | layer { 560 | name: "conv8/relu" 561 | type: "ReLU" 562 | bottom: "conv8" 563 | top: "conv8" 564 | } 565 | layer { 566 | name: "conv9/dw" 567 | type: "Convolution" 568 | bottom: "conv8" 569 | top: "conv9/dw" 570 | param { 571 | lr_mult: 1.0 572 | decay_mult: 1.0 573 | } 574 | param { 575 | lr_mult: 2.0 576 | decay_mult: 0.0 577 | } 578 | convolution_param { 579 | num_output: 512 580 | pad: 1 581 | kernel_size: 3 582 | group: 512 583 | engine: CAFFE 584 | weight_filler { 585 | type: "msra" 586 | } 587 | bias_filler { 588 | type: "constant" 589 | value: 0.0 590 | } 591 | } 592 | } 593 | layer { 594 | name: "conv9/dw/relu" 595 | type: "ReLU" 596 | bottom: "conv9/dw" 597 | top: "conv9/dw" 598 | } 599 | layer { 600 | name: "conv9" 601 | type: "Convolution" 602 | bottom: "conv9/dw" 603 | top: "conv9" 604 | param { 605 | lr_mult: 1.0 606 | decay_mult: 1.0 607 | } 608 | param { 609 | lr_mult: 2.0 610 | decay_mult: 0.0 611 | } 612 | convolution_param { 613 | num_output: 512 614 | kernel_size: 1 615 | weight_filler { 616 | type: "msra" 617 | } 618 | bias_filler { 619 | type: "constant" 620 | value: 0.0 621 | } 622 | } 623 | } 624 | layer { 625 | name: "conv9/relu" 626 | type: "ReLU" 627 | bottom: "conv9" 628 | top: "conv9" 629 | } 630 | layer { 631 | name: "conv10/dw" 632 | type: "Convolution" 633 | bottom: "conv9" 634 | top: "conv10/dw" 635 | param { 636 | lr_mult: 1.0 637 | decay_mult: 1.0 638 | } 639 | param { 640 | lr_mult: 2.0 641 | decay_mult: 0.0 642 | } 643 | convolution_param { 644 | num_output: 512 645 | pad: 1 646 | kernel_size: 3 647 | group: 512 648 | engine: CAFFE 649 | weight_filler { 650 | type: "msra" 651 | } 652 | bias_filler { 653 | type: "constant" 654 | value: 0.0 655 | } 656 | } 657 | } 658 | layer { 659 | name: "conv10/dw/relu" 660 | type: "ReLU" 661 | bottom: "conv10/dw" 662 | top: "conv10/dw" 663 | } 664 | layer { 665 | name: "conv10" 666 | type: "Convolution" 667 | bottom: "conv10/dw" 668 | top: "conv10" 669 | param { 670 | lr_mult: 1.0 671 | decay_mult: 1.0 672 | } 673 | param { 674 | lr_mult: 2.0 675 | decay_mult: 0.0 676 | } 677 | convolution_param { 678 | num_output: 512 679 | kernel_size: 1 680 | weight_filler { 681 | type: "msra" 682 | } 683 | bias_filler { 684 | type: "constant" 685 | value: 0.0 686 | } 687 | } 688 | } 689 | layer { 690 | name: "conv10/relu" 691 | type: "ReLU" 692 | bottom: "conv10" 693 | top: "conv10" 694 | } 695 | layer { 696 | name: "conv11/dw" 697 | type: "Convolution" 698 | bottom: "conv10" 699 | top: "conv11/dw" 700 | param { 701 | lr_mult: 1.0 702 | decay_mult: 1.0 703 | } 704 | param { 705 | lr_mult: 2.0 706 | decay_mult: 0.0 707 | } 708 | convolution_param { 709 | num_output: 512 710 | pad: 1 711 | kernel_size: 3 712 | group: 512 713 | engine: CAFFE 714 | weight_filler { 715 | type: "msra" 716 | } 717 | bias_filler { 718 | type: "constant" 719 | value: 0.0 720 | } 721 | } 722 | } 723 | layer { 
724 | name: "conv11/dw/relu" 725 | type: "ReLU" 726 | bottom: "conv11/dw" 727 | top: "conv11/dw" 728 | } 729 | layer { 730 | name: "conv11" 731 | type: "Convolution" 732 | bottom: "conv11/dw" 733 | top: "conv11" 734 | param { 735 | lr_mult: 1.0 736 | decay_mult: 1.0 737 | } 738 | param { 739 | lr_mult: 2.0 740 | decay_mult: 0.0 741 | } 742 | convolution_param { 743 | num_output: 512 744 | kernel_size: 1 745 | weight_filler { 746 | type: "msra" 747 | } 748 | bias_filler { 749 | type: "constant" 750 | value: 0.0 751 | } 752 | } 753 | } 754 | layer { 755 | name: "conv11/relu" 756 | type: "ReLU" 757 | bottom: "conv11" 758 | top: "conv11" 759 | } 760 | layer { 761 | name: "conv12/dw" 762 | type: "Convolution" 763 | bottom: "conv11" 764 | top: "conv12/dw" 765 | param { 766 | lr_mult: 1.0 767 | decay_mult: 1.0 768 | } 769 | param { 770 | lr_mult: 2.0 771 | decay_mult: 0.0 772 | } 773 | convolution_param { 774 | num_output: 512 775 | pad: 1 776 | kernel_size: 3 777 | stride: 2 778 | group: 512 779 | engine: CAFFE 780 | weight_filler { 781 | type: "msra" 782 | } 783 | bias_filler { 784 | type: "constant" 785 | value: 0.0 786 | } 787 | } 788 | } 789 | layer { 790 | name: "conv12/dw/relu" 791 | type: "ReLU" 792 | bottom: "conv12/dw" 793 | top: "conv12/dw" 794 | } 795 | layer { 796 | name: "conv12" 797 | type: "Convolution" 798 | bottom: "conv12/dw" 799 | top: "conv12" 800 | param { 801 | lr_mult: 1.0 802 | decay_mult: 1.0 803 | } 804 | param { 805 | lr_mult: 2.0 806 | decay_mult: 0.0 807 | } 808 | convolution_param { 809 | num_output: 1024 810 | kernel_size: 1 811 | weight_filler { 812 | type: "msra" 813 | } 814 | bias_filler { 815 | type: "constant" 816 | value: 0.0 817 | } 818 | } 819 | } 820 | layer { 821 | name: "conv12/relu" 822 | type: "ReLU" 823 | bottom: "conv12" 824 | top: "conv12" 825 | } 826 | layer { 827 | name: "conv13/dw" 828 | type: "Convolution" 829 | bottom: "conv12" 830 | top: "conv13/dw" 831 | param { 832 | lr_mult: 1.0 833 | decay_mult: 1.0 834 | } 835 | param { 836 | lr_mult: 2.0 837 | decay_mult: 0.0 838 | } 839 | convolution_param { 840 | num_output: 1024 841 | pad: 1 842 | kernel_size: 3 843 | group: 1024 844 | engine: CAFFE 845 | weight_filler { 846 | type: "msra" 847 | } 848 | bias_filler { 849 | type: "constant" 850 | value: 0.0 851 | } 852 | } 853 | } 854 | layer { 855 | name: "conv13/dw/relu" 856 | type: "ReLU" 857 | bottom: "conv13/dw" 858 | top: "conv13/dw" 859 | } 860 | layer { 861 | name: "conv13" 862 | type: "Convolution" 863 | bottom: "conv13/dw" 864 | top: "conv13" 865 | param { 866 | lr_mult: 1.0 867 | decay_mult: 1.0 868 | } 869 | param { 870 | lr_mult: 2.0 871 | decay_mult: 0.0 872 | } 873 | convolution_param { 874 | num_output: 1024 875 | kernel_size: 1 876 | weight_filler { 877 | type: "msra" 878 | } 879 | bias_filler { 880 | type: "constant" 881 | value: 0.0 882 | } 883 | } 884 | } 885 | layer { 886 | name: "conv13/relu" 887 | type: "ReLU" 888 | bottom: "conv13" 889 | top: "conv13" 890 | } 891 | layer { 892 | name: "conv14_1" 893 | type: "Convolution" 894 | bottom: "conv13" 895 | top: "conv14_1" 896 | param { 897 | lr_mult: 1.0 898 | decay_mult: 1.0 899 | } 900 | param { 901 | lr_mult: 2.0 902 | decay_mult: 0.0 903 | } 904 | convolution_param { 905 | num_output: 256 906 | kernel_size: 1 907 | weight_filler { 908 | type: "msra" 909 | } 910 | bias_filler { 911 | type: "constant" 912 | value: 0.0 913 | } 914 | } 915 | } 916 | layer { 917 | name: "conv14_1/relu" 918 | type: "ReLU" 919 | bottom: "conv14_1" 920 | top: "conv14_1" 921 | } 922 | layer { 923 | name: 
"conv14_2" 924 | type: "Convolution" 925 | bottom: "conv14_1" 926 | top: "conv14_2" 927 | param { 928 | lr_mult: 1.0 929 | decay_mult: 1.0 930 | } 931 | param { 932 | lr_mult: 2.0 933 | decay_mult: 0.0 934 | } 935 | convolution_param { 936 | num_output: 512 937 | pad: 1 938 | kernel_size: 3 939 | stride: 2 940 | weight_filler { 941 | type: "msra" 942 | } 943 | bias_filler { 944 | type: "constant" 945 | value: 0.0 946 | } 947 | } 948 | } 949 | layer { 950 | name: "conv14_2/relu" 951 | type: "ReLU" 952 | bottom: "conv14_2" 953 | top: "conv14_2" 954 | } 955 | layer { 956 | name: "conv15_1" 957 | type: "Convolution" 958 | bottom: "conv14_2" 959 | top: "conv15_1" 960 | param { 961 | lr_mult: 1.0 962 | decay_mult: 1.0 963 | } 964 | param { 965 | lr_mult: 2.0 966 | decay_mult: 0.0 967 | } 968 | convolution_param { 969 | num_output: 128 970 | kernel_size: 1 971 | weight_filler { 972 | type: "msra" 973 | } 974 | bias_filler { 975 | type: "constant" 976 | value: 0.0 977 | } 978 | } 979 | } 980 | layer { 981 | name: "conv15_1/relu" 982 | type: "ReLU" 983 | bottom: "conv15_1" 984 | top: "conv15_1" 985 | } 986 | layer { 987 | name: "conv15_2" 988 | type: "Convolution" 989 | bottom: "conv15_1" 990 | top: "conv15_2" 991 | param { 992 | lr_mult: 1.0 993 | decay_mult: 1.0 994 | } 995 | param { 996 | lr_mult: 2.0 997 | decay_mult: 0.0 998 | } 999 | convolution_param { 1000 | num_output: 256 1001 | pad: 1 1002 | kernel_size: 3 1003 | stride: 2 1004 | weight_filler { 1005 | type: "msra" 1006 | } 1007 | bias_filler { 1008 | type: "constant" 1009 | value: 0.0 1010 | } 1011 | } 1012 | } 1013 | layer { 1014 | name: "conv15_2/relu" 1015 | type: "ReLU" 1016 | bottom: "conv15_2" 1017 | top: "conv15_2" 1018 | } 1019 | layer { 1020 | name: "conv16_1" 1021 | type: "Convolution" 1022 | bottom: "conv15_2" 1023 | top: "conv16_1" 1024 | param { 1025 | lr_mult: 1.0 1026 | decay_mult: 1.0 1027 | } 1028 | param { 1029 | lr_mult: 2.0 1030 | decay_mult: 0.0 1031 | } 1032 | convolution_param { 1033 | num_output: 128 1034 | kernel_size: 1 1035 | weight_filler { 1036 | type: "msra" 1037 | } 1038 | bias_filler { 1039 | type: "constant" 1040 | value: 0.0 1041 | } 1042 | } 1043 | } 1044 | layer { 1045 | name: "conv16_1/relu" 1046 | type: "ReLU" 1047 | bottom: "conv16_1" 1048 | top: "conv16_1" 1049 | } 1050 | layer { 1051 | name: "conv16_2" 1052 | type: "Convolution" 1053 | bottom: "conv16_1" 1054 | top: "conv16_2" 1055 | param { 1056 | lr_mult: 1.0 1057 | decay_mult: 1.0 1058 | } 1059 | param { 1060 | lr_mult: 2.0 1061 | decay_mult: 0.0 1062 | } 1063 | convolution_param { 1064 | num_output: 256 1065 | pad: 1 1066 | kernel_size: 3 1067 | stride: 2 1068 | weight_filler { 1069 | type: "msra" 1070 | } 1071 | bias_filler { 1072 | type: "constant" 1073 | value: 0.0 1074 | } 1075 | } 1076 | } 1077 | layer { 1078 | name: "conv16_2/relu" 1079 | type: "ReLU" 1080 | bottom: "conv16_2" 1081 | top: "conv16_2" 1082 | } 1083 | layer { 1084 | name: "conv17_1" 1085 | type: "Convolution" 1086 | bottom: "conv16_2" 1087 | top: "conv17_1" 1088 | param { 1089 | lr_mult: 1.0 1090 | decay_mult: 1.0 1091 | } 1092 | param { 1093 | lr_mult: 2.0 1094 | decay_mult: 0.0 1095 | } 1096 | convolution_param { 1097 | num_output: 64 1098 | kernel_size: 1 1099 | weight_filler { 1100 | type: "msra" 1101 | } 1102 | bias_filler { 1103 | type: "constant" 1104 | value: 0.0 1105 | } 1106 | } 1107 | } 1108 | layer { 1109 | name: "conv17_1/relu" 1110 | type: "ReLU" 1111 | bottom: "conv17_1" 1112 | top: "conv17_1" 1113 | } 1114 | layer { 1115 | name: "conv17_2" 1116 | type: 
"Convolution" 1117 | bottom: "conv17_1" 1118 | top: "conv17_2" 1119 | param { 1120 | lr_mult: 1.0 1121 | decay_mult: 1.0 1122 | } 1123 | param { 1124 | lr_mult: 2.0 1125 | decay_mult: 0.0 1126 | } 1127 | convolution_param { 1128 | num_output: 128 1129 | pad: 1 1130 | kernel_size: 3 1131 | stride: 2 1132 | weight_filler { 1133 | type: "msra" 1134 | } 1135 | bias_filler { 1136 | type: "constant" 1137 | value: 0.0 1138 | } 1139 | } 1140 | } 1141 | layer { 1142 | name: "conv17_2/relu" 1143 | type: "ReLU" 1144 | bottom: "conv17_2" 1145 | top: "conv17_2" 1146 | } 1147 | layer { 1148 | name: "conv11_mbox_loc" 1149 | type: "Convolution" 1150 | bottom: "conv11" 1151 | top: "conv11_mbox_loc" 1152 | param { 1153 | lr_mult: 1.0 1154 | decay_mult: 1.0 1155 | } 1156 | param { 1157 | lr_mult: 2.0 1158 | decay_mult: 0.0 1159 | } 1160 | convolution_param { 1161 | num_output: 12 1162 | kernel_size: 1 1163 | weight_filler { 1164 | type: "msra" 1165 | } 1166 | bias_filler { 1167 | type: "constant" 1168 | value: 0.0 1169 | } 1170 | } 1171 | } 1172 | layer { 1173 | name: "conv11_mbox_loc_perm" 1174 | type: "Permute" 1175 | bottom: "conv11_mbox_loc" 1176 | top: "conv11_mbox_loc_perm" 1177 | permute_param { 1178 | order: 0 1179 | order: 2 1180 | order: 3 1181 | order: 1 1182 | } 1183 | } 1184 | layer { 1185 | name: "conv11_mbox_loc_flat" 1186 | type: "Flatten" 1187 | bottom: "conv11_mbox_loc_perm" 1188 | top: "conv11_mbox_loc_flat" 1189 | flatten_param { 1190 | axis: 1 1191 | } 1192 | } 1193 | layer { 1194 | name: "conv11_mbox_conf" 1195 | type: "Convolution" 1196 | bottom: "conv11" 1197 | top: "conv11_mbox_conf" 1198 | param { 1199 | lr_mult: 1.0 1200 | decay_mult: 1.0 1201 | } 1202 | param { 1203 | lr_mult: 2.0 1204 | decay_mult: 0.0 1205 | } 1206 | convolution_param { 1207 | num_output: 63 1208 | kernel_size: 1 1209 | weight_filler { 1210 | type: "msra" 1211 | } 1212 | bias_filler { 1213 | type: "constant" 1214 | value: 0.0 1215 | } 1216 | } 1217 | } 1218 | layer { 1219 | name: "conv11_mbox_conf_perm" 1220 | type: "Permute" 1221 | bottom: "conv11_mbox_conf" 1222 | top: "conv11_mbox_conf_perm" 1223 | permute_param { 1224 | order: 0 1225 | order: 2 1226 | order: 3 1227 | order: 1 1228 | } 1229 | } 1230 | layer { 1231 | name: "conv11_mbox_conf_flat" 1232 | type: "Flatten" 1233 | bottom: "conv11_mbox_conf_perm" 1234 | top: "conv11_mbox_conf_flat" 1235 | flatten_param { 1236 | axis: 1 1237 | } 1238 | } 1239 | layer { 1240 | name: "conv11_mbox_priorbox" 1241 | type: "PriorBox" 1242 | bottom: "conv11" 1243 | bottom: "data" 1244 | top: "conv11_mbox_priorbox" 1245 | prior_box_param { 1246 | min_size: 60.0 1247 | aspect_ratio: 2.0 1248 | flip: true 1249 | clip: false 1250 | variance: 0.1 1251 | variance: 0.1 1252 | variance: 0.2 1253 | variance: 0.2 1254 | offset: 0.5 1255 | } 1256 | } 1257 | layer { 1258 | name: "conv13_mbox_loc" 1259 | type: "Convolution" 1260 | bottom: "conv13" 1261 | top: "conv13_mbox_loc" 1262 | param { 1263 | lr_mult: 1.0 1264 | decay_mult: 1.0 1265 | } 1266 | param { 1267 | lr_mult: 2.0 1268 | decay_mult: 0.0 1269 | } 1270 | convolution_param { 1271 | num_output: 24 1272 | kernel_size: 1 1273 | weight_filler { 1274 | type: "msra" 1275 | } 1276 | bias_filler { 1277 | type: "constant" 1278 | value: 0.0 1279 | } 1280 | } 1281 | } 1282 | layer { 1283 | name: "conv13_mbox_loc_perm" 1284 | type: "Permute" 1285 | bottom: "conv13_mbox_loc" 1286 | top: "conv13_mbox_loc_perm" 1287 | permute_param { 1288 | order: 0 1289 | order: 2 1290 | order: 3 1291 | order: 1 1292 | } 1293 | } 1294 | layer { 1295 | name: 
"conv13_mbox_loc_flat" 1296 | type: "Flatten" 1297 | bottom: "conv13_mbox_loc_perm" 1298 | top: "conv13_mbox_loc_flat" 1299 | flatten_param { 1300 | axis: 1 1301 | } 1302 | } 1303 | layer { 1304 | name: "conv13_mbox_conf" 1305 | type: "Convolution" 1306 | bottom: "conv13" 1307 | top: "conv13_mbox_conf" 1308 | param { 1309 | lr_mult: 1.0 1310 | decay_mult: 1.0 1311 | } 1312 | param { 1313 | lr_mult: 2.0 1314 | decay_mult: 0.0 1315 | } 1316 | convolution_param { 1317 | num_output: 126 1318 | kernel_size: 1 1319 | weight_filler { 1320 | type: "msra" 1321 | } 1322 | bias_filler { 1323 | type: "constant" 1324 | value: 0.0 1325 | } 1326 | } 1327 | } 1328 | layer { 1329 | name: "conv13_mbox_conf_perm" 1330 | type: "Permute" 1331 | bottom: "conv13_mbox_conf" 1332 | top: "conv13_mbox_conf_perm" 1333 | permute_param { 1334 | order: 0 1335 | order: 2 1336 | order: 3 1337 | order: 1 1338 | } 1339 | } 1340 | layer { 1341 | name: "conv13_mbox_conf_flat" 1342 | type: "Flatten" 1343 | bottom: "conv13_mbox_conf_perm" 1344 | top: "conv13_mbox_conf_flat" 1345 | flatten_param { 1346 | axis: 1 1347 | } 1348 | } 1349 | layer { 1350 | name: "conv13_mbox_priorbox" 1351 | type: "PriorBox" 1352 | bottom: "conv13" 1353 | bottom: "data" 1354 | top: "conv13_mbox_priorbox" 1355 | prior_box_param { 1356 | min_size: 105.0 1357 | max_size: 150.0 1358 | aspect_ratio: 2.0 1359 | aspect_ratio: 3.0 1360 | flip: true 1361 | clip: false 1362 | variance: 0.1 1363 | variance: 0.1 1364 | variance: 0.2 1365 | variance: 0.2 1366 | offset: 0.5 1367 | } 1368 | } 1369 | layer { 1370 | name: "conv14_2_mbox_loc" 1371 | type: "Convolution" 1372 | bottom: "conv14_2" 1373 | top: "conv14_2_mbox_loc" 1374 | param { 1375 | lr_mult: 1.0 1376 | decay_mult: 1.0 1377 | } 1378 | param { 1379 | lr_mult: 2.0 1380 | decay_mult: 0.0 1381 | } 1382 | convolution_param { 1383 | num_output: 24 1384 | kernel_size: 1 1385 | weight_filler { 1386 | type: "msra" 1387 | } 1388 | bias_filler { 1389 | type: "constant" 1390 | value: 0.0 1391 | } 1392 | } 1393 | } 1394 | layer { 1395 | name: "conv14_2_mbox_loc_perm" 1396 | type: "Permute" 1397 | bottom: "conv14_2_mbox_loc" 1398 | top: "conv14_2_mbox_loc_perm" 1399 | permute_param { 1400 | order: 0 1401 | order: 2 1402 | order: 3 1403 | order: 1 1404 | } 1405 | } 1406 | layer { 1407 | name: "conv14_2_mbox_loc_flat" 1408 | type: "Flatten" 1409 | bottom: "conv14_2_mbox_loc_perm" 1410 | top: "conv14_2_mbox_loc_flat" 1411 | flatten_param { 1412 | axis: 1 1413 | } 1414 | } 1415 | layer { 1416 | name: "conv14_2_mbox_conf" 1417 | type: "Convolution" 1418 | bottom: "conv14_2" 1419 | top: "conv14_2_mbox_conf" 1420 | param { 1421 | lr_mult: 1.0 1422 | decay_mult: 1.0 1423 | } 1424 | param { 1425 | lr_mult: 2.0 1426 | decay_mult: 0.0 1427 | } 1428 | convolution_param { 1429 | num_output: 126 1430 | kernel_size: 1 1431 | weight_filler { 1432 | type: "msra" 1433 | } 1434 | bias_filler { 1435 | type: "constant" 1436 | value: 0.0 1437 | } 1438 | } 1439 | } 1440 | layer { 1441 | name: "conv14_2_mbox_conf_perm" 1442 | type: "Permute" 1443 | bottom: "conv14_2_mbox_conf" 1444 | top: "conv14_2_mbox_conf_perm" 1445 | permute_param { 1446 | order: 0 1447 | order: 2 1448 | order: 3 1449 | order: 1 1450 | } 1451 | } 1452 | layer { 1453 | name: "conv14_2_mbox_conf_flat" 1454 | type: "Flatten" 1455 | bottom: "conv14_2_mbox_conf_perm" 1456 | top: "conv14_2_mbox_conf_flat" 1457 | flatten_param { 1458 | axis: 1 1459 | } 1460 | } 1461 | layer { 1462 | name: "conv14_2_mbox_priorbox" 1463 | type: "PriorBox" 1464 | bottom: "conv14_2" 1465 | bottom: 
"data" 1466 | top: "conv14_2_mbox_priorbox" 1467 | prior_box_param { 1468 | min_size: 150.0 1469 | max_size: 195.0 1470 | aspect_ratio: 2.0 1471 | aspect_ratio: 3.0 1472 | flip: true 1473 | clip: false 1474 | variance: 0.1 1475 | variance: 0.1 1476 | variance: 0.2 1477 | variance: 0.2 1478 | offset: 0.5 1479 | } 1480 | } 1481 | layer { 1482 | name: "conv15_2_mbox_loc" 1483 | type: "Convolution" 1484 | bottom: "conv15_2" 1485 | top: "conv15_2_mbox_loc" 1486 | param { 1487 | lr_mult: 1.0 1488 | decay_mult: 1.0 1489 | } 1490 | param { 1491 | lr_mult: 2.0 1492 | decay_mult: 0.0 1493 | } 1494 | convolution_param { 1495 | num_output: 24 1496 | kernel_size: 1 1497 | weight_filler { 1498 | type: "msra" 1499 | } 1500 | bias_filler { 1501 | type: "constant" 1502 | value: 0.0 1503 | } 1504 | } 1505 | } 1506 | layer { 1507 | name: "conv15_2_mbox_loc_perm" 1508 | type: "Permute" 1509 | bottom: "conv15_2_mbox_loc" 1510 | top: "conv15_2_mbox_loc_perm" 1511 | permute_param { 1512 | order: 0 1513 | order: 2 1514 | order: 3 1515 | order: 1 1516 | } 1517 | } 1518 | layer { 1519 | name: "conv15_2_mbox_loc_flat" 1520 | type: "Flatten" 1521 | bottom: "conv15_2_mbox_loc_perm" 1522 | top: "conv15_2_mbox_loc_flat" 1523 | flatten_param { 1524 | axis: 1 1525 | } 1526 | } 1527 | layer { 1528 | name: "conv15_2_mbox_conf" 1529 | type: "Convolution" 1530 | bottom: "conv15_2" 1531 | top: "conv15_2_mbox_conf" 1532 | param { 1533 | lr_mult: 1.0 1534 | decay_mult: 1.0 1535 | } 1536 | param { 1537 | lr_mult: 2.0 1538 | decay_mult: 0.0 1539 | } 1540 | convolution_param { 1541 | num_output: 126 1542 | kernel_size: 1 1543 | weight_filler { 1544 | type: "msra" 1545 | } 1546 | bias_filler { 1547 | type: "constant" 1548 | value: 0.0 1549 | } 1550 | } 1551 | } 1552 | layer { 1553 | name: "conv15_2_mbox_conf_perm" 1554 | type: "Permute" 1555 | bottom: "conv15_2_mbox_conf" 1556 | top: "conv15_2_mbox_conf_perm" 1557 | permute_param { 1558 | order: 0 1559 | order: 2 1560 | order: 3 1561 | order: 1 1562 | } 1563 | } 1564 | layer { 1565 | name: "conv15_2_mbox_conf_flat" 1566 | type: "Flatten" 1567 | bottom: "conv15_2_mbox_conf_perm" 1568 | top: "conv15_2_mbox_conf_flat" 1569 | flatten_param { 1570 | axis: 1 1571 | } 1572 | } 1573 | layer { 1574 | name: "conv15_2_mbox_priorbox" 1575 | type: "PriorBox" 1576 | bottom: "conv15_2" 1577 | bottom: "data" 1578 | top: "conv15_2_mbox_priorbox" 1579 | prior_box_param { 1580 | min_size: 195.0 1581 | max_size: 240.0 1582 | aspect_ratio: 2.0 1583 | aspect_ratio: 3.0 1584 | flip: true 1585 | clip: false 1586 | variance: 0.1 1587 | variance: 0.1 1588 | variance: 0.2 1589 | variance: 0.2 1590 | offset: 0.5 1591 | } 1592 | } 1593 | layer { 1594 | name: "conv16_2_mbox_loc" 1595 | type: "Convolution" 1596 | bottom: "conv16_2" 1597 | top: "conv16_2_mbox_loc" 1598 | param { 1599 | lr_mult: 1.0 1600 | decay_mult: 1.0 1601 | } 1602 | param { 1603 | lr_mult: 2.0 1604 | decay_mult: 0.0 1605 | } 1606 | convolution_param { 1607 | num_output: 24 1608 | kernel_size: 1 1609 | weight_filler { 1610 | type: "msra" 1611 | } 1612 | bias_filler { 1613 | type: "constant" 1614 | value: 0.0 1615 | } 1616 | } 1617 | } 1618 | layer { 1619 | name: "conv16_2_mbox_loc_perm" 1620 | type: "Permute" 1621 | bottom: "conv16_2_mbox_loc" 1622 | top: "conv16_2_mbox_loc_perm" 1623 | permute_param { 1624 | order: 0 1625 | order: 2 1626 | order: 3 1627 | order: 1 1628 | } 1629 | } 1630 | layer { 1631 | name: "conv16_2_mbox_loc_flat" 1632 | type: "Flatten" 1633 | bottom: "conv16_2_mbox_loc_perm" 1634 | top: "conv16_2_mbox_loc_flat" 1635 | 
flatten_param { 1636 | axis: 1 1637 | } 1638 | } 1639 | layer { 1640 | name: "conv16_2_mbox_conf" 1641 | type: "Convolution" 1642 | bottom: "conv16_2" 1643 | top: "conv16_2_mbox_conf" 1644 | param { 1645 | lr_mult: 1.0 1646 | decay_mult: 1.0 1647 | } 1648 | param { 1649 | lr_mult: 2.0 1650 | decay_mult: 0.0 1651 | } 1652 | convolution_param { 1653 | num_output: 126 1654 | kernel_size: 1 1655 | weight_filler { 1656 | type: "msra" 1657 | } 1658 | bias_filler { 1659 | type: "constant" 1660 | value: 0.0 1661 | } 1662 | } 1663 | } 1664 | layer { 1665 | name: "conv16_2_mbox_conf_perm" 1666 | type: "Permute" 1667 | bottom: "conv16_2_mbox_conf" 1668 | top: "conv16_2_mbox_conf_perm" 1669 | permute_param { 1670 | order: 0 1671 | order: 2 1672 | order: 3 1673 | order: 1 1674 | } 1675 | } 1676 | layer { 1677 | name: "conv16_2_mbox_conf_flat" 1678 | type: "Flatten" 1679 | bottom: "conv16_2_mbox_conf_perm" 1680 | top: "conv16_2_mbox_conf_flat" 1681 | flatten_param { 1682 | axis: 1 1683 | } 1684 | } 1685 | layer { 1686 | name: "conv16_2_mbox_priorbox" 1687 | type: "PriorBox" 1688 | bottom: "conv16_2" 1689 | bottom: "data" 1690 | top: "conv16_2_mbox_priorbox" 1691 | prior_box_param { 1692 | min_size: 240.0 1693 | max_size: 285.0 1694 | aspect_ratio: 2.0 1695 | aspect_ratio: 3.0 1696 | flip: true 1697 | clip: false 1698 | variance: 0.1 1699 | variance: 0.1 1700 | variance: 0.2 1701 | variance: 0.2 1702 | offset: 0.5 1703 | } 1704 | } 1705 | layer { 1706 | name: "conv17_2_mbox_loc" 1707 | type: "Convolution" 1708 | bottom: "conv17_2" 1709 | top: "conv17_2_mbox_loc" 1710 | param { 1711 | lr_mult: 1.0 1712 | decay_mult: 1.0 1713 | } 1714 | param { 1715 | lr_mult: 2.0 1716 | decay_mult: 0.0 1717 | } 1718 | convolution_param { 1719 | num_output: 24 1720 | kernel_size: 1 1721 | weight_filler { 1722 | type: "msra" 1723 | } 1724 | bias_filler { 1725 | type: "constant" 1726 | value: 0.0 1727 | } 1728 | } 1729 | } 1730 | layer { 1731 | name: "conv17_2_mbox_loc_perm" 1732 | type: "Permute" 1733 | bottom: "conv17_2_mbox_loc" 1734 | top: "conv17_2_mbox_loc_perm" 1735 | permute_param { 1736 | order: 0 1737 | order: 2 1738 | order: 3 1739 | order: 1 1740 | } 1741 | } 1742 | layer { 1743 | name: "conv17_2_mbox_loc_flat" 1744 | type: "Flatten" 1745 | bottom: "conv17_2_mbox_loc_perm" 1746 | top: "conv17_2_mbox_loc_flat" 1747 | flatten_param { 1748 | axis: 1 1749 | } 1750 | } 1751 | layer { 1752 | name: "conv17_2_mbox_conf" 1753 | type: "Convolution" 1754 | bottom: "conv17_2" 1755 | top: "conv17_2_mbox_conf" 1756 | param { 1757 | lr_mult: 1.0 1758 | decay_mult: 1.0 1759 | } 1760 | param { 1761 | lr_mult: 2.0 1762 | decay_mult: 0.0 1763 | } 1764 | convolution_param { 1765 | num_output: 126 1766 | kernel_size: 1 1767 | weight_filler { 1768 | type: "msra" 1769 | } 1770 | bias_filler { 1771 | type: "constant" 1772 | value: 0.0 1773 | } 1774 | } 1775 | } 1776 | layer { 1777 | name: "conv17_2_mbox_conf_perm" 1778 | type: "Permute" 1779 | bottom: "conv17_2_mbox_conf" 1780 | top: "conv17_2_mbox_conf_perm" 1781 | permute_param { 1782 | order: 0 1783 | order: 2 1784 | order: 3 1785 | order: 1 1786 | } 1787 | } 1788 | layer { 1789 | name: "conv17_2_mbox_conf_flat" 1790 | type: "Flatten" 1791 | bottom: "conv17_2_mbox_conf_perm" 1792 | top: "conv17_2_mbox_conf_flat" 1793 | flatten_param { 1794 | axis: 1 1795 | } 1796 | } 1797 | layer { 1798 | name: "conv17_2_mbox_priorbox" 1799 | type: "PriorBox" 1800 | bottom: "conv17_2" 1801 | bottom: "data" 1802 | top: "conv17_2_mbox_priorbox" 1803 | prior_box_param { 1804 | min_size: 285.0 1805 | 
max_size: 300.0 1806 | aspect_ratio: 2.0 1807 | aspect_ratio: 3.0 1808 | flip: true 1809 | clip: false 1810 | variance: 0.1 1811 | variance: 0.1 1812 | variance: 0.2 1813 | variance: 0.2 1814 | offset: 0.5 1815 | } 1816 | } 1817 | layer { 1818 | name: "mbox_loc" 1819 | type: "Concat" 1820 | bottom: "conv11_mbox_loc_flat" 1821 | bottom: "conv13_mbox_loc_flat" 1822 | bottom: "conv14_2_mbox_loc_flat" 1823 | bottom: "conv15_2_mbox_loc_flat" 1824 | bottom: "conv16_2_mbox_loc_flat" 1825 | bottom: "conv17_2_mbox_loc_flat" 1826 | top: "mbox_loc" 1827 | concat_param { 1828 | axis: 1 1829 | } 1830 | } 1831 | layer { 1832 | name: "mbox_conf" 1833 | type: "Concat" 1834 | bottom: "conv11_mbox_conf_flat" 1835 | bottom: "conv13_mbox_conf_flat" 1836 | bottom: "conv14_2_mbox_conf_flat" 1837 | bottom: "conv15_2_mbox_conf_flat" 1838 | bottom: "conv16_2_mbox_conf_flat" 1839 | bottom: "conv17_2_mbox_conf_flat" 1840 | top: "mbox_conf" 1841 | concat_param { 1842 | axis: 1 1843 | } 1844 | } 1845 | layer { 1846 | name: "mbox_priorbox" 1847 | type: "Concat" 1848 | bottom: "conv11_mbox_priorbox" 1849 | bottom: "conv13_mbox_priorbox" 1850 | bottom: "conv14_2_mbox_priorbox" 1851 | bottom: "conv15_2_mbox_priorbox" 1852 | bottom: "conv16_2_mbox_priorbox" 1853 | bottom: "conv17_2_mbox_priorbox" 1854 | top: "mbox_priorbox" 1855 | concat_param { 1856 | axis: 2 1857 | } 1858 | } 1859 | layer { 1860 | name: "mbox_conf_reshape" 1861 | type: "Reshape" 1862 | bottom: "mbox_conf" 1863 | top: "mbox_conf_reshape" 1864 | reshape_param { 1865 | shape { 1866 | dim: 0 1867 | dim: -1 1868 | dim: 21 1869 | } 1870 | } 1871 | } 1872 | layer { 1873 | name: "mbox_conf_softmax" 1874 | type: "Softmax" 1875 | bottom: "mbox_conf_reshape" 1876 | top: "mbox_conf_softmax" 1877 | softmax_param { 1878 | axis: 2 1879 | } 1880 | } 1881 | layer { 1882 | name: "mbox_conf_flatten" 1883 | type: "Flatten" 1884 | bottom: "mbox_conf_softmax" 1885 | top: "mbox_conf_flatten" 1886 | flatten_param { 1887 | axis: 1 1888 | } 1889 | } 1890 | layer { 1891 | name: "detection_out" 1892 | type: "DetectionOutput" 1893 | bottom: "mbox_loc" 1894 | bottom: "mbox_conf_flatten" 1895 | bottom: "mbox_priorbox" 1896 | top: "detection_out" 1897 | include { 1898 | phase: TEST 1899 | } 1900 | detection_output_param { 1901 | num_classes: 21 1902 | share_location: true 1903 | background_label_id: 0 1904 | nms_param { 1905 | nms_threshold: 0.45 1906 | top_k: 100 1907 | } 1908 | code_type: CENTER_SIZE 1909 | keep_top_k: 100 1910 | confidence_threshold: 0.25 1911 | } 1912 | } 1913 | -------------------------------------------------------------------------------- /msg/DetectedObject.msg: -------------------------------------------------------------------------------- 1 | string class_name 2 | float32 confidence 3 | float32 x_min 4 | float32 x_max 5 | float32 y_min 6 | float32 y_max 7 | -------------------------------------------------------------------------------- /msg/DetectedObjectArray.msg: -------------------------------------------------------------------------------- 1 | Header header 2 | DetectedObject[] objects 3 | -------------------------------------------------------------------------------- /package.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | dnn_detect 4 | 0.1.0 5 | DNN based detection 6 | 7 | Jim Vaughan 8 | Rohan Agrawal 9 | 10 | BSD 11 | 12 | Jim Vaughan 13 | 14 | catkin 15 | 16 | roscpp 17 | tf2_geometry_msgs 18 | tf2_ros 19 | tf2 20 | visualization_msgs 21 | image_transport 22 | image_transport_plugins 23 | 
sensor_msgs 24 | cv_bridge 25 | dynamic_reconfigure 26 | 27 | 28 | -------------------------------------------------------------------------------- /src/dnn_detect.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017, Ubiquity Robotics 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 18 | * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 19 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 20 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 21 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 22 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 23 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 24 | * POSSIBILITY OF SUCH DAMAGE. 25 | * 26 | * The views and conclusions contained in the software and documentation are 27 | * those of the authors and should not be interpreted as representing official 28 | * policies, either expressed or implied, of the FreeBSD Project. 
 *
 */

// NOTE: the header names inside the angle brackets were stripped when this
// repository was exported to a single file; the include list below is
// reconstructed from what the code actually uses.
#include <string>
#include <vector>
#include <mutex>
#include <condition_variable>

#include <ros/ros.h>
#include <image_transport/image_transport.h>
#include <cv_bridge/cv_bridge.h>
#include <sensor_msgs/image_encodings.h>

#include "dnn_detect/DetectedObject.h"
#include "dnn_detect/DetectedObjectArray.h"
#include "dnn_detect/Detect.h"

#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/dnn.hpp>

#include <boost/algorithm/string.hpp>
#include <boost/format.hpp>

using namespace std;
using namespace cv;

std::condition_variable cond;
std::mutex mutx;

class DnnNode {
  private:
    ros::Publisher results_pub;

    image_transport::ImageTransport it;
    image_transport::Subscriber img_sub;

    // if set, we publish the images that contain objects
    bool publish_images;

    int frame_num;
    float min_confidence;
    int im_size;
    int rotate_flag;
    float scale_factor;
    float mean_val;
    std::vector<std::string> class_names;

    image_transport::Publisher image_pub;

    cv::dnn::Net net;
    cv::Mat resized_image;
    cv::Mat rotated_image;

    bool single_shot;
    volatile bool triggered;
    volatile bool processed;

    dnn_detect::DetectedObjectArray results;

    ros::ServiceServer detect_srv;

    bool trigger_callback(dnn_detect::Detect::Request &req,
                          dnn_detect::Detect::Response &res);

    void image_callback(const sensor_msgs::ImageConstPtr &msg);

  public:
    DnnNode(ros::NodeHandle &nh);
};

bool DnnNode::trigger_callback(dnn_detect::Detect::Request &req,
                               dnn_detect::Detect::Response &res)
{
   ROS_INFO("Got service request");
   triggered = true;

   std::unique_lock<std::mutex> lock(mutx);

   while (!processed) {
     cond.wait(lock);
   }
   res.result = results;
   processed = false;
   return true;
}


void DnnNode::image_callback(const sensor_msgs::ImageConstPtr &msg)
{
    if (single_shot && !triggered) {
       return;
    }
    triggered = false;

    ROS_INFO("Got image %d", msg->header.seq);
    frame_num++;

    cv_bridge::CvImagePtr cv_ptr;

    try {
        cv_ptr = cv_bridge::toCvCopy(msg, sensor_msgs::image_encodings::BGR8);

        int w = cv_ptr->image.cols;
        int h = cv_ptr->image.rows;

        if (rotate_flag >= 0) {
            cv::rotate(cv_ptr->image, rotated_image, rotate_flag);
            rotated_image.copyTo(cv_ptr->image);
        }

        cv::resize(cv_ptr->image, resized_image, cv::Size(im_size, im_size));
        cv::Mat blob = cv::dnn::blobFromImage(resized_image, scale_factor,
                           cv::Size(im_size, im_size), mean_val, false);

        net.setInput(blob, "data");
        cv::Mat objs = net.forward("detection_out");

        // Each row of the detection matrix is
        // [image_id, class_id, confidence, x_min, y_min, x_max, y_max],
        // with box coordinates normalized to [0, 1].
        cv::Mat detectionMat(objs.size[2], objs.size[3], CV_32F,
                             objs.ptr<float>());

        std::unique_lock<std::mutex> lock(mutx);
        results.header.frame_id = msg->header.frame_id;
        results.objects.clear();

        for (int i = 0; i < detectionMat.rows; i++) {

            float confidence = detectionMat.at<float>(i, 2);
            if (confidence > min_confidence) {
                int object_class = (int)(detectionMat.at<float>(i, 1));

                int x_min = static_cast<int>(detectionMat.at<float>(i, 3) * w);
                int y_min = static_cast<int>(detectionMat.at<float>(i, 4) * h);
                int x_max = static_cast<int>(detectionMat.at<float>(i, 5) * w);
                int y_max = static_cast<int>(detectionMat.at<float>(i, 6) * h);

                std::string class_name;
                if (object_class >= (int)class_names.size()) {
                    class_name = "unknown";
                    ROS_ERROR("Object class %d out of range of class names",
                              object_class);
                }
                else {
                    class_name = class_names[object_class];
                }
                std::string label = str(boost::format{"%1% %2%"} %
                                        class_name % confidence);

                ROS_INFO("%s", label.c_str());
                dnn_detect::DetectedObject obj;
                obj.class_name = class_name;
                obj.confidence = confidence;
                obj.x_min = x_min;
                obj.x_max = x_max;
                obj.y_min = y_min;
                obj.y_max = y_max;
                results.objects.push_back(obj);

                Rect object(x_min, y_min, x_max-x_min, y_max-y_min);

                rectangle(cv_ptr->image, object, Scalar(0, 255, 0));
                int baseline = 0;
                cv::Size text_size = cv::getTextSize(label,
                    FONT_HERSHEY_SIMPLEX, 0.75, 2, &baseline);
                putText(cv_ptr->image, label, Point(x_min, y_min-text_size.height),
                        FONT_HERSHEY_SIMPLEX, 0.75, Scalar(0, 255, 0));
            }
        }

        results_pub.publish(results);

        image_pub.publish(cv_ptr->toImageMsg());

    }
    catch(cv_bridge::Exception &e) {
        ROS_ERROR("cv_bridge exception: %s", e.what());
    }
    catch(cv::Exception &e) {
        ROS_ERROR("cv exception: %s", e.what());
    }
    ROS_DEBUG("Notifying condition variable");
    processed = true;
    cond.notify_all();
}

DnnNode::DnnNode(ros::NodeHandle &nh) : it(nh)
{
    frame_num = 0;

    std::string dir;
    std::string proto_net_file;
    std::string caffe_model_file;
    std::string classes("background,"
        "aeroplane,bicycle,bird,boat,bottle,bus,car,cat,chair,"
        "cow,diningtable,dog,horse,motorbike,person,pottedplant,"
        "sheep,sofa,train,tvmonitor");

    nh.param("single_shot", single_shot, false);

    nh.param("publish_images", publish_images, false);
    nh.param<std::string>("data_dir", dir, "");
    nh.param<std::string>("protonet_file", proto_net_file,
                          "MobileNetSSD_deploy.prototxt.txt");
    nh.param<std::string>("caffe_model_file", caffe_model_file,
                          "MobileNetSSD_deploy.caffemodel");
    nh.param<float>("min_confidence", min_confidence, 0.2);
    nh.param("im_size", im_size, 300);
    nh.param("rotate_flag", rotate_flag, -1);
    nh.param("scale_factor", scale_factor, 0.007843f);
    nh.param("mean_val", mean_val, 127.5f);
    nh.param("class_names", classes, classes);

    boost::split(class_names, classes, boost::is_any_of(","));
    ROS_INFO("Read %d class names", (int)class_names.size());

    try {
        net = cv::dnn::readNetFromCaffe(dir + "/" + proto_net_file,
                                        dir + "/" + caffe_model_file);
    }
    catch(cv::Exception &e) {
        ROS_ERROR("cv exception: %s", e.what());
        exit(1);
    }

    triggered = false;

    detect_srv = nh.advertiseService("detect", &DnnNode::trigger_callback, this);

    results_pub =
        nh.advertise<dnn_detect::DetectedObjectArray>("/dnn_objects", 20);

    image_pub = it.advertise("/dnn_images", 1);

    img_sub = it.subscribe("/camera", 1,
                           &DnnNode::image_callback, this);

    ROS_INFO("DNN detection ready");
}

int main(int argc, char **argv) {
    ros::init(argc, argv, "dnn_detect");
    ros::NodeHandle nh("~");

    DnnNode node = DnnNode(nh);
    ros::MultiThreadedSpinner spinner(2);
    spinner.spin();

    return 0;
}
--------------------------------------------------------------------------------
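Note: the XML of launch/dnn_detect.launch was lost when this repository was flattened into a single file (only empty line numbers remain in that section above). The sketch below is not the original file; it is a plausible reconstruction based on the parameters and topics read in src/dnn_detect.cpp, and the argument names and defaults are illustrative only.

```xml
<launch>
  <!-- Hypothetical reconstruction of launch/dnn_detect.launch; parameter
       names come from src/dnn_detect.cpp, values are illustrative. -->
  <arg name="camera" default="/camera"/>

  <node pkg="dnn_detect" type="dnn_detect" name="dnn_detect" output="screen">
    <param name="data_dir" value="$(find dnn_detect)/model"/>
    <param name="protonet_file" value="MobileNetSSD_deploy.prototxt.txt"/>
    <param name="caffe_model_file" value="MobileNetSSD_deploy.caffemodel"/>
    <param name="min_confidence" value="0.2"/>
    <param name="single_shot" value="false"/>
    <remap from="/camera" to="$(arg camera)"/>
  </node>
</launch>
```

The node subscribes to the camera topic via image_transport and publishes dnn_detect/DetectedObjectArray messages on /dnn_objects and annotated images on /dnn_images.
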
/srv/Detect.srv:
--------------------------------------------------------------------------------
---
DetectedObjectArray result
--------------------------------------------------------------------------------
/test/dnn_images.test:
--------------------------------------------------------------------------------
(The roslaunch XML of this test file was stripped when the repository was
exported. Judging from dnn_images_test.cpp below, it starts the dnn_detect
node with the camera topic remapped to camera/image and runs the
dnn_images_test rostest with an image_directory parameter pointing at
test/test_images/.)
--------------------------------------------------------------------------------
/test/dnn_images_test.cpp:
--------------------------------------------------------------------------------
#include <gtest/gtest.h>

// NOTE: the header names inside the angle brackets were stripped when this
// repository was exported; the list below is reconstructed from what the
// test actually uses.
#include <ros/ros.h>
#include <image_transport/image_transport.h>
#include <cv_bridge/cv_bridge.h>
#include <std_msgs/Header.h>
#include <boost/thread.hpp>
#include <unistd.h>

#include "dnn_detect/DetectedObject.h"
#include "dnn_detect/DetectedObjectArray.h"
#include "dnn_detect/Detect.h"

#include <opencv2/opencv.hpp>

#if CV_MAJOR_VERSION < 4
#define IMREAD_COLOR_MODE CV_LOAD_IMAGE_COLOR
#else
#define IMREAD_COLOR_MODE cv::IMREAD_COLOR
#endif

class DnnImagesTest : public ::testing::Test {
protected:
  virtual void SetUp() {
    it = new image_transport::ImageTransport(nh);
    image_pub = it->advertise("camera/image", 1);

    ros::NodeHandle nh_priv("~");
    nh_priv.getParam("image_directory", image_directory);
    object_sub = nh.subscribe("/dnn_objects", 1, &DnnImagesTest::object_callback, this);
    got_object = false;
    got_cat = false;
  }

  // Make a service request to trigger detection
  void trigger() {
    ros::NodeHandle node;
    ros::ServiceClient client =
        node.serviceClient<dnn_detect::Detect>("/dnn_detect/detect");
    dnn_detect::Detect d;
    client.call(d);
  }

  virtual void TearDown() { delete it; }

  void publish_image(std::string file) {
    boost::thread trig(&DnnImagesTest::trigger, this);

    sleep(1);
    cv::Mat image = cv::imread(image_directory+file, IMREAD_COLOR_MODE);
    sensor_msgs::ImagePtr msg = cv_bridge::CvImage(std_msgs::Header(), "bgr8",
                                                   image).toImageMsg();
    image_pub.publish(msg);
  }

  ros::NodeHandle nh;

  // Set up publishing of static images
  image_transport::ImageTransport* it;
  image_transport::Publisher image_pub;

  bool got_object;
  bool got_cat;
  ros::Subscriber object_sub;

  std::string image_directory;

  // Set up subscribing
  void object_callback(const dnn_detect::DetectedObjectArray& results) {
    got_object = true;
    for (const auto& obj : results.objects) {
      if (obj.class_name == "cat") {
        got_cat = true;
      }
    }
  }
};


TEST_F(DnnImagesTest, cat) {
  ros::Rate loop_rate(5);
  while (nh.ok() && !got_object && !got_cat) {
    publish_image("cat.jpg");
    ros::spinOnce();
    loop_rate.sleep();
  }

  ASSERT_TRUE(got_cat);
}

int main(int argc, char** argv)
{
  testing::InitGoogleTest(&argc, argv);
  ros::init(argc, argv, "DnnImagesTest");
  return RUN_ALL_TESTS();
}
--------------------------------------------------------------------------------
/test/test_images/cat.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/UbiquityRobotics/dnn_detect/c23161c9c1c2a2bd15618b6b3450522ac8aad2cb/test/test_images/cat.jpg
--------------------------------------------------------------------------------
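Example usage (not part of the package): a minimal one-shot client sketch. The service name /dnn_detect/detect and the message fields come from srv/Detect.srv, msg/DetectedObject.msg and src/dnn_detect.cpp above; the node and file names are made up for illustration.

```cpp
// one_shot_client.cpp -- hypothetical example, not part of this package.
// Calls the dnn_detect "detect" service once and prints the detections.
#include <ros/ros.h>
#include "dnn_detect/Detect.h"

int main(int argc, char **argv)
{
  ros::init(argc, argv, "one_shot_client");
  ros::NodeHandle nh;

  // Service advertised by the dnn_detect node (same name the test above uses).
  ros::ServiceClient client =
      nh.serviceClient<dnn_detect::Detect>("/dnn_detect/detect");

  dnn_detect::Detect srv;   // request is empty; response carries the results
  if (!client.call(srv)) {
    ROS_ERROR("Detect service call failed");
    return 1;
  }

  for (const auto &obj : srv.response.result.objects) {
    ROS_INFO("%s (%.2f): x [%.0f, %.0f] y [%.0f, %.0f]",
             obj.class_name.c_str(), obj.confidence,
             obj.x_min, obj.x_max, obj.y_min, obj.y_max);
  }
  return 0;
}
```

When the node is started with single_shot set to true, the service call blocks until the next image on the camera topic has been received and processed; with single_shot false the node publishes results continuously on /dnn_objects and the service simply returns the next frame's detections.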