├── README.md
└── deploy.prototxt

/README.md:
--------------------------------------------------------------------------------
# CSRNet (Try our [Pytorch Version](https://github.com/leeyeehoo/CSRNet-pytorch/tree/master)!)
This is the repo for [CSRNet: Dilated Convolutional Neural Networks for Understanding the Highly Congested Scenes](https://arxiv.org/abs/1802.10062) in CVPR 2018, which delivered a state-of-the-art, straightforward and end-to-end architecture for crowd counting tasks.
## Datasets
ShanghaiTech Dataset: [Google Drive](https://drive.google.com/open?id=16dhJn7k4FWVwByRsQAEpl9lwjuV03jVI)

## Models (Only for tests)

These are the models for testing. The results should be similar to the results shown in the paper (slightly better or worse).

1) ShanghaiTech_Part_A: [Google Drive](https://drive.google.com/open?id=1odZ3B_ZDSepPcVFO_TfGUIrpF2DF7SwY)

2) ShanghaiTech_Part_B: [Google Drive](https://drive.google.com/open?id=1NOpn0ztlye85vrHR2TMwOI2Qu_S8zANj)

## Prerequisites

1) A working CAFFE installation

We understand that it is tedious and difficult to configure a custom input layer (or even to install CAFFE on your own PC), so we have made a PyTorch version of CSRNet: [CSRNet Pytorch Version](https://github.com/leeyeehoo/CSRNet-pytorch/tree/master)

## References

If you find CSRNet useful, please cite our paper. Thank you!

```
@inproceedings{li2018csrnet,
  title={CSRNet: Dilated convolutional neural networks for understanding the highly congested scenes},
  author={Li, Yuhong and Zhang, Xiaofan and Chen, Deming},
  booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
  pages={1091--1100},
  year={2018}
}
```
Please cite the ShanghaiTech dataset and other works if you use them.
34 | 35 | ``` 36 | @inproceedings{zhang2016single, 37 | title={Single-image crowd counting via multi-column convolutional neural network}, 38 | author={Zhang, Yingying and Zhou, Desen and Chen, Siqin and Gao, Shenghua and Ma, Yi}, 39 | booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition}, 40 | pages={589--597}, 41 | year={2016} 42 | } 43 | ``` 44 | -------------------------------------------------------------------------------- /deploy.prototxt: -------------------------------------------------------------------------------- 1 | input: "data" 2 | input_shape { 3 | dim: 1 4 | dim: 3 5 | dim: 224 6 | dim: 224 7 | } 8 | #First column network 9 | 10 | layer { 11 | bottom: "data" 12 | top: "conv1_1" 13 | name: "conv1_1" 14 | type: "Convolution" 15 | param { 16 | lr_mult: 0 17 | decay_mult: 0 18 | } 19 | param { 20 | lr_mult: 0 21 | decay_mult: 0 22 | } 23 | convolution_param { 24 | num_output: 64 25 | pad: 1 26 | kernel_size: 3 27 | weight_filler { 28 | type: "gaussian" 29 | std: 0.01 30 | } 31 | bias_filler { 32 | type: "constant" 33 | value: 0 34 | } 35 | } 36 | } 37 | layer { 38 | bottom: "conv1_1" 39 | top: "conv1_1" 40 | name: "relu1_1" 41 | type: "ReLU" 42 | } 43 | layer { 44 | bottom: "conv1_1" 45 | top: "conv1_2" 46 | name: "conv1_2" 47 | type: "Convolution" 48 | param { 49 | lr_mult: 0 50 | decay_mult: 0 51 | } 52 | param { 53 | lr_mult: 0 54 | decay_mult: 0 55 | } 56 | convolution_param { 57 | num_output: 64 58 | pad: 1 59 | kernel_size: 3 60 | weight_filler { 61 | type: "gaussian" 62 | std: 0.01 63 | } 64 | bias_filler { 65 | type: "constant" 66 | value: 0 67 | } 68 | } 69 | } 70 | layer { 71 | bottom: "conv1_2" 72 | top: "conv1_2" 73 | name: "relu1_2" 74 | type: "ReLU" 75 | } 76 | layer { 77 | bottom: "conv1_2" 78 | top: "pool1" 79 | name: "pool1" 80 | type: "Pooling" 81 | pooling_param { 82 | pool: MAX 83 | kernel_size: 2 84 | stride: 2 85 | } 86 | } 87 | layer { 88 | bottom: "pool1" 89 | top: "conv2_1" 90 | name: 
"conv2_1" 91 | type: "Convolution" 92 | param { 93 | lr_mult: 0 94 | decay_mult: 0 95 | } 96 | param { 97 | lr_mult: 0 98 | decay_mult: 0 99 | } 100 | convolution_param { 101 | num_output: 128 102 | pad: 1 103 | kernel_size: 3 104 | weight_filler { 105 | type: "gaussian" 106 | std: 0.01 107 | } 108 | bias_filler { 109 | type: "constant" 110 | value: 0 111 | } 112 | } 113 | } 114 | layer { 115 | bottom: "conv2_1" 116 | top: "conv2_1" 117 | name: "relu2_1" 118 | type: "ReLU" 119 | } 120 | layer { 121 | bottom: "conv2_1" 122 | top: "conv2_2" 123 | name: "conv2_2" 124 | type: "Convolution" 125 | param { 126 | lr_mult: 0 127 | decay_mult: 0 128 | } 129 | param { 130 | lr_mult: 0 131 | decay_mult: 0 132 | } 133 | convolution_param { 134 | num_output: 128 135 | pad: 1 136 | kernel_size: 3 137 | weight_filler { 138 | type: "gaussian" 139 | std: 0.01 140 | } 141 | bias_filler { 142 | type: "constant" 143 | value: 0 144 | } 145 | } 146 | } 147 | layer { 148 | bottom: "conv2_2" 149 | top: "conv2_2" 150 | name: "relu2_2" 151 | type: "ReLU" 152 | } 153 | layer { 154 | bottom: "conv2_2" 155 | top: "pool2" 156 | name: "pool2" 157 | type: "Pooling" 158 | pooling_param { 159 | pool: MAX 160 | kernel_size: 2 161 | stride: 2 162 | } 163 | } 164 | layer { 165 | bottom: "pool2" 166 | top: "conv3_1" 167 | name: "conv3_1" 168 | type: "Convolution" 169 | param { 170 | lr_mult: 0 171 | decay_mult: 0 172 | } 173 | param { 174 | lr_mult: 0 175 | decay_mult: 0 176 | } 177 | convolution_param { 178 | num_output: 256 179 | pad: 1 180 | kernel_size: 3 181 | weight_filler { 182 | type: "gaussian" 183 | std: 0.01 184 | } 185 | bias_filler { 186 | type: "constant" 187 | value: 0 188 | } 189 | } 190 | } 191 | layer { 192 | bottom: "conv3_1" 193 | top: "conv3_1" 194 | name: "relu3_1" 195 | type: "ReLU" 196 | } 197 | layer { 198 | bottom: "conv3_1" 199 | top: "conv3_2" 200 | name: "conv3_2" 201 | type: "Convolution" 202 | param { 203 | lr_mult: 0 204 | decay_mult: 0 205 | } 206 | param { 207 | 
lr_mult: 0 208 | decay_mult: 0 209 | } 210 | convolution_param { 211 | num_output: 256 212 | pad: 1 213 | kernel_size: 3 214 | weight_filler { 215 | type: "gaussian" 216 | std: 0.01 217 | } 218 | bias_filler { 219 | type: "constant" 220 | value: 0 221 | } 222 | } 223 | } 224 | layer { 225 | bottom: "conv3_2" 226 | top: "conv3_2" 227 | name: "relu3_2" 228 | type: "ReLU" 229 | } 230 | layer { 231 | bottom: "conv3_2" 232 | top: "conv3_3" 233 | name: "conv3_3" 234 | type: "Convolution" 235 | param { 236 | lr_mult: 0 237 | decay_mult: 0 238 | } 239 | param { 240 | lr_mult: 0 241 | decay_mult: 0 242 | } 243 | convolution_param { 244 | num_output: 256 245 | pad: 1 246 | kernel_size: 3 247 | weight_filler { 248 | type: "gaussian" 249 | std: 0.01 250 | } 251 | bias_filler { 252 | type: "constant" 253 | value: 0 254 | } 255 | } 256 | } 257 | layer { 258 | bottom: "conv3_3" 259 | top: "conv3_3" 260 | name: "relu3_3" 261 | type: "ReLU" 262 | } 263 | layer { 264 | bottom: "conv3_3" 265 | top: "pool3" 266 | name: "pool3" 267 | type: "Pooling" 268 | pooling_param { 269 | pool: MAX 270 | kernel_size: 2 271 | stride: 2 272 | } 273 | } 274 | layer { 275 | bottom: "pool3" 276 | top: "conv4_1" 277 | name: "conv4_1" 278 | type: "Convolution" 279 | param { 280 | lr_mult: 0 281 | decay_mult: 0 282 | } 283 | param { 284 | lr_mult: 0 285 | decay_mult: 0 286 | } 287 | convolution_param { 288 | num_output: 512 289 | pad: 1 290 | kernel_size: 3 291 | weight_filler { 292 | type: "gaussian" 293 | std: 0.01 294 | } 295 | bias_filler { 296 | type: "constant" 297 | value: 0 298 | } 299 | } 300 | } 301 | layer { 302 | bottom: "conv4_1" 303 | top: "conv4_1" 304 | name: "relu4_1" 305 | type: "ReLU" 306 | } 307 | layer { 308 | bottom: "conv4_1" 309 | top: "conv4_2" 310 | name: "conv4_2" 311 | type: "Convolution" 312 | param { 313 | lr_mult: 0 314 | decay_mult: 0 315 | } 316 | param { 317 | lr_mult: 0 318 | decay_mult: 0 319 | } 320 | convolution_param { 321 | num_output: 512 322 | pad: 1 323 | 
kernel_size: 3 324 | weight_filler { 325 | type: "gaussian" 326 | std: 0.01 327 | } 328 | bias_filler { 329 | type: "constant" 330 | value: 0 331 | } 332 | } 333 | } 334 | layer { 335 | bottom: "conv4_2" 336 | top: "conv4_2" 337 | name: "relu4_2" 338 | type: "ReLU" 339 | } 340 | layer { 341 | bottom: "conv4_2" 342 | top: "conv4_3" 343 | name: "conv4_3" 344 | type: "Convolution" 345 | param { 346 | lr_mult: 0 347 | decay_mult: 0 348 | } 349 | param { 350 | lr_mult: 0 351 | decay_mult: 0 352 | } 353 | convolution_param { 354 | num_output: 512 355 | pad: 1 356 | kernel_size: 3 357 | weight_filler { 358 | type: "gaussian" 359 | std: 0.01 360 | } 361 | bias_filler { 362 | type: "constant" 363 | value: 0 364 | } 365 | } 366 | } 367 | layer { 368 | bottom: "conv4_3" 369 | top: "conv4_3" 370 | name: "relu4_3" 371 | type: "ReLU" 372 | } 373 | layer { 374 | bottom: "conv4_3" 375 | top: "conv6_1" 376 | name: "conv6_1" 377 | type: "Convolution" 378 | param { 379 | lr_mult: 1 380 | decay_mult: 1 381 | } 382 | param { 383 | lr_mult: 2 384 | decay_mult: 0 385 | } 386 | convolution_param { 387 | num_output: 512 388 | pad: 2 389 | dilation: 2 390 | kernel_size: 3 391 | weight_filler { 392 | type: "gaussian" 393 | std: 0.01 394 | } 395 | bias_filler { 396 | type: "constant" 397 | value: 0 398 | } 399 | } 400 | } 401 | layer { 402 | bottom: "conv6_1" 403 | top: "conv6_1" 404 | name: "relu6_1" 405 | type: "ReLU" 406 | } 407 | layer { 408 | bottom: "conv6_1" 409 | top: "conv6_2" 410 | name: "conv6_2" 411 | type: "Convolution" 412 | param { 413 | lr_mult: 1 414 | decay_mult: 1 415 | } 416 | param { 417 | lr_mult: 2 418 | decay_mult: 0 419 | } 420 | convolution_param { 421 | num_output: 512 422 | pad: 2 423 | dilation: 2 424 | kernel_size: 3 425 | weight_filler { 426 | type: "gaussian" 427 | std: 0.01 428 | } 429 | bias_filler { 430 | type: "constant" 431 | value: 0 432 | } 433 | } 434 | } 435 | layer { 436 | bottom: "conv6_2" 437 | top: "conv6_2" 438 | name: "relu6_2" 439 | type: "ReLU" 
440 | } 441 | layer { 442 | bottom: "conv6_2" 443 | top: "conv6_3" 444 | name: "conv6_3" 445 | type: "Convolution" 446 | param { 447 | lr_mult: 1 448 | decay_mult: 1 449 | } 450 | param { 451 | lr_mult: 2 452 | decay_mult: 0 453 | } 454 | convolution_param { 455 | num_output: 512 456 | pad: 2 457 | dilation: 2 458 | kernel_size: 3 459 | weight_filler { 460 | type: "gaussian" 461 | std: 0.01 462 | } 463 | bias_filler { 464 | type: "constant" 465 | value: 0 466 | } 467 | } 468 | } 469 | layer { 470 | bottom: "conv6_3" 471 | top: "conv6_3" 472 | name: "relu6_3" 473 | type: "ReLU" 474 | } 475 | layer { 476 | bottom: "conv6_3" 477 | top: "conv7_1" 478 | name: "conv7_1" 479 | type: "Convolution" 480 | param { 481 | lr_mult: 1 482 | decay_mult: 1 483 | } 484 | param { 485 | lr_mult: 2 486 | decay_mult: 0 487 | } 488 | convolution_param { 489 | num_output: 256 490 | pad: 2 491 | dilation: 2 492 | kernel_size: 3 493 | weight_filler { 494 | type: "gaussian" 495 | std: 0.01 496 | } 497 | bias_filler { 498 | type: "constant" 499 | value: 0 500 | } 501 | } 502 | } 503 | layer { 504 | bottom: "conv7_1" 505 | top: "conv7_1" 506 | name: "relu7_1" 507 | type: "ReLU" 508 | } 509 | layer { 510 | bottom: "conv7_1" 511 | top: "conv7_2" 512 | name: "conv7_2" 513 | type: "Convolution" 514 | param { 515 | lr_mult: 1 516 | decay_mult: 1 517 | } 518 | param { 519 | lr_mult: 2 520 | decay_mult: 0 521 | } 522 | convolution_param { 523 | num_output: 128 524 | pad: 2 525 | dilation: 2 526 | kernel_size: 3 527 | weight_filler { 528 | type: "gaussian" 529 | std: 0.01 530 | } 531 | bias_filler { 532 | type: "constant" 533 | value: 0 534 | } 535 | } 536 | } 537 | layer { 538 | bottom: "conv7_2" 539 | top: "conv7_2" 540 | name: "relu7_2" 541 | type: "ReLU" 542 | } 543 | layer { 544 | bottom: "conv7_2" 545 | top: "conv7_3" 546 | name: "conv7_3" 547 | type: "Convolution" 548 | param { 549 | lr_mult: 1 550 | decay_mult: 1 551 | } 552 | param { 553 | lr_mult: 2 554 | decay_mult: 0 555 | } 556 | 
convolution_param { 557 | num_output: 64 558 | pad: 2 559 | dilation: 2 560 | kernel_size: 3 561 | weight_filler { 562 | type: "gaussian" 563 | std: 0.01 564 | } 565 | bias_filler { 566 | type: "constant" 567 | value: 0 568 | } 569 | } 570 | } 571 | layer { 572 | bottom: "conv7_3" 573 | top: "conv7_3" 574 | name: "relu7_3" 575 | type: "ReLU" 576 | } 577 | layer { 578 | bottom: "conv7_3" 579 | top: "estdmap" 580 | name: "fu1" 581 | type: "Convolution" 582 | param { 583 | lr_mult: 1 584 | decay_mult: 1 585 | } 586 | param { 587 | lr_mult: 2 588 | decay_mult: 0 589 | } 590 | convolution_param { 591 | num_output: 1 592 | kernel_size: 1 593 | weight_filler { 594 | type: "gaussian" 595 | std: 0.01 596 | } 597 | bias_filler { 598 | type: "constant" 599 | value: 0 600 | } 601 | } 602 | } 603 | 604 | 605 | --------------------------------------------------------------------------------