├── LICENSE ├── README.md ├── copy_model.h ├── data ├── .gitignore ├── README.md ├── demo │ ├── 000456.jpg │ ├── 000542.jpg │ ├── 001150.jpg │ ├── 001763.jpg │ └── 004545.jpg ├── pylintrc └── scripts │ ├── fetch_fast_rcnn_ohem_models.sh │ ├── fetch_imagenet_models.sh │ └── fetch_selective_search_data.sh ├── experiments ├── README.md ├── cfgs │ ├── fast_rcnn_adv_128.yml │ └── fast_rcnn_adv_pretrain.yml ├── logs │ └── .gitignore └── scripts │ ├── fast_rcnn_adv.sh │ ├── fast_rcnn_adv_pretrain.sh │ └── fast_rcnn_std.sh ├── lib ├── Makefile ├── datasets │ ├── VOCdevkit-matlab-wrapper │ │ ├── get_voc_opts.m │ │ ├── voc_eval.m │ │ └── xVOCap.m │ ├── __init__.py │ ├── coco.py │ ├── ds_utils.py │ ├── factory.py │ ├── imdb.py │ ├── pascal_voc.py │ ├── tools │ │ └── mcg_munge.py │ └── voc_eval.py ├── fast_rcnn │ ├── __init__.py │ ├── bbox_transform.py │ ├── config.py │ ├── nms_wrapper.py │ ├── test.py │ └── train.py ├── nms │ ├── .gitignore │ ├── __init__.py │ ├── cpu_nms.pyx │ ├── gpu_nms.hpp │ ├── gpu_nms.pyx │ ├── nms_kernel.cu │ └── py_cpu_nms.py ├── pycocotools │ ├── UPSTREAM_REV │ ├── __init__.py │ ├── _mask.pyx │ ├── coco.py │ ├── cocoeval.py │ ├── license.txt │ ├── mask.py │ ├── maskApi.c │ └── maskApi.h ├── roi_data_layer │ ├── __init__.py │ ├── layer.py │ ├── minibatch.py │ └── roidb.py ├── rpn │ ├── README.md │ ├── __init__.py │ ├── anchor_target_layer.py │ ├── generate.py │ ├── generate_anchors.py │ ├── proposal_layer.py │ └── proposal_target_layer.py ├── setup.py ├── transform │ ├── __init__.py │ └── torch_image_transform_layer.py └── utils │ ├── .gitignore │ ├── __init__.py │ ├── bbox.pyx │ ├── blob.py │ └── timer.py ├── models └── pascal_voc │ ├── VGG16 │ ├── fast_rcnn │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── train.prototxt │ ├── fast_rcnn_adv │ │ ├── init_weights2.json │ │ ├── solver.prototxt │ │ └── train.prototxt │ ├── fast_rcnn_adv_pretrain │ │ ├── solver.prototxt │ │ └── train.prototxt │ └── fast_rcnn_std │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── train.prototxt │ └── VGG_CNN_M_1024 │ ├── fast_rcnn │ ├── solver.prototxt │ ├── test.prototxt │ └── train.prototxt │ └── fast_rcnn_ohem │ ├── solver.prototxt │ └── train.prototxt ├── python_utils ├── __init__.py ├── _init_paths.py ├── do_net_surgery.py ├── evaluate_detection.py ├── general_utils.py └── pycaffe_utils.py ├── tools ├── README.md ├── _init_paths.py ├── compress_net.py ├── demo.py ├── eval_recall.py ├── reval.py ├── rpn_generate.py ├── test_net.py ├── train_faster_rcnn_alt_opt.py ├── train_net.py └── train_svms.py └── train.sh /LICENSE: -------------------------------------------------------------------------------- 1 | Adversarial Fast-RCNN (A-FAST-RCNN) 2 | 3 | Copyright (c) 2017, Xiaolong Wang 4 | 5 | The MIT License (MIT) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in 15 | all copies or substantial portions of the Software. 
16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | THE SOFTWARE. 24 | 25 | ************************************************************************ 26 | 27 | THIRD-PARTY SOFTWARE NOTICES AND INFORMATION 28 | 29 | This project, A-FAST-RCNN, incorporates material from the project(s) 30 | listed below (collectively, "Third Party Code"). The original copyright 31 | notice and license of such Third Party Code are set out below. This 32 | Third Party Code is licensed to you under their original license terms 33 | set forth below. 34 | 35 | 1. Fast R-CNN (https://github.com/rbgirshick/fast-rcnn) 36 | 37 | Copyright (c) Microsoft Corporation 38 | 39 | All rights reserved. 40 | 41 | MIT License 42 | 43 | Permission is hereby granted, free of charge, to any person obtaining a 44 | copy of this software and associated documentation files (the "Software"), 45 | to deal in the Software without restriction, including without limitation 46 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 47 | and/or sell copies of the Software, and to permit persons to whom the 48 | Software is furnished to do so, subject to the following conditions: 49 | 50 | The above copyright notice and this permission notice shall be included 51 | in all copies or substantial portions of the Software. 52 | 53 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 54 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 55 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 56 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 57 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 58 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 59 | OTHER DEALINGS IN THE SOFTWARE. 60 | 61 | 62 | 2. Faster R-CNN (https://github.com/rbgirshick/py-faster-rcnn) 63 | 64 | The MIT License (MIT) 65 | 66 | Copyright (c) 2015 Microsoft Corporation 67 | 68 | Permission is hereby granted, free of charge, to any person obtaining a copy 69 | of this software and associated documentation files (the "Software"), to deal 70 | in the Software without restriction, including without limitation the rights 71 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 72 | copies of the Software, and to permit persons to whom the Software is 73 | furnished to do so, subject to the following conditions: 74 | 75 | The above copyright notice and this permission notice shall be included in 76 | all copies or substantial portions of the Software. 77 | 78 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 79 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 80 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 81 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 82 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 83 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 84 | THE SOFTWARE. 85 | 86 | 3. 
Caffe (https://github.com/BVLC/caffe/) 87 | 88 | COPYRIGHT 89 | 90 | All contributions by the University of California: 91 | Copyright (c) 2014, 2015, The Regents of the University of California (Regents) 92 | All rights reserved. 93 | 94 | All other contributions: 95 | Copyright (c) 2014, 2015, the respective contributors 96 | All rights reserved. 97 | 98 | Caffe uses a shared copyright model: each contributor holds copyright 99 | over their contributions to Caffe. The project versioning records all 100 | such contribution and copyright details. If a contributor wants to 101 | further mark their specific copyright on a particular contribution, 102 | they should indicate their copyright solely in the commit message of 103 | the change when it is committed. 104 | 105 | The BSD 2-Clause License 106 | 107 | Redistribution and use in source and binary forms, with or without 108 | modification, are permitted provided that the following conditions 109 | are met: 110 | 111 | 1. Redistributions of source code must retain the above copyright notice, 112 | this list of conditions and the following disclaimer. 113 | 114 | 2. Redistributions in binary form must reproduce the above copyright 115 | notice, this list of conditions and the following disclaimer in the 116 | documentation and/or other materials provided with the distribution. 117 | 118 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 119 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 120 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 121 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 122 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 123 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 124 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 125 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 126 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 127 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 128 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 129 | 130 | ************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION********** 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A-Fast-RCNN: Hard Positive Generation via Adversary for Object Detection 2 | By Xiaolong Wang, Abhinav Shrivastava, and Abhinav Gupta 3 | 4 | ### Introduction 5 | 6 | This is a Caffe-based version of A-Fast-RCNN ([arxiv_link](https://arxiv.org/pdf/1704.03414.pdf)). Although we originally implemented it in Torch, this Caffe re-implementation is much simpler, faster, and easier to use. 7 | 8 | We release the code for training A-Fast-RCNN with the Adversarial Spatial Dropout Network. 9 | 10 | 11 | ### License 12 | 13 | This code is released under the MIT License (refer to the LICENSE file for details).
14 | 15 | ### Citing 16 | 17 | If you find this work useful in your research, please consider citing: 18 | 19 | @inproceedings{WangCVPR17afrcnn, 20 | Author = {Xiaolong Wang and Abhinav Shrivastava and Abhinav Gupta}, 21 | Title = {A-Fast-RCNN: Hard Positive Generation via Adversary for Object Detection}, 22 | Booktitle = {Conference on Computer Vision and Pattern Recognition ({CVPR})}, 23 | Year = {2017} 24 | } 25 | 26 | ### Disclaimer 27 | 28 | This implementation is built on a *fork* of the OHEM code ([here](https://github.com/abhi2610/ohem)), which in turn builds on the Faster R-CNN Python code ([here](https://github.com/rbgirshick/py-faster-rcnn)) and Fast R-CNN ([here](https://github.com/rbgirshick/fast-rcnn)). Please cite the appropriate papers depending on which part of the code and/or model you are using. 29 | 30 | ### Results 31 | | Approach | training data | test data | mAP |
| --- | --- | --- | --- | 32 | | Fast R-CNN (FRCN) | VOC 07 trainval | VOC 07 test | 67.6 | 33 | | FRCN with adversary | VOC 07 trainval | VOC 07 test | 70.8 | 34 | 35 | **Note**: The reported results are based on the VGG16 network. 36 | 37 | 38 | 39 | ### Installation 40 | 41 | Please follow the installation instructions of the Faster R-CNN Python code ([here](https://github.com/rbgirshick/py-faster-rcnn)) exactly, and download the PASCAL VOC data as described there. 42 | 43 | ### Usage 44 | 45 | To run the code, one can simply do: 46 | ```Shell 47 | ./train.sh 48 | ``` 49 | 50 | It includes three stages of training (a consolidated sketch of the pipeline is given at the end of this section): 51 | 52 | ```Shell 53 | ./experiments/scripts/fast_rcnn_std.sh [GPU_ID] VGG16 pascal_voc 54 | ``` 55 | which trains a standard Fast R-CNN for 10K iterations; you can download my [model](https://www.dropbox.com/s/ccs7lw3gydfzgvv/fast_rcnn_std_iter_10000.caffemodel?dl=0) and [logs](https://www.dropbox.com/s/hwbag60l1gmtxbb/fast_rcnn_std.txt.2017-04-08_16-53-59?dl=0) for this step. 56 | 57 | ```Shell 58 | ./experiments/scripts/fast_rcnn_adv_pretrain.sh [GPU_ID] VGG16 pascal_voc 59 | ``` 60 | which pre-trains the adversarial network; you can download my [model](https://www.dropbox.com/s/hvqpxn3bigarhdn/fast_rcnn_adv_pretrain_iter_25000.caffemodel?dl=0) and [logs](https://www.dropbox.com/s/i79j5hd0ee4ybke/fast_rcnn_adv_pretrain.txt.2017-04-08_19-39-49?dl=0) for this step. 61 | 62 | ```Shell 63 | ./copy_model.h 64 | ``` 65 | which copies the weights of the two models above to initialize the joint model. 66 | 67 | ```Shell 68 | ./experiments/scripts/fast_rcnn_adv.sh [GPU_ID] VGG16 pascal_voc 69 | ``` 70 | which jointly trains the detector and the adversarial network; you can download my [model](https://www.dropbox.com/s/5wvxh8g5n3ewvp4/fast_rcnn_adv_iter_40000.caffemodel?dl=0) and [logs](https://www.dropbox.com/s/awrdrwyfthdgba5/fast_rcnn_adv.txt.2017-04-09_22-09-57?dl=0) for this step.
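
For reference, the following is a minimal sketch of the full pipeline that `train.sh` chains together, assuming the three stages are simply run back-to-back (the GPU id `0` is an illustrative choice):

```Shell
# Stage 1: train the standard Fast R-CNN baseline (10K iterations)
./experiments/scripts/fast_rcnn_std.sh 0 VGG16 pascal_voc

# Stage 2: pre-train the adversarial spatial dropout network (25K iterations)
./experiments/scripts/fast_rcnn_adv_pretrain.sh 0 VGG16 pascal_voc

# Copy the weights of both models into one initialization for joint training
./copy_model.h

# Stage 3: jointly train the detector and the adversary (40K iterations)
./experiments/scripts/fast_rcnn_adv.sh 0 VGG16 pascal_voc
```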
71 | -------------------------------------------------------------------------------- /copy_model.h: -------------------------------------------------------------------------------- 1 | 2 | 3 | python python_utils/do_net_surgery.py \ 4 | --out_net_def models/pascal_voc/VGG16/fast_rcnn_adv/train.prototxt \ 5 | --net_surgery_json models/pascal_voc/VGG16/fast_rcnn_adv/init_weights2.json \ 6 | --out_net_file output/fast_rcnn_adv/voc_2007_trainval/train_init.caffemodel 7 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | selective_search* 2 | imagenet_models* 3 | fast_rcnn_models* 4 | VOCdevkit* 5 | cache 6 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | This directory holds (*after you download them*): 2 | - Fast R-CNN models trained with OHEM on VOC 2007 trainval 3 | - Caffe models pre-trained on ImageNet 4 | - Symlinks to datasets 5 | 6 | To download Fast R-CNN models (VGG_CNN_M_1024, VGG16) trained with OHEM on VOC 2007 trainval, run: 7 | 8 | ``` 9 | ./data/scripts/fetch_fast_rcnn_ohem_models.sh 10 | ``` 11 | 12 | This script will populate `data/fast_rcnn_ohem_models` with VGG16 and VGG_CNN_M_1024 models (Fast R-CNN detectors trained with OHEM). 13 | 14 | 15 | To download Caffe models (ZF, VGG16) pre-trained on ImageNet, run: 16 | 17 | ``` 18 | ./data/scripts/fetch_imagenet_models.sh 19 | ``` 20 | 21 | This script will populate `data/imagenet_models`. 22 | 23 | In order to train and test with PASCAL VOC, you will need to establish symlinks. 24 | From the `data` directory (`cd data`): 25 | 26 | ``` 27 | # For VOC 2007 28 | ln -s /your/path/to/VOC2007/VOCdevkit VOCdevkit2007 29 | 30 | # For VOC 2012 31 | ln -s /your/path/to/VOC2012/VOCdevkit VOCdevkit2012 32 | ``` 33 | 34 | Install the MS COCO dataset at /path/to/coco 35 | 36 | ``` 37 | ln -s /path/to/coco coco 38 | ``` 39 | 40 | For COCO with Fast R-CNN, place object proposals under `coco_proposals` (inside 41 | the `data` directory). You can obtain proposals on COCO from Jan Hosang at 42 | https://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal-computing/research/object-recognition-and-scene-understanding/how-good-are-detection-proposals-really/. 43 | For COCO, using MCG is recommended over selective search. MCG boxes can be downloaded 44 | from http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/mcg/. 45 | Use the tool `lib/datasets/tools/mcg_munge.py` to convert the downloaded MCG data 46 | into the same file layout as those from Jan Hosang. 47 | 48 | Since you'll likely be experimenting with multiple installs of Fast/er R-CNN in 49 | parallel, you'll probably want to keep all of this data in a shared place and 50 | use symlinks. On my system I create the following symlinks inside `data`: 51 | 52 | Annotations for the 5k image 'minival' subset of COCO val2014 that I like to use 53 | can be found at http://www.cs.berkeley.edu/~rbg/faster-rcnn-data/instances_minival2014.json.zip. 54 | Annotations for COCO val2014 (set) minus minival (~35k images) can be found at 55 | http://www.cs.berkeley.edu/~rbg/faster-rcnn-data/instances_valminusminival2014.json.zip. 
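
For example, the minival and valminusminival annotations can be fetched and unpacked next to the standard COCO annotations (a minimal sketch; the `data/coco/annotations` destination assumes the `coco` symlink described above and is illustrative):

```
cd data/coco/annotations
wget http://www.cs.berkeley.edu/~rbg/faster-rcnn-data/instances_minival2014.json.zip
wget http://www.cs.berkeley.edu/~rbg/faster-rcnn-data/instances_valminusminival2014.json.zip
unzip instances_minival2014.json.zip
unzip instances_valminusminival2014.json.zip
```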
56 | 57 | ``` 58 | # data/cache holds various outputs created by the datasets package 59 | ln -s /data/fast_rcnn_shared/cache 60 | 61 | # move the imagenet_models to a shared location and symlink to them 62 | ln -s /data/fast_rcnn_shared/imagenet_models 63 | 64 | # move the selective search data to a shared location and symlink to them 65 | # (only applicable to Fast R-CNN training) 66 | ln -s /data/fast_rcnn_shared/selective_search_data 67 | 68 | ln -s /data/VOC2007/VOCdevkit VOCdevkit2007 69 | ln -s /data/VOC2012/VOCdevkit VOCdevkit2012 70 | ``` 71 | -------------------------------------------------------------------------------- /data/demo/000456.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaolonw/adversarial-frcnn/2a7bb96c9884c0f09ca5bde35a981087be28562b/data/demo/000456.jpg -------------------------------------------------------------------------------- /data/demo/000542.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaolonw/adversarial-frcnn/2a7bb96c9884c0f09ca5bde35a981087be28562b/data/demo/000542.jpg -------------------------------------------------------------------------------- /data/demo/001150.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaolonw/adversarial-frcnn/2a7bb96c9884c0f09ca5bde35a981087be28562b/data/demo/001150.jpg -------------------------------------------------------------------------------- /data/demo/001763.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaolonw/adversarial-frcnn/2a7bb96c9884c0f09ca5bde35a981087be28562b/data/demo/001763.jpg -------------------------------------------------------------------------------- /data/demo/004545.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaolonw/adversarial-frcnn/2a7bb96c9884c0f09ca5bde35a981087be28562b/data/demo/004545.jpg -------------------------------------------------------------------------------- /data/pylintrc: -------------------------------------------------------------------------------- 1 | [TYPECHECK] 2 | 3 | ignored-modules = numpy, numpy.random, cv2 4 | -------------------------------------------------------------------------------- /data/scripts/fetch_fast_rcnn_ohem_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" 4 | cd $DIR 5 | 6 | FILE=fast_rcnn_ohem_models.tgz 7 | URL=http://graphics.cs.cmu.edu/projects/ohem/data/$FILE 8 | CHECKSUM=cbfd5b7ed5ec4d5cb838701cbf1f3ccb 9 | 10 | if [ -f $FILE ]; then 11 | echo "File already exists. Checking md5..." 12 | os=`uname -s` 13 | if [ "$os" = "Linux" ]; then 14 | checksum=`md5sum $FILE | awk '{ print $1 }'` 15 | elif [ "$os" = "Darwin" ]; then 16 | checksum=`cat $FILE | md5` 17 | fi 18 | if [ "$checksum" = "$CHECKSUM" ]; then 19 | echo "Checksum is correct. No need to download." 20 | exit 0 21 | else 22 | echo "Checksum is incorrect. Need to download again." 23 | fi 24 | fi 25 | 26 | echo "Downloading Fast R-CNN OHEM models (VGG16 and VGG_CNN_M_1024) (1.5G)..." 27 | 28 | wget $URL -O $FILE 29 | 30 | echo "Unzipping..." 31 | 32 | tar zxvf $FILE 33 | 34 | echo "Done. Please run this command again to verify that checksum = $CHECKSUM."
35 | -------------------------------------------------------------------------------- /data/scripts/fetch_imagenet_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" 4 | cd $DIR 5 | 6 | FILE=imagenet_models.tgz 7 | URL=http://www.cs.berkeley.edu/~rbg/faster-rcnn-data/$FILE 8 | CHECKSUM=ed34ca912d6782edfb673a8c3a0bda6d 9 | 10 | if [ -f $FILE ]; then 11 | echo "File already exists. Checking md5..." 12 | os=`uname -s` 13 | if [ "$os" = "Linux" ]; then 14 | checksum=`md5sum $FILE | awk '{ print $1 }'` 15 | elif [ "$os" = "Darwin" ]; then 16 | checksum=`cat $FILE | md5` 17 | fi 18 | if [ "$checksum" = "$CHECKSUM" ]; then 19 | echo "Checksum is correct. No need to download." 20 | exit 0 21 | else 22 | echo "Checksum is incorrect. Need to download again." 23 | fi 24 | fi 25 | 26 | echo "Downloading pretrained ImageNet models (1G)..." 27 | 28 | wget $URL -O $FILE 29 | 30 | echo "Unzipping..." 31 | 32 | tar zxvf $FILE 33 | 34 | echo "Done. Please run this command again to verify that checksum = $CHECKSUM." 35 | -------------------------------------------------------------------------------- /data/scripts/fetch_selective_search_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" 4 | cd $DIR 5 | 6 | FILE=selective_search_data.tgz 7 | URL=http://www.cs.berkeley.edu/~rbg/fast-rcnn-data/$FILE 8 | CHECKSUM=7078c1db87a7851b31966b96774cd9b9 9 | 10 | if [ -f $FILE ]; then 11 | echo "File already exists. Checking md5..." 12 | os=`uname -s` 13 | if [ "$os" = "Linux" ]; then 14 | checksum=`md5sum $FILE | awk '{ print $1 }'` 15 | elif [ "$os" = "Darwin" ]; then 16 | checksum=`cat $FILE | md5` 17 | fi 18 | if [ "$checksum" = "$CHECKSUM" ]; then 19 | echo "Checksum is correct. No need to download." 20 | exit 0 21 | else 22 | echo "Checksum is incorrect. Need to download again." 23 | fi 24 | fi 25 | 26 | echo "Downloading precomputed selective search boxes (0.5G)..." 27 | 28 | wget $URL -O $FILE 29 | 30 | echo "Unzipping..." 31 | 32 | tar zxvf $FILE 33 | 34 | echo "Done. Please run this command again to verify that checksum = $CHECKSUM." 35 | -------------------------------------------------------------------------------- /experiments/README.md: -------------------------------------------------------------------------------- 1 | Scripts are under `experiments/scripts`. 2 | 3 | Each script saves a log file under `experiments/logs`. 4 | 5 | Configuration override files used in the experiments are stored in `experiments/cfgs`. 6 | -------------------------------------------------------------------------------- /experiments/cfgs/fast_rcnn_adv_128.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: fast_rcnn_adv 2 | MATLAB: /opt/matlab/8.1/bin/matlab 3 | TRAIN: 4 | BG_THRESH_LO: 0.0 5 | # we use gradient accumulation, 6 | # see solver.prototxt (iter_size: 2) 7 | IMS_PER_BATCH: 1 8 | # adjust batch_size for iter_size 9 | BATCH_SIZE: 128 10 | USE_OHEM: False 11 | # Wasn't used in the paper (impact unknown). 
12 | ASPECT_GROUPING: False -------------------------------------------------------------------------------- /experiments/cfgs/fast_rcnn_adv_pretrain.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: fast_rcnn_adv_pretrain 2 | MATLAB: /opt/matlab/8.1/bin/matlab 3 | TRAIN: 4 | BG_THRESH_LO: 0.0 5 | # we use gradient accumulation, 6 | # see solver.prototxt (iter_size: 2) 7 | IMS_PER_BATCH: 1 8 | # adjust batch_size for iter_size 9 | BATCH_SIZE: 64 10 | FG_FRACTION: 0.75 11 | USE_OHEM: False 12 | # Wasn't used in the paper (impact unknown). 13 | ASPECT_GROUPING: False 14 | SNAPSHOT_ITERS: 2500 -------------------------------------------------------------------------------- /experiments/logs/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt* 2 | -------------------------------------------------------------------------------- /experiments/scripts/fast_rcnn_adv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: 3 | # ./experiments/scripts/fast_rcnn_adv.sh GPU NET DATASET [options args to {train,test}_net.py] 4 | # DATASET is either pascal_voc or coco. 5 | # 6 | # Example: 7 | # ./experiments/scripts/fast_rcnn_adv.sh 0 VGG16 pascal_voc \ 8 | #   --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400, 500, 600, 700]" 9 | 10 | set -x 11 | set -e 12 | 13 | export PYTHONUNBUFFERED="True" 14 | 15 | GPU_ID=$1 16 | NET=$2 17 | NET_lc=${NET,,} 18 | DATASET=$3 19 | 20 | array=( $@ ) 21 | len=${#array[@]} 22 | EXTRA_ARGS=${array[@]:3:$len} 23 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} 24 | 25 | case $DATASET in 26 | pascal_voc) 27 | TRAIN_IMDB="voc_2007_trainval" 28 | TEST_IMDB="voc_2007_test" 29 | PT_DIR="pascal_voc" 30 | ITERS=40000 31 | ;; 32 | coco) 33 | echo "Support coming soon. Stay tuned!" 34 | exit 35 | # TRAIN_IMDB="coco_2014_train" 36 | # TEST_IMDB="coco_2014_minival" 37 | # PT_DIR="coco" 38 | # ITERS=280000 39 | ;; 40 | *) 41 | echo "No dataset given" 42 | exit 43 | ;; 44 | esac 45 | 46 | LOG="experiments/logs/fast_rcnn_adv.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 47 | exec &> >(tee -a "$LOG") 48 | echo Logging output to "$LOG" 49 | 50 | 51 | time ./tools/train_net.py --gpu ${GPU_ID} \ 52 | --solver models/${PT_DIR}/${NET}/fast_rcnn_adv/solver.prototxt \ 53 | --weights output/fast_rcnn_adv/voc_2007_trainval/train_init.caffemodel \ 54 | --imdb ${TRAIN_IMDB} \ 55 | --iters ${ITERS} \ 56 | --cfg experiments/cfgs/fast_rcnn_adv_128.yml \ 57 | ${EXTRA_ARGS} 58 | 59 | set +x 60 | NET_FINAL=`grep -B 1 "done solving" ${LOG} | grep "Wrote snapshot" | awk '{print $4}'` 61 | set -x 62 | 63 | time ./tools/test_net.py --gpu ${GPU_ID} \ 64 | --def models/${PT_DIR}/${NET}/fast_rcnn/test.prototxt \ 65 | --net ${NET_FINAL} \ 66 | --imdb ${TEST_IMDB} \ 67 | --cfg experiments/cfgs/fast_rcnn_adv_128.yml \ 68 | --num_dets 2000 \ 69 | --det_thresh 0.00001 \ 70 | ${EXTRA_ARGS} -------------------------------------------------------------------------------- /experiments/scripts/fast_rcnn_adv_pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: 3 | # ./experiments/scripts/fast_rcnn_adv_pretrain.sh GPU NET DATASET [options args to {train,test}_net.py] 4 | # DATASET is either pascal_voc or coco.
5 | # 6 | # Example: 7 | # ./experiments/scripts/fast_rcnn_adv_pretrain.sh 0 VGG16 pascal_voc \ 8 | #   --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400, 500, 600, 700]" 9 | 10 | set -x 11 | set -e 12 | 13 | export PYTHONUNBUFFERED="True" 14 | 15 | GPU_ID=$1 16 | NET=$2 17 | NET_lc=${NET,,} 18 | DATASET=$3 19 | 20 | array=( $@ ) 21 | len=${#array[@]} 22 | EXTRA_ARGS=${array[@]:3:$len} 23 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} 24 | 25 | case $DATASET in 26 | pascal_voc) 27 | TRAIN_IMDB="voc_2007_trainval" 28 | TEST_IMDB="voc_2007_test" 29 | PT_DIR="pascal_voc" 30 | ITERS=25000 31 | ;; 32 | coco) 33 | echo "Support coming soon. Stay tuned!" 34 | exit 35 | # TRAIN_IMDB="coco_2014_train" 36 | # TEST_IMDB="coco_2014_minival" 37 | # PT_DIR="coco" 38 | # ITERS=280000 39 | ;; 40 | *) 41 | echo "No dataset given" 42 | exit 43 | ;; 44 | esac 45 | 46 | LOG="experiments/logs/fast_rcnn_adv_pretrain.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 47 | exec &> >(tee -a "$LOG") 48 | echo Logging output to "$LOG" 49 | 50 | 51 | 52 | time ./tools/train_net.py --gpu ${GPU_ID} \ 53 | --solver models/${PT_DIR}/${NET}/fast_rcnn_adv_pretrain/solver.prototxt \ 54 | --weights output/fast_rcnn_adv/voc_2007_trainval/fast_rcnn_std_iter_10000.caffemodel \ 55 | --imdb ${TRAIN_IMDB} \ 56 | --iters ${ITERS} \ 57 | --cfg experiments/cfgs/fast_rcnn_adv_pretrain.yml \ 58 | ${EXTRA_ARGS} 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /experiments/scripts/fast_rcnn_std.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: 3 | # ./experiments/scripts/fast_rcnn_std.sh GPU NET DATASET [options args to {train,test}_net.py] 4 | # DATASET is either pascal_voc or coco.
5 | # 6 | # Example: 7 | # ./experiments/scripts/fast_rcnn_std.sh 0 VGG_CNN_M_1024 pascal_voc \ 8 | #   --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400, 500, 600, 700]" 9 | 10 | set -x 11 | set -e 12 | 13 | export PYTHONUNBUFFERED="True" 14 | 15 | GPU_ID=$1 16 | NET=$2 17 | NET_lc=${NET,,} 18 | DATASET=$3 19 | 20 | array=( $@ ) 21 | len=${#array[@]} 22 | EXTRA_ARGS=${array[@]:3:$len} 23 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} 24 | 25 | case $DATASET in 26 | pascal_voc) 27 | TRAIN_IMDB="voc_2007_trainval" 28 | TEST_IMDB="voc_2007_test" 29 | PT_DIR="pascal_voc" 30 | ITERS=10000 31 | ;; 32 | coco) 33 | TRAIN_IMDB="coco_2014_train" 34 | TEST_IMDB="coco_2014_minival" 35 | PT_DIR="coco" 36 | ITERS=280000 37 | ;; 38 | *) 39 | echo "No dataset given" 40 | exit 41 | ;; 42 | esac 43 | 44 | LOG="experiments/logs/fast_rcnn_std.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 45 | exec &> >(tee -a "$LOG") 46 | echo Logging output to "$LOG" 47 | 48 | time ./tools/train_net.py --gpu ${GPU_ID} \ 49 | --solver models/${PT_DIR}/${NET}/fast_rcnn_std/solver.prototxt \ 50 | --weights data/imagenet_models/${NET}.v2.caffemodel \ 51 | --imdb ${TRAIN_IMDB} \ 52 | --iters ${ITERS} \ 53 | --cfg experiments/cfgs/fast_rcnn_adv_128.yml \ 54 | ${EXTRA_ARGS} 55 | -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python setup.py build_ext --inplace 3 | rm -rf build 4 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/get_voc_opts.m: -------------------------------------------------------------------------------- 1 | function VOCopts = get_voc_opts(path) 2 | 3 | tmp = pwd; 4 | cd(path); 5 | try 6 | addpath('VOCcode'); 7 | VOCinit; 8 | catch 9 | rmpath('VOCcode'); 10 | cd(tmp); 11 | error(sprintf('VOCcode directory not found under %s', path)); 12 | end 13 | rmpath('VOCcode'); 14 | cd(tmp); 15 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/voc_eval.m: -------------------------------------------------------------------------------- 1 | function res = voc_eval(path, comp_id, test_set, output_dir) 2 | 3 | VOCopts = get_voc_opts(path); 4 | VOCopts.testset = test_set; 5 | 6 | for i = 1:length(VOCopts.classes) 7 | cls = VOCopts.classes{i}; 8 | res(i) = voc_eval_cls(cls, VOCopts, comp_id, output_dir); 9 | end 10 | 11 | fprintf('\n~~~~~~~~~~~~~~~~~~~~\n'); 12 | fprintf('Results:\n'); 13 | aps = [res(:).ap]'; 14 | fprintf('%.1f\n', aps * 100); 15 | fprintf('%.1f\n', mean(aps) * 100); 16 | fprintf('~~~~~~~~~~~~~~~~~~~~\n'); 17 | 18 | function res = voc_eval_cls(cls, VOCopts, comp_id, output_dir) 19 | 20 | test_set = VOCopts.testset; 21 | year = VOCopts.dataset(4:end); 22 | 23 | addpath(fullfile(VOCopts.datadir, 'VOCcode')); 24 | 25 | res_fn = sprintf(VOCopts.detrespath, comp_id, cls); 26 | 27 | recall = []; 28 | prec = []; 29 | ap = 0; 30 | ap_auc = 0; 31 | 32 | do_eval = (str2num(year) <= 2007) | ~strcmp(test_set, 'test'); 33 | if do_eval 34 | % Bug in VOCevaldet requires that tic has been called first 35 | tic; 36 | [recall, prec, ap] = VOCevaldet(VOCopts, comp_id, cls, true); 37 | ap_auc = xVOCap(recall, prec); 38 | 39 | % force plot limits 40 | ylim([0 1]); 41 | xlim([0 1]); 42 | 43 | print(gcf, '-djpeg', '-r0', ... 44 | [output_dir '/' cls '_pr.jpg']); 45 | end 46 | fprintf('!!!
%s : %.4f %.4f\n', cls, ap, ap_auc); 47 | 48 | res.recall = recall; 49 | res.prec = prec; 50 | res.ap = ap; 51 | res.ap_auc = ap_auc; 52 | 53 | save([output_dir '/' cls '_pr.mat'], ... 54 | 'res', 'recall', 'prec', 'ap', 'ap_auc'); 55 | 56 | rmpath(fullfile(VOCopts.datadir, 'VOCcode')); 57 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/xVOCap.m: -------------------------------------------------------------------------------- 1 | function ap = xVOCap(rec,prec) 2 | % From the PASCAL VOC 2011 devkit 3 | 4 | mrec=[0 ; rec ; 1]; 5 | mpre=[0 ; prec ; 0]; 6 | for i=numel(mpre)-1:-1:1 7 | mpre(i)=max(mpre(i),mpre(i+1)); 8 | end 9 | i=find(mrec(2:end)~=mrec(1:end-1))+1; 10 | ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); 11 | -------------------------------------------------------------------------------- /lib/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/datasets/ds_utils.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Ross Girshick 5 | # -------------------------------------------------------- 6 | 7 | import numpy as np 8 | 9 | def unique_boxes(boxes, scale=1.0): 10 | """Return indices of unique boxes.""" 11 | v = np.array([1, 1e3, 1e6, 1e9]) 12 | hashes = np.round(boxes * scale).dot(v) 13 | _, index = np.unique(hashes, return_index=True) 14 | return np.sort(index) 15 | 16 | def xywh_to_xyxy(boxes): 17 | """Convert [x y w h] box format to [x1 y1 x2 y2] format.""" 18 | return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1)) 19 | 20 | def xyxy_to_xywh(boxes): 21 | """Convert [x1 y1 x2 y2] box format to [x y w h] format.""" 22 | return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1)) 23 | 24 | def validate_boxes(boxes, width=0, height=0): 25 | """Check that a set of boxes are valid.""" 26 | x1 = boxes[:, 0] 27 | y1 = boxes[:, 1] 28 | x2 = boxes[:, 2] 29 | y2 = boxes[:, 3] 30 | assert (x1 >= 0).all() 31 | assert (y1 >= 0).all() 32 | assert (x2 >= x1).all() 33 | assert (y2 >= y1).all() 34 | assert (x2 < width).all() 35 | assert (y2 < height).all() 36 | 37 | def filter_small_boxes(boxes, min_size): 38 | w = boxes[:, 2] - boxes[:, 0] 39 | h = boxes[:, 3] - boxes[:, 1] 40 | keep = np.where((w >= min_size) & (h >= min_size))[0] 41 | return keep 42 | -------------------------------------------------------------------------------- /lib/datasets/factory.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Factory method for easily getting imdbs by name.""" 9 | 10 | __sets = {} 11 | 12 | from datasets.pascal_voc import pascal_voc 13 | from datasets.coco import coco 14 | import numpy as np 15 | 16 | # Set up voc_<year>_<split> using selective search "fast" mode
17 | for year in ['2007', '2012']: 18 | for split in ['train', 'val', 'trainval', 'test']: 19 | name = 'voc_{}_{}'.format(year, split) 20 | __sets[name] = (lambda split=split, year=year: pascal_voc(split, year)) 21 | 22 | # Set up coco_2014_<split> 23 | for year in ['2014']: 24 | for split in ['train', 'val', 'minival', 'valminusminival']: 25 | name = 'coco_{}_{}'.format(year, split) 26 | __sets[name] = (lambda split=split, year=year: coco(split, year)) 27 | 28 | # Set up coco_2015_<split> 29 | for year in ['2015']: 30 | for split in ['test', 'test-dev']: 31 | name = 'coco_{}_{}'.format(year, split) 32 | __sets[name] = (lambda split=split, year=year: coco(split, year)) 33 | 34 | def get_imdb(name): 35 | """Get an imdb (image database) by name.""" 36 | if not __sets.has_key(name): 37 | raise KeyError('Unknown dataset: {}'.format(name)) 38 | return __sets[name]() 39 | 40 | def list_imdbs(): 41 | """List all registered imdbs.""" 42 | return __sets.keys() 43 | -------------------------------------------------------------------------------- /lib/datasets/tools/mcg_munge.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | """Hacky tool to convert file system layout of MCG boxes downloaded from 5 | http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/mcg/ 6 | so that it's consistent with those computed by Jan Hosang (see: 7 | http://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal- 8 | computing/research/object-recognition-and-scene-understanding/how- 9 | good-are-detection-proposals-really/) 10 | 11 | NB: Boxes from the MCG website are in (y1, x1, y2, x2) order. 12 | Boxes from Hosang et al. are in (x1, y1, x2, y2) order. 13 | """ 14 | 15 | def munge(src_dir): 16 | # stored as: ./MCG-COCO-val2014-boxes/COCO_val2014_000000193401.mat 17 | # want: ./MCG/mat/COCO_val2014_0/COCO_val2014_000000141/COCO_val2014_000000141334.mat 18 | 19 | files = os.listdir(src_dir) 20 | for fn in files: 21 | base, ext = os.path.splitext(fn) 22 | # first 14 chars / first 22 chars / all chars + .mat 23 | # COCO_val2014_0/COCO_val2014_000000447/COCO_val2014_000000447991.mat 24 | first = base[:14] 25 | second = base[:22] 26 | dst_dir = os.path.join('MCG', 'mat', first, second) 27 | if not os.path.exists(dst_dir): 28 | os.makedirs(dst_dir) 29 | src = os.path.join(src_dir, fn) 30 | dst = os.path.join(dst_dir, fn) 31 | print 'MV: {} -> {}'.format(src, dst) 32 | os.rename(src, dst) 33 | 34 | if __name__ == '__main__': 35 | # src_dir should look something like: 36 | # src_dir = 'MCG-COCO-val2014-boxes' 37 | src_dir = sys.argv[1] 38 | munge(src_dir) 39 | -------------------------------------------------------------------------------- /lib/datasets/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os 9 | import cPickle 10 | import numpy as np 11 | 12 | def parse_rec(filename): 13 | """ Parse a PASCAL VOC xml file """ 14 | tree = ET.parse(filename) 15 | objects = [] 16 | for obj in tree.findall('object'): 17 | obj_struct = {} 18 | obj_struct['name'] = obj.find('name').text 19 | obj_struct['pose'] = obj.find('pose').text 20 | obj_struct['truncated'] =
int(obj.find('truncated').text) 21 | obj_struct['difficult'] = int(obj.find('difficult').text) 22 | bbox = obj.find('bndbox') 23 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 24 | int(bbox.find('ymin').text), 25 | int(bbox.find('xmax').text), 26 | int(bbox.find('ymax').text)] 27 | objects.append(obj_struct) 28 | 29 | return objects 30 | 31 | def voc_ap(rec, prec, use_07_metric=False): 32 | """ ap = voc_ap(rec, prec, [use_07_metric]) 33 | Compute VOC AP given precision and recall. 34 | If use_07_metric is true, uses the 35 | VOC 07 11 point method (default:False). 36 | """ 37 | if use_07_metric: 38 | # 11 point metric 39 | ap = 0. 40 | for t in np.arange(0., 1.1, 0.1): 41 | if np.sum(rec >= t) == 0: 42 | p = 0 43 | else: 44 | p = np.max(prec[rec >= t]) 45 | ap = ap + p / 11. 46 | else: 47 | # correct AP calculation 48 | # first append sentinel values at the end 49 | mrec = np.concatenate(([0.], rec, [1.])) 50 | mpre = np.concatenate(([0.], prec, [0.])) 51 | 52 | # compute the precision envelope 53 | for i in range(mpre.size - 1, 0, -1): 54 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 55 | 56 | # to calculate area under PR curve, look for points 57 | # where X axis (recall) changes value 58 | i = np.where(mrec[1:] != mrec[:-1])[0] 59 | 60 | # and sum (\Delta recall) * prec 61 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 62 | return ap 63 | 64 | def voc_eval(detpath, 65 | annopath, 66 | imagesetfile, 67 | classname, 68 | cachedir, 69 | ovthresh=0.5, 70 | use_07_metric=False): 71 | """rec, prec, ap = voc_eval(detpath, 72 | annopath, 73 | imagesetfile, 74 | classname, 75 | [ovthresh], 76 | [use_07_metric]) 77 | 78 | Top level function that does the PASCAL VOC evaluation. 79 | 80 | detpath: Path to detections 81 | detpath.format(classname) should produce the detection results file. 82 | annopath: Path to annotations 83 | annopath.format(imagename) should be the xml annotations file. 84 | imagesetfile: Text file containing the list of images, one image per line. 
85 | classname: Category name (duh) 86 | cachedir: Directory for caching the annotations 87 | [ovthresh]: Overlap threshold (default = 0.5) 88 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 89 | (default False) 90 | """ 91 | # assumes detections are in detpath.format(classname) 92 | # assumes annotations are in annopath.format(imagename) 93 | # assumes imagesetfile is a text file with each line an image name 94 | # cachedir caches the annotations in a pickle file 95 | 96 | # first load gt 97 | if not os.path.isdir(cachedir): 98 | os.mkdir(cachedir) 99 | cachefile = os.path.join(cachedir, 'annots.pkl') 100 | # read list of images 101 | with open(imagesetfile, 'r') as f: 102 | lines = f.readlines() 103 | imagenames = [x.strip() for x in lines] 104 | 105 | if not os.path.isfile(cachefile): 106 | # load annots 107 | recs = {} 108 | for i, imagename in enumerate(imagenames): 109 | recs[imagename] = parse_rec(annopath.format(imagename)) 110 | if i % 100 == 0: 111 | print 'Reading annotation for {:d}/{:d}'.format( 112 | i + 1, len(imagenames)) 113 | # save 114 | print 'Saving cached annotations to {:s}'.format(cachefile) 115 | with open(cachefile, 'w') as f: 116 | cPickle.dump(recs, f) 117 | else: 118 | # load 119 | with open(cachefile, 'r') as f: 120 | recs = cPickle.load(f) 121 | 122 | # extract gt objects for this class 123 | class_recs = {} 124 | npos = 0 125 | for imagename in imagenames: 126 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 127 | bbox = np.array([x['bbox'] for x in R]) 128 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 129 | det = [False] * len(R) 130 | npos = npos + sum(~difficult) 131 | class_recs[imagename] = {'bbox': bbox, 132 | 'difficult': difficult, 133 | 'det': det} 134 | 135 | # read dets 136 | detfile = detpath.format(classname) 137 | with open(detfile, 'r') as f: 138 | lines = f.readlines() 139 | 140 | splitlines = [x.strip().split(' ') for x in lines] 141 | image_ids = [x[0] for x in splitlines] 142 | confidence = np.array([float(x[1]) for x in splitlines]) 143 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 144 | 145 | # sort by confidence 146 | sorted_ind = np.argsort(-confidence) 147 | sorted_scores = np.sort(-confidence) 148 | BB = BB[sorted_ind, :] 149 | image_ids = [image_ids[x] for x in sorted_ind] 150 | 151 | # go down dets and mark TPs and FPs 152 | nd = len(image_ids) 153 | tp = np.zeros(nd) 154 | fp = np.zeros(nd) 155 | for d in range(nd): 156 | R = class_recs[image_ids[d]] 157 | bb = BB[d, :].astype(float) 158 | ovmax = -np.inf 159 | BBGT = R['bbox'].astype(float) 160 | 161 | if BBGT.size > 0: 162 | # compute overlaps 163 | # intersection 164 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 165 | iymin = np.maximum(BBGT[:, 1], bb[1]) 166 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 167 | iymax = np.minimum(BBGT[:, 3], bb[3]) 168 | iw = np.maximum(ixmax - ixmin + 1., 0.) 169 | ih = np.maximum(iymax - iymin + 1., 0.) 170 | inters = iw * ih 171 | 172 | # union 173 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 174 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 175 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 176 | 177 | overlaps = inters / uni 178 | ovmax = np.max(overlaps) 179 | jmax = np.argmax(overlaps) 180 | 181 | if ovmax > ovthresh: 182 | if not R['difficult'][jmax]: 183 | if not R['det'][jmax]: 184 | tp[d] = 1. 185 | R['det'][jmax] = 1 186 | else: 187 | fp[d] = 1. 188 | else: 189 | fp[d] = 1. 
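    # Note (added): each ground-truth box can be claimed by at most one
    # detection; the R['det'] flag marks it as taken, so duplicate detections
    # of the same object count as false positives. Matches to 'difficult'
    # ground truth are ignored (neither TP nor FP), per the PASCAL VOC protocol.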
190 | 191 | # compute precision recall 192 | fp = np.cumsum(fp) 193 | tp = np.cumsum(tp) 194 | rec = tp / float(npos) 195 | # avoid divide by zero in case the first detection matches a difficult 196 | # ground truth 197 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 198 | ap = voc_ap(rec, prec, use_07_metric) 199 | 200 | return rec, prec, ap 201 | -------------------------------------------------------------------------------- /lib/fast_rcnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/fast_rcnn/bbox_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def bbox_transform(ex_rois, gt_rois): 11 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 12 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 13 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 14 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 15 | 16 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 17 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 18 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths 19 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights 20 | 21 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 22 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 23 | targets_dw = np.log(gt_widths / ex_widths) 24 | targets_dh = np.log(gt_heights / ex_heights) 25 | 26 | targets = np.vstack( 27 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() 28 | return targets 29 | 30 | def bbox_transform_inv(boxes, deltas): 31 | if boxes.shape[0] == 0: 32 | return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) 33 | 34 | boxes = boxes.astype(deltas.dtype, copy=False) 35 | 36 | widths = boxes[:, 2] - boxes[:, 0] + 1.0 37 | heights = boxes[:, 3] - boxes[:, 1] + 1.0 38 | ctr_x = boxes[:, 0] + 0.5 * widths 39 | ctr_y = boxes[:, 1] + 0.5 * heights 40 | 41 | dx = deltas[:, 0::4] 42 | dy = deltas[:, 1::4] 43 | dw = deltas[:, 2::4] 44 | dh = deltas[:, 3::4] 45 | 46 | pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] 47 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] 48 | pred_w = np.exp(dw) * widths[:, np.newaxis] 49 | pred_h = np.exp(dh) * heights[:, np.newaxis] 50 | 51 | pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) 52 | # x1 53 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 54 | # y1 55 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 56 | # x2 57 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w 58 | # y2 59 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h 60 | 61 | return pred_boxes 62 | 63 | def clip_boxes(boxes, im_shape): 64 | """ 65 | Clip boxes to image boundaries. 
66 | """ 67 | 68 | # x1 >= 0 69 | boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) 70 | # y1 >= 0 71 | boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) 72 | # x2 < im_shape[1] 73 | boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) 74 | # y2 < im_shape[0] 75 | boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) 76 | return boxes 77 | -------------------------------------------------------------------------------- /lib/fast_rcnn/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from fast_rcnn.config import cfg 9 | from nms.gpu_nms import gpu_nms 10 | from nms.cpu_nms import cpu_nms 11 | 12 | def nms(dets, thresh, force_cpu=False): 13 | """Dispatch to either CPU or GPU NMS implementations.""" 14 | 15 | if dets.shape[0] == 0: 16 | return [] 17 | if cfg.USE_GPU_NMS and not force_cpu: 18 | return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 19 | else: 20 | return cpu_nms(dets, thresh) 21 | -------------------------------------------------------------------------------- /lib/fast_rcnn/train.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Train a Fast R-CNN network.""" 9 | 10 | import caffe 11 | from fast_rcnn.config import cfg 12 | import roi_data_layer.roidb as rdl_roidb 13 | from utils.timer import Timer 14 | import numpy as np 15 | import os 16 | 17 | from caffe.proto import caffe_pb2 18 | import google.protobuf as pb2 19 | import google.protobuf.text_format 20 | 21 | class SolverWrapper(object): 22 | """A simple wrapper around Caffe's solver. 23 | This wrapper gives us control over he snapshotting process, which we 24 | use to unnormalize the learned bounding-box regression weights. 25 | """ 26 | 27 | def __init__(self, solver_prototxt, roidb, output_dir, 28 | pretrained_model=None): 29 | """Initialize the SolverWrapper.""" 30 | self.output_dir = output_dir 31 | 32 | if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and 33 | cfg.TRAIN.BBOX_NORMALIZE_TARGETS): 34 | # RPN can only use precomputed normalization because there are no 35 | # fixed statistics to compute a priori 36 | assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED 37 | 38 | if cfg.TRAIN.BBOX_REG: 39 | print 'Computing bounding-box regression targets...' 
40 | self.bbox_means, self.bbox_stds = \ 41 | rdl_roidb.add_bbox_regression_targets(roidb) 42 | print 'done' 43 | 44 | self.solver = caffe.SGDSolver(solver_prototxt) 45 | if pretrained_model is not None: 46 | print ('Loading pretrained model ' 47 | 'weights from {:s}').format(pretrained_model) 48 | self.solver.net.copy_from(pretrained_model) 49 | 50 | self.solver_param = caffe_pb2.SolverParameter() 51 | with open(solver_prototxt, 'rt') as f: 52 | pb2.text_format.Merge(f.read(), self.solver_param) 53 | 54 | self.solver.net.layers[0].set_roidb(roidb) 55 | 56 | def snapshot(self): 57 | """Take a snapshot of the network after unnormalizing the learned 58 | bounding-box regression weights. This enables easy use at test-time. 59 | """ 60 | net = self.solver.net 61 | 62 | scale_bbox_params = (cfg.TRAIN.BBOX_REG and 63 | cfg.TRAIN.BBOX_NORMALIZE_TARGETS and 64 | net.params.has_key('bbox_pred')) 65 | 66 | if scale_bbox_params: 67 | # save original values 68 | orig_0 = net.params['bbox_pred'][0].data.copy() 69 | orig_1 = net.params['bbox_pred'][1].data.copy() 70 | 71 | # scale and shift with bbox reg unnormalization; then save snapshot 72 | net.params['bbox_pred'][0].data[...] = \ 73 | (net.params['bbox_pred'][0].data * 74 | self.bbox_stds[:, np.newaxis]) 75 | net.params['bbox_pred'][1].data[...] = \ 76 | (net.params['bbox_pred'][1].data * 77 | self.bbox_stds + self.bbox_means) 78 | 79 | infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX 80 | if cfg.TRAIN.SNAPSHOT_INFIX != '' else '') 81 | filename = (self.solver_param.snapshot_prefix + infix + 82 | '_iter_{:d}'.format(self.solver.iter) + '.caffemodel') 83 | filename = os.path.join(self.output_dir, filename) 84 | 85 | net.save(str(filename)) 86 | print 'Wrote snapshot to: {:s}'.format(filename) 87 | 88 | if scale_bbox_params: 89 | # restore net to original state 90 | net.params['bbox_pred'][0].data[...] = orig_0 91 | net.params['bbox_pred'][1].data[...] = orig_1 92 | return filename 93 | 94 | def train_model(self, max_iters): 95 | """Network training loop.""" 96 | last_snapshot_iter = -1 97 | timer = Timer() 98 | model_paths = [] 99 | while self.solver.iter < max_iters: 100 | # Make one SGD update 101 | timer.tic() 102 | self.solver.step(1) 103 | timer.toc() 104 | if self.solver.iter % (10 * self.solver_param.display) == 0: 105 | print 'speed: {:.3f}s / iter'.format(timer.average_time) 106 | 107 | if self.solver.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0: 108 | last_snapshot_iter = self.solver.iter 109 | model_paths.append(self.snapshot()) 110 | 111 | if last_snapshot_iter != self.solver.iter: 112 | model_paths.append(self.snapshot()) 113 | return model_paths 114 | 115 | def get_training_roidb(imdb): 116 | """Returns a roidb (Region of Interest database) for use in training.""" 117 | if cfg.TRAIN.USE_FLIPPED: 118 | print 'Appending horizontally-flipped training examples...' 119 | imdb.append_flipped_images() 120 | print 'done' 121 | 122 | print 'Preparing training data...' 
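    # Note (added): prepare_roidb (defined in lib/roi_data_layer/roidb.py,
    # not shown in this listing) augments each roidb entry with its image
    # path and size plus per-RoI max-overlap / argmax-class statistics that
    # the sampling code relies on.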
123 | rdl_roidb.prepare_roidb(imdb) 124 | print 'done' 125 | 126 | return imdb.roidb 127 | 128 | def filter_roidb(roidb): 129 | """Remove roidb entries that have no usable RoIs.""" 130 | 131 | def is_valid(entry): 132 | # Valid images have: 133 | # (1) At least one foreground RoI OR 134 | # (2) At least one background RoI 135 | overlaps = entry['max_overlaps'] 136 | # find boxes with sufficient overlap 137 | fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] 138 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 139 | bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & 140 | (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 141 | # image is only valid if such boxes exist 142 | valid = len(fg_inds) > 0 or len(bg_inds) > 0 143 | return valid 144 | 145 | num = len(roidb) 146 | filtered_roidb = [entry for entry in roidb if is_valid(entry)] 147 | num_after = len(filtered_roidb) 148 | print 'Filtered {} roidb entries: {} -> {}'.format(num - num_after, 149 | num, num_after) 150 | return filtered_roidb 151 | 152 | def train_net(solver_prototxt, roidb, output_dir, 153 | pretrained_model=None, max_iters=40000): 154 | """Train a Fast R-CNN network.""" 155 | 156 | roidb = filter_roidb(roidb) 157 | sw = SolverWrapper(solver_prototxt, roidb, output_dir, 158 | pretrained_model=pretrained_model) 159 | 160 | print 'Solving...' 161 | model_paths = sw.train_model(max_iters) 162 | print 'done solving' 163 | return model_paths 164 | -------------------------------------------------------------------------------- /lib/nms/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp 3 | *.so 4 | -------------------------------------------------------------------------------- /lib/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaolonw/adversarial-frcnn/2a7bb96c9884c0f09ca5bde35a981087be28562b/lib/nms/__init__.py -------------------------------------------------------------------------------- /lib/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, 
ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /lib/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include <vector> 10 | #include <iostream> 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 |
24 | __device__ inline float devIoU(float const * const a, float const * const b) {
25 | float left = max(a[0], b[0]), right = min(a[2], b[2]);
26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
28 | float interS = width * height;
29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
31 | return interS / (Sa + Sb - interS);
32 | }
33 |
34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
35 | const float *dev_boxes, unsigned long long *dev_mask) {
36 | const int row_start = blockIdx.y;
37 | const int col_start = blockIdx.x;
38 |
39 | // if (row_start > col_start) return;
40 |
41 | const int row_size =
42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
43 | const int col_size =
44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
45 |
46 | __shared__ float block_boxes[threadsPerBlock * 5];
47 | if (threadIdx.x < col_size) {
48 | block_boxes[threadIdx.x * 5 + 0] =
49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
50 | block_boxes[threadIdx.x * 5 + 1] =
51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
52 | block_boxes[threadIdx.x * 5 + 2] =
53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
54 | block_boxes[threadIdx.x * 5 + 3] =
55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
56 | block_boxes[threadIdx.x * 5 + 4] =
57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
58 | }
59 | __syncthreads();
60 |
61 | if (threadIdx.x < row_size) {
62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
63 | const float *cur_box = dev_boxes + cur_box_idx * 5;
64 | int i = 0;
65 | unsigned long long t = 0;
66 | int start = 0;
67 | if (row_start == col_start) {
68 | start = threadIdx.x + 1;
69 | }
70 | for (i = start; i < col_size; i++) {
71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
72 | t |= 1ULL << i;
73 | }
74 | }
75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
76 | dev_mask[cur_box_idx * col_blocks + col_start] = t;
77 | }
78 | }
79 |
80 | void _set_device(int device_id) {
81 | int current_device;
82 | CUDA_CHECK(cudaGetDevice(&current_device));
83 | if (current_device == device_id) {
84 | return;
85 | }
86 | // The call to cudaSetDevice must come before any calls to Get, which
87 | // may perform initialization using the GPU.
88 | CUDA_CHECK(cudaSetDevice(device_id));
89 | }
90 |
91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
92 | int boxes_dim, float nms_overlap_thresh, int device_id) {
93 | _set_device(device_id);
94 |
95 | float* boxes_dev = NULL;
96 | unsigned long long* mask_dev = NULL;
97 |
98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
99 |
100 | CUDA_CHECK(cudaMalloc(&boxes_dev,
101 | boxes_num * boxes_dim * sizeof(float)));
102 | CUDA_CHECK(cudaMemcpy(boxes_dev,
103 | boxes_host,
104 | boxes_num * boxes_dim * sizeof(float),
105 | cudaMemcpyHostToDevice));
106 |
107 | CUDA_CHECK(cudaMalloc(&mask_dev,
108 | boxes_num * col_blocks * sizeof(unsigned long long)));
109 |
110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
111 | DIVUP(boxes_num, threadsPerBlock));
112 | dim3 threads(threadsPerBlock);
113 | nms_kernel<<<blocks, threads>>>(boxes_num,
114 | nms_overlap_thresh,
115 | boxes_dev,
116 | mask_dev);
117 |
118 | std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
119 | CUDA_CHECK(cudaMemcpy(&mask_host[0],
120 | mask_dev,
121 | sizeof(unsigned long long) * boxes_num * col_blocks,
122 | cudaMemcpyDeviceToHost));
123 |
124 | std::vector<unsigned long long> remv(col_blocks);
125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
126 |
127 | int num_to_keep = 0;
128 | for (int i = 0; i < boxes_num; i++) {
129 | int nblock = i / threadsPerBlock;
130 | int inblock = i % threadsPerBlock;
131 |
132 | if (!(remv[nblock] & (1ULL << inblock))) {
133 | keep_out[num_to_keep++] = i;
134 | unsigned long long *p = &mask_host[0] + i * col_blocks;
135 | for (int j = nblock; j < col_blocks; j++) {
136 | remv[j] |= p[j];
137 | }
138 | }
139 | }
140 | *num_out = num_to_keep;
141 |
142 | CUDA_CHECK(cudaFree(boxes_dev));
143 | CUDA_CHECK(cudaFree(mask_dev));
144 | }
145 |
--------------------------------------------------------------------------------
/lib/nms/py_cpu_nms.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | import numpy as np
9 |
10 | def py_cpu_nms(dets, thresh):
11 | """Pure Python NMS baseline."""
12 | x1 = dets[:, 0]
13 | y1 = dets[:, 1]
14 | x2 = dets[:, 2]
15 | y2 = dets[:, 3]
16 | scores = dets[:, 4]
17 |
18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1)
19 | order = scores.argsort()[::-1]
20 |
21 | keep = []
22 | while order.size > 0:
23 | i = order[0]
24 | keep.append(i)
25 | xx1 = np.maximum(x1[i], x1[order[1:]])
26 | yy1 = np.maximum(y1[i], y1[order[1:]])
27 | xx2 = np.minimum(x2[i], x2[order[1:]])
28 | yy2 = np.minimum(y2[i], y2[order[1:]])
29 |
30 | w = np.maximum(0.0, xx2 - xx1 + 1)
31 | h = np.maximum(0.0, yy2 - yy1 + 1)
32 | inter = w * h
33 | ovr = inter / (areas[i] + areas[order[1:]] - inter)
34 |
35 | inds = np.where(ovr <= thresh)[0]
36 | order = order[inds + 1]
37 |
38 | return keep
39 |
--------------------------------------------------------------------------------
/lib/pycocotools/UPSTREAM_REV:
--------------------------------------------------------------------------------
1 | https://github.com/pdollar/coco/commit/3ac47c77ebd5a1ed4254a98b7fbf2ef4765a3574
2 |
--------------------------------------------------------------------------------
/lib/pycocotools/__init__.py:
--------------------------------------------------------------------------------
1 | __author__
= 'tylin' 2 | -------------------------------------------------------------------------------- /lib/pycocotools/license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | -------------------------------------------------------------------------------- /lib/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | import pycocotools._mask as _mask 4 | 5 | # Interface for manipulating masks stored in RLE format. 6 | # 7 | # RLE is a simple yet efficient format for storing binary masks. RLE 8 | # first divides a vector (or vectorized image) into a series of piecewise 9 | # constant regions and then for each piece simply stores the length of 10 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 12 | # (note that the odd counts are always the numbers of zeros). Instead of 13 | # storing the counts directly, additional compression is achieved with a 14 | # variable bitrate representation based on a common scheme called LEB128. 15 | # 16 | # Compression is greatest given large piecewise constant regions. 17 | # Specifically, the size of the RLE is proportional to the number of 18 | # *boundaries* in M (or for an image the number of boundaries in the y 19 | # direction). Assuming fairly simple shapes, the RLE representation is 20 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 21 | # is substantially lower, especially for large simple objects (large n). 22 | # 23 | # Many common operations on masks can be computed directly using the RLE 24 | # (without need for decoding). This includes computations such as area, 25 | # union, intersection, etc. 
All of these operations are linear in the 26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 27 | # of the object. Computing these operations on the original mask is O(n). 28 | # Thus, using the RLE can result in substantial computational savings. 29 | # 30 | # The following API functions are defined: 31 | # encode - Encode binary masks using RLE. 32 | # decode - Decode binary masks encoded via RLE. 33 | # merge - Compute union or intersection of encoded masks. 34 | # iou - Compute intersection over union between masks. 35 | # area - Compute area of encoded masks. 36 | # toBbox - Get bounding boxes surrounding encoded masks. 37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 38 | # 39 | # Usage: 40 | # Rs = encode( masks ) 41 | # masks = decode( Rs ) 42 | # R = merge( Rs, intersect=false ) 43 | # o = iou( dt, gt, iscrowd ) 44 | # a = area( Rs ) 45 | # bbs = toBbox( Rs ) 46 | # Rs = frPyObjects( [pyObjects], h, w ) 47 | # 48 | # In the API the following formats are used: 49 | # Rs - [dict] Run-length encoding of binary masks 50 | # R - dict Run-length encoding of binary mask 51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 52 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 53 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 55 | # dt,gt - May be either bounding boxes or encoded masks 56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 57 | # 58 | # Finally, a note about the intersection over union (iou) computation. 59 | # The standard iou of a ground truth (gt) and detected (dt) object is 60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 61 | # For "crowd" regions, we use a modified criteria. If a gt object is 62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 63 | # Choosing gt' in the crowd gt that best matches the dt can be done using 64 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 66 | # For crowd gt regions we use this modified criteria above for the iou. 67 | # 68 | # To compile run "python setup.py build_ext --inplace" 69 | # Please do not contact us for help with compiling. 70 | # 71 | # Microsoft COCO Toolbox. version 2.0 72 | # Data, paper, and tutorials available at: http://mscoco.org/ 73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 74 | # Licensed under the Simplified BSD License [see coco/license.txt] 75 | 76 | encode = _mask.encode 77 | decode = _mask.decode 78 | iou = _mask.iou 79 | merge = _mask.merge 80 | area = _mask.area 81 | toBbox = _mask.toBbox 82 | frPyObjects = _mask.frPyObjects -------------------------------------------------------------------------------- /lib/pycocotools/maskApi.c: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 
5 | * Licensed under the Simplified BSD License [see coco/license.txt]
6 | **************************************************************************/
7 | #include "maskApi.h"
8 | #include <math.h>
9 | #include <stdlib.h>
10 |
11 | uint umin( uint a, uint b ) { return (a<b) ? a : b; }
12 | uint umax( uint a, uint b ) { return (a>b) ? a : b; }
13 |
14 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) {
15 | R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m);
16 | if(cnts) for(siz j=0; j<m; j++) R->cnts[j]=cnts[j];
17 | }
18 |
19 | void rleFree( RLE *R ) {
20 | free(R->cnts); R->cnts=0;
21 | }
22 |
23 | void rlesInit( RLE **R, siz n ) {
24 | *R = (RLE*) malloc(sizeof(RLE)*n);
25 | for(siz i=0; i<n; i++) rleInit((*R)+i, 0, 0, 0, 0);
26 | }
27 |
28 | void rlesFree( RLE **R, siz n ) {
29 | for(siz i=0; i<n; i++) rleFree((*R)+i);
30 | free(*R); *R=0;
31 | }
32 |
33 | void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n ) {
34 | siz i, j, k, a=w*h; uint c; byte p;
35 | uint *cnts = malloc(sizeof(uint)*(a+1));
36 | for(i=0; i<n; i++) {
37 | const byte *T=M+a*i; k=0; p=0; c=0;
38 | for(j=0; j<a; j++) { if(T[j]!=p) { cnts[k++]=c; c=0; p=T[j]; } c++; }
39 | cnts[k++]=c; rleInit(R+i, h, w, k, cnts);
40 | }
41 | free(cnts);
42 | }
43 |
44 | void rleDecode( const RLE *R, byte *M, siz n ) {
45 | for( siz i=0; i<n; i++ ) {
46 | byte v=0;
47 | for( siz j=0; j<R[i].m; j++ ) {
48 | for( siz k=0; k<R[i].cnts[j]; k++ ) *(M++)=v;
49 | v=!v;
50 | }
51 | }
52 | }
53 |
54 | void rleMerge( const RLE *R, RLE *M, siz n, bool intersect ) {
55 | uint *cnts, c, ca, cb, cc, ct; bool v, va, vb, vp;
56 | siz i, a, b, h=R[0].h, w=R[0].w, m=R[0].m; RLE A, B;
57 | if(n==0) { rleInit(M,0,0,0,0); return; }
58 | if(n==1) { rleInit(M,h,w,m,R[0].cnts); return; }
59 | cnts = malloc(sizeof(uint)*(h*w+1));
60 | for( a=0; a<m; a++ ) cnts[a]=R[0].cnts[a];
61 | for( i=1; i<n; i++ ) {
62 | B=R[i]; if(B.h!=h || B.w!=w) { h=w=m=0; break; }
63 | rleInit(&A, h, w, m, cnts); ca=A.cnts[0]; cb=B.cnts[0];
64 | v=va=vb=0; m=0; a=b=1; cc=0; ct=1;
65 | while( ct>0 ) {
66 | c=umin(ca,cb); cc+=c; ct=0;
67 | ca-=c; if(!ca && a<A.m) { ca=A.cnts[a++]; va=!va; } ct+=ca;
68 | cb-=c; if(!cb && b<B.m) { cb=B.cnts[b++]; vb=!vb; } ct+=cb;
69 | vp=v; if(intersect) v=va&&vb; else v=va||vb;
70 | if( v!=vp || ct==0 ) { cnts[m++]=cc; cc=0; }
71 | }
72 | rleFree(&A);
73 | }
74 | rleInit(M, h, w, m, cnts); free(cnts);
75 | }
76 |
77 | void rleArea( const RLE *R, siz n, uint *a ) {
78 | for( siz i=0; i<n; i++ ) {
79 | a[i]=0; for( siz j=1; j<R[i].m; j+=2 ) a[i]+=R[i].cnts[j];
80 | }
81 | }
82 |
83 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) {
84 | siz g, d; BB db, gb; bool crowd;
85 | db=malloc(sizeof(double)*m*4); rleToBbox(dt,db,m);
86 | gb=malloc(sizeof(double)*n*4); rleToBbox(gt,gb,n);
87 | bbIou(db,gb,m,n,iscrowd,o); free(db); free(gb);
88 | for( g=0; g<n; g++ ) for( d=0; d<m; d++ ) if(o[g*m+d]>0) {
89 | crowd=iscrowd!=NULL && iscrowd[g];
90 | if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; }
91 | siz ka, kb, a, b; uint c, ca, cb, ct, i, u; bool va, vb;
92 | ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0;
93 | cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1;
94 | while( ct>0 ) {
95 | c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0;
96 | ca-=c; if(!ca && a<ka) { ca=dt[d].cnts[a++]; va=!va; } ct+=ca;
97 | cb-=c; if(!cb && b<kb) { cb=gt[g].cnts[b++]; vb=!vb; } ct+=cb;
98 | }
99 | if(i==0) u=1; else if(crowd) rleArea(dt+d, 1, &u);
100 | o[g*m+d] = (double)i/(double)u;
101 | }
102 | }
103 |
104 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) {
105 | double h, w, i, u, ga, da; siz g, d; bool crowd;
106 | for( g=0; g<n; g++ ) {
107 | BB G=gt+g*4; ga=G[2]*G[3]; crowd=iscrowd!=NULL && iscrowd[g];
108 | for( d=0; d<m; d++ ) {
109 | BB D=dt+d*4; da=D[2]*D[3]; o[g*m+d]=0;
110 | w=fmin(D[2]+D[0], G[2]+G[0]) - fmax(D[0], G[0]); if(w<=0) continue;
111 | h=fmin(D[3]+D[1], G[3]+G[1]) - fmax(D[1], G[1]); if(h<=0) continue;
112 | i=w*h; u = crowd ? da : da+ga-i; o[g*m+d]=i/u;
113 | }
114 | }
115 | }
116 |
117 | void rleToBbox( const RLE *R, BB bb, siz n ) {
118 | for( siz i=0; i<n; i++ ) {
119 | uint h, w, x, y, xs, ys, xe, ye, xp=0, cc, t; siz j, m;
120 | h=(uint)R[i].h; w=(uint)R[i].w; m=((siz)(R[i].m/2))*2;
121 | xs=w; ys=h; xe=ye=0; cc=0;
122 | if(m==0) { bb[4*i+0]=bb[4*i+1]=bb[4*i+2]=bb[4*i+3]=0; continue; }
123 | for( j=0; j<m; j++ ) {
124 | cc+=R[i].cnts[j]; t=cc-j%2; y=t%h; x=(t-y)/h;
125 | if(j%2==0) xp=x; else if(xp<x) { ys=0; ye=h-1; }
126 | xs=umin(xs,x); xe=umax(xe,x); ys=umin(ys,y); ye=umax(ye,y);
127 | }
128 | bb[4*i+0]=xs; bb[4*i+2]=xe-xs+1;
129 | bb[4*i+1]=ys; bb[4*i+3]=ye-ys+1;
130 | }
131 | }
132 |
133 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ) {
134 | for( siz i=0; i<n; i++ ) {
135 | double xs=bb[4*i+0], xe=xs+bb[4*i+2];
136 | double ys=bb[4*i+1], ye=ys+bb[4*i+3];
137 | double xy[8] = {xs,ys,xs,ye,xe,ye,xe,ys};
138 | rleFrPoly( R+i, xy, 4, h, w );
139 | }
140 | }
141 |
142 | int uintCompare(const void *a, const void *b) {
143 | uint c=*((uint*)a), d=*((uint*)b); return c>d?1:c<d?-1:0;
144 | }
145 |
146 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ) {
147 | // upsample and get discrete points densely along entire boundary
148 | siz j, m=0; double scale=5; int *x, *y, *u, *v; uint *a, *b;
149 | x=malloc(sizeof(int)*(k+1)); y=malloc(sizeof(int)*(k+1));
150 | for(j=0; j<k; j++) x[j]=(int)(scale*xy[j*2+0]+.5); x[k]=x[0];
151 | for(j=0; j<k; j++) y[j]=(int)(scale*xy[j*2+1]+.5); y[k]=y[0];
152 | for(j=0; j<k; j++) m+=umax(abs(x[j]-x[j+1]), abs(y[j]-y[j+1]))+1;
153 | u=malloc(sizeof(int)*m); v=malloc(sizeof(int)*m); m=0;
154 | for( j=0; j<k; j++ ) {
155 | int xs=x[j], xe=x[j+1], ys=y[j], ye=y[j+1], dx, dy, t, flip; double s;
156 | dx=abs(xe-xs); dy=abs(ys-ye);
157 | flip = (dx>=dy && xs>xe) || (dx<dy && ys>ye);
158 | if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; }
159 | s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy;
160 | if(dx>=dy) for( int d=0; d<=dx; d++ ) {
161 | t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++;
162 | } else for( int d=0; d<=dy; d++ ) {
163 | t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++;
164 | }
165 | }
166 | // get points along y-boundary and downsample
167 | free(x); free(y); k=m; m=0; double xd, yd;
168 | x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k);
169 | for( j=1; j<k; j++ ) if(u[j]!=u[j-1]) {
170 | xd=(double)(u[j]<u[j-1]?u[j]:u[j]-1); xd=(xd+.5)/scale-.5;
171 | if( floor(xd)!=xd || xd<0 || xd>w-1 ) continue;
172 | yd=(double)(v[j]<v[j-1]?v[j]:v[j-1]); yd=(yd+.5)/scale-.5;
173 | if(yd<0) yd=0; else if(yd>h) yd=h; yd=ceil(yd);
174 | x[m]=(int) xd; y[m]=(int) yd; m++;
175 | }
176 | // compute rle encoding given y-boundary points
177 | k=m; a=malloc(sizeof(uint)*(k+1));
178 | for( j=0; j<k; j++ ) a[j]=(uint)(x[j]*(int)(h)+y[j]);
179 | a[k++]=(uint)(h*w); free(u); free(v); free(x); free(y);
180 | qsort(a, k, sizeof(uint), uintCompare); uint p=0;
181 | for( j=0; j<k; j++ ) { uint t=a[j]; a[j]-=p; p=t; }
182 | b=malloc(sizeof(uint)*k); j=m=0; b[m++]=a[j++];
183 | while(j<k) if(a[j]>0) b[m++]=a[j++]; else {
184 | j++; if(j<k) b[m-1]+=a[j++]; }
185 | rleInit(R, h, w, m, b); free(a); free(b);
186 | }
187 |
188 | char* rleToString( const RLE *R ) {
189 | siz i, m=R->m, p=0; long x; bool more;
190 | char *s=malloc(sizeof(char)*m*6);
191 | for( i=0; i<m; i++ ) {
192 | x=(long) R->cnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1;
193 | while( more ) {
194 | char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? x!=-1 : x!=0;
195 | if(more) c |= 0x20; c+=48; s[p++]=c;
196 | }
197 | }
198 | s[p]=0; return s;
199 | }
200 |
201 | void rleFrString( RLE *R, char *s, siz h, siz w ) {
202 | siz m=0, p=0, k; long x; bool more; uint *cnts;
203 | while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0;
204 | while( s[p] ) {
205 | x=0; k=0; more=1;
206 | while( more ) {
207 | char c=s[p]-48; x |= (c & 0x1f) << 5*k;
208 | more = c & 0x20; p++; k++;
209 | if(!more && (c & 0x10)) x |= -1 << 5*k;
210 | }
211 | if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x;
212 | }
213 | rleInit(R,h,w,m,cnts); free(cnts);
214 | }
215 |
--------------------------------------------------------------------------------
/lib/pycocotools/maskApi.h:
--------------------------------------------------------------------------------
1 | /**************************************************************************
2 | * Microsoft COCO Toolbox. version 2.0
3 | * Data, paper, and tutorials available at: http://mscoco.org/
4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
5 | * Licensed under the Simplified BSD License [see coco/license.txt]
6 | **************************************************************************/
7 | #pragma once
8 | #include <stdbool.h>
9 |
10 | typedef unsigned int uint;
11 | typedef unsigned long siz;
12 | typedef unsigned char byte;
13 | typedef double* BB;
14 | typedef struct { siz h, w, m; uint *cnts; } RLE;
15 |
16 | // Initialize/destroy RLE.
17 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 18 | void rleFree( RLE *R ); 19 | 20 | // Initialize/destroy RLE array. 21 | void rlesInit( RLE **R, siz n ); 22 | void rlesFree( RLE **R, siz n ); 23 | 24 | // Encode binary masks using RLE. 25 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 26 | 27 | // Decode binary masks encoded via RLE. 28 | void rleDecode( const RLE *R, byte *mask, siz n ); 29 | 30 | // Compute union or intersection of encoded masks. 31 | void rleMerge( const RLE *R, RLE *M, siz n, bool intersect ); 32 | 33 | // Compute area of encoded masks. 34 | void rleArea( const RLE *R, siz n, uint *a ); 35 | 36 | // Compute intersection over union between masks. 37 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 38 | 39 | // Compute intersection over union between bounding boxes. 40 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 41 | 42 | // Get bounding boxes surrounding encoded masks. 43 | void rleToBbox( const RLE *R, BB bb, siz n ); 44 | 45 | // Convert bounding boxes to encoded masks. 46 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 47 | 48 | // Convert polygon to encoded mask. 49 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 50 | 51 | // Get compressed string representation of encoded mask. 52 | char* rleToString( const RLE *R ); 53 | 54 | // Convert from compressed string representation of encoded mask. 55 | void rleFrString( RLE *R, char *s, siz h, siz w ); 56 | -------------------------------------------------------------------------------- /lib/roi_data_layer/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/roi_data_layer/roidb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Transform a roidb into a trainable roidb by adding a bunch of metadata.""" 9 | 10 | import numpy as np 11 | from fast_rcnn.config import cfg 12 | from fast_rcnn.bbox_transform import bbox_transform 13 | from utils.cython_bbox import bbox_overlaps 14 | import PIL 15 | 16 | def prepare_roidb(imdb): 17 | """Enrich the imdb's roidb by adding some derived quantities that 18 | are useful for training. This function precomputes the maximum 19 | overlap, taken over ground-truth boxes, between each ROI and 20 | each ground-truth box. The class with maximum overlap is also 21 | recorded. 
22 | """ 23 | sizes = [PIL.Image.open(imdb.image_path_at(i)).size 24 | for i in xrange(imdb.num_images)] 25 | roidb = imdb.roidb 26 | for i in xrange(len(imdb.image_index)): 27 | roidb[i]['image'] = imdb.image_path_at(i) 28 | roidb[i]['width'] = sizes[i][0] 29 | roidb[i]['height'] = sizes[i][1] 30 | # need gt_overlaps as a dense array for argmax 31 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 32 | # max overlap with gt over classes (columns) 33 | max_overlaps = gt_overlaps.max(axis=1) 34 | # gt class that had the max overlap 35 | max_classes = gt_overlaps.argmax(axis=1) 36 | roidb[i]['max_classes'] = max_classes 37 | roidb[i]['max_overlaps'] = max_overlaps 38 | # sanity checks 39 | # max overlap of 0 => class should be zero (background) 40 | zero_inds = np.where(max_overlaps == 0)[0] 41 | assert all(max_classes[zero_inds] == 0) 42 | # max overlap > 0 => class should not be zero (must be a fg class) 43 | nonzero_inds = np.where(max_overlaps > 0)[0] 44 | assert all(max_classes[nonzero_inds] != 0) 45 | 46 | def add_bbox_regression_targets(roidb): 47 | """Add information needed to train bounding-box regressors.""" 48 | assert len(roidb) > 0 49 | assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?' 50 | 51 | num_images = len(roidb) 52 | # Infer number of classes from the number of columns in gt_overlaps 53 | num_classes = roidb[0]['gt_overlaps'].shape[1] 54 | for im_i in xrange(num_images): 55 | rois = roidb[im_i]['boxes'] 56 | max_overlaps = roidb[im_i]['max_overlaps'] 57 | max_classes = roidb[im_i]['max_classes'] 58 | roidb[im_i]['bbox_targets'] = \ 59 | _compute_targets(rois, max_overlaps, max_classes) 60 | 61 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 62 | # Use fixed / precomputed "means" and "stds" instead of empirical values 63 | means = np.tile( 64 | np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1)) 65 | stds = np.tile( 66 | np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1)) 67 | else: 68 | # Compute values needed for means and stds 69 | # var(x) = E(x^2) - E(x)^2 70 | class_counts = np.zeros((num_classes, 1)) + cfg.EPS 71 | sums = np.zeros((num_classes, 4)) 72 | squared_sums = np.zeros((num_classes, 4)) 73 | for im_i in xrange(num_images): 74 | targets = roidb[im_i]['bbox_targets'] 75 | for cls in xrange(1, num_classes): 76 | cls_inds = np.where(targets[:, 0] == cls)[0] 77 | if cls_inds.size > 0: 78 | class_counts[cls] += cls_inds.size 79 | sums[cls, :] += targets[cls_inds, 1:].sum(axis=0) 80 | squared_sums[cls, :] += \ 81 | (targets[cls_inds, 1:] ** 2).sum(axis=0) 82 | 83 | means = sums / class_counts 84 | stds = np.sqrt(squared_sums / class_counts - means ** 2) 85 | 86 | print 'bbox target means:' 87 | print means 88 | print means[1:, :].mean(axis=0) # ignore bg class 89 | print 'bbox target stdevs:' 90 | print stds 91 | print stds[1:, :].mean(axis=0) # ignore bg class 92 | 93 | # Normalize targets 94 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS: 95 | print "Normalizing targets" 96 | for im_i in xrange(num_images): 97 | targets = roidb[im_i]['bbox_targets'] 98 | for cls in xrange(1, num_classes): 99 | cls_inds = np.where(targets[:, 0] == cls)[0] 100 | roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :] 101 | roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :] 102 | else: 103 | print "NOT normalizing targets" 104 | 105 | # These values will be needed for making predictions 106 | # (the predicts will need to be unnormalized and uncentered) 107 | return means.ravel(), stds.ravel() 108 | 109 | def _compute_targets(rois, overlaps, 
labels):
110 | """Compute bounding-box regression targets for an image."""
111 | # Indices of ground-truth ROIs
112 | gt_inds = np.where(overlaps == 1)[0]
113 | if len(gt_inds) == 0:
114 | # Bail if the image has no ground-truth ROIs
115 | return np.zeros((rois.shape[0], 5), dtype=np.float32)
116 | # Indices of examples for which we try to make predictions
117 | ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]
118 |
119 | # Get IoU overlap between each ex ROI and gt ROI
120 | ex_gt_overlaps = bbox_overlaps(
121 | np.ascontiguousarray(rois[ex_inds, :], dtype=np.float),
122 | np.ascontiguousarray(rois[gt_inds, :], dtype=np.float))
123 |
124 | # Find which gt ROI each ex ROI has max overlap with:
125 | # this will be the ex ROI's gt target
126 | gt_assignment = ex_gt_overlaps.argmax(axis=1)
127 | gt_rois = rois[gt_inds[gt_assignment], :]
128 | ex_rois = rois[ex_inds, :]
129 |
130 | targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
131 | targets[ex_inds, 0] = labels[ex_inds]
132 | targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
133 | return targets
134 |
--------------------------------------------------------------------------------
/lib/rpn/README.md:
--------------------------------------------------------------------------------
1 | ### `rpn` module overview
2 |
3 | ##### `generate_anchors.py`
4 |
5 | Generates a regular grid of multi-scale, multi-aspect anchor boxes.
6 |
7 | ##### `proposal_layer.py`
8 |
9 | Converts RPN outputs (per-anchor scores and bbox regression estimates) into object proposals.
10 |
11 | ##### `anchor_target_layer.py`
12 |
13 | Generates training targets/labels for each anchor. Classification labels are 1 (object), 0 (not object) or -1 (ignore).
14 | Bbox regression targets are specified when the classification label is > 0.
15 |
16 | ##### `proposal_target_layer.py`
17 |
18 | Generates training targets/labels for each object proposal: classification labels 0 - K (bg or object class 1, ... , K)
19 | and bbox regression targets when the label is > 0.
20 |
21 | ##### `generate.py`
22 |
23 | Generates object detection proposals from an imdb using an RPN.
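
##### Quick sanity check

A minimal usage sketch (not part of the original pipeline) that exercises the anchor generation entry point; it assumes `lib/` is on `PYTHONPATH`, e.g. via `tools/_init_paths.py`:

```python
import numpy as np
from rpn.generate_anchors import generate_anchors

# Defaults: base_size=16, ratios=[0.5, 1, 2], scales=2**np.arange(3, 6),
# which yields the 9 reference windows listed in generate_anchors.py.
anchors = generate_anchors()
print(anchors.shape)  # (9, 4), each row an (x1, y1, x2, y2) box
```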
24 |
--------------------------------------------------------------------------------
/lib/rpn/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick and Sean Bell
6 | # --------------------------------------------------------
7 |
--------------------------------------------------------------------------------
/lib/rpn/generate.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Faster R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | from fast_rcnn.config import cfg
9 | from utils.blob import im_list_to_blob
10 | from utils.timer import Timer
11 | import numpy as np
12 | import cv2
13 | import matplotlib.pyplot as plt  # _vis_proposals below uses plt but it was never imported
14 | def _vis_proposals(im, dets, thresh=0.5):
15 | """Draw detected bounding boxes."""
16 | inds = np.where(dets[:, -1] >= thresh)[0]
17 | if len(inds) == 0:
18 | return
19 |
20 | class_name = 'obj'
21 | im = im[:, :, (2, 1, 0)]
22 | fig, ax = plt.subplots(figsize=(12, 12))
23 | ax.imshow(im, aspect='equal')
24 | for i in inds:
25 | bbox = dets[i, :4]
26 | score = dets[i, -1]
27 |
28 | ax.add_patch(
29 | plt.Rectangle((bbox[0], bbox[1]),
30 | bbox[2] - bbox[0],
31 | bbox[3] - bbox[1], fill=False,
32 | edgecolor='red', linewidth=3.5)
33 | )
34 | ax.text(bbox[0], bbox[1] - 2,
35 | '{:s} {:.3f}'.format(class_name, score),
36 | bbox=dict(facecolor='blue', alpha=0.5),
37 | fontsize=14, color='white')
38 |
39 | ax.set_title(('{} detections with '
40 | 'p({} | box) >= {:.1f}').format(class_name, class_name,
41 | thresh),
42 | fontsize=14)
43 | plt.axis('off')
44 | plt.tight_layout()
45 | plt.draw()
46 |
47 | def _get_image_blob(im):
48 | """Converts an image into a network input.
49 | 50 | Arguments: 51 | im (ndarray): a color image in BGR order 52 | 53 | Returns: 54 | blob (ndarray): a data blob holding an image pyramid 55 | im_scale_factors (list): list of image scales (relative to im) used 56 | in the image pyramid 57 | """ 58 | im_orig = im.astype(np.float32, copy=True) 59 | im_orig -= cfg.PIXEL_MEANS 60 | 61 | im_shape = im_orig.shape 62 | im_size_min = np.min(im_shape[0:2]) 63 | im_size_max = np.max(im_shape[0:2]) 64 | 65 | processed_ims = [] 66 | 67 | assert len(cfg.TEST.SCALES) == 1 68 | target_size = cfg.TEST.SCALES[0] 69 | 70 | im_scale = float(target_size) / float(im_size_min) 71 | # Prevent the biggest axis from being more than MAX_SIZE 72 | if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: 73 | im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) 74 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, 75 | interpolation=cv2.INTER_LINEAR) 76 | im_info = np.hstack((im.shape[:2], im_scale))[np.newaxis, :] 77 | processed_ims.append(im) 78 | 79 | # Create a blob to hold the input images 80 | blob = im_list_to_blob(processed_ims) 81 | 82 | return blob, im_info 83 | 84 | def im_proposals(net, im): 85 | """Generate RPN proposals on a single image.""" 86 | blobs = {} 87 | blobs['data'], blobs['im_info'] = _get_image_blob(im) 88 | net.blobs['data'].reshape(*(blobs['data'].shape)) 89 | net.blobs['im_info'].reshape(*(blobs['im_info'].shape)) 90 | blobs_out = net.forward( 91 | data=blobs['data'].astype(np.float32, copy=False), 92 | im_info=blobs['im_info'].astype(np.float32, copy=False)) 93 | 94 | scale = blobs['im_info'][0, 2] 95 | boxes = blobs_out['rois'][:, 1:].copy() / scale 96 | scores = blobs_out['scores'].copy() 97 | return boxes, scores 98 | 99 | def imdb_proposals(net, imdb): 100 | """Generate RPN proposals on all images in an imdb.""" 101 | 102 | _t = Timer() 103 | imdb_boxes = [[] for _ in xrange(imdb.num_images)] 104 | for i in xrange(imdb.num_images): 105 | im = cv2.imread(imdb.image_path_at(i)) 106 | _t.tic() 107 | imdb_boxes[i], scores = im_proposals(net, im) 108 | _t.toc() 109 | print 'im_proposals: {:d}/{:d} {:.3f}s' \ 110 | .format(i + 1, imdb.num_images, _t.average_time) 111 | if 0: 112 | dets = np.hstack((imdb_boxes[i], scores)) 113 | # from IPython import embed; embed() 114 | _vis_proposals(im, dets[:3, :], thresh=0.9) 115 | plt.show() 116 | 117 | return imdb_boxes 118 | -------------------------------------------------------------------------------- /lib/rpn/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 11 | # 12 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 13 | # >> anchors 14 | # 15 | # anchors = 16 | # 17 | # -83 -39 100 56 18 | # -175 -87 192 104 19 | # -359 -183 376 200 20 | # -55 -55 72 72 21 | # -119 -119 136 136 22 | # -247 -247 264 264 23 | # -35 -79 52 96 24 | # -79 -167 96 184 25 | # -167 -343 184 360 26 | 27 | #array([[ -83., -39., 100., 56.], 28 | # [-175., -87., 192., 104.], 29 | # [-359., -183., 376., 200.], 30 | # [ -55., -55., 72., 72.], 31 | # [-119., -119., 136., 136.], 32 | # [-247., -247., 264., 264.], 33 | # [ -35., 
-79., 52., 96.], 34 | # [ -79., -167., 96., 184.], 35 | # [-167., -343., 184., 360.]]) 36 | 37 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 38 | scales=2**np.arange(3, 6)): 39 | """ 40 | Generate anchor (reference) windows by enumerating aspect ratios X 41 | scales wrt a reference (0, 0, 15, 15) window. 42 | """ 43 | 44 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 45 | ratio_anchors = _ratio_enum(base_anchor, ratios) 46 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 47 | for i in xrange(ratio_anchors.shape[0])]) 48 | return anchors 49 | 50 | def _whctrs(anchor): 51 | """ 52 | Return width, height, x center, and y center for an anchor (window). 53 | """ 54 | 55 | w = anchor[2] - anchor[0] + 1 56 | h = anchor[3] - anchor[1] + 1 57 | x_ctr = anchor[0] + 0.5 * (w - 1) 58 | y_ctr = anchor[1] + 0.5 * (h - 1) 59 | return w, h, x_ctr, y_ctr 60 | 61 | def _mkanchors(ws, hs, x_ctr, y_ctr): 62 | """ 63 | Given a vector of widths (ws) and heights (hs) around a center 64 | (x_ctr, y_ctr), output a set of anchors (windows). 65 | """ 66 | 67 | ws = ws[:, np.newaxis] 68 | hs = hs[:, np.newaxis] 69 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 70 | y_ctr - 0.5 * (hs - 1), 71 | x_ctr + 0.5 * (ws - 1), 72 | y_ctr + 0.5 * (hs - 1))) 73 | return anchors 74 | 75 | def _ratio_enum(anchor, ratios): 76 | """ 77 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 78 | """ 79 | 80 | w, h, x_ctr, y_ctr = _whctrs(anchor) 81 | size = w * h 82 | size_ratios = size / ratios 83 | ws = np.round(np.sqrt(size_ratios)) 84 | hs = np.round(ws * ratios) 85 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 86 | return anchors 87 | 88 | def _scale_enum(anchor, scales): 89 | """ 90 | Enumerate a set of anchors for each scale wrt an anchor. 91 | """ 92 | 93 | w, h, x_ctr, y_ctr = _whctrs(anchor) 94 | ws = w * scales 95 | hs = h * scales 96 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 97 | return anchors 98 | 99 | if __name__ == '__main__': 100 | import time 101 | t = time.time() 102 | a = generate_anchors() 103 | print time.time() - t 104 | print a 105 | from IPython import embed; embed() 106 | -------------------------------------------------------------------------------- /lib/rpn/proposal_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import caffe 9 | import numpy as np 10 | import yaml 11 | from fast_rcnn.config import cfg 12 | from generate_anchors import generate_anchors 13 | from fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes 14 | from fast_rcnn.nms_wrapper import nms 15 | 16 | DEBUG = False 17 | 18 | class ProposalLayer(caffe.Layer): 19 | """ 20 | Outputs object detection proposals by applying estimated bounding-box 21 | transformations to a set of regular boxes (called "anchors"). 
22 | """ 23 | 24 | def setup(self, bottom, top): 25 | # parse the layer parameter string, which must be valid YAML 26 | layer_params = yaml.load(self.param_str_) 27 | 28 | self._feat_stride = layer_params['feat_stride'] 29 | anchor_scales = layer_params.get('scales', (8, 16, 32)) 30 | self._anchors = generate_anchors(scales=np.array(anchor_scales)) 31 | self._num_anchors = self._anchors.shape[0] 32 | 33 | if DEBUG: 34 | print 'feat_stride: {}'.format(self._feat_stride) 35 | print 'anchors:' 36 | print self._anchors 37 | 38 | # rois blob: holds R regions of interest, each is a 5-tuple 39 | # (n, x1, y1, x2, y2) specifying an image batch index n and a 40 | # rectangle (x1, y1, x2, y2) 41 | top[0].reshape(1, 5) 42 | 43 | # scores blob: holds scores for R regions of interest 44 | if len(top) > 1: 45 | top[1].reshape(1, 1, 1, 1) 46 | 47 | def forward(self, bottom, top): 48 | # Algorithm: 49 | # 50 | # for each (H, W) location i 51 | # generate A anchor boxes centered on cell i 52 | # apply predicted bbox deltas at cell i to each of the A anchors 53 | # clip predicted boxes to image 54 | # remove predicted boxes with either height or width < threshold 55 | # sort all (proposal, score) pairs by score from highest to lowest 56 | # take top pre_nms_topN proposals before NMS 57 | # apply NMS with threshold 0.7 to remaining proposals 58 | # take after_nms_topN proposals after NMS 59 | # return the top proposals (-> RoIs top, scores top) 60 | 61 | assert bottom[0].data.shape[0] == 1, \ 62 | 'Only single item batches are supported' 63 | 64 | cfg_key = str(self.phase) # either 'TRAIN' or 'TEST' 65 | pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N 66 | post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N 67 | nms_thresh = cfg[cfg_key].RPN_NMS_THRESH 68 | min_size = cfg[cfg_key].RPN_MIN_SIZE 69 | 70 | # the first set of _num_anchors channels are bg probs 71 | # the second set are the fg probs, which we want 72 | scores = bottom[0].data[:, self._num_anchors:, :, :] 73 | bbox_deltas = bottom[1].data 74 | im_info = bottom[2].data[0, :] 75 | 76 | if DEBUG: 77 | print 'im_size: ({}, {})'.format(im_info[0], im_info[1]) 78 | print 'scale: {}'.format(im_info[2]) 79 | 80 | # 1. 
Generate proposals from bbox deltas and shifted anchors 81 | height, width = scores.shape[-2:] 82 | 83 | if DEBUG: 84 | print 'score map size: {}'.format(scores.shape) 85 | 86 | # Enumerate all shifts 87 | shift_x = np.arange(0, width) * self._feat_stride 88 | shift_y = np.arange(0, height) * self._feat_stride 89 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 90 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), 91 | shift_x.ravel(), shift_y.ravel())).transpose() 92 | 93 | # Enumerate all shifted anchors: 94 | # 95 | # add A anchors (1, A, 4) to 96 | # cell K shifts (K, 1, 4) to get 97 | # shift anchors (K, A, 4) 98 | # reshape to (K*A, 4) shifted anchors 99 | A = self._num_anchors 100 | K = shifts.shape[0] 101 | anchors = self._anchors.reshape((1, A, 4)) + \ 102 | shifts.reshape((1, K, 4)).transpose((1, 0, 2)) 103 | anchors = anchors.reshape((K * A, 4)) 104 | 105 | # Transpose and reshape predicted bbox transformations to get them 106 | # into the same order as the anchors: 107 | # 108 | # bbox deltas will be (1, 4 * A, H, W) format 109 | # transpose to (1, H, W, 4 * A) 110 | # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a) 111 | # in slowest to fastest order 112 | bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) 113 | 114 | # Same story for the scores: 115 | # 116 | # scores are (1, A, H, W) format 117 | # transpose to (1, H, W, A) 118 | # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a) 119 | scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1)) 120 | 121 | # Convert anchors into proposals via bbox transformations 122 | proposals = bbox_transform_inv(anchors, bbox_deltas) 123 | 124 | # 2. clip predicted boxes to image 125 | proposals = clip_boxes(proposals, im_info[:2]) 126 | 127 | # 3. remove predicted boxes with either height or width < threshold 128 | # (NOTE: convert min_size to input image scale stored in im_info[2]) 129 | keep = _filter_boxes(proposals, min_size * im_info[2]) 130 | proposals = proposals[keep, :] 131 | scores = scores[keep] 132 | 133 | # 4. sort all (proposal, score) pairs by score from highest to lowest 134 | # 5. take top pre_nms_topN (e.g. 6000) 135 | order = scores.ravel().argsort()[::-1] 136 | if pre_nms_topN > 0: 137 | order = order[:pre_nms_topN] 138 | proposals = proposals[order, :] 139 | scores = scores[order] 140 | 141 | # 6. apply nms (e.g. threshold = 0.7) 142 | # 7. take after_nms_topN (e.g. 300) 143 | # 8. return the top proposals (-> RoIs top) 144 | keep = nms(np.hstack((proposals, scores)), nms_thresh) 145 | if post_nms_topN > 0: 146 | keep = keep[:post_nms_topN] 147 | proposals = proposals[keep, :] 148 | scores = scores[keep] 149 | 150 | # Output rois blob 151 | # Our RPN implementation only supports a single input image, so all 152 | # batch inds are 0 153 | batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) 154 | blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) 155 | top[0].reshape(*(blob.shape)) 156 | top[0].data[...] = blob 157 | 158 | # [Optional] output scores blob 159 | if len(top) > 1: 160 | top[1].reshape(*(scores.shape)) 161 | top[1].data[...] 
= scores 162 | 163 | def backward(self, top, propagate_down, bottom): 164 | """This layer does not propagate gradients.""" 165 | pass 166 | 167 | def reshape(self, bottom, top): 168 | """Reshaping happens during the call to forward.""" 169 | pass 170 | 171 | def _filter_boxes(boxes, min_size): 172 | """Remove all boxes with any side smaller than min_size.""" 173 | ws = boxes[:, 2] - boxes[:, 0] + 1 174 | hs = boxes[:, 3] - boxes[:, 1] + 1 175 | keep = np.where((ws >= min_size) & (hs >= min_size))[0] 176 | return keep 177 | -------------------------------------------------------------------------------- /lib/rpn/proposal_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import caffe 9 | import yaml 10 | import numpy as np 11 | import numpy.random as npr 12 | from fast_rcnn.config import cfg 13 | from fast_rcnn.bbox_transform import bbox_transform 14 | from utils.cython_bbox import bbox_overlaps 15 | 16 | DEBUG = False 17 | 18 | class ProposalTargetLayer(caffe.Layer): 19 | """ 20 | Assign object detection proposals to ground-truth targets. Produces proposal 21 | classification labels and bounding-box regression targets. 22 | """ 23 | 24 | def setup(self, bottom, top): 25 | layer_params = yaml.load(self.param_str_) 26 | self._num_classes = layer_params['num_classes'] 27 | 28 | # sampled rois (0, x1, y1, x2, y2) 29 | top[0].reshape(1, 5) 30 | # labels 31 | top[1].reshape(1, 1) 32 | # bbox_targets 33 | top[2].reshape(1, self._num_classes * 4) 34 | # bbox_inside_weights 35 | top[3].reshape(1, self._num_classes * 4) 36 | # bbox_outside_weights 37 | top[4].reshape(1, self._num_classes * 4) 38 | 39 | def forward(self, bottom, top): 40 | # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN 41 | # (i.e., rpn.proposal_layer.ProposalLayer), or any other source 42 | all_rois = bottom[0].data 43 | # GT boxes (x1, y1, x2, y2, label) 44 | # TODO(rbg): it's annoying that sometimes I have extra info before 45 | # and other times after box coordinates -- normalize to one format 46 | gt_boxes = bottom[1].data 47 | 48 | # Include ground-truth boxes in the set of candidate rois 49 | zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) 50 | all_rois = np.vstack( 51 | (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) 52 | ) 53 | 54 | # Sanity check: single batch only 55 | assert np.all(all_rois[:, 0] == 0), \ 56 | 'Only single item batches are supported' 57 | 58 | num_images = 1 59 | rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images 60 | fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) 61 | 62 | # Sample rois with classification labels and bounding box regression 63 | # targets 64 | labels, rois, bbox_targets, bbox_inside_weights = _sample_rois( 65 | all_rois, gt_boxes, fg_rois_per_image, 66 | rois_per_image, self._num_classes) 67 | 68 | if DEBUG: 69 | print 'num fg: {}'.format((labels > 0).sum()) 70 | print 'num bg: {}'.format((labels == 0).sum()) 71 | self._count += 1 72 | self._fg_num += (labels > 0).sum() 73 | self._bg_num += (labels == 0).sum() 74 | print 'num fg avg: {}'.format(self._fg_num / self._count) 75 | print 'num bg avg: {}'.format(self._bg_num / self._count) 76 | print 'ratio: {:.3f}'.format(float(self._fg_num) / float(self._bg_num)) 
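# NOTE: self._count, self._fg_num and self._bg_num are read in the DEBUG
# branch above but never initialized in setup(), so running with
# DEBUG = True would raise AttributeError unless those counters are
# first set to zero (e.g. in setup()).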
77 | 78 | # sampled rois 79 | top[0].reshape(*rois.shape) 80 | top[0].data[...] = rois 81 | 82 | # classification labels 83 | top[1].reshape(*labels.shape) 84 | top[1].data[...] = labels 85 | 86 | # bbox_targets 87 | top[2].reshape(*bbox_targets.shape) 88 | top[2].data[...] = bbox_targets 89 | 90 | # bbox_inside_weights 91 | top[3].reshape(*bbox_inside_weights.shape) 92 | top[3].data[...] = bbox_inside_weights 93 | 94 | # bbox_outside_weights 95 | top[4].reshape(*bbox_inside_weights.shape) 96 | top[4].data[...] = np.array(bbox_inside_weights > 0).astype(np.float32) 97 | 98 | def backward(self, top, propagate_down, bottom): 99 | """This layer does not propagate gradients.""" 100 | pass 101 | 102 | def reshape(self, bottom, top): 103 | """Reshaping happens during the call to forward.""" 104 | pass 105 | 106 | 107 | def _get_bbox_regression_labels(bbox_target_data, num_classes): 108 | """Bounding-box regression targets (bbox_target_data) are stored in a 109 | compact form N x (class, tx, ty, tw, th) 110 | 111 | This function expands those targets into the 4-of-4*K representation used 112 | by the network (i.e. only one class has non-zero targets). 113 | 114 | Returns: 115 | bbox_target (ndarray): N x 4K blob of regression targets 116 | bbox_inside_weights (ndarray): N x 4K blob of loss weights 117 | """ 118 | 119 | clss = bbox_target_data[:, 0] 120 | bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) 121 | bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 122 | inds = np.where(clss > 0)[0] 123 | for ind in inds: 124 | cls = clss[ind] 125 | start = 4 * cls 126 | end = start + 4 127 | bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] 128 | bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS 129 | return bbox_targets, bbox_inside_weights 130 | 131 | 132 | def _compute_targets(ex_rois, gt_rois, labels): 133 | """Compute bounding-box regression targets for an image.""" 134 | 135 | assert ex_rois.shape[0] == gt_rois.shape[0] 136 | assert ex_rois.shape[1] == 4 137 | assert gt_rois.shape[1] == 4 138 | 139 | targets = bbox_transform(ex_rois, gt_rois) 140 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 141 | # Optionally normalize targets by a precomputed mean and stdev 142 | targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) 143 | / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)) 144 | return np.hstack( 145 | (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) 146 | 147 | def _sample_rois(all_rois, gt_boxes, fg_rois_per_image, rois_per_image, num_classes): 148 | """Generate a random sample of RoIs comprising foreground and background 149 | examples. 
150 | """ 151 | # overlaps: (rois x gt_boxes) 152 | overlaps = bbox_overlaps( 153 | np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), 154 | np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) 155 | gt_assignment = overlaps.argmax(axis=1) 156 | max_overlaps = overlaps.max(axis=1) 157 | labels = gt_boxes[gt_assignment, 4] 158 | 159 | # Select foreground RoIs as those with >= FG_THRESH overlap 160 | fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0] 161 | # Guard against the case when an image has fewer than fg_rois_per_image 162 | # foreground RoIs 163 | fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size) 164 | # Sample foreground regions without replacement 165 | if fg_inds.size > 0: 166 | fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False) 167 | 168 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 169 | bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) & 170 | (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 171 | # Compute number of background RoIs to take from this image (guarding 172 | # against there being fewer than desired) 173 | bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image 174 | bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size) 175 | # Sample background regions without replacement 176 | if bg_inds.size > 0: 177 | bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False) 178 | 179 | # The indices that we're selecting (both fg and bg) 180 | keep_inds = np.append(fg_inds, bg_inds) 181 | # Select sampled values from various arrays: 182 | labels = labels[keep_inds] 183 | # Clamp labels for the background RoIs to 0 184 | labels[fg_rois_per_this_image:] = 0 185 | rois = all_rois[keep_inds] 186 | 187 | bbox_target_data = _compute_targets( 188 | rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels) 189 | 190 | bbox_targets, bbox_inside_weights = \ 191 | _get_bbox_regression_labels(bbox_target_data, num_classes) 192 | 193 | return labels, rois, bbox_targets, bbox_inside_weights 194 | -------------------------------------------------------------------------------- /lib/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | from setuptools import setup 11 | from distutils.extension import Extension 12 | from Cython.Distutils import build_ext 13 | import subprocess 14 | import numpy as np 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # Adapted fom 19 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 20 | for dir in path.split(os.pathsep): 21 | binpath = pjoin(dir, name) 22 | if os.path.exists(binpath): 23 | return os.path.abspath(binpath) 24 | return None 25 | 26 | 27 | def locate_cuda(): 28 | """Locate the CUDA environment on the system 29 | 30 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 31 | and values giving the absolute path to each directory. 32 | 33 | Starts by looking for the CUDAHOME env variable. If not found, everything 34 | is based on finding 'nvcc' in the PATH. 
35 | """ 36 | 37 | # first check if the CUDAHOME env variable is in use 38 | if 'CUDAHOME' in os.environ: 39 | home = os.environ['CUDAHOME'] 40 | nvcc = pjoin(home, 'bin', 'nvcc') 41 | else: 42 | # otherwise, search the PATH for NVCC 43 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 44 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 45 | if nvcc is None: 46 | raise EnvironmentError('The nvcc binary could not be ' 47 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 48 | home = os.path.dirname(os.path.dirname(nvcc)) 49 | 50 | cudaconfig = {'home':home, 'nvcc':nvcc, 51 | 'include': pjoin(home, 'include'), 52 | 'lib64': pjoin(home, 'lib64')} 53 | for k, v in cudaconfig.iteritems(): 54 | if not os.path.exists(v): 55 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 56 | 57 | return cudaconfig 58 | CUDA = locate_cuda() 59 | 60 | 61 | # Obtain the numpy include directory. This logic works across numpy versions. 62 | try: 63 | numpy_include = np.get_include() 64 | except AttributeError: 65 | numpy_include = np.get_numpy_include() 66 | 67 | def customize_compiler_for_nvcc(self): 68 | """inject deep into distutils to customize how the dispatch 69 | to gcc/nvcc works. 70 | 71 | If you subclass UnixCCompiler, it's not trivial to get your subclass 72 | injected in, and still have the right customizations (i.e. 73 | distutils.sysconfig.customize_compiler) run on it. So instead of going 74 | the OO route, I have this. Note, it's kindof like a wierd functional 75 | subclassing going on.""" 76 | 77 | # tell the compiler it can processes .cu 78 | self.src_extensions.append('.cu') 79 | 80 | # save references to the default compiler_so and _comple methods 81 | default_compiler_so = self.compiler_so 82 | super = self._compile 83 | 84 | # now redefine the _compile method. This gets executed for each 85 | # object but distutils doesn't have the ability to change compilers 86 | # based on source extension: we add it. 
87 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 88 | if os.path.splitext(src)[1] == '.cu': 89 | # use the cuda for .cu files 90 | self.set_executable('compiler_so', CUDA['nvcc']) 91 | # use only a subset of the extra_postargs, which are 1-1 translated 92 | # from the extra_compile_args in the Extension class 93 | postargs = extra_postargs['nvcc'] 94 | else: 95 | postargs = extra_postargs['gcc'] 96 | 97 | super(obj, src, ext, cc_args, postargs, pp_opts) 98 | # reset the default compiler_so, which we might have changed for cuda 99 | self.compiler_so = default_compiler_so 100 | 101 | # inject our redefined _compile method into the class 102 | self._compile = _compile 103 | 104 | 105 | # run the customize_compiler 106 | class custom_build_ext(build_ext): 107 | def build_extensions(self): 108 | customize_compiler_for_nvcc(self.compiler) 109 | build_ext.build_extensions(self) 110 | 111 | 112 | ext_modules = [ 113 | Extension( 114 | "utils.cython_bbox", 115 | ["utils/bbox.pyx"], 116 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 117 | include_dirs = [numpy_include] 118 | ), 119 | Extension( 120 | "nms.cpu_nms", 121 | ["nms/cpu_nms.pyx"], 122 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 123 | include_dirs = [numpy_include] 124 | ), 125 | Extension('nms.gpu_nms', 126 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 127 | library_dirs=[CUDA['lib64']], 128 | libraries=['cudart'], 129 | language='c++', 130 | runtime_library_dirs=[CUDA['lib64']], 131 | # this syntax is specific to this build system 132 | # we're only going to use certain compiler args with nvcc and not with 133 | # gcc the implementation of this trick is in customize_compiler() below 134 | extra_compile_args={'gcc': ["-Wno-unused-function"], 135 | 'nvcc': ['-arch=sm_35', 136 | '--ptxas-options=-v', 137 | '-c', 138 | '--compiler-options', 139 | "'-fPIC'"]}, 140 | include_dirs = [numpy_include, CUDA['include']] 141 | ), 142 | Extension( 143 | 'pycocotools._mask', 144 | sources=['pycocotools/maskApi.c', 'pycocotools/_mask.pyx'], 145 | include_dirs = [numpy_include, 'pycocotools'], 146 | extra_compile_args={ 147 | 'gcc': ['-Wno-cpp', '-Wno-unused-function', '-std=c99']}, 148 | ), 149 | ] 150 | 151 | setup( 152 | name='fast_rcnn', 153 | ext_modules=ext_modules, 154 | # inject our custom trigger 155 | cmdclass={'build_ext': custom_build_ext}, 156 | ) 157 | -------------------------------------------------------------------------------- /lib/transform/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaolonw/adversarial-frcnn/2a7bb96c9884c0f09ca5bde35a981087be28562b/lib/transform/__init__.py -------------------------------------------------------------------------------- /lib/transform/torch_image_transform_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # -------------------------------------------------------- 5 | 6 | """ Transform images for compatibility with models trained with 7 | https://github.com/facebook/fb.resnet.torch. 
8 | 9 | Usage in model prototxt: 10 | 11 | layer { 12 | name: 'data_xform' 13 | type: 'Python' 14 | bottom: 'data_caffe' 15 | top: 'data' 16 | python_param { 17 | module: 'transform.torch_image_transform_layer' 18 | layer: 'TorchImageTransformLayer' 19 | } 20 | } 21 | """ 22 | 23 | import caffe 24 | from fast_rcnn.config import cfg 25 | import numpy as np 26 | 27 | class TorchImageTransformLayer(caffe.Layer): 28 | def setup(self, bottom, top): 29 | # (1, 3, 1, 1) shaped arrays 30 | self.PIXEL_MEANS = \ 31 | np.array([[[[0.48462227599918]], 32 | [[0.45624044862054]], 33 | [[0.40588363755159]]]]) 34 | self.PIXEL_STDS = \ 35 | np.array([[[[0.22889466674951]], 36 | [[0.22446679341259]], 37 | [[0.22495548344775]]]]) 38 | # The default ("old") pixel means that were already subtracted 39 | channel_swap = (0, 3, 1, 2) 40 | self.OLD_PIXEL_MEANS = \ 41 | cfg.PIXEL_MEANS[np.newaxis, :, :, :].transpose(channel_swap) 42 | 43 | top[0].reshape(*(bottom[0].shape)) 44 | 45 | def forward(self, bottom, top): 46 | ims = bottom[0].data 47 | # Invert the channel means that were already subtracted 48 | ims += self.OLD_PIXEL_MEANS 49 | # 1. Permute BGR to RGB and normalize to [0, 1] 50 | ims = ims[:, [2, 1, 0], :, :] / 255.0 51 | # 2. Remove channel means 52 | ims -= self.PIXEL_MEANS 53 | # 3. Standardize channels 54 | ims /= self.PIXEL_STDS 55 | top[0].reshape(*(ims.shape)) 56 | top[0].data[...] = ims 57 | 58 | def backward(self, top, propagate_down, bottom): 59 | """This layer does not propagate gradients.""" 60 | pass 61 | 62 | def reshape(self, bottom, top): 63 | """Reshaping happens during the call to forward.""" 64 | pass 65 | -------------------------------------------------------------------------------- /lib/utils/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.so 3 | -------------------------------------------------------------------------------- /lib/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/utils/bbox.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - 
query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps 56 | -------------------------------------------------------------------------------- /lib/utils/blob.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Blob helper functions.""" 9 | 10 | import numpy as np 11 | import cv2 12 | 13 | def im_list_to_blob(ims): 14 | """Convert a list of images into a network input. 15 | 16 | Assumes images are already prepared (means subtracted, BGR order, ...). 17 | """ 18 | max_shape = np.array([im.shape for im in ims]).max(axis=0) 19 | num_images = len(ims) 20 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 21 | dtype=np.float32) 22 | for i in xrange(num_images): 23 | im = ims[i] 24 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 25 | # Move channels (axis 3) to axis 1 26 | # Axis order will become: (batch elem, channel, height, width) 27 | channel_swap = (0, 3, 1, 2) 28 | blob = blob.transpose(channel_swap) 29 | return blob 30 | 31 | def prep_im_for_blob(im, pixel_means, target_size, max_size): 32 | """Mean subtract and scale an image for use in a blob.""" 33 | im = im.astype(np.float32, copy=False) 34 | im -= pixel_means 35 | im_shape = im.shape 36 | im_size_min = np.min(im_shape[0:2]) 37 | im_size_max = np.max(im_shape[0:2]) 38 | im_scale = float(target_size) / float(im_size_min) 39 | # Prevent the biggest axis from being more than MAX_SIZE 40 | if np.round(im_scale * im_size_max) > max_size: 41 | im_scale = float(max_size) / float(im_size_max) 42 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 43 | interpolation=cv2.INTER_LINEAR) 44 | 45 | return im, im_scale 46 | -------------------------------------------------------------------------------- /lib/utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | class Timer(object): 11 | """A simple timer.""" 12 | def __init__(self): 13 | self.total_time = 0. 14 | self.calls = 0 15 | self.start_time = 0. 16 | self.diff = 0. 17 | self.average_time = 0. 
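# A minimal usage sketch for the Timer class below (illustrative only;
# do_work() is a hypothetical stand-in for the code being timed):
#   timer = Timer()
#   timer.tic()
#   do_work()
#   avg_secs = timer.toc()  # average=True (the default) returns the running mean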
18 | 19 | def tic(self): 20 | # using time.time instead of time.clock because time.clock 21 | # does not normalize for multithreading 22 | self.start_time = time.time() 23 | 24 | def toc(self, average=True): 25 | self.diff = time.time() - self.start_time 26 | self.total_time += self.diff 27 | self.calls += 1 28 | self.average_time = self.total_time / self.calls 29 | if average: 30 | return self.average_time 31 | else: 32 | return self.diff 33 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/fast_rcnn/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG16/fast_rcnn/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | # iter_size: 1 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | # We still use the snapshot prefix, though 15 | snapshot_prefix: "vgg16_fast_rcnn" 16 | #debug_info: true 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/fast_rcnn/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_ILSVRC_16_layers" 2 | 3 | input: "data" 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 224 8 | dim: 224 9 | } 10 | 11 | input: "rois" 12 | input_shape { 13 | dim: 1 # to be changed on-the-fly to num ROIs 14 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 15 | } 16 | 17 | layer { 18 | name: "conv1_1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1_1" 22 | param { 23 | lr_mult: 0 24 | decay_mult: 0 25 | } 26 | param { 27 | lr_mult: 0 28 | decay_mult: 0 29 | } 30 | convolution_param { 31 | num_output: 64 32 | pad: 1 33 | kernel_size: 3 34 | } 35 | } 36 | layer { 37 | name: "relu1_1" 38 | type: "ReLU" 39 | bottom: "conv1_1" 40 | top: "conv1_1" 41 | } 42 | layer { 43 | name: "conv1_2" 44 | type: "Convolution" 45 | bottom: "conv1_1" 46 | top: "conv1_2" 47 | param { 48 | lr_mult: 0 49 | decay_mult: 0 50 | } 51 | param { 52 | lr_mult: 0 53 | decay_mult: 0 54 | } 55 | convolution_param { 56 | num_output: 64 57 | pad: 1 58 | kernel_size: 3 59 | } 60 | } 61 | layer { 62 | name: "relu1_2" 63 | type: "ReLU" 64 | bottom: "conv1_2" 65 | top: "conv1_2" 66 | } 67 | layer { 68 | name: "pool1" 69 | type: "Pooling" 70 | bottom: "conv1_2" 71 | top: "pool1" 72 | pooling_param { 73 | pool: MAX 74 | kernel_size: 2 75 | stride: 2 76 | } 77 | } 78 | layer { 79 | name: "conv2_1" 80 | type: "Convolution" 81 | bottom: "pool1" 82 | top: "conv2_1" 83 | param { 84 | lr_mult: 0 85 | decay_mult: 0 86 | } 87 | param { 88 | lr_mult: 0 89 | decay_mult: 0 90 | } 91 | convolution_param { 92 | num_output: 128 93 | pad: 1 94 | kernel_size: 3 95 | } 96 | } 97 | layer { 98 | name: "relu2_1" 99 | type: "ReLU" 100 | bottom: "conv2_1" 101 | top: "conv2_1" 102 | } 103 | layer { 104 | name: "conv2_2" 105 | type: "Convolution" 106 | bottom: "conv2_1" 107 | top: "conv2_2" 108 | param { 109 | lr_mult: 0 110 | decay_mult: 0 111 | } 112 | param { 113 | lr_mult: 0 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 128 118 | pad: 1 119 | kernel_size: 3 120 | } 121 | } 122 | layer { 123 | name: "relu2_2" 124 | type: "ReLU" 125 | bottom: "conv2_2" 126 | top: "conv2_2" 127 | } 128 | layer { 129 | name: "pool2" 130 | type: "Pooling" 131 | bottom: "conv2_2"
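# Note: the four 2x2/stride-2 pooling stages in this network each halve the
# feature map, so a 224x224 input reaches conv5_3 at 14x14, i.e. 1/16 scale;
# this is where roi_pool5's spatial_scale of 0.0625 comes from.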
132 | top: "pool2" 133 | pooling_param { 134 | pool: MAX 135 | kernel_size: 2 136 | stride: 2 137 | } 138 | } 139 | layer { 140 | name: "conv3_1" 141 | type: "Convolution" 142 | bottom: "pool2" 143 | top: "conv3_1" 144 | param { 145 | lr_mult: 1 146 | decay_mult: 1 147 | } 148 | param { 149 | lr_mult: 2 150 | decay_mult: 0 151 | } 152 | convolution_param { 153 | num_output: 256 154 | pad: 1 155 | kernel_size: 3 156 | } 157 | } 158 | layer { 159 | name: "relu3_1" 160 | type: "ReLU" 161 | bottom: "conv3_1" 162 | top: "conv3_1" 163 | } 164 | layer { 165 | name: "conv3_2" 166 | type: "Convolution" 167 | bottom: "conv3_1" 168 | top: "conv3_2" 169 | param { 170 | lr_mult: 1 171 | decay_mult: 1 172 | } 173 | param { 174 | lr_mult: 2 175 | decay_mult: 0 176 | } 177 | convolution_param { 178 | num_output: 256 179 | pad: 1 180 | kernel_size: 3 181 | } 182 | } 183 | layer { 184 | name: "relu3_2" 185 | type: "ReLU" 186 | bottom: "conv3_2" 187 | top: "conv3_2" 188 | } 189 | layer { 190 | name: "conv3_3" 191 | type: "Convolution" 192 | bottom: "conv3_2" 193 | top: "conv3_3" 194 | param { 195 | lr_mult: 1 196 | decay_mult: 1 197 | } 198 | param { 199 | lr_mult: 2 200 | decay_mult: 0 201 | } 202 | convolution_param { 203 | num_output: 256 204 | pad: 1 205 | kernel_size: 3 206 | } 207 | } 208 | layer { 209 | name: "relu3_3" 210 | type: "ReLU" 211 | bottom: "conv3_3" 212 | top: "conv3_3" 213 | } 214 | layer { 215 | name: "pool3" 216 | type: "Pooling" 217 | bottom: "conv3_3" 218 | top: "pool3" 219 | pooling_param { 220 | pool: MAX 221 | kernel_size: 2 222 | stride: 2 223 | } 224 | } 225 | layer { 226 | name: "conv4_1" 227 | type: "Convolution" 228 | bottom: "pool3" 229 | top: "conv4_1" 230 | param { 231 | lr_mult: 1 232 | decay_mult: 1 233 | } 234 | param { 235 | lr_mult: 2 236 | decay_mult: 0 237 | } 238 | convolution_param { 239 | num_output: 512 240 | pad: 1 241 | kernel_size: 3 242 | } 243 | } 244 | layer { 245 | name: "relu4_1" 246 | type: "ReLU" 247 | bottom: "conv4_1" 248 | top: "conv4_1" 249 | } 250 | layer { 251 | name: "conv4_2" 252 | type: "Convolution" 253 | bottom: "conv4_1" 254 | top: "conv4_2" 255 | param { 256 | lr_mult: 1 257 | decay_mult: 1 258 | } 259 | param { 260 | lr_mult: 2 261 | decay_mult: 0 262 | } 263 | convolution_param { 264 | num_output: 512 265 | pad: 1 266 | kernel_size: 3 267 | } 268 | } 269 | layer { 270 | name: "relu4_2" 271 | type: "ReLU" 272 | bottom: "conv4_2" 273 | top: "conv4_2" 274 | } 275 | layer { 276 | name: "conv4_3" 277 | type: "Convolution" 278 | bottom: "conv4_2" 279 | top: "conv4_3" 280 | param { 281 | lr_mult: 1 282 | decay_mult: 1 283 | } 284 | param { 285 | lr_mult: 2 286 | decay_mult: 0 287 | } 288 | convolution_param { 289 | num_output: 512 290 | pad: 1 291 | kernel_size: 3 292 | } 293 | } 294 | layer { 295 | name: "relu4_3" 296 | type: "ReLU" 297 | bottom: "conv4_3" 298 | top: "conv4_3" 299 | } 300 | layer { 301 | name: "pool4" 302 | type: "Pooling" 303 | bottom: "conv4_3" 304 | top: "pool4" 305 | pooling_param { 306 | pool: MAX 307 | kernel_size: 2 308 | stride: 2 309 | } 310 | } 311 | layer { 312 | name: "conv5_1" 313 | type: "Convolution" 314 | bottom: "pool4" 315 | top: "conv5_1" 316 | param { 317 | lr_mult: 1 318 | decay_mult: 1 319 | } 320 | param { 321 | lr_mult: 2 322 | decay_mult: 0 323 | } 324 | convolution_param { 325 | num_output: 512 326 | pad: 1 327 | kernel_size: 3 328 | } 329 | } 330 | layer { 331 | name: "relu5_1" 332 | type: "ReLU" 333 | bottom: "conv5_1" 334 | top: "conv5_1" 335 | } 336 | layer { 337 | name: "conv5_2" 338 | type: 
"Convolution" 339 | bottom: "conv5_1" 340 | top: "conv5_2" 341 | param { 342 | lr_mult: 1 343 | decay_mult: 1 344 | } 345 | param { 346 | lr_mult: 2 347 | decay_mult: 0 348 | } 349 | convolution_param { 350 | num_output: 512 351 | pad: 1 352 | kernel_size: 3 353 | } 354 | } 355 | layer { 356 | name: "relu5_2" 357 | type: "ReLU" 358 | bottom: "conv5_2" 359 | top: "conv5_2" 360 | } 361 | layer { 362 | name: "conv5_3" 363 | type: "Convolution" 364 | bottom: "conv5_2" 365 | top: "conv5_3" 366 | param { 367 | lr_mult: 1 368 | decay_mult: 1 369 | } 370 | param { 371 | lr_mult: 2 372 | decay_mult: 0 373 | } 374 | convolution_param { 375 | num_output: 512 376 | pad: 1 377 | kernel_size: 3 378 | } 379 | } 380 | layer { 381 | name: "relu5_3" 382 | type: "ReLU" 383 | bottom: "conv5_3" 384 | top: "conv5_3" 385 | } 386 | layer { 387 | name: "roi_pool5" 388 | type: "ROIPooling" 389 | bottom: "conv5_3" 390 | bottom: "rois" 391 | top: "pool5" 392 | roi_pooling_param { 393 | pooled_w: 7 394 | pooled_h: 7 395 | spatial_scale: 0.0625 # 1/16 396 | } 397 | } 398 | layer { 399 | name: "fc6" 400 | type: "InnerProduct" 401 | bottom: "pool5" 402 | top: "fc6" 403 | param { 404 | lr_mult: 1 405 | decay_mult: 1 406 | } 407 | param { 408 | lr_mult: 2 409 | decay_mult: 0 410 | } 411 | inner_product_param { 412 | num_output: 4096 413 | } 414 | } 415 | layer { 416 | name: "relu6" 417 | type: "ReLU" 418 | bottom: "fc6" 419 | top: "fc6" 420 | } 421 | layer { 422 | name: "drop6" 423 | type: "Dropout" 424 | bottom: "fc6" 425 | top: "fc6" 426 | dropout_param { 427 | dropout_ratio: 0.5 428 | } 429 | } 430 | layer { 431 | name: "fc7" 432 | type: "InnerProduct" 433 | bottom: "fc6" 434 | top: "fc7" 435 | param { 436 | lr_mult: 1 437 | decay_mult: 1 438 | } 439 | param { 440 | lr_mult: 2 441 | decay_mult: 0 442 | } 443 | inner_product_param { 444 | num_output: 4096 445 | } 446 | } 447 | layer { 448 | name: "relu7" 449 | type: "ReLU" 450 | bottom: "fc7" 451 | top: "fc7" 452 | } 453 | layer { 454 | name: "drop7" 455 | type: "Dropout" 456 | bottom: "fc7" 457 | top: "fc7" 458 | dropout_param { 459 | dropout_ratio: 0.5 460 | } 461 | } 462 | layer { 463 | name: "cls_score" 464 | type: "InnerProduct" 465 | bottom: "fc7" 466 | top: "cls_score" 467 | param { 468 | lr_mult: 1 469 | decay_mult: 1 470 | } 471 | param { 472 | lr_mult: 2 473 | decay_mult: 0 474 | } 475 | inner_product_param { 476 | num_output: 21 477 | weight_filler { 478 | type: "gaussian" 479 | std: 0.01 480 | } 481 | bias_filler { 482 | type: "constant" 483 | value: 0 484 | } 485 | } 486 | } 487 | layer { 488 | name: "bbox_pred" 489 | type: "InnerProduct" 490 | bottom: "fc7" 491 | top: "bbox_pred" 492 | param { 493 | lr_mult: 1 494 | decay_mult: 1 495 | } 496 | param { 497 | lr_mult: 2 498 | decay_mult: 0 499 | } 500 | inner_product_param { 501 | num_output: 84 502 | weight_filler { 503 | type: "gaussian" 504 | std: 0.001 505 | } 506 | bias_filler { 507 | type: "constant" 508 | value: 0 509 | } 510 | } 511 | } 512 | layer { 513 | name: "cls_prob" 514 | type: "Softmax" 515 | bottom: "cls_score" 516 | top: "cls_prob" 517 | } 518 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/fast_rcnn/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_ILSVRC_16_layers" 2 | layer { 3 | name: 'data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'rois' 7 | top: 'labels' 8 | top: 'bbox_targets' 9 | top: 'bbox_inside_weights' 10 | top: 'bbox_outside_weights' 11 | python_param { 12 
| module: 'roi_data_layer.layer' 13 | layer: 'RoIDataLayer' 14 | param_str: "'num_classes': 21" 15 | } 16 | } 17 | layer { 18 | name: "conv1_1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1_1" 22 | param { 23 | lr_mult: 0 24 | decay_mult: 0 25 | } 26 | param { 27 | lr_mult: 0 28 | decay_mult: 0 29 | } 30 | convolution_param { 31 | num_output: 64 32 | pad: 1 33 | kernel_size: 3 34 | } 35 | } 36 | layer { 37 | name: "relu1_1" 38 | type: "ReLU" 39 | bottom: "conv1_1" 40 | top: "conv1_1" 41 | } 42 | layer { 43 | name: "conv1_2" 44 | type: "Convolution" 45 | bottom: "conv1_1" 46 | top: "conv1_2" 47 | param { 48 | lr_mult: 0 49 | decay_mult: 0 50 | } 51 | param { 52 | lr_mult: 0 53 | decay_mult: 0 54 | } 55 | convolution_param { 56 | num_output: 64 57 | pad: 1 58 | kernel_size: 3 59 | } 60 | } 61 | layer { 62 | name: "relu1_2" 63 | type: "ReLU" 64 | bottom: "conv1_2" 65 | top: "conv1_2" 66 | } 67 | layer { 68 | name: "pool1" 69 | type: "Pooling" 70 | bottom: "conv1_2" 71 | top: "pool1" 72 | pooling_param { 73 | pool: MAX 74 | kernel_size: 2 75 | stride: 2 76 | } 77 | } 78 | layer { 79 | name: "conv2_1" 80 | type: "Convolution" 81 | bottom: "pool1" 82 | top: "conv2_1" 83 | param { 84 | lr_mult: 0 85 | decay_mult: 0 86 | } 87 | param { 88 | lr_mult: 0 89 | decay_mult: 0 90 | } 91 | convolution_param { 92 | num_output: 128 93 | pad: 1 94 | kernel_size: 3 95 | } 96 | } 97 | layer { 98 | name: "relu2_1" 99 | type: "ReLU" 100 | bottom: "conv2_1" 101 | top: "conv2_1" 102 | } 103 | layer { 104 | name: "conv2_2" 105 | type: "Convolution" 106 | bottom: "conv2_1" 107 | top: "conv2_2" 108 | param { 109 | lr_mult: 0 110 | decay_mult: 0 111 | } 112 | param { 113 | lr_mult: 0 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 128 118 | pad: 1 119 | kernel_size: 3 120 | } 121 | } 122 | layer { 123 | name: "relu2_2" 124 | type: "ReLU" 125 | bottom: "conv2_2" 126 | top: "conv2_2" 127 | } 128 | layer { 129 | name: "pool2" 130 | type: "Pooling" 131 | bottom: "conv2_2" 132 | top: "pool2" 133 | pooling_param { 134 | pool: MAX 135 | kernel_size: 2 136 | stride: 2 137 | } 138 | } 139 | layer { 140 | name: "conv3_1" 141 | type: "Convolution" 142 | bottom: "pool2" 143 | top: "conv3_1" 144 | param { 145 | lr_mult: 1 146 | } 147 | param { 148 | lr_mult: 2 149 | } 150 | convolution_param { 151 | num_output: 256 152 | pad: 1 153 | kernel_size: 3 154 | } 155 | } 156 | layer { 157 | name: "relu3_1" 158 | type: "ReLU" 159 | bottom: "conv3_1" 160 | top: "conv3_1" 161 | } 162 | layer { 163 | name: "conv3_2" 164 | type: "Convolution" 165 | bottom: "conv3_1" 166 | top: "conv3_2" 167 | param { 168 | lr_mult: 1 169 | } 170 | param { 171 | lr_mult: 2 172 | } 173 | convolution_param { 174 | num_output: 256 175 | pad: 1 176 | kernel_size: 3 177 | } 178 | } 179 | layer { 180 | name: "relu3_2" 181 | type: "ReLU" 182 | bottom: "conv3_2" 183 | top: "conv3_2" 184 | } 185 | layer { 186 | name: "conv3_3" 187 | type: "Convolution" 188 | bottom: "conv3_2" 189 | top: "conv3_3" 190 | param { 191 | lr_mult: 1 192 | } 193 | param { 194 | lr_mult: 2 195 | } 196 | convolution_param { 197 | num_output: 256 198 | pad: 1 199 | kernel_size: 3 200 | } 201 | } 202 | layer { 203 | name: "relu3_3" 204 | type: "ReLU" 205 | bottom: "conv3_3" 206 | top: "conv3_3" 207 | } 208 | layer { 209 | name: "pool3" 210 | type: "Pooling" 211 | bottom: "conv3_3" 212 | top: "pool3" 213 | pooling_param { 214 | pool: MAX 215 | kernel_size: 2 216 | stride: 2 217 | } 218 | } 219 | layer { 220 | name: "conv4_1" 221 | type: "Convolution" 222 | 
bottom: "pool3" 223 | top: "conv4_1" 224 | param { 225 | lr_mult: 1 226 | } 227 | param { 228 | lr_mult: 2 229 | } 230 | convolution_param { 231 | num_output: 512 232 | pad: 1 233 | kernel_size: 3 234 | } 235 | } 236 | layer { 237 | name: "relu4_1" 238 | type: "ReLU" 239 | bottom: "conv4_1" 240 | top: "conv4_1" 241 | } 242 | layer { 243 | name: "conv4_2" 244 | type: "Convolution" 245 | bottom: "conv4_1" 246 | top: "conv4_2" 247 | param { 248 | lr_mult: 1 249 | } 250 | param { 251 | lr_mult: 2 252 | } 253 | convolution_param { 254 | num_output: 512 255 | pad: 1 256 | kernel_size: 3 257 | } 258 | } 259 | layer { 260 | name: "relu4_2" 261 | type: "ReLU" 262 | bottom: "conv4_2" 263 | top: "conv4_2" 264 | } 265 | layer { 266 | name: "conv4_3" 267 | type: "Convolution" 268 | bottom: "conv4_2" 269 | top: "conv4_3" 270 | param { 271 | lr_mult: 1 272 | } 273 | param { 274 | lr_mult: 2 275 | } 276 | convolution_param { 277 | num_output: 512 278 | pad: 1 279 | kernel_size: 3 280 | } 281 | } 282 | layer { 283 | name: "relu4_3" 284 | type: "ReLU" 285 | bottom: "conv4_3" 286 | top: "conv4_3" 287 | } 288 | layer { 289 | name: "pool4" 290 | type: "Pooling" 291 | bottom: "conv4_3" 292 | top: "pool4" 293 | pooling_param { 294 | pool: MAX 295 | kernel_size: 2 296 | stride: 2 297 | } 298 | } 299 | layer { 300 | name: "conv5_1" 301 | type: "Convolution" 302 | bottom: "pool4" 303 | top: "conv5_1" 304 | param { 305 | lr_mult: 1 306 | } 307 | param { 308 | lr_mult: 2 309 | } 310 | convolution_param { 311 | num_output: 512 312 | pad: 1 313 | kernel_size: 3 314 | } 315 | } 316 | layer { 317 | name: "relu5_1" 318 | type: "ReLU" 319 | bottom: "conv5_1" 320 | top: "conv5_1" 321 | } 322 | layer { 323 | name: "conv5_2" 324 | type: "Convolution" 325 | bottom: "conv5_1" 326 | top: "conv5_2" 327 | param { 328 | lr_mult: 1 329 | } 330 | param { 331 | lr_mult: 2 332 | } 333 | convolution_param { 334 | num_output: 512 335 | pad: 1 336 | kernel_size: 3 337 | } 338 | } 339 | layer { 340 | name: "relu5_2" 341 | type: "ReLU" 342 | bottom: "conv5_2" 343 | top: "conv5_2" 344 | } 345 | layer { 346 | name: "conv5_3" 347 | type: "Convolution" 348 | bottom: "conv5_2" 349 | top: "conv5_3" 350 | param { 351 | lr_mult: 1 352 | } 353 | param { 354 | lr_mult: 2 355 | } 356 | convolution_param { 357 | num_output: 512 358 | pad: 1 359 | kernel_size: 3 360 | } 361 | } 362 | layer { 363 | name: "relu5_3" 364 | type: "ReLU" 365 | bottom: "conv5_3" 366 | top: "conv5_3" 367 | } 368 | layer { 369 | name: "roi_pool5" 370 | type: "ROIPooling" 371 | bottom: "conv5_3" 372 | bottom: "rois" 373 | top: "pool5" 374 | roi_pooling_param { 375 | pooled_w: 7 376 | pooled_h: 7 377 | spatial_scale: 0.0625 # 1/16 378 | } 379 | } 380 | layer { 381 | name: "fc6" 382 | type: "InnerProduct" 383 | bottom: "pool5" 384 | top: "fc6" 385 | param { 386 | lr_mult: 1 387 | } 388 | param { 389 | lr_mult: 2 390 | } 391 | inner_product_param { 392 | num_output: 4096 393 | } 394 | } 395 | layer { 396 | name: "relu6" 397 | type: "ReLU" 398 | bottom: "fc6" 399 | top: "fc6" 400 | } 401 | layer { 402 | name: "drop6" 403 | type: "Dropout" 404 | bottom: "fc6" 405 | top: "fc6" 406 | dropout_param { 407 | dropout_ratio: 0.5 408 | } 409 | } 410 | layer { 411 | name: "fc7" 412 | type: "InnerProduct" 413 | bottom: "fc6" 414 | top: "fc7" 415 | param { 416 | lr_mult: 1 417 | } 418 | param { 419 | lr_mult: 2 420 | } 421 | inner_product_param { 422 | num_output: 4096 423 | } 424 | } 425 | layer { 426 | name: "relu7" 427 | type: "ReLU" 428 | bottom: "fc7" 429 | top: "fc7" 430 | } 431 | layer { 
432 | name: "drop7" 433 | type: "Dropout" 434 | bottom: "fc7" 435 | top: "fc7" 436 | dropout_param { 437 | dropout_ratio: 0.5 438 | } 439 | } 440 | layer { 441 | name: "cls_score" 442 | type: "InnerProduct" 443 | bottom: "fc7" 444 | top: "cls_score" 445 | param { 446 | lr_mult: 1 447 | } 448 | param { 449 | lr_mult: 2 450 | } 451 | inner_product_param { 452 | num_output: 21 453 | weight_filler { 454 | type: "gaussian" 455 | std: 0.01 456 | } 457 | bias_filler { 458 | type: "constant" 459 | value: 0 460 | } 461 | } 462 | } 463 | layer { 464 | name: "bbox_pred" 465 | type: "InnerProduct" 466 | bottom: "fc7" 467 | top: "bbox_pred" 468 | param { 469 | lr_mult: 1 470 | } 471 | param { 472 | lr_mult: 2 473 | } 474 | inner_product_param { 475 | num_output: 84 476 | weight_filler { 477 | type: "gaussian" 478 | std: 0.001 479 | } 480 | bias_filler { 481 | type: "constant" 482 | value: 0 483 | } 484 | } 485 | } 486 | layer { 487 | name: "loss_cls" 488 | type: "SoftmaxWithLoss" 489 | bottom: "cls_score" 490 | bottom: "labels" 491 | top: "loss_cls" 492 | loss_weight: 1 493 | } 494 | layer { 495 | name: "loss_bbox" 496 | type: "SmoothL1Loss" 497 | bottom: "bbox_pred" 498 | bottom: "bbox_targets" 499 | bottom: "bbox_inside_weights" 500 | bottom: "bbox_outside_weights" 501 | top: "loss_bbox" 502 | loss_weight: 1 503 | } 504 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/fast_rcnn_adv/init_weights2.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "prototxt" : "models/pascal_voc/VGG16/fast_rcnn_adv_pretrain/train.prototxt", 4 | "model" : "output/fast_rcnn_adv_pretrain/voc_2007_trainval/fast_rcnn_adv_pretrain_iter_25000.caffemodel", 5 | "copy_ops" : { 6 | "source" : ["conv6_mask", "conv7_mask", "conv8_mask", "conv9_mask", "conv10_mask"], 7 | "dest" : ["conv6_mask", "conv7_mask", "conv8_mask", "conv9_mask", "conv10_mask"], 8 | "reshape" : [0, 0, 0, 0, 0] 9 | } 10 | }, 11 | { 12 | "prototxt" : "models/pascal_voc/VGG16/fast_rcnn/test.prototxt", 13 | "model" : "output/fast_rcnn_adv/voc_2007_trainval/fast_rcnn_std_iter_10000.caffemodel", 14 | "copy_ops" : { 15 | "source" : ["conv1_1", "conv1_2", "conv2_1", "conv2_2", "conv3_1", "conv3_2", "conv3_3", "conv4_1", "conv4_2", "conv4_3", "conv5_1", "conv5_2", "conv5_3", "fc6", "fc7", "cls_score"], 16 | "dest" : ["conv1_1", "conv1_2", "conv2_1", "conv2_2", "conv3_1", "conv3_2", "conv3_3", "conv4_1", "conv4_2", "conv4_3", "conv5_1", "conv5_2", "conv5_3", "fc6", "fc7", "cls_score"], 17 | "reshape" : [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 18 | } 19 | } 20 | ] 21 | 22 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/fast_rcnn_adv/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG16/fast_rcnn_adv/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | iter_size: 2 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | # We still use the snapshot prefix, though 15 | snapshot_prefix: "fast_rcnn_adv" 16 | #debug_info: true 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/fast_rcnn_adv_pretrain/solver.prototxt: 
-------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG16/fast_rcnn_adv_pretrain/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 20000 6 | display: 20 7 | average_loss: 100 8 | iter_size: 8 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | # We still use the snapshot prefix, though 15 | snapshot_prefix: "fast_rcnn_adv_pretrain" 16 | #debug_info: true 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/fast_rcnn_std/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG16/fast_rcnn_std/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | iter_size: 2 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | # We still use the snapshot prefix, though 15 | snapshot_prefix: "fast_rcnn_std" 16 | #debug_info: true 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/fast_rcnn_std/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_ILSVRC_16_layers" 2 | 3 | input: "data" 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 224 8 | dim: 224 9 | } 10 | 11 | input: "rois" 12 | input_shape { 13 | dim: 1 # to be changed on-the-fly to num ROIs 14 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 15 | } 16 | 17 | layer { 18 | name: "conv1_1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1_1" 22 | param { 23 | lr_mult: 0 24 | decay_mult: 0 25 | } 26 | param { 27 | lr_mult: 0 28 | decay_mult: 0 29 | } 30 | convolution_param { 31 | num_output: 64 32 | pad: 1 33 | kernel_size: 3 34 | } 35 | } 36 | layer { 37 | name: "relu1_1" 38 | type: "ReLU" 39 | bottom: "conv1_1" 40 | top: "conv1_1" 41 | } 42 | layer { 43 | name: "conv1_2" 44 | type: "Convolution" 45 | bottom: "conv1_1" 46 | top: "conv1_2" 47 | param { 48 | lr_mult: 0 49 | decay_mult: 0 50 | } 51 | param { 52 | lr_mult: 0 53 | decay_mult: 0 54 | } 55 | convolution_param { 56 | num_output: 64 57 | pad: 1 58 | kernel_size: 3 59 | } 60 | } 61 | layer { 62 | name: "relu1_2" 63 | type: "ReLU" 64 | bottom: "conv1_2" 65 | top: "conv1_2" 66 | } 67 | layer { 68 | name: "pool1" 69 | type: "Pooling" 70 | bottom: "conv1_2" 71 | top: "pool1" 72 | pooling_param { 73 | pool: MAX 74 | kernel_size: 2 75 | stride: 2 76 | } 77 | } 78 | layer { 79 | name: "conv2_1" 80 | type: "Convolution" 81 | bottom: "pool1" 82 | top: "conv2_1" 83 | param { 84 | lr_mult: 0 85 | decay_mult: 0 86 | } 87 | param { 88 | lr_mult: 0 89 | decay_mult: 0 90 | } 91 | convolution_param { 92 | num_output: 128 93 | pad: 1 94 | kernel_size: 3 95 | } 96 | } 97 | layer { 98 | name: "relu2_1" 99 | type: "ReLU" 100 | bottom: "conv2_1" 101 | top: "conv2_1" 102 | } 103 | layer { 104 | name: "conv2_2" 105 | type: "Convolution" 106 | bottom: "conv2_1" 107 | top: "conv2_2" 108 | param { 109 | lr_mult: 0 110 | decay_mult: 0 111 | } 112 | param { 113 | lr_mult: 0 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 128 118 | pad: 1 119 | kernel_size: 3 120 | } 121 | } 122 | layer { 123 | name: "relu2_2" 124 | type: "ReLU" 125 
| bottom: "conv2_2" 126 | top: "conv2_2" 127 | } 128 | layer { 129 | name: "pool2" 130 | type: "Pooling" 131 | bottom: "conv2_2" 132 | top: "pool2" 133 | pooling_param { 134 | pool: MAX 135 | kernel_size: 2 136 | stride: 2 137 | } 138 | } 139 | layer { 140 | name: "conv3_1" 141 | type: "Convolution" 142 | bottom: "pool2" 143 | top: "conv3_1" 144 | param { 145 | lr_mult: 1 146 | decay_mult: 1 147 | } 148 | param { 149 | lr_mult: 2 150 | decay_mult: 0 151 | } 152 | convolution_param { 153 | num_output: 256 154 | pad: 1 155 | kernel_size: 3 156 | } 157 | } 158 | layer { 159 | name: "relu3_1" 160 | type: "ReLU" 161 | bottom: "conv3_1" 162 | top: "conv3_1" 163 | } 164 | layer { 165 | name: "conv3_2" 166 | type: "Convolution" 167 | bottom: "conv3_1" 168 | top: "conv3_2" 169 | param { 170 | lr_mult: 1 171 | decay_mult: 1 172 | } 173 | param { 174 | lr_mult: 2 175 | decay_mult: 0 176 | } 177 | convolution_param { 178 | num_output: 256 179 | pad: 1 180 | kernel_size: 3 181 | } 182 | } 183 | layer { 184 | name: "relu3_2" 185 | type: "ReLU" 186 | bottom: "conv3_2" 187 | top: "conv3_2" 188 | } 189 | layer { 190 | name: "conv3_3" 191 | type: "Convolution" 192 | bottom: "conv3_2" 193 | top: "conv3_3" 194 | param { 195 | lr_mult: 1 196 | decay_mult: 1 197 | } 198 | param { 199 | lr_mult: 2 200 | decay_mult: 0 201 | } 202 | convolution_param { 203 | num_output: 256 204 | pad: 1 205 | kernel_size: 3 206 | } 207 | } 208 | layer { 209 | name: "relu3_3" 210 | type: "ReLU" 211 | bottom: "conv3_3" 212 | top: "conv3_3" 213 | } 214 | layer { 215 | name: "pool3" 216 | type: "Pooling" 217 | bottom: "conv3_3" 218 | top: "pool3" 219 | pooling_param { 220 | pool: MAX 221 | kernel_size: 2 222 | stride: 2 223 | } 224 | } 225 | layer { 226 | name: "conv4_1" 227 | type: "Convolution" 228 | bottom: "pool3" 229 | top: "conv4_1" 230 | param { 231 | lr_mult: 1 232 | decay_mult: 1 233 | } 234 | param { 235 | lr_mult: 2 236 | decay_mult: 0 237 | } 238 | convolution_param { 239 | num_output: 512 240 | pad: 1 241 | kernel_size: 3 242 | } 243 | } 244 | layer { 245 | name: "relu4_1" 246 | type: "ReLU" 247 | bottom: "conv4_1" 248 | top: "conv4_1" 249 | } 250 | layer { 251 | name: "conv4_2" 252 | type: "Convolution" 253 | bottom: "conv4_1" 254 | top: "conv4_2" 255 | param { 256 | lr_mult: 1 257 | decay_mult: 1 258 | } 259 | param { 260 | lr_mult: 2 261 | decay_mult: 0 262 | } 263 | convolution_param { 264 | num_output: 512 265 | pad: 1 266 | kernel_size: 3 267 | } 268 | } 269 | layer { 270 | name: "relu4_2" 271 | type: "ReLU" 272 | bottom: "conv4_2" 273 | top: "conv4_2" 274 | } 275 | layer { 276 | name: "conv4_3" 277 | type: "Convolution" 278 | bottom: "conv4_2" 279 | top: "conv4_3" 280 | param { 281 | lr_mult: 1 282 | decay_mult: 1 283 | } 284 | param { 285 | lr_mult: 2 286 | decay_mult: 0 287 | } 288 | convolution_param { 289 | num_output: 512 290 | pad: 1 291 | kernel_size: 3 292 | } 293 | } 294 | layer { 295 | name: "relu4_3" 296 | type: "ReLU" 297 | bottom: "conv4_3" 298 | top: "conv4_3" 299 | } 300 | layer { 301 | name: "pool4" 302 | type: "Pooling" 303 | bottom: "conv4_3" 304 | top: "pool4" 305 | pooling_param { 306 | pool: MAX 307 | kernel_size: 2 308 | stride: 2 309 | } 310 | } 311 | layer { 312 | name: "conv5_1" 313 | type: "Convolution" 314 | bottom: "pool4" 315 | top: "conv5_1" 316 | param { 317 | lr_mult: 1 318 | decay_mult: 1 319 | } 320 | param { 321 | lr_mult: 2 322 | decay_mult: 0 323 | } 324 | convolution_param { 325 | num_output: 512 326 | pad: 1 327 | kernel_size: 3 328 | } 329 | } 330 | layer { 331 | name: 
"relu5_1" 332 | type: "ReLU" 333 | bottom: "conv5_1" 334 | top: "conv5_1" 335 | } 336 | layer { 337 | name: "conv5_2" 338 | type: "Convolution" 339 | bottom: "conv5_1" 340 | top: "conv5_2" 341 | param { 342 | lr_mult: 1 343 | decay_mult: 1 344 | } 345 | param { 346 | lr_mult: 2 347 | decay_mult: 0 348 | } 349 | convolution_param { 350 | num_output: 512 351 | pad: 1 352 | kernel_size: 3 353 | } 354 | } 355 | layer { 356 | name: "relu5_2" 357 | type: "ReLU" 358 | bottom: "conv5_2" 359 | top: "conv5_2" 360 | } 361 | layer { 362 | name: "conv5_3" 363 | type: "Convolution" 364 | bottom: "conv5_2" 365 | top: "conv5_3" 366 | param { 367 | lr_mult: 1 368 | decay_mult: 1 369 | } 370 | param { 371 | lr_mult: 2 372 | decay_mult: 0 373 | } 374 | convolution_param { 375 | num_output: 512 376 | pad: 1 377 | kernel_size: 3 378 | } 379 | } 380 | layer { 381 | name: "relu5_3" 382 | type: "ReLU" 383 | bottom: "conv5_3" 384 | top: "conv5_3" 385 | } 386 | layer { 387 | name: "roi_pool5" 388 | type: "ROIPooling" 389 | bottom: "conv5_3" 390 | bottom: "rois" 391 | top: "pool5" 392 | roi_pooling_param { 393 | pooled_w: 7 394 | pooled_h: 7 395 | spatial_scale: 0.0625 # 1/16 396 | } 397 | } 398 | layer { 399 | name: "fc6" 400 | type: "InnerProduct" 401 | bottom: "pool5" 402 | top: "fc6" 403 | param { 404 | lr_mult: 1 405 | decay_mult: 1 406 | } 407 | param { 408 | lr_mult: 2 409 | decay_mult: 0 410 | } 411 | inner_product_param { 412 | num_output: 4096 413 | } 414 | } 415 | layer { 416 | name: "relu6" 417 | type: "ReLU" 418 | bottom: "fc6" 419 | top: "fc6" 420 | } 421 | layer { 422 | name: "drop6" 423 | type: "Dropout" 424 | bottom: "fc6" 425 | top: "fc6" 426 | dropout_param { 427 | dropout_ratio: 0.5 428 | } 429 | } 430 | layer { 431 | name: "fc7" 432 | type: "InnerProduct" 433 | bottom: "fc6" 434 | top: "fc7" 435 | param { 436 | lr_mult: 1 437 | decay_mult: 1 438 | } 439 | param { 440 | lr_mult: 2 441 | decay_mult: 0 442 | } 443 | inner_product_param { 444 | num_output: 4096 445 | } 446 | } 447 | layer { 448 | name: "relu7" 449 | type: "ReLU" 450 | bottom: "fc7" 451 | top: "fc7" 452 | } 453 | layer { 454 | name: "drop7" 455 | type: "Dropout" 456 | bottom: "fc7" 457 | top: "fc7" 458 | dropout_param { 459 | dropout_ratio: 0.5 460 | } 461 | } 462 | layer { 463 | name: "cls_score" 464 | type: "InnerProduct" 465 | bottom: "fc7" 466 | top: "cls_score" 467 | param { 468 | lr_mult: 1 469 | decay_mult: 1 470 | } 471 | param { 472 | lr_mult: 2 473 | decay_mult: 0 474 | } 475 | inner_product_param { 476 | num_output: 21 477 | weight_filler { 478 | type: "gaussian" 479 | std: 0.01 480 | } 481 | bias_filler { 482 | type: "constant" 483 | value: 0 484 | } 485 | } 486 | } 487 | layer { 488 | name: "bbox_pred" 489 | type: "InnerProduct" 490 | bottom: "fc7" 491 | top: "bbox_pred" 492 | param { 493 | lr_mult: 1 494 | decay_mult: 1 495 | } 496 | param { 497 | lr_mult: 2 498 | decay_mult: 0 499 | } 500 | inner_product_param { 501 | num_output: 84 502 | weight_filler { 503 | type: "gaussian" 504 | std: 0.001 505 | } 506 | bias_filler { 507 | type: "constant" 508 | value: 0 509 | } 510 | } 511 | } 512 | layer { 513 | name: "cls_prob" 514 | type: "Softmax" 515 | bottom: "cls_score" 516 | top: "cls_prob" 517 | } 518 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/fast_rcnn_std/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_ILSVRC_16_layers" 2 | layer { 3 | name: 'data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 
'rois' 7 | top: 'labels' 8 | top: 'bbox_targets' 9 | top: 'bbox_inside_weights' 10 | top: 'bbox_outside_weights' 11 | python_param { 12 | module: 'roi_data_layer.layer' 13 | layer: 'RoIDataLayer' 14 | param_str: "'num_classes': 21" 15 | } 16 | } 17 | layer { 18 | name: "conv1_1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1_1" 22 | param { 23 | lr_mult: 0 24 | decay_mult: 0 25 | } 26 | param { 27 | lr_mult: 0 28 | decay_mult: 0 29 | } 30 | convolution_param { 31 | num_output: 64 32 | pad: 1 33 | kernel_size: 3 34 | } 35 | } 36 | layer { 37 | name: "relu1_1" 38 | type: "ReLU" 39 | bottom: "conv1_1" 40 | top: "conv1_1" 41 | } 42 | layer { 43 | name: "conv1_2" 44 | type: "Convolution" 45 | bottom: "conv1_1" 46 | top: "conv1_2" 47 | param { 48 | lr_mult: 0 49 | decay_mult: 0 50 | } 51 | param { 52 | lr_mult: 0 53 | decay_mult: 0 54 | } 55 | convolution_param { 56 | num_output: 64 57 | pad: 1 58 | kernel_size: 3 59 | } 60 | } 61 | layer { 62 | name: "relu1_2" 63 | type: "ReLU" 64 | bottom: "conv1_2" 65 | top: "conv1_2" 66 | } 67 | layer { 68 | name: "pool1" 69 | type: "Pooling" 70 | bottom: "conv1_2" 71 | top: "pool1" 72 | pooling_param { 73 | pool: MAX 74 | kernel_size: 2 75 | stride: 2 76 | } 77 | } 78 | layer { 79 | name: "conv2_1" 80 | type: "Convolution" 81 | bottom: "pool1" 82 | top: "conv2_1" 83 | param { 84 | lr_mult: 0 85 | decay_mult: 0 86 | } 87 | param { 88 | lr_mult: 0 89 | decay_mult: 0 90 | } 91 | convolution_param { 92 | num_output: 128 93 | pad: 1 94 | kernel_size: 3 95 | } 96 | } 97 | layer { 98 | name: "relu2_1" 99 | type: "ReLU" 100 | bottom: "conv2_1" 101 | top: "conv2_1" 102 | } 103 | layer { 104 | name: "conv2_2" 105 | type: "Convolution" 106 | bottom: "conv2_1" 107 | top: "conv2_2" 108 | param { 109 | lr_mult: 0 110 | decay_mult: 0 111 | } 112 | param { 113 | lr_mult: 0 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 128 118 | pad: 1 119 | kernel_size: 3 120 | } 121 | } 122 | layer { 123 | name: "relu2_2" 124 | type: "ReLU" 125 | bottom: "conv2_2" 126 | top: "conv2_2" 127 | } 128 | layer { 129 | name: "pool2" 130 | type: "Pooling" 131 | bottom: "conv2_2" 132 | top: "pool2" 133 | pooling_param { 134 | pool: MAX 135 | kernel_size: 2 136 | stride: 2 137 | } 138 | } 139 | layer { 140 | name: "conv3_1" 141 | type: "Convolution" 142 | bottom: "pool2" 143 | top: "conv3_1" 144 | param { 145 | lr_mult: 1 146 | } 147 | param { 148 | lr_mult: 2 149 | } 150 | convolution_param { 151 | num_output: 256 152 | pad: 1 153 | kernel_size: 3 154 | } 155 | } 156 | layer { 157 | name: "relu3_1" 158 | type: "ReLU" 159 | bottom: "conv3_1" 160 | top: "conv3_1" 161 | } 162 | layer { 163 | name: "conv3_2" 164 | type: "Convolution" 165 | bottom: "conv3_1" 166 | top: "conv3_2" 167 | param { 168 | lr_mult: 1 169 | } 170 | param { 171 | lr_mult: 2 172 | } 173 | convolution_param { 174 | num_output: 256 175 | pad: 1 176 | kernel_size: 3 177 | } 178 | } 179 | layer { 180 | name: "relu3_2" 181 | type: "ReLU" 182 | bottom: "conv3_2" 183 | top: "conv3_2" 184 | } 185 | layer { 186 | name: "conv3_3" 187 | type: "Convolution" 188 | bottom: "conv3_2" 189 | top: "conv3_3" 190 | param { 191 | lr_mult: 1 192 | } 193 | param { 194 | lr_mult: 2 195 | } 196 | convolution_param { 197 | num_output: 256 198 | pad: 1 199 | kernel_size: 3 200 | } 201 | } 202 | layer { 203 | name: "relu3_3" 204 | type: "ReLU" 205 | bottom: "conv3_3" 206 | top: "conv3_3" 207 | } 208 | layer { 209 | name: "pool3" 210 | type: "Pooling" 211 | bottom: "conv3_3" 212 | top: "pool3" 213 | pooling_param { 
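# 2x2 max pooling with stride 2: halves both spatial dimensions.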
214 | pool: MAX 215 | kernel_size: 2 216 | stride: 2 217 | } 218 | } 219 | layer { 220 | name: "conv4_1" 221 | type: "Convolution" 222 | bottom: "pool3" 223 | top: "conv4_1" 224 | param { 225 | lr_mult: 1 226 | } 227 | param { 228 | lr_mult: 2 229 | } 230 | convolution_param { 231 | num_output: 512 232 | pad: 1 233 | kernel_size: 3 234 | } 235 | } 236 | layer { 237 | name: "relu4_1" 238 | type: "ReLU" 239 | bottom: "conv4_1" 240 | top: "conv4_1" 241 | } 242 | layer { 243 | name: "conv4_2" 244 | type: "Convolution" 245 | bottom: "conv4_1" 246 | top: "conv4_2" 247 | param { 248 | lr_mult: 1 249 | } 250 | param { 251 | lr_mult: 2 252 | } 253 | convolution_param { 254 | num_output: 512 255 | pad: 1 256 | kernel_size: 3 257 | } 258 | } 259 | layer { 260 | name: "relu4_2" 261 | type: "ReLU" 262 | bottom: "conv4_2" 263 | top: "conv4_2" 264 | } 265 | layer { 266 | name: "conv4_3" 267 | type: "Convolution" 268 | bottom: "conv4_2" 269 | top: "conv4_3" 270 | param { 271 | lr_mult: 1 272 | } 273 | param { 274 | lr_mult: 2 275 | } 276 | convolution_param { 277 | num_output: 512 278 | pad: 1 279 | kernel_size: 3 280 | } 281 | } 282 | layer { 283 | name: "relu4_3" 284 | type: "ReLU" 285 | bottom: "conv4_3" 286 | top: "conv4_3" 287 | } 288 | layer { 289 | name: "pool4" 290 | type: "Pooling" 291 | bottom: "conv4_3" 292 | top: "pool4" 293 | pooling_param { 294 | pool: MAX 295 | kernel_size: 2 296 | stride: 2 297 | } 298 | } 299 | layer { 300 | name: "conv5_1" 301 | type: "Convolution" 302 | bottom: "pool4" 303 | top: "conv5_1" 304 | param { 305 | lr_mult: 1 306 | } 307 | param { 308 | lr_mult: 2 309 | } 310 | convolution_param { 311 | num_output: 512 312 | pad: 1 313 | kernel_size: 3 314 | } 315 | } 316 | layer { 317 | name: "relu5_1" 318 | type: "ReLU" 319 | bottom: "conv5_1" 320 | top: "conv5_1" 321 | } 322 | layer { 323 | name: "conv5_2" 324 | type: "Convolution" 325 | bottom: "conv5_1" 326 | top: "conv5_2" 327 | param { 328 | lr_mult: 1 329 | } 330 | param { 331 | lr_mult: 2 332 | } 333 | convolution_param { 334 | num_output: 512 335 | pad: 1 336 | kernel_size: 3 337 | } 338 | } 339 | layer { 340 | name: "relu5_2" 341 | type: "ReLU" 342 | bottom: "conv5_2" 343 | top: "conv5_2" 344 | } 345 | layer { 346 | name: "conv5_3" 347 | type: "Convolution" 348 | bottom: "conv5_2" 349 | top: "conv5_3" 350 | param { 351 | lr_mult: 1 352 | } 353 | param { 354 | lr_mult: 2 355 | } 356 | convolution_param { 357 | num_output: 512 358 | pad: 1 359 | kernel_size: 3 360 | } 361 | } 362 | layer { 363 | name: "relu5_3" 364 | type: "ReLU" 365 | bottom: "conv5_3" 366 | top: "conv5_3" 367 | } 368 | layer { 369 | name: "roi_pool5" 370 | type: "ROIPooling" 371 | bottom: "conv5_3" 372 | bottom: "rois" 373 | top: "pool5" 374 | roi_pooling_param { 375 | pooled_w: 7 376 | pooled_h: 7 377 | spatial_scale: 0.0625 # 1/16 378 | } 379 | } 380 | layer { 381 | name: "fc6" 382 | type: "InnerProduct" 383 | bottom: "pool5" 384 | top: "fc6" 385 | param { 386 | lr_mult: 1 387 | } 388 | param { 389 | lr_mult: 2 390 | } 391 | inner_product_param { 392 | num_output: 4096 393 | } 394 | } 395 | layer { 396 | name: "relu6" 397 | type: "ReLU" 398 | bottom: "fc6" 399 | top: "fc6" 400 | } 401 | layer { 402 | name: "drop6" 403 | type: "Dropout" 404 | bottom: "fc6" 405 | top: "fc6" 406 | dropout_param { 407 | dropout_ratio: 0.5 408 | } 409 | } 410 | layer { 411 | name: "fc7" 412 | type: "InnerProduct" 413 | bottom: "fc6" 414 | top: "fc7" 415 | param { 416 | lr_mult: 1 417 | } 418 | param { 419 | lr_mult: 2 420 | } 421 | inner_product_param { 422 | 
num_output: 4096 423 | } 424 | } 425 | layer { 426 | name: "relu7" 427 | type: "ReLU" 428 | bottom: "fc7" 429 | top: "fc7" 430 | } 431 | layer { 432 | name: "drop7" 433 | type: "Dropout" 434 | bottom: "fc7" 435 | top: "fc7" 436 | dropout_param { 437 | dropout_ratio: 0.5 438 | } 439 | } 440 | layer { 441 | name: "cls_score" 442 | type: "InnerProduct" 443 | bottom: "fc7" 444 | top: "cls_score" 445 | param { 446 | lr_mult: 1 447 | } 448 | param { 449 | lr_mult: 2 450 | } 451 | inner_product_param { 452 | num_output: 21 453 | weight_filler { 454 | type: "gaussian" 455 | std: 0.01 456 | } 457 | bias_filler { 458 | type: "constant" 459 | value: 0 460 | } 461 | } 462 | } 463 | layer { 464 | name: "bbox_pred" 465 | type: "InnerProduct" 466 | bottom: "fc7" 467 | top: "bbox_pred" 468 | param { 469 | lr_mult: 1 470 | } 471 | param { 472 | lr_mult: 2 473 | } 474 | inner_product_param { 475 | num_output: 84 476 | weight_filler { 477 | type: "gaussian" 478 | std: 0.001 479 | } 480 | bias_filler { 481 | type: "constant" 482 | value: 0 483 | } 484 | } 485 | } 486 | layer { 487 | name: "loss_cls" 488 | type: "SoftmaxWithLoss" 489 | bottom: "cls_score" 490 | bottom: "labels" 491 | top: "loss_cls" 492 | loss_weight: 1 493 | } 494 | layer { 495 | name: "loss_bbox" 496 | type: "SmoothL1Loss" 497 | bottom: "bbox_pred" 498 | bottom: "bbox_targets" 499 | bottom: "bbox_inside_weights" 500 | bottom: "bbox_outside_weights" 501 | top: "loss_bbox" 502 | loss_weight: 1 503 | } 504 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/fast_rcnn/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG_CNN_M_1024/fast_rcnn/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 11 | # function 12 | snapshot: 0 13 | # We still use the snapshot prefix, though 14 | snapshot_prefix: "vgg_cnn_m_1024_fast_rcnn" 15 | #debug_info: true 16 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/fast_rcnn/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 224 7 | dim: 224 8 | } 9 | input: "rois" 10 | input_shape { 11 | dim: 1 # to be changed on-the-fly to num ROIs 12 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | param { 20 | lr_mult: 0 21 | decay_mult: 0 22 | } 23 | param { 24 | lr_mult: 0 25 | decay_mult: 0 26 | } 27 | convolution_param { 28 | num_output: 96 29 | kernel_size: 7 30 | stride: 2 31 | } 32 | } 33 | layer { 34 | name: "relu1" 35 | type: "ReLU" 36 | bottom: "conv1" 37 | top: "conv1" 38 | } 39 | layer { 40 | name: "norm1" 41 | type: "LRN" 42 | bottom: "conv1" 43 | top: "norm1" 44 | lrn_param { 45 | local_size: 5 46 | alpha: 0.0005 47 | beta: 0.75 48 | k: 2 49 | } 50 | } 51 | layer { 52 | name: "pool1" 53 | type: "Pooling" 54 | bottom: "norm1" 55 | top: "pool1" 56 | pooling_param { 57 | pool: MAX 58 | kernel_size: 3 59 | stride: 2 60 | } 61 | } 62 | layer { 63 | name: "conv2" 64 | type: "Convolution" 65 | bottom: "pool1" 66 | top: "conv2" 67 | param { 68 | lr_mult: 1 69 | 
decay_mult: 1 70 | } 71 | param { 72 | lr_mult: 2 73 | decay_mult: 0 74 | } 75 | convolution_param { 76 | num_output: 256 77 | pad: 1 78 | kernel_size: 5 79 | stride: 2 80 | } 81 | } 82 | layer { 83 | name: "relu2" 84 | type: "ReLU" 85 | bottom: "conv2" 86 | top: "conv2" 87 | } 88 | layer { 89 | name: "norm2" 90 | type: "LRN" 91 | bottom: "conv2" 92 | top: "norm2" 93 | lrn_param { 94 | local_size: 5 95 | alpha: 0.0005 96 | beta: 0.75 97 | k: 2 98 | } 99 | } 100 | layer { 101 | name: "pool2" 102 | type: "Pooling" 103 | bottom: "norm2" 104 | top: "pool2" 105 | pooling_param { 106 | pool: MAX 107 | kernel_size: 3 108 | stride: 2 109 | } 110 | } 111 | layer { 112 | name: "conv3" 113 | type: "Convolution" 114 | bottom: "pool2" 115 | top: "conv3" 116 | param { 117 | lr_mult: 1 118 | decay_mult: 1 119 | } 120 | param { 121 | lr_mult: 2 122 | decay_mult: 0 123 | } 124 | convolution_param { 125 | num_output: 512 126 | pad: 1 127 | kernel_size: 3 128 | } 129 | } 130 | layer { 131 | name: "relu3" 132 | type: "ReLU" 133 | bottom: "conv3" 134 | top: "conv3" 135 | } 136 | layer { 137 | name: "conv4" 138 | type: "Convolution" 139 | bottom: "conv3" 140 | top: "conv4" 141 | param { 142 | lr_mult: 1 143 | decay_mult: 1 144 | } 145 | param { 146 | lr_mult: 2 147 | decay_mult: 0 148 | } 149 | convolution_param { 150 | num_output: 512 151 | pad: 1 152 | kernel_size: 3 153 | } 154 | } 155 | layer { 156 | name: "relu4" 157 | type: "ReLU" 158 | bottom: "conv4" 159 | top: "conv4" 160 | } 161 | layer { 162 | name: "conv5" 163 | type: "Convolution" 164 | bottom: "conv4" 165 | top: "conv5" 166 | param { 167 | lr_mult: 1 168 | decay_mult: 1 169 | } 170 | param { 171 | lr_mult: 2 172 | decay_mult: 0 173 | } 174 | convolution_param { 175 | num_output: 512 176 | pad: 1 177 | kernel_size: 3 178 | } 179 | } 180 | layer { 181 | name: "relu5" 182 | type: "ReLU" 183 | bottom: "conv5" 184 | top: "conv5" 185 | } 186 | layer { 187 | name: "roi_pool5" 188 | type: "ROIPooling" 189 | bottom: "conv5" 190 | bottom: "rois" 191 | top: "pool5" 192 | roi_pooling_param { 193 | pooled_w: 6 194 | pooled_h: 6 195 | spatial_scale: 0.0625 # 1/16 196 | } 197 | } 198 | layer { 199 | name: "fc6" 200 | type: "InnerProduct" 201 | bottom: "pool5" 202 | top: "fc6" 203 | param { 204 | lr_mult: 1 205 | decay_mult: 1 206 | } 207 | param { 208 | lr_mult: 2 209 | decay_mult: 0 210 | } 211 | inner_product_param { 212 | num_output: 4096 213 | } 214 | } 215 | layer { 216 | name: "relu6" 217 | type: "ReLU" 218 | bottom: "fc6" 219 | top: "fc6" 220 | } 221 | layer { 222 | name: "drop6" 223 | type: "Dropout" 224 | bottom: "fc6" 225 | top: "fc6" 226 | dropout_param { 227 | dropout_ratio: 0.5 228 | } 229 | } 230 | layer { 231 | name: "fc7" 232 | type: "InnerProduct" 233 | bottom: "fc6" 234 | top: "fc7" 235 | param { 236 | lr_mult: 1 237 | decay_mult: 1 238 | } 239 | param { 240 | lr_mult: 2 241 | decay_mult: 0 242 | } 243 | inner_product_param { 244 | num_output: 1024 245 | } 246 | } 247 | layer { 248 | name: "relu7" 249 | type: "ReLU" 250 | bottom: "fc7" 251 | top: "fc7" 252 | } 253 | layer { 254 | name: "drop7" 255 | type: "Dropout" 256 | bottom: "fc7" 257 | top: "fc7" 258 | dropout_param { 259 | dropout_ratio: 0.5 260 | } 261 | } 262 | layer { 263 | name: "cls_score" 264 | type: "InnerProduct" 265 | bottom: "fc7" 266 | top: "cls_score" 267 | param { 268 | lr_mult: 1 269 | decay_mult: 1 270 | } 271 | param { 272 | lr_mult: 2 273 | decay_mult: 0 274 | } 275 | inner_product_param { 276 | num_output: 21 277 | weight_filler { 278 | type: "gaussian" 279 | std: 0.01 
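# This random Gaussian init only takes effect if the net is instantiated
# without a trained .caffemodel; at test time the learned weights are loaded
# over it.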
280 | } 281 | bias_filler { 282 | type: "constant" 283 | value: 0 284 | } 285 | } 286 | } 287 | layer { 288 | name: "bbox_pred" 289 | type: "InnerProduct" 290 | bottom: "fc7" 291 | top: "bbox_pred" 292 | param { 293 | lr_mult: 1 294 | decay_mult: 1 295 | } 296 | param { 297 | lr_mult: 2 298 | decay_mult: 0 299 | } 300 | inner_product_param { 301 | num_output: 84 302 | weight_filler { 303 | type: "gaussian" 304 | std: 0.001 305 | } 306 | bias_filler { 307 | type: "constant" 308 | value: 0 309 | } 310 | } 311 | } 312 | layer { 313 | name: "cls_prob" 314 | type: "Softmax" 315 | bottom: "cls_score" 316 | top: "cls_prob" 317 | } 318 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/fast_rcnn/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | layer { 3 | name: 'data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'rois' 7 | top: 'labels' 8 | top: 'bbox_targets' 9 | top: 'bbox_inside_weights' 10 | top: 'bbox_outside_weights' 11 | python_param { 12 | module: 'roi_data_layer.layer' 13 | layer: 'RoIDataLayer' 14 | param_str: "'num_classes': 21" 15 | } 16 | } 17 | layer { 18 | name: "conv1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1" 22 | param { lr_mult: 0 decay_mult: 0 } 23 | param { lr_mult: 0 decay_mult: 0 } 24 | convolution_param { 25 | num_output: 96 26 | kernel_size: 7 27 | stride: 2 28 | } 29 | } 30 | layer { 31 | name: "relu1" 32 | type: "ReLU" 33 | bottom: "conv1" 34 | top: "conv1" 35 | } 36 | layer { 37 | name: "norm1" 38 | type: "LRN" 39 | bottom: "conv1" 40 | top: "norm1" 41 | lrn_param { 42 | local_size: 5 43 | alpha: 0.0005 44 | beta: 0.75 45 | k: 2 46 | } 47 | } 48 | layer { 49 | name: "pool1" 50 | type: "Pooling" 51 | bottom: "norm1" 52 | top: "pool1" 53 | pooling_param { 54 | pool: MAX 55 | kernel_size: 3 56 | stride: 2 57 | } 58 | } 59 | layer { 60 | name: "conv2" 61 | type: "Convolution" 62 | bottom: "pool1" 63 | top: "conv2" 64 | param { 65 | lr_mult: 1 66 | } 67 | param { 68 | lr_mult: 2 69 | } 70 | convolution_param { 71 | num_output: 256 72 | pad: 1 73 | kernel_size: 5 74 | stride: 2 75 | } 76 | } 77 | layer { 78 | name: "relu2" 79 | type: "ReLU" 80 | bottom: "conv2" 81 | top: "conv2" 82 | } 83 | layer { 84 | name: "norm2" 85 | type: "LRN" 86 | bottom: "conv2" 87 | top: "norm2" 88 | lrn_param { 89 | local_size: 5 90 | alpha: 0.0005 91 | beta: 0.75 92 | k: 2 93 | } 94 | } 95 | layer { 96 | name: "pool2" 97 | type: "Pooling" 98 | bottom: "norm2" 99 | top: "pool2" 100 | pooling_param { 101 | pool: MAX 102 | kernel_size: 3 103 | stride: 2 104 | } 105 | } 106 | layer { 107 | name: "conv3" 108 | type: "Convolution" 109 | bottom: "pool2" 110 | top: "conv3" 111 | param { 112 | lr_mult: 1 113 | } 114 | param { 115 | lr_mult: 2 116 | } 117 | convolution_param { 118 | num_output: 512 119 | pad: 1 120 | kernel_size: 3 121 | } 122 | } 123 | layer { 124 | name: "relu3" 125 | type: "ReLU" 126 | bottom: "conv3" 127 | top: "conv3" 128 | } 129 | layer { 130 | name: "conv4" 131 | type: "Convolution" 132 | bottom: "conv3" 133 | top: "conv4" 134 | param { 135 | lr_mult: 1 136 | } 137 | param { 138 | lr_mult: 2 139 | } 140 | convolution_param { 141 | num_output: 512 142 | pad: 1 143 | kernel_size: 3 144 | } 145 | } 146 | layer { 147 | name: "relu4" 148 | type: "ReLU" 149 | bottom: "conv4" 150 | top: "conv4" 151 | } 152 | layer { 153 | name: "conv5" 154 | type: "Convolution" 155 | bottom: "conv4" 156 | top: "conv5" 157 | param { 158 | 
lr_mult: 1 159 | } 160 | param { 161 | lr_mult: 2 162 | } 163 | convolution_param { 164 | num_output: 512 165 | pad: 1 166 | kernel_size: 3 167 | } 168 | } 169 | layer { 170 | name: "relu5" 171 | type: "ReLU" 172 | bottom: "conv5" 173 | top: "conv5" 174 | } 175 | layer { 176 | name: "roi_pool5" 177 | type: "ROIPooling" 178 | bottom: "conv5" 179 | bottom: "rois" 180 | top: "pool5" 181 | roi_pooling_param { 182 | pooled_w: 6 183 | pooled_h: 6 184 | spatial_scale: 0.0625 # 1/16 185 | } 186 | } 187 | layer { 188 | name: "fc6" 189 | type: "InnerProduct" 190 | bottom: "pool5" 191 | top: "fc6" 192 | param { 193 | lr_mult: 1 194 | } 195 | param { 196 | lr_mult: 2 197 | } 198 | inner_product_param { 199 | num_output: 4096 200 | } 201 | } 202 | layer { 203 | name: "relu6" 204 | type: "ReLU" 205 | bottom: "fc6" 206 | top: "fc6" 207 | } 208 | layer { 209 | name: "drop6" 210 | type: "Dropout" 211 | bottom: "fc6" 212 | top: "fc6" 213 | dropout_param { 214 | dropout_ratio: 0.5 215 | } 216 | } 217 | layer { 218 | name: "fc7" 219 | type: "InnerProduct" 220 | bottom: "fc6" 221 | top: "fc7" 222 | param { 223 | lr_mult: 1 224 | } 225 | param { 226 | lr_mult: 2 227 | } 228 | inner_product_param { 229 | num_output: 1024 230 | } 231 | } 232 | layer { 233 | name: "relu7" 234 | type: "ReLU" 235 | bottom: "fc7" 236 | top: "fc7" 237 | } 238 | layer { 239 | name: "drop7" 240 | type: "Dropout" 241 | bottom: "fc7" 242 | top: "fc7" 243 | dropout_param { 244 | dropout_ratio: 0.5 245 | } 246 | } 247 | layer { 248 | name: "cls_score" 249 | type: "InnerProduct" 250 | bottom: "fc7" 251 | top: "cls_score" 252 | param { 253 | lr_mult: 1 254 | } 255 | param { 256 | lr_mult: 2 257 | } 258 | inner_product_param { 259 | num_output: 21 260 | weight_filler { 261 | type: "gaussian" 262 | std: 0.01 263 | } 264 | bias_filler { 265 | type: "constant" 266 | value: 0 267 | } 268 | } 269 | } 270 | layer { 271 | name: "bbox_pred" 272 | type: "InnerProduct" 273 | bottom: "fc7" 274 | top: "bbox_pred" 275 | param { 276 | lr_mult: 1 277 | } 278 | param { 279 | lr_mult: 2 280 | } 281 | inner_product_param { 282 | num_output: 84 283 | weight_filler { 284 | type: "gaussian" 285 | std: 0.001 286 | } 287 | bias_filler { 288 | type: "constant" 289 | value: 0 290 | } 291 | } 292 | } 293 | layer { 294 | name: "loss_cls" 295 | type: "SoftmaxWithLoss" 296 | bottom: "cls_score" 297 | bottom: "labels" 298 | top: "loss_cls" 299 | loss_weight: 1 300 | } 301 | layer { 302 | name: "loss_bbox" 303 | type: "SmoothL1Loss" 304 | bottom: "bbox_pred" 305 | bottom: "bbox_targets" 306 | bottom: "bbox_inside_weights" 307 | bottom: "bbox_outside_weights" 308 | top: "loss_bbox" 309 | loss_weight: 1 310 | } 311 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/fast_rcnn_ohem/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG_CNN_M_1024/fast_rcnn_ohem/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | momentum: 0.9 9 | iter_size: 2 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | # We still use the snapshot prefix, though 15 | snapshot_prefix: "vgg_cnn_m_1024_fast_rcnn" 16 | #debug_info: true 17 | -------------------------------------------------------------------------------- /python_utils/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Python Utils 3 | # Copyright (c) 2015 UC Berkeley 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Saurabh Gupta 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /python_utils/_init_paths.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Set up paths for Fast R-CNN.""" 9 | 10 | import os.path as osp 11 | import sys 12 | 13 | def add_path(path): 14 | if path not in sys.path: 15 | sys.path.insert(0, path) 16 | 17 | this_dir = osp.dirname(__file__) 18 | 19 | # Add caffe to PYTHONPATH 20 | caffe_path = osp.join(this_dir, '..', 'caffe-fast-rcnn', 'python') 21 | add_path(caffe_path) 22 | 23 | # Add lib to PYTHONPATH 24 | lib_path = osp.join(this_dir, '..', 'lib') 25 | add_path(lib_path) 26 | -------------------------------------------------------------------------------- /python_utils/do_net_surgery.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) 2015, Saurabh Gupta 3 | # 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # --------------------------------------------------------- 6 | 7 | # For fusing network outputs 8 | import _init_paths 9 | import caffe 10 | import pycaffe_utils 11 | import sys, pprint, argparse 12 | 13 | def parse_args(): 14 | """ 15 | Parse input arguments 16 | """ 17 | parser = argparse.ArgumentParser(description='Network surgery script') 18 | parser.add_argument('--out_net_def', help='prototxt file defining the output network', default=None, type=str) 19 | parser.add_argument('--net_surgery_json', help='json file which defines what blobs to copy from where', default=None, type=str) 20 | parser.add_argument('--out_net_file', help='caffemodel to save the output network to', default=None, type=str) 21 | if len(sys.argv) == 1: 22 | parser.print_help() 23 | sys.exit(1) 24 | args = parser.parse_args() 25 | return args 26 | 27 | if __name__ == '__main__': 28 | args = parse_args() 29 | net = caffe.Net(args.out_net_def, caffe.TEST) 30 | pycaffe_utils.net_surgery(net, args.net_surgery_json) 31 | net.save(args.out_net_file) 32 | -------------------------------------------------------------------------------- /python_utils/evaluate_detection.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) 2015, Saurabh Gupta 3 | # 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # --------------------------------------------------------- 6 | 7 | import utils.cython_bbox 8 | import numpy as np 9 | 10 | def inst_bench_image(dt, gt, bOpts, overlap = None): 11 | 12 | nDt = len(dt['sc']) 13 | nGt = len(gt['diff']) 14 | numInst = np.sum(gt['diff'] == False) 15 | 16 | 17 | if overlap is None: 18 | overlap = utils.cython_bbox.bbox_overlaps(dt['boxInfo'].astype(np.float), gt['boxInfo'].astype(np.float)) 19 | # assert(issorted(-dt.sc), 'Scores are not
8 | import _init_paths 9 | import caffe 10 | import pycaffe_utils 11 | import sys, pprint, argparse 12 | 13 | def parse_args(): 14 | """ 15 | Parse input arguments 16 | """ 17 | parser = argparse.ArgumentParser(description='Network surgery script') 18 | parser.add_argument('--out_net_def', help='prototxt file defining the output network', default=None, type=str) 19 | parser.add_argument('--net_surgery_json', help='json file which defines what blobs to copy from where', default=None, type=str) 20 | parser.add_argument('--out_net_file', help='caffemodel to save the output network to', default=None, type=str) 21 | if len(sys.argv) == 1: 22 | parser.print_help() 23 | sys.exit(1) 24 | args = parser.parse_args() 25 | return args 26 | 27 | if __name__ == '__main__': 28 | args = parse_args() 29 | net = caffe.Net(args.out_net_def, caffe.TEST) 30 | pycaffe_utils.net_surgery(net, args.net_surgery_json) 31 | net.save(args.out_net_file) 32 | -------------------------------------------------------------------------------- /python_utils/evaluate_detection.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) 2015, Saurabh Gupta 3 | # 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # --------------------------------------------------------- 6 | 7 | import utils.cython_bbox 8 | import numpy as np 9 | 10 | def inst_bench_image(dt, gt, bOpts, overlap = None): 11 | 12 | nDt = len(dt['sc']) 13 | nGt = len(gt['diff']) 14 | numInst = np.sum(gt['diff'] == False) 15 | 16 | 17 | if overlap is None: 18 | overlap = utils.cython_bbox.bbox_overlaps(dt['boxInfo'].astype(np.float), gt['boxInfo'].astype(np.float)) 19 | # assert(issorted(-dt.sc), 'Scores are not sorted.\n'); 20 | sc = dt['sc']; 21 | 22 | det = np.zeros((nGt,1)).astype(np.bool) 23 | tp = np.zeros((nDt,1)).astype(np.bool) 24 | fp = np.zeros((nDt,1)).astype(np.bool) 25 | dupDet = np.zeros((nDt,1)).astype(np.bool) 26 | instId = np.zeros((nDt,1)).astype(np.int32) 27 | ov = np.zeros((nDt,1)).astype(np.float32) 28 | 29 | # Walk through the detections in decreasing score 30 | # and assign tp, fp, fn, tn labels 31 | for i in xrange(nDt): 32 | # assign detection to ground truth object if any 33 | if nGt > 0: 34 | maxOverlap = overlap[i,:].max(); maxInd = overlap[i,:].argmax(); 35 | instId[i] = maxInd; ov[i] = maxOverlap; 36 | else: 37 | maxOverlap = 0; instId[i] = -1; maxInd = -1; 38 | # assign detection as true positive/don't care/false positive 39 | if maxOverlap >= bOpts['minoverlap']: 40 | if gt['diff'][maxInd] == False: 41 | if det[maxInd] == False: 42 | # true positive 43 | tp[i] = True; 44 | det[maxInd] = True; 45 | else: 46 | # false positive (multiple detection) 47 | fp[i] = True; 48 | dupDet[i] = True; 49 | else: 50 | # false positive 51 | fp[i] = True; 52 | return tp, fp, sc, numInst, dupDet, instId, ov 53 | 54 | 55 | def inst_bench(dt, gt, bOpts, tp = None, fp = None, sc = None, numInst = None): 56 | """ 57 | ap, rec, prec, npos, details = inst_bench(dt, gt, bOpts, tp = None, fp = None, sc = None, numInst = None) 58 | dt - a list with a dict for each image and with following fields 59 | .boxInfo - info that will be used to compute the overlap with ground truths, a list 60 | .sc - score 61 | gt 62 | .boxInfo - info used to compute the overlap, a list 63 | .diff - a logical array of size nGtx1, saying if the instance is hard or not 64 | bOpts 65 | .minoverlap - the minimum overlap to call it a true positive 66 | [tp], [fp], [sc], [numInst] 67 | Optional arguments, in case inst_bench_image is being called outside of this function 68 | """ 69 | if tp is None: 70 | # We do not have the tp, fp, sc, and numInst, so compute them from the structures gt, and out 71 | tp = []; fp = []; numInst = []; score = []; dupDet = []; instId = []; ov = []; 72 | for i in range(len(gt)): 73 | # Sort dt by the score 74 | sc = dt[i]['sc'] 75 | bb = dt[i]['boxInfo'] 76 | ind = np.argsort(sc, axis = 0); 77 | ind = ind[::-1] 78 | if len(ind) > 0: 79 | sc = np.vstack((sc[i,:] for i in ind)) 80 | bb = np.vstack((bb[i,:] for i in ind)) 81 | else: 82 | sc = np.zeros((0,1)).astype(np.float) 83 | bb = np.zeros((0,4)).astype(np.float) 84 | 85 | dtI = dict({'boxInfo': bb, 'sc': sc}) 86 | tp_i, fp_i, sc_i, numInst_i, dupDet_i, instId_i, ov_i = inst_bench_image(dtI, gt[i], bOpts) 87 | tp.append(tp_i); fp.append(fp_i); score.append(sc_i); numInst.append(numInst_i); 88 | dupDet.append(dupDet_i); instId.append(instId_i); ov.append(ov_i); 89 | details = {'tp': list(tp), 'fp': list(fp), 'score': list(score), 'dupDet': list(dupDet), 90 | 'numInst': list(numInst), 'instId': list(instId), 'ov': list(ov)} 91 | 92 | tp = np.vstack(tp[:]) 93 | fp = np.vstack(fp[:]) 94 | sc = np.vstack(score[:]) 95 | 96 | cat_all = np.hstack((tp,fp,sc)) 97 | ind = np.argsort(cat_all[:,2]) 98 | cat_all = cat_all[ind[::-1],:] 99 | tp = np.cumsum(cat_all[:,0], axis = 0); 100 | fp = np.cumsum(cat_all[:,1], axis = 0); 101 | thresh = cat_all[:,2]; 102 | npos = np.sum(numInst, axis = 0); 103 | 104 | # Compute precision/recall 105 | rec = tp / npos; 106 | prec = np.divide(tp, (fp+tp)); 107 | ap = VOCap(rec, prec); 108 | return ap, rec, prec, npos, details 109 |
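# VOCap below computes PASCAL VOC style average precision: the precision curve is first made monotonically non-increasing by taking its upper envelope, and AP is then accumulated as the area under the precision/recall curve at the points where recall changes.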
110 | def VOCap(rec, prec): 111 | rec = rec.reshape(rec.size,1); prec = prec.reshape(prec.size,1) 112 | z = np.zeros((1,1)); o = np.ones((1,1)); 113 | mrec = np.vstack((z, rec, o)) 114 | mpre = np.vstack((z, prec, z)) 115 | for i in range(len(mpre)-2, -1, -1): 116 | mpre[i] = max(mpre[i], mpre[i+1]) 117 | 118 | I = np.where(mrec[1:] != mrec[0:-1])[0]+1; 119 | ap = 0; 120 | for i in I: 121 | ap = ap + (mrec[i] - mrec[i-1])*mpre[i]; 122 | return ap 123 | -------------------------------------------------------------------------------- /python_utils/general_utils.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) 2015, Saurabh Gupta 3 | # 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # --------------------------------------------------------- 6 | 7 | import numpy as np 8 | import cPickle 9 | import os 10 | from IPython.core.debugger import Tracer 11 | import scipy.io as scio 12 | import time 13 | 14 | 15 | def tic_toc_print(interval, string): 16 | global tic_toc_print_time_old 17 | if 'tic_toc_print_time_old' not in globals(): 18 | tic_toc_print_time_old = time.time() 19 | print string 20 | else: 21 | new_time = time.time() 22 | if new_time - tic_toc_print_time_old > interval: 23 | tic_toc_print_time_old = new_time; 24 | print string 25 | 26 | def mkdir_if_missing(output_dir): 27 | """ 28 | def mkdir_if_missing(output_dir) 29 | """ 30 | if not os.path.exists(output_dir): 31 | os.makedirs(output_dir) 32 | 33 | def sigmoid(x): 34 | """ 35 | def sigmoid(x) 36 | """ 37 | y = x.copy().astype(np.float32) 38 | ind = np.where(x > 0)[0] 39 | y[ind] = 1/(1 + np.exp(-x[ind])) 40 | ind = np.where(x <= 0)[0] 41 | y[ind] = np.exp(x[ind])/(np.exp(x[ind]) + 1) 42 | return y 43 | 44 | def save_variables(pickle_file_name, var, info, overwrite = False): 45 | """ 46 | def save_variables(pickle_file_name, var, info, overwrite = False) 47 | """ 48 | if os.path.exists(pickle_file_name) and overwrite == False: 49 | raise Exception('{:s} exists and overwrite is False.'.format(pickle_file_name)) 50 | # Construct the dictionary 51 | assert(type(var) == list); assert(type(info) == list); 52 | d = {} 53 | for i in xrange(len(var)): 54 | d[info[i]] = var[i] 55 | with open(pickle_file_name, 'wb') as f: 56 | cPickle.dump(d, f, cPickle.HIGHEST_PROTOCOL) 57 | 58 | def load_variables(pickle_file_name): 59 | """ 60 | d = load_variables(pickle_file_name) 61 | Output: 62 | d is a dictionary of variables stored in the pickle file.
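Example (illustrative; the file and key names are hypothetical): save_variables('stats.pkl', [ap, rec], ['ap', 'rec'], overwrite = True) stores the two arrays, and d = load_variables('stats.pkl') returns them as d['ap'] and d['rec'].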
63 | """ 64 | if os.path.exists(pickle_file_name): 65 | with open(pickle_file_name, 'rb') as f: 66 | d = cPickle.load(f) 67 | return d 68 | else: 69 | raise Exception('{:s} does not exists.'.format(pickle_file_name)) 70 | -------------------------------------------------------------------------------- /python_utils/pycaffe_utils.py: -------------------------------------------------------------------------------- 1 | # --------------------------------------------------------- 2 | # Copyright (c) 2015, Saurabh Gupta 3 | # 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # --------------------------------------------------------- 6 | 7 | 8 | import caffe, yaml 9 | 10 | def net_surgery(net, json_file_or_dict): 11 | # Load the JSON file 12 | if isinstance(json_file_or_dict, str): 13 | with open(json_file_or_dict, 'rt') as f: 14 | source_description = yaml.load(f) 15 | else: 16 | source_description = json_file_or_dict 17 | # Find a list of blobs in the target net 18 | target_blobs = net.params.keys() 19 | target_blobs = dict(zip(target_blobs, [0]*len(target_blobs))) 20 | 21 | # For each item in the json file load the network and copy the layers 22 | for src_desc in source_description: 23 | net_source = caffe.Net(src_desc['prototxt'], src_desc['model'], caffe.TEST) 24 | for j in xrange(len(src_desc['copy_ops']['dest'])): 25 | dest_name = src_desc['copy_ops']['dest'][j] 26 | 27 | assert dest_name in target_blobs, \ 28 | 'Destination name {} not in target network blobs'.format(dest_name) 29 | 30 | src_name = src_desc['copy_ops']['source'][j] 31 | assert src_name in net_source.params.keys(), \ 32 | 'Source name {} not in source network blobs'.format(src_name) 33 | 34 | allow_different_shape = src_desc['copy_ops']['reshape'][j] 35 | 36 | if target_blobs[dest_name] is not 0: 37 | print 'Target blob {} is being reassigned'.format(dest_name) 38 | target_blobs[dest_name] = target_blobs[dest_name] + 1 39 | 40 | assert(len(net.params[dest_name]) == \ 41 | len(net_source.params[src_name])), \ 42 | 'Number of blobs in {} in source do not match number of blobs in {} in destination'\ 43 | .format(src_name, dest_name) 44 | 45 | for k in xrange(len(net.params[dest_name])): 46 | src = net_source.params[src_name][k] 47 | dest = net.params[dest_name][k] 48 | if allow_different_shape: 49 | assert(src.count == dest.count), \ 50 | 'Count of blobs in {}[{:d}] in source do not match count of blobs in {}[{:d}] in destination'\ 51 | .format(src_name, k, dest_name, k) 52 | dest.data[...] = src.data.reshape(dest.data.shape) 53 | else: 54 | src_shape = src.data.shape 55 | dest_shape = dest.data.shape 56 | assert(src_shape == dest_shape), \ 57 | 'Shape of blobs in {}[{:d}] {} in source do not match shape of blobs in {}[{:d}] {} in destination'\ 58 | .format(src_name, k, str(src_shape), dest_name, k, str(dest_shape)) 59 | dest.data[...] = src.data 60 | 61 | unusual = [x for x in target_blobs.keys() if target_blobs[x] is not 1] 62 | for x in unusual: 63 | print 'Parameter blob {} copied {:d} times.'.format(x, target_blobs[x]) 64 | 65 | return target_blobs 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /tools/README.md: -------------------------------------------------------------------------------- 1 | Tools for training, testing, and compressing Fast R-CNN networks. 
2 | -------------------------------------------------------------------------------- /tools/_init_paths.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Set up paths for Fast R-CNN.""" 9 | 10 | import os.path as osp 11 | import sys 12 | 13 | def add_path(path): 14 | if path not in sys.path: 15 | sys.path.insert(0, path) 16 | 17 | this_dir = osp.dirname(__file__) 18 | 19 | # Add caffe to PYTHONPATH 20 | caffe_path = osp.join(this_dir, '..', 'caffe-fast-rcnn', 'python') 21 | add_path(caffe_path) 22 | 23 | # Add lib to PYTHONPATH 24 | lib_path = osp.join(this_dir, '..', 'lib') 25 | add_path(lib_path) 26 | -------------------------------------------------------------------------------- /tools/compress_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -------------------------------------------------------- 4 | # Fast R-CNN 5 | # Copyright (c) 2015 Microsoft 6 | # Licensed under The MIT License [see LICENSE for details] 7 | # Written by Ross Girshick 8 | # -------------------------------------------------------- 9 | 10 | """Compress a Fast R-CNN network using truncated SVD.""" 11 | 12 | import _init_paths 13 | import caffe 14 | import argparse 15 | import numpy as np 16 | import os, sys 17 | 18 | def parse_args(): 19 | """Parse input arguments.""" 20 | parser = argparse.ArgumentParser(description='Compress a Fast R-CNN network') 21 | parser.add_argument('--def', dest='prototxt', 22 | help='prototxt file defining the uncompressed network', 23 | default=None, type=str) 24 | parser.add_argument('--def-svd', dest='prototxt_svd', 25 | help='prototxt file defining the SVD compressed network', 26 | default=None, type=str) 27 | parser.add_argument('--net', dest='caffemodel', 28 | help='model to compress', 29 | default=None, type=str) 30 | 31 | if len(sys.argv) == 1: 32 | parser.print_help() 33 | sys.exit(1) 34 | 35 | args = parser.parse_args() 36 | return args 37 | 38 | def compress_weights(W, l): 39 | """Compress the weight matrix W of an inner product (fully connected) layer 40 | using truncated SVD. 41 | 42 | Parameters: 43 | W: N x M weights matrix 44 | l: number of singular values to retain 45 | 46 | Returns: 47 | Ul, L: matrices such that W \approx Ul*L 48 | """ 49 | 50 | # numpy doesn't seem to have a fast truncated SVD algorithm... 
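# (for reference, scipy.sparse.linalg.svds(W, k=l) computes only the leading l singular triplets and would avoid the full decomposition, if adding a SciPy dependency were acceptable; note it returns the values in ascending order)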
51 | # this could be faster 52 | U, s, V = np.linalg.svd(W, full_matrices=False) 53 | 54 | Ul = U[:, :l] 55 | sl = s[:l] 56 | Vl = V[:l, :] 57 | 58 | L = np.dot(np.diag(sl), Vl) 59 | return Ul, L 60 | 61 | def main(): 62 | args = parse_args() 63 | 64 | # prototxt = 'models/VGG16/test.prototxt' 65 | # caffemodel = 'snapshots/vgg16_fast_rcnn_iter_40000.caffemodel' 66 | net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST) 67 | 68 | # prototxt_svd = 'models/VGG16/svd/test_fc6_fc7.prototxt' 69 | # caffemodel = 'snapshots/vgg16_fast_rcnn_iter_40000.caffemodel' 70 | net_svd = caffe.Net(args.prototxt_svd, args.caffemodel, caffe.TEST) 71 | 72 | print('Uncompressed network {} : {}'.format(args.prototxt, args.caffemodel)) 73 | print('Compressed network prototxt {}'.format(args.prototxt_svd)) 74 | 75 | out = os.path.splitext(os.path.basename(args.caffemodel))[0] + '_svd' 76 | out_dir = os.path.dirname(args.caffemodel) 77 | 78 | # Compress fc6 79 | if net_svd.params.has_key('fc6_L'): 80 | l_fc6 = net_svd.params['fc6_L'][0].data.shape[0] 81 | print(' fc6_L bottleneck size: {}'.format(l_fc6)) 82 | 83 | # uncompressed weights and biases 84 | W_fc6 = net.params['fc6'][0].data 85 | B_fc6 = net.params['fc6'][1].data 86 | 87 | print(' compressing fc6...') 88 | Ul_fc6, L_fc6 = compress_weights(W_fc6, l_fc6) 89 | 90 | assert(len(net_svd.params['fc6_L']) == 1) 91 | 92 | # install compressed matrix factors (and original biases) 93 | net_svd.params['fc6_L'][0].data[...] = L_fc6 94 | 95 | net_svd.params['fc6_U'][0].data[...] = Ul_fc6 96 | net_svd.params['fc6_U'][1].data[...] = B_fc6 97 | 98 | out += '_fc6_{}'.format(l_fc6) 99 | 100 | # Compress fc7 101 | if net_svd.params.has_key('fc7_L'): 102 | l_fc7 = net_svd.params['fc7_L'][0].data.shape[0] 103 | print ' fc7_L bottleneck size: {}'.format(l_fc7) 104 | 105 | W_fc7 = net.params['fc7'][0].data 106 | B_fc7 = net.params['fc7'][1].data 107 | 108 | print(' compressing fc7...') 109 | Ul_fc7, L_fc7 = compress_weights(W_fc7, l_fc7) 110 | 111 | assert(len(net_svd.params['fc7_L']) == 1) 112 | 113 | net_svd.params['fc7_L'][0].data[...] = L_fc7 114 | 115 | net_svd.params['fc7_U'][0].data[...] = Ul_fc7 116 | net_svd.params['fc7_U'][1].data[...] = B_fc7 117 | 118 | out += '_fc7_{}'.format(l_fc7) 119 | 120 | filename = '{}/{}.caffemodel'.format(out_dir, out) 121 | net_svd.save(filename) 122 | print 'Wrote svd model to: {:s}'.format(filename) 123 | 124 | if __name__ == '__main__': 125 | main() 126 | -------------------------------------------------------------------------------- /tools/demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -------------------------------------------------------- 4 | # Faster R-CNN 5 | # Copyright (c) 2015 Microsoft 6 | # Licensed under The MIT License [see LICENSE for details] 7 | # Written by Ross Girshick 8 | # -------------------------------------------------------- 9 | 10 | """ 11 | Demo script showing detections in sample images. 12 | 13 | See README.md for installation instructions before running. 
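Example (illustrative): ./tools/demo.py --gpu 0 --net vgg16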
14 | """ 15 | 16 | import _init_paths 17 | from fast_rcnn.config import cfg 18 | from fast_rcnn.test import im_detect 19 | from fast_rcnn.nms_wrapper import nms 20 | from utils.timer import Timer 21 | import matplotlib.pyplot as plt 22 | import numpy as np 23 | import scipy.io as sio 24 | import caffe, os, sys, cv2 25 | import argparse 26 | 27 | CLASSES = ('__background__', 28 | 'aeroplane', 'bicycle', 'bird', 'boat', 29 | 'bottle', 'bus', 'car', 'cat', 'chair', 30 | 'cow', 'diningtable', 'dog', 'horse', 31 | 'motorbike', 'person', 'pottedplant', 32 | 'sheep', 'sofa', 'train', 'tvmonitor') 33 | 34 | NETS = {'vgg16': ('VGG16', 35 | 'VGG16_faster_rcnn_final.caffemodel'), 36 | 'zf': ('ZF', 37 | 'ZF_faster_rcnn_final.caffemodel')} 38 | 39 | 40 | def vis_detections(im, class_name, dets, thresh=0.5): 41 | """Draw detected bounding boxes.""" 42 | inds = np.where(dets[:, -1] >= thresh)[0] 43 | if len(inds) == 0: 44 | return 45 | 46 | im = im[:, :, (2, 1, 0)] 47 | fig, ax = plt.subplots(figsize=(12, 12)) 48 | ax.imshow(im, aspect='equal') 49 | for i in inds: 50 | bbox = dets[i, :4] 51 | score = dets[i, -1] 52 | 53 | ax.add_patch( 54 | plt.Rectangle((bbox[0], bbox[1]), 55 | bbox[2] - bbox[0], 56 | bbox[3] - bbox[1], fill=False, 57 | edgecolor='red', linewidth=3.5) 58 | ) 59 | ax.text(bbox[0], bbox[1] - 2, 60 | '{:s} {:.3f}'.format(class_name, score), 61 | bbox=dict(facecolor='blue', alpha=0.5), 62 | fontsize=14, color='white') 63 | 64 | ax.set_title(('{} detections with ' 65 | 'p({} | box) >= {:.1f}').format(class_name, class_name, 66 | thresh), 67 | fontsize=14) 68 | plt.axis('off') 69 | plt.tight_layout() 70 | plt.draw() 71 | 72 | def demo(net, image_name): 73 | """Detect object classes in an image using pre-computed object proposals.""" 74 | 75 | # Load the demo image 76 | im_file = os.path.join(cfg.DATA_DIR, 'demo', image_name) 77 | im = cv2.imread(im_file) 78 | 79 | # Detect all object classes and regress object bounds 80 | timer = Timer() 81 | timer.tic() 82 | scores, boxes = im_detect(net, im) 83 | timer.toc() 84 | print ('Detection took {:.3f}s for ' 85 | '{:d} object proposals').format(timer.total_time, boxes.shape[0]) 86 | 87 | # Visualize detections for each class 88 | CONF_THRESH = 0.8 89 | NMS_THRESH = 0.3 90 | for cls_ind, cls in enumerate(CLASSES[1:]): 91 | cls_ind += 1 # because we skipped background 92 | cls_boxes = boxes[:, 4*cls_ind:4*(cls_ind + 1)] 93 | cls_scores = scores[:, cls_ind] 94 | dets = np.hstack((cls_boxes, 95 | cls_scores[:, np.newaxis])).astype(np.float32) 96 | keep = nms(dets, NMS_THRESH) 97 | dets = dets[keep, :] 98 | vis_detections(im, cls, dets, thresh=CONF_THRESH) 99 | 100 | def parse_args(): 101 | """Parse input arguments.""" 102 | parser = argparse.ArgumentParser(description='Faster R-CNN demo') 103 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 104 | default=0, type=int) 105 | parser.add_argument('--cpu', dest='cpu_mode', 106 | help='Use CPU mode (overrides --gpu)', 107 | action='store_true') 108 | parser.add_argument('--net', dest='demo_net', help='Network to use [vgg16]', 109 | choices=NETS.keys(), default='vgg16') 110 | 111 | args = parser.parse_args() 112 | 113 | return args 114 | 115 | if __name__ == '__main__': 116 | cfg.TEST.HAS_RPN = True # Use RPN for proposals 117 | 118 | args = parse_args() 119 | 120 | prototxt = os.path.join(cfg.MODELS_DIR, NETS[args.demo_net][0], 121 | 'faster_rcnn_alt_opt', 'faster_rcnn_test.pt') 122 | caffemodel = os.path.join(cfg.DATA_DIR, 'faster_rcnn_models', 123 | NETS[args.demo_net][1]) 124 
| 125 | if not os.path.isfile(caffemodel): 126 | raise IOError(('{:s} not found.\nDid you run ./data/scripts/' 127 | 'fetch_faster_rcnn_models.sh?').format(caffemodel)) 128 | 129 | if args.cpu_mode: 130 | caffe.set_mode_cpu() 131 | else: 132 | caffe.set_mode_gpu() 133 | caffe.set_device(args.gpu_id) 134 | cfg.GPU_ID = args.gpu_id 135 | net = caffe.Net(prototxt, caffemodel, caffe.TEST) 136 | 137 | print '\n\nLoaded network {:s}'.format(caffemodel) 138 | 139 | # Warmup on a dummy image 140 | im = 128 * np.ones((300, 500, 3), dtype=np.uint8) 141 | for i in xrange(2): 142 | _, _= im_detect(net, im) 143 | 144 | im_names = ['000456.jpg', '000542.jpg', '001150.jpg', 145 | '001763.jpg', '004545.jpg'] 146 | for im_name in im_names: 147 | print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' 148 | print 'Demo for data/demo/{}'.format(im_name) 149 | demo(net, im_name) 150 | 151 | plt.show() 152 | -------------------------------------------------------------------------------- /tools/eval_recall.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import _init_paths 4 | from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list 5 | from datasets.factory import get_imdb 6 | import argparse 7 | import time, os, sys 8 | import numpy as np 9 | 10 | def parse_args(): 11 | """ 12 | Parse input arguments 13 | """ 14 | parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') 15 | parser.add_argument('--imdb', dest='imdb_name', 16 | help='dataset to test', 17 | default='voc_2007_test', type=str) 18 | parser.add_argument('--method', dest='method', 19 | help='proposal method', 20 | default='selective_search', type=str) 21 | parser.add_argument('--rpn-file', dest='rpn_file', 22 | default=None, type=str) 23 | 24 | if len(sys.argv) == 1: 25 | parser.print_help() 26 | sys.exit(1) 27 | 28 | args = parser.parse_args() 29 | return args 30 | 31 | if __name__ == '__main__': 32 | args = parse_args() 33 | 34 | print('Called with args:') 35 | print(args) 36 | 37 | imdb = get_imdb(args.imdb_name) 38 | imdb.set_proposal_method(args.method) 39 | if args.rpn_file is not None: 40 | imdb.config['rpn_file'] = args.rpn_file 41 | 42 | candidate_boxes = None 43 | if 0: 44 | import scipy.io as sio 45 | filename = 'debug/stage1_rpn_voc_2007_test.mat' 46 | raw_data = sio.loadmat(filename)['aboxes'].ravel() 47 | candidate_boxes = raw_data 48 | 49 | ar, gt_overlaps, recalls, thresholds = \ 50 | imdb.evaluate_recall(candidate_boxes=candidate_boxes) 51 | print 'Method: {}'.format(args.method) 52 | print 'AverageRec: {:.3f}'.format(ar) 53 | 54 | def recall_at(t): 55 | ind = np.where(thresholds > t - 1e-5)[0][0] 56 | assert np.isclose(thresholds[ind], t) 57 | return recalls[ind] 58 | 59 | print 'Recall@0.5: {:.3f}'.format(recall_at(0.5)) 60 | print 'Recall@0.6: {:.3f}'.format(recall_at(0.6)) 61 | print 'Recall@0.7: {:.3f}'.format(recall_at(0.7)) 62 | print 'Recall@0.8: {:.3f}'.format(recall_at(0.8)) 63 | print 'Recall@0.9: {:.3f}'.format(recall_at(0.9)) 64 | # print again for easy spreadsheet copying 65 | print '{:.3f}'.format(ar) 66 | print '{:.3f}'.format(recall_at(0.5)) 67 | print '{:.3f}'.format(recall_at(0.6)) 68 | print '{:.3f}'.format(recall_at(0.7)) 69 | print '{:.3f}'.format(recall_at(0.8)) 70 | print '{:.3f}'.format(recall_at(0.9)) 71 | -------------------------------------------------------------------------------- /tools/reval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 
-------------------------------------------------------- 4 | # Fast R-CNN 5 | # Copyright (c) 2015 Microsoft 6 | # Licensed under The MIT License [see LICENSE for details] 7 | # Written by Ross Girshick 8 | # -------------------------------------------------------- 9 | 10 | """Reval = re-eval. Re-evaluate saved detections.""" 11 | 12 | import _init_paths 13 | from fast_rcnn.test import apply_nms 14 | from fast_rcnn.config import cfg 15 | from datasets.factory import get_imdb 16 | import cPickle 17 | import os, sys, argparse 18 | import numpy as np 19 | 20 | def parse_args(): 21 | """ 22 | Parse input arguments 23 | """ 24 | parser = argparse.ArgumentParser(description='Re-evaluate results') 25 | parser.add_argument('output_dir', nargs=1, help='results directory', 26 | type=str) 27 | parser.add_argument('--imdb', dest='imdb_name', 28 | help='dataset to re-evaluate', 29 | default='voc_2007_test', type=str) 30 | parser.add_argument('--matlab', dest='matlab_eval', 31 | help='use matlab for evaluation', 32 | action='store_true') 33 | parser.add_argument('--comp', dest='comp_mode', help='competition mode', 34 | action='store_true') 35 | parser.add_argument('--nms', dest='apply_nms', help='apply nms', 36 | action='store_true') 37 | 38 | if len(sys.argv) == 1: 39 | parser.print_help() 40 | sys.exit(1) 41 | 42 | args = parser.parse_args() 43 | return args 44 | 45 | def from_dets(imdb_name, output_dir, args): 46 | imdb = get_imdb(imdb_name) 47 | imdb.competition_mode(args.comp_mode) 48 | imdb.config['matlab_eval'] = args.matlab_eval 49 | with open(os.path.join(output_dir, 'detections.pkl'), 'rb') as f: 50 | dets = cPickle.load(f) 51 | 52 | if args.apply_nms: 53 | print 'Applying NMS to all detections' 54 | nms_dets = apply_nms(dets, cfg.TEST.NMS) 55 | else: 56 | nms_dets = dets 57 | 58 | print 'Evaluating detections' 59 | imdb.evaluate_detections(nms_dets, output_dir) 60 | 61 | if __name__ == '__main__': 62 | args = parse_args() 63 | 64 | output_dir = os.path.abspath(args.output_dir[0]) 65 | imdb_name = args.imdb_name 66 | from_dets(imdb_name, output_dir, args) 67 | -------------------------------------------------------------------------------- /tools/rpn_generate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -------------------------------------------------------- 4 | # Fast/er/ R-CNN 5 | # Copyright (c) 2015 Microsoft 6 | # Licensed under The MIT License [see LICENSE for details] 7 | # Written by Ross Girshick 8 | # -------------------------------------------------------- 9 | 10 | """Generate RPN proposals.""" 11 | 12 | import _init_paths 13 | import numpy as np 14 | from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list, get_output_dir 15 | from datasets.factory import get_imdb 16 | from rpn.generate import imdb_proposals 17 | import cPickle 18 | import caffe 19 | import argparse 20 | import pprint 21 | import time, os, sys 22 | 23 | def parse_args(): 24 | """ 25 | Parse input arguments 26 | """ 27 | parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') 28 | parser.add_argument('--gpu', dest='gpu_id', help='GPU id to use', 29 | default=0, type=int) 30 | parser.add_argument('--def', dest='prototxt', 31 | help='prototxt file defining the network', 32 | default=None, type=str) 33 | parser.add_argument('--net', dest='caffemodel', 34 | help='model to test', 35 | default=None, type=str) 36 | parser.add_argument('--cfg', dest='cfg_file', 37 | help='optional config file', default=None, type=str) 38 | 
parser.add_argument('--wait', dest='wait', 39 | help='wait until net file exists', 40 | default=True, type=bool) 41 | parser.add_argument('--imdb', dest='imdb_name', 42 | help='dataset to test', 43 | default='voc_2007_test', type=str) 44 | parser.add_argument('--set', dest='set_cfgs', 45 | help='set config keys', default=None, 46 | nargs=argparse.REMAINDER) 47 | 48 | if len(sys.argv) == 1: 49 | parser.print_help() 50 | sys.exit(1) 51 | 52 | args = parser.parse_args() 53 | return args 54 | 55 | if __name__ == '__main__': 56 | args = parse_args() 57 | 58 | print('Called with args:') 59 | print(args) 60 | 61 | if args.cfg_file is not None: 62 | cfg_from_file(args.cfg_file) 63 | if args.set_cfgs is not None: 64 | cfg_from_list(args.set_cfgs) 65 | 66 | cfg.GPU_ID = args.gpu_id 67 | 68 | # RPN test settings 69 | cfg.TEST.RPN_PRE_NMS_TOP_N = -1 70 | cfg.TEST.RPN_POST_NMS_TOP_N = 2000 71 | 72 | print('Using config:') 73 | pprint.pprint(cfg) 74 | 75 | while not os.path.exists(args.caffemodel) and args.wait: 76 | print('Waiting for {} to exist...'.format(args.caffemodel)) 77 | time.sleep(10) 78 | 79 | caffe.set_mode_gpu() 80 | caffe.set_device(args.gpu_id) 81 | net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST) 82 | net.name = os.path.splitext(os.path.basename(args.caffemodel))[0] 83 | 84 | imdb = get_imdb(args.imdb_name) 85 | imdb_boxes = imdb_proposals(net, imdb) 86 | 87 | output_dir = get_output_dir(imdb, net) 88 | rpn_file = os.path.join(output_dir, net.name + '_rpn_proposals.pkl') 89 | with open(rpn_file, 'wb') as f: 90 | cPickle.dump(imdb_boxes, f, cPickle.HIGHEST_PROTOCOL) 91 | print 'Wrote RPN proposals to {}'.format(rpn_file) 92 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -------------------------------------------------------- 4 | # Fast R-CNN with OHEM 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Written by Ross Girshick and Abhinav Shrivastava 7 | # -------------------------------------------------------- 8 | 9 | """Test a Fast R-CNN network on an image database.""" 10 | 11 | import _init_paths 12 | from fast_rcnn.test import test_net 13 | from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list 14 | from datasets.factory import get_imdb 15 | import caffe 16 | import argparse 17 | import pprint 18 | import time, os, sys 19 | 20 | def parse_args(): 21 | """ 22 | Parse input arguments 23 | """ 24 | parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') 25 | parser.add_argument('--gpu', dest='gpu_id', help='GPU id to use', 26 | default=0, type=int) 27 | parser.add_argument('--def', dest='prototxt', 28 | help='prototxt file defining the network', 29 | default=None, type=str) 30 | parser.add_argument('--net', dest='caffemodel', 31 | help='model to test', 32 | default=None, type=str) 33 | parser.add_argument('--cfg', dest='cfg_file', 34 | help='optional config file', default=None, type=str) 35 | parser.add_argument('--wait', dest='wait', 36 | help='wait until net file exists', 37 | default=True, type=bool) 38 | parser.add_argument('--imdb', dest='imdb_name', 39 | help='dataset to test', 40 | default='voc_2007_test', type=str) 41 | parser.add_argument('--comp', dest='comp_mode', help='competition mode', 42 | action='store_true') 43 | parser.add_argument('--set', dest='set_cfgs', 44 | help='set config keys', default=None, 45 | nargs=argparse.REMAINDER) 46 | 
parser.add_argument('--vis', dest='vis', help='visualize detections', 47 | action='store_true') 48 | parser.add_argument('--num_dets', dest='max_per_image', 49 | help='max number of detections per image', 50 | default=100, type=int) 51 | parser.add_argument('--det_thresh', dest='det_thresh', 52 | help='detection score threshold', 53 | default=0.05, type=float) 54 | 55 | if len(sys.argv) == 1: 56 | parser.print_help() 57 | sys.exit(1) 58 | 59 | args = parser.parse_args() 60 | return args 61 | 62 | if __name__ == '__main__': 63 | args = parse_args() 64 | 65 | print('Called with args:') 66 | print(args) 67 | 68 | if args.cfg_file is not None: 69 | cfg_from_file(args.cfg_file) 70 | if args.set_cfgs is not None: 71 | cfg_from_list(args.set_cfgs) 72 | 73 | cfg.GPU_ID = args.gpu_id 74 | 75 | print('Using config:') 76 | pprint.pprint(cfg) 77 | 78 | while not os.path.exists(args.caffemodel) and args.wait: 79 | print('Waiting for {} to exist...'.format(args.caffemodel)) 80 | time.sleep(10) 81 | 82 | caffe.set_mode_gpu() 83 | caffe.set_device(args.gpu_id) 84 | net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST) 85 | net.name = os.path.splitext(os.path.basename(args.caffemodel))[0] 86 | 87 | imdb = get_imdb(args.imdb_name) 88 | imdb.competition_mode(args.comp_mode) 89 | if not cfg.TEST.HAS_RPN: 90 | imdb.set_proposal_method(cfg.TEST.PROPOSAL_METHOD) 91 | 92 | test_net(net, imdb, max_per_image=args.max_per_image, vis=args.vis, thresh=args.det_thresh) 93 | -------------------------------------------------------------------------------- /tools/train_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -------------------------------------------------------- 4 | # Fast R-CNN 5 | # Copyright (c) 2015 Microsoft 6 | # Licensed under The MIT License [see LICENSE for details] 7 | # Written by Ross Girshick 8 | # -------------------------------------------------------- 9 | 10 | """Train a Fast R-CNN network on a region of interest database.""" 11 | 12 | import _init_paths 13 | from fast_rcnn.train import get_training_roidb, train_net 14 | from fast_rcnn.config import cfg, cfg_from_file, cfg_from_list, get_output_dir 15 | from datasets.factory import get_imdb 16 | import datasets.imdb 17 | import caffe 18 | import argparse 19 | import pprint 20 | import numpy as np 21 | import sys 22 | 23 | def parse_args(): 24 | """ 25 | Parse input arguments 26 | """ 27 | parser = argparse.ArgumentParser(description='Train a Fast R-CNN network') 28 | parser.add_argument('--gpu', dest='gpu_id', 29 | help='GPU device id to use [0]', 30 | default=0, type=int) 31 | parser.add_argument('--solver', dest='solver', 32 | help='solver prototxt', 33 | default=None, type=str) 34 | parser.add_argument('--iters', dest='max_iters', 35 | help='number of iterations to train', 36 | default=40000, type=int) 37 | parser.add_argument('--weights', dest='pretrained_model', 38 | help='initialize with pretrained model weights', 39 | default=None, type=str) 40 | parser.add_argument('--cfg', dest='cfg_file', 41 | help='optional config file', 42 | default=None, type=str) 43 | parser.add_argument('--imdb', dest='imdb_name', 44 | help='dataset to train on', 45 | default='voc_2007_trainval', type=str) 46 | parser.add_argument('--rand', dest='randomize', 47 | help='randomize (do not use a fixed seed)', 48 | action='store_true') 49 | parser.add_argument('--set', dest='set_cfgs', 50 | help='set config keys', default=None, 51 | nargs=argparse.REMAINDER) 52 | 53 | if len(sys.argv) == 
1: 54 | parser.print_help() 55 | sys.exit(1) 56 | 57 | args = parser.parse_args() 58 | return args 59 | 60 | def combined_roidb(imdb_names): 61 | def get_roidb(imdb_name): 62 | imdb = get_imdb(imdb_name) 63 | print 'Loaded dataset `{:s}` for training'.format(imdb.name) 64 | imdb.set_proposal_method(cfg.TRAIN.PROPOSAL_METHOD) 65 | print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD) 66 | roidb = get_training_roidb(imdb) 67 | return roidb 68 | 69 | roidbs = [get_roidb(s) for s in imdb_names.split('+')] 70 | roidb = roidbs[0] 71 | if len(roidbs) > 1: 72 | for r in roidbs[1:]: 73 | roidb.extend(r) 74 | imdb = datasets.imdb.imdb(imdb_names) 75 | else: 76 | imdb = get_imdb(imdb_names) 77 | return imdb, roidb 78 | 79 | if __name__ == '__main__': 80 | args = parse_args() 81 | 82 | print('Called with args:') 83 | print(args) 84 | 85 | if args.cfg_file is not None: 86 | cfg_from_file(args.cfg_file) 87 | if args.set_cfgs is not None: 88 | cfg_from_list(args.set_cfgs) 89 | 90 | cfg.GPU_ID = args.gpu_id 91 | 92 | print('Using config:') 93 | pprint.pprint(cfg) 94 | 95 | if not args.randomize: 96 | # fix the random seeds (numpy and caffe) for reproducibility 97 | np.random.seed(cfg.RNG_SEED) 98 | caffe.set_random_seed(cfg.RNG_SEED) 99 | 100 | # set up caffe 101 | caffe.set_mode_gpu() 102 | caffe.set_device(args.gpu_id) 103 | 104 | imdb, roidb = combined_roidb(args.imdb_name) 105 | print '{:d} roidb entries'.format(len(roidb)) 106 | 107 | output_dir = get_output_dir(imdb) 108 | print 'Output will be saved to `{:s}`'.format(output_dir) 109 | 110 | train_net(args.solver, roidb, output_dir, 111 | pretrained_model=args.pretrained_model, 112 | max_iters=args.max_iters) 113 | -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | ./experiments/scripts/fast_rcnn_std.sh 0 VGG16 pascal_voc 2 | ./experiments/scripts/fast_rcnn_adv_pretrain.sh 0 VGG16 pascal_voc 3 | ./copy_model.h 4 | ./experiments/scripts/fast_rcnn_adv.sh 0 VGG16 pascal_voc 5 | 6 | --------------------------------------------------------------------------------