├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── data ├── .gitignore ├── README.md ├── demo │ ├── 000456.jpg │ ├── 000542.jpg │ ├── 001150.jpg │ ├── 001763.jpg │ └── 004545.jpg ├── pylintrc └── scripts │ ├── fetch_faster_rcnn_models.sh │ ├── fetch_imagenet_models.sh │ └── fetch_selective_search_data.sh ├── experiments ├── README.md ├── cfgs │ ├── faster_rcnn_alt_opt.yml │ └── faster_rcnn_end2end.yml ├── logs │ └── .gitignore └── scripts │ ├── fast_rcnn.sh │ ├── faster_rcnn_alt_opt.sh │ └── faster_rcnn_end2end.sh ├── lib ├── Makefile ├── datasets │ ├── VOCdevkit-matlab-wrapper │ │ ├── get_voc_opts.m │ │ ├── voc_eval.m │ │ └── xVOCap.m │ ├── __init__.py │ ├── coco.py │ ├── ds_utils.py │ ├── factory.py │ ├── imdb.py │ ├── pascal_voc.py │ ├── tools │ │ └── mcg_munge.py │ └── voc_eval.py ├── fast_rcnn │ ├── __init__.py │ ├── bbox_transform.py │ ├── config.py │ ├── nms_wrapper.py │ ├── test.py │ └── train.py ├── nms │ ├── .gitignore │ ├── __init__.py │ ├── cpu_nms.pyx │ ├── gpu_nms.hpp │ ├── gpu_nms.pyx │ ├── nms_kernel.cu │ └── py_cpu_nms.py ├── pycocotools │ ├── UPSTREAM_REV │ ├── __init__.py │ ├── _mask.pyx │ ├── coco.py │ ├── cocoeval.py │ ├── license.txt │ ├── mask.py │ ├── maskApi.c │ └── maskApi.h ├── roi_data_layer │ ├── __init__.py │ ├── layer.py │ ├── minibatch.py │ └── roidb.py ├── rpn │ ├── README.md │ ├── __init__.py │ ├── anchor_target_layer.py │ ├── generate.py │ ├── generate_anchors.py │ ├── proposal_layer.py │ └── proposal_target_layer.py ├── setup.py ├── transform │ ├── __init__.py │ └── torch_image_transform_layer.py └── utils │ ├── .gitignore │ ├── __init__.py │ ├── bbox.pyx │ ├── blob.py │ └── timer.py ├── models ├── README.md ├── coco │ ├── VGG16 │ │ ├── fast_rcnn │ │ │ ├── solver.prototxt │ │ │ ├── test.prototxt │ │ │ └── train.prototxt │ │ └── faster_rcnn_end2end │ │ │ ├── solver.prototxt │ │ │ ├── test.prototxt │ │ │ └── train.prototxt │ └── VGG_CNN_M_1024 │ │ ├── fast_rcnn │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── 
train.prototxt │ │ └── faster_rcnn_end2end │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── train.prototxt └── pascal_voc │ ├── VGG16 │ ├── fast_rcnn │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── train.prototxt │ ├── faster_rcnn_alt_opt │ │ ├── faster_rcnn_test.pt │ │ ├── rpn_test.pt │ │ ├── stage1_fast_rcnn_solver30k40k.pt │ │ ├── stage1_fast_rcnn_train.pt │ │ ├── stage1_rpn_solver60k80k.pt │ │ ├── stage1_rpn_train.pt │ │ ├── stage2_fast_rcnn_solver30k40k.pt │ │ ├── stage2_fast_rcnn_train.pt │ │ ├── stage2_rpn_solver60k80k.pt │ │ └── stage2_rpn_train.pt │ └── faster_rcnn_end2end │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── train.prototxt │ ├── VGG_CNN_M_1024 │ ├── fast_rcnn │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── train.prototxt │ ├── faster_rcnn_alt_opt │ │ ├── faster_rcnn_test.pt │ │ ├── rpn_test.pt │ │ ├── stage1_fast_rcnn_solver30k40k.pt │ │ ├── stage1_fast_rcnn_train.pt │ │ ├── stage1_rpn_solver60k80k.pt │ │ ├── stage1_rpn_train.pt │ │ ├── stage2_fast_rcnn_solver30k40k.pt │ │ ├── stage2_fast_rcnn_train.pt │ │ ├── stage2_rpn_solver60k80k.pt │ │ └── stage2_rpn_train.pt │ └── faster_rcnn_end2end │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── train.prototxt │ └── ZF │ ├── fast_rcnn │ ├── solver.prototxt │ ├── test.prototxt │ └── train.prototxt │ ├── faster_rcnn_alt_opt │ ├── faster_rcnn_test.pt │ ├── rpn_test.pt │ ├── stage1_fast_rcnn_solver30k40k.pt │ ├── stage1_fast_rcnn_train.pt │ ├── stage1_rpn_solver60k80k.pt │ ├── stage1_rpn_train.pt │ ├── stage2_fast_rcnn_solver30k40k.pt │ ├── stage2_fast_rcnn_train.pt │ ├── stage2_rpn_solver60k80k.pt │ └── stage2_rpn_train.pt │ └── faster_rcnn_end2end │ ├── solver.prototxt │ ├── test.prototxt │ └── train.prototxt └── tools ├── README.md ├── _init_paths.py ├── compress_net.py ├── demo.py ├── eval_recall.py ├── reval.py ├── rpn_generate.py ├── test_net.py ├── train_faster_rcnn_alt_opt.py ├── train_net.py └── train_svms.py /.gitignore: 
-------------------------------------------------------------------------------- 1 | *.pyc 2 | .ipynb_checkpoints 3 | lib/build 4 | lib/pycocotools/_mask.c 5 | lib/pycocotools/_mask.so 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "caffe-fast-rcnn"] 2 | path = caffe-fast-rcnn 3 | url = https://github.com/rbgirshick/caffe-fast-rcnn.git 4 | branch = fast-rcnn 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Faster R-CNN 2 | 3 | The MIT License (MIT) 4 | 5 | Copyright (c) 2015 Microsoft Corporation 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in 15 | all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | THE SOFTWARE. 
24 | 25 | ************************************************************************ 26 | 27 | THIRD-PARTY SOFTWARE NOTICES AND INFORMATION 28 | 29 | This project, Faster R-CNN, incorporates material from the project(s) 30 | listed below (collectively, "Third Party Code"). Microsoft is not the 31 | original author of the Third Party Code. The original copyright notice 32 | and license under which Microsoft received such Third Party Code are set 33 | out below. This Third Party Code is licensed to you under their original 34 | license terms set forth below. Microsoft reserves all other rights not 35 | expressly granted, whether by implication, estoppel or otherwise. 36 | 37 | 1. Caffe, (https://github.com/BVLC/caffe/) 38 | 39 | COPYRIGHT 40 | 41 | All contributions by the University of California: 42 | Copyright (c) 2014, 2015, The Regents of the University of California (Regents) 43 | All rights reserved. 44 | 45 | All other contributions: 46 | Copyright (c) 2014, 2015, the respective contributors 47 | All rights reserved. 48 | 49 | Caffe uses a shared copyright model: each contributor holds copyright 50 | over their contributions to Caffe. The project versioning records all 51 | such contribution and copyright details. If a contributor wants to 52 | further mark their specific copyright on a particular contribution, 53 | they should indicate their copyright solely in the commit message of 54 | the change when it is committed. 55 | 56 | The BSD 2-Clause License 57 | 58 | Redistribution and use in source and binary forms, with or without 59 | modification, are permitted provided that the following conditions 60 | are met: 61 | 62 | 1. Redistributions of source code must retain the above copyright notice, 63 | this list of conditions and the following disclaimer. 64 | 65 | 2. 
Redistributions in binary form must reproduce the above copyright 66 | notice, this list of conditions and the following disclaimer in the 67 | documentation and/or other materials provided with the distribution. 68 | 69 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 70 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 71 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 72 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 73 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 74 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 75 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 76 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 77 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 78 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 79 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 80 | 81 | ************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION********** 82 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | selective_search* 2 | imagenet_models* 3 | fast_rcnn_models* 4 | VOCdevkit* 5 | cache 6 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | This directory holds (*after you download them*): 2 | - Caffe models pre-trained on ImageNet 3 | - Faster R-CNN models 4 | - Symlinks to datasets 5 | 6 | To download Caffe models (ZF, VGG16) pre-trained on ImageNet, run: 7 | 8 | ``` 9 | ./data/scripts/fetch_imagenet_models.sh 10 | ``` 11 | 12 | This script will populate `data/imagenet_models`. 
13 | 14 | To download Faster R-CNN models trained on VOC 2007, run: 15 | 16 | ``` 17 | ./data/scripts/fetch_faster_rcnn_models.sh 18 | ``` 19 | 20 | This script will populate `data/faster_rcnn_models`. 21 | 22 | In order to train and test with PASCAL VOC, you will need to establish symlinks. 23 | From the `data` directory (`cd data`): 24 | 25 | ``` 26 | # For VOC 2007 27 | ln -s /your/path/to/VOC2007/VOCdevkit VOCdevkit2007 28 | 29 | # For VOC 2012 30 | ln -s /your/path/to/VOC2012/VOCdevkit VOCdevkit2012 31 | ``` 32 | 33 | Install the MS COCO dataset at /path/to/coco 34 | 35 | ``` 36 | ln -s /path/to/coco coco 37 | ``` 38 | 39 | For COCO with Fast R-CNN, place object proposals under `coco_proposals` (inside 40 | the `data` directory). You can obtain proposals on COCO from Jan Hosang at 41 | https://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal-computing/research/object-recognition-and-scene-understanding/how-good-are-detection-proposals-really/. 42 | For COCO, using MCG is recommended over selective search. MCG boxes can be downloaded 43 | from http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/mcg/. 44 | Use the tool `lib/datasets/tools/mcg_munge.py` to convert the downloaded MCG data 45 | into the same file layout as those from Jan Hosang. 46 | 47 | Since you'll likely be experimenting with multiple installs of Fast/er R-CNN in 48 | parallel, you'll probably want to keep all of this data in a shared place and 49 | use symlinks. On my system I create the following symlinks inside `data`: 50 | 51 | Annotations for the 5k image 'minival' subset of COCO val2014 that I like to use 52 | can be found at https://dl.dropboxusercontent.com/s/o43o90bna78omob/instances_minival2014.json.zip?dl=0. 53 | Annotations for COCO val2014 (set) minus minival (~35k images) can be found at 54 | https://dl.dropboxusercontent.com/s/s3tw5zcg7395368/instances_valminusminival2014.json.zip?dl=0. 
55 | 56 | ``` 57 | # data/cache holds various outputs created by the datasets package 58 | ln -s /data/fast_rcnn_shared/cache 59 | 60 | # move the imagenet_models to shared location and symlink to them 61 | ln -s /data/fast_rcnn_shared/imagenet_models 62 | 63 | # move the selective search data to a shared location and symlink to them 64 | # (only applicable to Fast R-CNN training) 65 | ln -s /data/fast_rcnn_shared/selective_search_data 66 | 67 | ln -s /data/VOC2007/VOCdevkit VOCdevkit2007 68 | ln -s /data/VOC2012/VOCdevkit VOCdevkit2012 69 | ``` 70 | -------------------------------------------------------------------------------- /data/demo/000456.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbgirshick/py-faster-rcnn/781a917b378dbfdedb45b6a56189a31982da1b43/data/demo/000456.jpg -------------------------------------------------------------------------------- /data/demo/000542.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbgirshick/py-faster-rcnn/781a917b378dbfdedb45b6a56189a31982da1b43/data/demo/000542.jpg -------------------------------------------------------------------------------- /data/demo/001150.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbgirshick/py-faster-rcnn/781a917b378dbfdedb45b6a56189a31982da1b43/data/demo/001150.jpg -------------------------------------------------------------------------------- /data/demo/001763.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbgirshick/py-faster-rcnn/781a917b378dbfdedb45b6a56189a31982da1b43/data/demo/001763.jpg -------------------------------------------------------------------------------- /data/demo/004545.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rbgirshick/py-faster-rcnn/781a917b378dbfdedb45b6a56189a31982da1b43/data/demo/004545.jpg -------------------------------------------------------------------------------- /data/pylintrc: -------------------------------------------------------------------------------- 1 | [TYPECHECK] 2 | 3 | ignored-modules = numpy, numpy.random, cv2 4 | -------------------------------------------------------------------------------- /data/scripts/fetch_faster_rcnn_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" 4 | cd $DIR 5 | 6 | FILE=faster_rcnn_models.tgz 7 | URL=https://dl.dropboxusercontent.com/s/o6ii098bu51d139/faster_rcnn_models.tgz?dl=0 8 | CHECKSUM=ac116844f66aefe29587214272054668 9 | 10 | if [ -f $FILE ]; then 11 | echo "File already exists. Checking md5..." 12 | os=`uname -s` 13 | if [ "$os" = "Linux" ]; then 14 | checksum=`md5sum $FILE | awk '{ print $1 }'` 15 | elif [ "$os" = "Darwin" ]; then 16 | checksum=`cat $FILE | md5` 17 | fi 18 | if [ "$checksum" = "$CHECKSUM" ]; then 19 | echo "Checksum is correct. No need to download." 20 | exit 0 21 | else 22 | echo "Checksum is incorrect. Need to download again." 23 | fi 24 | fi 25 | 26 | echo "Downloading Faster R-CNN demo models (695M)..." 27 | 28 | wget $URL -O $FILE 29 | 30 | echo "Unzipping..." 31 | 32 | tar zxvf $FILE 33 | 34 | echo "Done. Please run this command again to verify that checksum = $CHECKSUM." 
35 | -------------------------------------------------------------------------------- /data/scripts/fetch_imagenet_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" 4 | cd $DIR 5 | 6 | FILE=imagenet_models.tgz 7 | URL=https://dl.dropbox.com/s/gstw7122padlf0l/imagenet_models.tgz?dl=0 8 | CHECKSUM=ed34ca912d6782edfb673a8c3a0bda6d 9 | 10 | if [ -f $FILE ]; then 11 | echo "File already exists. Checking md5..." 12 | os=`uname -s` 13 | if [ "$os" = "Linux" ]; then 14 | checksum=`md5sum $FILE | awk '{ print $1 }'` 15 | elif [ "$os" = "Darwin" ]; then 16 | checksum=`cat $FILE | md5` 17 | fi 18 | if [ "$checksum" = "$CHECKSUM" ]; then 19 | echo "Checksum is correct. No need to download." 20 | exit 0 21 | else 22 | echo "Checksum is incorrect. Need to download again." 23 | fi 24 | fi 25 | 26 | echo "Downloading pretrained ImageNet models (1G)..." 27 | 28 | wget $URL -O $FILE 29 | 30 | echo "Unzipping..." 31 | 32 | tar zxvf $FILE 33 | 34 | echo "Done. Please run this command again to verify that checksum = $CHECKSUM." 35 | -------------------------------------------------------------------------------- /data/scripts/fetch_selective_search_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" 4 | cd $DIR 5 | 6 | FILE=selective_search_data.tgz 7 | URL=https://dl.dropboxusercontent.com/s/orrt7o6bp6ae0tc/selective_search_data.tgz?dl=0 8 | CHECKSUM=7078c1db87a7851b31966b96774cd9b9 9 | 10 | if [ -f $FILE ]; then 11 | echo "File already exists. Checking md5..." 12 | os=`uname -s` 13 | if [ "$os" = "Linux" ]; then 14 | checksum=`md5sum $FILE | awk '{ print $1 }'` 15 | elif [ "$os" = "Darwin" ]; then 16 | checksum=`cat $FILE | md5` 17 | fi 18 | if [ "$checksum" = "$CHECKSUM" ]; then 19 | echo "Checksum is correct. No need to download." 
20 | exit 0 21 | else 22 | echo "Checksum is incorrect. Need to download again." 23 | fi 24 | fi 25 | 26 | echo "Downloading precomputed selective search boxes (0.5G)..." 27 | 28 | wget $URL -O $FILE 29 | 30 | echo "Unzipping..." 31 | 32 | tar zxvf $FILE 33 | 34 | echo "Done. Please run this command again to verify that checksum = $CHECKSUM." 35 | -------------------------------------------------------------------------------- /experiments/README.md: -------------------------------------------------------------------------------- 1 | Scripts are under `experiments/scripts`. 2 | 3 | Each script saves a log file under `experiments/logs`. 4 | 5 | Configuration override files used in the experiments are stored in `experiments/cfgs`. 6 | -------------------------------------------------------------------------------- /experiments/cfgs/faster_rcnn_alt_opt.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: faster_rcnn_alt_opt 2 | TRAIN: 3 | BG_THRESH_LO: 0.0 4 | TEST: 5 | HAS_RPN: True 6 | -------------------------------------------------------------------------------- /experiments/cfgs/faster_rcnn_end2end.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: faster_rcnn_end2end 2 | TRAIN: 3 | HAS_RPN: True 4 | IMS_PER_BATCH: 1 5 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 6 | RPN_POSITIVE_OVERLAP: 0.7 7 | RPN_BATCHSIZE: 256 8 | PROPOSAL_METHOD: gt 9 | BG_THRESH_LO: 0.0 10 | TEST: 11 | HAS_RPN: True 12 | -------------------------------------------------------------------------------- /experiments/logs/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt* 2 | -------------------------------------------------------------------------------- /experiments/scripts/fast_rcnn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: 3 | # ./experiments/scripts/fast_rcnn.sh GPU NET 
DATASET [options args to {train,test}_net.py] 4 | # DATASET is either pascal_voc or coco. 5 | # 6 | # Example: 7 | # ./experiments/scripts/fast_rcnn.sh 0 VGG_CNN_M_1024 pascal_voc \ 8 | # --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400, 500, 600, 700]" 9 | 10 | set -x 11 | set -e 12 | 13 | export PYTHONUNBUFFERED="True" 14 | 15 | GPU_ID=$1 16 | NET=$2 17 | NET_lc=${NET,,} 18 | DATASET=$3 19 | 20 | array=( $@ ) 21 | len=${#array[@]} 22 | EXTRA_ARGS=${array[@]:3:$len} 23 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} 24 | 25 | case $DATASET in 26 | pascal_voc) 27 | TRAIN_IMDB="voc_2007_trainval" 28 | TEST_IMDB="voc_2007_test" 29 | PT_DIR="pascal_voc" 30 | ITERS=40000 31 | ;; 32 | coco) 33 | TRAIN_IMDB="coco_2014_train" 34 | TEST_IMDB="coco_2014_minival" 35 | PT_DIR="coco" 36 | ITERS=280000 37 | ;; 38 | *) 39 | echo "No dataset given" 40 | exit 41 | ;; 42 | esac 43 | 44 | LOG="experiments/logs/fast_rcnn_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 45 | exec &> >(tee -a "$LOG") 46 | echo Logging output to "$LOG" 47 | 48 | time ./tools/train_net.py --gpu ${GPU_ID} \ 49 | --solver models/${PT_DIR}/${NET}/fast_rcnn/solver.prototxt \ 50 | --weights data/imagenet_models/${NET}.v2.caffemodel \ 51 | --imdb ${TRAIN_IMDB} \ 52 | --iters ${ITERS} \ 53 | ${EXTRA_ARGS} 54 | 55 | set +x 56 | NET_FINAL=`grep -B 1 "done solving" ${LOG} | grep "Wrote snapshot" | awk '{print $4}'` 57 | set -x 58 | 59 | time ./tools/test_net.py --gpu ${GPU_ID} \ 60 | --def models/${PT_DIR}/${NET}/fast_rcnn/test.prototxt \ 61 | --net ${NET_FINAL} \ 62 | --imdb ${TEST_IMDB} \ 63 | ${EXTRA_ARGS} 64 | -------------------------------------------------------------------------------- /experiments/scripts/faster_rcnn_alt_opt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: 3 | # ./experiments/scripts/faster_rcnn_alt_opt.sh GPU NET DATASET [options args to {train,test}_net.py] 4 | # DATASET is only pascal_voc for now 5 | # 6 | # Example: 7 
| # ./experiments/scripts/faster_rcnn_alt_opt.sh 0 VGG_CNN_M_1024 pascal_voc \ 8 | # --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400, 500, 600, 700]" 9 | 10 | set -x 11 | set -e 12 | 13 | export PYTHONUNBUFFERED="True" 14 | 15 | GPU_ID=$1 16 | NET=$2 17 | NET_lc=${NET,,} 18 | DATASET=$3 19 | 20 | array=( $@ ) 21 | len=${#array[@]} 22 | EXTRA_ARGS=${array[@]:3:$len} 23 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} 24 | 25 | case $DATASET in 26 | pascal_voc) 27 | TRAIN_IMDB="voc_2007_trainval" 28 | TEST_IMDB="voc_2007_test" 29 | PT_DIR="pascal_voc" 30 | ITERS=40000 31 | ;; 32 | coco) 33 | echo "Not implemented: use experiments/scripts/faster_rcnn_end2end.sh for coco" 34 | exit 35 | ;; 36 | *) 37 | echo "No dataset given" 38 | exit 39 | ;; 40 | esac 41 | 42 | LOG="experiments/logs/faster_rcnn_alt_opt_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 43 | exec &> >(tee -a "$LOG") 44 | echo Logging output to "$LOG" 45 | 46 | time ./tools/train_faster_rcnn_alt_opt.py --gpu ${GPU_ID} \ 47 | --net_name ${NET} \ 48 | --weights data/imagenet_models/${NET}.v2.caffemodel \ 49 | --imdb ${TRAIN_IMDB} \ 50 | --cfg experiments/cfgs/faster_rcnn_alt_opt.yml \ 51 | ${EXTRA_ARGS} 52 | 53 | set +x 54 | NET_FINAL=`grep "Final model:" ${LOG} | awk '{print $3}'` 55 | set -x 56 | 57 | time ./tools/test_net.py --gpu ${GPU_ID} \ 58 | --def models/${PT_DIR}/${NET}/faster_rcnn_alt_opt/faster_rcnn_test.pt \ 59 | --net ${NET_FINAL} \ 60 | --imdb ${TEST_IMDB} \ 61 | --cfg experiments/cfgs/faster_rcnn_alt_opt.yml \ 62 | ${EXTRA_ARGS} 63 | -------------------------------------------------------------------------------- /experiments/scripts/faster_rcnn_end2end.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Usage: 3 | # ./experiments/scripts/faster_rcnn_end2end.sh GPU NET DATASET [options args to {train,test}_net.py] 4 | # DATASET is either pascal_voc or coco. 
5 | # 6 | # Example: 7 | # ./experiments/scripts/faster_rcnn_end2end.sh 0 VGG_CNN_M_1024 pascal_voc \ 8 | # --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400, 500, 600, 700]" 9 | 10 | set -x 11 | set -e 12 | 13 | export PYTHONUNBUFFERED="True" 14 | 15 | GPU_ID=$1 16 | NET=$2 17 | NET_lc=${NET,,} 18 | DATASET=$3 19 | 20 | array=( $@ ) 21 | len=${#array[@]} 22 | EXTRA_ARGS=${array[@]:3:$len} 23 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} 24 | 25 | case $DATASET in 26 | pascal_voc) 27 | TRAIN_IMDB="voc_2007_trainval" 28 | TEST_IMDB="voc_2007_test" 29 | PT_DIR="pascal_voc" 30 | ITERS=70000 31 | ;; 32 | coco) 33 | # This is a very long and slow training schedule 34 | # You can probably use fewer iterations and reduce the 35 | # time to the LR drop (set in the solver to 350,000 iterations). 36 | TRAIN_IMDB="coco_2014_train" 37 | TEST_IMDB="coco_2014_minival" 38 | PT_DIR="coco" 39 | ITERS=490000 40 | ;; 41 | *) 42 | echo "No dataset given" 43 | exit 44 | ;; 45 | esac 46 | 47 | LOG="experiments/logs/faster_rcnn_end2end_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 48 | exec &> >(tee -a "$LOG") 49 | echo Logging output to "$LOG" 50 | 51 | time ./tools/train_net.py --gpu ${GPU_ID} \ 52 | --solver models/${PT_DIR}/${NET}/faster_rcnn_end2end/solver.prototxt \ 53 | --weights data/imagenet_models/${NET}.v2.caffemodel \ 54 | --imdb ${TRAIN_IMDB} \ 55 | --iters ${ITERS} \ 56 | --cfg experiments/cfgs/faster_rcnn_end2end.yml \ 57 | ${EXTRA_ARGS} 58 | 59 | set +x 60 | NET_FINAL=`grep -B 1 "done solving" ${LOG} | grep "Wrote snapshot" | awk '{print $4}'` 61 | set -x 62 | 63 | time ./tools/test_net.py --gpu ${GPU_ID} \ 64 | --def models/${PT_DIR}/${NET}/faster_rcnn_end2end/test.prototxt \ 65 | --net ${NET_FINAL} \ 66 | --imdb ${TEST_IMDB} \ 67 | --cfg experiments/cfgs/faster_rcnn_end2end.yml \ 68 | ${EXTRA_ARGS} 69 | -------------------------------------------------------------------------------- /lib/Makefile: 
-------------------------------------------------------------------------------- 1 | all: 2 | python setup.py build_ext --inplace 3 | rm -rf build 4 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/get_voc_opts.m: -------------------------------------------------------------------------------- 1 | function VOCopts = get_voc_opts(path) 2 | 3 | tmp = pwd; 4 | cd(path); 5 | try 6 | addpath('VOCcode'); 7 | VOCinit; 8 | catch 9 | rmpath('VOCcode'); 10 | cd(tmp); 11 | error(sprintf('VOCcode directory not found under %s', path)); 12 | end 13 | rmpath('VOCcode'); 14 | cd(tmp); 15 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/voc_eval.m: -------------------------------------------------------------------------------- 1 | function res = voc_eval(path, comp_id, test_set, output_dir) 2 | 3 | VOCopts = get_voc_opts(path); 4 | VOCopts.testset = test_set; 5 | 6 | for i = 1:length(VOCopts.classes) 7 | cls = VOCopts.classes{i}; 8 | res(i) = voc_eval_cls(cls, VOCopts, comp_id, output_dir); 9 | end 10 | 11 | fprintf('\n~~~~~~~~~~~~~~~~~~~~\n'); 12 | fprintf('Results:\n'); 13 | aps = [res(:).ap]'; 14 | fprintf('%.1f\n', aps * 100); 15 | fprintf('%.1f\n', mean(aps) * 100); 16 | fprintf('~~~~~~~~~~~~~~~~~~~~\n'); 17 | 18 | function res = voc_eval_cls(cls, VOCopts, comp_id, output_dir) 19 | 20 | test_set = VOCopts.testset; 21 | year = VOCopts.dataset(4:end); 22 | 23 | addpath(fullfile(VOCopts.datadir, 'VOCcode')); 24 | 25 | res_fn = sprintf(VOCopts.detrespath, comp_id, cls); 26 | 27 | recall = []; 28 | prec = []; 29 | ap = 0; 30 | ap_auc = 0; 31 | 32 | do_eval = (str2num(year) <= 2007) | ~strcmp(test_set, 'test'); 33 | if do_eval 34 | % Bug in VOCevaldet requires that tic has been called first 35 | tic; 36 | [recall, prec, ap] = VOCevaldet(VOCopts, comp_id, cls, true); 37 | ap_auc = xVOCap(recall, prec); 38 | 39 | % force plot limits 40 | 
ylim([0 1]); 41 | xlim([0 1]); 42 | 43 | print(gcf, '-djpeg', '-r0', ... 44 | [output_dir '/' cls '_pr.jpg']); 45 | end 46 | fprintf('!!! %s : %.4f %.4f\n', cls, ap, ap_auc); 47 | 48 | res.recall = recall; 49 | res.prec = prec; 50 | res.ap = ap; 51 | res.ap_auc = ap_auc; 52 | 53 | save([output_dir '/' cls '_pr.mat'], ... 54 | 'res', 'recall', 'prec', 'ap', 'ap_auc'); 55 | 56 | rmpath(fullfile(VOCopts.datadir, 'VOCcode')); 57 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/xVOCap.m: -------------------------------------------------------------------------------- 1 | function ap = xVOCap(rec,prec) 2 | % From the PASCAL VOC 2011 devkit 3 | 4 | mrec=[0 ; rec ; 1]; 5 | mpre=[0 ; prec ; 0]; 6 | for i=numel(mpre)-1:-1:1 7 | mpre(i)=max(mpre(i),mpre(i+1)); 8 | end 9 | i=find(mrec(2:end)~=mrec(1:end-1))+1; 10 | ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); 11 | -------------------------------------------------------------------------------- /lib/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/datasets/ds_utils.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Ross Girshick 5 | # -------------------------------------------------------- 6 | 7 | import numpy as np 8 | 9 | def unique_boxes(boxes, scale=1.0): 10 | """Return indices of unique boxes.""" 11 | v = np.array([1, 1e3, 1e6, 1e9]) 12 | hashes = 
np.round(boxes * scale).dot(v) 13 | _, index = np.unique(hashes, return_index=True) 14 | return np.sort(index) 15 | 16 | def xywh_to_xyxy(boxes): 17 | """Convert [x y w h] box format to [x1 y1 x2 y2] format.""" 18 | return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1)) 19 | 20 | def xyxy_to_xywh(boxes): 21 | """Convert [x1 y1 x2 y2] box format to [x y w h] format.""" 22 | return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1)) 23 | 24 | def validate_boxes(boxes, width=0, height=0): 25 | """Check that a set of boxes are valid.""" 26 | x1 = boxes[:, 0] 27 | y1 = boxes[:, 1] 28 | x2 = boxes[:, 2] 29 | y2 = boxes[:, 3] 30 | assert (x1 >= 0).all() 31 | assert (y1 >= 0).all() 32 | assert (x2 >= x1).all() 33 | assert (y2 >= y1).all() 34 | assert (x2 < width).all() 35 | assert (y2 < height).all() 36 | 37 | def filter_small_boxes(boxes, min_size): 38 | w = boxes[:, 2] - boxes[:, 0] 39 | h = boxes[:, 3] - boxes[:, 1] 40 | keep = np.where((w >= min_size) & (h > min_size))[0] 41 | return keep 42 | -------------------------------------------------------------------------------- /lib/datasets/factory.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Factory method for easily getting imdbs by name.""" 9 | 10 | __sets = {} 11 | 12 | from datasets.pascal_voc import pascal_voc 13 | from datasets.coco import coco 14 | import numpy as np 15 | 16 | # Set up voc__ using selective search "fast" mode 17 | for year in ['2007', '2012']: 18 | for split in ['train', 'val', 'trainval', 'test']: 19 | name = 'voc_{}_{}'.format(year, split) 20 | __sets[name] = (lambda split=split, year=year: pascal_voc(split, year)) 21 | 22 | # Set up coco_2014_ 23 | for 
year in ['2014']: 24 | for split in ['train', 'val', 'minival', 'valminusminival']: 25 | name = 'coco_{}_{}'.format(year, split) 26 | __sets[name] = (lambda split=split, year=year: coco(split, year)) 27 | 28 | # Set up coco_2015_ 29 | for year in ['2015']: 30 | for split in ['test', 'test-dev']: 31 | name = 'coco_{}_{}'.format(year, split) 32 | __sets[name] = (lambda split=split, year=year: coco(split, year)) 33 | 34 | def get_imdb(name): 35 | """Get an imdb (image database) by name.""" 36 | if not __sets.has_key(name): 37 | raise KeyError('Unknown dataset: {}'.format(name)) 38 | return __sets[name]() 39 | 40 | def list_imdbs(): 41 | """List all registered imdbs.""" 42 | return __sets.keys() 43 | -------------------------------------------------------------------------------- /lib/datasets/tools/mcg_munge.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | """Hacky tool to convert file system layout of MCG boxes downloaded from 5 | http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/mcg/ 6 | so that it's consistent with those computed by Jan Hosang (see: 7 | http://www.mpi-inf.mpg.de/departments/computer-vision-and-multimodal- 8 | computing/research/object-recognition-and-scene-understanding/how- 9 | good-are-detection-proposals-really/) 10 | 11 | NB: Boxes from the MCG website are in (y1, x1, y2, x2) order. 12 | Boxes from Hosang et al. are in (x1, y1, x2, y2) order. 
13 | """ 14 | 15 | def munge(src_dir): 16 | # stored as: ./MCG-COCO-val2014-boxes/COCO_val2014_000000193401.mat 17 | # want: ./MCG/mat/COCO_val2014_0/COCO_val2014_000000141/COCO_val2014_000000141334.mat 18 | 19 | files = os.listdir(src_dir) 20 | for fn in files: 21 | base, ext = os.path.splitext(fn) 22 | # first 14 chars / first 22 chars / all chars + .mat 23 | # COCO_val2014_0/COCO_val2014_000000447/COCO_val2014_000000447991.mat 24 | first = base[:14] 25 | second = base[:22] 26 | dst_dir = os.path.join('MCG', 'mat', first, second) 27 | if not os.path.exists(dst_dir): 28 | os.makedirs(dst_dir) 29 | src = os.path.join(src_dir, fn) 30 | dst = os.path.join(dst_dir, fn) 31 | print 'MV: {} -> {}'.format(src, dst) 32 | os.rename(src, dst) 33 | 34 | if __name__ == '__main__': 35 | # src_dir should look something like: 36 | # src_dir = 'MCG-COCO-val2014-boxes' 37 | src_dir = sys.argv[1] 38 | munge(src_dir) 39 | -------------------------------------------------------------------------------- /lib/fast_rcnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/fast_rcnn/bbox_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def bbox_transform(ex_rois, gt_rois): 11 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 12 | ex_heights = 
ex_rois[:, 3] - ex_rois[:, 1] + 1.0 13 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 14 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 15 | 16 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 17 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 18 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths 19 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights 20 | 21 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 22 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 23 | targets_dw = np.log(gt_widths / ex_widths) 24 | targets_dh = np.log(gt_heights / ex_heights) 25 | 26 | targets = np.vstack( 27 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() 28 | return targets 29 | 30 | def bbox_transform_inv(boxes, deltas): 31 | if boxes.shape[0] == 0: 32 | return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) 33 | 34 | boxes = boxes.astype(deltas.dtype, copy=False) 35 | 36 | widths = boxes[:, 2] - boxes[:, 0] + 1.0 37 | heights = boxes[:, 3] - boxes[:, 1] + 1.0 38 | ctr_x = boxes[:, 0] + 0.5 * widths 39 | ctr_y = boxes[:, 1] + 0.5 * heights 40 | 41 | dx = deltas[:, 0::4] 42 | dy = deltas[:, 1::4] 43 | dw = deltas[:, 2::4] 44 | dh = deltas[:, 3::4] 45 | 46 | pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] 47 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] 48 | pred_w = np.exp(dw) * widths[:, np.newaxis] 49 | pred_h = np.exp(dh) * heights[:, np.newaxis] 50 | 51 | pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) 52 | # x1 53 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 54 | # y1 55 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 56 | # x2 57 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w 58 | # y2 59 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h 60 | 61 | return pred_boxes 62 | 63 | def clip_boxes(boxes, im_shape): 64 | """ 65 | Clip boxes to image boundaries. 
66 | """ 67 | 68 | # x1 >= 0 69 | boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) 70 | # y1 >= 0 71 | boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) 72 | # x2 < im_shape[1] 73 | boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) 74 | # y2 < im_shape[0] 75 | boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) 76 | return boxes 77 | -------------------------------------------------------------------------------- /lib/fast_rcnn/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from fast_rcnn.config import cfg 9 | from nms.gpu_nms import gpu_nms 10 | from nms.cpu_nms import cpu_nms 11 | 12 | def nms(dets, thresh, force_cpu=False): 13 | """Dispatch to either CPU or GPU NMS implementations.""" 14 | 15 | if dets.shape[0] == 0: 16 | return [] 17 | if cfg.USE_GPU_NMS and not force_cpu: 18 | return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 19 | else: 20 | return cpu_nms(dets, thresh) 21 | -------------------------------------------------------------------------------- /lib/fast_rcnn/train.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Train a Fast R-CNN network.""" 9 | 10 | import caffe 11 | from fast_rcnn.config import cfg 12 | import roi_data_layer.roidb as rdl_roidb 13 | from utils.timer import Timer 14 | import numpy as np 15 | 
import os

from caffe.proto import caffe_pb2
import google.protobuf as pb2

class SolverWrapper(object):
    """A simple wrapper around Caffe's solver.

    This wrapper gives us control over the snapshotting process, which we
    use to unnormalize the learned bounding-box regression weights.
    """

    def __init__(self, solver_prototxt, roidb, output_dir,
                 pretrained_model=None):
        """Initialize the SolverWrapper.

        Computes bbox regression target statistics (when enabled), builds
        the SGD solver, optionally loads pretrained weights, parses the
        solver prototxt, and hands the roidb to the data layer.
        """
        self.output_dir = output_dir

        if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and
            cfg.TRAIN.BBOX_NORMALIZE_TARGETS):
            # RPN can only use precomputed normalization because there are no
            # fixed statistics to compute a priori
            assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED

        if cfg.TRAIN.BBOX_REG:
            print('Computing bounding-box regression targets...')
            self.bbox_means, self.bbox_stds = \
                    rdl_roidb.add_bbox_regression_targets(roidb)
            print('done')

        self.solver = caffe.SGDSolver(solver_prototxt)
        if pretrained_model is not None:
            print(('Loading pretrained model '
                   'weights from {:s}').format(pretrained_model))
            self.solver.net.copy_from(pretrained_model)

        self.solver_param = caffe_pb2.SolverParameter()
        with open(solver_prototxt, 'rt') as f:
            pb2.text_format.Merge(f.read(), self.solver_param)

        self.solver.net.layers[0].set_roidb(roidb)

    def snapshot(self):
        """Take a snapshot of the network after unnormalizing the learned
        bounding-box regression weights. This enables easy use at test-time.

        Returns the path of the written .caffemodel file.
        """
        net = self.solver.net

        # Fixed: dict.has_key() was removed in Python 3; `in` works in 2 and 3.
        scale_bbox_params = (cfg.TRAIN.BBOX_REG and
                             cfg.TRAIN.BBOX_NORMALIZE_TARGETS and
                             'bbox_pred' in net.params)

        if scale_bbox_params:
            # save original values so the in-memory net keeps training on
            # normalized targets after the snapshot is written
            orig_0 = net.params['bbox_pred'][0].data.copy()
            orig_1 = net.params['bbox_pred'][1].data.copy()

            # scale and shift with bbox reg unnormalization; then save snapshot
            net.params['bbox_pred'][0].data[...] = \
                    (net.params['bbox_pred'][0].data *
                     self.bbox_stds[:, np.newaxis])
            net.params['bbox_pred'][1].data[...] = \
                    (net.params['bbox_pred'][1].data *
                     self.bbox_stds + self.bbox_means)

        infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX
                 if cfg.TRAIN.SNAPSHOT_INFIX != '' else '')
        filename = (self.solver_param.snapshot_prefix + infix +
                    '_iter_{:d}'.format(self.solver.iter) + '.caffemodel')
        filename = os.path.join(self.output_dir, filename)

        net.save(str(filename))
        print('Wrote snapshot to: {:s}'.format(filename))

        if scale_bbox_params:
            # restore net to original state
            net.params['bbox_pred'][0].data[...] = orig_0
            net.params['bbox_pred'][1].data[...] = orig_1
        return filename

    def train_model(self, max_iters):
        """Network training loop; returns the list of snapshot paths."""
        last_snapshot_iter = -1
        timer = Timer()
        model_paths = []
        while self.solver.iter < max_iters:
            # Make one SGD update
            timer.tic()
            self.solver.step(1)
            timer.toc()
            if self.solver.iter % (10 * self.solver_param.display) == 0:
                print('speed: {:.3f}s / iter'.format(timer.average_time))

            if self.solver.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0:
                last_snapshot_iter = self.solver.iter
                model_paths.append(self.snapshot())

        # Final snapshot unless the last iteration already produced one.
        if last_snapshot_iter != self.solver.iter:
            model_paths.append(self.snapshot())
        return model_paths

def get_training_roidb(imdb):
    """Returns a roidb (Region of Interest database) for use in training."""
    if cfg.TRAIN.USE_FLIPPED:
        print('Appending horizontally-flipped training examples...')
        imdb.append_flipped_images()
        print('done')

    print('Preparing training data...')
    rdl_roidb.prepare_roidb(imdb)
    print('done')

    return imdb.roidb
def filter_roidb(roidb):
    """Remove roidb entries that have no usable RoIs; returns the filtered list."""

    def is_valid(entry):
        # Valid images have:
        #   (1) At least one foreground RoI OR
        #   (2) At least one background RoI
        overlaps = entry['max_overlaps']
        # find boxes with sufficient overlap
        fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
        # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
        bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
                           (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
        # image is only valid if such boxes exist
        return len(fg_inds) > 0 or len(bg_inds) > 0

    num = len(roidb)
    filtered_roidb = [entry for entry in roidb if is_valid(entry)]
    num_after = len(filtered_roidb)
    # Fixed: print statement -> print() call (identical output on Python 2).
    print('Filtered {} roidb entries: {} -> {}'.format(num - num_after,
                                                       num, num_after))
    return filtered_roidb

def train_net(solver_prototxt, roidb, output_dir,
              pretrained_model=None, max_iters=40000):
    """Train a Fast R-CNN network; returns the list of snapshot paths."""
    roidb = filter_roidb(roidb)
    sw = SolverWrapper(solver_prototxt, roidb, output_dir,
                       pretrained_model=pretrained_model)

    print('Solving...')
    model_paths = sw.train_model(max_iters)
    print('done solving')
    return model_paths
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

import numpy as np
cimport numpy as np

cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
    return a if a >= b else b

cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
    return a if a <= b else b

# Fixed: the signature used `np.float thresh` and the removed `np.int` alias
# (both gone since NumPy 1.24); use the C float type and np.int_ (the dtype
# matching the np.int_t buffer declaration).
def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, float thresh):
    """Greedy non-maximum suppression over dets = (N, 5) rows of
    [x1, y1, x2, y2, score]; returns the list of kept row indices."""
    cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
    cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
    cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
    cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
    cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]

    cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]

    cdef int ndets = dets.shape[0]
    cdef np.ndarray[np.int_t, ndim=1] suppressed = \
            np.zeros((ndets), dtype=np.int_)

    # nominal indices
    cdef int _i, _j
    # sorted indices
    cdef int i, j
    # temp variables for box i's (the box currently under consideration)
    cdef np.float32_t ix1, iy1, ix2, iy2, iarea
    # variables for computing overlap with box j (lower scoring box)
    cdef np.float32_t xx1, yy1, xx2, yy2
    cdef np.float32_t w, h
    cdef np.float32_t inter, ovr

    keep = []
    for _i in range(ndets):
        i = order[_i]
        if suppressed[i] == 1:
            continue
        keep.append(i)
        ix1 = x1[i]
        iy1 = y1[i]
        ix2 = x2[i]
        iy2 = y2[i]
        iarea = areas[i]
        for _j in range(_i + 1, ndets):
            j = order[_j]
            if suppressed[j] == 1:
                continue
            xx1 = max(ix1, x1[j])
            yy1 = max(iy1, y1[j])
            xx2 = min(ix2, x2[j])
            yy2 = min(iy2, y2[j])
            w = max(0.0, xx2 - xx1 + 1)
            h = max(0.0, yy2 - yy1 + 1)
            inter = w * h
            ovr = inter / (iarea + areas[j] - inter)
            if ovr >= thresh:
                suppressed[j] = 1

    return keep
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

import numpy as np
cimport numpy as np

assert sizeof(int) == sizeof(np.int32_t)

cdef extern from "gpu_nms.hpp":
    void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)

# Fixed: `np.float thresh` relied on the NumPy scalar alias removed in
# NumPy 1.24; the C float type matches the underlying _nms signature.
def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, float thresh,
            np.int32_t device_id=0):
    """GPU non-maximum suppression over dets = (N, 5) rows of
    [x1, y1, x2, y2, score]; returns kept indices in the ORIGINAL order."""
    cdef int boxes_num = dets.shape[0]
    cdef int boxes_dim = dets.shape[1]
    cdef int num_out
    cdef np.ndarray[np.int32_t, ndim=1] \
        keep = np.zeros(boxes_num, dtype=np.int32)
    cdef np.ndarray[np.float32_t, ndim=1] \
        scores = dets[:, 4]
    cdef np.ndarray[np.int_t, ndim=1] \
        order = scores.argsort()[::-1]
    # The CUDA kernel expects boxes sorted by descending score.
    cdef np.ndarray[np.float32_t, ndim=2] \
        sorted_dets = dets[order, :]
    _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim,
         thresh, device_id)
    keep = keep[:num_out]
    # Map kept positions in the sorted array back to original row indices.
    return list(order[keep])
// ------------------------------------------------------------------
// Faster R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Shaoqing Ren
// ------------------------------------------------------------------

#include "gpu_nms.hpp"
// Fixed: the include targets were stripped by the HTML export; restored.
#include <vector>
#include <iostream>

#define CUDA_CHECK(condition) \
  /* Code block avoids redefinition of cudaError_t error */ \
  do { \
    cudaError_t error = condition; \
    if (error != cudaSuccess) { \
      std::cout << cudaGetErrorString(error) << std::endl; \
    } \
  } while (0)

#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
// One bit of the suppression mask per thread in a block.
int const threadsPerBlock = sizeof(unsigned long long) * 8;

// Intersection-over-union of two inclusive [x1, y1, x2, y2] boxes.
__device__ inline float devIoU(float const * const a, float const * const b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
  float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
  return interS / (Sa + Sb - interS);
}

// Each block compares one tile of row boxes against one tile of column
// boxes; dev_mask[row * col_blocks + col] holds a 64-bit suppression bitmap.
__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float *dev_boxes, unsigned long long *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size =
        min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
        min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  __shared__ float block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const float *cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      // Within the diagonal tile only compare against later (lower-scoring)
      // boxes, so a box never suppresses itself.
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

void _set_device(int device_id) {
  int current_device;
  CUDA_CHECK(cudaGetDevice(&current_device));
  if (current_device == device_id) {
    return;
  }
  // The call to cudaSetDevice must come before any calls to Get, which
  // may perform initialization using the GPU.
  CUDA_CHECK(cudaSetDevice(device_id));
}

// Host entry point: boxes_host must be sorted by descending score.
// Writes kept indices into keep_out and the count into num_out.
void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
          int boxes_dim, float nms_overlap_thresh, int device_id) {
  _set_device(device_id);

  float* boxes_dev = NULL;
  unsigned long long* mask_dev = NULL;

  const int col_blocks = DIVUP(boxes_num, threadsPerBlock);

  CUDA_CHECK(cudaMalloc(&boxes_dev,
                        boxes_num * boxes_dim * sizeof(float)));
  CUDA_CHECK(cudaMemcpy(boxes_dev,
                        boxes_host,
                        boxes_num * boxes_dim * sizeof(float),
                        cudaMemcpyHostToDevice));

  CUDA_CHECK(cudaMalloc(&mask_dev,
                        boxes_num * col_blocks * sizeof(unsigned long long)));

  dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
              DIVUP(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  // Fixed: the <<<...>>> launch configuration and the vector element types
  // below were stripped by the HTML export; restored.
  nms_kernel<<<blocks, threads>>>(boxes_num,
                                  nms_overlap_thresh,
                                  boxes_dev,
                                  mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  CUDA_CHECK(cudaMemcpy(&mask_host[0],
                        mask_dev,
                        sizeof(unsigned long long) * boxes_num * col_blocks,
                        cudaMemcpyDeviceToHost));

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  // Sequential greedy sweep over score-sorted boxes using the GPU-computed
  // pairwise suppression bitmaps.
  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }
  *num_out = num_to_keep;

  CUDA_CHECK(cudaFree(boxes_dev));
  CUDA_CHECK(cudaFree(mask_dev));
}
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

import numpy as np

def py_cpu_nms(dets, thresh):
    """Pure Python NMS baseline.

    dets is an (N, 5) array of [x1, y1, x2, y2, score] rows with inclusive
    coordinates; thresh is the IoU suppression threshold. Returns the list
    of kept row indices, in descending score order.
    """
    xs1, ys1 = dets[:, 0], dets[:, 1]
    xs2, ys2 = dets[:, 2], dets[:, 3]
    scores = dets[:, 4]

    # Inclusive-coordinate areas (+1 pixel convention).
    areas = (xs2 - xs1 + 1) * (ys2 - ys1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        best = order[0]
        keep.append(best)
        rest = order[1:]

        # Intersection of the best box with every remaining candidate.
        ix1 = np.maximum(xs1[best], xs1[rest])
        iy1 = np.maximum(ys1[best], ys1[rest])
        ix2 = np.minimum(xs2[best], xs2[rest])
        iy2 = np.minimum(ys2[best], ys2[rest])
        iw = np.maximum(0.0, ix2 - ix1 + 1)
        ih = np.maximum(0.0, iy2 - iy1 + 1)
        inter = iw * ih

        overlap = inter / (areas[best] + areas[rest] - inter)
        # Keep only candidates that overlap the best box at most `thresh`.
        order = rest[np.where(overlap <= thresh)[0]]

    return keep
Dollar and Tsung-Yi Lin 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | -------------------------------------------------------------------------------- /lib/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | import pycocotools._mask as _mask 4 | 5 | # Interface for manipulating masks stored in RLE format. 
__author__ = 'tsungyi'

import pycocotools._mask as _mask

# Interface for manipulating masks stored in RLE (run-length encoding) format.
#
# RLE divides a vectorized binary image into piecewise-constant runs and
# stores only the run lengths: M=[0 0 1 1 1 0 1] -> counts [2 3 1 1], and
# M=[1 1 1 1 1 1 0] -> [0 6 1] (odd positions always count zeros). The
# counts are further compressed with a LEB128-style variable-length code.
# RLE size is proportional to the number of boundaries in M (O(sqrt(n)) for
# simple shapes of n pixels), and many operations — area, union,
# intersection, IoU — run directly on the encoding in O(sqrt(n)) instead of
# O(n) on the decoded mask.
#
# API:
#  encode      - Encode binary masks using RLE.
#  decode      - Decode binary masks encoded via RLE.
#  merge       - Compute union or intersection of encoded masks.
#  iou         - Compute intersection over union between masks.
#  area        - Compute area of encoded masks.
#  toBbox      - Get bounding boxes surrounding encoded masks.
#  frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask.
#
# Usage:
#  Rs    = encode( masks )
#  masks = decode( Rs )
#  R     = merge( Rs, intersect=false )
#  o     = iou( dt, gt, iscrowd )
#  a     = area( Rs )
#  bbs   = toBbox( Rs )
#  Rs    = frPyObjects( [pyObjects], h, w )
#
# Formats:
#  Rs      - [dict] run-length encodings; R - a single encoding
#  masks   - [hxwxn] binary masks (np.ndarray(dtype=uint8), column-major)
#  iscrowd - [nx1] list of np.ndarray; 1 marks a gt crowd region to ignore
#  bbs     - [nx4] boxes stored as [x y w h]
#  poly    - polygon as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list)
#  dt,gt   - either bounding boxes or encoded masks
#  Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses the first pixel).
#
# IoU note: the standard iou(gt,dt) = area(intersect)/area(union). For gt
# regions marked "iscrowd", a detection may match any subregion of the gt,
# so the modified criterion iou(gt,dt,iscrowd) = area(intersect(gt,dt)) /
# area(dt) is used instead.
#
# To compile run "python setup.py build_ext --inplace"
# Please do not contact us for help with compiling.
#
# Microsoft COCO Toolbox.      version 2.0
# Data, paper, and tutorials available at:  http://mscoco.org/
# Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
# Licensed under the Simplified BSD License [see coco/license.txt]

encode = _mask.encode
decode = _mask.decode
iou = _mask.iou
merge = _mask.merge
area = _mask.area
toBbox = _mask.toBbox
frPyObjects = _mask.frPyObjects
/**************************************************************************
* Microsoft COCO Toolbox.      version 2.0
* Data, paper, and tutorials available at:  http://mscoco.org/
* Code written by Piotr Dollar and Tsung-Yi Lin, 2015.
* Licensed under the Simplified BSD License [see coco/license.txt]
**************************************************************************/
#pragma once
/* Fixed: the include target was stripped by the HTML export; <stdbool.h>
 * is required for the `bool` parameter of rleMerge below. */
#include <stdbool.h>

typedef unsigned int uint;
typedef unsigned long siz;
typedef unsigned char byte;
typedef double* BB;
typedef struct { siz h, w, m; uint *cnts; } RLE;

/* Initialize/destroy RLE. */
void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts );
void rleFree( RLE *R );

/* Initialize/destroy RLE array. */
void rlesInit( RLE **R, siz n );
void rlesFree( RLE **R, siz n );

/* Encode binary masks using RLE. */
void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n );

/* Decode binary masks encoded via RLE. */
void rleDecode( const RLE *R, byte *mask, siz n );

/* Compute union or intersection of encoded masks. */
void rleMerge( const RLE *R, RLE *M, siz n, bool intersect );

/* Compute area of encoded masks. */
void rleArea( const RLE *R, siz n, uint *a );

/* Compute intersection over union between masks. */
void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o );

/* Compute intersection over union between bounding boxes. */
void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o );

/* Get bounding boxes surrounding encoded masks. */
void rleToBbox( const RLE *R, BB bb, siz n );

/* Convert bounding boxes to encoded masks. */
void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n );

/* Convert polygon to encoded mask. */
void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w );

/* Get compressed string representation of encoded mask. */
char* rleToString( const RLE *R );

/* Convert from compressed string representation of encoded mask. */
void rleFrString( RLE *R, char *s, siz h, siz w );
training. This function precomputes the maximum 19 | overlap, taken over ground-truth boxes, between each ROI and 20 | each ground-truth box. The class with maximum overlap is also 21 | recorded. 22 | """ 23 | sizes = [PIL.Image.open(imdb.image_path_at(i)).size 24 | for i in xrange(imdb.num_images)] 25 | roidb = imdb.roidb 26 | for i in xrange(len(imdb.image_index)): 27 | roidb[i]['image'] = imdb.image_path_at(i) 28 | roidb[i]['width'] = sizes[i][0] 29 | roidb[i]['height'] = sizes[i][1] 30 | # need gt_overlaps as a dense array for argmax 31 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 32 | # max overlap with gt over classes (columns) 33 | max_overlaps = gt_overlaps.max(axis=1) 34 | # gt class that had the max overlap 35 | max_classes = gt_overlaps.argmax(axis=1) 36 | roidb[i]['max_classes'] = max_classes 37 | roidb[i]['max_overlaps'] = max_overlaps 38 | # sanity checks 39 | # max overlap of 0 => class should be zero (background) 40 | zero_inds = np.where(max_overlaps == 0)[0] 41 | assert all(max_classes[zero_inds] == 0) 42 | # max overlap > 0 => class should not be zero (must be a fg class) 43 | nonzero_inds = np.where(max_overlaps > 0)[0] 44 | assert all(max_classes[nonzero_inds] != 0) 45 | 46 | def add_bbox_regression_targets(roidb): 47 | """Add information needed to train bounding-box regressors.""" 48 | assert len(roidb) > 0 49 | assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?' 
def add_bbox_regression_targets(roidb):
    """Add information needed to train bounding-box regressors.

    For every image in `roidb`, computes per-ROI class labels and bbox
    regression targets (stored under 'bbox_targets' as an N x 5 array of
    [class, dx, dy, dw, dh]), then optionally normalizes the targets by
    per-class means/stds (precomputed from cfg or estimated empirically).

    Args:
        roidb: list of per-image roidb dicts; `prepare_roidb` must have
            been called first (it adds 'max_classes'/'max_overlaps').

    Returns:
        (means.ravel(), stds.ravel()): flattened (num_classes * 4,)
        normalization statistics, needed at test time to unnormalize
        predictions.
    """
    assert len(roidb) > 0
    assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'

    num_images = len(roidb)
    # Infer number of classes from the number of columns in gt_overlaps
    num_classes = roidb[0]['gt_overlaps'].shape[1]
    # Compute the raw (unnormalized) regression targets for every image.
    for im_i in xrange(num_images):
        rois = roidb[im_i]['boxes']
        max_overlaps = roidb[im_i]['max_overlaps']
        max_classes = roidb[im_i]['max_classes']
        roidb[im_i]['bbox_targets'] = \
            _compute_targets(rois, max_overlaps, max_classes)

    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Use fixed / precomputed "means" and "stds" instead of empirical values
        means = np.tile(
            np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1))
        stds = np.tile(
            np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1))
    else:
        # Compute values needed for means and stds
        # var(x) = E(x^2) - E(x)^2
        # cfg.EPS guards against division by zero for classes with no examples
        class_counts = np.zeros((num_classes, 1)) + cfg.EPS
        sums = np.zeros((num_classes, 4))
        squared_sums = np.zeros((num_classes, 4))
        for im_i in xrange(num_images):
            targets = roidb[im_i]['bbox_targets']
            # Class 0 is background; it gets no regression targets.
            for cls in xrange(1, num_classes):
                cls_inds = np.where(targets[:, 0] == cls)[0]
                if cls_inds.size > 0:
                    class_counts[cls] += cls_inds.size
                    sums[cls, :] += targets[cls_inds, 1:].sum(axis=0)
                    squared_sums[cls, :] += \
                        (targets[cls_inds, 1:] ** 2).sum(axis=0)

        means = sums / class_counts
        stds = np.sqrt(squared_sums / class_counts - means ** 2)

    print 'bbox target means:'
    print means
    print means[1:, :].mean(axis=0) # ignore bg class
    print 'bbox target stdevs:'
    print stds
    print stds[1:, :].mean(axis=0) # ignore bg class

    # Normalize targets in place (center by class mean, scale by class std)
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS:
        print "Normalizing targets"
        for im_i in xrange(num_images):
            targets = roidb[im_i]['bbox_targets']
            for cls in xrange(1, num_classes):
                cls_inds = np.where(targets[:, 0] == cls)[0]
                roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :]
                roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :]
    else:
        print "NOT normalizing targets"

    # These values will be needed for making predictions
    # (the predicts will need to be unnormalized and uncentered)
    return means.ravel(), stds.ravel()
def _compute_targets(rois, overlaps, labels):
    """Compute bounding-box regression targets for an image.

    Returns an (num_rois, 5) float32 array: column 0 is the class label,
    columns 1:5 are the regression deltas; rows for ROIs below the
    training threshold are all zeros.
    """
    # Ground-truth ROIs are exactly those with overlap == 1.
    gt_inds = np.where(overlaps == 1)[0]
    targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
    if len(gt_inds) == 0:
        # Bail if the image has no ground-truth ROIs
        return targets
    # Examples we try to make predictions for: sufficiently-overlapping ROIs
    ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]

    ex_boxes = np.ascontiguousarray(rois[ex_inds, :], dtype=np.float)
    gt_boxes = np.ascontiguousarray(rois[gt_inds, :], dtype=np.float)
    # IoU overlap between each example ROI and every gt ROI
    ex_gt_overlaps = bbox_overlaps(ex_boxes, gt_boxes)

    # Each example regresses toward the gt ROI it overlaps most:
    # that gt is the example's regression target
    assignment = ex_gt_overlaps.argmax(axis=1)
    matched_gt = rois[gt_inds[assignment], :]

    targets[ex_inds, 0] = labels[ex_inds]
    targets[ex_inds, 1:] = bbox_transform(rois[ex_inds, :], matched_gt)
    return targets
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

"""Generate object detection proposals from an imdb using an RPN."""

from fast_rcnn.config import cfg
from utils.blob import im_list_to_blob
from utils.timer import Timer
import numpy as np
import cv2

# NOTE(review): `plt` (matplotlib.pyplot) is never imported in this module,
# yet it is referenced in _vis_proposals and in the disabled debug branch of
# imdb_proposals. Enabling either path would raise NameError — confirm whether
# an `import matplotlib.pyplot as plt` was dropped.

def _vis_proposals(im, dets, thresh=0.5):
    """Draw detected bounding boxes.

    Args:
        im: BGR image (ndarray); converted to RGB for display.
        dets: (N, 5) array of [x1, y1, x2, y2, score] proposals.
        thresh: only boxes with score >= thresh are drawn.
    """
    inds = np.where(dets[:, -1] >= thresh)[0]
    if len(inds) == 0:
        return

    # RPN proposals are class-agnostic, so every box gets the same label.
    class_name = 'obj'
    im = im[:, :, (2, 1, 0)]
    fig, ax = plt.subplots(figsize=(12, 12))
    ax.imshow(im, aspect='equal')
    for i in inds:
        bbox = dets[i, :4]
        score = dets[i, -1]

        ax.add_patch(
            plt.Rectangle((bbox[0], bbox[1]),
                          bbox[2] - bbox[0],
                          bbox[3] - bbox[1], fill=False,
                          edgecolor='red', linewidth=3.5)
            )
        ax.text(bbox[0], bbox[1] - 2,
                '{:s} {:.3f}'.format(class_name, score),
                bbox=dict(facecolor='blue', alpha=0.5),
                fontsize=14, color='white')

    ax.set_title(('{} detections with '
                  'p({} | box) >= {:.1f}').format(class_name, class_name,
                                                  thresh),
                  fontsize=14)
    plt.axis('off')
    plt.tight_layout()
    plt.draw()

def _get_image_blob(im):
    """Converts an image into a network input.

    Arguments:
        im (ndarray): a color image in BGR order

    Returns:
        blob (ndarray): a data blob holding an image pyramid
        im_scale_factors (list): list of image scales (relative to im) used
            in the image pyramid
    """
    im_orig = im.astype(np.float32, copy=True)
    im_orig -= cfg.PIXEL_MEANS

    im_shape = im_orig.shape
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])

    processed_ims = []

    # RPN generation uses a single scale (no multi-scale pyramid)
    assert len(cfg.TEST.SCALES) == 1
    target_size = cfg.TEST.SCALES[0]

    # Scale so the shorter side hits target_size ...
    im_scale = float(target_size) / float(im_size_min)
    # Prevent the biggest axis from being more than MAX_SIZE
    if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
        im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
    im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
                    interpolation=cv2.INTER_LINEAR)
    # im_info = [height, width, scale], shaped (1, 3) for the network
    im_info = np.hstack((im.shape[:2], im_scale))[np.newaxis, :]
    processed_ims.append(im)

    # Create a blob to hold the input images
    blob = im_list_to_blob(processed_ims)

    return blob, im_info

def im_proposals(net, im):
    """Generate RPN proposals on a single image.

    Returns (boxes, scores): boxes are in the ORIGINAL image's coordinate
    frame (network outputs are divided by the resize scale).
    """
    blobs = {}
    blobs['data'], blobs['im_info'] = _get_image_blob(im)
    # Reshape network inputs to match this image's blob sizes
    net.blobs['data'].reshape(*(blobs['data'].shape))
    net.blobs['im_info'].reshape(*(blobs['im_info'].shape))
    blobs_out = net.forward(
        data=blobs['data'].astype(np.float32, copy=False),
        im_info=blobs['im_info'].astype(np.float32, copy=False))

    # Undo the input resize: column 0 of 'rois' is the batch index, dropped here
    scale = blobs['im_info'][0, 2]
    boxes = blobs_out['rois'][:, 1:].copy() / scale
    scores = blobs_out['scores'].copy()
    return boxes, scores

def imdb_proposals(net, imdb):
    """Generate RPN proposals on all images in an imdb.

    Returns a list (one entry per image) of proposal box arrays.
    """
    _t = Timer()
    imdb_boxes = [[] for _ in xrange(imdb.num_images)]
    for i in xrange(imdb.num_images):
        im = cv2.imread(imdb.image_path_at(i))
        _t.tic()
        imdb_boxes[i], scores = im_proposals(net, im)
        _t.toc()
        print 'im_proposals: {:d}/{:d} {:.3f}s' \
            .format(i + 1, imdb.num_images, _t.average_time)
        # Debug visualization, permanently disabled (`if 0:`)
        if 0:
            dets = np.hstack((imdb_boxes[i], scores))
            # from IPython import embed; embed()
            _vis_proposals(im, dets[:3, :], thresh=0.9)
            plt.show()

    return imdb_boxes
def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
                     scales=2**np.arange(3, 6)):
    """Generate anchor (reference) windows by enumerating aspect ratios X
    scales wrt a reference (0, 0, base_size - 1, base_size - 1) window.

    Args:
        base_size: side length of the reference window (default 16, one
            feature-map stride).
        ratios: aspect ratios (height/width) to enumerate.
        scales: multiplicative scales applied to each ratio anchor.

    Returns:
        (len(ratios) * len(scales), 4) array of [x1, y1, x2, y2] anchors
        (inclusive pixel coordinates). With the defaults this reproduces
        the canonical 9-anchor set, e.g. first row [-84, -40, 99, 55].
    """
    base_anchor = np.array([1, 1, base_size, base_size]) - 1
    ratio_anchors = _ratio_enum(base_anchor, ratios)
    # `range` (not the Python-2-only `xrange`) so this also runs on
    # Python 3; under Python 2 the iteration behavior is identical.
    anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales)
                         for i in range(ratio_anchors.shape[0])])
    return anchors


def _whctrs(anchor):
    """Return width, height, x center, and y center for an anchor (window).

    Coordinates are inclusive, hence the +1 when computing extents.
    """
    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr


def _mkanchors(ws, hs, x_ctr, y_ctr):
    """Given a vector of widths (ws) and heights (hs) around a center
    (x_ctr, y_ctr), output a set of anchors (windows) as (N, 4) rows of
    [x1, y1, x2, y2].
    """
    ws = ws[:, np.newaxis]
    hs = hs[:, np.newaxis]
    anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
                         y_ctr - 0.5 * (hs - 1),
                         x_ctr + 0.5 * (ws - 1),
                         y_ctr + 0.5 * (hs - 1)))
    return anchors


def _ratio_enum(anchor, ratios):
    """Enumerate a set of anchors for each aspect ratio wrt an anchor.

    Keeps the window area approximately constant while changing the
    aspect ratio (rounded to integer widths/heights).
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    size = w * h
    size_ratios = size / ratios
    ws = np.round(np.sqrt(size_ratios))
    hs = np.round(ws * ratios)
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors


def _scale_enum(anchor, scales):
    """Enumerate a set of anchors for each scale wrt an anchor.

    Scales both sides, keeping the center fixed.
    """
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    ws = w * scales
    hs = h * scales
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors
91 | """ 92 | 93 | w, h, x_ctr, y_ctr = _whctrs(anchor) 94 | ws = w * scales 95 | hs = h * scales 96 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 97 | return anchors 98 | 99 | if __name__ == '__main__': 100 | import time 101 | t = time.time() 102 | a = generate_anchors() 103 | print time.time() - t 104 | print a 105 | from IPython import embed; embed() 106 | -------------------------------------------------------------------------------- /lib/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | from setuptools import setup 11 | from distutils.extension import Extension 12 | from Cython.Distutils import build_ext 13 | import subprocess 14 | import numpy as np 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # Adapted fom 19 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 20 | for dir in path.split(os.pathsep): 21 | binpath = pjoin(dir, name) 22 | if os.path.exists(binpath): 23 | return os.path.abspath(binpath) 24 | return None 25 | 26 | 27 | def locate_cuda(): 28 | """Locate the CUDA environment on the system 29 | 30 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 31 | and values giving the absolute path to each directory. 32 | 33 | Starts by looking for the CUDAHOME env variable. If not found, everything 34 | is based on finding 'nvcc' in the PATH. 
35 | """ 36 | 37 | # first check if the CUDAHOME env variable is in use 38 | if 'CUDAHOME' in os.environ: 39 | home = os.environ['CUDAHOME'] 40 | nvcc = pjoin(home, 'bin', 'nvcc') 41 | else: 42 | # otherwise, search the PATH for NVCC 43 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 44 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 45 | if nvcc is None: 46 | raise EnvironmentError('The nvcc binary could not be ' 47 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 48 | home = os.path.dirname(os.path.dirname(nvcc)) 49 | 50 | cudaconfig = {'home':home, 'nvcc':nvcc, 51 | 'include': pjoin(home, 'include'), 52 | 'lib64': pjoin(home, 'lib64')} 53 | for k, v in cudaconfig.iteritems(): 54 | if not os.path.exists(v): 55 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 56 | 57 | return cudaconfig 58 | CUDA = locate_cuda() 59 | 60 | 61 | # Obtain the numpy include directory. This logic works across numpy versions. 62 | try: 63 | numpy_include = np.get_include() 64 | except AttributeError: 65 | numpy_include = np.get_numpy_include() 66 | 67 | def customize_compiler_for_nvcc(self): 68 | """inject deep into distutils to customize how the dispatch 69 | to gcc/nvcc works. 70 | 71 | If you subclass UnixCCompiler, it's not trivial to get your subclass 72 | injected in, and still have the right customizations (i.e. 73 | distutils.sysconfig.customize_compiler) run on it. So instead of going 74 | the OO route, I have this. Note, it's kindof like a wierd functional 75 | subclassing going on.""" 76 | 77 | # tell the compiler it can processes .cu 78 | self.src_extensions.append('.cu') 79 | 80 | # save references to the default compiler_so and _comple methods 81 | default_compiler_so = self.compiler_so 82 | super = self._compile 83 | 84 | # now redefine the _compile method. 
def customize_compiler_for_nvcc(self):
    """inject deep into distutils to customize how the dispatch
    to gcc/nvcc works.

    If you subclass UnixCCompiler, it's not trivial to get your subclass
    injected in, and still have the right customizations (i.e.
    distutils.sysconfig.customize_compiler) run on it. So instead of going
    the OO route, I have this. Note, it's kindof like a wierd functional
    subclassing going on.
    """
    # tell the compiler it can processes .cu
    self.src_extensions.append('.cu')

    # save references to the default compiler_so and _comple methods
    default_compiler_so = self.compiler_so
    # NOTE: deliberately shadows the `super` builtin to hold the original
    # _compile bound method — preserved for fidelity with upstream.
    super = self._compile

    # now redefine the _compile method. This gets executed for each
    # object but distutils doesn't have the ability to change compilers
    # based on source extension: we add it.
    def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
        if os.path.splitext(src)[1] == '.cu':
            # use the cuda for .cu files
            self.set_executable('compiler_so', CUDA['nvcc'])
            # use only a subset of the extra_postargs, which are 1-1 translated
            # from the extra_compile_args in the Extension class
            postargs = extra_postargs['nvcc']
        else:
            postargs = extra_postargs['gcc']

        super(obj, src, ext, cc_args, postargs, pp_opts)
        # reset the default compiler_so, which we might have changed for cuda
        self.compiler_so = default_compiler_so

    # inject our redefined _compile method into the class
    self._compile = _compile


# run the customize_compiler
class custom_build_ext(build_ext):
    # build_ext subclass whose only job is to patch the compiler instance
    # (see customize_compiler_for_nvcc) before building extensions.
    def build_extensions(self):
        customize_compiler_for_nvcc(self.compiler)
        build_ext.build_extensions(self)


# Extension modules: Cython bbox/nms helpers, the CUDA NMS kernel, and the
# bundled pycocotools mask routines.
ext_modules = [
    Extension(
        "utils.cython_bbox",
        ["utils/bbox.pyx"],
        extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]},
        include_dirs = [numpy_include]
    ),
    Extension(
        "nms.cpu_nms",
        ["nms/cpu_nms.pyx"],
        extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]},
        include_dirs = [numpy_include]
    ),
    Extension('nms.gpu_nms',
        ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'],
        library_dirs=[CUDA['lib64']],
        libraries=['cudart'],
        language='c++',
        runtime_library_dirs=[CUDA['lib64']],
        # this syntax is specific to this build system
        # we're only going to use certain compiler args with nvcc and not with
        # gcc the implementation of this trick is in customize_compiler() below
        extra_compile_args={'gcc': ["-Wno-unused-function"],
                            'nvcc': ['-arch=sm_35',
                                     '--ptxas-options=-v',
                                     '-c',
                                     '--compiler-options',
                                     "'-fPIC'"]},
        include_dirs = [numpy_include, CUDA['include']]
    ),
    Extension(
        'pycocotools._mask',
        sources=['pycocotools/maskApi.c', 'pycocotools/_mask.pyx'],
        include_dirs = [numpy_include, 'pycocotools'],
        extra_compile_args={
            'gcc': ['-Wno-cpp', '-Wno-unused-function', '-std=c99']},
    ),
]

setup(
    name='fast_rcnn',
    ext_modules=ext_modules,
    # inject our custom trigger
    cmdclass={'build_ext': custom_build_ext},
)
# --------------------------------------------------------
# Fast/er R-CNN
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------

""" Transform images for compatibility with models trained with
https://github.com/facebook/fb.resnet.torch.

Usage in model prototxt:

layer {
  name: 'data_xform'
  type: 'Python'
  bottom: 'data_caffe'
  top: 'data'
  python_param {
    module: 'transform.torch_image_transform_layer'
    layer: 'TorchImageTransformLayer'
  }
}
"""

import caffe
from fast_rcnn.config import cfg
import numpy as np

class TorchImageTransformLayer(caffe.Layer):
    """Caffe Python layer that converts Caffe-style input blobs
    (BGR, mean-subtracted, 0-255) into Torch-style blobs
    (RGB, [0, 1], per-channel standardized).
    """

    def setup(self, bottom, top):
        # (1, 3, 1, 1) shaped arrays
        # presumably the fb.resnet.torch ImageNet RGB channel means/stds
        # in [0, 1] — TODO confirm against the upstream Torch repo
        self.PIXEL_MEANS = \
            np.array([[[[0.48462227599918]],
                       [[0.45624044862054]],
                       [[0.40588363755159]]]])
        self.PIXEL_STDS = \
            np.array([[[[0.22889466674951]],
                       [[0.22446679341259]],
                       [[0.22495548344775]]]])
        # The default ("old") pixel means that were already subtracted
        # cfg.PIXEL_MEANS is HxWxC; reshape to (1, C, H, W) to match blobs
        channel_swap = (0, 3, 1, 2)
        self.OLD_PIXEL_MEANS = \
            cfg.PIXEL_MEANS[np.newaxis, :, :, :].transpose(channel_swap)

        top[0].reshape(*(bottom[0].shape))

    def forward(self, bottom, top):
        ims = bottom[0].data
        # Invert the channel means that were already subtracted
        ims += self.OLD_PIXEL_MEANS
        # 1. Permute BGR to RGB and normalize to [0, 1]
        ims = ims[:, [2, 1, 0], :, :] / 255.0
        # 2. Remove channel means
        ims -= self.PIXEL_MEANS
        # 3. Standardize channels
        ims /= self.PIXEL_STDS
        top[0].reshape(*(ims.shape))
        top[0].data[...] = ims

    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""
        pass

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass
cimport cython
import numpy as np
cimport numpy as np

DTYPE = np.float
ctypedef np.float_t DTYPE_t

def bbox_overlaps(
        np.ndarray[DTYPE_t, ndim=2] boxes,
        np.ndarray[DTYPE_t, ndim=2] query_boxes):
    """
    Compute pairwise IoU (intersection over union) between two box sets.

    Boxes are [x1, y1, x2, y2] with inclusive integer-pixel coordinates,
    hence the "+ 1" when computing widths/heights/areas.

    Parameters
    ----------
    boxes: (N, 4) ndarray of float
    query_boxes: (K, 4) ndarray of float
    Returns
    -------
    overlaps: (N, K) ndarray of overlap between boxes and query_boxes
        (entries stay 0 where the boxes do not intersect)
    """
    cdef unsigned int N = boxes.shape[0]
    cdef unsigned int K = query_boxes.shape[0]
    cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
    cdef DTYPE_t iw, ih, box_area
    cdef DTYPE_t ua
    cdef unsigned int k, n
    for k in range(K):
        # area of the query box, hoisted out of the inner loop
        box_area = (
            (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
            (query_boxes[k, 3] - query_boxes[k, 1] + 1)
        )
        for n in range(N):
            # intersection width; <= 0 means no horizontal overlap
            iw = (
                min(boxes[n, 2], query_boxes[k, 2]) -
                max(boxes[n, 0], query_boxes[k, 0]) + 1
            )
            if iw > 0:
                # intersection height; <= 0 means no vertical overlap
                ih = (
                    min(boxes[n, 3], query_boxes[k, 3]) -
                    max(boxes[n, 1], query_boxes[k, 1]) + 1
                )
                if ih > 0:
                    # union area = area(box) + area(query) - intersection
                    ua = float(
                        (boxes[n, 2] - boxes[n, 0] + 1) *
                        (boxes[n, 3] - boxes[n, 1] + 1) +
                        box_area - iw * ih
                    )
                    overlaps[n, k] = iw * ih / ua
    return overlaps
17 | """ 18 | max_shape = np.array([im.shape for im in ims]).max(axis=0) 19 | num_images = len(ims) 20 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 21 | dtype=np.float32) 22 | for i in xrange(num_images): 23 | im = ims[i] 24 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 25 | # Move channels (axis 3) to axis 1 26 | # Axis order will become: (batch elem, channel, height, width) 27 | channel_swap = (0, 3, 1, 2) 28 | blob = blob.transpose(channel_swap) 29 | return blob 30 | 31 | def prep_im_for_blob(im, pixel_means, target_size, max_size): 32 | """Mean subtract and scale an image for use in a blob.""" 33 | im = im.astype(np.float32, copy=False) 34 | im -= pixel_means 35 | im_shape = im.shape 36 | im_size_min = np.min(im_shape[0:2]) 37 | im_size_max = np.max(im_shape[0:2]) 38 | im_scale = float(target_size) / float(im_size_min) 39 | # Prevent the biggest axis from being more than MAX_SIZE 40 | if np.round(im_scale * im_size_max) > max_size: 41 | im_scale = float(max_size) / float(im_size_max) 42 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 43 | interpolation=cv2.INTER_LINEAR) 44 | 45 | return im, im_scale 46 | -------------------------------------------------------------------------------- /lib/utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | class Timer(object): 11 | """A simple timer.""" 12 | def __init__(self): 13 | self.total_time = 0. 14 | self.calls = 0 15 | self.start_time = 0. 16 | self.diff = 0. 17 | self.average_time = 0. 
class Timer(object):
    """A simple accumulating timer: tic()/toc() pairs update a running
    total, call count, and average."""

    def __init__(self):
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.

    def tic(self):
        # time.time (not time.clock): clock does not normalize for
        # multithreading
        self.start_time = time.time()

    def toc(self, average=True):
        """Stop the current interval; return the running average by
        default, or this interval's duration if average is False."""
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        return self.average_time if average else self.diff
@[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.120 32 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.385 33 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.544 34 | ``` 35 | 36 | test-standard2015 results 37 | ``` 38 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.242 39 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.453 40 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.234 41 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.072 42 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.264 43 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.369 44 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.238 45 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.341 46 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.347 47 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.115 48 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.389 49 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.544 50 | ``` 51 | -------------------------------------------------------------------------------- /models/coco/VGG16/fast_rcnn/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/coco/VGG16/fast_rcnn/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 200000 6 | display: 20 7 | average_loss: 100 8 | # iter_size: 1 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | # We still use the snapshot prefix, though 15 | snapshot_prefix: "vgg16_fast_rcnn" 16 | #debug_info: true 17 | -------------------------------------------------------------------------------- 
/models/coco/VGG16/faster_rcnn_end2end/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/coco/VGG16/faster_rcnn_end2end/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 350000 6 | display: 20 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 11 | # function 12 | snapshot: 0 13 | # We still use the snapshot prefix, though 14 | snapshot_prefix: "vgg16_faster_rcnn" 15 | iter_size: 2 16 | -------------------------------------------------------------------------------- /models/coco/VGG_CNN_M_1024/fast_rcnn/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/coco/VGG_CNN_M_1024/fast_rcnn/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 200000 6 | display: 20 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 11 | # function 12 | snapshot: 0 13 | # We still use the snapshot prefix, though 14 | snapshot_prefix: "vgg_cnn_m_1024_fast_rcnn" 15 | #debug_info: true 16 | -------------------------------------------------------------------------------- /models/coco/VGG_CNN_M_1024/fast_rcnn/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 224 7 | dim: 224 8 | } 9 | input: "rois" 10 | input_shape { 11 | dim: 1 # to be changed on-the-fly to num ROIs 12 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | param { 20 | lr_mult: 0 21 | decay_mult: 0 22 | } 23 | param { 24 | lr_mult: 0 25 | decay_mult: 0 26 | } 27 | convolution_param { 
28 | num_output: 96 29 | kernel_size: 7 30 | stride: 2 31 | } 32 | } 33 | layer { 34 | name: "relu1" 35 | type: "ReLU" 36 | bottom: "conv1" 37 | top: "conv1" 38 | } 39 | layer { 40 | name: "norm1" 41 | type: "LRN" 42 | bottom: "conv1" 43 | top: "norm1" 44 | lrn_param { 45 | local_size: 5 46 | alpha: 0.0005 47 | beta: 0.75 48 | k: 2 49 | } 50 | } 51 | layer { 52 | name: "pool1" 53 | type: "Pooling" 54 | bottom: "norm1" 55 | top: "pool1" 56 | pooling_param { 57 | pool: MAX 58 | kernel_size: 3 59 | stride: 2 60 | } 61 | } 62 | layer { 63 | name: "conv2" 64 | type: "Convolution" 65 | bottom: "pool1" 66 | top: "conv2" 67 | param { 68 | lr_mult: 1 69 | decay_mult: 1 70 | } 71 | param { 72 | lr_mult: 2 73 | decay_mult: 0 74 | } 75 | convolution_param { 76 | num_output: 256 77 | pad: 1 78 | kernel_size: 5 79 | stride: 2 80 | } 81 | } 82 | layer { 83 | name: "relu2" 84 | type: "ReLU" 85 | bottom: "conv2" 86 | top: "conv2" 87 | } 88 | layer { 89 | name: "norm2" 90 | type: "LRN" 91 | bottom: "conv2" 92 | top: "norm2" 93 | lrn_param { 94 | local_size: 5 95 | alpha: 0.0005 96 | beta: 0.75 97 | k: 2 98 | } 99 | } 100 | layer { 101 | name: "pool2" 102 | type: "Pooling" 103 | bottom: "norm2" 104 | top: "pool2" 105 | pooling_param { 106 | pool: MAX 107 | kernel_size: 3 108 | stride: 2 109 | } 110 | } 111 | layer { 112 | name: "conv3" 113 | type: "Convolution" 114 | bottom: "pool2" 115 | top: "conv3" 116 | param { 117 | lr_mult: 1 118 | decay_mult: 1 119 | } 120 | param { 121 | lr_mult: 2 122 | decay_mult: 0 123 | } 124 | convolution_param { 125 | num_output: 512 126 | pad: 1 127 | kernel_size: 3 128 | } 129 | } 130 | layer { 131 | name: "relu3" 132 | type: "ReLU" 133 | bottom: "conv3" 134 | top: "conv3" 135 | } 136 | layer { 137 | name: "conv4" 138 | type: "Convolution" 139 | bottom: "conv3" 140 | top: "conv4" 141 | param { 142 | lr_mult: 1 143 | decay_mult: 1 144 | } 145 | param { 146 | lr_mult: 2 147 | decay_mult: 0 148 | } 149 | convolution_param { 150 | num_output: 512 151 | 
pad: 1 152 | kernel_size: 3 153 | } 154 | } 155 | layer { 156 | name: "relu4" 157 | type: "ReLU" 158 | bottom: "conv4" 159 | top: "conv4" 160 | } 161 | layer { 162 | name: "conv5" 163 | type: "Convolution" 164 | bottom: "conv4" 165 | top: "conv5" 166 | param { 167 | lr_mult: 1 168 | decay_mult: 1 169 | } 170 | param { 171 | lr_mult: 2 172 | decay_mult: 0 173 | } 174 | convolution_param { 175 | num_output: 512 176 | pad: 1 177 | kernel_size: 3 178 | } 179 | } 180 | layer { 181 | name: "relu5" 182 | type: "ReLU" 183 | bottom: "conv5" 184 | top: "conv5" 185 | } 186 | layer { 187 | name: "roi_pool5" 188 | type: "ROIPooling" 189 | bottom: "conv5" 190 | bottom: "rois" 191 | top: "pool5" 192 | roi_pooling_param { 193 | pooled_w: 6 194 | pooled_h: 6 195 | spatial_scale: 0.0625 # 1/16 196 | } 197 | } 198 | layer { 199 | name: "fc6" 200 | type: "InnerProduct" 201 | bottom: "pool5" 202 | top: "fc6" 203 | param { 204 | lr_mult: 1 205 | decay_mult: 1 206 | } 207 | param { 208 | lr_mult: 2 209 | decay_mult: 0 210 | } 211 | inner_product_param { 212 | num_output: 4096 213 | } 214 | } 215 | layer { 216 | name: "relu6" 217 | type: "ReLU" 218 | bottom: "fc6" 219 | top: "fc6" 220 | } 221 | layer { 222 | name: "fc7" 223 | type: "InnerProduct" 224 | bottom: "fc6" 225 | top: "fc7" 226 | param { 227 | lr_mult: 1 228 | decay_mult: 1 229 | } 230 | param { 231 | lr_mult: 2 232 | decay_mult: 0 233 | } 234 | inner_product_param { 235 | num_output: 1024 236 | } 237 | } 238 | layer { 239 | name: "relu7" 240 | type: "ReLU" 241 | bottom: "fc7" 242 | top: "fc7" 243 | } 244 | layer { 245 | name: "cls_score" 246 | type: "InnerProduct" 247 | bottom: "fc7" 248 | top: "cls_score" 249 | param { 250 | lr_mult: 1 251 | decay_mult: 1 252 | } 253 | param { 254 | lr_mult: 2 255 | decay_mult: 0 256 | } 257 | inner_product_param { 258 | num_output: 81 259 | weight_filler { 260 | type: "gaussian" 261 | std: 0.01 262 | } 263 | bias_filler { 264 | type: "constant" 265 | value: 0 266 | } 267 | } 268 | } 269 | 
layer { 270 | name: "bbox_pred" 271 | type: "InnerProduct" 272 | bottom: "fc7" 273 | top: "bbox_pred" 274 | param { 275 | lr_mult: 1 276 | decay_mult: 1 277 | } 278 | param { 279 | lr_mult: 2 280 | decay_mult: 0 281 | } 282 | inner_product_param { 283 | num_output: 324 284 | weight_filler { 285 | type: "gaussian" 286 | std: 0.001 287 | } 288 | bias_filler { 289 | type: "constant" 290 | value: 0 291 | } 292 | } 293 | } 294 | layer { 295 | name: "cls_prob" 296 | type: "Softmax" 297 | bottom: "cls_score" 298 | top: "cls_prob" 299 | } 300 | -------------------------------------------------------------------------------- /models/coco/VGG_CNN_M_1024/fast_rcnn/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | layer { 3 | name: 'data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'rois' 7 | top: 'labels' 8 | top: 'bbox_targets' 9 | top: 'bbox_inside_weights' 10 | top: 'bbox_outside_weights' 11 | python_param { 12 | module: 'roi_data_layer.layer' 13 | layer: 'RoIDataLayer' 14 | param_str: "'num_classes': 81" 15 | } 16 | } 17 | layer { 18 | name: "conv1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1" 22 | param { lr_mult: 0 decay_mult: 0 } 23 | param { lr_mult: 0 decay_mult: 0 } 24 | convolution_param { 25 | num_output: 96 26 | kernel_size: 7 27 | stride: 2 28 | } 29 | } 30 | layer { 31 | name: "relu1" 32 | type: "ReLU" 33 | bottom: "conv1" 34 | top: "conv1" 35 | } 36 | layer { 37 | name: "norm1" 38 | type: "LRN" 39 | bottom: "conv1" 40 | top: "norm1" 41 | lrn_param { 42 | local_size: 5 43 | alpha: 0.0005 44 | beta: 0.75 45 | k: 2 46 | } 47 | } 48 | layer { 49 | name: "pool1" 50 | type: "Pooling" 51 | bottom: "norm1" 52 | top: "pool1" 53 | pooling_param { 54 | pool: MAX 55 | kernel_size: 3 56 | stride: 2 57 | } 58 | } 59 | layer { 60 | name: "conv2" 61 | type: "Convolution" 62 | bottom: "pool1" 63 | top: "conv2" 64 | param { 65 | lr_mult: 1 66 | } 67 | param { 68 | lr_mult: 2 69 | } 
70 | convolution_param { 71 | num_output: 256 72 | pad: 1 73 | kernel_size: 5 74 | stride: 2 75 | } 76 | } 77 | layer { 78 | name: "relu2" 79 | type: "ReLU" 80 | bottom: "conv2" 81 | top: "conv2" 82 | } 83 | layer { 84 | name: "norm2" 85 | type: "LRN" 86 | bottom: "conv2" 87 | top: "norm2" 88 | lrn_param { 89 | local_size: 5 90 | alpha: 0.0005 91 | beta: 0.75 92 | k: 2 93 | } 94 | } 95 | layer { 96 | name: "pool2" 97 | type: "Pooling" 98 | bottom: "norm2" 99 | top: "pool2" 100 | pooling_param { 101 | pool: MAX 102 | kernel_size: 3 103 | stride: 2 104 | } 105 | } 106 | layer { 107 | name: "conv3" 108 | type: "Convolution" 109 | bottom: "pool2" 110 | top: "conv3" 111 | param { 112 | lr_mult: 1 113 | } 114 | param { 115 | lr_mult: 2 116 | } 117 | convolution_param { 118 | num_output: 512 119 | pad: 1 120 | kernel_size: 3 121 | } 122 | } 123 | layer { 124 | name: "relu3" 125 | type: "ReLU" 126 | bottom: "conv3" 127 | top: "conv3" 128 | } 129 | layer { 130 | name: "conv4" 131 | type: "Convolution" 132 | bottom: "conv3" 133 | top: "conv4" 134 | param { 135 | lr_mult: 1 136 | } 137 | param { 138 | lr_mult: 2 139 | } 140 | convolution_param { 141 | num_output: 512 142 | pad: 1 143 | kernel_size: 3 144 | } 145 | } 146 | layer { 147 | name: "relu4" 148 | type: "ReLU" 149 | bottom: "conv4" 150 | top: "conv4" 151 | } 152 | layer { 153 | name: "conv5" 154 | type: "Convolution" 155 | bottom: "conv4" 156 | top: "conv5" 157 | param { 158 | lr_mult: 1 159 | } 160 | param { 161 | lr_mult: 2 162 | } 163 | convolution_param { 164 | num_output: 512 165 | pad: 1 166 | kernel_size: 3 167 | } 168 | } 169 | layer { 170 | name: "relu5" 171 | type: "ReLU" 172 | bottom: "conv5" 173 | top: "conv5" 174 | } 175 | layer { 176 | name: "roi_pool5" 177 | type: "ROIPooling" 178 | bottom: "conv5" 179 | bottom: "rois" 180 | top: "pool5" 181 | roi_pooling_param { 182 | pooled_w: 6 183 | pooled_h: 6 184 | spatial_scale: 0.0625 # 1/16 185 | } 186 | } 187 | layer { 188 | name: "fc6" 189 | type: 
"InnerProduct" 190 | bottom: "pool5" 191 | top: "fc6" 192 | param { 193 | lr_mult: 1 194 | } 195 | param { 196 | lr_mult: 2 197 | } 198 | inner_product_param { 199 | num_output: 4096 200 | } 201 | } 202 | layer { 203 | name: "relu6" 204 | type: "ReLU" 205 | bottom: "fc6" 206 | top: "fc6" 207 | } 208 | layer { 209 | name: "fc7" 210 | type: "InnerProduct" 211 | bottom: "fc6" 212 | top: "fc7" 213 | param { 214 | lr_mult: 1 215 | } 216 | param { 217 | lr_mult: 2 218 | } 219 | inner_product_param { 220 | num_output: 1024 221 | } 222 | } 223 | layer { 224 | name: "relu7" 225 | type: "ReLU" 226 | bottom: "fc7" 227 | top: "fc7" 228 | } 229 | layer { 230 | name: "cls_score" 231 | type: "InnerProduct" 232 | bottom: "fc7" 233 | top: "cls_score" 234 | param { 235 | lr_mult: 1 236 | } 237 | param { 238 | lr_mult: 2 239 | } 240 | inner_product_param { 241 | num_output: 81 242 | weight_filler { 243 | type: "gaussian" 244 | std: 0.01 245 | } 246 | bias_filler { 247 | type: "constant" 248 | value: 0 249 | } 250 | } 251 | } 252 | layer { 253 | name: "bbox_pred" 254 | type: "InnerProduct" 255 | bottom: "fc7" 256 | top: "bbox_pred" 257 | param { 258 | lr_mult: 1 259 | } 260 | param { 261 | lr_mult: 2 262 | } 263 | inner_product_param { 264 | num_output: 324 265 | weight_filler { 266 | type: "gaussian" 267 | std: 0.001 268 | } 269 | bias_filler { 270 | type: "constant" 271 | value: 0 272 | } 273 | } 274 | } 275 | layer { 276 | name: "loss_cls" 277 | type: "SoftmaxWithLoss" 278 | bottom: "cls_score" 279 | bottom: "labels" 280 | top: "loss_cls" 281 | loss_weight: 1 282 | } 283 | layer { 284 | name: "loss_bbox" 285 | type: "SmoothL1Loss" 286 | bottom: "bbox_pred" 287 | bottom: "bbox_targets" 288 | bottom: "bbox_inside_weights" 289 | bottom: "bbox_outside_weights" 290 | top: "loss_bbox" 291 | loss_weight: 1 292 | } 293 | -------------------------------------------------------------------------------- /models/coco/VGG_CNN_M_1024/faster_rcnn_end2end/solver.prototxt: 
-------------------------------------------------------------------------------- 1 | train_net: "models/coco/VGG_CNN_M_1024/faster_rcnn_end2end/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 350000 6 | display: 20 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 11 | # function 12 | snapshot: 0 13 | # We still use the snapshot prefix, though 14 | snapshot_prefix: "vgg_cnn_m_1024_faster_rcnn" 15 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/fast_rcnn/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG16/fast_rcnn/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | # iter_size: 1 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | # We still use the snapshot prefix, though 15 | snapshot_prefix: "vgg16_fast_rcnn" 16 | #debug_info: true 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/faster_rcnn_alt_opt/rpn_test.pt: -------------------------------------------------------------------------------- 1 | name: "VGG_ILSVRC_16_layers" 2 | 3 | input: "data" 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 224 8 | dim: 224 9 | } 10 | 11 | input: "im_info" 12 | input_shape { 13 | dim: 1 14 | dim: 3 15 | } 16 | 17 | layer { 18 | name: "conv1_1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1_1" 22 | convolution_param { 23 | num_output: 64 24 | pad: 1 kernel_size: 3 25 | } 26 | } 27 | layer { 28 | name: "relu1_1" 29 | type: "ReLU" 30 | bottom: "conv1_1" 31 | top: "conv1_1" 32 | } 33 | layer { 34 | name: "conv1_2" 35 | type: 
"Convolution" 36 | bottom: "conv1_1" 37 | top: "conv1_2" 38 | convolution_param { 39 | num_output: 64 40 | pad: 1 kernel_size: 3 41 | } 42 | } 43 | layer { 44 | name: "relu1_2" 45 | type: "ReLU" 46 | bottom: "conv1_2" 47 | top: "conv1_2" 48 | } 49 | layer { 50 | name: "pool1" 51 | type: "Pooling" 52 | bottom: "conv1_2" 53 | top: "pool1" 54 | pooling_param { 55 | pool: MAX 56 | kernel_size: 2 stride: 2 57 | } 58 | } 59 | layer { 60 | name: "conv2_1" 61 | type: "Convolution" 62 | bottom: "pool1" 63 | top: "conv2_1" 64 | convolution_param { 65 | num_output: 128 66 | pad: 1 kernel_size: 3 67 | } 68 | } 69 | layer { 70 | name: "relu2_1" 71 | type: "ReLU" 72 | bottom: "conv2_1" 73 | top: "conv2_1" 74 | } 75 | layer { 76 | name: "conv2_2" 77 | type: "Convolution" 78 | bottom: "conv2_1" 79 | top: "conv2_2" 80 | convolution_param { 81 | num_output: 128 82 | pad: 1 kernel_size: 3 83 | } 84 | } 85 | layer { 86 | name: "relu2_2" 87 | type: "ReLU" 88 | bottom: "conv2_2" 89 | top: "conv2_2" 90 | } 91 | layer { 92 | name: "pool2" 93 | type: "Pooling" 94 | bottom: "conv2_2" 95 | top: "pool2" 96 | pooling_param { 97 | pool: MAX 98 | kernel_size: 2 stride: 2 99 | } 100 | } 101 | layer { 102 | name: "conv3_1" 103 | type: "Convolution" 104 | bottom: "pool2" 105 | top: "conv3_1" 106 | convolution_param { 107 | num_output: 256 108 | pad: 1 kernel_size: 3 109 | } 110 | } 111 | layer { 112 | name: "relu3_1" 113 | type: "ReLU" 114 | bottom: "conv3_1" 115 | top: "conv3_1" 116 | } 117 | layer { 118 | name: "conv3_2" 119 | type: "Convolution" 120 | bottom: "conv3_1" 121 | top: "conv3_2" 122 | convolution_param { 123 | num_output: 256 124 | pad: 1 kernel_size: 3 125 | } 126 | } 127 | layer { 128 | name: "relu3_2" 129 | type: "ReLU" 130 | bottom: "conv3_2" 131 | top: "conv3_2" 132 | } 133 | layer { 134 | name: "conv3_3" 135 | type: "Convolution" 136 | bottom: "conv3_2" 137 | top: "conv3_3" 138 | convolution_param { 139 | num_output: 256 140 | pad: 1 kernel_size: 3 141 | } 142 | } 143 | layer { 
144 | name: "relu3_3" 145 | type: "ReLU" 146 | bottom: "conv3_3" 147 | top: "conv3_3" 148 | } 149 | layer { 150 | name: "pool3" 151 | type: "Pooling" 152 | bottom: "conv3_3" 153 | top: "pool3" 154 | pooling_param { 155 | pool: MAX 156 | kernel_size: 2 stride: 2 157 | } 158 | } 159 | layer { 160 | name: "conv4_1" 161 | type: "Convolution" 162 | bottom: "pool3" 163 | top: "conv4_1" 164 | convolution_param { 165 | num_output: 512 166 | pad: 1 kernel_size: 3 167 | } 168 | } 169 | layer { 170 | name: "relu4_1" 171 | type: "ReLU" 172 | bottom: "conv4_1" 173 | top: "conv4_1" 174 | } 175 | layer { 176 | name: "conv4_2" 177 | type: "Convolution" 178 | bottom: "conv4_1" 179 | top: "conv4_2" 180 | convolution_param { 181 | num_output: 512 182 | pad: 1 kernel_size: 3 183 | } 184 | } 185 | layer { 186 | name: "relu4_2" 187 | type: "ReLU" 188 | bottom: "conv4_2" 189 | top: "conv4_2" 190 | } 191 | layer { 192 | name: "conv4_3" 193 | type: "Convolution" 194 | bottom: "conv4_2" 195 | top: "conv4_3" 196 | convolution_param { 197 | num_output: 512 198 | pad: 1 kernel_size: 3 199 | } 200 | } 201 | layer { 202 | name: "relu4_3" 203 | type: "ReLU" 204 | bottom: "conv4_3" 205 | top: "conv4_3" 206 | } 207 | layer { 208 | name: "pool4" 209 | type: "Pooling" 210 | bottom: "conv4_3" 211 | top: "pool4" 212 | pooling_param { 213 | pool: MAX 214 | kernel_size: 2 stride: 2 215 | } 216 | } 217 | layer { 218 | name: "conv5_1" 219 | type: "Convolution" 220 | bottom: "pool4" 221 | top: "conv5_1" 222 | convolution_param { 223 | num_output: 512 224 | pad: 1 kernel_size: 3 225 | } 226 | } 227 | layer { 228 | name: "relu5_1" 229 | type: "ReLU" 230 | bottom: "conv5_1" 231 | top: "conv5_1" 232 | } 233 | layer { 234 | name: "conv5_2" 235 | type: "Convolution" 236 | bottom: "conv5_1" 237 | top: "conv5_2" 238 | convolution_param { 239 | num_output: 512 240 | pad: 1 kernel_size: 3 241 | } 242 | } 243 | layer { 244 | name: "relu5_2" 245 | type: "ReLU" 246 | bottom: "conv5_2" 247 | top: "conv5_2" 248 | } 249 | 
layer { 250 | name: "conv5_3" 251 | type: "Convolution" 252 | bottom: "conv5_2" 253 | top: "conv5_3" 254 | convolution_param { 255 | num_output: 512 256 | pad: 1 kernel_size: 3 257 | } 258 | } 259 | layer { 260 | name: "relu5_3" 261 | type: "ReLU" 262 | bottom: "conv5_3" 263 | top: "conv5_3" 264 | } 265 | 266 | #========= RPN ============ 267 | 268 | layer { 269 | name: "rpn_conv/3x3" 270 | type: "Convolution" 271 | bottom: "conv5_3" 272 | top: "rpn/output" 273 | convolution_param { 274 | num_output: 512 275 | kernel_size: 3 pad: 1 stride: 1 276 | } 277 | } 278 | layer { 279 | name: "rpn_relu/3x3" 280 | type: "ReLU" 281 | bottom: "rpn/output" 282 | top: "rpn/output" 283 | } 284 | 285 | layer { 286 | name: "rpn_cls_score" 287 | type: "Convolution" 288 | bottom: "rpn/output" 289 | top: "rpn_cls_score" 290 | convolution_param { 291 | num_output: 18 # 2(bg/fg) * 9(anchors) 292 | kernel_size: 1 pad: 0 stride: 1 293 | } 294 | } 295 | layer { 296 | name: "rpn_bbox_pred" 297 | type: "Convolution" 298 | bottom: "rpn/output" 299 | top: "rpn_bbox_pred" 300 | convolution_param { 301 | num_output: 36 # 4 * 9(anchors) 302 | kernel_size: 1 pad: 0 stride: 1 303 | } 304 | } 305 | layer { 306 | bottom: "rpn_cls_score" 307 | top: "rpn_cls_score_reshape" 308 | name: "rpn_cls_score_reshape" 309 | type: "Reshape" 310 | reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } 311 | } 312 | 313 | #========= RoI Proposal ============ 314 | 315 | layer { 316 | name: "rpn_cls_prob" 317 | type: "Softmax" 318 | bottom: "rpn_cls_score_reshape" 319 | top: "rpn_cls_prob" 320 | } 321 | layer { 322 | name: 'rpn_cls_prob_reshape' 323 | type: 'Reshape' 324 | bottom: 'rpn_cls_prob' 325 | top: 'rpn_cls_prob_reshape' 326 | reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } 327 | } 328 | layer { 329 | name: 'proposal' 330 | type: 'Python' 331 | bottom: 'rpn_cls_prob_reshape' 332 | bottom: 'rpn_bbox_pred' 333 | bottom: 'im_info' 334 | top: 'rois' 335 | top: 'scores' 336 | python_param { 337 | 
module: 'rpn.proposal_layer' 338 | layer: 'ProposalLayer' 339 | param_str: "'feat_stride': 16" 340 | } 341 | } 342 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage1_fast_rcnn_solver30k40k.pt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage1_fast_rcnn_train.pt" 2 | 3 | base_lr: 0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 30000 7 | display: 20 8 | average_loss: 100 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | 12 | # We disable standard caffe solver snapshotting and implement our own snapshot 13 | # function 14 | snapshot: 0 15 | # We still use the snapshot prefix, though 16 | snapshot_prefix: "vgg16_fast_rcnn" 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage1_rpn_solver60k80k.pt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage1_rpn_train.pt" 2 | 3 | base_lr: 0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 60000 7 | display: 20 8 | average_loss: 100 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | 12 | # We disable standard caffe solver snapshotting and implement our own snapshot 13 | # function 14 | snapshot: 0 15 | # We still use the snapshot prefix, though 16 | snapshot_prefix: "vgg16_rpn" 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage2_fast_rcnn_solver30k40k.pt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage2_fast_rcnn_train.pt" 2 | 3 | base_lr: 0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 30000 7 | display: 20 8 | average_loss: 100 9 | momentum: 0.9 10 | weight_decay: 0.0005 
11 | 12 | # We disable standard caffe solver snapshotting and implement our own snapshot 13 | # function 14 | snapshot: 0 15 | # We still use the snapshot prefix, though 16 | snapshot_prefix: "vgg16_fast_rcnn" 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage2_rpn_solver60k80k.pt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG16/faster_rcnn_alt_opt/stage2_rpn_train.pt" 2 | 3 | base_lr: 0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 60000 7 | display: 20 8 | average_loss: 100 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | 12 | # We disable standard caffe solver snapshotting and implement our own snapshot 13 | # function 14 | snapshot: 0 15 | # We still use the snapshot prefix, though 16 | snapshot_prefix: "vgg16_rpn" 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG16/faster_rcnn_end2end/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG16/faster_rcnn_end2end/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 50000 6 | display: 20 7 | average_loss: 100 8 | # iter_size: 1 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | # We still use the snapshot prefix, though 15 | snapshot_prefix: "vgg16_faster_rcnn" 16 | iter_size: 2 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/fast_rcnn/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG_CNN_M_1024/fast_rcnn/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 
7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 11 | # function 12 | snapshot: 0 13 | # We still use the snapshot prefix, though 14 | snapshot_prefix: "vgg_cnn_m_1024_fast_rcnn" 15 | #debug_info: true 16 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/fast_rcnn/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 224 7 | dim: 224 8 | } 9 | input: "rois" 10 | input_shape { 11 | dim: 1 # to be changed on-the-fly to num ROIs 12 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | param { 20 | lr_mult: 0 21 | decay_mult: 0 22 | } 23 | param { 24 | lr_mult: 0 25 | decay_mult: 0 26 | } 27 | convolution_param { 28 | num_output: 96 29 | kernel_size: 7 30 | stride: 2 31 | } 32 | } 33 | layer { 34 | name: "relu1" 35 | type: "ReLU" 36 | bottom: "conv1" 37 | top: "conv1" 38 | } 39 | layer { 40 | name: "norm1" 41 | type: "LRN" 42 | bottom: "conv1" 43 | top: "norm1" 44 | lrn_param { 45 | local_size: 5 46 | alpha: 0.0005 47 | beta: 0.75 48 | k: 2 49 | } 50 | } 51 | layer { 52 | name: "pool1" 53 | type: "Pooling" 54 | bottom: "norm1" 55 | top: "pool1" 56 | pooling_param { 57 | pool: MAX 58 | kernel_size: 3 59 | stride: 2 60 | } 61 | } 62 | layer { 63 | name: "conv2" 64 | type: "Convolution" 65 | bottom: "pool1" 66 | top: "conv2" 67 | param { 68 | lr_mult: 1 69 | decay_mult: 1 70 | } 71 | param { 72 | lr_mult: 2 73 | decay_mult: 0 74 | } 75 | convolution_param { 76 | num_output: 256 77 | pad: 1 78 | kernel_size: 5 79 | stride: 2 80 | } 81 | } 82 | layer { 83 | name: "relu2" 84 | type: "ReLU" 85 | bottom: "conv2" 86 | top: "conv2" 87 | } 88 | layer { 89 | name: "norm2" 
90 | type: "LRN" 91 | bottom: "conv2" 92 | top: "norm2" 93 | lrn_param { 94 | local_size: 5 95 | alpha: 0.0005 96 | beta: 0.75 97 | k: 2 98 | } 99 | } 100 | layer { 101 | name: "pool2" 102 | type: "Pooling" 103 | bottom: "norm2" 104 | top: "pool2" 105 | pooling_param { 106 | pool: MAX 107 | kernel_size: 3 108 | stride: 2 109 | } 110 | } 111 | layer { 112 | name: "conv3" 113 | type: "Convolution" 114 | bottom: "pool2" 115 | top: "conv3" 116 | param { 117 | lr_mult: 1 118 | decay_mult: 1 119 | } 120 | param { 121 | lr_mult: 2 122 | decay_mult: 0 123 | } 124 | convolution_param { 125 | num_output: 512 126 | pad: 1 127 | kernel_size: 3 128 | } 129 | } 130 | layer { 131 | name: "relu3" 132 | type: "ReLU" 133 | bottom: "conv3" 134 | top: "conv3" 135 | } 136 | layer { 137 | name: "conv4" 138 | type: "Convolution" 139 | bottom: "conv3" 140 | top: "conv4" 141 | param { 142 | lr_mult: 1 143 | decay_mult: 1 144 | } 145 | param { 146 | lr_mult: 2 147 | decay_mult: 0 148 | } 149 | convolution_param { 150 | num_output: 512 151 | pad: 1 152 | kernel_size: 3 153 | } 154 | } 155 | layer { 156 | name: "relu4" 157 | type: "ReLU" 158 | bottom: "conv4" 159 | top: "conv4" 160 | } 161 | layer { 162 | name: "conv5" 163 | type: "Convolution" 164 | bottom: "conv4" 165 | top: "conv5" 166 | param { 167 | lr_mult: 1 168 | decay_mult: 1 169 | } 170 | param { 171 | lr_mult: 2 172 | decay_mult: 0 173 | } 174 | convolution_param { 175 | num_output: 512 176 | pad: 1 177 | kernel_size: 3 178 | } 179 | } 180 | layer { 181 | name: "relu5" 182 | type: "ReLU" 183 | bottom: "conv5" 184 | top: "conv5" 185 | } 186 | layer { 187 | name: "roi_pool5" 188 | type: "ROIPooling" 189 | bottom: "conv5" 190 | bottom: "rois" 191 | top: "pool5" 192 | roi_pooling_param { 193 | pooled_w: 6 194 | pooled_h: 6 195 | spatial_scale: 0.0625 # 1/16 196 | } 197 | } 198 | layer { 199 | name: "fc6" 200 | type: "InnerProduct" 201 | bottom: "pool5" 202 | top: "fc6" 203 | param { 204 | lr_mult: 1 205 | decay_mult: 1 206 | } 207 | 
param { 208 | lr_mult: 2 209 | decay_mult: 0 210 | } 211 | inner_product_param { 212 | num_output: 4096 213 | } 214 | } 215 | layer { 216 | name: "relu6" 217 | type: "ReLU" 218 | bottom: "fc6" 219 | top: "fc6" 220 | } 221 | layer { 222 | name: "drop6" 223 | type: "Dropout" 224 | bottom: "fc6" 225 | top: "fc6" 226 | dropout_param { 227 | dropout_ratio: 0.5 228 | } 229 | } 230 | layer { 231 | name: "fc7" 232 | type: "InnerProduct" 233 | bottom: "fc6" 234 | top: "fc7" 235 | param { 236 | lr_mult: 1 237 | decay_mult: 1 238 | } 239 | param { 240 | lr_mult: 2 241 | decay_mult: 0 242 | } 243 | inner_product_param { 244 | num_output: 1024 245 | } 246 | } 247 | layer { 248 | name: "relu7" 249 | type: "ReLU" 250 | bottom: "fc7" 251 | top: "fc7" 252 | } 253 | layer { 254 | name: "drop7" 255 | type: "Dropout" 256 | bottom: "fc7" 257 | top: "fc7" 258 | dropout_param { 259 | dropout_ratio: 0.5 260 | } 261 | } 262 | layer { 263 | name: "cls_score" 264 | type: "InnerProduct" 265 | bottom: "fc7" 266 | top: "cls_score" 267 | param { 268 | lr_mult: 1 269 | decay_mult: 1 270 | } 271 | param { 272 | lr_mult: 2 273 | decay_mult: 0 274 | } 275 | inner_product_param { 276 | num_output: 21 277 | weight_filler { 278 | type: "gaussian" 279 | std: 0.01 280 | } 281 | bias_filler { 282 | type: "constant" 283 | value: 0 284 | } 285 | } 286 | } 287 | layer { 288 | name: "bbox_pred" 289 | type: "InnerProduct" 290 | bottom: "fc7" 291 | top: "bbox_pred" 292 | param { 293 | lr_mult: 1 294 | decay_mult: 1 295 | } 296 | param { 297 | lr_mult: 2 298 | decay_mult: 0 299 | } 300 | inner_product_param { 301 | num_output: 84 302 | weight_filler { 303 | type: "gaussian" 304 | std: 0.001 305 | } 306 | bias_filler { 307 | type: "constant" 308 | value: 0 309 | } 310 | } 311 | } 312 | layer { 313 | name: "cls_prob" 314 | type: "Softmax" 315 | bottom: "cls_score" 316 | top: "cls_prob" 317 | } 318 | -------------------------------------------------------------------------------- 
/models/pascal_voc/VGG_CNN_M_1024/fast_rcnn/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | layer { 3 | name: 'data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'rois' 7 | top: 'labels' 8 | top: 'bbox_targets' 9 | top: 'bbox_inside_weights' 10 | top: 'bbox_outside_weights' 11 | python_param { 12 | module: 'roi_data_layer.layer' 13 | layer: 'RoIDataLayer' 14 | param_str: "'num_classes': 21" 15 | } 16 | } 17 | layer { 18 | name: "conv1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1" 22 | param { lr_mult: 0 decay_mult: 0 } 23 | param { lr_mult: 0 decay_mult: 0 } 24 | convolution_param { 25 | num_output: 96 26 | kernel_size: 7 27 | stride: 2 28 | } 29 | } 30 | layer { 31 | name: "relu1" 32 | type: "ReLU" 33 | bottom: "conv1" 34 | top: "conv1" 35 | } 36 | layer { 37 | name: "norm1" 38 | type: "LRN" 39 | bottom: "conv1" 40 | top: "norm1" 41 | lrn_param { 42 | local_size: 5 43 | alpha: 0.0005 44 | beta: 0.75 45 | k: 2 46 | } 47 | } 48 | layer { 49 | name: "pool1" 50 | type: "Pooling" 51 | bottom: "norm1" 52 | top: "pool1" 53 | pooling_param { 54 | pool: MAX 55 | kernel_size: 3 56 | stride: 2 57 | } 58 | } 59 | layer { 60 | name: "conv2" 61 | type: "Convolution" 62 | bottom: "pool1" 63 | top: "conv2" 64 | param { 65 | lr_mult: 1 66 | } 67 | param { 68 | lr_mult: 2 69 | } 70 | convolution_param { 71 | num_output: 256 72 | pad: 1 73 | kernel_size: 5 74 | stride: 2 75 | } 76 | } 77 | layer { 78 | name: "relu2" 79 | type: "ReLU" 80 | bottom: "conv2" 81 | top: "conv2" 82 | } 83 | layer { 84 | name: "norm2" 85 | type: "LRN" 86 | bottom: "conv2" 87 | top: "norm2" 88 | lrn_param { 89 | local_size: 5 90 | alpha: 0.0005 91 | beta: 0.75 92 | k: 2 93 | } 94 | } 95 | layer { 96 | name: "pool2" 97 | type: "Pooling" 98 | bottom: "norm2" 99 | top: "pool2" 100 | pooling_param { 101 | pool: MAX 102 | kernel_size: 3 103 | stride: 2 104 | } 105 | } 106 | layer { 107 | name: "conv3" 108 | type: 
"Convolution" 109 | bottom: "pool2" 110 | top: "conv3" 111 | param { 112 | lr_mult: 1 113 | } 114 | param { 115 | lr_mult: 2 116 | } 117 | convolution_param { 118 | num_output: 512 119 | pad: 1 120 | kernel_size: 3 121 | } 122 | } 123 | layer { 124 | name: "relu3" 125 | type: "ReLU" 126 | bottom: "conv3" 127 | top: "conv3" 128 | } 129 | layer { 130 | name: "conv4" 131 | type: "Convolution" 132 | bottom: "conv3" 133 | top: "conv4" 134 | param { 135 | lr_mult: 1 136 | } 137 | param { 138 | lr_mult: 2 139 | } 140 | convolution_param { 141 | num_output: 512 142 | pad: 1 143 | kernel_size: 3 144 | } 145 | } 146 | layer { 147 | name: "relu4" 148 | type: "ReLU" 149 | bottom: "conv4" 150 | top: "conv4" 151 | } 152 | layer { 153 | name: "conv5" 154 | type: "Convolution" 155 | bottom: "conv4" 156 | top: "conv5" 157 | param { 158 | lr_mult: 1 159 | } 160 | param { 161 | lr_mult: 2 162 | } 163 | convolution_param { 164 | num_output: 512 165 | pad: 1 166 | kernel_size: 3 167 | } 168 | } 169 | layer { 170 | name: "relu5" 171 | type: "ReLU" 172 | bottom: "conv5" 173 | top: "conv5" 174 | } 175 | layer { 176 | name: "roi_pool5" 177 | type: "ROIPooling" 178 | bottom: "conv5" 179 | bottom: "rois" 180 | top: "pool5" 181 | roi_pooling_param { 182 | pooled_w: 6 183 | pooled_h: 6 184 | spatial_scale: 0.0625 # 1/16 185 | } 186 | } 187 | layer { 188 | name: "fc6" 189 | type: "InnerProduct" 190 | bottom: "pool5" 191 | top: "fc6" 192 | param { 193 | lr_mult: 1 194 | } 195 | param { 196 | lr_mult: 2 197 | } 198 | inner_product_param { 199 | num_output: 4096 200 | } 201 | } 202 | layer { 203 | name: "relu6" 204 | type: "ReLU" 205 | bottom: "fc6" 206 | top: "fc6" 207 | } 208 | layer { 209 | name: "drop6" 210 | type: "Dropout" 211 | bottom: "fc6" 212 | top: "fc6" 213 | dropout_param { 214 | dropout_ratio: 0.5 215 | } 216 | } 217 | layer { 218 | name: "fc7" 219 | type: "InnerProduct" 220 | bottom: "fc6" 221 | top: "fc7" 222 | param { 223 | lr_mult: 1 224 | } 225 | param { 226 | lr_mult: 2 227 | } 
228 | inner_product_param { 229 | num_output: 1024 230 | } 231 | } 232 | layer { 233 | name: "relu7" 234 | type: "ReLU" 235 | bottom: "fc7" 236 | top: "fc7" 237 | } 238 | layer { 239 | name: "drop7" 240 | type: "Dropout" 241 | bottom: "fc7" 242 | top: "fc7" 243 | dropout_param { 244 | dropout_ratio: 0.5 245 | } 246 | } 247 | layer { 248 | name: "cls_score" 249 | type: "InnerProduct" 250 | bottom: "fc7" 251 | top: "cls_score" 252 | param { 253 | lr_mult: 1 254 | } 255 | param { 256 | lr_mult: 2 257 | } 258 | inner_product_param { 259 | num_output: 21 260 | weight_filler { 261 | type: "gaussian" 262 | std: 0.01 263 | } 264 | bias_filler { 265 | type: "constant" 266 | value: 0 267 | } 268 | } 269 | } 270 | layer { 271 | name: "bbox_pred" 272 | type: "InnerProduct" 273 | bottom: "fc7" 274 | top: "bbox_pred" 275 | param { 276 | lr_mult: 1 277 | } 278 | param { 279 | lr_mult: 2 280 | } 281 | inner_product_param { 282 | num_output: 84 283 | weight_filler { 284 | type: "gaussian" 285 | std: 0.001 286 | } 287 | bias_filler { 288 | type: "constant" 289 | value: 0 290 | } 291 | } 292 | } 293 | layer { 294 | name: "loss_cls" 295 | type: "SoftmaxWithLoss" 296 | bottom: "cls_score" 297 | bottom: "labels" 298 | top: "loss_cls" 299 | loss_weight: 1 300 | } 301 | layer { 302 | name: "loss_bbox" 303 | type: "SmoothL1Loss" 304 | bottom: "bbox_pred" 305 | bottom: "bbox_targets" 306 | bottom: "bbox_inside_weights" 307 | bottom: "bbox_outside_weights" 308 | top: "loss_bbox" 309 | loss_weight: 1 310 | } 311 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/faster_rcnn_test.pt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 224 7 | dim: 224 8 | } 9 | input: "im_info" 10 | input_shape { 11 | dim: 1 12 | dim: 3 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | 
bottom: "data" 18 | top: "conv1" 19 | convolution_param { 20 | num_output: 96 21 | kernel_size: 7 22 | stride: 2 23 | } 24 | } 25 | layer { 26 | name: "relu1" 27 | type: "ReLU" 28 | bottom: "conv1" 29 | top: "conv1" 30 | } 31 | layer { 32 | name: "norm1" 33 | type: "LRN" 34 | bottom: "conv1" 35 | top: "norm1" 36 | lrn_param { 37 | local_size: 5 38 | alpha: 0.0005 39 | beta: 0.75 40 | k: 2 41 | } 42 | } 43 | layer { 44 | name: "pool1" 45 | type: "Pooling" 46 | bottom: "norm1" 47 | top: "pool1" 48 | pooling_param { 49 | pool: MAX 50 | kernel_size: 3 51 | stride: 2 52 | } 53 | } 54 | layer { 55 | name: "conv2" 56 | type: "Convolution" 57 | bottom: "pool1" 58 | top: "conv2" 59 | convolution_param { 60 | num_output: 256 61 | pad: 1 62 | kernel_size: 5 63 | stride: 2 64 | } 65 | } 66 | layer { 67 | name: "relu2" 68 | type: "ReLU" 69 | bottom: "conv2" 70 | top: "conv2" 71 | } 72 | layer { 73 | name: "norm2" 74 | type: "LRN" 75 | bottom: "conv2" 76 | top: "norm2" 77 | lrn_param { 78 | local_size: 5 79 | alpha: 0.0005 80 | beta: 0.75 81 | k: 2 82 | } 83 | } 84 | layer { 85 | name: "pool2" 86 | type: "Pooling" 87 | bottom: "norm2" 88 | top: "pool2" 89 | pooling_param { 90 | pool: MAX 91 | kernel_size: 3 92 | stride: 2 93 | } 94 | } 95 | layer { 96 | name: "conv3" 97 | type: "Convolution" 98 | bottom: "pool2" 99 | top: "conv3" 100 | convolution_param { 101 | num_output: 512 102 | pad: 1 103 | kernel_size: 3 104 | } 105 | } 106 | layer { 107 | name: "relu3" 108 | type: "ReLU" 109 | bottom: "conv3" 110 | top: "conv3" 111 | } 112 | layer { 113 | name: "conv4" 114 | type: "Convolution" 115 | bottom: "conv3" 116 | top: "conv4" 117 | convolution_param { 118 | num_output: 512 119 | pad: 1 120 | kernel_size: 3 121 | } 122 | } 123 | layer { 124 | name: "relu4" 125 | type: "ReLU" 126 | bottom: "conv4" 127 | top: "conv4" 128 | } 129 | layer { 130 | name: "conv5" 131 | type: "Convolution" 132 | bottom: "conv4" 133 | top: "conv5" 134 | convolution_param { 135 | num_output: 512 136 | pad: 
1 137 | kernel_size: 3 138 | } 139 | } 140 | layer { 141 | name: "relu5" 142 | type: "ReLU" 143 | bottom: "conv5" 144 | top: "conv5" 145 | } 146 | 147 | #========= RPN ============ 148 | 149 | layer { 150 | name: "rpn_conv/3x3" 151 | type: "Convolution" 152 | bottom: "conv5" 153 | top: "rpn/output" 154 | convolution_param { 155 | num_output: 256 156 | kernel_size: 3 pad: 1 stride: 1 157 | } 158 | } 159 | layer { 160 | name: "rpn_relu/3x3" 161 | type: "ReLU" 162 | bottom: "rpn/output" 163 | top: "rpn/output" 164 | } 165 | layer { 166 | name: "rpn_cls_score" 167 | type: "Convolution" 168 | bottom: "rpn/output" 169 | top: "rpn_cls_score" 170 | convolution_param { 171 | num_output: 18 # 2(bg/fg) * 9(anchors) 172 | kernel_size: 1 pad: 0 stride: 1 173 | } 174 | } 175 | layer { 176 | name: "rpn_bbox_pred" 177 | type: "Convolution" 178 | bottom: "rpn/output" 179 | top: "rpn_bbox_pred" 180 | convolution_param { 181 | num_output: 36 # 4 * 9(anchors) 182 | kernel_size: 1 pad: 0 stride: 1 183 | } 184 | } 185 | layer { 186 | bottom: "rpn_cls_score" 187 | top: "rpn_cls_score_reshape" 188 | name: "rpn_cls_score_reshape" 189 | type: "Reshape" 190 | reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } 191 | } 192 | 193 | #========= RoI Proposal ============ 194 | 195 | layer { 196 | name: "rpn_cls_prob" 197 | type: "Softmax" 198 | bottom: "rpn_cls_score_reshape" 199 | top: "rpn_cls_prob" 200 | } 201 | layer { 202 | name: 'rpn_cls_prob_reshape' 203 | type: 'Reshape' 204 | bottom: 'rpn_cls_prob' 205 | top: 'rpn_cls_prob_reshape' 206 | reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } 207 | } 208 | layer { 209 | name: 'proposal' 210 | type: 'Python' 211 | bottom: 'rpn_cls_prob_reshape' 212 | bottom: 'rpn_bbox_pred' 213 | bottom: 'im_info' 214 | top: 'rois' 215 | python_param { 216 | module: 'rpn.proposal_layer' 217 | layer: 'ProposalLayer' 218 | param_str: "'feat_stride': 16" 219 | } 220 | } 221 | 222 | #========= RCNN ============ 223 | 224 | layer { 225 | name: 
"roi_pool5" 226 | type: "ROIPooling" 227 | bottom: "conv5" 228 | bottom: "rois" 229 | top: "pool5" 230 | roi_pooling_param { 231 | pooled_w: 6 232 | pooled_h: 6 233 | spatial_scale: 0.0625 # 1/16 234 | } 235 | } 236 | layer { 237 | name: "fc6" 238 | type: "InnerProduct" 239 | bottom: "pool5" 240 | top: "fc6" 241 | inner_product_param { 242 | num_output: 4096 243 | } 244 | } 245 | layer { 246 | name: "relu6" 247 | type: "ReLU" 248 | bottom: "fc6" 249 | top: "fc6" 250 | } 251 | layer { 252 | name: "fc7" 253 | type: "InnerProduct" 254 | bottom: "fc6" 255 | top: "fc7" 256 | inner_product_param { 257 | num_output: 1024 258 | } 259 | } 260 | layer { 261 | name: "relu7" 262 | type: "ReLU" 263 | bottom: "fc7" 264 | top: "fc7" 265 | } 266 | layer { 267 | name: "cls_score" 268 | type: "InnerProduct" 269 | bottom: "fc7" 270 | top: "cls_score" 271 | inner_product_param { 272 | num_output: 21 273 | } 274 | } 275 | layer { 276 | name: "bbox_pred" 277 | type: "InnerProduct" 278 | bottom: "fc7" 279 | top: "bbox_pred" 280 | inner_product_param { 281 | num_output: 84 282 | } 283 | } 284 | layer { 285 | name: "cls_prob" 286 | type: "Softmax" 287 | bottom: "cls_score" 288 | top: "cls_prob" 289 | } 290 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/rpn_test.pt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 224 7 | dim: 224 8 | } 9 | input: "im_info" 10 | input_shape { 11 | dim: 1 12 | dim: 3 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | convolution_param { 20 | num_output: 96 21 | kernel_size: 7 22 | stride: 2 23 | } 24 | } 25 | layer { 26 | name: "relu1" 27 | type: "ReLU" 28 | bottom: "conv1" 29 | top: "conv1" 30 | } 31 | layer { 32 | name: "norm1" 33 | type: "LRN" 34 | bottom: "conv1" 35 | top: "norm1" 36 | 
lrn_param { 37 | local_size: 5 38 | alpha: 0.0005 39 | beta: 0.75 40 | k: 2 41 | } 42 | } 43 | layer { 44 | name: "pool1" 45 | type: "Pooling" 46 | bottom: "norm1" 47 | top: "pool1" 48 | pooling_param { 49 | pool: MAX 50 | kernel_size: 3 51 | stride: 2 52 | } 53 | } 54 | layer { 55 | name: "conv2" 56 | type: "Convolution" 57 | bottom: "pool1" 58 | top: "conv2" 59 | convolution_param { 60 | num_output: 256 61 | pad: 1 62 | kernel_size: 5 63 | stride: 2 64 | } 65 | } 66 | layer { 67 | name: "relu2" 68 | type: "ReLU" 69 | bottom: "conv2" 70 | top: "conv2" 71 | } 72 | layer { 73 | name: "norm2" 74 | type: "LRN" 75 | bottom: "conv2" 76 | top: "norm2" 77 | lrn_param { 78 | local_size: 5 79 | alpha: 0.0005 80 | beta: 0.75 81 | k: 2 82 | } 83 | } 84 | layer { 85 | name: "pool2" 86 | type: "Pooling" 87 | bottom: "norm2" 88 | top: "pool2" 89 | pooling_param { 90 | pool: MAX 91 | kernel_size: 3 92 | stride: 2 93 | } 94 | } 95 | layer { 96 | name: "conv3" 97 | type: "Convolution" 98 | bottom: "pool2" 99 | top: "conv3" 100 | convolution_param { 101 | num_output: 512 102 | pad: 1 103 | kernel_size: 3 104 | } 105 | } 106 | layer { 107 | name: "relu3" 108 | type: "ReLU" 109 | bottom: "conv3" 110 | top: "conv3" 111 | } 112 | layer { 113 | name: "conv4" 114 | type: "Convolution" 115 | bottom: "conv3" 116 | top: "conv4" 117 | convolution_param { 118 | num_output: 512 119 | pad: 1 120 | kernel_size: 3 121 | } 122 | } 123 | layer { 124 | name: "relu4" 125 | type: "ReLU" 126 | bottom: "conv4" 127 | top: "conv4" 128 | } 129 | layer { 130 | name: "conv5" 131 | type: "Convolution" 132 | bottom: "conv4" 133 | top: "conv5" 134 | convolution_param { 135 | num_output: 512 136 | pad: 1 137 | kernel_size: 3 138 | } 139 | } 140 | layer { 141 | name: "relu5" 142 | type: "ReLU" 143 | bottom: "conv5" 144 | top: "conv5" 145 | } 146 | 147 | #========= RPN ============ 148 | 149 | layer { 150 | name: "rpn_conv/3x3" 151 | type: "Convolution" 152 | bottom: "conv5" 153 | top: "rpn/output" 154 | 
convolution_param { 155 | num_output: 256 156 | kernel_size: 3 pad: 1 stride: 1 157 | } 158 | } 159 | layer { 160 | name: "rpn_relu/3x3" 161 | type: "ReLU" 162 | bottom: "rpn/output" 163 | top: "rpn/output" 164 | } 165 | layer { 166 | name: "rpn_cls_score" 167 | type: "Convolution" 168 | bottom: "rpn/output" 169 | top: "rpn_cls_score" 170 | convolution_param { 171 | num_output: 18 # 2(bg/fg) * 9(anchors) 172 | kernel_size: 1 pad: 0 stride: 1 173 | } 174 | } 175 | layer { 176 | name: "rpn_bbox_pred" 177 | type: "Convolution" 178 | bottom: "rpn/output" 179 | top: "rpn_bbox_pred" 180 | convolution_param { 181 | num_output: 36 # 4 * 9(anchors) 182 | kernel_size: 1 pad: 0 stride: 1 183 | } 184 | } 185 | layer { 186 | bottom: "rpn_cls_score" 187 | top: "rpn_cls_score_reshape" 188 | name: "rpn_cls_score_reshape" 189 | type: "Reshape" 190 | reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } 191 | } 192 | 193 | #========= RoI Proposal ============ 194 | 195 | layer { 196 | name: "rpn_cls_prob" 197 | type: "Softmax" 198 | bottom: "rpn_cls_score_reshape" 199 | top: "rpn_cls_prob" 200 | } 201 | layer { 202 | name: 'rpn_cls_prob_reshape' 203 | type: 'Reshape' 204 | bottom: 'rpn_cls_prob' 205 | top: 'rpn_cls_prob_reshape' 206 | reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } 207 | } 208 | layer { 209 | name: 'proposal' 210 | type: 'Python' 211 | bottom: 'rpn_cls_prob_reshape' 212 | bottom: 'rpn_bbox_pred' 213 | bottom: 'im_info' 214 | top: 'rois' 215 | top: 'scores' 216 | python_param { 217 | module: 'rpn.proposal_layer' 218 | layer: 'ProposalLayer' 219 | param_str: "'feat_stride': 16" 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage1_fast_rcnn_solver30k40k.pt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage1_fast_rcnn_train.pt" 2 | 3 | base_lr: 
0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 30000 7 | display: 20 8 | average_loss: 100 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | 12 | # We disable standard caffe solver snapshotting and implement our own snapshot 13 | # function 14 | snapshot: 0 15 | # We still use the snapshot prefix, though 16 | snapshot_prefix: "vgg_cnn_m_1024_fast_rcnn" 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage1_rpn_solver60k80k.pt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage1_rpn_train.pt" 2 | 3 | base_lr: 0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 60000 7 | display: 20 8 | average_loss: 100 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | 12 | # We disable standard caffe solver snapshotting and implement our own snapshot 13 | # function 14 | snapshot: 0 15 | # We still use the snapshot prefix, though 16 | snapshot_prefix: "vgg_cnn_m_1024_rpn" 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage1_rpn_train.pt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | layer { 3 | name: 'input-data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'im_info' 7 | top: 'gt_boxes' 8 | python_param { 9 | module: 'roi_data_layer.layer' 10 | layer: 'RoIDataLayer' 11 | param_str: "'num_classes': 21" 12 | } 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | param { lr_mult: 0 decay_mult: 0 } 20 | param { lr_mult: 0 decay_mult: 0 } 21 | convolution_param { 22 | num_output: 96 23 | kernel_size: 7 stride: 2 24 | } 25 | } 26 | layer { 27 | name: "relu1" 28 | type: "ReLU" 29 | bottom: "conv1" 30 | top: "conv1" 31 | } 32 | layer { 33 | name: "norm1" 34 | type: "LRN" 35 
| bottom: "conv1" 36 | top: "norm1" 37 | lrn_param { 38 | local_size: 5 39 | alpha: 0.0005 40 | beta: 0.75 41 | k: 2 42 | } 43 | } 44 | layer { 45 | name: "pool1" 46 | type: "Pooling" 47 | bottom: "norm1" 48 | top: "pool1" 49 | pooling_param { 50 | pool: MAX 51 | kernel_size: 3 stride: 2 52 | } 53 | } 54 | layer { 55 | name: "conv2" 56 | type: "Convolution" 57 | bottom: "pool1" 58 | top: "conv2" 59 | param { lr_mult: 1 } 60 | param { lr_mult: 2 } 61 | convolution_param { 62 | num_output: 256 63 | pad: 1 kernel_size: 5 stride: 2 64 | } 65 | } 66 | layer { 67 | name: "relu2" 68 | type: "ReLU" 69 | bottom: "conv2" 70 | top: "conv2" 71 | } 72 | layer { 73 | name: "norm2" 74 | type: "LRN" 75 | bottom: "conv2" 76 | top: "norm2" 77 | lrn_param { 78 | local_size: 5 79 | alpha: 0.0005 80 | beta: 0.75 81 | k: 2 82 | } 83 | } 84 | layer { 85 | name: "pool2" 86 | type: "Pooling" 87 | bottom: "norm2" 88 | top: "pool2" 89 | pooling_param { 90 | pool: MAX 91 | kernel_size: 3 stride: 2 92 | } 93 | } 94 | layer { 95 | name: "conv3" 96 | type: "Convolution" 97 | bottom: "pool2" 98 | top: "conv3" 99 | param { lr_mult: 1 } 100 | param { lr_mult: 2 } 101 | convolution_param { 102 | num_output: 512 103 | pad: 1 kernel_size: 3 104 | } 105 | } 106 | layer { 107 | name: "relu3" 108 | type: "ReLU" 109 | bottom: "conv3" 110 | top: "conv3" 111 | } 112 | layer { 113 | name: "conv4" 114 | type: "Convolution" 115 | bottom: "conv3" 116 | top: "conv4" 117 | param { lr_mult: 1 } 118 | param { lr_mult: 2 } 119 | convolution_param { 120 | num_output: 512 121 | pad: 1 kernel_size: 3 122 | } 123 | } 124 | layer { 125 | name: "relu4" 126 | type: "ReLU" 127 | bottom: "conv4" 128 | top: "conv4" 129 | } 130 | layer { 131 | name: "conv5" 132 | type: "Convolution" 133 | bottom: "conv4" 134 | top: "conv5" 135 | param { lr_mult: 1 } 136 | param { lr_mult: 2 } 137 | convolution_param { 138 | num_output: 512 139 | pad: 1 kernel_size: 3 140 | } 141 | } 142 | layer { 143 | name: "relu5" 144 | type: "ReLU" 145 | 
bottom: "conv5" 146 | top: "conv5" 147 | } 148 | 149 | #========= RPN ============ 150 | 151 | layer { 152 | name: "rpn_conv/3x3" 153 | type: "Convolution" 154 | bottom: "conv5" 155 | top: "rpn/output" 156 | param { lr_mult: 1.0 } 157 | param { lr_mult: 2.0 } 158 | convolution_param { 159 | num_output: 256 160 | kernel_size: 3 pad: 1 stride: 1 161 | weight_filler { type: "gaussian" std: 0.01 } 162 | bias_filler { type: "constant" value: 0 } 163 | } 164 | } 165 | layer { 166 | name: "rpn_relu/3x3" 167 | type: "ReLU" 168 | bottom: "rpn/output" 169 | top: "rpn/output" 170 | } 171 | layer { 172 | name: "rpn_cls_score" 173 | type: "Convolution" 174 | bottom: "rpn/output" 175 | top: "rpn_cls_score" 176 | param { lr_mult: 1.0 } 177 | param { lr_mult: 2.0 } 178 | convolution_param { 179 | num_output: 18 # 2(bg/fg) * 9(anchors) 180 | kernel_size: 1 pad: 0 stride: 1 181 | weight_filler { type: "gaussian" std: 0.01 } 182 | bias_filler { type: "constant" value: 0 } 183 | } 184 | } 185 | layer { 186 | name: "rpn_bbox_pred" 187 | type: "Convolution" 188 | bottom: "rpn/output" 189 | top: "rpn_bbox_pred" 190 | param { lr_mult: 1.0 } 191 | param { lr_mult: 2.0 } 192 | convolution_param { 193 | num_output: 36 # 4 * 9(anchors) 194 | kernel_size: 1 pad: 0 stride: 1 195 | weight_filler { type: "gaussian" std: 0.01 } 196 | bias_filler { type: "constant" value: 0 } 197 | } 198 | } 199 | layer { 200 | bottom: "rpn_cls_score" 201 | top: "rpn_cls_score_reshape" 202 | name: "rpn_cls_score_reshape" 203 | type: "Reshape" 204 | reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } 205 | } 206 | layer { 207 | name: 'rpn-data' 208 | type: 'Python' 209 | bottom: 'rpn_cls_score' 210 | bottom: 'gt_boxes' 211 | bottom: 'im_info' 212 | bottom: 'data' 213 | top: 'rpn_labels' 214 | top: 'rpn_bbox_targets' 215 | top: 'rpn_bbox_inside_weights' 216 | top: 'rpn_bbox_outside_weights' 217 | python_param { 218 | module: 'rpn.anchor_target_layer' 219 | layer: 'AnchorTargetLayer' 220 | param_str: 
"'feat_stride': 16" 221 | } 222 | } 223 | layer { 224 | name: "rpn_loss_cls" 225 | type: "SoftmaxWithLoss" 226 | bottom: "rpn_cls_score_reshape" 227 | bottom: "rpn_labels" 228 | propagate_down: 1 229 | propagate_down: 0 230 | top: "rpn_cls_loss" 231 | loss_weight: 1 232 | loss_param { 233 | ignore_label: -1 234 | normalize: true 235 | } 236 | } 237 | layer { 238 | name: "rpn_loss_bbox" 239 | type: "SmoothL1Loss" 240 | bottom: "rpn_bbox_pred" 241 | bottom: "rpn_bbox_targets" 242 | bottom: 'rpn_bbox_inside_weights' 243 | bottom: 'rpn_bbox_outside_weights' 244 | top: "rpn_loss_bbox" 245 | loss_weight: 1 246 | smooth_l1_loss_param { sigma: 3.0 } 247 | } 248 | 249 | #========= RCNN ============ 250 | 251 | layer { 252 | name: "dummy_roi_pool_conv5" 253 | type: "DummyData" 254 | top: "dummy_roi_pool_conv5" 255 | dummy_data_param { 256 | shape { dim: 1 dim: 18432 } 257 | data_filler { type: "gaussian" std: 0.01 } 258 | } 259 | } 260 | layer { 261 | name: "fc6" 262 | type: "InnerProduct" 263 | bottom: "dummy_roi_pool_conv5" 264 | top: "fc6" 265 | param { lr_mult: 0 decay_mult: 0 } 266 | param { lr_mult: 0 decay_mult: 0 } 267 | inner_product_param { 268 | num_output: 4096 269 | } 270 | } 271 | layer { 272 | name: "fc7" 273 | type: "InnerProduct" 274 | bottom: "fc6" 275 | top: "fc7" 276 | param { lr_mult: 0 decay_mult: 0 } 277 | param { lr_mult: 0 decay_mult: 0 } 278 | inner_product_param { 279 | num_output: 1024 280 | } 281 | } 282 | layer { 283 | name: "silence_fc7" 284 | type: "Silence" 285 | bottom: "fc7" 286 | } 287 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage2_fast_rcnn_solver30k40k.pt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage2_fast_rcnn_train.pt" 2 | 3 | base_lr: 0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 30000 7 | display: 20 8 | average_loss: 
100 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | 12 | # We disable standard caffe solver snapshotting and implement our own snapshot 13 | # function 14 | snapshot: 0 15 | # We still use the snapshot prefix, though 16 | snapshot_prefix: "vgg_cnn_m_1024_fast_rcnn" 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage2_rpn_solver60k80k.pt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage2_rpn_train.pt" 2 | 3 | base_lr: 0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 60000 7 | display: 20 8 | average_loss: 100 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | 12 | # We disable standard caffe solver snapshotting and implement our own snapshot 13 | # function 14 | snapshot: 0 15 | # We still use the snapshot prefix, though 16 | snapshot_prefix: "vgg_cnn_m_1024_rpn" 17 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_alt_opt/stage2_rpn_train.pt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | layer { 3 | name: 'input-data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'im_info' 7 | top: 'gt_boxes' 8 | python_param { 9 | module: 'roi_data_layer.layer' 10 | layer: 'RoIDataLayer' 11 | param_str: "'num_classes': 21" 12 | } 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | param { lr_mult: 0 decay_mult: 0 } 20 | param { lr_mult: 0 decay_mult: 0 } 21 | convolution_param { 22 | num_output: 96 23 | kernel_size: 7 stride: 2 24 | } 25 | } 26 | layer { 27 | name: "relu1" 28 | type: "ReLU" 29 | bottom: "conv1" 30 | top: "conv1" 31 | } 32 | layer { 33 | name: "norm1" 34 | type: "LRN" 35 | bottom: "conv1" 36 | top: "norm1" 37 | lrn_param { 38 | local_size: 5 39 | alpha: 0.0005 40 | 
beta: 0.75 41 | k: 2 42 | } 43 | } 44 | layer { 45 | name: "pool1" 46 | type: "Pooling" 47 | bottom: "norm1" 48 | top: "pool1" 49 | pooling_param { 50 | pool: MAX 51 | kernel_size: 3 stride: 2 52 | } 53 | } 54 | layer { 55 | name: "conv2" 56 | type: "Convolution" 57 | bottom: "pool1" 58 | top: "conv2" 59 | param { lr_mult: 0 decay_mult: 0 } 60 | param { lr_mult: 0 decay_mult: 0 } 61 | convolution_param { 62 | num_output: 256 63 | pad: 1 kernel_size: 5 stride: 2 64 | } 65 | } 66 | layer { 67 | name: "relu2" 68 | type: "ReLU" 69 | bottom: "conv2" 70 | top: "conv2" 71 | } 72 | layer { 73 | name: "norm2" 74 | type: "LRN" 75 | bottom: "conv2" 76 | top: "norm2" 77 | lrn_param { 78 | local_size: 5 79 | alpha: 0.0005 80 | beta: 0.75 81 | k: 2 82 | } 83 | } 84 | layer { 85 | name: "pool2" 86 | type: "Pooling" 87 | bottom: "norm2" 88 | top: "pool2" 89 | pooling_param { 90 | pool: MAX 91 | kernel_size: 3 stride: 2 92 | } 93 | } 94 | layer { 95 | name: "conv3" 96 | type: "Convolution" 97 | bottom: "pool2" 98 | top: "conv3" 99 | param { lr_mult: 0 decay_mult: 0 } 100 | param { lr_mult: 0 decay_mult: 0 } 101 | convolution_param { 102 | num_output: 512 103 | pad: 1 kernel_size: 3 104 | } 105 | } 106 | layer { 107 | name: "relu3" 108 | type: "ReLU" 109 | bottom: "conv3" 110 | top: "conv3" 111 | } 112 | layer { 113 | name: "conv4" 114 | type: "Convolution" 115 | bottom: "conv3" 116 | top: "conv4" 117 | param { lr_mult: 0 decay_mult: 0 } 118 | param { lr_mult: 0 decay_mult: 0 } 119 | convolution_param { 120 | num_output: 512 121 | pad: 1 kernel_size: 3 122 | } 123 | } 124 | layer { 125 | name: "relu4" 126 | type: "ReLU" 127 | bottom: "conv4" 128 | top: "conv4" 129 | } 130 | layer { 131 | name: "conv5" 132 | type: "Convolution" 133 | bottom: "conv4" 134 | top: "conv5" 135 | param { lr_mult: 0 decay_mult: 0 } 136 | param { lr_mult: 0 decay_mult: 0 } 137 | convolution_param { 138 | num_output: 512 139 | pad: 1 kernel_size: 3 140 | } 141 | } 142 | layer { 143 | name: "relu5" 144 | type: 
"ReLU" 145 | bottom: "conv5" 146 | top: "conv5" 147 | } 148 | 149 | #========= RPN ============ 150 | 151 | layer { 152 | name: "rpn_conv/3x3" 153 | type: "Convolution" 154 | bottom: "conv5" 155 | top: "rpn/output" 156 | param { lr_mult: 1.0 } 157 | param { lr_mult: 2.0 } 158 | convolution_param { 159 | num_output: 256 160 | kernel_size: 3 pad: 1 stride: 1 161 | weight_filler { type: "gaussian" std: 0.01 } 162 | bias_filler { type: "constant" value: 0 } 163 | } 164 | } 165 | layer { 166 | name: "rpn_relu/3x3" 167 | type: "ReLU" 168 | bottom: "rpn/output" 169 | top: "rpn/output" 170 | } 171 | layer { 172 | name: "rpn_cls_score" 173 | type: "Convolution" 174 | bottom: "rpn/output" 175 | top: "rpn_cls_score" 176 | param { lr_mult: 1.0 } 177 | param { lr_mult: 2.0 } 178 | convolution_param { 179 | num_output: 18 # 2(bg/fg) * 9(anchors) 180 | kernel_size: 1 pad: 0 stride: 1 181 | weight_filler { type: "gaussian" std: 0.01 } 182 | bias_filler { type: "constant" value: 0 } 183 | } 184 | } 185 | layer { 186 | name: "rpn_bbox_pred" 187 | type: "Convolution" 188 | bottom: "rpn/output" 189 | top: "rpn_bbox_pred" 190 | param { lr_mult: 1.0 } 191 | param { lr_mult: 2.0 } 192 | convolution_param { 193 | num_output: 36 # 4 * 9(anchors) 194 | kernel_size: 1 pad: 0 stride: 1 195 | weight_filler { type: "gaussian" std: 0.01 } 196 | bias_filler { type: "constant" value: 0 } 197 | } 198 | } 199 | layer { 200 | bottom: "rpn_cls_score" 201 | top: "rpn_cls_score_reshape" 202 | name: "rpn_cls_score_reshape" 203 | type: "Reshape" 204 | reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } 205 | } 206 | layer { 207 | name: 'rpn-data' 208 | type: 'Python' 209 | bottom: 'rpn_cls_score' 210 | bottom: 'gt_boxes' 211 | bottom: 'im_info' 212 | bottom: 'data' 213 | top: 'rpn_labels' 214 | top: 'rpn_bbox_targets' 215 | top: 'rpn_bbox_inside_weights' 216 | top: 'rpn_bbox_outside_weights' 217 | python_param { 218 | module: 'rpn.anchor_target_layer' 219 | layer: 'AnchorTargetLayer' 220 | param_str: 
"'feat_stride': 16" 221 | } 222 | } 223 | layer { 224 | name: "rpn_loss_cls" 225 | type: "SoftmaxWithLoss" 226 | bottom: "rpn_cls_score_reshape" 227 | bottom: "rpn_labels" 228 | propagate_down: 1 229 | propagate_down: 0 230 | top: "rpn_cls_loss" 231 | loss_weight: 1 232 | loss_param { 233 | ignore_label: -1 234 | normalize: true 235 | } 236 | } 237 | layer { 238 | name: "rpn_loss_bbox" 239 | type: "SmoothL1Loss" 240 | bottom: "rpn_bbox_pred" 241 | bottom: "rpn_bbox_targets" 242 | bottom: 'rpn_bbox_inside_weights' 243 | bottom: 'rpn_bbox_outside_weights' 244 | top: "rpn_loss_bbox" 245 | loss_weight: 1 246 | smooth_l1_loss_param { sigma: 3.0 } 247 | } 248 | 249 | #========= RCNN ============ 250 | 251 | layer { 252 | name: "dummy_roi_pool_conv5" 253 | type: "DummyData" 254 | top: "dummy_roi_pool_conv5" 255 | dummy_data_param { 256 | shape { dim: 1 dim: 18432 } 257 | data_filler { type: "gaussian" std: 0.01 } 258 | } 259 | } 260 | layer { 261 | name: "fc6" 262 | type: "InnerProduct" 263 | bottom: "dummy_roi_pool_conv5" 264 | top: "fc6" 265 | param { lr_mult: 0 decay_mult: 0 } 266 | param { lr_mult: 0 decay_mult: 0 } 267 | inner_product_param { 268 | num_output: 4096 269 | } 270 | } 271 | layer { 272 | name: "fc7" 273 | type: "InnerProduct" 274 | bottom: "fc6" 275 | top: "fc7" 276 | param { lr_mult: 0 decay_mult: 0 } 277 | param { lr_mult: 0 decay_mult: 0 } 278 | inner_product_param { 279 | num_output: 1024 280 | } 281 | } 282 | layer { 283 | name: "silence_fc7" 284 | type: "Silence" 285 | bottom: "fc7" 286 | } 287 | -------------------------------------------------------------------------------- /models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_end2end/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/VGG_CNN_M_1024/faster_rcnn_end2end/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 50000 6 | display: 20 7 | average_loss: 100 8 | momentum: 0.9 9 | 
weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 11 | # function 12 | snapshot: 0 13 | # We still use the snapshot prefix, though 14 | snapshot_prefix: "vgg_cnn_m_1024_faster_rcnn" 15 | -------------------------------------------------------------------------------- /models/pascal_voc/ZF/fast_rcnn/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/ZF/fast_rcnn/train.prototxt" 2 | 3 | base_lr: 0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 30000 7 | display: 20 8 | average_loss: 100 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | 12 | # We disable standard caffe solver snapshotting and implement our own snapshot 13 | # function 14 | snapshot: 0 15 | # We still use the snapshot prefix, though 16 | snapshot_prefix: "zf_fast_rcnn" 17 | #debug_info: true 18 | #iter_size: 2 19 | -------------------------------------------------------------------------------- /models/pascal_voc/ZF/fast_rcnn/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "ZF" 2 | 3 | input: "data" 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 224 8 | dim: 224 9 | } 10 | 11 | input: "rois" 12 | input_shape { 13 | dim: 1 # to be changed on-the-fly to num ROIs 14 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 15 | } 16 | 17 | #========= conv1-conv5 ============ 18 | 19 | layer { 20 | name: "conv1" 21 | type: "Convolution" 22 | bottom: "data" 23 | top: "conv1" 24 | convolution_param { 25 | num_output: 96 26 | kernel_size: 7 27 | pad: 3 28 | stride: 2 29 | } 30 | } 31 | layer { 32 | name: "relu1" 33 | type: "ReLU" 34 | bottom: "conv1" 35 | top: "conv1" 36 | } 37 | layer { 38 | name: "norm1" 39 | type: "LRN" 40 | bottom: "conv1" 41 | top: "norm1" 42 | lrn_param { 43 | local_size: 3 44 | alpha: 0.00005 45 | beta: 0.75 46 | norm_region: WITHIN_CHANNEL 47 | engine: CAFFE 48 | } 49 | } 50 | layer { 
51 | name: "pool1" 52 | type: "Pooling" 53 | bottom: "norm1" 54 | top: "pool1" 55 | pooling_param { 56 | kernel_size: 3 57 | stride: 2 58 | pad: 1 59 | pool: MAX 60 | } 61 | } 62 | layer { 63 | name: "conv2" 64 | type: "Convolution" 65 | bottom: "pool1" 66 | top: "conv2" 67 | convolution_param { 68 | num_output: 256 69 | kernel_size: 5 70 | pad: 2 71 | stride: 2 72 | } 73 | } 74 | layer { 75 | name: "relu2" 76 | type: "ReLU" 77 | bottom: "conv2" 78 | top: "conv2" 79 | } 80 | layer { 81 | name: "norm2" 82 | type: "LRN" 83 | bottom: "conv2" 84 | top: "norm2" 85 | lrn_param { 86 | local_size: 3 87 | alpha: 0.00005 88 | beta: 0.75 89 | norm_region: WITHIN_CHANNEL 90 | engine: CAFFE 91 | } 92 | } 93 | layer { 94 | name: "pool2" 95 | type: "Pooling" 96 | bottom: "norm2" 97 | top: "pool2" 98 | pooling_param { 99 | kernel_size: 3 100 | stride: 2 101 | pad: 1 102 | pool: MAX 103 | } 104 | } 105 | layer { 106 | name: "conv3" 107 | type: "Convolution" 108 | bottom: "pool2" 109 | top: "conv3" 110 | convolution_param { 111 | num_output: 384 112 | kernel_size: 3 113 | pad: 1 114 | stride: 1 115 | } 116 | } 117 | layer { 118 | name: "relu3" 119 | type: "ReLU" 120 | bottom: "conv3" 121 | top: "conv3" 122 | } 123 | layer { 124 | name: "conv4" 125 | type: "Convolution" 126 | bottom: "conv3" 127 | top: "conv4" 128 | convolution_param { 129 | num_output: 384 130 | kernel_size: 3 131 | pad: 1 132 | stride: 1 133 | } 134 | } 135 | layer { 136 | name: "relu4" 137 | type: "ReLU" 138 | bottom: "conv4" 139 | top: "conv4" 140 | } 141 | layer { 142 | name: "conv5" 143 | type: "Convolution" 144 | bottom: "conv4" 145 | top: "conv5" 146 | convolution_param { 147 | num_output: 256 148 | kernel_size: 3 149 | pad: 1 150 | stride: 1 151 | } 152 | } 153 | layer { 154 | name: "relu5" 155 | type: "ReLU" 156 | bottom: "conv5" 157 | top: "conv5" 158 | } 159 | 160 | #========= RCNN ============ 161 | 162 | layer { 163 | name: "roi_pool_conv5" 164 | type: "ROIPooling" 165 | bottom: "conv5" 166 | bottom: 
"rois" 167 | top: "roi_pool_conv5" 168 | roi_pooling_param { 169 | pooled_w: 6 170 | pooled_h: 6 171 | spatial_scale: 0.0625 # 1/16 172 | } 173 | } 174 | layer { 175 | name: "fc6" 176 | type: "InnerProduct" 177 | bottom: "roi_pool_conv5" 178 | top: "fc6" 179 | inner_product_param { 180 | num_output: 4096 181 | } 182 | } 183 | layer { 184 | name: "relu6" 185 | type: "ReLU" 186 | bottom: "fc6" 187 | top: "fc6" 188 | } 189 | layer { 190 | name: "drop6" 191 | type: "Dropout" 192 | bottom: "fc6" 193 | top: "fc6" 194 | dropout_param { 195 | dropout_ratio: 0.5 196 | scale_train: false 197 | } 198 | } 199 | layer { 200 | name: "fc7" 201 | type: "InnerProduct" 202 | bottom: "fc6" 203 | top: "fc7" 204 | inner_product_param { 205 | num_output: 4096 206 | } 207 | } 208 | layer { 209 | name: "relu7" 210 | type: "ReLU" 211 | bottom: "fc7" 212 | top: "fc7" 213 | } 214 | layer { 215 | name: "drop7" 216 | type: "Dropout" 217 | bottom: "fc7" 218 | top: "fc7" 219 | dropout_param { 220 | dropout_ratio: 0.5 221 | scale_train: false 222 | } 223 | } 224 | layer { 225 | name: "cls_score" 226 | type: "InnerProduct" 227 | bottom: "fc7" 228 | top: "cls_score" 229 | inner_product_param { 230 | num_output: 21 231 | } 232 | } 233 | layer { 234 | name: "bbox_pred" 235 | type: "InnerProduct" 236 | bottom: "fc7" 237 | top: "bbox_pred" 238 | inner_product_param { 239 | num_output: 84 240 | } 241 | } 242 | layer { 243 | name: "cls_prob" 244 | type: "Softmax" 245 | bottom: "cls_score" 246 | top: "cls_prob" 247 | loss_param { 248 | ignore_label: -1 249 | normalize: true 250 | } 251 | } 252 | -------------------------------------------------------------------------------- /models/pascal_voc/ZF/fast_rcnn/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "ZF" 2 | layer { 3 | name: 'data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'rois' 7 | top: 'labels' 8 | top: 'bbox_targets' 9 | top: 'bbox_inside_weights' 10 | top: 'bbox_outside_weights' 11 | 
python_param { 12 | module: 'roi_data_layer.layer' 13 | layer: 'RoIDataLayer' 14 | param_str: "'num_classes': 21" 15 | } 16 | } 17 | 18 | #========= conv1-conv5 ============ 19 | 20 | layer { 21 | name: "conv1" 22 | type: "Convolution" 23 | bottom: "data" 24 | top: "conv1" 25 | param { lr_mult: 1.0 } 26 | param { lr_mult: 2.0 } 27 | convolution_param { 28 | num_output: 96 29 | kernel_size: 7 30 | pad: 3 31 | stride: 2 32 | } 33 | } 34 | layer { 35 | name: "relu1" 36 | type: "ReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "norm1" 42 | type: "LRN" 43 | bottom: "conv1" 44 | top: "norm1" 45 | lrn_param { 46 | local_size: 3 47 | alpha: 0.00005 48 | beta: 0.75 49 | norm_region: WITHIN_CHANNEL 50 | engine: CAFFE 51 | } 52 | } 53 | layer { 54 | name: "pool1" 55 | type: "Pooling" 56 | bottom: "norm1" 57 | top: "pool1" 58 | pooling_param { 59 | kernel_size: 3 60 | stride: 2 61 | pad: 1 62 | pool: MAX 63 | } 64 | } 65 | layer { 66 | name: "conv2" 67 | type: "Convolution" 68 | bottom: "pool1" 69 | top: "conv2" 70 | param { lr_mult: 1.0 } 71 | param { lr_mult: 2.0 } 72 | convolution_param { 73 | num_output: 256 74 | kernel_size: 5 75 | pad: 2 76 | stride: 2 77 | } 78 | } 79 | layer { 80 | name: "relu2" 81 | type: "ReLU" 82 | bottom: "conv2" 83 | top: "conv2" 84 | } 85 | layer { 86 | name: "norm2" 87 | type: "LRN" 88 | bottom: "conv2" 89 | top: "norm2" 90 | lrn_param { 91 | local_size: 3 92 | alpha: 0.00005 93 | beta: 0.75 94 | norm_region: WITHIN_CHANNEL 95 | engine: CAFFE 96 | } 97 | } 98 | layer { 99 | name: "pool2" 100 | type: "Pooling" 101 | bottom: "norm2" 102 | top: "pool2" 103 | pooling_param { 104 | kernel_size: 3 105 | stride: 2 106 | pad: 1 107 | pool: MAX 108 | } 109 | } 110 | layer { 111 | name: "conv3" 112 | type: "Convolution" 113 | bottom: "pool2" 114 | top: "conv3" 115 | param { lr_mult: 1.0 } 116 | param { lr_mult: 2.0 } 117 | convolution_param { 118 | num_output: 384 119 | kernel_size: 3 120 | pad: 1 121 | stride: 1 122 | } 123 | } 
124 | layer { 125 | name: "relu3" 126 | type: "ReLU" 127 | bottom: "conv3" 128 | top: "conv3" 129 | } 130 | layer { 131 | name: "conv4" 132 | type: "Convolution" 133 | bottom: "conv3" 134 | top: "conv4" 135 | param { lr_mult: 1.0 } 136 | param { lr_mult: 2.0 } 137 | convolution_param { 138 | num_output: 384 139 | kernel_size: 3 140 | pad: 1 141 | stride: 1 142 | } 143 | } 144 | layer { 145 | name: "relu4" 146 | type: "ReLU" 147 | bottom: "conv4" 148 | top: "conv4" 149 | } 150 | layer { 151 | name: "conv5" 152 | type: "Convolution" 153 | bottom: "conv4" 154 | top: "conv5" 155 | param { lr_mult: 1.0 } 156 | param { lr_mult: 2.0 } 157 | convolution_param { 158 | num_output: 256 159 | kernel_size: 3 160 | pad: 1 161 | stride: 1 162 | } 163 | } 164 | layer { 165 | name: "relu5" 166 | type: "ReLU" 167 | bottom: "conv5" 168 | top: "conv5" 169 | } 170 | 171 | #========= RCNN ============ 172 | 173 | layer { 174 | name: "roi_pool_conv5" 175 | type: "ROIPooling" 176 | bottom: "conv5" 177 | bottom: "rois" 178 | top: "roi_pool_conv5" 179 | roi_pooling_param { 180 | pooled_w: 6 181 | pooled_h: 6 182 | spatial_scale: 0.0625 # 1/16 183 | } 184 | } 185 | layer { 186 | name: "fc6" 187 | type: "InnerProduct" 188 | bottom: "roi_pool_conv5" 189 | top: "fc6" 190 | param { lr_mult: 1.0 } 191 | param { lr_mult: 2.0 } 192 | inner_product_param { 193 | num_output: 4096 194 | } 195 | } 196 | layer { 197 | name: "relu6" 198 | type: "ReLU" 199 | bottom: "fc6" 200 | top: "fc6" 201 | } 202 | layer { 203 | name: "drop6" 204 | type: "Dropout" 205 | bottom: "fc6" 206 | top: "fc6" 207 | dropout_param { 208 | dropout_ratio: 0.5 209 | scale_train: false 210 | } 211 | } 212 | layer { 213 | name: "fc7" 214 | type: "InnerProduct" 215 | bottom: "fc6" 216 | top: "fc7" 217 | param { lr_mult: 1.0 } 218 | param { lr_mult: 2.0 } 219 | inner_product_param { 220 | num_output: 4096 221 | } 222 | } 223 | layer { 224 | name: "relu7" 225 | type: "ReLU" 226 | bottom: "fc7" 227 | top: "fc7" 228 | } 229 | layer { 230 
| name: "drop7" 231 | type: "Dropout" 232 | bottom: "fc7" 233 | top: "fc7" 234 | dropout_param { 235 | dropout_ratio: 0.5 236 | scale_train: false 237 | } 238 | } 239 | layer { 240 | name: "cls_score" 241 | type: "InnerProduct" 242 | bottom: "fc7" 243 | top: "cls_score" 244 | param { lr_mult: 1.0 } 245 | param { lr_mult: 2.0 } 246 | inner_product_param { 247 | num_output: 21 248 | weight_filler { 249 | type: "gaussian" 250 | std: 0.01 251 | } 252 | bias_filler { 253 | type: "constant" 254 | value: 0 255 | } 256 | } 257 | } 258 | layer { 259 | name: "bbox_pred" 260 | type: "InnerProduct" 261 | bottom: "fc7" 262 | top: "bbox_pred" 263 | param { lr_mult: 1.0 } 264 | param { lr_mult: 2.0 } 265 | inner_product_param { 266 | num_output: 84 267 | weight_filler { 268 | type: "gaussian" 269 | std: 0.001 270 | } 271 | bias_filler { 272 | type: "constant" 273 | value: 0 274 | } 275 | } 276 | } 277 | layer { 278 | name: "loss_cls" 279 | type: "SoftmaxWithLoss" 280 | bottom: "cls_score" 281 | bottom: "labels" 282 | propagate_down: 1 283 | propagate_down: 0 284 | top: "cls_loss" 285 | loss_weight: 1 286 | loss_param { 287 | ignore_label: -1 288 | normalize: true 289 | } 290 | } 291 | layer { 292 | name: "loss_bbox" 293 | type: "SmoothL1Loss" 294 | bottom: "bbox_pred" 295 | bottom: "bbox_targets" 296 | bottom: "bbox_inside_weights" 297 | bottom: "bbox_outside_weights" 298 | top: "bbox_loss" 299 | loss_weight: 1 300 | } 301 | -------------------------------------------------------------------------------- /models/pascal_voc/ZF/faster_rcnn_alt_opt/faster_rcnn_test.pt: -------------------------------------------------------------------------------- 1 | name: "ZF" 2 | 3 | input: "data" 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 224 8 | dim: 224 9 | } 10 | 11 | input: "im_info" 12 | input_shape { 13 | dim: 1 14 | dim: 3 15 | } 16 | 17 | #========= conv1-conv5 ============ 18 | 19 | layer { 20 | name: "conv1" 21 | type: "Convolution" 22 | bottom: "data" 23 | top: "conv1" 24 | 
convolution_param { 25 | num_output: 96 26 | kernel_size: 7 27 | pad: 3 28 | stride: 2 29 | } 30 | } 31 | layer { 32 | name: "relu1" 33 | type: "ReLU" 34 | bottom: "conv1" 35 | top: "conv1" 36 | } 37 | layer { 38 | name: "norm1" 39 | type: "LRN" 40 | bottom: "conv1" 41 | top: "norm1" 42 | lrn_param { 43 | local_size: 3 44 | alpha: 0.00005 45 | beta: 0.75 46 | norm_region: WITHIN_CHANNEL 47 | engine: CAFFE 48 | } 49 | } 50 | layer { 51 | name: "pool1" 52 | type: "Pooling" 53 | bottom: "norm1" 54 | top: "pool1" 55 | pooling_param { 56 | kernel_size: 3 57 | stride: 2 58 | pad: 1 59 | pool: MAX 60 | } 61 | } 62 | layer { 63 | name: "conv2" 64 | type: "Convolution" 65 | bottom: "pool1" 66 | top: "conv2" 67 | convolution_param { 68 | num_output: 256 69 | kernel_size: 5 70 | pad: 2 71 | stride: 2 72 | } 73 | } 74 | layer { 75 | name: "relu2" 76 | type: "ReLU" 77 | bottom: "conv2" 78 | top: "conv2" 79 | } 80 | layer { 81 | name: "norm2" 82 | type: "LRN" 83 | bottom: "conv2" 84 | top: "norm2" 85 | lrn_param { 86 | local_size: 3 87 | alpha: 0.00005 88 | beta: 0.75 89 | norm_region: WITHIN_CHANNEL 90 | engine: CAFFE 91 | } 92 | } 93 | layer { 94 | name: "pool2" 95 | type: "Pooling" 96 | bottom: "norm2" 97 | top: "pool2" 98 | pooling_param { 99 | kernel_size: 3 100 | stride: 2 101 | pad: 1 102 | pool: MAX 103 | } 104 | } 105 | layer { 106 | name: "conv3" 107 | type: "Convolution" 108 | bottom: "pool2" 109 | top: "conv3" 110 | convolution_param { 111 | num_output: 384 112 | kernel_size: 3 113 | pad: 1 114 | stride: 1 115 | } 116 | } 117 | layer { 118 | name: "relu3" 119 | type: "ReLU" 120 | bottom: "conv3" 121 | top: "conv3" 122 | } 123 | layer { 124 | name: "conv4" 125 | type: "Convolution" 126 | bottom: "conv3" 127 | top: "conv4" 128 | convolution_param { 129 | num_output: 384 130 | kernel_size: 3 131 | pad: 1 132 | stride: 1 133 | } 134 | } 135 | layer { 136 | name: "relu4" 137 | type: "ReLU" 138 | bottom: "conv4" 139 | top: "conv4" 140 | } 141 | layer { 142 | name: "conv5" 
143 | type: "Convolution" 144 | bottom: "conv4" 145 | top: "conv5" 146 | convolution_param { 147 | num_output: 256 148 | kernel_size: 3 149 | pad: 1 150 | stride: 1 151 | } 152 | } 153 | layer { 154 | name: "relu5" 155 | type: "ReLU" 156 | bottom: "conv5" 157 | top: "conv5" 158 | } 159 | 160 | #========= RPN ============ 161 | 162 | 163 | layer { 164 | name: "rpn_conv1" 165 | type: "Convolution" 166 | bottom: "conv5" 167 | top: "rpn_conv1" 168 | convolution_param { 169 | num_output: 256 170 | kernel_size: 3 pad: 1 stride: 1 171 | } 172 | } 173 | layer { 174 | name: "rpn_relu1" 175 | type: "ReLU" 176 | bottom: "rpn_conv1" 177 | top: "rpn_conv1" 178 | } 179 | layer { 180 | name: "rpn_cls_score" 181 | type: "Convolution" 182 | bottom: "rpn_conv1" 183 | top: "rpn_cls_score" 184 | convolution_param { 185 | num_output: 18 # 2(bg/fg) * 9(anchors) 186 | kernel_size: 1 pad: 0 stride: 1 187 | } 188 | } 189 | layer { 190 | name: "rpn_bbox_pred" 191 | type: "Convolution" 192 | bottom: "rpn_conv1" 193 | top: "rpn_bbox_pred" 194 | convolution_param { 195 | num_output: 36 # 4 * 9(anchors) 196 | kernel_size: 1 pad: 0 stride: 1 197 | } 198 | } 199 | layer { 200 | bottom: "rpn_cls_score" 201 | top: "rpn_cls_score_reshape" 202 | name: "rpn_cls_score_reshape" 203 | type: "Reshape" 204 | reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } 205 | } 206 | 207 | #========= RoI Proposal ============ 208 | 209 | layer { 210 | name: "rpn_cls_prob" 211 | type: "Softmax" 212 | bottom: "rpn_cls_score_reshape" 213 | top: "rpn_cls_prob" 214 | } 215 | layer { 216 | name: 'rpn_cls_prob_reshape' 217 | type: 'Reshape' 218 | bottom: 'rpn_cls_prob' 219 | top: 'rpn_cls_prob_reshape' 220 | reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } 221 | } 222 | layer { 223 | name: 'proposal' 224 | type: 'Python' 225 | bottom: 'rpn_cls_prob_reshape' 226 | bottom: 'rpn_bbox_pred' 227 | bottom: 'im_info' 228 | top: 'rois' 229 | python_param { 230 | module: 'rpn.proposal_layer' 231 | layer: 
'ProposalLayer' 232 | param_str: "'feat_stride': 16" 233 | } 234 | } 235 | 236 | #========= RCNN ============ 237 | 238 | layer { 239 | name: "roi_pool_conv5" 240 | type: "ROIPooling" 241 | bottom: "conv5" 242 | bottom: "rois" 243 | top: "roi_pool_conv5" 244 | roi_pooling_param { 245 | pooled_w: 6 246 | pooled_h: 6 247 | spatial_scale: 0.0625 # 1/16 248 | } 249 | } 250 | layer { 251 | name: "fc6" 252 | type: "InnerProduct" 253 | bottom: "roi_pool_conv5" 254 | top: "fc6" 255 | inner_product_param { 256 | num_output: 4096 257 | } 258 | } 259 | layer { 260 | name: "relu6" 261 | type: "ReLU" 262 | bottom: "fc6" 263 | top: "fc6" 264 | } 265 | layer { 266 | name: "drop6" 267 | type: "Dropout" 268 | bottom: "fc6" 269 | top: "fc6" 270 | dropout_param { 271 | dropout_ratio: 0.5 272 | scale_train: false 273 | } 274 | } 275 | layer { 276 | name: "fc7" 277 | type: "InnerProduct" 278 | bottom: "fc6" 279 | top: "fc7" 280 | inner_product_param { 281 | num_output: 4096 282 | } 283 | } 284 | layer { 285 | name: "relu7" 286 | type: "ReLU" 287 | bottom: "fc7" 288 | top: "fc7" 289 | } 290 | layer { 291 | name: "drop7" 292 | type: "Dropout" 293 | bottom: "fc7" 294 | top: "fc7" 295 | dropout_param { 296 | dropout_ratio: 0.5 297 | scale_train: false 298 | } 299 | } 300 | layer { 301 | name: "cls_score" 302 | type: "InnerProduct" 303 | bottom: "fc7" 304 | top: "cls_score" 305 | inner_product_param { 306 | num_output: 21 307 | } 308 | } 309 | layer { 310 | name: "bbox_pred" 311 | type: "InnerProduct" 312 | bottom: "fc7" 313 | top: "bbox_pred" 314 | inner_product_param { 315 | num_output: 84 316 | } 317 | } 318 | layer { 319 | name: "cls_prob" 320 | type: "Softmax" 321 | bottom: "cls_score" 322 | top: "cls_prob" 323 | loss_param { 324 | ignore_label: -1 325 | normalize: true 326 | } 327 | } 328 | -------------------------------------------------------------------------------- /models/pascal_voc/ZF/faster_rcnn_alt_opt/rpn_test.pt: 
-------------------------------------------------------------------------------- 1 | name: "ZF" 2 | 3 | input: "data" 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 224 8 | dim: 224 9 | } 10 | 11 | input: "im_info" 12 | input_shape { 13 | dim: 1 14 | dim: 3 15 | } 16 | 17 | # ------------------------ layer 1 ----------------------------- 18 | layer { 19 | name: "conv1" 20 | type: "Convolution" 21 | bottom: "data" 22 | top: "conv1" 23 | convolution_param { 24 | num_output: 96 25 | kernel_size: 7 26 | pad: 3 27 | stride: 2 28 | } 29 | } 30 | layer { 31 | name: "relu1" 32 | type: "ReLU" 33 | bottom: "conv1" 34 | top: "conv1" 35 | } 36 | layer { 37 | name: "norm1" 38 | type: "LRN" 39 | bottom: "conv1" 40 | top: "norm1" 41 | lrn_param { 42 | local_size: 3 43 | alpha: 0.00005 44 | beta: 0.75 45 | norm_region: WITHIN_CHANNEL 46 | engine: CAFFE 47 | } 48 | } 49 | layer { 50 | name: "pool1" 51 | type: "Pooling" 52 | bottom: "norm1" 53 | top: "pool1" 54 | pooling_param { 55 | kernel_size: 3 56 | stride: 2 57 | pad: 1 58 | pool: MAX 59 | } 60 | } 61 | layer { 62 | name: "conv2" 63 | type: "Convolution" 64 | bottom: "pool1" 65 | top: "conv2" 66 | convolution_param { 67 | num_output: 256 68 | kernel_size: 5 69 | pad: 2 70 | stride: 2 71 | } 72 | } 73 | layer { 74 | name: "relu2" 75 | type: "ReLU" 76 | bottom: "conv2" 77 | top: "conv2" 78 | } 79 | 80 | layer { 81 | name: "norm2" 82 | type: "LRN" 83 | bottom: "conv2" 84 | top: "norm2" 85 | lrn_param { 86 | local_size: 3 87 | alpha: 0.00005 88 | beta: 0.75 89 | norm_region: WITHIN_CHANNEL 90 | engine: CAFFE 91 | } 92 | } 93 | layer { 94 | name: "pool2" 95 | type: "Pooling" 96 | bottom: "norm2" 97 | top: "pool2" 98 | pooling_param { 99 | kernel_size: 3 100 | stride: 2 101 | pad: 1 102 | pool: MAX 103 | } 104 | } 105 | layer { 106 | name: "conv3" 107 | type: "Convolution" 108 | bottom: "pool2" 109 | top: "conv3" 110 | convolution_param { 111 | num_output: 384 112 | kernel_size: 3 113 | pad: 1 114 | stride: 1 115 | } 116 | } 117 | 
layer { 118 | name: "relu3" 119 | type: "ReLU" 120 | bottom: "conv3" 121 | top: "conv3" 122 | } 123 | layer { 124 | name: "conv4" 125 | type: "Convolution" 126 | bottom: "conv3" 127 | top: "conv4" 128 | convolution_param { 129 | num_output: 384 130 | kernel_size: 3 131 | pad: 1 132 | stride: 1 133 | } 134 | } 135 | layer { 136 | name: "relu4" 137 | type: "ReLU" 138 | bottom: "conv4" 139 | top: "conv4" 140 | } 141 | layer { 142 | name: "conv5" 143 | type: "Convolution" 144 | bottom: "conv4" 145 | top: "conv5" 146 | convolution_param { 147 | num_output: 256 148 | kernel_size: 3 149 | pad: 1 150 | stride: 1 151 | } 152 | } 153 | layer { 154 | name: "relu5" 155 | type: "ReLU" 156 | bottom: "conv5" 157 | top: "conv5" 158 | } 159 | 160 | #-----------------------layer +------------------------- 161 | 162 | layer { 163 | name: "rpn_conv1" 164 | type: "Convolution" 165 | bottom: "conv5" 166 | top: "rpn_conv1" 167 | convolution_param { 168 | num_output: 256 169 | kernel_size: 3 pad: 1 stride: 1 170 | } 171 | } 172 | layer { 173 | name: "rpn_relu1" 174 | type: "ReLU" 175 | bottom: "rpn_conv1" 176 | top: "rpn_conv1" 177 | } 178 | layer { 179 | name: "rpn_cls_score" 180 | type: "Convolution" 181 | bottom: "rpn_conv1" 182 | top: "rpn_cls_score" 183 | convolution_param { 184 | num_output: 18 # 2(bg/fg) * 9(anchors) 185 | kernel_size: 1 pad: 0 stride: 1 186 | } 187 | } 188 | layer { 189 | name: "rpn_bbox_pred" 190 | type: "Convolution" 191 | bottom: "rpn_conv1" 192 | top: "rpn_bbox_pred" 193 | convolution_param { 194 | num_output: 36 # 4 * 9(anchors) 195 | kernel_size: 1 pad: 0 stride: 1 196 | } 197 | } 198 | layer { 199 | bottom: "rpn_cls_score" 200 | top: "rpn_cls_score_reshape" 201 | name: "rpn_cls_score_reshape" 202 | type: "Reshape" 203 | reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } 204 | } 205 | 206 | #-----------------------output------------------------ 207 | layer { 208 | name: "rpn_cls_prob" 209 | type: "Softmax" 210 | bottom: "rpn_cls_score_reshape" 211 | 
top: "rpn_cls_prob" 212 | } 213 | layer { 214 | name: 'rpn_cls_prob_reshape' 215 | type: 'Reshape' 216 | bottom: 'rpn_cls_prob' 217 | top: 'rpn_cls_prob_reshape' 218 | reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } 219 | } 220 | layer { 221 | name: 'proposal' 222 | type: 'Python' 223 | bottom: 'rpn_cls_prob_reshape' 224 | bottom: 'rpn_bbox_pred' 225 | bottom: 'im_info' 226 | top: 'rois' 227 | top: 'scores' 228 | python_param { 229 | module: 'rpn.proposal_layer' 230 | layer: 'ProposalLayer' 231 | param_str: "'feat_stride': 16" 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_fast_rcnn_solver30k40k.pt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_fast_rcnn_train.pt" 2 | 3 | base_lr: 0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 30000 7 | display: 20 8 | average_loss: 100 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | 12 | # We disable standard caffe solver snapshotting and implement our own snapshot 13 | # function 14 | snapshot: 0 15 | # We still use the snapshot prefix, though 16 | snapshot_prefix: "zf_fast_rcnn" 17 | -------------------------------------------------------------------------------- /models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_rpn_solver60k80k.pt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_rpn_train.pt" 2 | 3 | base_lr: 0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 60000 7 | display: 20 8 | average_loss: 100 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | 12 | # We disable standard caffe solver snapshotting and implement our own snapshot 13 | # function 14 | snapshot: 0 15 | # We still use the snapshot prefix, though 16 | snapshot_prefix: "zf_rpn" 17 | 
-------------------------------------------------------------------------------- /models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_rpn_train.pt: -------------------------------------------------------------------------------- 1 | name: "ZF" 2 | layer { 3 | name: 'input-data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'im_info' 7 | top: 'gt_boxes' 8 | python_param { 9 | module: 'roi_data_layer.layer' 10 | layer: 'RoIDataLayer' 11 | param_str: "'num_classes': 21" 12 | } 13 | } 14 | 15 | #========= conv1-conv5 ============ 16 | 17 | layer { 18 | name: "conv1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1" 22 | param { lr_mult: 1.0 } 23 | param { lr_mult: 2.0 } 24 | convolution_param { 25 | num_output: 96 26 | kernel_size: 7 27 | pad: 3 28 | stride: 2 29 | } 30 | } 31 | layer { 32 | name: "relu1" 33 | type: "ReLU" 34 | bottom: "conv1" 35 | top: "conv1" 36 | } 37 | layer { 38 | name: "norm1" 39 | type: "LRN" 40 | bottom: "conv1" 41 | top: "norm1" 42 | lrn_param { 43 | local_size: 3 44 | alpha: 0.00005 45 | beta: 0.75 46 | norm_region: WITHIN_CHANNEL 47 | engine: CAFFE 48 | } 49 | } 50 | layer { 51 | name: "pool1" 52 | type: "Pooling" 53 | bottom: "norm1" 54 | top: "pool1" 55 | pooling_param { 56 | kernel_size: 3 57 | stride: 2 58 | pad: 1 59 | pool: MAX 60 | } 61 | } 62 | layer { 63 | name: "conv2" 64 | type: "Convolution" 65 | bottom: "pool1" 66 | top: "conv2" 67 | param { lr_mult: 1.0 } 68 | param { lr_mult: 2.0 } 69 | convolution_param { 70 | num_output: 256 71 | kernel_size: 5 72 | pad: 2 73 | stride: 2 74 | } 75 | } 76 | layer { 77 | name: "relu2" 78 | type: "ReLU" 79 | bottom: "conv2" 80 | top: "conv2" 81 | } 82 | layer { 83 | name: "norm2" 84 | type: "LRN" 85 | bottom: "conv2" 86 | top: "norm2" 87 | lrn_param { 88 | local_size: 3 89 | alpha: 0.00005 90 | beta: 0.75 91 | norm_region: WITHIN_CHANNEL 92 | engine: CAFFE 93 | } 94 | } 95 | layer { 96 | name: "pool2" 97 | type: "Pooling" 98 | bottom: "norm2" 99 | top: "pool2" 100 | pooling_param { 101 | 
kernel_size: 3 102 | stride: 2 103 | pad: 1 104 | pool: MAX 105 | } 106 | } 107 | layer { 108 | name: "conv3" 109 | type: "Convolution" 110 | bottom: "pool2" 111 | top: "conv3" 112 | param { lr_mult: 1.0 } 113 | param { lr_mult: 2.0 } 114 | convolution_param { 115 | num_output: 384 116 | kernel_size: 3 117 | pad: 1 118 | stride: 1 119 | } 120 | } 121 | layer { 122 | name: "relu3" 123 | type: "ReLU" 124 | bottom: "conv3" 125 | top: "conv3" 126 | } 127 | layer { 128 | name: "conv4" 129 | type: "Convolution" 130 | bottom: "conv3" 131 | top: "conv4" 132 | param { lr_mult: 1.0 } 133 | param { lr_mult: 2.0 } 134 | convolution_param { 135 | num_output: 384 136 | kernel_size: 3 137 | pad: 1 138 | stride: 1 139 | } 140 | } 141 | layer { 142 | name: "relu4" 143 | type: "ReLU" 144 | bottom: "conv4" 145 | top: "conv4" 146 | } 147 | layer { 148 | name: "conv5" 149 | type: "Convolution" 150 | bottom: "conv4" 151 | top: "conv5" 152 | param { lr_mult: 1.0 } 153 | param { lr_mult: 2.0 } 154 | convolution_param { 155 | num_output: 256 156 | kernel_size: 3 157 | pad: 1 158 | stride: 1 159 | } 160 | } 161 | layer { 162 | name: "relu5" 163 | type: "ReLU" 164 | bottom: "conv5" 165 | top: "conv5" 166 | } 167 | 168 | #========= RPN ============ 169 | 170 | layer { 171 | name: "rpn_conv1" 172 | type: "Convolution" 173 | bottom: "conv5" 174 | top: "rpn_conv1" 175 | param { lr_mult: 1.0 } 176 | param { lr_mult: 2.0 } 177 | convolution_param { 178 | num_output: 256 179 | kernel_size: 3 pad: 1 stride: 1 180 | weight_filler { type: "gaussian" std: 0.01 } 181 | bias_filler { type: "constant" value: 0 } 182 | } 183 | } 184 | layer { 185 | name: "rpn_relu1" 186 | type: "ReLU" 187 | bottom: "rpn_conv1" 188 | top: "rpn_conv1" 189 | } 190 | layer { 191 | name: "rpn_cls_score" 192 | type: "Convolution" 193 | bottom: "rpn_conv1" 194 | top: "rpn_cls_score" 195 | param { lr_mult: 1.0 } 196 | param { lr_mult: 2.0 } 197 | convolution_param { 198 | num_output: 18 # 2(bg/fg) * 9(anchors) 199 | kernel_size: 1 
pad: 0 stride: 1 200 | weight_filler { type: "gaussian" std: 0.01 } 201 | bias_filler { type: "constant" value: 0 } 202 | } 203 | } 204 | layer { 205 | name: "rpn_bbox_pred" 206 | type: "Convolution" 207 | bottom: "rpn_conv1" 208 | top: "rpn_bbox_pred" 209 | param { lr_mult: 1.0 } 210 | param { lr_mult: 2.0 } 211 | convolution_param { 212 | num_output: 36 # 4 * 9(anchors) 213 | kernel_size: 1 pad: 0 stride: 1 214 | weight_filler { type: "gaussian" std: 0.01 } 215 | bias_filler { type: "constant" value: 0 } 216 | } 217 | } 218 | layer { 219 | bottom: "rpn_cls_score" 220 | top: "rpn_cls_score_reshape" 221 | name: "rpn_cls_score_reshape" 222 | type: "Reshape" 223 | reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } 224 | } 225 | layer { 226 | name: 'rpn-data' 227 | type: 'Python' 228 | bottom: 'rpn_cls_score' 229 | bottom: 'gt_boxes' 230 | bottom: 'im_info' 231 | bottom: 'data' 232 | top: 'rpn_labels' 233 | top: 'rpn_bbox_targets' 234 | top: 'rpn_bbox_inside_weights' 235 | top: 'rpn_bbox_outside_weights' 236 | python_param { 237 | module: 'rpn.anchor_target_layer' 238 | layer: 'AnchorTargetLayer' 239 | param_str: "'feat_stride': 16" 240 | } 241 | } 242 | layer { 243 | name: "rpn_loss_cls" 244 | type: "SoftmaxWithLoss" 245 | bottom: "rpn_cls_score_reshape" 246 | bottom: "rpn_labels" 247 | propagate_down: 1 248 | propagate_down: 0 249 | top: "rpn_cls_loss" 250 | loss_weight: 1 251 | loss_param { 252 | ignore_label: -1 253 | normalize: true 254 | } 255 | } 256 | layer { 257 | name: "rpn_loss_bbox" 258 | type: "SmoothL1Loss" 259 | bottom: "rpn_bbox_pred" 260 | bottom: "rpn_bbox_targets" 261 | bottom: "rpn_bbox_inside_weights" 262 | bottom: "rpn_bbox_outside_weights" 263 | top: "rpn_loss_bbox" 264 | loss_weight: 1 265 | smooth_l1_loss_param { sigma: 3.0 } 266 | } 267 | 268 | #========= RCNN ============ 269 | # Dummy layers so that initial parameters are saved into the output net 270 | 271 | layer { 272 | name: "dummy_roi_pool_conv5" 273 | type: "DummyData" 274 | 
top: "dummy_roi_pool_conv5" 275 | dummy_data_param { 276 | shape { dim: 1 dim: 9216 } 277 | data_filler { type: "gaussian" std: 0.01 } 278 | } 279 | } 280 | layer { 281 | name: "fc6" 282 | type: "InnerProduct" 283 | bottom: "dummy_roi_pool_conv5" 284 | top: "fc6" 285 | param { lr_mult: 0 decay_mult: 0 } 286 | param { lr_mult: 0 decay_mult: 0 } 287 | inner_product_param { 288 | num_output: 4096 289 | } 290 | } 291 | layer { 292 | name: "relu6" 293 | type: "ReLU" 294 | bottom: "fc6" 295 | top: "fc6" 296 | } 297 | layer { 298 | name: "fc7" 299 | type: "InnerProduct" 300 | bottom: "fc6" 301 | top: "fc7" 302 | param { lr_mult: 0 decay_mult: 0 } 303 | param { lr_mult: 0 decay_mult: 0 } 304 | inner_product_param { 305 | num_output: 4096 306 | } 307 | } 308 | layer { 309 | name: "silence_fc7" 310 | type: "Silence" 311 | bottom: "fc7" 312 | } 313 | -------------------------------------------------------------------------------- /models/pascal_voc/ZF/faster_rcnn_alt_opt/stage2_fast_rcnn_solver30k40k.pt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/ZF/faster_rcnn_alt_opt/stage2_fast_rcnn_train.pt" 2 | 3 | base_lr: 0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 30000 7 | display: 20 8 | average_loss: 100 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | 12 | # We disable standard caffe solver snapshotting and implement our own snapshot 13 | # function 14 | snapshot: 0 15 | # We still use the snapshot prefix, though 16 | snapshot_prefix: "zf_fast_rcnn" 17 | -------------------------------------------------------------------------------- /models/pascal_voc/ZF/faster_rcnn_alt_opt/stage2_rpn_solver60k80k.pt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/ZF/faster_rcnn_alt_opt/stage2_rpn_train.pt" 2 | 3 | base_lr: 0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 60000 7 | display: 20 8 | average_loss: 100 9 | momentum: 0.9 10 | 
weight_decay: 0.0005 11 | 12 | # We disable standard caffe solver snapshotting and implement our own snapshot 13 | # function 14 | snapshot: 0 15 | # We still use the snapshot prefix, though 16 | snapshot_prefix: "zf_rpn" 17 | -------------------------------------------------------------------------------- /models/pascal_voc/ZF/faster_rcnn_alt_opt/stage2_rpn_train.pt: -------------------------------------------------------------------------------- 1 | name: "ZF" 2 | layer { 3 | name: 'input-data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'im_info' 7 | top: 'gt_boxes' 8 | python_param { 9 | module: 'roi_data_layer.layer' 10 | layer: 'RoIDataLayer' 11 | param_str: "'num_classes': 21" 12 | } 13 | } 14 | 15 | #========= conv1-conv5 ============ 16 | 17 | layer { 18 | name: "conv1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1" 22 | param { lr_mult: 0 decay_mult: 0 } 23 | param { lr_mult: 0 decay_mult: 0 } 24 | convolution_param { 25 | num_output: 96 26 | kernel_size: 7 27 | pad: 3 28 | stride: 2 29 | } 30 | } 31 | layer { 32 | name: "relu1" 33 | type: "ReLU" 34 | bottom: "conv1" 35 | top: "conv1" 36 | } 37 | layer { 38 | name: "norm1" 39 | type: "LRN" 40 | bottom: "conv1" 41 | top: "norm1" 42 | lrn_param { 43 | local_size: 3 44 | alpha: 0.00005 45 | beta: 0.75 46 | norm_region: WITHIN_CHANNEL 47 | engine: CAFFE 48 | } 49 | } 50 | layer { 51 | name: "pool1" 52 | type: "Pooling" 53 | bottom: "norm1" 54 | top: "pool1" 55 | pooling_param { 56 | kernel_size: 3 57 | stride: 2 58 | pad: 1 59 | pool: MAX 60 | } 61 | } 62 | layer { 63 | name: "conv2" 64 | type: "Convolution" 65 | bottom: "pool1" 66 | top: "conv2" 67 | param { lr_mult: 0 decay_mult: 0 } 68 | param { lr_mult: 0 decay_mult: 0 } 69 | convolution_param { 70 | num_output: 256 71 | kernel_size: 5 72 | pad: 2 73 | stride: 2 74 | } 75 | } 76 | layer { 77 | name: "relu2" 78 | type: "ReLU" 79 | bottom: "conv2" 80 | top: "conv2" 81 | } 82 | layer { 83 | name: "norm2" 84 | type: "LRN" 85 | bottom: "conv2" 
86 | top: "norm2" 87 | lrn_param { 88 | local_size: 3 89 | alpha: 0.00005 90 | beta: 0.75 91 | norm_region: WITHIN_CHANNEL 92 | engine: CAFFE 93 | } 94 | } 95 | layer { 96 | name: "pool2" 97 | type: "Pooling" 98 | bottom: "norm2" 99 | top: "pool2" 100 | pooling_param { 101 | kernel_size: 3 102 | stride: 2 103 | pad: 1 104 | pool: MAX 105 | } 106 | } 107 | layer { 108 | name: "conv3" 109 | type: "Convolution" 110 | bottom: "pool2" 111 | top: "conv3" 112 | param { lr_mult: 0 decay_mult: 0 } 113 | param { lr_mult: 0 decay_mult: 0 } 114 | convolution_param { 115 | num_output: 384 116 | kernel_size: 3 117 | pad: 1 118 | stride: 1 119 | } 120 | } 121 | layer { 122 | name: "relu3" 123 | type: "ReLU" 124 | bottom: "conv3" 125 | top: "conv3" 126 | } 127 | layer { 128 | name: "conv4" 129 | type: "Convolution" 130 | bottom: "conv3" 131 | top: "conv4" 132 | param { lr_mult: 0 decay_mult: 0 } 133 | param { lr_mult: 0 decay_mult: 0 } 134 | convolution_param { 135 | num_output: 384 136 | kernel_size: 3 137 | pad: 1 138 | stride: 1 139 | } 140 | } 141 | layer { 142 | name: "relu4" 143 | type: "ReLU" 144 | bottom: "conv4" 145 | top: "conv4" 146 | } 147 | layer { 148 | name: "conv5" 149 | type: "Convolution" 150 | bottom: "conv4" 151 | top: "conv5" 152 | param { lr_mult: 0 decay_mult: 0 } 153 | param { lr_mult: 0 decay_mult: 0 } 154 | convolution_param { 155 | num_output: 256 156 | kernel_size: 3 157 | pad: 1 158 | stride: 1 159 | } 160 | } 161 | layer { 162 | name: "relu5" 163 | type: "ReLU" 164 | bottom: "conv5" 165 | top: "conv5" 166 | } 167 | 168 | #========= RPN ============ 169 | 170 | layer { 171 | name: "rpn_conv1" 172 | type: "Convolution" 173 | bottom: "conv5" 174 | top: "rpn_conv1" 175 | param { lr_mult: 1.0 } 176 | param { lr_mult: 2.0 } 177 | convolution_param { 178 | num_output: 256 179 | kernel_size: 3 pad: 1 stride: 1 180 | weight_filler { type: "gaussian" std: 0.01 } 181 | bias_filler { type: "constant" value: 0 } 182 | } 183 | } 184 | layer { 185 | name: 
"rpn_relu1" 186 | type: "ReLU" 187 | bottom: "rpn_conv1" 188 | top: "rpn_conv1" 189 | } 190 | layer { 191 | name: "rpn_cls_score" 192 | type: "Convolution" 193 | bottom: "rpn_conv1" 194 | top: "rpn_cls_score" 195 | param { lr_mult: 1.0 } 196 | param { lr_mult: 2.0 } 197 | convolution_param { 198 | num_output: 18 # 2(bg/fg) * 9(anchors) 199 | kernel_size: 1 pad: 0 stride: 1 200 | weight_filler { type: "gaussian" std: 0.01 } 201 | bias_filler { type: "constant" value: 0 } 202 | } 203 | } 204 | layer { 205 | name: "rpn_bbox_pred" 206 | type: "Convolution" 207 | bottom: "rpn_conv1" 208 | top: "rpn_bbox_pred" 209 | param { lr_mult: 1.0 } 210 | param { lr_mult: 2.0 } 211 | convolution_param { 212 | num_output: 36 # 4 * 9(anchors) 213 | kernel_size: 1 pad: 0 stride: 1 214 | weight_filler { type: "gaussian" std: 0.01 } 215 | bias_filler { type: "constant" value: 0 } 216 | } 217 | } 218 | layer { 219 | bottom: "rpn_cls_score" 220 | top: "rpn_cls_score_reshape" 221 | name: "rpn_cls_score_reshape" 222 | type: "Reshape" 223 | reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } 224 | } 225 | layer { 226 | name: 'rpn-data' 227 | type: 'Python' 228 | bottom: 'rpn_cls_score' 229 | bottom: 'gt_boxes' 230 | bottom: 'im_info' 231 | bottom: 'data' 232 | top: 'rpn_labels' 233 | top: 'rpn_bbox_targets' 234 | top: 'rpn_bbox_inside_weights' 235 | top: 'rpn_bbox_outside_weights' 236 | python_param { 237 | module: 'rpn.anchor_target_layer' 238 | layer: 'AnchorTargetLayer' 239 | param_str: "'feat_stride': 16" 240 | } 241 | } 242 | layer { 243 | name: "rpn_loss_cls" 244 | type: "SoftmaxWithLoss" 245 | bottom: "rpn_cls_score_reshape" 246 | bottom: "rpn_labels" 247 | propagate_down: 1 248 | propagate_down: 0 249 | top: "rpn_cls_loss" 250 | loss_weight: 1 251 | loss_param { 252 | ignore_label: -1 253 | normalize: true 254 | } 255 | } 256 | layer { 257 | name: "rpn_loss_bbox" 258 | type: "SmoothL1Loss" 259 | bottom: "rpn_bbox_pred" 260 | bottom: "rpn_bbox_targets" 261 | bottom: 
"rpn_bbox_inside_weights" 262 | bottom: "rpn_bbox_outside_weights" 263 | top: "rpn_loss_bbox" 264 | loss_weight: 1 265 | smooth_l1_loss_param { sigma: 3.0 } 266 | } 267 | 268 | #========= RCNN ============ 269 | # Dummy layers so that initial parameters are saved into the output net 270 | 271 | layer { 272 | name: "dummy_roi_pool_conv5" 273 | type: "DummyData" 274 | top: "dummy_roi_pool_conv5" 275 | dummy_data_param { 276 | shape { dim: 1 dim: 9216 } 277 | data_filler { type: "gaussian" std: 0.01 } 278 | } 279 | } 280 | layer { 281 | name: "fc6" 282 | type: "InnerProduct" 283 | bottom: "dummy_roi_pool_conv5" 284 | top: "fc6" 285 | param { lr_mult: 0 decay_mult: 0 } 286 | param { lr_mult: 0 decay_mult: 0 } 287 | inner_product_param { 288 | num_output: 4096 289 | } 290 | } 291 | layer { 292 | name: "relu6" 293 | type: "ReLU" 294 | bottom: "fc6" 295 | top: "fc6" 296 | } 297 | layer { 298 | name: "fc7" 299 | type: "InnerProduct" 300 | bottom: "fc6" 301 | top: "fc7" 302 | param { lr_mult: 0 decay_mult: 0 } 303 | param { lr_mult: 0 decay_mult: 0 } 304 | inner_product_param { 305 | num_output: 4096 306 | } 307 | } 308 | layer { 309 | name: "silence_fc7" 310 | type: "Silence" 311 | bottom: "fc7" 312 | } 313 | -------------------------------------------------------------------------------- /models/pascal_voc/ZF/faster_rcnn_end2end/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/pascal_voc/ZF/faster_rcnn_end2end/train.prototxt" 2 | 3 | base_lr: 0.001 4 | lr_policy: "step" 5 | gamma: 0.1 6 | stepsize: 50000 7 | display: 20 8 | average_loss: 100 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | 12 | #base_lr: 0.001 13 | #lr_policy: "exp" 14 | #gamma: 0.999539589 # (0.00001/0.001)^(1/10000) 15 | #display: 1 16 | #average_loss: 100 17 | #momentum: 0.9 18 | #weight_decay: 0.0005 19 | 20 | # We disable standard caffe solver snapshotting and implement our own snapshot 21 | # function 22 | snapshot: 0 23 | # We 
def add_path(path):
    """Prepend *path* to sys.path unless it is already present."""
    if path in sys.path:
        return
    sys.path.insert(0, path)
arguments.""" 20 | parser = argparse.ArgumentParser(description='Compress a Fast R-CNN network') 21 | parser.add_argument('--def', dest='prototxt', 22 | help='prototxt file defining the uncompressed network', 23 | default=None, type=str) 24 | parser.add_argument('--def-svd', dest='prototxt_svd', 25 | help='prototxt file defining the SVD compressed network', 26 | default=None, type=str) 27 | parser.add_argument('--net', dest='caffemodel', 28 | help='model to compress', 29 | default=None, type=str) 30 | 31 | if len(sys.argv) == 1: 32 | parser.print_help() 33 | sys.exit(1) 34 | 35 | args = parser.parse_args() 36 | return args 37 | 38 | def compress_weights(W, l): 39 | """Compress the weight matrix W of an inner product (fully connected) layer 40 | using truncated SVD. 41 | 42 | Parameters: 43 | W: N x M weights matrix 44 | l: number of singular values to retain 45 | 46 | Returns: 47 | Ul, L: matrices such that W \approx Ul*L 48 | """ 49 | 50 | # numpy doesn't seem to have a fast truncated SVD algorithm... 
51 | # this could be faster 52 | U, s, V = np.linalg.svd(W, full_matrices=False) 53 | 54 | Ul = U[:, :l] 55 | sl = s[:l] 56 | Vl = V[:l, :] 57 | 58 | L = np.dot(np.diag(sl), Vl) 59 | return Ul, L 60 | 61 | def main(): 62 | args = parse_args() 63 | 64 | # prototxt = 'models/VGG16/test.prototxt' 65 | # caffemodel = 'snapshots/vgg16_fast_rcnn_iter_40000.caffemodel' 66 | net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST) 67 | 68 | # prototxt_svd = 'models/VGG16/svd/test_fc6_fc7.prototxt' 69 | # caffemodel = 'snapshots/vgg16_fast_rcnn_iter_40000.caffemodel' 70 | net_svd = caffe.Net(args.prototxt_svd, args.caffemodel, caffe.TEST) 71 | 72 | print('Uncompressed network {} : {}'.format(args.prototxt, args.caffemodel)) 73 | print('Compressed network prototxt {}'.format(args.prototxt_svd)) 74 | 75 | out = os.path.splitext(os.path.basename(args.caffemodel))[0] + '_svd' 76 | out_dir = os.path.dirname(args.caffemodel) 77 | 78 | # Compress fc6 79 | if net_svd.params.has_key('fc6_L'): 80 | l_fc6 = net_svd.params['fc6_L'][0].data.shape[0] 81 | print(' fc6_L bottleneck size: {}'.format(l_fc6)) 82 | 83 | # uncompressed weights and biases 84 | W_fc6 = net.params['fc6'][0].data 85 | B_fc6 = net.params['fc6'][1].data 86 | 87 | print(' compressing fc6...') 88 | Ul_fc6, L_fc6 = compress_weights(W_fc6, l_fc6) 89 | 90 | assert(len(net_svd.params['fc6_L']) == 1) 91 | 92 | # install compressed matrix factors (and original biases) 93 | net_svd.params['fc6_L'][0].data[...] = L_fc6 94 | 95 | net_svd.params['fc6_U'][0].data[...] = Ul_fc6 96 | net_svd.params['fc6_U'][1].data[...] 
= B_fc6 97 | 98 | out += '_fc6_{}'.format(l_fc6) 99 | 100 | # Compress fc7 101 | if net_svd.params.has_key('fc7_L'): 102 | l_fc7 = net_svd.params['fc7_L'][0].data.shape[0] 103 | print ' fc7_L bottleneck size: {}'.format(l_fc7) 104 | 105 | W_fc7 = net.params['fc7'][0].data 106 | B_fc7 = net.params['fc7'][1].data 107 | 108 | print(' compressing fc7...') 109 | Ul_fc7, L_fc7 = compress_weights(W_fc7, l_fc7) 110 | 111 | assert(len(net_svd.params['fc7_L']) == 1) 112 | 113 | net_svd.params['fc7_L'][0].data[...] = L_fc7 114 | 115 | net_svd.params['fc7_U'][0].data[...] = Ul_fc7 116 | net_svd.params['fc7_U'][1].data[...] = B_fc7 117 | 118 | out += '_fc7_{}'.format(l_fc7) 119 | 120 | filename = '{}/{}.caffemodel'.format(out_dir, out) 121 | net_svd.save(filename) 122 | print 'Wrote svd model to: {:s}'.format(filename) 123 | 124 | if __name__ == '__main__': 125 | main() 126 | -------------------------------------------------------------------------------- /tools/demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -------------------------------------------------------- 4 | # Faster R-CNN 5 | # Copyright (c) 2015 Microsoft 6 | # Licensed under The MIT License [see LICENSE for details] 7 | # Written by Ross Girshick 8 | # -------------------------------------------------------- 9 | 10 | """ 11 | Demo script showing detections in sample images. 12 | 13 | See README.md for installation instructions before running. 
14 | """ 15 | 16 | import _init_paths 17 | from fast_rcnn.config import cfg 18 | from fast_rcnn.test import im_detect 19 | from fast_rcnn.nms_wrapper import nms 20 | from utils.timer import Timer 21 | import matplotlib.pyplot as plt 22 | import numpy as np 23 | import scipy.io as sio 24 | import caffe, os, sys, cv2 25 | import argparse 26 | 27 | CLASSES = ('__background__', 28 | 'aeroplane', 'bicycle', 'bird', 'boat', 29 | 'bottle', 'bus', 'car', 'cat', 'chair', 30 | 'cow', 'diningtable', 'dog', 'horse', 31 | 'motorbike', 'person', 'pottedplant', 32 | 'sheep', 'sofa', 'train', 'tvmonitor') 33 | 34 | NETS = {'vgg16': ('VGG16', 35 | 'VGG16_faster_rcnn_final.caffemodel'), 36 | 'zf': ('ZF', 37 | 'ZF_faster_rcnn_final.caffemodel')} 38 | 39 | 40 | def vis_detections(im, class_name, dets, thresh=0.5): 41 | """Draw detected bounding boxes.""" 42 | inds = np.where(dets[:, -1] >= thresh)[0] 43 | if len(inds) == 0: 44 | return 45 | 46 | im = im[:, :, (2, 1, 0)] 47 | fig, ax = plt.subplots(figsize=(12, 12)) 48 | ax.imshow(im, aspect='equal') 49 | for i in inds: 50 | bbox = dets[i, :4] 51 | score = dets[i, -1] 52 | 53 | ax.add_patch( 54 | plt.Rectangle((bbox[0], bbox[1]), 55 | bbox[2] - bbox[0], 56 | bbox[3] - bbox[1], fill=False, 57 | edgecolor='red', linewidth=3.5) 58 | ) 59 | ax.text(bbox[0], bbox[1] - 2, 60 | '{:s} {:.3f}'.format(class_name, score), 61 | bbox=dict(facecolor='blue', alpha=0.5), 62 | fontsize=14, color='white') 63 | 64 | ax.set_title(('{} detections with ' 65 | 'p({} | box) >= {:.1f}').format(class_name, class_name, 66 | thresh), 67 | fontsize=14) 68 | plt.axis('off') 69 | plt.tight_layout() 70 | plt.draw() 71 | 72 | def demo(net, image_name): 73 | """Detect object classes in an image using pre-computed object proposals.""" 74 | 75 | # Load the demo image 76 | im_file = os.path.join(cfg.DATA_DIR, 'demo', image_name) 77 | im = cv2.imread(im_file) 78 | 79 | # Detect all object classes and regress object bounds 80 | timer = Timer() 81 | timer.tic() 82 | scores, 
def parse_args():
    """Parse the demo's command-line arguments into a Namespace."""
    parser = argparse.ArgumentParser(description='Faster R-CNN demo')
    parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]',
                        default=0, type=int)
    parser.add_argument('--cpu', dest='cpu_mode',
                        help='Use CPU mode (overrides --gpu)',
                        action='store_true')
    parser.add_argument('--net', dest='demo_net', help='Network to use [vgg16]',
                        choices=NETS.keys(), default='vgg16')
    return parser.parse_args()
def parse_args():
    """Parse command-line arguments for the recall-evaluation script."""
    parser = argparse.ArgumentParser(description='Test a Fast R-CNN network')
    parser.add_argument('--imdb', dest='imdb_name',
                        help='dataset to test',
                        default='voc_2007_test', type=str)
    parser.add_argument('--method', dest='method',
                        help='proposal method',
                        default='selective_search', type=str)
    parser.add_argument('--rpn-file', dest='rpn_file',
                        default=None, type=str)

    # With no arguments at all, print usage instead of running on defaults.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    return parser.parse_args()
gt_overlaps, recalls, thresholds = \ 50 | imdb.evaluate_recall(candidate_boxes=candidate_boxes) 51 | print 'Method: {}'.format(args.method) 52 | print 'AverageRec: {:.3f}'.format(ar) 53 | 54 | def recall_at(t): 55 | ind = np.where(thresholds > t - 1e-5)[0][0] 56 | assert np.isclose(thresholds[ind], t) 57 | return recalls[ind] 58 | 59 | print 'Recall@0.5: {:.3f}'.format(recall_at(0.5)) 60 | print 'Recall@0.6: {:.3f}'.format(recall_at(0.6)) 61 | print 'Recall@0.7: {:.3f}'.format(recall_at(0.7)) 62 | print 'Recall@0.8: {:.3f}'.format(recall_at(0.8)) 63 | print 'Recall@0.9: {:.3f}'.format(recall_at(0.9)) 64 | # print again for easy spreadsheet copying 65 | print '{:.3f}'.format(ar) 66 | print '{:.3f}'.format(recall_at(0.5)) 67 | print '{:.3f}'.format(recall_at(0.6)) 68 | print '{:.3f}'.format(recall_at(0.7)) 69 | print '{:.3f}'.format(recall_at(0.8)) 70 | print '{:.3f}'.format(recall_at(0.9)) 71 | -------------------------------------------------------------------------------- /tools/reval.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -------------------------------------------------------- 4 | # Fast R-CNN 5 | # Copyright (c) 2015 Microsoft 6 | # Licensed under The MIT License [see LICENSE for details] 7 | # Written by Ross Girshick 8 | # -------------------------------------------------------- 9 | 10 | """Reval = re-eval. 
def parse_args():
    """Parse command-line arguments for re-evaluating saved detections."""
    parser = argparse.ArgumentParser(description='Re-evaluate results')
    parser.add_argument('output_dir', nargs=1, help='results directory',
                        type=str)
    parser.add_argument('--imdb', dest='imdb_name',
                        help='dataset to re-evaluate',
                        default='voc_2007_test', type=str)
    parser.add_argument('--matlab', dest='matlab_eval',
                        help='use matlab for evaluation',
                        action='store_true')
    parser.add_argument('--comp', dest='comp_mode', help='competition mode',
                        action='store_true')
    parser.add_argument('--nms', dest='apply_nms', help='apply nms',
                        action='store_true')

    # Bare invocation: show usage and bail out rather than crash on the
    # missing positional argument.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    return parser.parse_args()
def parse_args():
    """Parse command-line arguments for RPN proposal generation.

    Returns an argparse.Namespace; prints usage and exits when the script
    is invoked with no arguments at all.
    """
    def str2bool(value):
        # argparse's type=bool is broken for optional values: bool('False')
        # is True because every non-empty string is truthy. Accept the usual
        # textual spellings explicitly instead.
        if isinstance(value, bool):
            return value
        lowered = value.lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError('expected a boolean, got %r' % value)

    parser = argparse.ArgumentParser(description='Test a Fast R-CNN network')
    parser.add_argument('--gpu', dest='gpu_id', help='GPU id to use',
                        default=0, type=int)
    parser.add_argument('--def', dest='prototxt',
                        help='prototxt file defining the network',
                        default=None, type=str)
    parser.add_argument('--net', dest='caffemodel',
                        help='model to test',
                        default=None, type=str)
    parser.add_argument('--cfg', dest='cfg_file',
                        help='optional config file', default=None, type=str)
    parser.add_argument('--wait', dest='wait',
                        help='wait until net file exists',
                        default=True, type=str2bool)
    parser.add_argument('--imdb', dest='imdb_name',
                        help='dataset to test',
                        default='voc_2007_test', type=str)
    parser.add_argument('--set', dest='set_cfgs',
                        help='set config keys', default=None,
                        nargs=argparse.REMAINDER)

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()
    return args
def parse_args():
    """Parse command-line arguments for testing a Fast R-CNN network.

    Returns an argparse.Namespace; prints usage and exits when the script
    is invoked with no arguments at all.
    """
    def str2bool(value):
        # argparse's type=bool is broken for optional values: bool('False')
        # is True because every non-empty string is truthy. Accept the usual
        # textual spellings explicitly instead.
        if isinstance(value, bool):
            return value
        lowered = value.lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0'):
            return False
        raise argparse.ArgumentTypeError('expected a boolean, got %r' % value)

    parser = argparse.ArgumentParser(description='Test a Fast R-CNN network')
    parser.add_argument('--gpu', dest='gpu_id', help='GPU id to use',
                        default=0, type=int)
    parser.add_argument('--def', dest='prototxt',
                        help='prototxt file defining the network',
                        default=None, type=str)
    parser.add_argument('--net', dest='caffemodel',
                        help='model to test',
                        default=None, type=str)
    parser.add_argument('--cfg', dest='cfg_file',
                        help='optional config file', default=None, type=str)
    parser.add_argument('--wait', dest='wait',
                        help='wait until net file exists',
                        default=True, type=str2bool)
    parser.add_argument('--imdb', dest='imdb_name',
                        help='dataset to test',
                        default='voc_2007_test', type=str)
    parser.add_argument('--comp', dest='comp_mode', help='competition mode',
                        action='store_true')
    parser.add_argument('--set', dest='set_cfgs',
                        help='set config keys', default=None,
                        nargs=argparse.REMAINDER)
    parser.add_argument('--vis', dest='vis', help='visualize detections',
                        action='store_true')
    parser.add_argument('--num_dets', dest='max_per_image',
                        help='max number of detections per image',
                        default=100, type=int)

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()
    return args
def parse_args():
    """Parse command-line arguments for training a Fast R-CNN network."""
    parser = argparse.ArgumentParser(description='Train a Fast R-CNN network')
    parser.add_argument('--gpu', dest='gpu_id',
                        help='GPU device id to use [0]',
                        default=0, type=int)
    parser.add_argument('--solver', dest='solver',
                        help='solver prototxt',
                        default=None, type=str)
    parser.add_argument('--iters', dest='max_iters',
                        help='number of iterations to train',
                        default=40000, type=int)
    parser.add_argument('--weights', dest='pretrained_model',
                        help='initialize with pretrained model weights',
                        default=None, type=str)
    parser.add_argument('--cfg', dest='cfg_file',
                        help='optional config file',
                        default=None, type=str)
    parser.add_argument('--imdb', dest='imdb_name',
                        help='dataset to train on',
                        default='voc_2007_trainval', type=str)
    parser.add_argument('--rand', dest='randomize',
                        help='randomize (do not use a fixed seed)',
                        action='store_true')
    parser.add_argument('--set', dest='set_cfgs',
                        help='set config keys', default=None,
                        nargs=argparse.REMAINDER)

    # With no arguments at all, print usage instead of training on defaults.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    return parser.parse_args()
--------------------------------------------------------------------------------