├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── data ├── .gitignore ├── README.md ├── demo │ ├── 000004.jpg │ ├── 000004_boxes.mat │ ├── 001551.jpg │ └── 001551_boxes.mat ├── pylintrc └── scripts │ ├── fetch_fast_rcnn_models.sh │ ├── fetch_imagenet_models.sh │ └── fetch_selective_search_data.sh ├── experiments ├── README.md ├── cfgs │ ├── fc_only.yml │ ├── multiscale.yml │ ├── no_bbox_reg.yml │ ├── piecewise.yml │ └── svm.yml ├── logs │ └── .gitignore └── scripts │ ├── all_caffenet.sh │ ├── all_vgg16.sh │ ├── all_vgg_cnn_m_1024.sh │ ├── default_caffenet.sh │ ├── default_vgg16.sh │ ├── default_vgg_cnn_m_1024.sh │ ├── fc_only_vgg16.sh │ ├── multiscale_caffenet.sh │ ├── multiscale_vgg_cnn_m_1024.sh │ ├── multitask_no_bbox_reg_caffenet.sh │ ├── multitask_no_bbox_reg_vgg16.sh │ ├── multitask_no_bbox_reg_vgg_cnn_m_1024.sh │ ├── no_bbox_reg_caffenet.sh │ ├── no_bbox_reg_vgg16.sh │ ├── no_bbox_reg_vgg_cnn_m_1024.sh │ ├── piecewise_caffenet.sh │ ├── piecewise_vgg16.sh │ ├── piecewise_vgg_cnn_m_1024.sh │ ├── svd_caffenet.sh │ ├── svd_vgg16.sh │ ├── svd_vgg_cnn_m_1024.sh │ ├── svm_caffenet.sh │ ├── svm_vgg16.sh │ └── svm_vgg_cnn_m_1024.sh ├── lib ├── Makefile ├── datasets │ ├── VOCdevkit-matlab-wrapper │ │ ├── get_voc_opts.m │ │ ├── voc_eval.m │ │ └── xVOCap.m │ ├── __init__.py │ ├── factory.py │ ├── imdb.py │ └── pascal_voc.py ├── fast_rcnn │ ├── __init__.py │ ├── config.py │ ├── test.py │ └── train.py ├── roi_data_layer │ ├── __init__.py │ ├── layer.py │ ├── minibatch.py │ └── roidb.py ├── setup.py └── utils │ ├── .gitignore │ ├── __init__.py │ ├── bbox.pyx │ ├── blob.py │ ├── nms.py │ ├── nms.pyx │ └── timer.py ├── matlab ├── README.md ├── fast_rcnn_demo.m ├── fast_rcnn_im_detect.m ├── fast_rcnn_load_net.m ├── nms.m └── showboxes.m ├── models ├── CaffeNet │ ├── compressed │ │ └── test.prototxt │ ├── no_bbox_reg │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── train.prototxt │ ├── piecewise │ │ ├── solver.prototxt │ │ └── train.prototxt │ ├── 
solver.prototxt │ ├── test.prototxt │ └── train.prototxt ├── README.md ├── VGG16 │ ├── compressed │ │ └── test.prototxt │ ├── fc_only │ │ ├── solver.prototxt │ │ └── train.prototxt │ ├── no_bbox_reg │ │ ├── solver.prototxt │ │ ├── test.prototxt │ │ └── train.prototxt │ ├── piecewise │ │ ├── solver.prototxt │ │ └── train.prototxt │ ├── solver.prototxt │ ├── test.prototxt │ └── train.prototxt └── VGG_CNN_M_1024 │ ├── compressed │ └── test.prototxt │ ├── no_bbox_reg │ ├── solver.prototxt │ ├── test.prototxt │ └── train.prototxt │ ├── piecewise │ ├── solver.prototxt │ └── train.prototxt │ ├── solver.prototxt │ ├── test.prototxt │ └── train.prototxt ├── output ├── .gitignore └── README.md ├── todo.txt └── tools ├── README.md ├── _init_paths.py ├── compress_net.py ├── demo.py ├── reval.py ├── test_net.py ├── train_net.py └── train_svms.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .ipynb_checkpoints 3 | utils/*.c 4 | utils/*.so 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "caffe-fast-rcnn"] 2 | path = caffe-fast-rcnn 3 | url = https://github.com/rbgirshick/caffe-fast-rcnn.git 4 | branch = fast-rcnn 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Fast R-CNN 2 | 3 | Copyright (c) Microsoft Corporation 4 | 5 | All rights reserved. 
6 | 7 | MIT License 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a 10 | copy of this software and associated documentation files (the "Software"), 11 | to deal in the Software without restriction, including without limitation 12 | the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | and/or sell copies of the Software, and to permit persons to whom the 14 | Software is furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included 17 | in all copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR 23 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 24 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 25 | OTHER DEALINGS IN THE SOFTWARE. 
26 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | selective_search* 2 | imagenet_models* 3 | fast_rcnn_models* 4 | VOCdevkit* 5 | cache 6 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | This directory holds (*after you download them*): 2 | - Pre-computed object proposals 3 | - Caffe models pre-trained on ImageNet 4 | - Fast R-CNN models 5 | - Symlinks to datasets 6 | 7 | To download precomputed Selective Search proposals for PASCAL VOC 2007 and 2012, run: 8 | 9 | ``` 10 | ./data/scripts/fetch_selective_search_data.sh 11 | ``` 12 | 13 | This script will populate `data/selective_search_data`. 14 | 15 | To download Caffe models (CaffeNet, VGG_CNN_M_1024, VGG16) pre-trained on ImageNet, run: 16 | 17 | ``` 18 | ./data/scripts/fetch_imagenet_models.sh 19 | ``` 20 | 21 | This script will populate `data/imagenet_models`. 22 | 23 | To download Fast R-CNN models trained on VOC 2007, run: 24 | 25 | ``` 26 | ./data/scripts/fetch_fast_rcnn_models.sh 27 | ``` 28 | 29 | This script will populate `data/fast_rcnn_models`. 30 | 31 | In order to train and test with PASCAL VOC, you will need to establish symlinks. 32 | From the `data` directory (`cd data`): 33 | 34 | ``` 35 | # For VOC 2007 36 | ln -s /your/path/to/VOC2007/VOCdevkit VOCdevkit2007 37 | 38 | # For VOC 2012 39 | ln -s /your/path/to/VOC2012/VOCdevkit VOCdevkit2012 40 | ``` 41 | 42 | Since you'll likely be experimenting with multiple installs of Fast R-CNN in 43 | parallel, you'll probably want to keep all of this data in a shared place and 44 | use symlinks. 
On my system I create the following symlinks inside `data`: 45 | 46 | ``` 47 | # data/cache holds various outputs created by the datasets package 48 | ln -s /data/fast_rcnn_shared/cache 49 | 50 | # move the imagenet_models to shared location and symlink to them 51 | ln -s /data/fast_rcnn_shared/imagenet_models 52 | 53 | # move the selective search data to a shared location and symlink to them 54 | ln -s /data/fast_rcnn_shared/selective_search_data 55 | 56 | ln -s /data/VOC2007/VOCdevkit VOCdevkit2007 57 | ln -s /data/VOC2012/VOCdevkit VOCdevkit2012 58 | ``` 59 | -------------------------------------------------------------------------------- /data/demo/000004.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbgirshick/fast-rcnn/b612190f279da3c11dd8b1396dd5e72779f8e463/data/demo/000004.jpg -------------------------------------------------------------------------------- /data/demo/000004_boxes.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbgirshick/fast-rcnn/b612190f279da3c11dd8b1396dd5e72779f8e463/data/demo/000004_boxes.mat -------------------------------------------------------------------------------- /data/demo/001551.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbgirshick/fast-rcnn/b612190f279da3c11dd8b1396dd5e72779f8e463/data/demo/001551.jpg -------------------------------------------------------------------------------- /data/demo/001551_boxes.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbgirshick/fast-rcnn/b612190f279da3c11dd8b1396dd5e72779f8e463/data/demo/001551_boxes.mat -------------------------------------------------------------------------------- /data/pylintrc: -------------------------------------------------------------------------------- 1 | [TYPECHECK] 2 | 
3 | ignored-modules = numpy, numpy.random, cv2 4 | -------------------------------------------------------------------------------- /data/scripts/fetch_fast_rcnn_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" 4 | cd $DIR 5 | 6 | FILE=fast_rcnn_models.tgz 7 | URL=https://dl.dropboxusercontent.com/s/e3ugqq3lca4z8q6/fast_rcnn_models.tgz 8 | CHECKSUM=5f7dde9f5376e18c8e065338cc5df3f7 9 | 10 | if [ -f $FILE ]; then 11 | echo "File already exists. Checking md5..." 12 | os=`uname -s` 13 | if [ "$os" = "Linux" ]; then 14 | checksum=`md5sum $FILE | awk '{ print $1 }'` 15 | elif [ "$os" = "Darwin" ]; then 16 | checksum=`cat $FILE | md5` 17 | fi 18 | if [ "$checksum" = "$CHECKSUM" ]; then 19 | echo "Checksum is correct. No need to download." 20 | exit 0 21 | else 22 | echo "Checksum is incorrect. Need to download again." 23 | fi 24 | fi 25 | 26 | echo "Downloading Fast R-CNN demo models (0.96G)..." 27 | 28 | wget $URL -O $FILE 29 | 30 | echo "Unzipping..." 31 | 32 | tar zxvf $FILE 33 | 34 | echo "Done. Please run this command again to verify that checksum = $CHECKSUM." 35 | -------------------------------------------------------------------------------- /data/scripts/fetch_imagenet_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" 4 | cd $DIR 5 | 6 | FILE=imagenet_models.tgz 7 | URL=https://dl.dropboxusercontent.com/s/riazjuizq0w7dqm/imagenet_models.tgz 8 | CHECKSUM=8b1d4b9da0593fc70ef403284f810adc 9 | 10 | if [ -f $FILE ]; then 11 | echo "File already exists. Checking md5..." 
12 | os=`uname -s` 13 | if [ "$os" = "Linux" ]; then 14 | checksum=`md5sum $FILE | awk '{ print $1 }'` 15 | elif [ "$os" = "Darwin" ]; then 16 | checksum=`cat $FILE | md5` 17 | fi 18 | if [ "$checksum" = "$CHECKSUM" ]; then 19 | echo "Checksum is correct. No need to download." 20 | exit 0 21 | else 22 | echo "Checksum is incorrect. Need to download again." 23 | fi 24 | fi 25 | 26 | echo "Downloading pretrained ImageNet models (1G)..." 27 | 28 | wget $URL -O $FILE 29 | 30 | echo "Unzipping..." 31 | 32 | tar zxvf $FILE 33 | 34 | echo "Done. Please run this command again to verify that checksum = $CHECKSUM." 35 | -------------------------------------------------------------------------------- /data/scripts/fetch_selective_search_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../" && pwd )" 4 | cd $DIR 5 | 6 | FILE=selective_search_data.tgz 7 | URL=https://dl.dropboxusercontent.com/s/orrt7o6bp6ae0tc/selective_search_data.tgz 8 | CHECKSUM=7078c1db87a7851b31966b96774cd9b9 9 | 10 | if [ -f $FILE ]; then 11 | echo "File already exists. Checking md5..." 12 | os=`uname -s` 13 | if [ "$os" = "Linux" ]; then 14 | checksum=`md5sum $FILE | awk '{ print $1 }'` 15 | elif [ "$os" = "Darwin" ]; then 16 | checksum=`cat $FILE | md5` 17 | fi 18 | if [ "$checksum" = "$CHECKSUM" ]; then 19 | echo "Checksum is correct. No need to download." 20 | exit 0 21 | else 22 | echo "Checksum is incorrect. Need to download again." 23 | fi 24 | fi 25 | 26 | echo "Downloading precomputed selective search boxes (0.5G)..." 27 | 28 | wget $URL -O $FILE 29 | 30 | echo "Unzipping..." 31 | 32 | tar zxvf $FILE 33 | 34 | echo "Done. Please run this command again to verify that checksum = $CHECKSUM." 
35 | -------------------------------------------------------------------------------- /experiments/README.md: -------------------------------------------------------------------------------- 1 | Scripts to reproduce (most) of the experiments in the paper. 2 | 3 | Scripts are under `experiments/scripts`. 4 | 5 | Each script saves a log file under `experiments/logs`. 6 | 7 | Configuration override files used in the experiments are stored in `experiments/cfgs`. 8 | -------------------------------------------------------------------------------- /experiments/cfgs/fc_only.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: fc_only 2 | TRAIN: 3 | SNAPSHOT_INFIX: fc_only 4 | -------------------------------------------------------------------------------- /experiments/cfgs/multiscale.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: multiscale 2 | TRAIN: 3 | SCALES: !!python/tuple [480, 576, 688, 864, 1200] 4 | MAX_SIZE: 2000 5 | SNAPSHOT_INFIX: multiscale 6 | TEST: 7 | SCALES: !!python/tuple [480, 576, 688, 864, 1200] 8 | MAX_SIZE: 2000 9 | -------------------------------------------------------------------------------- /experiments/cfgs/no_bbox_reg.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: "no_bbox_reg" 2 | TRAIN: 3 | BBOX_REG: False 4 | SNAPSHOT_INFIX: no_bbox_reg 5 | TEST: 6 | BBOX_REG: False 7 | -------------------------------------------------------------------------------- /experiments/cfgs/piecewise.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: piecewise 2 | TRAIN: 3 | SNAPSHOT_INFIX: piecewise 4 | -------------------------------------------------------------------------------- /experiments/cfgs/svm.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: svm 2 | TRAIN: 3 | # don't use flipped examples when training SVMs 
for two reasons: 4 | # 1) R-CNN didn't 5 | # 2) I've tried and it doesn't help, yet makes SVM training take 2x longer 6 | USE_FLIPPED: False 7 | TEST: 8 | SVM: True 9 | -------------------------------------------------------------------------------- /experiments/logs/.gitignore: -------------------------------------------------------------------------------- 1 | *.txt* 2 | -------------------------------------------------------------------------------- /experiments/scripts/all_caffenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | GPU=$1 5 | NET=caffenet 6 | ./experiments/scripts/default_${NET}.sh $GPU 7 | ./experiments/scripts/multiscale_${NET}.sh $GPU 8 | ./experiments/scripts/multitask_no_bbox_reg_${NET}.sh $GPU 9 | ./experiments/scripts/no_bbox_reg_${NET}.sh $GPU 10 | ./experiments/scripts/piecewise_${NET}.sh $GPU 11 | ./experiments/scripts/svd_${NET}.sh $GPU 12 | ./experiments/scripts/svm_${NET}.sh $GPU 13 | -------------------------------------------------------------------------------- /experiments/scripts/all_vgg16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | GPU=$1 5 | NET=vgg16 6 | ./experiments/scripts/default_${NET}.sh $GPU 7 | ./experiments/scripts/fc_only_${NET}.sh $GPU 8 | ./experiments/scripts/multitask_no_bbox_reg_${NET}.sh $GPU 9 | ./experiments/scripts/no_bbox_reg_${NET}.sh $GPU 10 | ./experiments/scripts/piecewise_${NET}.sh $GPU 11 | ./experiments/scripts/svd_${NET}.sh $GPU 12 | ./experiments/scripts/svm_${NET}.sh $GPU 13 | -------------------------------------------------------------------------------- /experiments/scripts/all_vgg_cnn_m_1024.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | GPU=$1 5 | NET=vgg_cnn_m_1024 6 | ./experiments/scripts/default_${NET}.sh $GPU 7 | ./experiments/scripts/multiscale_${NET}.sh $GPU 8 | 
./experiments/scripts/multitask_no_bbox_reg_${NET}.sh $GPU 9 | ./experiments/scripts/no_bbox_reg_${NET}.sh $GPU 10 | ./experiments/scripts/piecewise_${NET}.sh $GPU 11 | ./experiments/scripts/svd_${NET}.sh $GPU 12 | ./experiments/scripts/svm_${NET}.sh $GPU 13 | -------------------------------------------------------------------------------- /experiments/scripts/default_caffenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/default_caffenet.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_net.py --gpu $1 \ 13 | --solver models/CaffeNet/solver.prototxt \ 14 | --weights data/imagenet_models/CaffeNet.v2.caffemodel \ 15 | --imdb voc_2007_trainval 16 | 17 | time ./tools/test_net.py --gpu $1 \ 18 | --def models/CaffeNet/test.prototxt \ 19 | --net output/default/voc_2007_trainval/caffenet_fast_rcnn_iter_40000.caffemodel \ 20 | --imdb voc_2007_test 21 | -------------------------------------------------------------------------------- /experiments/scripts/default_vgg16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/default_vgg16.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_net.py --gpu $1 \ 13 | --solver models/VGG16/solver.prototxt \ 14 | --weights data/imagenet_models/VGG16.v2.caffemodel \ 15 | --imdb voc_2007_trainval 16 | 17 | time ./tools/test_net.py --gpu $1 \ 18 | --def models/VGG16/test.prototxt \ 19 | --net output/default/voc_2007_trainval/vgg16_fast_rcnn_iter_40000.caffemodel \ 20 | --imdb voc_2007_test 21 | -------------------------------------------------------------------------------- 
/experiments/scripts/default_vgg_cnn_m_1024.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/default_vgg_cnn_m_1024.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_net.py --gpu $1 \ 13 | --solver models/VGG_CNN_M_1024/solver.prototxt \ 14 | --weights data/imagenet_models/VGG_CNN_M_1024.v2.caffemodel \ 15 | --imdb voc_2007_trainval 16 | 17 | time ./tools/test_net.py --gpu $1 \ 18 | --def models/VGG_CNN_M_1024/test.prototxt \ 19 | --net output/default/voc_2007_trainval/vgg_cnn_m_1024_fast_rcnn_iter_40000.caffemodel \ 20 | --imdb voc_2007_test 21 | -------------------------------------------------------------------------------- /experiments/scripts/fc_only_vgg16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/fc_only_vgg16.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_net.py --gpu $1 \ 13 | --solver models/VGG16/fc_only/solver.prototxt \ 14 | --weights data/imagenet_models/VGG16.v2.caffemodel \ 15 | --imdb voc_2007_trainval \ 16 | --cfg experiments/cfgs/fc_only.yml 17 | 18 | time ./tools/test_net.py --gpu $1 \ 19 | --def models/VGG16/test.prototxt \ 20 | --net output/fc_only/voc_2007_trainval/vgg16_fast_rcnn_fc_only_iter_40000.caffemodel \ 21 | --imdb voc_2007_test \ 22 | --cfg experiments/cfgs/fc_only.yml 23 | -------------------------------------------------------------------------------- /experiments/scripts/multiscale_caffenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | 
LOG="experiments/logs/multiscale_caffenet.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_net.py --gpu $1 \ 13 | --solver models/CaffeNet/solver.prototxt \ 14 | --weights data/imagenet_models/CaffeNet.v2.caffemodel \ 15 | --imdb voc_2007_trainval \ 16 | --cfg experiments/cfgs/multiscale.yml 17 | 18 | time ./tools/test_net.py --gpu $1 \ 19 | --def models/CaffeNet/test.prototxt \ 20 | --net output/multiscale/voc_2007_trainval/caffenet_fast_rcnn_multiscale_iter_40000.caffemodel \ 21 | --imdb voc_2007_test \ 22 | --cfg experiments/cfgs/multiscale.yml 23 | -------------------------------------------------------------------------------- /experiments/scripts/multiscale_vgg_cnn_m_1024.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/multiscale_vgg_cnn_m_1024.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_net.py --gpu $1 \ 13 | --solver models/VGG_CNN_M_1024/solver.prototxt \ 14 | --weights data/imagenet_models/VGG_CNN_M_1024.v2.caffemodel \ 15 | --imdb voc_2007_trainval \ 16 | --cfg experiments/cfgs/multiscale.yml 17 | 18 | time ./tools/test_net.py --gpu $1 \ 19 | --def models/VGG_CNN_M_1024/test.prototxt \ 20 | --net output/multiscale/voc_2007_trainval/vgg_cnn_m_1024_fast_rcnn_multiscale_iter_40000.caffemodel \ 21 | --imdb voc_2007_test \ 22 | --cfg experiments/cfgs/multiscale.yml 23 | -------------------------------------------------------------------------------- /experiments/scripts/multitask_no_bbox_reg_caffenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/multitask_no_bbox_reg_caffenet.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 
| exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/test_net.py --gpu $1 \ 13 | --def models/CaffeNet/test.prototxt \ 14 | --net output/default/voc_2007_trainval/caffenet_fast_rcnn_iter_40000.caffemodel \ 15 | --imdb voc_2007_test \ 16 | --cfg experiments/cfgs/no_bbox_reg.yml 17 | -------------------------------------------------------------------------------- /experiments/scripts/multitask_no_bbox_reg_vgg16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/multitask_no_bbox_reg_vgg16.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/test_net.py --gpu $1 \ 13 | --def models/VGG16/test.prototxt \ 14 | --net output/default/voc_2007_trainval/vgg16_fast_rcnn_iter_40000.caffemodel \ 15 | --imdb voc_2007_test \ 16 | --cfg experiments/cfgs/no_bbox_reg.yml 17 | -------------------------------------------------------------------------------- /experiments/scripts/multitask_no_bbox_reg_vgg_cnn_m_1024.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/multitask_no_bbox_reg_vgg_cnn_m_1024.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/test_net.py --gpu $1 \ 13 | --def models/VGG_CNN_M_1024/test.prototxt \ 14 | --net output/default/voc_2007_trainval/vgg_cnn_m_1024_fast_rcnn_iter_40000.caffemodel \ 15 | --imdb voc_2007_test \ 16 | --cfg experiments/cfgs/no_bbox_reg.yml 17 | -------------------------------------------------------------------------------- /experiments/scripts/no_bbox_reg_caffenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 
| 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/no_bbox_reg_caffenet.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_net.py --gpu $1 \ 13 | --solver models/CaffeNet/no_bbox_reg/solver.prototxt \ 14 | --weights data/imagenet_models/CaffeNet.v2.caffemodel \ 15 | --imdb voc_2007_trainval \ 16 | --cfg experiments/cfgs/no_bbox_reg.yml 17 | 18 | time ./tools/test_net.py --gpu $1 \ 19 | --def models/CaffeNet/no_bbox_reg/test.prototxt \ 20 | --net output/no_bbox_reg/voc_2007_trainval/caffenet_fast_rcnn_no_bbox_reg_iter_40000.caffemodel \ 21 | --imdb voc_2007_test \ 22 | --cfg experiments/cfgs/no_bbox_reg.yml 23 | -------------------------------------------------------------------------------- /experiments/scripts/no_bbox_reg_vgg16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/no_bbox_reg_vgg16.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_net.py --gpu $1 \ 13 | --solver models/VGG16/no_bbox_reg/solver.prototxt \ 14 | --weights data/imagenet_models/VGG16.v2.caffemodel \ 15 | --imdb voc_2007_trainval \ 16 | --cfg experiments/cfgs/no_bbox_reg.yml 17 | 18 | time ./tools/test_net.py --gpu $1 \ 19 | --def models/VGG16/no_bbox_reg/test.prototxt \ 20 | --net output/no_bbox_reg/voc_2007_trainval/vgg16_fast_rcnn_no_bbox_reg_iter_40000.caffemodel \ 21 | --imdb voc_2007_test \ 22 | --cfg experiments/cfgs/no_bbox_reg.yml 23 | -------------------------------------------------------------------------------- /experiments/scripts/no_bbox_reg_vgg_cnn_m_1024.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | 
LOG="experiments/logs/no_bbox_reg_vgg_cnn_m_1024.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_net.py --gpu $1 \ 13 | --solver models/VGG_CNN_M_1024/no_bbox_reg/solver.prototxt \ 14 | --weights data/imagenet_models/VGG_CNN_M_1024.v2.caffemodel \ 15 | --imdb voc_2007_trainval \ 16 | --cfg experiments/cfgs/no_bbox_reg.yml 17 | 18 | time ./tools/test_net.py --gpu $1 \ 19 | --def models/VGG_CNN_M_1024/no_bbox_reg/test.prototxt \ 20 | --net output/no_bbox_reg/voc_2007_trainval/vgg_cnn_m_1024_fast_rcnn_no_bbox_reg_iter_40000.caffemodel \ 21 | --imdb voc_2007_test \ 22 | --cfg experiments/cfgs/no_bbox_reg.yml 23 | -------------------------------------------------------------------------------- /experiments/scripts/piecewise_caffenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/piecewise_caffenet.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_net.py --gpu $1 \ 13 | --solver models/CaffeNet/piecewise/solver.prototxt \ 14 | --weights output/no_bbox_reg/voc_2007_trainval/caffenet_fast_rcnn_no_bbox_reg_iter_40000.caffemodel \ 15 | --imdb voc_2007_trainval \ 16 | --cfg experiments/cfgs/piecewise.yml 17 | 18 | time ./tools/test_net.py --gpu $1 \ 19 | --def models/CaffeNet/test.prototxt \ 20 | --net output/piecewise/voc_2007_trainval/caffenet_fast_rcnn_piecewise_iter_40000.caffemodel \ 21 | --imdb voc_2007_test \ 22 | --cfg experiments/cfgs/piecewise.yml 23 | -------------------------------------------------------------------------------- /experiments/scripts/piecewise_vgg16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | 
LOG="experiments/logs/piecewise_vgg16.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_net.py --gpu $1 \ 13 | --solver models/VGG16/piecewise/solver.prototxt \ 14 | --weights output/no_bbox_reg/voc_2007_trainval/vgg16_fast_rcnn_no_bbox_reg_iter_40000.caffemodel \ 15 | --imdb voc_2007_trainval \ 16 | --cfg experiments/cfgs/piecewise.yml 17 | 18 | time ./tools/test_net.py --gpu $1 \ 19 | --def models/VGG16/test.prototxt \ 20 | --net output/piecewise/voc_2007_trainval/vgg16_fast_rcnn_piecewise_iter_40000.caffemodel \ 21 | --imdb voc_2007_test \ 22 | --cfg experiments/cfgs/piecewise.yml 23 | -------------------------------------------------------------------------------- /experiments/scripts/piecewise_vgg_cnn_m_1024.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/piecewise_vgg_cnn_m_1024.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_net.py --gpu $1 \ 13 | --solver models/VGG_CNN_M_1024/piecewise/solver.prototxt \ 14 | --weights output/no_bbox_reg/voc_2007_trainval/vgg_cnn_m_1024_fast_rcnn_no_bbox_reg_iter_40000.caffemodel \ 15 | --imdb voc_2007_trainval \ 16 | --cfg experiments/cfgs/piecewise.yml 17 | 18 | time ./tools/test_net.py --gpu $1 \ 19 | --def models/VGG_CNN_M_1024/test.prototxt \ 20 | --net output/piecewise/voc_2007_trainval/vgg_cnn_m_1024_fast_rcnn_piecewise_iter_40000.caffemodel \ 21 | --imdb voc_2007_test \ 22 | --cfg experiments/cfgs/piecewise.yml 23 | -------------------------------------------------------------------------------- /experiments/scripts/svd_caffenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | 
LOG="experiments/logs/svd_caffenet.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/compress_net.py \ 13 | --def models/CaffeNet/test.prototxt \ 14 | --def-svd models/CaffeNet/compressed/test.prototxt \ 15 | --net output/default/voc_2007_trainval/caffenet_fast_rcnn_iter_40000.caffemodel 16 | 17 | time ./tools/test_net.py --gpu $1 \ 18 | --def models/CaffeNet/compressed/test.prototxt \ 19 | --net output/default/voc_2007_trainval/caffenet_fast_rcnn_iter_40000_svd_fc6_1024_fc7_256.caffemodel \ 20 | --imdb voc_2007_test 21 | -------------------------------------------------------------------------------- /experiments/scripts/svd_vgg16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/svd_vgg16.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/compress_net.py \ 13 | --def models/VGG16/test.prototxt \ 14 | --def-svd models/VGG16/compressed/test.prototxt \ 15 | --net output/default/voc_2007_trainval/vgg16_fast_rcnn_iter_40000.caffemodel 16 | 17 | time ./tools/test_net.py --gpu $1 \ 18 | --def models/VGG16/compressed/test.prototxt \ 19 | --net output/default/voc_2007_trainval/vgg16_fast_rcnn_iter_40000_svd_fc6_1024_fc7_256.caffemodel \ 20 | --imdb voc_2007_test 21 | -------------------------------------------------------------------------------- /experiments/scripts/svd_vgg_cnn_m_1024.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/svd_vgg_cnn_m_1024.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/compress_net.py \ 13 | --def models/VGG_CNN_M_1024/test.prototxt \ 14 | --def-svd models/VGG_CNN_M_1024/compressed/test.prototxt \ 15 | --net output/default/voc_2007_trainval/vgg_cnn_m_1024_fast_rcnn_iter_40000.caffemodel 16 | 17 | time ./tools/test_net.py --gpu $1 \ 18 | --def models/VGG_CNN_M_1024/compressed/test.prototxt \ 19 | --net output/default/voc_2007_trainval/vgg_cnn_m_1024_fast_rcnn_iter_40000_svd_fc6_1024_fc7_256.caffemodel \ 20 | --imdb voc_2007_test 21 | -------------------------------------------------------------------------------- /experiments/scripts/svm_caffenet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/svm_caffenet.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_svms.py --gpu $1 \ 13 | --def models/CaffeNet/test.prototxt \ 14 | --net output/default/voc_2007_trainval/caffenet_fast_rcnn_iter_40000.caffemodel \ 15 | --imdb voc_2007_trainval \ 16 | --cfg experiments/cfgs/svm.yml 17 | 18 | time ./tools/test_net.py --gpu $1 \ 19 | --def models/CaffeNet/test.prototxt \ 20 | --net output/default/voc_2007_trainval/caffenet_fast_rcnn_iter_40000_svm.caffemodel \ 21 | --imdb voc_2007_test \ 22 | --cfg experiments/cfgs/svm.yml 23 | -------------------------------------------------------------------------------- /experiments/scripts/svm_vgg16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/svm_vgg16.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> 
>(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_svms.py --gpu $1 \ 13 | --def models/VGG16/test.prototxt \ 14 | --net output/default/voc_2007_trainval/vgg16_fast_rcnn_iter_40000.caffemodel \ 15 | --imdb voc_2007_trainval \ 16 | --cfg experiments/cfgs/svm.yml 17 | 18 | time ./tools/test_net.py --gpu $1 \ 19 | --def models/VGG16/test.prototxt \ 20 | --net output/default/voc_2007_trainval/vgg16_fast_rcnn_iter_40000_svm.caffemodel \ 21 | --imdb voc_2007_test \ 22 | --cfg experiments/cfgs/svm.yml 23 | -------------------------------------------------------------------------------- /experiments/scripts/svm_vgg_cnn_m_1024.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | LOG="experiments/logs/svm_vgg_cnn_m_1024.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 9 | exec &> >(tee -a "$LOG") 10 | echo Logging output to "$LOG" 11 | 12 | time ./tools/train_svms.py --gpu $1 \ 13 | --def models/VGG_CNN_M_1024/test.prototxt \ 14 | --net output/default/voc_2007_trainval/vgg_cnn_m_1024_fast_rcnn_iter_40000.caffemodel \ 15 | --imdb voc_2007_trainval \ 16 | --cfg experiments/cfgs/svm.yml 17 | 18 | time ./tools/test_net.py --gpu $1 \ 19 | --def models/VGG_CNN_M_1024/test.prototxt \ 20 | --net output/default/voc_2007_trainval/vgg_cnn_m_1024_fast_rcnn_iter_40000_svm.caffemodel \ 21 | --imdb voc_2007_test \ 22 | --cfg experiments/cfgs/svm.yml 23 | -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python setup.py build_ext --inplace 3 | rm -rf build 4 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/get_voc_opts.m: -------------------------------------------------------------------------------- 1 | function VOCopts = get_voc_opts(path) 2 | 3 
| tmp = pwd; 4 | cd(path); 5 | try 6 | addpath('VOCcode'); 7 | VOCinit; 8 | catch 9 | rmpath('VOCcode'); 10 | cd(tmp); 11 | error(sprintf('VOCcode directory not found under %s', path)); 12 | end 13 | rmpath('VOCcode'); 14 | cd(tmp); 15 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/voc_eval.m: -------------------------------------------------------------------------------- 1 | function res = voc_eval(path, comp_id, test_set, output_dir, rm_res) 2 | 3 | VOCopts = get_voc_opts(path); 4 | VOCopts.testset = test_set; 5 | VOCopts.detrespath=[VOCopts.resdir 'Main/%s_det_' VOCopts.testset '_%s.txt']; 6 | 7 | for i = 1:length(VOCopts.classes) 8 | cls = VOCopts.classes{i}; 9 | res(i) = voc_eval_cls(cls, VOCopts, comp_id, output_dir, rm_res); 10 | end 11 | 12 | fprintf('\n~~~~~~~~~~~~~~~~~~~~\n'); 13 | fprintf('Results:\n'); 14 | aps = [res(:).ap]'; 15 | fprintf('%.1f\n', aps * 100); 16 | fprintf('%.1f\n', mean(aps) * 100); 17 | fprintf('~~~~~~~~~~~~~~~~~~~~\n'); 18 | 19 | function res = voc_eval_cls(cls, VOCopts, comp_id, output_dir, rm_res) 20 | 21 | test_set = VOCopts.testset; 22 | year = VOCopts.dataset(4:end); 23 | 24 | addpath(fullfile(VOCopts.datadir, 'VOCcode')); 25 | 26 | res_fn = sprintf(VOCopts.detrespath, comp_id, cls); 27 | 28 | recall = []; 29 | prec = []; 30 | ap = 0; 31 | ap_auc = 0; 32 | 33 | do_eval = (str2num(year) <= 2007) | ~strcmp(test_set, 'test'); 34 | if do_eval 35 | % Bug in VOCevaldet requires that tic has been called first 36 | tic; 37 | [recall, prec, ap] = VOCevaldet(VOCopts, comp_id, cls, true); 38 | ap_auc = xVOCap(recall, prec); 39 | 40 | % force plot limits 41 | ylim([0 1]); 42 | xlim([0 1]); 43 | 44 | print(gcf, '-djpeg', '-r0', ... 45 | [output_dir '/' cls '_pr.jpg']); 46 | end 47 | fprintf('!!! 
%s : %.4f %.4f\n', cls, ap, ap_auc); 48 | 49 | res.recall = recall; 50 | res.prec = prec; 51 | res.ap = ap; 52 | res.ap_auc = ap_auc; 53 | 54 | save([output_dir '/' cls '_pr.mat'], ... 55 | 'res', 'recall', 'prec', 'ap', 'ap_auc'); 56 | 57 | if rm_res 58 | delete(res_fn); 59 | end 60 | 61 | rmpath(fullfile(VOCopts.datadir, 'VOCcode')); 62 | -------------------------------------------------------------------------------- /lib/datasets/VOCdevkit-matlab-wrapper/xVOCap.m: -------------------------------------------------------------------------------- 1 | function ap = xVOCap(rec,prec) 2 | % From the PASCAL VOC 2011 devkit 3 | 4 | mrec=[0 ; rec ; 1]; 5 | mpre=[0 ; prec ; 0]; 6 | for i=numel(mpre)-1:-1:1 7 | mpre(i)=max(mpre(i),mpre(i+1)); 8 | end 9 | i=find(mrec(2:end)~=mrec(1:end-1))+1; 10 | ap=sum((mrec(i)-mrec(i-1)).*mpre(i)); 11 | -------------------------------------------------------------------------------- /lib/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from .imdb import imdb 9 | from .pascal_voc import pascal_voc 10 | from . import factory 11 | 12 | import os.path as osp 13 | ROOT_DIR = osp.join(osp.dirname(__file__), '..', '..') 14 | 15 | # We assume your matlab binary is in your path and called `matlab'. 16 | # If either is not true, just add it to your path and alias it as matlab, or 17 | # you could change this file. 
18 | MATLAB = 'matlab' 19 | 20 | # http://stackoverflow.com/questions/377017/test-if-executable-exists-in-python 21 | def _which(program): 22 | import os 23 | def is_exe(fpath): 24 | return os.path.isfile(fpath) and os.access(fpath, os.X_OK) 25 | 26 | fpath, fname = os.path.split(program) 27 | if fpath: 28 | if is_exe(program): 29 | return program 30 | else: 31 | for path in os.environ["PATH"].split(os.pathsep): 32 | path = path.strip('"') 33 | exe_file = os.path.join(path, program) 34 | if is_exe(exe_file): 35 | return exe_file 36 | 37 | return None 38 | 39 | if _which(MATLAB) is None: 40 | msg = ("MATLAB command '{}' not found. " 41 | "Please add '{}' to your PATH.").format(MATLAB, MATLAB) 42 | raise EnvironmentError(msg) 43 | -------------------------------------------------------------------------------- /lib/datasets/factory.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Factory method for easily getting imdbs by name.""" 9 | 10 | __sets = {} 11 | 12 | import datasets.pascal_voc 13 | import numpy as np 14 | 15 | def _selective_search_IJCV_top_k(split, year, top_k): 16 | """Return an imdb that uses the top k proposals from the selective search 17 | IJCV code. 
18 | """ 19 | imdb = datasets.pascal_voc(split, year) 20 | imdb.roidb_handler = imdb.selective_search_IJCV_roidb 21 | imdb.config['top_k'] = top_k 22 | return imdb 23 | 24 | # Set up voc__ using selective search "fast" mode 25 | for year in ['2007', '2012']: 26 | for split in ['train', 'val', 'trainval', 'test']: 27 | name = 'voc_{}_{}'.format(year, split) 28 | __sets[name] = (lambda split=split, year=year: 29 | datasets.pascal_voc(split, year)) 30 | 31 | # Set up voc___top_ using selective search "quality" mode 32 | # but only returning the first k boxes 33 | for top_k in np.arange(1000, 11000, 1000): 34 | for year in ['2007', '2012']: 35 | for split in ['train', 'val', 'trainval', 'test']: 36 | name = 'voc_{}_{}_top_{:d}'.format(year, split, top_k) 37 | __sets[name] = (lambda split=split, year=year, top_k=top_k: 38 | _selective_search_IJCV_top_k(split, year, top_k)) 39 | 40 | def get_imdb(name): 41 | """Get an imdb (image database) by name.""" 42 | if not __sets.has_key(name): 43 | raise KeyError('Unknown dataset: {}'.format(name)) 44 | return __sets[name]() 45 | 46 | def list_imdbs(): 47 | """List all registered imdbs.""" 48 | return __sets.keys() 49 | -------------------------------------------------------------------------------- /lib/datasets/imdb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | import os.path as osp 10 | import PIL 11 | from utils.cython_bbox import bbox_overlaps 12 | import numpy as np 13 | import scipy.sparse 14 | import datasets 15 | 16 | class imdb(object): 17 | """Image database.""" 18 | 19 | def __init__(self, name): 20 | self._name = name 21 | self._num_classes = 0 22 | self._classes = [] 23 | self._image_index 
= [] 24 | self._obj_proposer = 'selective_search' 25 | self._roidb = None 26 | self._roidb_handler = self.default_roidb 27 | # Use this dict for storing dataset specific config options 28 | self.config = {} 29 | 30 | @property 31 | def name(self): 32 | return self._name 33 | 34 | @property 35 | def num_classes(self): 36 | return len(self._classes) 37 | 38 | @property 39 | def classes(self): 40 | return self._classes 41 | 42 | @property 43 | def image_index(self): 44 | return self._image_index 45 | 46 | @property 47 | def roidb_handler(self): 48 | return self._roidb_handler 49 | 50 | @roidb_handler.setter 51 | def roidb_handler(self, val): 52 | self._roidb_handler = val 53 | 54 | @property 55 | def roidb(self): 56 | # A roidb is a list of dictionaries, each with the following keys: 57 | # boxes 58 | # gt_overlaps 59 | # gt_classes 60 | # flipped 61 | if self._roidb is not None: 62 | return self._roidb 63 | self._roidb = self.roidb_handler() 64 | return self._roidb 65 | 66 | @property 67 | def cache_path(self): 68 | cache_path = osp.abspath(osp.join(datasets.ROOT_DIR, 'data', 'cache')) 69 | if not os.path.exists(cache_path): 70 | os.makedirs(cache_path) 71 | return cache_path 72 | 73 | @property 74 | def num_images(self): 75 | return len(self.image_index) 76 | 77 | def image_path_at(self, i): 78 | raise NotImplementedError 79 | 80 | def default_roidb(self): 81 | raise NotImplementedError 82 | 83 | def evaluate_detections(self, all_boxes, output_dir=None): 84 | """ 85 | all_boxes is a list of length number-of-classes. 86 | Each list element is a list of length number-of-images. 87 | Each of those list elements is either an empty list [] 88 | or a numpy array of detection. 
89 | 90 | all_boxes[class][image] = [] or np.array of shape #dets x 5 91 | """ 92 | raise NotImplementedError 93 | 94 | def append_flipped_images(self): 95 | num_images = self.num_images 96 | widths = [PIL.Image.open(self.image_path_at(i)).size[0] 97 | for i in xrange(num_images)] 98 | for i in xrange(num_images): 99 | boxes = self.roidb[i]['boxes'].copy() 100 | oldx1 = boxes[:, 0].copy() 101 | oldx2 = boxes[:, 2].copy() 102 | boxes[:, 0] = widths[i] - oldx2 - 1 103 | boxes[:, 2] = widths[i] - oldx1 - 1 104 | assert (boxes[:, 2] >= boxes[:, 0]).all() 105 | entry = {'boxes' : boxes, 106 | 'gt_overlaps' : self.roidb[i]['gt_overlaps'], 107 | 'gt_classes' : self.roidb[i]['gt_classes'], 108 | 'flipped' : True} 109 | self.roidb.append(entry) 110 | self._image_index = self._image_index * 2 111 | 112 | def evaluate_recall(self, candidate_boxes, ar_thresh=0.5): 113 | # Record max overlap value for each gt box 114 | # Return vector of overlap values 115 | gt_overlaps = np.zeros(0) 116 | for i in xrange(self.num_images): 117 | gt_inds = np.where(self.roidb[i]['gt_classes'] > 0)[0] 118 | gt_boxes = self.roidb[i]['boxes'][gt_inds, :] 119 | 120 | boxes = candidate_boxes[i] 121 | if boxes.shape[0] == 0: 122 | continue 123 | overlaps = bbox_overlaps(boxes.astype(np.float), 124 | gt_boxes.astype(np.float)) 125 | 126 | # gt_overlaps = np.hstack((gt_overlaps, overlaps.max(axis=0))) 127 | _gt_overlaps = np.zeros((gt_boxes.shape[0])) 128 | for j in xrange(gt_boxes.shape[0]): 129 | argmax_overlaps = overlaps.argmax(axis=0) 130 | max_overlaps = overlaps.max(axis=0) 131 | gt_ind = max_overlaps.argmax() 132 | gt_ovr = max_overlaps.max() 133 | assert(gt_ovr >= 0) 134 | box_ind = argmax_overlaps[gt_ind] 135 | _gt_overlaps[j] = overlaps[box_ind, gt_ind] 136 | assert(_gt_overlaps[j] == gt_ovr) 137 | overlaps[box_ind, :] = -1 138 | overlaps[:, gt_ind] = -1 139 | 140 | gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) 141 | 142 | num_pos = gt_overlaps.size 143 | gt_overlaps = 
np.sort(gt_overlaps) 144 | step = 0.001 145 | thresholds = np.minimum(np.arange(0.5, 1.0 + step, step), 1.0) 146 | recalls = np.zeros_like(thresholds) 147 | for i, t in enumerate(thresholds): 148 | recalls[i] = (gt_overlaps >= t).sum() / float(num_pos) 149 | ar = 2 * np.trapz(recalls, thresholds) 150 | 151 | return ar, gt_overlaps, recalls, thresholds 152 | 153 | def create_roidb_from_box_list(self, box_list, gt_roidb): 154 | assert len(box_list) == self.num_images, \ 155 | 'Number of boxes must match number of ground-truth images' 156 | roidb = [] 157 | for i in xrange(self.num_images): 158 | boxes = box_list[i] 159 | num_boxes = boxes.shape[0] 160 | overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32) 161 | 162 | if gt_roidb is not None: 163 | gt_boxes = gt_roidb[i]['boxes'] 164 | gt_classes = gt_roidb[i]['gt_classes'] 165 | gt_overlaps = bbox_overlaps(boxes.astype(np.float), 166 | gt_boxes.astype(np.float)) 167 | argmaxes = gt_overlaps.argmax(axis=1) 168 | maxes = gt_overlaps.max(axis=1) 169 | I = np.where(maxes > 0)[0] 170 | overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] 171 | 172 | overlaps = scipy.sparse.csr_matrix(overlaps) 173 | roidb.append({'boxes' : boxes, 174 | 'gt_classes' : np.zeros((num_boxes,), 175 | dtype=np.int32), 176 | 'gt_overlaps' : overlaps, 177 | 'flipped' : False}) 178 | return roidb 179 | 180 | @staticmethod 181 | def merge_roidbs(a, b): 182 | assert len(a) == len(b) 183 | for i in xrange(len(a)): 184 | a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes'])) 185 | a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'], 186 | b[i]['gt_classes'])) 187 | a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'], 188 | b[i]['gt_overlaps']]) 189 | return a 190 | 191 | def competition_mode(self, on): 192 | """Turn competition mode on or off.""" 193 | pass 194 | -------------------------------------------------------------------------------- /lib/fast_rcnn/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from . import config 9 | from . import train 10 | from . import test 11 | -------------------------------------------------------------------------------- /lib/fast_rcnn/config.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Fast R-CNN config system. 9 | 10 | This file specifies default config options for Fast R-CNN. You should not 11 | change values in this file. Instead, you should write a config file (in yaml) 12 | and use cfg_from_file(yaml_file) to load it and override the default options. 13 | 14 | Most tools in $ROOT/tools take a --cfg option to specify an override file. 
15 | - See tools/{train,test}_net.py for example code that uses cfg_from_file() 16 | - See experiments/cfgs/*.yml for example YAML config override files 17 | """ 18 | 19 | import os 20 | import os.path as osp 21 | import numpy as np 22 | # `pip install easydict` if you don't have it 23 | from easydict import EasyDict as edict 24 | 25 | __C = edict() 26 | # Consumers can get config by: 27 | # from fast_rcnn_config import cfg 28 | cfg = __C 29 | 30 | # 31 | # Training options 32 | # 33 | 34 | __C.TRAIN = edict() 35 | 36 | # Scales to use during training (can list multiple scales) 37 | # Each scale is the pixel size of an image's shortest side 38 | __C.TRAIN.SCALES = (600,) 39 | 40 | # Max pixel size of the longest side of a scaled input image 41 | __C.TRAIN.MAX_SIZE = 1000 42 | 43 | # Images to use per minibatch 44 | __C.TRAIN.IMS_PER_BATCH = 2 45 | 46 | # Minibatch size (number of regions of interest [ROIs]) 47 | __C.TRAIN.BATCH_SIZE = 128 48 | 49 | # Fraction of minibatch that is labeled foreground (i.e. class > 0) 50 | __C.TRAIN.FG_FRACTION = 0.25 51 | 52 | # Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH) 53 | __C.TRAIN.FG_THRESH = 0.5 54 | 55 | # Overlap threshold for a ROI to be considered background (class = 0 if 56 | # overlap in [LO, HI)) 57 | __C.TRAIN.BG_THRESH_HI = 0.5 58 | __C.TRAIN.BG_THRESH_LO = 0.1 59 | 60 | # Use horizontally-flipped images during training? 
61 | __C.TRAIN.USE_FLIPPED = True 62 | 63 | # Train bounding-box regressors 64 | __C.TRAIN.BBOX_REG = True 65 | 66 | # Overlap required between a ROI and ground-truth box in order for that ROI to 67 | # be used as a bounding-box regression training example 68 | __C.TRAIN.BBOX_THRESH = 0.5 69 | 70 | # Iterations between snapshots 71 | __C.TRAIN.SNAPSHOT_ITERS = 10000 72 | 73 | # solver.prototxt specifies the snapshot path prefix, this adds an optional 74 | # infix to yield the path: [_]_iters_XYZ.caffemodel 75 | __C.TRAIN.SNAPSHOT_INFIX = '' 76 | 77 | # Use a prefetch thread in roi_data_layer.layer 78 | # So far I haven't found this useful; likely more engineering work is required 79 | __C.TRAIN.USE_PREFETCH = False 80 | 81 | # 82 | # Testing options 83 | # 84 | 85 | __C.TEST = edict() 86 | 87 | # Scales to use during testing (can list multiple scales) 88 | # Each scale is the pixel size of an image's shortest side 89 | __C.TEST.SCALES = (600,) 90 | 91 | # Max pixel size of the longest side of a scaled input image 92 | __C.TEST.MAX_SIZE = 1000 93 | 94 | # Overlap threshold used for non-maximum suppression (suppress boxes with 95 | # IoU >= this threshold) 96 | __C.TEST.NMS = 0.3 97 | 98 | # Experimental: treat the (K+1) units in the cls_score layer as linear 99 | # predictors (trained, eg, with one-vs-rest SVMs). 100 | __C.TEST.SVM = False 101 | 102 | # Test using bounding-box regressors 103 | __C.TEST.BBOX_REG = True 104 | 105 | # 106 | # MISC 107 | # 108 | 109 | # The mapping from image coordinates to feature map coordinates might cause 110 | # some boxes that are distinct in image space to become identical in feature 111 | # coordinates. If DEDUP_BOXES > 0, then DEDUP_BOXES is used as the scale factor 112 | # for identifying duplicate boxes. 113 | # 1/16 is correct for {Alex,Caffe}Net, VGG_CNN_M_1024, and VGG16 114 | __C.DEDUP_BOXES = 1./16. 
115 | 116 | # Pixel mean values (BGR order) as a (1, 1, 3) array 117 | # We use the same pixel mean for all networks even though it's not exactly what 118 | # they were trained with 119 | __C.PIXEL_MEANS = np.array([[[102.9801, 115.9465, 122.7717]]]) 120 | 121 | # For reproducibility 122 | __C.RNG_SEED = 3 123 | 124 | # A small number that's used many times 125 | __C.EPS = 1e-14 126 | 127 | # Root directory of project 128 | __C.ROOT_DIR = osp.abspath(osp.join(osp.dirname(__file__), '..', '..')) 129 | 130 | # Place outputs under an experiments directory 131 | __C.EXP_DIR = 'default' 132 | 133 | def get_output_dir(imdb, net): 134 | """Return the directory where experimental artifacts are placed. 135 | 136 | A canonical path is built using the name from an imdb and a network 137 | (if not None). 138 | """ 139 | path = osp.abspath(osp.join(__C.ROOT_DIR, 'output', __C.EXP_DIR, imdb.name)) 140 | if net is None: 141 | return path 142 | else: 143 | return osp.join(path, net.name) 144 | 145 | def _merge_a_into_b(a, b): 146 | """Merge config dictionary a into config dictionary b, clobbering the 147 | options in b whenever they are also specified in a. 148 | """ 149 | if type(a) is not edict: 150 | return 151 | 152 | for k, v in a.iteritems(): 153 | # a must specify keys that are in b 154 | if not b.has_key(k): 155 | raise KeyError('{} is not a valid config key'.format(k)) 156 | 157 | # the types must match, too 158 | if type(b[k]) is not type(v): 159 | raise ValueError(('Type mismatch ({} vs. 
{}) ' 160 | 'for config key: {}').format(type(b[k]), 161 | type(v), k)) 162 | 163 | # recursively merge dicts 164 | if type(v) is edict: 165 | try: 166 | _merge_a_into_b(a[k], b[k]) 167 | except: 168 | print('Error under config key: {}'.format(k)) 169 | raise 170 | else: 171 | b[k] = v 172 | 173 | def cfg_from_file(filename): 174 | """Load a config file and merge it into the default options.""" 175 | import yaml 176 | with open(filename, 'r') as f: 177 | yaml_cfg = edict(yaml.load(f)) 178 | 179 | _merge_a_into_b(yaml_cfg, __C) 180 | 181 | def cfg_from_list(cfg_list): 182 | """Set config keys via list (e.g., from command line).""" 183 | from ast import literal_eval 184 | assert len(cfg_list) % 2 == 0 185 | for k, v in zip(cfg_list[0::2], cfg_list[1::2]): 186 | key_list = k.split('.') 187 | d = __C 188 | for subkey in key_list[:-1]: 189 | assert d.has_key(subkey) 190 | d = d[subkey] 191 | subkey = key_list[-1] 192 | assert d.has_key(subkey) 193 | try: 194 | value = literal_eval(v) 195 | except: 196 | # handle the case when v is a string literal 197 | value = v 198 | assert type(value) == type(d[subkey]), \ 199 | 'type {} does not match original type {}'.format( 200 | type(value), type(d[subkey])) 201 | d[subkey] = value 202 | -------------------------------------------------------------------------------- /lib/fast_rcnn/train.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Train a Fast R-CNN network.""" 9 | 10 | import caffe 11 | from fast_rcnn.config import cfg 12 | import roi_data_layer.roidb as rdl_roidb 13 | from utils.timer import Timer 14 | import numpy as np 15 | import os 16 | 17 | from caffe.proto import caffe_pb2 18 | import google.protobuf as 
pb2 19 | 20 | class SolverWrapper(object): 21 | """A simple wrapper around Caffe's solver. 22 | This wrapper gives us control over he snapshotting process, which we 23 | use to unnormalize the learned bounding-box regression weights. 24 | """ 25 | 26 | def __init__(self, solver_prototxt, roidb, output_dir, 27 | pretrained_model=None): 28 | """Initialize the SolverWrapper.""" 29 | self.output_dir = output_dir 30 | 31 | print 'Computing bounding-box regression targets...' 32 | self.bbox_means, self.bbox_stds = \ 33 | rdl_roidb.add_bbox_regression_targets(roidb) 34 | print 'done' 35 | 36 | self.solver = caffe.SGDSolver(solver_prototxt) 37 | if pretrained_model is not None: 38 | print ('Loading pretrained model ' 39 | 'weights from {:s}').format(pretrained_model) 40 | self.solver.net.copy_from(pretrained_model) 41 | 42 | self.solver_param = caffe_pb2.SolverParameter() 43 | with open(solver_prototxt, 'rt') as f: 44 | pb2.text_format.Merge(f.read(), self.solver_param) 45 | 46 | self.solver.net.layers[0].set_roidb(roidb) 47 | 48 | def snapshot(self): 49 | """Take a snapshot of the network after unnormalizing the learned 50 | bounding-box regression weights. This enables easy use at test-time. 51 | """ 52 | net = self.solver.net 53 | 54 | if cfg.TRAIN.BBOX_REG: 55 | # save original values 56 | orig_0 = net.params['bbox_pred'][0].data.copy() 57 | orig_1 = net.params['bbox_pred'][1].data.copy() 58 | 59 | # scale and shift with bbox reg unnormalization; then save snapshot 60 | net.params['bbox_pred'][0].data[...] = \ 61 | (net.params['bbox_pred'][0].data * 62 | self.bbox_stds[:, np.newaxis]) 63 | net.params['bbox_pred'][1].data[...] 
= \ 64 | (net.params['bbox_pred'][1].data * 65 | self.bbox_stds + self.bbox_means) 66 | 67 | if not os.path.exists(self.output_dir): 68 | os.makedirs(self.output_dir) 69 | 70 | infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX 71 | if cfg.TRAIN.SNAPSHOT_INFIX != '' else '') 72 | filename = (self.solver_param.snapshot_prefix + infix + 73 | '_iter_{:d}'.format(self.solver.iter) + '.caffemodel') 74 | filename = os.path.join(self.output_dir, filename) 75 | 76 | net.save(str(filename)) 77 | print 'Wrote snapshot to: {:s}'.format(filename) 78 | 79 | if cfg.TRAIN.BBOX_REG: 80 | # restore net to original state 81 | net.params['bbox_pred'][0].data[...] = orig_0 82 | net.params['bbox_pred'][1].data[...] = orig_1 83 | 84 | def train_model(self, max_iters): 85 | """Network training loop.""" 86 | last_snapshot_iter = -1 87 | timer = Timer() 88 | while self.solver.iter < max_iters: 89 | # Make one SGD update 90 | timer.tic() 91 | self.solver.step(1) 92 | timer.toc() 93 | if self.solver.iter % (10 * self.solver_param.display) == 0: 94 | print 'speed: {:.3f}s / iter'.format(timer.average_time) 95 | 96 | if self.solver.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0: 97 | last_snapshot_iter = self.solver.iter 98 | self.snapshot() 99 | 100 | if last_snapshot_iter != self.solver.iter: 101 | self.snapshot() 102 | 103 | def get_training_roidb(imdb): 104 | """Returns a roidb (Region of Interest database) for use in training.""" 105 | if cfg.TRAIN.USE_FLIPPED: 106 | print 'Appending horizontally-flipped training examples...' 107 | imdb.append_flipped_images() 108 | print 'done' 109 | 110 | print 'Preparing training data...' 111 | rdl_roidb.prepare_roidb(imdb) 112 | print 'done' 113 | 114 | return imdb.roidb 115 | 116 | def train_net(solver_prototxt, roidb, output_dir, 117 | pretrained_model=None, max_iters=40000): 118 | """Train a Fast R-CNN network.""" 119 | sw = SolverWrapper(solver_prototxt, roidb, output_dir, 120 | pretrained_model=pretrained_model) 121 | 122 | print 'Solving...' 
123 | sw.train_model(max_iters) 124 | print 'done solving' 125 | -------------------------------------------------------------------------------- /lib/roi_data_layer/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/roi_data_layer/layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """The data layer used during training to train a Fast R-CNN network. 9 | 10 | RoIDataLayer implements a Caffe Python layer. 
11 | """ 12 | 13 | import caffe 14 | from fast_rcnn.config import cfg 15 | from roi_data_layer.minibatch import get_minibatch 16 | import numpy as np 17 | import yaml 18 | from multiprocessing import Process, Queue 19 | 20 | class RoIDataLayer(caffe.Layer): 21 | """Fast R-CNN data layer used for training.""" 22 | 23 | def _shuffle_roidb_inds(self): 24 | """Randomly permute the training roidb.""" 25 | self._perm = np.random.permutation(np.arange(len(self._roidb))) 26 | self._cur = 0 27 | 28 | def _get_next_minibatch_inds(self): 29 | """Return the roidb indices for the next minibatch.""" 30 | if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb): 31 | self._shuffle_roidb_inds() 32 | 33 | db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] 34 | self._cur += cfg.TRAIN.IMS_PER_BATCH 35 | return db_inds 36 | 37 | def _get_next_minibatch(self): 38 | """Return the blobs to be used for the next minibatch. 39 | 40 | If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a 41 | separate process and made available through self._blob_queue. 
42 | """ 43 | if cfg.TRAIN.USE_PREFETCH: 44 | return self._blob_queue.get() 45 | else: 46 | db_inds = self._get_next_minibatch_inds() 47 | minibatch_db = [self._roidb[i] for i in db_inds] 48 | return get_minibatch(minibatch_db, self._num_classes) 49 | 50 | def set_roidb(self, roidb): 51 | """Set the roidb to be used by this layer during training.""" 52 | self._roidb = roidb 53 | self._shuffle_roidb_inds() 54 | if cfg.TRAIN.USE_PREFETCH: 55 | self._blob_queue = Queue(10) 56 | self._prefetch_process = BlobFetcher(self._blob_queue, 57 | self._roidb, 58 | self._num_classes) 59 | self._prefetch_process.start() 60 | # Terminate the child process when the parent exists 61 | def cleanup(): 62 | print 'Terminating BlobFetcher' 63 | self._prefetch_process.terminate() 64 | self._prefetch_process.join() 65 | import atexit 66 | atexit.register(cleanup) 67 | 68 | def setup(self, bottom, top): 69 | """Setup the RoIDataLayer.""" 70 | 71 | # parse the layer parameter string, which must be valid YAML 72 | layer_params = yaml.load(self.param_str_) 73 | 74 | self._num_classes = layer_params['num_classes'] 75 | 76 | self._name_to_top_map = { 77 | 'data': 0, 78 | 'rois': 1, 79 | 'labels': 2} 80 | 81 | # data blob: holds a batch of N images, each with 3 channels 82 | # The height and width (100 x 100) are dummy values 83 | top[0].reshape(1, 3, 100, 100) 84 | 85 | # rois blob: holds R regions of interest, each is a 5-tuple 86 | # (n, x1, y1, x2, y2) specifying an image batch index n and a 87 | # rectangle (x1, y1, x2, y2) 88 | top[1].reshape(1, 5) 89 | 90 | # labels blob: R categorical labels in [0, ..., K] for K foreground 91 | # classes plus background 92 | top[2].reshape(1) 93 | 94 | if cfg.TRAIN.BBOX_REG: 95 | self._name_to_top_map['bbox_targets'] = 3 96 | self._name_to_top_map['bbox_loss_weights'] = 4 97 | 98 | # bbox_targets blob: R bounding-box regression targets with 4 99 | # targets per class 100 | top[3].reshape(1, self._num_classes * 4) 101 | 102 | # bbox_loss_weights blob: At 
class BlobFetcher(Process):
    """Experimental class for prefetching blobs in a separate process."""
    def __init__(self, queue, roidb, num_classes):
        """
        Args:
            queue: multiprocessing queue that receives prefetched blob dicts.
            roidb: list of roidb entries to sample minibatches from.
            num_classes: number of object classes (including background).
        """
        super(BlobFetcher, self).__init__()
        self._queue = queue
        self._roidb = roidb
        self._num_classes = num_classes
        self._perm = None
        self._cur = 0
        # Fix the random seed for reproducibility BEFORE the first shuffle.
        # Previously the seed was set after _shuffle_roidb_inds(), leaving
        # the initial permutation unseeded and runs non-reproducible.
        # NOTE(review): on fork-based platforms the child process inherits
        # this state; consider reseeding in run() — confirm desired semantics.
        np.random.seed(cfg.RNG_SEED)
        self._shuffle_roidb_inds()

    def _shuffle_roidb_inds(self):
        """Randomly permute the training roidb."""
        # TODO(rbg): remove duplicated code
        self._perm = np.random.permutation(np.arange(len(self._roidb)))
        self._cur = 0

    def _get_next_minibatch_inds(self):
        """Return the roidb indices for the next minibatch."""
        # TODO(rbg): remove duplicated code
        # Reshuffle once the epoch is (nearly) exhausted.
        if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb):
            self._shuffle_roidb_inds()

        db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH]
        self._cur += cfg.TRAIN.IMS_PER_BATCH
        return db_inds

    def run(self):
        """Produce minibatch blobs forever, pushing each onto the queue."""
        # print() call form so the module also parses under Python 3.
        print('BlobFetcher started')
        while True:
            db_inds = self._get_next_minibatch_inds()
            minibatch_db = [self._roidb[i] for i in db_inds]
            blobs = get_minibatch(minibatch_db, self._num_classes)
            self._queue.put(blobs)
def get_minibatch(roidb, num_classes):
    """Given a roidb, construct a minibatch sampled from it.

    Args:
        roidb: list of roidb entries (one per image), each holding the image
            path, boxes, and precomputed overlap/target metadata.
        num_classes: number of object classes (including background).

    Returns:
        dict of network input blobs: 'data', 'rois', 'labels', and — when
        cfg.TRAIN.BBOX_REG — 'bbox_targets' / 'bbox_loss_weights'.
    """
    num_images = len(roidb)
    # Sample random scales to use for each image in this batch
    random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES),
                                    size=num_images)
    assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \
        'num_images ({}) must divide BATCH_SIZE ({})'. \
        format(num_images, cfg.TRAIN.BATCH_SIZE)
    # Floor division keeps the per-image RoI count an integer under Python 3
    # (plain '/' would yield a float and break downstream indexing).
    rois_per_image = cfg.TRAIN.BATCH_SIZE // num_images
    # np.round returns a float; cast so sampling sizes stay integral.
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))

    # Get the input image blob, formatted for caffe
    im_blob, im_scales = _get_image_blob(roidb, random_scale_inds)

    # Now, build the region of interest and label blobs
    rois_blob = np.zeros((0, 5), dtype=np.float32)
    labels_blob = np.zeros((0), dtype=np.float32)
    bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32)
    bbox_loss_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32)
    # all_overlaps = []
    for im_i in range(num_images):
        labels, overlaps, im_rois, bbox_targets, bbox_loss \
            = _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image,
                           num_classes)

        # Add to RoIs blob; column 0 is the image's index within the batch.
        rois = _project_im_rois(im_rois, im_scales[im_i])
        batch_ind = im_i * np.ones((rois.shape[0], 1))
        rois_blob_this_image = np.hstack((batch_ind, rois))
        rois_blob = np.vstack((rois_blob, rois_blob_this_image))

        # Add to labels, bbox targets, and bbox loss blobs
        labels_blob = np.hstack((labels_blob, labels))
        bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets))
        bbox_loss_blob = np.vstack((bbox_loss_blob, bbox_loss))
        # all_overlaps = np.hstack((all_overlaps, overlaps))

    # For debug visualizations
    # _vis_minibatch(im_blob, rois_blob, labels_blob, all_overlaps)

    blobs = {'data': im_blob,
             'rois': rois_blob,
             'labels': labels_blob}

    if cfg.TRAIN.BBOX_REG:
        blobs['bbox_targets'] = bbox_targets_blob
        blobs['bbox_loss_weights'] = bbox_loss_blob

    return blobs
70 | """ 71 | # label = class RoI has max overlap with 72 | labels = roidb['max_classes'] 73 | overlaps = roidb['max_overlaps'] 74 | rois = roidb['boxes'] 75 | 76 | # Select foreground RoIs as those with >= FG_THRESH overlap 77 | fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] 78 | # Guard against the case when an image has fewer than fg_rois_per_image 79 | # foreground RoIs 80 | fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_inds.size) 81 | # Sample foreground regions without replacement 82 | if fg_inds.size > 0: 83 | fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, 84 | replace=False) 85 | 86 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 87 | bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & 88 | (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 89 | # Compute number of background RoIs to take from this image (guarding 90 | # against there being fewer than desired) 91 | bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image 92 | bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, 93 | bg_inds.size) 94 | # Sample foreground regions without replacement 95 | if bg_inds.size > 0: 96 | bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, 97 | replace=False) 98 | 99 | # The indices that we're selecting (both fg and bg) 100 | keep_inds = np.append(fg_inds, bg_inds) 101 | # Select sampled values from various arrays: 102 | labels = labels[keep_inds] 103 | # Clamp labels for the background RoIs to 0 104 | labels[fg_rois_per_this_image:] = 0 105 | overlaps = overlaps[keep_inds] 106 | rois = rois[keep_inds] 107 | 108 | bbox_targets, bbox_loss_weights = \ 109 | _get_bbox_regression_labels(roidb['bbox_targets'][keep_inds, :], 110 | num_classes) 111 | 112 | return labels, overlaps, rois, bbox_targets, bbox_loss_weights 113 | 114 | def _get_image_blob(roidb, scale_inds): 115 | """Builds an input blob from the images in the roidb at the specified 116 | scales. 
117 | """ 118 | num_images = len(roidb) 119 | processed_ims = [] 120 | im_scales = [] 121 | for i in xrange(num_images): 122 | im = cv2.imread(roidb[i]['image']) 123 | if roidb[i]['flipped']: 124 | im = im[:, ::-1, :] 125 | target_size = cfg.TRAIN.SCALES[scale_inds[i]] 126 | im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size, 127 | cfg.TRAIN.MAX_SIZE) 128 | im_scales.append(im_scale) 129 | processed_ims.append(im) 130 | 131 | # Create a blob to hold the input images 132 | blob = im_list_to_blob(processed_ims) 133 | 134 | return blob, im_scales 135 | 136 | def _project_im_rois(im_rois, im_scale_factor): 137 | """Project image RoIs into the rescaled training image.""" 138 | rois = im_rois * im_scale_factor 139 | return rois 140 | 141 | def _get_bbox_regression_labels(bbox_target_data, num_classes): 142 | """Bounding-box regression targets are stored in a compact form in the 143 | roidb. 144 | 145 | This function expands those targets into the 4-of-4*K representation used 146 | by the network (i.e. only one class has non-zero targets). The loss weights 147 | are similarly expanded. 148 | 149 | Returns: 150 | bbox_target_data (ndarray): N x 4K blob of regression targets 151 | bbox_loss_weights (ndarray): N x 4K blob of loss weights 152 | """ 153 | clss = bbox_target_data[:, 0] 154 | bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) 155 | bbox_loss_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 156 | inds = np.where(clss > 0)[0] 157 | for ind in inds: 158 | cls = clss[ind] 159 | start = 4 * cls 160 | end = start + 4 161 | bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] 162 | bbox_loss_weights[ind, start:end] = [1., 1., 1., 1.] 
def _vis_minibatch(im_blob, rois_blob, labels_blob, overlaps):
    """Visualize a mini-batch for debugging.

    Shows each RoI drawn on its (de-meaned, BGR->RGB) source image and prints
    its class label and overlap. Debug-only; blocks on plt.show().
    """
    import matplotlib.pyplot as plt
    for i in range(rois_blob.shape[0]):
        rois = rois_blob[i, :]
        # Column 0 of the RoI blob is the (float) batch image index.
        im_ind = int(rois[0])
        roi = rois[1:]
        im = im_blob[im_ind, :, :, :].transpose((1, 2, 0)).copy()
        im += cfg.PIXEL_MEANS
        im = im[:, :, (2, 1, 0)]
        im = im.astype(np.uint8)
        cls = labels_blob[i]
        plt.imshow(im)
        # print() call form so the module also parses under Python 3.
        print('class: ', cls, ' overlap: ', overlaps[i])
        plt.gca().add_patch(
            plt.Rectangle((roi[0], roi[1]), roi[2] - roi[0],
                          roi[3] - roi[1], fill=False,
                          edgecolor='r', linewidth=3)
            )
        plt.show()
20 | """ 21 | roidb = imdb.roidb 22 | for i in xrange(len(imdb.image_index)): 23 | roidb[i]['image'] = imdb.image_path_at(i) 24 | # need gt_overlaps as a dense array for argmax 25 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 26 | # max overlap with gt over classes (columns) 27 | max_overlaps = gt_overlaps.max(axis=1) 28 | # gt class that had the max overlap 29 | max_classes = gt_overlaps.argmax(axis=1) 30 | roidb[i]['max_classes'] = max_classes 31 | roidb[i]['max_overlaps'] = max_overlaps 32 | # sanity checks 33 | # max overlap of 0 => class should be zero (background) 34 | zero_inds = np.where(max_overlaps == 0)[0] 35 | assert all(max_classes[zero_inds] == 0) 36 | # max overlap > 0 => class should not be zero (must be a fg class) 37 | nonzero_inds = np.where(max_overlaps > 0)[0] 38 | assert all(max_classes[nonzero_inds] != 0) 39 | 40 | def add_bbox_regression_targets(roidb): 41 | """Add information needed to train bounding-box regressors.""" 42 | assert len(roidb) > 0 43 | assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?' 
def add_bbox_regression_targets(roidb):
    """Add information needed to train bounding-box regressors.

    Computes per-RoI regression targets for each image (stored under
    'bbox_targets'), then normalizes them per class to zero mean and unit
    variance.

    Returns:
        (means, stds): flattened per-class target means and standard
        deviations, needed later to un-normalize network predictions.
    """
    assert len(roidb) > 0
    assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'

    num_images = len(roidb)
    # Infer number of classes from the number of columns in gt_overlaps
    num_classes = roidb[0]['gt_overlaps'].shape[1]
    for im_i in range(num_images):
        rois = roidb[im_i]['boxes']
        max_overlaps = roidb[im_i]['max_overlaps']
        max_classes = roidb[im_i]['max_classes']
        roidb[im_i]['bbox_targets'] = \
            _compute_targets(rois, max_overlaps, max_classes)

    # Compute values needed for means and stds
    # var(x) = E(x^2) - E(x)^2
    class_counts = np.zeros((num_classes, 1)) + cfg.EPS
    sums = np.zeros((num_classes, 4))
    squared_sums = np.zeros((num_classes, 4))
    for im_i in range(num_images):
        targets = roidb[im_i]['bbox_targets']
        # Class 0 is background and carries no regression targets.
        for cls in range(1, num_classes):
            cls_inds = np.where(targets[:, 0] == cls)[0]
            if cls_inds.size > 0:
                class_counts[cls] += cls_inds.size
                sums[cls, :] += targets[cls_inds, 1:].sum(axis=0)
                squared_sums[cls, :] += (targets[cls_inds, 1:] ** 2).sum(axis=0)

    means = sums / class_counts
    stds = np.sqrt(squared_sums / class_counts - means ** 2)

    # Normalize targets
    for im_i in range(num_images):
        targets = roidb[im_i]['bbox_targets']
        for cls in range(1, num_classes):
            cls_inds = np.where(targets[:, 0] == cls)[0]
            roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :]
            roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :]

    # These values will be needed for making predictions
    # (the predicts will need to be unnormalized and uncentered)
    return means.ravel(), stds.ravel()
def _compute_targets(rois, overlaps, labels):
    """Compute bounding-box regression targets for an image.

    Returns:
        N x 5 float32 array; column 0 is the class label (0 for RoIs below
        BBOX_THRESH), columns 1:5 are (dx, dy, dw, dh) toward the assigned
        ground-truth box.
    """
    # Ensure ROIs are floats. np.float64 replaces the deprecated np.float
    # alias (removed in NumPy 1.24); both mean the builtin float / float64.
    rois = rois.astype(np.float64, copy=False)

    # Indices of ground-truth ROIs
    gt_inds = np.where(overlaps == 1)[0]
    # Indices of examples for which we try to make predictions
    ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]

    # Get IoU overlap between each ex ROI and gt ROI
    ex_gt_overlaps = utils.cython_bbox.bbox_overlaps(rois[ex_inds, :],
                                                     rois[gt_inds, :])

    # Find which gt ROI each ex ROI has max overlap with:
    # this will be the ex ROI's gt target
    gt_assignment = ex_gt_overlaps.argmax(axis=1)
    gt_rois = rois[gt_inds[gt_assignment], :]
    ex_rois = rois[ex_inds, :]

    # cfg.EPS keeps widths/heights strictly positive for the divisions/logs.
    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + cfg.EPS
    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + cfg.EPS
    ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
    ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights

    gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + cfg.EPS
    gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + cfg.EPS
    gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
    gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights

    targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
    targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
    targets_dw = np.log(gt_widths / ex_widths)
    targets_dh = np.log(gt_heights / ex_heights)

    targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
    targets[ex_inds, 0] = labels[ex_inds]
    targets[ex_inds, 1] = targets_dx
    targets[ex_inds, 2] = targets_dy
    targets[ex_inds, 3] = targets_dw
    targets[ex_inds, 4] = targets_dh
    return targets
# Build the Cython extension modules (bbox overlaps and NMS).
cmdclass = {'build_ext': build_ext}

def _cython_ext(name, sources):
    # Helper: all our extensions share the same warning-suppression flags.
    return Extension(name, sources,
                     extra_compile_args=["-Wno-cpp", "-Wno-unused-function"])

ext_modules = [
    _cython_ext("utils.cython_bbox", ["utils/bbox.pyx"]),
    _cython_ext("utils.cython_nms", ["utils/nms.pyx"]),
]

setup(
    name='fast_rcnn',
    cmdclass=cmdclass,
    ext_modules=ext_modules,
    include_dirs=[np.get_include()],
)
def bbox_overlaps(
        np.ndarray[DTYPE_t, ndim=2] boxes,
        np.ndarray[DTYPE_t, ndim=2] query_boxes):
    """
    Parameters
    ----------
    boxes: (N, 4) ndarray of float
    query_boxes: (K, 4) ndarray of float
    Returns
    -------
    overlaps: (N, K) ndarray of overlap between boxes and query_boxes
    """
    cdef unsigned int N = boxes.shape[0]
    cdef unsigned int K = query_boxes.shape[0]
    cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
    cdef DTYPE_t iw, ih, box_area
    cdef DTYPE_t ua
    cdef unsigned int k, n
    for k in range(K):
        # The +1 terms treat box coordinates as inclusive pixel indices.
        box_area = (
            (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
            (query_boxes[k, 3] - query_boxes[k, 1] + 1)
        )
        for n in range(N):
            # Intersection width; if <= 0 the boxes cannot overlap, so the
            # entry stays at its zero initialization.
            iw = (
                min(boxes[n, 2], query_boxes[k, 2]) -
                max(boxes[n, 0], query_boxes[k, 0]) + 1
            )
            if iw > 0:
                ih = (
                    min(boxes[n, 3], query_boxes[k, 3]) -
                    max(boxes[n, 1], query_boxes[k, 1]) + 1
                )
                if ih > 0:
                    # ua = union area = area(box n) + area(query k) - inter.
                    ua = float(
                        (boxes[n, 2] - boxes[n, 0] + 1) *
                        (boxes[n, 3] - boxes[n, 1] + 1) +
                        box_area - iw * ih
                    )
                    overlaps[n, k] = iw * ih / ua
    return overlaps
17 | """ 18 | max_shape = np.array([im.shape for im in ims]).max(axis=0) 19 | num_images = len(ims) 20 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 21 | dtype=np.float32) 22 | for i in xrange(num_images): 23 | im = ims[i] 24 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 25 | # Move channels (axis 3) to axis 1 26 | # Axis order will become: (batch elem, channel, height, width) 27 | channel_swap = (0, 3, 1, 2) 28 | blob = blob.transpose(channel_swap) 29 | return blob 30 | 31 | def prep_im_for_blob(im, pixel_means, target_size, max_size): 32 | """Mean subtract and scale an image for use in a blob.""" 33 | im = im.astype(np.float32, copy=False) 34 | im -= pixel_means 35 | im_shape = im.shape 36 | im_size_min = np.min(im_shape[0:2]) 37 | im_size_max = np.max(im_shape[0:2]) 38 | im_scale = float(target_size) / float(im_size_min) 39 | # Prevent the biggest axis from being more than MAX_SIZE 40 | if np.round(im_scale * im_size_max) > max_size: 41 | im_scale = float(max_size) / float(im_size_max) 42 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 43 | interpolation=cv2.INTER_LINEAR) 44 | 45 | return im, im_scale 46 | -------------------------------------------------------------------------------- /lib/utils/nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def nms(dets, thresh): 11 | x1 = dets[:, 0] 12 | y1 = dets[:, 1] 13 | x2 = dets[:, 2] 14 | y2 = dets[:, 3] 15 | scores = dets[:, 4] 16 | 17 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 18 | order = scores.argsort()[::-1] 19 | 20 | keep = [] 21 | while order.size > 0: 22 | i = order[0] 23 | keep.append(i) 24 | xx1 = np.maximum(x1[i], x1[order[1:]]) 25 | 
def nms(dets, thresh):
    """Greedy non-maximum suppression.

    Repeatedly keeps the highest-scoring remaining box and discards every
    remaining box whose IoU with it exceeds `thresh`.

    Args:
        dets: N x 5 array; columns are x1, y1, x2, y2, score.
        thresh: IoU threshold above which a lower-scoring box is suppressed.

    Returns:
        list of indices (into dets) of the kept boxes, best score first.
    """
    coords = dets[:, :4]
    scores = dets[:, 4]
    # +1 on each side length: coordinates are inclusive pixel indices.
    areas = (coords[:, 2] - coords[:, 0] + 1) * (coords[:, 3] - coords[:, 1] + 1)

    remaining = scores.argsort()[::-1]  # highest score first
    keep = []
    while remaining.size > 0:
        best = remaining[0]
        keep.append(best)
        rest = remaining[1:]

        # Intersection of `best` with every other remaining box.
        ix1 = np.maximum(coords[best, 0], coords[rest, 0])
        iy1 = np.maximum(coords[best, 1], coords[rest, 1])
        ix2 = np.minimum(coords[best, 2], coords[rest, 2])
        iy2 = np.minimum(coords[best, 3], coords[rest, 3])
        iw = np.maximum(0.0, ix2 - ix1 + 1)
        ih = np.maximum(0.0, iy2 - iy1 + 1)
        inter = iw * ih

        iou = inter / (areas[best] + areas[rest] - inter)
        # Drop everything that overlaps `best` too much.
        remaining = rest[iou <= thresh]

    return keep
def nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
    # Greedy non-maximum suppression over N x 5 detections
    # (x1, y1, x2, y2, score); returns indices of kept boxes.
    cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
    cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
    cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
    cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
    cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]

    # +1: inclusive pixel coordinates.
    cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]

    cdef int ndets = dets.shape[0]
    # 1 marks a detection already suppressed by a higher-scoring box.
    cdef np.ndarray[np.int_t, ndim=1] suppressed = \
        np.zeros((ndets), dtype=np.int)

    # nominal indices
    cdef int _i, _j
    # sorted indices
    cdef int i, j
    # temp variables for box i's (the box currently under consideration)
    cdef np.float32_t ix1, iy1, ix2, iy2, iarea
    # variables for computing overlap with box j (lower scoring box)
    cdef np.float32_t xx1, yy1, xx2, yy2
    cdef np.float32_t w, h
    cdef np.float32_t inter, ovr

    keep = []
    for _i in range(ndets):
        i = order[_i]
        if suppressed[i] == 1:
            continue
        keep.append(i)
        ix1 = x1[i]
        iy1 = y1[i]
        ix2 = x2[i]
        iy2 = y2[i]
        iarea = areas[i]
        for _j in range(_i + 1, ndets):
            j = order[_j]
            if suppressed[j] == 1:
                continue
            # Intersection rectangle of boxes i and j (clamped to 0 below).
            xx1 = max(ix1, x1[j])
            yy1 = max(iy1, y1[j])
            xx2 = min(ix2, x2[j])
            yy2 = min(iy2, y2[j])
            w = max(0.0, xx2 - xx1 + 1)
            h = max(0.0, yy2 - yy1 + 1)
            inter = w * h
            # IoU = intersection / union.
            ovr = inter / (iarea + areas[j] - inter)
            if ovr >= thresh:
                suppressed[j] = 1

    return keep
class Timer(object):
    """Accumulates wall-clock durations across repeated tic/toc calls."""

    def __init__(self):
        self.total_time = 0.
        self.calls = 0
        self.start_time = 0.
        self.diff = 0.
        self.average_time = 0.

    def tic(self):
        # time.time() rather than time.clock(): the latter does not
        # normalize for multithreading.
        self.start_time = time.time()

    def toc(self, average=True):
        """Stop timing; return the average elapsed time per call, or the
        last interval when average is False."""
        self.diff = time.time() - self.start_time
        self.total_time += self.diff
        self.calls += 1
        self.average_time = self.total_time / self.calls
        return self.average_time if average else self.diff
% ------------------------------------------------------------------------
function demo(model, im_id, cls_inds, cls_names)
% ------------------------------------------------------------------------
% Run detection on one bundled demo image and display each requested class.
[folder, name, ext] = fileparts(mfilename('fullpath'));
box_file = fullfile(folder, '..', 'data', 'demo', [im_id '_boxes.mat']);
% Boxes were saved with 0-based indexing
ld = load(box_file); boxes = single(ld.boxes) + 1; clear ld;
im_file = fullfile(folder, '..', 'data', 'demo', [im_id '.jpg']);
im = imread(im_file);
dets = fast_rcnn_im_detect(model, im, boxes);

% Only show detections scoring at least THRESH.
THRESH = 0.8;
for j = 1:length(cls_inds)
  cls_ind = cls_inds(j);
  cls_name = cls_names{j};
  I = find(dets{cls_ind}(:, end) >= THRESH);
  showboxes(im, dets{cls_ind}(I, :));
  title(sprintf('%s detections with p(%s | box) >= %.3f', ...
                cls_name, cls_name, THRESH))
  fprintf('\n> Press any key to continue');
  pause;
end
function dets = fast_rcnn_im_detect(model, im, boxes)
% Perform detection a Fast R-CNN network given an image and
% object proposals.
% Returns a cell array with one [x1 y1 x2 y2 score] matrix per
% non-background class, after per-class NMS.

if model.init_key ~= caffe('get_init_key')
  error('You probably need call fast_rcnn_load_net() first.');
end

[im_batch, scales] = image_pyramid(im, model.pixel_means, false);

[feat_pyra_boxes, feat_pyra_levels] = project_im_rois(boxes, scales);
rois = cat(2, feat_pyra_levels, feat_pyra_boxes);
% Adjust to 0-based indexing and make roi info the fastest dimension
rois = rois - 1;
rois = permute(rois, [2 1]);

input_blobs = cell(2, 1);
input_blobs{1} = im_batch;
input_blobs{2} = rois;
th = tic();
blobs_out = caffe('forward', input_blobs);
fprintf('fwd: %.3fs\n', toc(th));

bbox_deltas = squeeze(blobs_out{1})';
probs = squeeze(blobs_out{2})';

num_classes = size(probs, 2);
dets = cell(num_classes - 1, 1);
NMS_THRESH = 0.3;
% class index 1 is __background__, so we don't return it
for j = 2:num_classes
  cls_probs = probs(:, j);
  % Each class owns 4 consecutive columns of the bbox delta blob.
  cls_deltas = bbox_deltas(:, (1 + (j - 1) * 4):(j * 4));
  pred_boxes = bbox_pred(boxes, cls_deltas);
  cls_dets = [pred_boxes cls_probs];
  keep = nms(cls_dets, NMS_THRESH);
  cls_dets = cls_dets(keep, :);
  dets{j - 1} = cls_dets;
end

% ------------------------------------------------------------------------
function [batch, scales] = image_pyramid(im, pixel_means, multiscale)
% ------------------------------------------------------------------------
% Construct an image pyramid that's ready for feeding directly into caffe
if ~multiscale
  SCALES = [600];
  MAX_SIZE = 1000;
else
  SCALES = [1200 864 688 576 480];
  MAX_SIZE = 2000;
end
num_levels = length(SCALES);

im = single(im);
% Convert to BGR
im = im(:, :, [3 2 1]);
% Subtract mean (mean of the image mean--one mean per channel)
im = bsxfun(@minus, im, pixel_means);

im_orig = im;
im_size = min([size(im_orig, 1) size(im_orig, 2)]);
im_size_big = max([size(im_orig, 1) size(im_orig, 2)]);
scale_factors = SCALES ./ im_size;

max_size = [0 0 0];
for i = 1:num_levels
  % Cap the longer side at MAX_SIZE (overrides the short-side target).
  if round(im_size_big * scale_factors(i)) > MAX_SIZE
    scale_factors(i) = MAX_SIZE / im_size_big;
  end
  ims{i} = imresize(im_orig, scale_factors(i), 'bilinear', ...
                    'antialiasing', false);
  max_size = max(cat(1, max_size, size(ims{i})), [], 1);
end

% Note width-first dims: caffe wants width as the fastest dimension.
batch = zeros(max_size(2), max_size(1), 3, num_levels, 'single');
for i = 1:num_levels
  im = ims{i};
  im_sz = size(im);
  im_sz = im_sz(1:2);
  % Make width the fastest dimension (for caffe)
  im = permute(im, [2 1 3]);
  batch(1:im_sz(2), 1:im_sz(1), :, i) = im;
end
scales = scale_factors';

% ------------------------------------------------------------------------
function [boxes, levels] = project_im_rois(boxes, scales)
% ------------------------------------------------------------------------
% Assign each RoI the pyramid level whose scaled area is closest to
% 224 x 224, then scale its coordinates into that level.
widths = boxes(:,3) - boxes(:,1) + 1;
heights = boxes(:,4) - boxes(:,2) + 1;

areas = widths .* heights;
scaled_areas = bsxfun(@times, areas, (scales.^2)');
diff_areas = abs(scaled_areas - (224 * 224));
[~, levels] = min(diff_areas, [], 2);

boxes = boxes - 1;
boxes = bsxfun(@times, boxes, scales(levels));
boxes = boxes + 1;

% ------------------------------------------------------------------------
function pred_boxes = bbox_pred(boxes, bbox_deltas)
% ------------------------------------------------------------------------
% Apply predicted (dx, dy, dw, dh) deltas to proposal boxes, producing
% [x1 y1 x2 y2] predictions.
if isempty(boxes)
  pred_boxes = [];
  return;
end

Y = bbox_deltas;

% Read out predictions
dst_ctr_x = Y(:, 1);
dst_ctr_y = Y(:, 2);
dst_scl_x = Y(:, 3);
dst_scl_y = Y(:, 4);

src_w = boxes(:, 3) - boxes(:, 1) + eps;
src_h = boxes(:, 4) - boxes(:, 2) + eps;
src_ctr_x = boxes(:, 1) + 0.5 * src_w;
src_ctr_y = boxes(:, 2) + 0.5 * src_h;

% Deltas shift the center by a fraction of the box size and scale the
% width/height exponentially.
pred_ctr_x = (dst_ctr_x .* src_w) + src_ctr_x;
pred_ctr_y = (dst_ctr_y .* src_h) + src_ctr_y;
pred_w = exp(dst_scl_x) .* src_w;
pred_h = exp(dst_scl_y) .* src_h;
pred_boxes = [pred_ctr_x - 0.5 * pred_w, pred_ctr_y - 0.5 * pred_h, ...
              pred_ctr_x + 0.5 * pred_w, pred_ctr_y + 0.5 * pred_h];
function model = fast_rcnn_load_net(def, net, use_gpu)
% Load a Fast R-CNN network.
% def: path to the test prototxt; net: path to the caffemodel weights;
% use_gpu: optional, defaults to GPU mode when absent.

init_key = caffe('init', def, net, 'test');
if exist('use_gpu', 'var') && ~use_gpu
  caffe('set_mode_cpu');
else
  caffe('set_mode_gpu');
end

% init_key lets callers detect a stale/reinitialized caffe instance.
model.init_key = init_key;
% model.stride is correct for the included models, but may not be correct
% for other models!
model.stride = 16;
% BGR channel means used for mean subtraction.
model.pixel_means = reshape([102.9801, 115.9465, 122.7717], [1 1 3]);
7 | % 8 | % NOTE: This is adapted from Pedro Felzenszwalb's version (nms.m), 9 | % but an inner loop has been eliminated to significantly speed it 10 | % up in the case of a large number of boxes 11 | 12 | % Copyright (C) 2011-12 by Tomasz Malisiewicz 13 | % All rights reserved. 14 | % 15 | % This file is part of the Exemplar-SVM library and is made 16 | % available under the terms of the MIT license (see COPYING file). 17 | % Project homepage: https://github.com/quantombone/exemplarsvm 18 | 19 | 20 | if isempty(boxes) 21 | pick = []; 22 | return; 23 | end 24 | 25 | x1 = boxes(:,1); 26 | y1 = boxes(:,2); 27 | x2 = boxes(:,3); 28 | y2 = boxes(:,4); 29 | s = boxes(:,end); 30 | 31 | area = (x2-x1+1) .* (y2-y1+1); 32 | [vals, I] = sort(s); 33 | 34 | pick = s*0; 35 | counter = 1; 36 | while ~isempty(I) 37 | last = length(I); 38 | i = I(last); 39 | pick(counter) = i; 40 | counter = counter + 1; 41 | 42 | xx1 = max(x1(i), x1(I(1:last-1))); 43 | yy1 = max(y1(i), y1(I(1:last-1))); 44 | xx2 = min(x2(i), x2(I(1:last-1))); 45 | yy2 = min(y2(i), y2(I(1:last-1))); 46 | 47 | w = max(0.0, xx2-xx1+1); 48 | h = max(0.0, yy2-yy1+1); 49 | 50 | inter = w.*h; 51 | o = inter ./ (area(i) + area(I(1:last-1)) - inter); 52 | 53 | I = I(find(o<=overlap)); 54 | end 55 | 56 | pick = pick(1:(counter-1)); 57 | -------------------------------------------------------------------------------- /matlab/showboxes.m: -------------------------------------------------------------------------------- 1 | % -------------------------------------------------------- 2 | % Fast R-CNN 3 | % Copyright (c) 2015 Microsoft 4 | % Licensed under The MIT License [see LICENSE for details] 5 | % Written by Ross Girshick 6 | % -------------------------------------------------------- 7 | 8 | function showboxes(im, boxes) 9 | 10 | image(im); 11 | axis image; 12 | axis off; 13 | set(gcf, 'Color', 'white'); 14 | 15 | if ~isempty(boxes) 16 | x1 = boxes(:, 1); 17 | y1 = boxes(:, 2); 18 | x2 = boxes(:, 3); 19 | y2 = boxes(:, 4); 
20 | c = 'r'; 21 | s = '-'; 22 | line([x1 x1 x2 x2 x1]', [y1 y2 y2 y1 y1]', ... 23 | 'color', c, 'linewidth', 2, 'linestyle', s); 24 | for i = 1:size(boxes, 1) 25 | text(double(x1(i)), double(y1(i)) - 2, ... 26 | sprintf('%.3f', boxes(i, end)), ... 27 | 'backgroundcolor', 'r', 'color', 'w'); 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /models/CaffeNet/compressed/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "CaffeNet" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 227 7 | dim: 227 8 | } 9 | input: "rois" 10 | input_shape { 11 | dim: 1 # to be changed on-the-fly to num ROIs 12 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | param { 20 | lr_mult: 0 21 | decay_mult: 0 22 | } 23 | param { 24 | lr_mult: 0 25 | decay_mult: 0 26 | } 27 | convolution_param { 28 | num_output: 96 29 | kernel_size: 11 30 | pad: 5 31 | stride: 4 32 | } 33 | } 34 | layer { 35 | name: "relu1" 36 | type: "ReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 3 48 | pad: 1 49 | stride: 2 50 | } 51 | } 52 | layer { 53 | name: "norm1" 54 | type: "LRN" 55 | bottom: "pool1" 56 | top: "norm1" 57 | lrn_param { 58 | local_size: 5 59 | alpha: 0.0001 60 | beta: 0.75 61 | } 62 | } 63 | layer { 64 | name: "conv2" 65 | type: "Convolution" 66 | bottom: "norm1" 67 | top: "conv2" 68 | param { 69 | lr_mult: 1 70 | decay_mult: 1 71 | } 72 | param { 73 | lr_mult: 2 74 | decay_mult: 0 75 | } 76 | convolution_param { 77 | num_output: 256 78 | kernel_size: 5 79 | pad: 2 80 | group: 2 81 | } 82 | } 83 | layer { 84 | name: "relu2" 85 | type: "ReLU" 86 | bottom: "conv2" 87 | top: "conv2" 88 | } 89 | layer { 90 | name: "pool2" 
91 | type: "Pooling" 92 | bottom: "conv2" 93 | top: "pool2" 94 | pooling_param { 95 | pool: MAX 96 | kernel_size: 3 97 | pad: 1 98 | stride: 2 99 | } 100 | } 101 | layer { 102 | name: "norm2" 103 | type: "LRN" 104 | bottom: "pool2" 105 | top: "norm2" 106 | lrn_param { 107 | local_size: 5 108 | alpha: 0.0001 109 | beta: 0.75 110 | } 111 | } 112 | layer { 113 | name: "conv3" 114 | type: "Convolution" 115 | bottom: "norm2" 116 | top: "conv3" 117 | param { 118 | lr_mult: 1 119 | decay_mult: 1 120 | } 121 | param { 122 | lr_mult: 2 123 | decay_mult: 0 124 | } 125 | convolution_param { 126 | num_output: 384 127 | kernel_size: 3 128 | pad: 1 129 | } 130 | } 131 | layer { 132 | name: "relu3" 133 | type: "ReLU" 134 | bottom: "conv3" 135 | top: "conv3" 136 | } 137 | layer { 138 | name: "conv4" 139 | type: "Convolution" 140 | bottom: "conv3" 141 | top: "conv4" 142 | param { 143 | lr_mult: 1 144 | decay_mult: 1 145 | } 146 | param { 147 | lr_mult: 2 148 | decay_mult: 0 149 | } 150 | convolution_param { 151 | num_output: 384 152 | kernel_size: 3 153 | pad: 1 154 | group: 2 155 | } 156 | } 157 | layer { 158 | name: "relu4" 159 | type: "ReLU" 160 | bottom: "conv4" 161 | top: "conv4" 162 | } 163 | layer { 164 | name: "conv5" 165 | type: "Convolution" 166 | bottom: "conv4" 167 | top: "conv5" 168 | param { 169 | lr_mult: 1 170 | decay_mult: 1 171 | } 172 | param { 173 | lr_mult: 2 174 | decay_mult: 0 175 | } 176 | convolution_param { 177 | num_output: 256 178 | kernel_size: 3 179 | pad: 1 180 | group: 2 181 | } 182 | } 183 | layer { 184 | name: "relu5" 185 | type: "ReLU" 186 | bottom: "conv5" 187 | top: "conv5" 188 | } 189 | layer { 190 | name: "roi_pool5" 191 | type: "ROIPooling" 192 | bottom: "conv5" 193 | bottom: "rois" 194 | top: "pool5" 195 | roi_pooling_param { 196 | pooled_w: 6 197 | pooled_h: 6 198 | spatial_scale: 0.0625 # 1/16 199 | } 200 | } 201 | layer { 202 | name: "fc6_L" 203 | type: "InnerProduct" 204 | bottom: "pool5" 205 | top: "fc6_L" 206 | param { 207 | lr_mult: 1 
208 | decay_mult: 1 209 | } 210 | inner_product_param { 211 | num_output: 1024 212 | bias_term: false 213 | } 214 | } 215 | layer { 216 | name: "fc6_U" 217 | type: "InnerProduct" 218 | bottom: "fc6_L" 219 | top: "fc6_U" 220 | param { 221 | lr_mult: 1 222 | decay_mult: 1 223 | } 224 | param { 225 | lr_mult: 2 226 | decay_mult: 0 227 | } 228 | inner_product_param { 229 | num_output: 4096 230 | } 231 | } 232 | layer { 233 | name: "relu6" 234 | type: "ReLU" 235 | bottom: "fc6_U" 236 | top: "fc6_U" 237 | } 238 | layer { 239 | name: "drop6" 240 | type: "Dropout" 241 | bottom: "fc6_U" 242 | top: "fc6_U" 243 | dropout_param { 244 | dropout_ratio: 0.5 245 | } 246 | } 247 | layer { 248 | name: "fc7_L" 249 | type: "InnerProduct" 250 | bottom: "fc6_U" 251 | top: "fc7_L" 252 | param { 253 | lr_mult: 1 254 | decay_mult: 1 255 | } 256 | inner_product_param { 257 | num_output: 256 258 | bias_term: false 259 | } 260 | } 261 | layer { 262 | name: "fc7_U" 263 | type: "InnerProduct" 264 | bottom: "fc7_L" 265 | top: "fc7_U" 266 | param { 267 | lr_mult: 1 268 | decay_mult: 1 269 | } 270 | param { 271 | lr_mult: 2 272 | decay_mult: 0 273 | } 274 | inner_product_param { 275 | num_output: 4096 276 | } 277 | } 278 | layer { 279 | name: "relu7" 280 | type: "ReLU" 281 | bottom: "fc7_U" 282 | top: "fc7_U" 283 | } 284 | layer { 285 | name: "drop7" 286 | type: "Dropout" 287 | bottom: "fc7_U" 288 | top: "fc7_U" 289 | dropout_param { 290 | dropout_ratio: 0.5 291 | } 292 | } 293 | layer { 294 | name: "cls_score" 295 | type: "InnerProduct" 296 | bottom: "fc7_U" 297 | top: "cls_score" 298 | param { 299 | lr_mult: 1 300 | decay_mult: 1 301 | } 302 | param { 303 | lr_mult: 2 304 | decay_mult: 0 305 | } 306 | inner_product_param { 307 | num_output: 21 308 | weight_filler { 309 | type: "gaussian" 310 | std: 0.01 311 | } 312 | bias_filler { 313 | type: "constant" 314 | value: 0 315 | } 316 | } 317 | } 318 | layer { 319 | name: "bbox_pred" 320 | type: "InnerProduct" 321 | bottom: "fc7_U" 322 | top: 
"bbox_pred" 323 | param { 324 | lr_mult: 1 325 | decay_mult: 1 326 | } 327 | param { 328 | lr_mult: 2 329 | decay_mult: 0 330 | } 331 | inner_product_param { 332 | num_output: 84 333 | weight_filler { 334 | type: "gaussian" 335 | std: 0.001 336 | } 337 | bias_filler { 338 | type: "constant" 339 | value: 0 340 | } 341 | } 342 | } 343 | layer { 344 | name: "cls_prob" 345 | type: "Softmax" 346 | bottom: "cls_score" 347 | top: "cls_prob" 348 | } 349 | -------------------------------------------------------------------------------- /models/CaffeNet/no_bbox_reg/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/CaffeNet/no_bbox_reg/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 11 | # function 12 | snapshot: 0 13 | # We still use the snapshot prefix, though 14 | snapshot_prefix: "caffenet_fast_rcnn" 15 | #debug_info: true 16 | -------------------------------------------------------------------------------- /models/CaffeNet/no_bbox_reg/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "CaffeNet" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 227 7 | dim: 227 8 | } 9 | input: "rois" 10 | input_shape { 11 | dim: 1 # to be changed on-the-fly to num ROIs 12 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | param { 20 | lr_mult: 0 21 | decay_mult: 0 22 | } 23 | param { 24 | lr_mult: 0 25 | decay_mult: 0 26 | } 27 | convolution_param { 28 | num_output: 96 29 | kernel_size: 11 30 | pad: 5 31 | stride: 4 32 | } 33 | } 34 | layer { 35 | name: "relu1" 36 | type: "ReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | 
layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 3 48 | pad: 1 49 | stride: 2 50 | } 51 | } 52 | layer { 53 | name: "norm1" 54 | type: "LRN" 55 | bottom: "pool1" 56 | top: "norm1" 57 | lrn_param { 58 | local_size: 5 59 | alpha: 0.0001 60 | beta: 0.75 61 | } 62 | } 63 | layer { 64 | name: "conv2" 65 | type: "Convolution" 66 | bottom: "norm1" 67 | top: "conv2" 68 | param { 69 | lr_mult: 1 70 | decay_mult: 1 71 | } 72 | param { 73 | lr_mult: 2 74 | decay_mult: 0 75 | } 76 | convolution_param { 77 | num_output: 256 78 | kernel_size: 5 79 | pad: 2 80 | group: 2 81 | } 82 | } 83 | layer { 84 | name: "relu2" 85 | type: "ReLU" 86 | bottom: "conv2" 87 | top: "conv2" 88 | } 89 | layer { 90 | name: "pool2" 91 | type: "Pooling" 92 | bottom: "conv2" 93 | top: "pool2" 94 | pooling_param { 95 | pool: MAX 96 | kernel_size: 3 97 | pad: 1 98 | stride: 2 99 | } 100 | } 101 | layer { 102 | name: "norm2" 103 | type: "LRN" 104 | bottom: "pool2" 105 | top: "norm2" 106 | lrn_param { 107 | local_size: 5 108 | alpha: 0.0001 109 | beta: 0.75 110 | } 111 | } 112 | layer { 113 | name: "conv3" 114 | type: "Convolution" 115 | bottom: "norm2" 116 | top: "conv3" 117 | param { 118 | lr_mult: 1 119 | decay_mult: 1 120 | } 121 | param { 122 | lr_mult: 2 123 | decay_mult: 0 124 | } 125 | convolution_param { 126 | num_output: 384 127 | kernel_size: 3 128 | pad: 1 129 | } 130 | } 131 | layer { 132 | name: "relu3" 133 | type: "ReLU" 134 | bottom: "conv3" 135 | top: "conv3" 136 | } 137 | layer { 138 | name: "conv4" 139 | type: "Convolution" 140 | bottom: "conv3" 141 | top: "conv4" 142 | param { 143 | lr_mult: 1 144 | decay_mult: 1 145 | } 146 | param { 147 | lr_mult: 2 148 | decay_mult: 0 149 | } 150 | convolution_param { 151 | num_output: 384 152 | kernel_size: 3 153 | pad: 1 154 | group: 2 155 | } 156 | } 157 | layer { 158 | name: "relu4" 159 | type: "ReLU" 160 | bottom: "conv4" 161 | top: "conv4" 162 | } 163 | 
layer { 164 | name: "conv5" 165 | type: "Convolution" 166 | bottom: "conv4" 167 | top: "conv5" 168 | param { 169 | lr_mult: 1 170 | decay_mult: 1 171 | } 172 | param { 173 | lr_mult: 2 174 | decay_mult: 0 175 | } 176 | convolution_param { 177 | num_output: 256 178 | kernel_size: 3 179 | pad: 1 180 | group: 2 181 | } 182 | } 183 | layer { 184 | name: "relu5" 185 | type: "ReLU" 186 | bottom: "conv5" 187 | top: "conv5" 188 | } 189 | layer { 190 | name: "roi_pool5" 191 | type: "ROIPooling" 192 | bottom: "conv5" 193 | bottom: "rois" 194 | top: "pool5" 195 | roi_pooling_param { 196 | pooled_w: 6 197 | pooled_h: 6 198 | spatial_scale: 0.0625 # 1/16 199 | } 200 | } 201 | layer { 202 | name: "fc6" 203 | type: "InnerProduct" 204 | bottom: "pool5" 205 | top: "fc6" 206 | param { 207 | lr_mult: 1 208 | decay_mult: 1 209 | } 210 | param { 211 | lr_mult: 2 212 | decay_mult: 0 213 | } 214 | inner_product_param { 215 | num_output: 4096 216 | } 217 | } 218 | layer { 219 | name: "relu6" 220 | type: "ReLU" 221 | bottom: "fc6" 222 | top: "fc6" 223 | } 224 | layer { 225 | name: "drop6" 226 | type: "Dropout" 227 | bottom: "fc6" 228 | top: "fc6" 229 | dropout_param { 230 | dropout_ratio: 0.5 231 | } 232 | } 233 | layer { 234 | name: "fc7" 235 | type: "InnerProduct" 236 | bottom: "fc6" 237 | top: "fc7" 238 | param { 239 | lr_mult: 1 240 | decay_mult: 1 241 | } 242 | param { 243 | lr_mult: 2 244 | decay_mult: 0 245 | } 246 | inner_product_param { 247 | num_output: 4096 248 | } 249 | } 250 | layer { 251 | name: "relu7" 252 | type: "ReLU" 253 | bottom: "fc7" 254 | top: "fc7" 255 | } 256 | layer { 257 | name: "drop7" 258 | type: "Dropout" 259 | bottom: "fc7" 260 | top: "fc7" 261 | dropout_param { 262 | dropout_ratio: 0.5 263 | } 264 | } 265 | layer { 266 | name: "cls_score" 267 | type: "InnerProduct" 268 | bottom: "fc7" 269 | top: "cls_score" 270 | param { 271 | lr_mult: 1 272 | decay_mult: 1 273 | } 274 | param { 275 | lr_mult: 2 276 | decay_mult: 0 277 | } 278 | inner_product_param { 279 | 
num_output: 21 280 | weight_filler { 281 | type: "gaussian" 282 | std: 0.01 283 | } 284 | bias_filler { 285 | type: "constant" 286 | value: 0 287 | } 288 | } 289 | } 290 | layer { 291 | name: "cls_prob" 292 | type: "Softmax" 293 | bottom: "cls_score" 294 | top: "cls_prob" 295 | } 296 | -------------------------------------------------------------------------------- /models/CaffeNet/no_bbox_reg/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "CaffeNet" 2 | layer { 3 | name: 'data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'rois' 7 | top: 'labels' 8 | python_param { 9 | module: 'roi_data_layer.layer' 10 | layer: 'RoIDataLayer' 11 | param_str: "'num_classes': 21" 12 | } 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | param { 20 | lr_mult: 0 21 | decay_mult: 0 22 | } 23 | param { 24 | lr_mult: 0 25 | decay_mult: 0 26 | } 27 | convolution_param { 28 | num_output: 96 29 | kernel_size: 11 30 | pad: 5 31 | stride: 4 32 | } 33 | } 34 | layer { 35 | name: "relu1" 36 | type: "ReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 3 48 | pad: 1 49 | stride: 2 50 | } 51 | } 52 | layer { 53 | name: "norm1" 54 | type: "LRN" 55 | bottom: "pool1" 56 | top: "norm1" 57 | lrn_param { 58 | local_size: 5 59 | alpha: 0.0001 60 | beta: 0.75 61 | } 62 | } 63 | layer { 64 | name: "conv2" 65 | type: "Convolution" 66 | bottom: "norm1" 67 | top: "conv2" 68 | param { 69 | lr_mult: 1 70 | decay_mult: 1 71 | } 72 | param { 73 | lr_mult: 2 74 | decay_mult: 0 75 | } 76 | convolution_param { 77 | num_output: 256 78 | kernel_size: 5 79 | pad: 2 80 | group: 2 81 | } 82 | } 83 | layer { 84 | name: "relu2" 85 | type: "ReLU" 86 | bottom: "conv2" 87 | top: "conv2" 88 | } 89 | layer { 90 | name: "pool2" 91 | type: "Pooling" 92 | bottom: "conv2" 93 
| top: "pool2" 94 | pooling_param { 95 | pool: MAX 96 | kernel_size: 3 97 | pad: 1 98 | stride: 2 99 | } 100 | } 101 | layer { 102 | name: "norm2" 103 | type: "LRN" 104 | bottom: "pool2" 105 | top: "norm2" 106 | lrn_param { 107 | local_size: 5 108 | alpha: 0.0001 109 | beta: 0.75 110 | } 111 | } 112 | layer { 113 | name: "conv3" 114 | type: "Convolution" 115 | bottom: "norm2" 116 | top: "conv3" 117 | param { 118 | lr_mult: 1 119 | decay_mult: 1 120 | } 121 | param { 122 | lr_mult: 2 123 | decay_mult: 0 124 | } 125 | convolution_param { 126 | num_output: 384 127 | kernel_size: 3 128 | pad: 1 129 | } 130 | } 131 | layer { 132 | name: "relu3" 133 | type: "ReLU" 134 | bottom: "conv3" 135 | top: "conv3" 136 | } 137 | layer { 138 | name: "conv4" 139 | type: "Convolution" 140 | bottom: "conv3" 141 | top: "conv4" 142 | param { 143 | lr_mult: 1 144 | decay_mult: 1 145 | } 146 | param { 147 | lr_mult: 2 148 | decay_mult: 0 149 | } 150 | convolution_param { 151 | num_output: 384 152 | kernel_size: 3 153 | pad: 1 154 | group: 2 155 | } 156 | } 157 | layer { 158 | name: "relu4" 159 | type: "ReLU" 160 | bottom: "conv4" 161 | top: "conv4" 162 | } 163 | layer { 164 | name: "conv5" 165 | type: "Convolution" 166 | bottom: "conv4" 167 | top: "conv5" 168 | param { 169 | lr_mult: 1 170 | decay_mult: 1 171 | } 172 | param { 173 | lr_mult: 2 174 | decay_mult: 0 175 | } 176 | convolution_param { 177 | num_output: 256 178 | kernel_size: 3 179 | pad: 1 180 | group: 2 181 | } 182 | } 183 | layer { 184 | name: "relu5" 185 | type: "ReLU" 186 | bottom: "conv5" 187 | top: "conv5" 188 | } 189 | layer { 190 | name: "roi_pool5" 191 | type: "ROIPooling" 192 | bottom: "conv5" 193 | bottom: "rois" 194 | top: "pool5" 195 | roi_pooling_param { 196 | pooled_w: 6 197 | pooled_h: 6 198 | spatial_scale: 0.0625 # 1/16 199 | } 200 | } 201 | layer { 202 | name: "fc6" 203 | type: "InnerProduct" 204 | bottom: "pool5" 205 | top: "fc6" 206 | param { 207 | lr_mult: 1 208 | decay_mult: 1 209 | } 210 | param { 211 | 
lr_mult: 2 212 | decay_mult: 0 213 | } 214 | inner_product_param { 215 | num_output: 4096 216 | } 217 | } 218 | layer { 219 | name: "relu6" 220 | type: "ReLU" 221 | bottom: "fc6" 222 | top: "fc6" 223 | } 224 | layer { 225 | name: "drop6" 226 | type: "Dropout" 227 | bottom: "fc6" 228 | top: "fc6" 229 | dropout_param { 230 | dropout_ratio: 0.5 231 | } 232 | } 233 | layer { 234 | name: "fc7" 235 | type: "InnerProduct" 236 | bottom: "fc6" 237 | top: "fc7" 238 | param { 239 | lr_mult: 1 240 | decay_mult: 1 241 | } 242 | param { 243 | lr_mult: 2 244 | decay_mult: 0 245 | } 246 | inner_product_param { 247 | num_output: 4096 248 | } 249 | } 250 | layer { 251 | name: "relu7" 252 | type: "ReLU" 253 | bottom: "fc7" 254 | top: "fc7" 255 | } 256 | layer { 257 | name: "drop7" 258 | type: "Dropout" 259 | bottom: "fc7" 260 | top: "fc7" 261 | dropout_param { 262 | dropout_ratio: 0.5 263 | } 264 | } 265 | layer { 266 | name: "cls_score" 267 | type: "InnerProduct" 268 | bottom: "fc7" 269 | top: "cls_score" 270 | param { 271 | lr_mult: 1 272 | decay_mult: 1 273 | } 274 | param { 275 | lr_mult: 2 276 | decay_mult: 0 277 | } 278 | inner_product_param { 279 | num_output: 21 280 | weight_filler { 281 | type: "gaussian" 282 | std: 0.01 283 | } 284 | bias_filler { 285 | type: "constant" 286 | value: 0 287 | } 288 | } 289 | } 290 | layer { 291 | name: "loss_cls" 292 | type: "SoftmaxWithLoss" 293 | bottom: "cls_score" 294 | bottom: "labels" 295 | top: "loss_cls" 296 | loss_weight: 1 297 | } 298 | -------------------------------------------------------------------------------- /models/CaffeNet/piecewise/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/CaffeNet/piecewise/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 
11 | # function 12 | snapshot: 0 13 | # We still use the snapshot prefix, though 14 | snapshot_prefix: "caffenet_fast_rcnn" 15 | #debug_info: true 16 | -------------------------------------------------------------------------------- /models/CaffeNet/piecewise/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "CaffeNet" 2 | layer { 3 | name: 'data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'rois' 7 | top: 'labels' 8 | top: 'bbox_targets' 9 | top: 'bbox_loss_weights' 10 | python_param { 11 | module: 'roi_data_layer.layer' 12 | layer: 'RoIDataLayer' 13 | param_str: "'num_classes': 21" 14 | } 15 | } 16 | layer { 17 | name: "conv1" 18 | type: "Convolution" 19 | bottom: "data" 20 | top: "conv1" 21 | param { 22 | lr_mult: 0 23 | decay_mult: 0 24 | } 25 | param { 26 | lr_mult: 0 27 | decay_mult: 0 28 | } 29 | convolution_param { 30 | num_output: 96 31 | kernel_size: 11 32 | pad: 5 33 | stride: 4 34 | } 35 | } 36 | layer { 37 | name: "relu1" 38 | type: "ReLU" 39 | bottom: "conv1" 40 | top: "conv1" 41 | } 42 | layer { 43 | name: "pool1" 44 | type: "Pooling" 45 | bottom: "conv1" 46 | top: "pool1" 47 | pooling_param { 48 | pool: MAX 49 | kernel_size: 3 50 | pad: 1 51 | stride: 2 52 | } 53 | } 54 | layer { 55 | name: "norm1" 56 | type: "LRN" 57 | bottom: "pool1" 58 | top: "norm1" 59 | lrn_param { 60 | local_size: 5 61 | alpha: 0.0001 62 | beta: 0.75 63 | } 64 | } 65 | layer { 66 | name: "conv2" 67 | type: "Convolution" 68 | bottom: "norm1" 69 | top: "conv2" 70 | param { 71 | lr_mult: 0 72 | decay_mult: 0 73 | } 74 | param { 75 | lr_mult: 0 76 | decay_mult: 0 77 | } 78 | convolution_param { 79 | num_output: 256 80 | kernel_size: 5 81 | pad: 2 82 | group: 2 83 | } 84 | } 85 | layer { 86 | name: "relu2" 87 | type: "ReLU" 88 | bottom: "conv2" 89 | top: "conv2" 90 | } 91 | layer { 92 | name: "pool2" 93 | type: "Pooling" 94 | bottom: "conv2" 95 | top: "pool2" 96 | pooling_param { 97 | pool: MAX 98 | kernel_size: 3 99 | pad: 1 
100 | stride: 2 101 | } 102 | } 103 | layer { 104 | name: "norm2" 105 | type: "LRN" 106 | bottom: "pool2" 107 | top: "norm2" 108 | lrn_param { 109 | local_size: 5 110 | alpha: 0.0001 111 | beta: 0.75 112 | } 113 | } 114 | layer { 115 | name: "conv3" 116 | type: "Convolution" 117 | bottom: "norm2" 118 | top: "conv3" 119 | param { 120 | lr_mult: 0 121 | decay_mult: 0 122 | } 123 | param { 124 | lr_mult: 0 125 | decay_mult: 0 126 | } 127 | convolution_param { 128 | num_output: 384 129 | kernel_size: 3 130 | pad: 1 131 | } 132 | } 133 | layer { 134 | name: "relu3" 135 | type: "ReLU" 136 | bottom: "conv3" 137 | top: "conv3" 138 | } 139 | layer { 140 | name: "conv4" 141 | type: "Convolution" 142 | bottom: "conv3" 143 | top: "conv4" 144 | param { 145 | lr_mult: 0 146 | decay_mult: 0 147 | } 148 | param { 149 | lr_mult: 0 150 | decay_mult: 0 151 | } 152 | convolution_param { 153 | num_output: 384 154 | kernel_size: 3 155 | pad: 1 156 | group: 2 157 | } 158 | } 159 | layer { 160 | name: "relu4" 161 | type: "ReLU" 162 | bottom: "conv4" 163 | top: "conv4" 164 | } 165 | layer { 166 | name: "conv5" 167 | type: "Convolution" 168 | bottom: "conv4" 169 | top: "conv5" 170 | param { 171 | lr_mult: 0 172 | decay_mult: 0 173 | } 174 | param { 175 | lr_mult: 0 176 | decay_mult: 0 177 | } 178 | convolution_param { 179 | num_output: 256 180 | kernel_size: 3 181 | pad: 1 182 | group: 2 183 | } 184 | } 185 | layer { 186 | name: "relu5" 187 | type: "ReLU" 188 | bottom: "conv5" 189 | top: "conv5" 190 | } 191 | layer { 192 | name: "roi_pool5" 193 | type: "ROIPooling" 194 | bottom: "conv5" 195 | bottom: "rois" 196 | top: "pool5" 197 | roi_pooling_param { 198 | pooled_w: 6 199 | pooled_h: 6 200 | spatial_scale: 0.0625 # 1/16 201 | } 202 | } 203 | layer { 204 | name: "fc6" 205 | type: "InnerProduct" 206 | bottom: "pool5" 207 | top: "fc6" 208 | param { 209 | lr_mult: 0 210 | decay_mult: 0 211 | } 212 | param { 213 | lr_mult: 0 214 | decay_mult: 0 215 | } 216 | inner_product_param { 217 | 
num_output: 4096 218 | } 219 | } 220 | layer { 221 | name: "relu6" 222 | type: "ReLU" 223 | bottom: "fc6" 224 | top: "fc6" 225 | } 226 | layer { 227 | name: "drop6" 228 | type: "Dropout" 229 | bottom: "fc6" 230 | top: "fc6" 231 | dropout_param { 232 | dropout_ratio: 0.5 233 | } 234 | } 235 | layer { 236 | name: "fc7" 237 | type: "InnerProduct" 238 | bottom: "fc6" 239 | top: "fc7" 240 | param { 241 | lr_mult: 0 242 | decay_mult: 0 243 | } 244 | param { 245 | lr_mult: 0 246 | decay_mult: 0 247 | } 248 | inner_product_param { 249 | num_output: 4096 250 | } 251 | } 252 | layer { 253 | name: "relu7" 254 | type: "ReLU" 255 | bottom: "fc7" 256 | top: "fc7" 257 | } 258 | layer { 259 | name: "drop7" 260 | type: "Dropout" 261 | bottom: "fc7" 262 | top: "fc7" 263 | dropout_param { 264 | dropout_ratio: 0.5 265 | } 266 | } 267 | layer { 268 | name: "cls_score" 269 | type: "InnerProduct" 270 | bottom: "fc7" 271 | top: "cls_score" 272 | param { 273 | lr_mult: 0 274 | decay_mult: 0 275 | } 276 | param { 277 | lr_mult: 0 278 | decay_mult: 0 279 | } 280 | inner_product_param { 281 | num_output: 21 282 | weight_filler { 283 | type: "gaussian" 284 | std: 0.01 285 | } 286 | bias_filler { 287 | type: "constant" 288 | value: 0 289 | } 290 | } 291 | } 292 | layer { 293 | name: "bbox_pred" 294 | type: "InnerProduct" 295 | bottom: "fc7" 296 | top: "bbox_pred" 297 | param { 298 | lr_mult: 1 299 | decay_mult: 1 300 | } 301 | param { 302 | lr_mult: 2 303 | decay_mult: 0 304 | } 305 | inner_product_param { 306 | num_output: 84 307 | weight_filler { 308 | type: "gaussian" 309 | std: 0.001 310 | } 311 | bias_filler { 312 | type: "constant" 313 | value: 0 314 | } 315 | } 316 | } 317 | layer { 318 | name: "loss_cls" 319 | type: "SoftmaxWithLoss" 320 | bottom: "cls_score" 321 | bottom: "labels" 322 | top: "loss_cls" 323 | loss_weight: 0 324 | } 325 | layer { 326 | name: "loss_bbox" 327 | type: "SmoothL1Loss" 328 | bottom: "bbox_pred" 329 | bottom: "bbox_targets" 330 | bottom: "bbox_loss_weights" 331 
| top: "loss_bbox" 332 | loss_weight: 1 333 | } 334 | -------------------------------------------------------------------------------- /models/CaffeNet/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/CaffeNet/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 11 | # function 12 | snapshot: 0 13 | # We still use the snapshot prefix, though 14 | snapshot_prefix: "caffenet_fast_rcnn" 15 | #debug_info: true 16 | -------------------------------------------------------------------------------- /models/CaffeNet/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "CaffeNet" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 227 7 | dim: 227 8 | } 9 | input: "rois" 10 | input_shape { 11 | dim: 1 # to be changed on-the-fly to num ROIs 12 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | param { 20 | lr_mult: 0 21 | decay_mult: 0 22 | } 23 | param { 24 | lr_mult: 0 25 | decay_mult: 0 26 | } 27 | convolution_param { 28 | num_output: 96 29 | kernel_size: 11 30 | pad: 5 31 | stride: 4 32 | } 33 | } 34 | layer { 35 | name: "relu1" 36 | type: "ReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 3 48 | pad: 1 49 | stride: 2 50 | } 51 | } 52 | layer { 53 | name: "norm1" 54 | type: "LRN" 55 | bottom: "pool1" 56 | top: "norm1" 57 | lrn_param { 58 | local_size: 5 59 | alpha: 0.0001 60 | beta: 0.75 61 | } 62 | } 63 | layer { 64 | name: "conv2" 65 | type: "Convolution" 66 | bottom: 
"norm1" 67 | top: "conv2" 68 | param { 69 | lr_mult: 1 70 | decay_mult: 1 71 | } 72 | param { 73 | lr_mult: 2 74 | decay_mult: 0 75 | } 76 | convolution_param { 77 | num_output: 256 78 | kernel_size: 5 79 | pad: 2 80 | group: 2 81 | } 82 | } 83 | layer { 84 | name: "relu2" 85 | type: "ReLU" 86 | bottom: "conv2" 87 | top: "conv2" 88 | } 89 | layer { 90 | name: "pool2" 91 | type: "Pooling" 92 | bottom: "conv2" 93 | top: "pool2" 94 | pooling_param { 95 | pool: MAX 96 | kernel_size: 3 97 | pad: 1 98 | stride: 2 99 | } 100 | } 101 | layer { 102 | name: "norm2" 103 | type: "LRN" 104 | bottom: "pool2" 105 | top: "norm2" 106 | lrn_param { 107 | local_size: 5 108 | alpha: 0.0001 109 | beta: 0.75 110 | } 111 | } 112 | layer { 113 | name: "conv3" 114 | type: "Convolution" 115 | bottom: "norm2" 116 | top: "conv3" 117 | param { 118 | lr_mult: 1 119 | decay_mult: 1 120 | } 121 | param { 122 | lr_mult: 2 123 | decay_mult: 0 124 | } 125 | convolution_param { 126 | num_output: 384 127 | kernel_size: 3 128 | pad: 1 129 | } 130 | } 131 | layer { 132 | name: "relu3" 133 | type: "ReLU" 134 | bottom: "conv3" 135 | top: "conv3" 136 | } 137 | layer { 138 | name: "conv4" 139 | type: "Convolution" 140 | bottom: "conv3" 141 | top: "conv4" 142 | param { 143 | lr_mult: 1 144 | decay_mult: 1 145 | } 146 | param { 147 | lr_mult: 2 148 | decay_mult: 0 149 | } 150 | convolution_param { 151 | num_output: 384 152 | kernel_size: 3 153 | pad: 1 154 | group: 2 155 | } 156 | } 157 | layer { 158 | name: "relu4" 159 | type: "ReLU" 160 | bottom: "conv4" 161 | top: "conv4" 162 | } 163 | layer { 164 | name: "conv5" 165 | type: "Convolution" 166 | bottom: "conv4" 167 | top: "conv5" 168 | param { 169 | lr_mult: 1 170 | decay_mult: 1 171 | } 172 | param { 173 | lr_mult: 2 174 | decay_mult: 0 175 | } 176 | convolution_param { 177 | num_output: 256 178 | kernel_size: 3 179 | pad: 1 180 | group: 2 181 | } 182 | } 183 | layer { 184 | name: "relu5" 185 | type: "ReLU" 186 | bottom: "conv5" 187 | top: "conv5" 188 | } 
189 | layer { 190 | name: "roi_pool5" 191 | type: "ROIPooling" 192 | bottom: "conv5" 193 | bottom: "rois" 194 | top: "pool5" 195 | roi_pooling_param { 196 | pooled_w: 6 197 | pooled_h: 6 198 | spatial_scale: 0.0625 # 1/16 199 | } 200 | } 201 | layer { 202 | name: "fc6" 203 | type: "InnerProduct" 204 | bottom: "pool5" 205 | top: "fc6" 206 | param { 207 | lr_mult: 1 208 | decay_mult: 1 209 | } 210 | param { 211 | lr_mult: 2 212 | decay_mult: 0 213 | } 214 | inner_product_param { 215 | num_output: 4096 216 | } 217 | } 218 | layer { 219 | name: "relu6" 220 | type: "ReLU" 221 | bottom: "fc6" 222 | top: "fc6" 223 | } 224 | layer { 225 | name: "drop6" 226 | type: "Dropout" 227 | bottom: "fc6" 228 | top: "fc6" 229 | dropout_param { 230 | dropout_ratio: 0.5 231 | } 232 | } 233 | layer { 234 | name: "fc7" 235 | type: "InnerProduct" 236 | bottom: "fc6" 237 | top: "fc7" 238 | param { 239 | lr_mult: 1 240 | decay_mult: 1 241 | } 242 | param { 243 | lr_mult: 2 244 | decay_mult: 0 245 | } 246 | inner_product_param { 247 | num_output: 4096 248 | } 249 | } 250 | layer { 251 | name: "relu7" 252 | type: "ReLU" 253 | bottom: "fc7" 254 | top: "fc7" 255 | } 256 | layer { 257 | name: "drop7" 258 | type: "Dropout" 259 | bottom: "fc7" 260 | top: "fc7" 261 | dropout_param { 262 | dropout_ratio: 0.5 263 | } 264 | } 265 | layer { 266 | name: "cls_score" 267 | type: "InnerProduct" 268 | bottom: "fc7" 269 | top: "cls_score" 270 | param { 271 | lr_mult: 1 272 | decay_mult: 1 273 | } 274 | param { 275 | lr_mult: 2 276 | decay_mult: 0 277 | } 278 | inner_product_param { 279 | num_output: 21 280 | weight_filler { 281 | type: "gaussian" 282 | std: 0.01 283 | } 284 | bias_filler { 285 | type: "constant" 286 | value: 0 287 | } 288 | } 289 | } 290 | layer { 291 | name: "bbox_pred" 292 | type: "InnerProduct" 293 | bottom: "fc7" 294 | top: "bbox_pred" 295 | param { 296 | lr_mult: 1 297 | decay_mult: 1 298 | } 299 | param { 300 | lr_mult: 2 301 | decay_mult: 0 302 | } 303 | inner_product_param { 304 | 
num_output: 84 305 | weight_filler { 306 | type: "gaussian" 307 | std: 0.001 308 | } 309 | bias_filler { 310 | type: "constant" 311 | value: 0 312 | } 313 | } 314 | } 315 | layer { 316 | name: "cls_prob" 317 | type: "Softmax" 318 | bottom: "cls_score" 319 | top: "cls_prob" 320 | } 321 | -------------------------------------------------------------------------------- /models/CaffeNet/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "CaffeNet" 2 | layer { 3 | name: 'data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'rois' 7 | top: 'labels' 8 | top: 'bbox_targets' 9 | top: 'bbox_loss_weights' 10 | python_param { 11 | module: 'roi_data_layer.layer' 12 | layer: 'RoIDataLayer' 13 | param_str: "'num_classes': 21" 14 | } 15 | } 16 | layer { 17 | name: "conv1" 18 | type: "Convolution" 19 | bottom: "data" 20 | top: "conv1" 21 | param { 22 | lr_mult: 0 23 | decay_mult: 0 24 | } 25 | param { 26 | lr_mult: 0 27 | decay_mult: 0 28 | } 29 | convolution_param { 30 | num_output: 96 31 | kernel_size: 11 32 | pad: 5 33 | stride: 4 34 | } 35 | } 36 | layer { 37 | name: "relu1" 38 | type: "ReLU" 39 | bottom: "conv1" 40 | top: "conv1" 41 | } 42 | layer { 43 | name: "pool1" 44 | type: "Pooling" 45 | bottom: "conv1" 46 | top: "pool1" 47 | pooling_param { 48 | pool: MAX 49 | kernel_size: 3 50 | pad: 1 51 | stride: 2 52 | } 53 | } 54 | layer { 55 | name: "norm1" 56 | type: "LRN" 57 | bottom: "pool1" 58 | top: "norm1" 59 | lrn_param { 60 | local_size: 5 61 | alpha: 0.0001 62 | beta: 0.75 63 | } 64 | } 65 | layer { 66 | name: "conv2" 67 | type: "Convolution" 68 | bottom: "norm1" 69 | top: "conv2" 70 | param { 71 | lr_mult: 1 72 | decay_mult: 1 73 | } 74 | param { 75 | lr_mult: 2 76 | decay_mult: 0 77 | } 78 | convolution_param { 79 | num_output: 256 80 | kernel_size: 5 81 | pad: 2 82 | group: 2 83 | } 84 | } 85 | layer { 86 | name: "relu2" 87 | type: "ReLU" 88 | bottom: "conv2" 89 | top: "conv2" 90 | } 91 | layer { 92 | name: "pool2" 
93 | type: "Pooling" 94 | bottom: "conv2" 95 | top: "pool2" 96 | pooling_param { 97 | pool: MAX 98 | kernel_size: 3 99 | pad: 1 100 | stride: 2 101 | } 102 | } 103 | layer { 104 | name: "norm2" 105 | type: "LRN" 106 | bottom: "pool2" 107 | top: "norm2" 108 | lrn_param { 109 | local_size: 5 110 | alpha: 0.0001 111 | beta: 0.75 112 | } 113 | } 114 | layer { 115 | name: "conv3" 116 | type: "Convolution" 117 | bottom: "norm2" 118 | top: "conv3" 119 | param { 120 | lr_mult: 1 121 | decay_mult: 1 122 | } 123 | param { 124 | lr_mult: 2 125 | decay_mult: 0 126 | } 127 | convolution_param { 128 | num_output: 384 129 | kernel_size: 3 130 | pad: 1 131 | } 132 | } 133 | layer { 134 | name: "relu3" 135 | type: "ReLU" 136 | bottom: "conv3" 137 | top: "conv3" 138 | } 139 | layer { 140 | name: "conv4" 141 | type: "Convolution" 142 | bottom: "conv3" 143 | top: "conv4" 144 | param { 145 | lr_mult: 1 146 | decay_mult: 1 147 | } 148 | param { 149 | lr_mult: 2 150 | decay_mult: 0 151 | } 152 | convolution_param { 153 | num_output: 384 154 | kernel_size: 3 155 | pad: 1 156 | group: 2 157 | } 158 | } 159 | layer { 160 | name: "relu4" 161 | type: "ReLU" 162 | bottom: "conv4" 163 | top: "conv4" 164 | } 165 | layer { 166 | name: "conv5" 167 | type: "Convolution" 168 | bottom: "conv4" 169 | top: "conv5" 170 | param { 171 | lr_mult: 1 172 | decay_mult: 1 173 | } 174 | param { 175 | lr_mult: 2 176 | decay_mult: 0 177 | } 178 | convolution_param { 179 | num_output: 256 180 | kernel_size: 3 181 | pad: 1 182 | group: 2 183 | } 184 | } 185 | layer { 186 | name: "relu5" 187 | type: "ReLU" 188 | bottom: "conv5" 189 | top: "conv5" 190 | } 191 | layer { 192 | name: "roi_pool5" 193 | type: "ROIPooling" 194 | bottom: "conv5" 195 | bottom: "rois" 196 | top: "pool5" 197 | roi_pooling_param { 198 | pooled_w: 6 199 | pooled_h: 6 200 | spatial_scale: 0.0625 # 1/16 201 | } 202 | } 203 | layer { 204 | name: "fc6" 205 | type: "InnerProduct" 206 | bottom: "pool5" 207 | top: "fc6" 208 | param { 209 | lr_mult: 1 
210 | decay_mult: 1 211 | } 212 | param { 213 | lr_mult: 2 214 | decay_mult: 0 215 | } 216 | inner_product_param { 217 | num_output: 4096 218 | } 219 | } 220 | layer { 221 | name: "relu6" 222 | type: "ReLU" 223 | bottom: "fc6" 224 | top: "fc6" 225 | } 226 | layer { 227 | name: "drop6" 228 | type: "Dropout" 229 | bottom: "fc6" 230 | top: "fc6" 231 | dropout_param { 232 | dropout_ratio: 0.5 233 | } 234 | } 235 | layer { 236 | name: "fc7" 237 | type: "InnerProduct" 238 | bottom: "fc6" 239 | top: "fc7" 240 | param { 241 | lr_mult: 1 242 | decay_mult: 1 243 | } 244 | param { 245 | lr_mult: 2 246 | decay_mult: 0 247 | } 248 | inner_product_param { 249 | num_output: 4096 250 | } 251 | } 252 | layer { 253 | name: "relu7" 254 | type: "ReLU" 255 | bottom: "fc7" 256 | top: "fc7" 257 | } 258 | layer { 259 | name: "drop7" 260 | type: "Dropout" 261 | bottom: "fc7" 262 | top: "fc7" 263 | dropout_param { 264 | dropout_ratio: 0.5 265 | } 266 | } 267 | layer { 268 | name: "cls_score" 269 | type: "InnerProduct" 270 | bottom: "fc7" 271 | top: "cls_score" 272 | param { 273 | lr_mult: 1 274 | decay_mult: 1 275 | } 276 | param { 277 | lr_mult: 2 278 | decay_mult: 0 279 | } 280 | inner_product_param { 281 | num_output: 21 282 | weight_filler { 283 | type: "gaussian" 284 | std: 0.01 285 | } 286 | bias_filler { 287 | type: "constant" 288 | value: 0 289 | } 290 | } 291 | } 292 | layer { 293 | name: "bbox_pred" 294 | type: "InnerProduct" 295 | bottom: "fc7" 296 | top: "bbox_pred" 297 | param { 298 | lr_mult: 1 299 | decay_mult: 1 300 | } 301 | param { 302 | lr_mult: 2 303 | decay_mult: 0 304 | } 305 | inner_product_param { 306 | num_output: 84 307 | weight_filler { 308 | type: "gaussian" 309 | std: 0.001 310 | } 311 | bias_filler { 312 | type: "constant" 313 | value: 0 314 | } 315 | } 316 | } 317 | layer { 318 | name: "loss_cls" 319 | type: "SoftmaxWithLoss" 320 | bottom: "cls_score" 321 | bottom: "labels" 322 | top: "loss_cls" 323 | loss_weight: 1 324 | } 325 | layer { 326 | name: 
"loss_bbox" 327 | type: "SmoothL1Loss" 328 | bottom: "bbox_pred" 329 | bottom: "bbox_targets" 330 | bottom: "bbox_loss_weights" 331 | top: "loss_bbox" 332 | loss_weight: 1 333 | } 334 | -------------------------------------------------------------------------------- /models/README.md: -------------------------------------------------------------------------------- 1 | Prototxt files that define models and solvers. 2 | 3 | Three models are defined, with some variations of each to support experiments 4 | in the paper. 5 | - Caffenet (model **S**) 6 | - VGG_CNN_M_1024 (model **M**) 7 | - VGG16 (model **L**) 8 | -------------------------------------------------------------------------------- /models/VGG16/fc_only/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/VGG16/fc_only/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | # iter_size: 1 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | # We still use the snapshot prefix, though 15 | snapshot_prefix: "vgg16_fast_rcnn" 16 | #debug_info: true 17 | -------------------------------------------------------------------------------- /models/VGG16/no_bbox_reg/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/VGG16/no_bbox_reg/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | # iter_size: 1 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | # We still use the snapshot prefix, though 15 | snapshot_prefix: "vgg16_fast_rcnn" 16 | #debug_info: true 17 | 
-------------------------------------------------------------------------------- /models/VGG16/no_bbox_reg/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_ILSVRC_16_layers" 2 | 3 | input: "data" 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 224 8 | dim: 224 9 | } 10 | 11 | input: "rois" 12 | input_shape { 13 | dim: 1 # to be changed on-the-fly to num ROIs 14 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 15 | } 16 | 17 | layer { 18 | name: "conv1_1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1_1" 22 | param { 23 | lr_mult: 0 24 | decay_mult: 0 25 | } 26 | param { 27 | lr_mult: 0 28 | decay_mult: 0 29 | } 30 | convolution_param { 31 | num_output: 64 32 | pad: 1 33 | kernel_size: 3 34 | } 35 | } 36 | layer { 37 | name: "relu1_1" 38 | type: "ReLU" 39 | bottom: "conv1_1" 40 | top: "conv1_1" 41 | } 42 | layer { 43 | name: "conv1_2" 44 | type: "Convolution" 45 | bottom: "conv1_1" 46 | top: "conv1_2" 47 | param { 48 | lr_mult: 0 49 | decay_mult: 0 50 | } 51 | param { 52 | lr_mult: 0 53 | decay_mult: 0 54 | } 55 | convolution_param { 56 | num_output: 64 57 | pad: 1 58 | kernel_size: 3 59 | } 60 | } 61 | layer { 62 | name: "relu1_2" 63 | type: "ReLU" 64 | bottom: "conv1_2" 65 | top: "conv1_2" 66 | } 67 | layer { 68 | name: "pool1" 69 | type: "Pooling" 70 | bottom: "conv1_2" 71 | top: "pool1" 72 | pooling_param { 73 | pool: MAX 74 | kernel_size: 2 75 | stride: 2 76 | } 77 | } 78 | layer { 79 | name: "conv2_1" 80 | type: "Convolution" 81 | bottom: "pool1" 82 | top: "conv2_1" 83 | param { 84 | lr_mult: 0 85 | decay_mult: 0 86 | } 87 | param { 88 | lr_mult: 0 89 | decay_mult: 0 90 | } 91 | convolution_param { 92 | num_output: 128 93 | pad: 1 94 | kernel_size: 3 95 | } 96 | } 97 | layer { 98 | name: "relu2_1" 99 | type: "ReLU" 100 | bottom: "conv2_1" 101 | top: "conv2_1" 102 | } 103 | layer { 104 | name: "conv2_2" 105 | type: "Convolution" 106 | bottom: "conv2_1" 107 | top: "conv2_2" 
108 | param { 109 | lr_mult: 0 110 | decay_mult: 0 111 | } 112 | param { 113 | lr_mult: 0 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 128 118 | pad: 1 119 | kernel_size: 3 120 | } 121 | } 122 | layer { 123 | name: "relu2_2" 124 | type: "ReLU" 125 | bottom: "conv2_2" 126 | top: "conv2_2" 127 | } 128 | layer { 129 | name: "pool2" 130 | type: "Pooling" 131 | bottom: "conv2_2" 132 | top: "pool2" 133 | pooling_param { 134 | pool: MAX 135 | kernel_size: 2 136 | stride: 2 137 | } 138 | } 139 | layer { 140 | name: "conv3_1" 141 | type: "Convolution" 142 | bottom: "pool2" 143 | top: "conv3_1" 144 | param { 145 | lr_mult: 1 146 | decay_mult: 1 147 | } 148 | param { 149 | lr_mult: 2 150 | decay_mult: 0 151 | } 152 | convolution_param { 153 | num_output: 256 154 | pad: 1 155 | kernel_size: 3 156 | } 157 | } 158 | layer { 159 | name: "relu3_1" 160 | type: "ReLU" 161 | bottom: "conv3_1" 162 | top: "conv3_1" 163 | } 164 | layer { 165 | name: "conv3_2" 166 | type: "Convolution" 167 | bottom: "conv3_1" 168 | top: "conv3_2" 169 | param { 170 | lr_mult: 1 171 | decay_mult: 1 172 | } 173 | param { 174 | lr_mult: 2 175 | decay_mult: 0 176 | } 177 | convolution_param { 178 | num_output: 256 179 | pad: 1 180 | kernel_size: 3 181 | } 182 | } 183 | layer { 184 | name: "relu3_2" 185 | type: "ReLU" 186 | bottom: "conv3_2" 187 | top: "conv3_2" 188 | } 189 | layer { 190 | name: "conv3_3" 191 | type: "Convolution" 192 | bottom: "conv3_2" 193 | top: "conv3_3" 194 | param { 195 | lr_mult: 1 196 | decay_mult: 1 197 | } 198 | param { 199 | lr_mult: 2 200 | decay_mult: 0 201 | } 202 | convolution_param { 203 | num_output: 256 204 | pad: 1 205 | kernel_size: 3 206 | } 207 | } 208 | layer { 209 | name: "relu3_3" 210 | type: "ReLU" 211 | bottom: "conv3_3" 212 | top: "conv3_3" 213 | } 214 | layer { 215 | name: "pool3" 216 | type: "Pooling" 217 | bottom: "conv3_3" 218 | top: "pool3" 219 | pooling_param { 220 | pool: MAX 221 | kernel_size: 2 222 | stride: 2 223 | } 224 | } 225 
| layer { 226 | name: "conv4_1" 227 | type: "Convolution" 228 | bottom: "pool3" 229 | top: "conv4_1" 230 | param { 231 | lr_mult: 1 232 | decay_mult: 1 233 | } 234 | param { 235 | lr_mult: 2 236 | decay_mult: 0 237 | } 238 | convolution_param { 239 | num_output: 512 240 | pad: 1 241 | kernel_size: 3 242 | } 243 | } 244 | layer { 245 | name: "relu4_1" 246 | type: "ReLU" 247 | bottom: "conv4_1" 248 | top: "conv4_1" 249 | } 250 | layer { 251 | name: "conv4_2" 252 | type: "Convolution" 253 | bottom: "conv4_1" 254 | top: "conv4_2" 255 | param { 256 | lr_mult: 1 257 | decay_mult: 1 258 | } 259 | param { 260 | lr_mult: 2 261 | decay_mult: 0 262 | } 263 | convolution_param { 264 | num_output: 512 265 | pad: 1 266 | kernel_size: 3 267 | } 268 | } 269 | layer { 270 | name: "relu4_2" 271 | type: "ReLU" 272 | bottom: "conv4_2" 273 | top: "conv4_2" 274 | } 275 | layer { 276 | name: "conv4_3" 277 | type: "Convolution" 278 | bottom: "conv4_2" 279 | top: "conv4_3" 280 | param { 281 | lr_mult: 1 282 | decay_mult: 1 283 | } 284 | param { 285 | lr_mult: 2 286 | decay_mult: 0 287 | } 288 | convolution_param { 289 | num_output: 512 290 | pad: 1 291 | kernel_size: 3 292 | } 293 | } 294 | layer { 295 | name: "relu4_3" 296 | type: "ReLU" 297 | bottom: "conv4_3" 298 | top: "conv4_3" 299 | } 300 | layer { 301 | name: "pool4" 302 | type: "Pooling" 303 | bottom: "conv4_3" 304 | top: "pool4" 305 | pooling_param { 306 | pool: MAX 307 | kernel_size: 2 308 | stride: 2 309 | } 310 | } 311 | layer { 312 | name: "conv5_1" 313 | type: "Convolution" 314 | bottom: "pool4" 315 | top: "conv5_1" 316 | param { 317 | lr_mult: 1 318 | decay_mult: 1 319 | } 320 | param { 321 | lr_mult: 2 322 | decay_mult: 0 323 | } 324 | convolution_param { 325 | num_output: 512 326 | pad: 1 327 | kernel_size: 3 328 | } 329 | } 330 | layer { 331 | name: "relu5_1" 332 | type: "ReLU" 333 | bottom: "conv5_1" 334 | top: "conv5_1" 335 | } 336 | layer { 337 | name: "conv5_2" 338 | type: "Convolution" 339 | bottom: "conv5_1" 340 | 
top: "conv5_2" 341 | param { 342 | lr_mult: 1 343 | decay_mult: 1 344 | } 345 | param { 346 | lr_mult: 2 347 | decay_mult: 0 348 | } 349 | convolution_param { 350 | num_output: 512 351 | pad: 1 352 | kernel_size: 3 353 | } 354 | } 355 | layer { 356 | name: "relu5_2" 357 | type: "ReLU" 358 | bottom: "conv5_2" 359 | top: "conv5_2" 360 | } 361 | layer { 362 | name: "conv5_3" 363 | type: "Convolution" 364 | bottom: "conv5_2" 365 | top: "conv5_3" 366 | param { 367 | lr_mult: 1 368 | decay_mult: 1 369 | } 370 | param { 371 | lr_mult: 2 372 | decay_mult: 0 373 | } 374 | convolution_param { 375 | num_output: 512 376 | pad: 1 377 | kernel_size: 3 378 | } 379 | } 380 | layer { 381 | name: "relu5_3" 382 | type: "ReLU" 383 | bottom: "conv5_3" 384 | top: "conv5_3" 385 | } 386 | layer { 387 | name: "roi_pool5" 388 | type: "ROIPooling" 389 | bottom: "conv5_3" 390 | bottom: "rois" 391 | top: "pool5" 392 | roi_pooling_param { 393 | pooled_w: 7 394 | pooled_h: 7 395 | spatial_scale: 0.0625 # 1/16 396 | } 397 | } 398 | layer { 399 | name: "fc6" 400 | type: "InnerProduct" 401 | bottom: "pool5" 402 | top: "fc6" 403 | param { 404 | lr_mult: 1 405 | decay_mult: 1 406 | } 407 | param { 408 | lr_mult: 2 409 | decay_mult: 0 410 | } 411 | inner_product_param { 412 | num_output: 4096 413 | } 414 | } 415 | layer { 416 | name: "relu6" 417 | type: "ReLU" 418 | bottom: "fc6" 419 | top: "fc6" 420 | } 421 | layer { 422 | name: "drop6" 423 | type: "Dropout" 424 | bottom: "fc6" 425 | top: "fc6" 426 | dropout_param { 427 | dropout_ratio: 0.5 428 | } 429 | } 430 | layer { 431 | name: "fc7" 432 | type: "InnerProduct" 433 | bottom: "fc6" 434 | top: "fc7" 435 | param { 436 | lr_mult: 1 437 | decay_mult: 1 438 | } 439 | param { 440 | lr_mult: 2 441 | decay_mult: 0 442 | } 443 | inner_product_param { 444 | num_output: 4096 445 | } 446 | } 447 | layer { 448 | name: "relu7" 449 | type: "ReLU" 450 | bottom: "fc7" 451 | top: "fc7" 452 | } 453 | layer { 454 | name: "drop7" 455 | type: "Dropout" 456 | bottom: 
"fc7" 457 | top: "fc7" 458 | dropout_param { 459 | dropout_ratio: 0.5 460 | } 461 | } 462 | layer { 463 | name: "cls_score" 464 | type: "InnerProduct" 465 | bottom: "fc7" 466 | top: "cls_score" 467 | param { 468 | lr_mult: 1 469 | decay_mult: 1 470 | } 471 | param { 472 | lr_mult: 2 473 | decay_mult: 0 474 | } 475 | inner_product_param { 476 | num_output: 21 477 | weight_filler { 478 | type: "gaussian" 479 | std: 0.01 480 | } 481 | bias_filler { 482 | type: "constant" 483 | value: 0 484 | } 485 | } 486 | } 487 | layer { 488 | name: "cls_prob" 489 | type: "Softmax" 490 | bottom: "cls_score" 491 | top: "cls_prob" 492 | } 493 | -------------------------------------------------------------------------------- /models/VGG16/no_bbox_reg/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_ILSVRC_16_layers" 2 | layer { 3 | name: 'data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'rois' 7 | top: 'labels' 8 | python_param { 9 | module: 'roi_data_layer.layer' 10 | layer: 'RoIDataLayer' 11 | param_str: "'num_classes': 21" 12 | } 13 | } 14 | layer { 15 | name: "conv1_1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1_1" 19 | param { 20 | lr_mult: 0 21 | decay_mult: 0 22 | } 23 | param { 24 | lr_mult: 0 25 | decay_mult: 0 26 | } 27 | convolution_param { 28 | num_output: 64 29 | pad: 1 30 | kernel_size: 3 31 | } 32 | } 33 | layer { 34 | name: "relu1_1" 35 | type: "ReLU" 36 | bottom: "conv1_1" 37 | top: "conv1_1" 38 | } 39 | layer { 40 | name: "conv1_2" 41 | type: "Convolution" 42 | bottom: "conv1_1" 43 | top: "conv1_2" 44 | param { 45 | lr_mult: 0 46 | decay_mult: 0 47 | } 48 | param { 49 | lr_mult: 0 50 | decay_mult: 0 51 | } 52 | convolution_param { 53 | num_output: 64 54 | pad: 1 55 | kernel_size: 3 56 | } 57 | } 58 | layer { 59 | name: "relu1_2" 60 | type: "ReLU" 61 | bottom: "conv1_2" 62 | top: "conv1_2" 63 | } 64 | layer { 65 | name: "pool1" 66 | type: "Pooling" 67 | bottom: "conv1_2" 68 | top: "pool1" 
69 | pooling_param { 70 | pool: MAX 71 | kernel_size: 2 72 | stride: 2 73 | } 74 | } 75 | layer { 76 | name: "conv2_1" 77 | type: "Convolution" 78 | bottom: "pool1" 79 | top: "conv2_1" 80 | param { 81 | lr_mult: 0 82 | decay_mult: 0 83 | } 84 | param { 85 | lr_mult: 0 86 | decay_mult: 0 87 | } 88 | convolution_param { 89 | num_output: 128 90 | pad: 1 91 | kernel_size: 3 92 | } 93 | } 94 | layer { 95 | name: "relu2_1" 96 | type: "ReLU" 97 | bottom: "conv2_1" 98 | top: "conv2_1" 99 | } 100 | layer { 101 | name: "conv2_2" 102 | type: "Convolution" 103 | bottom: "conv2_1" 104 | top: "conv2_2" 105 | param { 106 | lr_mult: 0 107 | decay_mult: 0 108 | } 109 | param { 110 | lr_mult: 0 111 | decay_mult: 0 112 | } 113 | convolution_param { 114 | num_output: 128 115 | pad: 1 116 | kernel_size: 3 117 | } 118 | } 119 | layer { 120 | name: "relu2_2" 121 | type: "ReLU" 122 | bottom: "conv2_2" 123 | top: "conv2_2" 124 | } 125 | layer { 126 | name: "pool2" 127 | type: "Pooling" 128 | bottom: "conv2_2" 129 | top: "pool2" 130 | pooling_param { 131 | pool: MAX 132 | kernel_size: 2 133 | stride: 2 134 | } 135 | } 136 | layer { 137 | name: "conv3_1" 138 | type: "Convolution" 139 | bottom: "pool2" 140 | top: "conv3_1" 141 | param { 142 | lr_mult: 1 143 | decay_mult: 1 144 | } 145 | param { 146 | lr_mult: 2 147 | decay_mult: 0 148 | } 149 | convolution_param { 150 | num_output: 256 151 | pad: 1 152 | kernel_size: 3 153 | } 154 | } 155 | layer { 156 | name: "relu3_1" 157 | type: "ReLU" 158 | bottom: "conv3_1" 159 | top: "conv3_1" 160 | } 161 | layer { 162 | name: "conv3_2" 163 | type: "Convolution" 164 | bottom: "conv3_1" 165 | top: "conv3_2" 166 | param { 167 | lr_mult: 1 168 | decay_mult: 1 169 | } 170 | param { 171 | lr_mult: 2 172 | decay_mult: 0 173 | } 174 | convolution_param { 175 | num_output: 256 176 | pad: 1 177 | kernel_size: 3 178 | } 179 | } 180 | layer { 181 | name: "relu3_2" 182 | type: "ReLU" 183 | bottom: "conv3_2" 184 | top: "conv3_2" 185 | } 186 | layer { 187 | name: 
"conv3_3" 188 | type: "Convolution" 189 | bottom: "conv3_2" 190 | top: "conv3_3" 191 | param { 192 | lr_mult: 1 193 | decay_mult: 1 194 | } 195 | param { 196 | lr_mult: 2 197 | decay_mult: 0 198 | } 199 | convolution_param { 200 | num_output: 256 201 | pad: 1 202 | kernel_size: 3 203 | } 204 | } 205 | layer { 206 | name: "relu3_3" 207 | type: "ReLU" 208 | bottom: "conv3_3" 209 | top: "conv3_3" 210 | } 211 | layer { 212 | name: "pool3" 213 | type: "Pooling" 214 | bottom: "conv3_3" 215 | top: "pool3" 216 | pooling_param { 217 | pool: MAX 218 | kernel_size: 2 219 | stride: 2 220 | } 221 | } 222 | layer { 223 | name: "conv4_1" 224 | type: "Convolution" 225 | bottom: "pool3" 226 | top: "conv4_1" 227 | param { 228 | lr_mult: 1 229 | decay_mult: 1 230 | } 231 | param { 232 | lr_mult: 2 233 | decay_mult: 0 234 | } 235 | convolution_param { 236 | num_output: 512 237 | pad: 1 238 | kernel_size: 3 239 | } 240 | } 241 | layer { 242 | name: "relu4_1" 243 | type: "ReLU" 244 | bottom: "conv4_1" 245 | top: "conv4_1" 246 | } 247 | layer { 248 | name: "conv4_2" 249 | type: "Convolution" 250 | bottom: "conv4_1" 251 | top: "conv4_2" 252 | param { 253 | lr_mult: 1 254 | decay_mult: 1 255 | } 256 | param { 257 | lr_mult: 2 258 | decay_mult: 0 259 | } 260 | convolution_param { 261 | num_output: 512 262 | pad: 1 263 | kernel_size: 3 264 | } 265 | } 266 | layer { 267 | name: "relu4_2" 268 | type: "ReLU" 269 | bottom: "conv4_2" 270 | top: "conv4_2" 271 | } 272 | layer { 273 | name: "conv4_3" 274 | type: "Convolution" 275 | bottom: "conv4_2" 276 | top: "conv4_3" 277 | param { 278 | lr_mult: 1 279 | decay_mult: 1 280 | } 281 | param { 282 | lr_mult: 2 283 | decay_mult: 0 284 | } 285 | convolution_param { 286 | num_output: 512 287 | pad: 1 288 | kernel_size: 3 289 | } 290 | } 291 | layer { 292 | name: "relu4_3" 293 | type: "ReLU" 294 | bottom: "conv4_3" 295 | top: "conv4_3" 296 | } 297 | layer { 298 | name: "pool4" 299 | type: "Pooling" 300 | bottom: "conv4_3" 301 | top: "pool4" 302 | 
pooling_param { 303 | pool: MAX 304 | kernel_size: 2 305 | stride: 2 306 | } 307 | } 308 | layer { 309 | name: "conv5_1" 310 | type: "Convolution" 311 | bottom: "pool4" 312 | top: "conv5_1" 313 | param { 314 | lr_mult: 1 315 | decay_mult: 1 316 | } 317 | param { 318 | lr_mult: 2 319 | decay_mult: 0 320 | } 321 | convolution_param { 322 | num_output: 512 323 | pad: 1 324 | kernel_size: 3 325 | } 326 | } 327 | layer { 328 | name: "relu5_1" 329 | type: "ReLU" 330 | bottom: "conv5_1" 331 | top: "conv5_1" 332 | } 333 | layer { 334 | name: "conv5_2" 335 | type: "Convolution" 336 | bottom: "conv5_1" 337 | top: "conv5_2" 338 | param { 339 | lr_mult: 1 340 | decay_mult: 1 341 | } 342 | param { 343 | lr_mult: 2 344 | decay_mult: 0 345 | } 346 | convolution_param { 347 | num_output: 512 348 | pad: 1 349 | kernel_size: 3 350 | } 351 | } 352 | layer { 353 | name: "relu5_2" 354 | type: "ReLU" 355 | bottom: "conv5_2" 356 | top: "conv5_2" 357 | } 358 | layer { 359 | name: "conv5_3" 360 | type: "Convolution" 361 | bottom: "conv5_2" 362 | top: "conv5_3" 363 | param { 364 | lr_mult: 1 365 | decay_mult: 1 366 | } 367 | param { 368 | lr_mult: 2 369 | decay_mult: 0 370 | } 371 | convolution_param { 372 | num_output: 512 373 | pad: 1 374 | kernel_size: 3 375 | } 376 | } 377 | layer { 378 | name: "relu5_3" 379 | type: "ReLU" 380 | bottom: "conv5_3" 381 | top: "conv5_3" 382 | } 383 | layer { 384 | name: "roi_pool5" 385 | type: "ROIPooling" 386 | bottom: "conv5_3" 387 | bottom: "rois" 388 | top: "pool5" 389 | roi_pooling_param { 390 | pooled_w: 7 391 | pooled_h: 7 392 | spatial_scale: 0.0625 # 1/16 393 | } 394 | } 395 | layer { 396 | name: "fc6" 397 | type: "InnerProduct" 398 | bottom: "pool5" 399 | top: "fc6" 400 | param { 401 | lr_mult: 1 402 | decay_mult: 1 403 | } 404 | param { 405 | lr_mult: 2 406 | decay_mult: 0 407 | } 408 | inner_product_param { 409 | num_output: 4096 410 | } 411 | } 412 | layer { 413 | name: "relu6" 414 | type: "ReLU" 415 | bottom: "fc6" 416 | top: "fc6" 417 | } 
418 | layer { 419 | name: "drop6" 420 | type: "Dropout" 421 | bottom: "fc6" 422 | top: "fc6" 423 | dropout_param { 424 | dropout_ratio: 0.5 425 | } 426 | } 427 | layer { 428 | name: "fc7" 429 | type: "InnerProduct" 430 | bottom: "fc6" 431 | top: "fc7" 432 | param { 433 | lr_mult: 1 434 | decay_mult: 1 435 | } 436 | param { 437 | lr_mult: 2 438 | decay_mult: 0 439 | } 440 | inner_product_param { 441 | num_output: 4096 442 | } 443 | } 444 | layer { 445 | name: "relu7" 446 | type: "ReLU" 447 | bottom: "fc7" 448 | top: "fc7" 449 | } 450 | layer { 451 | name: "drop7" 452 | type: "Dropout" 453 | bottom: "fc7" 454 | top: "fc7" 455 | dropout_param { 456 | dropout_ratio: 0.5 457 | } 458 | } 459 | layer { 460 | name: "cls_score" 461 | type: "InnerProduct" 462 | bottom: "fc7" 463 | top: "cls_score" 464 | param { 465 | lr_mult: 1 466 | decay_mult: 1 467 | } 468 | param { 469 | lr_mult: 2 470 | decay_mult: 0 471 | } 472 | inner_product_param { 473 | num_output: 21 474 | weight_filler { 475 | type: "gaussian" 476 | std: 0.01 477 | } 478 | bias_filler { 479 | type: "constant" 480 | value: 0 481 | } 482 | } 483 | } 484 | layer { 485 | name: "loss_cls" 486 | type: "SoftmaxWithLoss" 487 | bottom: "cls_score" 488 | bottom: "labels" 489 | top: "loss_cls" 490 | loss_weight: 1 491 | } 492 | -------------------------------------------------------------------------------- /models/VGG16/piecewise/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/VGG16/piecewise/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | # iter_size: 1 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | # We still use the snapshot prefix, though 15 | snapshot_prefix: "vgg16_fast_rcnn" 16 | #debug_info: true 17 | 
-------------------------------------------------------------------------------- /models/VGG16/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/VGG16/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | # iter_size: 1 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | # We still use the snapshot prefix, though 15 | snapshot_prefix: "vgg16_fast_rcnn" 16 | #debug_info: true 17 | -------------------------------------------------------------------------------- /models/VGG16/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_ILSVRC_16_layers" 2 | 3 | input: "data" 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 224 8 | dim: 224 9 | } 10 | 11 | input: "rois" 12 | input_shape { 13 | dim: 1 # to be changed on-the-fly to num ROIs 14 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 15 | } 16 | 17 | layer { 18 | name: "conv1_1" 19 | type: "Convolution" 20 | bottom: "data" 21 | top: "conv1_1" 22 | param { 23 | lr_mult: 0 24 | decay_mult: 0 25 | } 26 | param { 27 | lr_mult: 0 28 | decay_mult: 0 29 | } 30 | convolution_param { 31 | num_output: 64 32 | pad: 1 33 | kernel_size: 3 34 | } 35 | } 36 | layer { 37 | name: "relu1_1" 38 | type: "ReLU" 39 | bottom: "conv1_1" 40 | top: "conv1_1" 41 | } 42 | layer { 43 | name: "conv1_2" 44 | type: "Convolution" 45 | bottom: "conv1_1" 46 | top: "conv1_2" 47 | param { 48 | lr_mult: 0 49 | decay_mult: 0 50 | } 51 | param { 52 | lr_mult: 0 53 | decay_mult: 0 54 | } 55 | convolution_param { 56 | num_output: 64 57 | pad: 1 58 | kernel_size: 3 59 | } 60 | } 61 | layer { 62 | name: "relu1_2" 63 | type: "ReLU" 64 | bottom: "conv1_2" 65 | top: "conv1_2" 66 | } 67 | layer { 68 | name: "pool1" 69 | type: "Pooling" 70 | 
bottom: "conv1_2" 71 | top: "pool1" 72 | pooling_param { 73 | pool: MAX 74 | kernel_size: 2 75 | stride: 2 76 | } 77 | } 78 | layer { 79 | name: "conv2_1" 80 | type: "Convolution" 81 | bottom: "pool1" 82 | top: "conv2_1" 83 | param { 84 | lr_mult: 0 85 | decay_mult: 0 86 | } 87 | param { 88 | lr_mult: 0 89 | decay_mult: 0 90 | } 91 | convolution_param { 92 | num_output: 128 93 | pad: 1 94 | kernel_size: 3 95 | } 96 | } 97 | layer { 98 | name: "relu2_1" 99 | type: "ReLU" 100 | bottom: "conv2_1" 101 | top: "conv2_1" 102 | } 103 | layer { 104 | name: "conv2_2" 105 | type: "Convolution" 106 | bottom: "conv2_1" 107 | top: "conv2_2" 108 | param { 109 | lr_mult: 0 110 | decay_mult: 0 111 | } 112 | param { 113 | lr_mult: 0 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 128 118 | pad: 1 119 | kernel_size: 3 120 | } 121 | } 122 | layer { 123 | name: "relu2_2" 124 | type: "ReLU" 125 | bottom: "conv2_2" 126 | top: "conv2_2" 127 | } 128 | layer { 129 | name: "pool2" 130 | type: "Pooling" 131 | bottom: "conv2_2" 132 | top: "pool2" 133 | pooling_param { 134 | pool: MAX 135 | kernel_size: 2 136 | stride: 2 137 | } 138 | } 139 | layer { 140 | name: "conv3_1" 141 | type: "Convolution" 142 | bottom: "pool2" 143 | top: "conv3_1" 144 | param { 145 | lr_mult: 1 146 | decay_mult: 1 147 | } 148 | param { 149 | lr_mult: 2 150 | decay_mult: 0 151 | } 152 | convolution_param { 153 | num_output: 256 154 | pad: 1 155 | kernel_size: 3 156 | } 157 | } 158 | layer { 159 | name: "relu3_1" 160 | type: "ReLU" 161 | bottom: "conv3_1" 162 | top: "conv3_1" 163 | } 164 | layer { 165 | name: "conv3_2" 166 | type: "Convolution" 167 | bottom: "conv3_1" 168 | top: "conv3_2" 169 | param { 170 | lr_mult: 1 171 | decay_mult: 1 172 | } 173 | param { 174 | lr_mult: 2 175 | decay_mult: 0 176 | } 177 | convolution_param { 178 | num_output: 256 179 | pad: 1 180 | kernel_size: 3 181 | } 182 | } 183 | layer { 184 | name: "relu3_2" 185 | type: "ReLU" 186 | bottom: "conv3_2" 187 | top: 
"conv3_2" 188 | } 189 | layer { 190 | name: "conv3_3" 191 | type: "Convolution" 192 | bottom: "conv3_2" 193 | top: "conv3_3" 194 | param { 195 | lr_mult: 1 196 | decay_mult: 1 197 | } 198 | param { 199 | lr_mult: 2 200 | decay_mult: 0 201 | } 202 | convolution_param { 203 | num_output: 256 204 | pad: 1 205 | kernel_size: 3 206 | } 207 | } 208 | layer { 209 | name: "relu3_3" 210 | type: "ReLU" 211 | bottom: "conv3_3" 212 | top: "conv3_3" 213 | } 214 | layer { 215 | name: "pool3" 216 | type: "Pooling" 217 | bottom: "conv3_3" 218 | top: "pool3" 219 | pooling_param { 220 | pool: MAX 221 | kernel_size: 2 222 | stride: 2 223 | } 224 | } 225 | layer { 226 | name: "conv4_1" 227 | type: "Convolution" 228 | bottom: "pool3" 229 | top: "conv4_1" 230 | param { 231 | lr_mult: 1 232 | decay_mult: 1 233 | } 234 | param { 235 | lr_mult: 2 236 | decay_mult: 0 237 | } 238 | convolution_param { 239 | num_output: 512 240 | pad: 1 241 | kernel_size: 3 242 | } 243 | } 244 | layer { 245 | name: "relu4_1" 246 | type: "ReLU" 247 | bottom: "conv4_1" 248 | top: "conv4_1" 249 | } 250 | layer { 251 | name: "conv4_2" 252 | type: "Convolution" 253 | bottom: "conv4_1" 254 | top: "conv4_2" 255 | param { 256 | lr_mult: 1 257 | decay_mult: 1 258 | } 259 | param { 260 | lr_mult: 2 261 | decay_mult: 0 262 | } 263 | convolution_param { 264 | num_output: 512 265 | pad: 1 266 | kernel_size: 3 267 | } 268 | } 269 | layer { 270 | name: "relu4_2" 271 | type: "ReLU" 272 | bottom: "conv4_2" 273 | top: "conv4_2" 274 | } 275 | layer { 276 | name: "conv4_3" 277 | type: "Convolution" 278 | bottom: "conv4_2" 279 | top: "conv4_3" 280 | param { 281 | lr_mult: 1 282 | decay_mult: 1 283 | } 284 | param { 285 | lr_mult: 2 286 | decay_mult: 0 287 | } 288 | convolution_param { 289 | num_output: 512 290 | pad: 1 291 | kernel_size: 3 292 | } 293 | } 294 | layer { 295 | name: "relu4_3" 296 | type: "ReLU" 297 | bottom: "conv4_3" 298 | top: "conv4_3" 299 | } 300 | layer { 301 | name: "pool4" 302 | type: "Pooling" 303 | bottom: 
"conv4_3" 304 | top: "pool4" 305 | pooling_param { 306 | pool: MAX 307 | kernel_size: 2 308 | stride: 2 309 | } 310 | } 311 | layer { 312 | name: "conv5_1" 313 | type: "Convolution" 314 | bottom: "pool4" 315 | top: "conv5_1" 316 | param { 317 | lr_mult: 1 318 | decay_mult: 1 319 | } 320 | param { 321 | lr_mult: 2 322 | decay_mult: 0 323 | } 324 | convolution_param { 325 | num_output: 512 326 | pad: 1 327 | kernel_size: 3 328 | } 329 | } 330 | layer { 331 | name: "relu5_1" 332 | type: "ReLU" 333 | bottom: "conv5_1" 334 | top: "conv5_1" 335 | } 336 | layer { 337 | name: "conv5_2" 338 | type: "Convolution" 339 | bottom: "conv5_1" 340 | top: "conv5_2" 341 | param { 342 | lr_mult: 1 343 | decay_mult: 1 344 | } 345 | param { 346 | lr_mult: 2 347 | decay_mult: 0 348 | } 349 | convolution_param { 350 | num_output: 512 351 | pad: 1 352 | kernel_size: 3 353 | } 354 | } 355 | layer { 356 | name: "relu5_2" 357 | type: "ReLU" 358 | bottom: "conv5_2" 359 | top: "conv5_2" 360 | } 361 | layer { 362 | name: "conv5_3" 363 | type: "Convolution" 364 | bottom: "conv5_2" 365 | top: "conv5_3" 366 | param { 367 | lr_mult: 1 368 | decay_mult: 1 369 | } 370 | param { 371 | lr_mult: 2 372 | decay_mult: 0 373 | } 374 | convolution_param { 375 | num_output: 512 376 | pad: 1 377 | kernel_size: 3 378 | } 379 | } 380 | layer { 381 | name: "relu5_3" 382 | type: "ReLU" 383 | bottom: "conv5_3" 384 | top: "conv5_3" 385 | } 386 | layer { 387 | name: "roi_pool5" 388 | type: "ROIPooling" 389 | bottom: "conv5_3" 390 | bottom: "rois" 391 | top: "pool5" 392 | roi_pooling_param { 393 | pooled_w: 7 394 | pooled_h: 7 395 | spatial_scale: 0.0625 # 1/16 396 | } 397 | } 398 | layer { 399 | name: "fc6" 400 | type: "InnerProduct" 401 | bottom: "pool5" 402 | top: "fc6" 403 | param { 404 | lr_mult: 1 405 | decay_mult: 1 406 | } 407 | param { 408 | lr_mult: 2 409 | decay_mult: 0 410 | } 411 | inner_product_param { 412 | num_output: 4096 413 | } 414 | } 415 | layer { 416 | name: "relu6" 417 | type: "ReLU" 418 | 
bottom: "fc6" 419 | top: "fc6" 420 | } 421 | layer { 422 | name: "drop6" 423 | type: "Dropout" 424 | bottom: "fc6" 425 | top: "fc6" 426 | dropout_param { 427 | dropout_ratio: 0.5 428 | } 429 | } 430 | layer { 431 | name: "fc7" 432 | type: "InnerProduct" 433 | bottom: "fc6" 434 | top: "fc7" 435 | param { 436 | lr_mult: 1 437 | decay_mult: 1 438 | } 439 | param { 440 | lr_mult: 2 441 | decay_mult: 0 442 | } 443 | inner_product_param { 444 | num_output: 4096 445 | } 446 | } 447 | layer { 448 | name: "relu7" 449 | type: "ReLU" 450 | bottom: "fc7" 451 | top: "fc7" 452 | } 453 | layer { 454 | name: "drop7" 455 | type: "Dropout" 456 | bottom: "fc7" 457 | top: "fc7" 458 | dropout_param { 459 | dropout_ratio: 0.5 460 | } 461 | } 462 | layer { 463 | name: "cls_score" 464 | type: "InnerProduct" 465 | bottom: "fc7" 466 | top: "cls_score" 467 | param { 468 | lr_mult: 1 469 | decay_mult: 1 470 | } 471 | param { 472 | lr_mult: 2 473 | decay_mult: 0 474 | } 475 | inner_product_param { 476 | num_output: 21 477 | weight_filler { 478 | type: "gaussian" 479 | std: 0.01 480 | } 481 | bias_filler { 482 | type: "constant" 483 | value: 0 484 | } 485 | } 486 | } 487 | layer { 488 | name: "bbox_pred" 489 | type: "InnerProduct" 490 | bottom: "fc7" 491 | top: "bbox_pred" 492 | param { 493 | lr_mult: 1 494 | decay_mult: 1 495 | } 496 | param { 497 | lr_mult: 2 498 | decay_mult: 0 499 | } 500 | inner_product_param { 501 | num_output: 84 502 | weight_filler { 503 | type: "gaussian" 504 | std: 0.001 505 | } 506 | bias_filler { 507 | type: "constant" 508 | value: 0 509 | } 510 | } 511 | } 512 | layer { 513 | name: "cls_prob" 514 | type: "Softmax" 515 | bottom: "cls_score" 516 | top: "cls_prob" 517 | } 518 | -------------------------------------------------------------------------------- /models/VGG_CNN_M_1024/compressed/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 
6 | dim: 224 7 | dim: 224 8 | } 9 | input: "rois" 10 | input_shape { 11 | dim: 1 # to be changed on-the-fly to num ROIs 12 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | param { 20 | lr_mult: 0 21 | decay_mult: 0 22 | } 23 | param { 24 | lr_mult: 0 25 | decay_mult: 0 26 | } 27 | convolution_param { 28 | num_output: 96 29 | kernel_size: 7 30 | stride: 2 31 | } 32 | } 33 | layer { 34 | name: "relu1" 35 | type: "ReLU" 36 | bottom: "conv1" 37 | top: "conv1" 38 | } 39 | layer { 40 | name: "norm1" 41 | type: "LRN" 42 | bottom: "conv1" 43 | top: "norm1" 44 | lrn_param { 45 | local_size: 5 46 | alpha: 0.0005 47 | beta: 0.75 48 | k: 2 49 | } 50 | } 51 | layer { 52 | name: "pool1" 53 | type: "Pooling" 54 | bottom: "norm1" 55 | top: "pool1" 56 | pooling_param { 57 | pool: MAX 58 | kernel_size: 3 59 | stride: 2 60 | } 61 | } 62 | layer { 63 | name: "conv2" 64 | type: "Convolution" 65 | bottom: "pool1" 66 | top: "conv2" 67 | param { 68 | lr_mult: 1 69 | decay_mult: 1 70 | } 71 | param { 72 | lr_mult: 2 73 | decay_mult: 0 74 | } 75 | convolution_param { 76 | num_output: 256 77 | pad: 1 78 | kernel_size: 5 79 | stride: 2 80 | } 81 | } 82 | layer { 83 | name: "relu2" 84 | type: "ReLU" 85 | bottom: "conv2" 86 | top: "conv2" 87 | } 88 | layer { 89 | name: "norm2" 90 | type: "LRN" 91 | bottom: "conv2" 92 | top: "norm2" 93 | lrn_param { 94 | local_size: 5 95 | alpha: 0.0005 96 | beta: 0.75 97 | k: 2 98 | } 99 | } 100 | layer { 101 | name: "pool2" 102 | type: "Pooling" 103 | bottom: "norm2" 104 | top: "pool2" 105 | pooling_param { 106 | pool: MAX 107 | kernel_size: 3 108 | stride: 2 109 | } 110 | } 111 | layer { 112 | name: "conv3" 113 | type: "Convolution" 114 | bottom: "pool2" 115 | top: "conv3" 116 | param { 117 | lr_mult: 1 118 | decay_mult: 1 119 | } 120 | param { 121 | lr_mult: 2 122 | decay_mult: 0 123 | } 124 | convolution_param { 125 | num_output: 512 126 | 
pad: 1 127 | kernel_size: 3 128 | } 129 | } 130 | layer { 131 | name: "relu3" 132 | type: "ReLU" 133 | bottom: "conv3" 134 | top: "conv3" 135 | } 136 | layer { 137 | name: "conv4" 138 | type: "Convolution" 139 | bottom: "conv3" 140 | top: "conv4" 141 | param { 142 | lr_mult: 1 143 | decay_mult: 1 144 | } 145 | param { 146 | lr_mult: 2 147 | decay_mult: 0 148 | } 149 | convolution_param { 150 | num_output: 512 151 | pad: 1 152 | kernel_size: 3 153 | } 154 | } 155 | layer { 156 | name: "relu4" 157 | type: "ReLU" 158 | bottom: "conv4" 159 | top: "conv4" 160 | } 161 | layer { 162 | name: "conv5" 163 | type: "Convolution" 164 | bottom: "conv4" 165 | top: "conv5" 166 | param { 167 | lr_mult: 1 168 | decay_mult: 1 169 | } 170 | param { 171 | lr_mult: 2 172 | decay_mult: 0 173 | } 174 | convolution_param { 175 | num_output: 512 176 | pad: 1 177 | kernel_size: 3 178 | } 179 | } 180 | layer { 181 | name: "relu5" 182 | type: "ReLU" 183 | bottom: "conv5" 184 | top: "conv5" 185 | } 186 | layer { 187 | name: "roi_pool5" 188 | type: "ROIPooling" 189 | bottom: "conv5" 190 | bottom: "rois" 191 | top: "pool5" 192 | roi_pooling_param { 193 | pooled_w: 6 194 | pooled_h: 6 195 | spatial_scale: 0.0625 # 1/16 196 | } 197 | } 198 | layer { 199 | name: "fc6_L" 200 | type: "InnerProduct" 201 | bottom: "pool5" 202 | top: "fc6_L" 203 | param { 204 | lr_mult: 1 205 | decay_mult: 1 206 | } 207 | inner_product_param { 208 | num_output: 1024 209 | bias_term: false 210 | } 211 | } 212 | layer { 213 | name: "fc6_U" 214 | type: "InnerProduct" 215 | bottom: "fc6_L" 216 | top: "fc6_U" 217 | param { 218 | lr_mult: 1 219 | decay_mult: 1 220 | } 221 | param { 222 | lr_mult: 2 223 | decay_mult: 0 224 | } 225 | inner_product_param { 226 | num_output: 4096 227 | } 228 | } 229 | layer { 230 | name: "relu6" 231 | type: "ReLU" 232 | bottom: "fc6_U" 233 | top: "fc6_U" 234 | } 235 | layer { 236 | name: "drop6" 237 | type: "Dropout" 238 | bottom: "fc6_U" 239 | top: "fc6_U" 240 | dropout_param { 241 | 
dropout_ratio: 0.5 242 | } 243 | } 244 | layer { 245 | name: "fc7_L" 246 | type: "InnerProduct" 247 | bottom: "fc6_U" 248 | top: "fc7_L" 249 | param { 250 | lr_mult: 1 251 | decay_mult: 1 252 | } 253 | inner_product_param { 254 | num_output: 256 255 | bias_term: false 256 | } 257 | } 258 | layer { 259 | name: "fc7_U" 260 | type: "InnerProduct" 261 | bottom: "fc7_L" 262 | top: "fc7_U" 263 | param { 264 | lr_mult: 1 265 | decay_mult: 1 266 | } 267 | param { 268 | lr_mult: 2 269 | decay_mult: 0 270 | } 271 | inner_product_param { 272 | num_output: 1024 273 | } 274 | } 275 | layer { 276 | name: "relu7" 277 | type: "ReLU" 278 | bottom: "fc7_U" 279 | top: "fc7_U" 280 | } 281 | layer { 282 | name: "drop7" 283 | type: "Dropout" 284 | bottom: "fc7_U" 285 | top: "fc7_U" 286 | dropout_param { 287 | dropout_ratio: 0.5 288 | } 289 | } 290 | layer { 291 | name: "cls_score" 292 | type: "InnerProduct" 293 | bottom: "fc7_U" 294 | top: "cls_score" 295 | param { 296 | lr_mult: 1 297 | decay_mult: 1 298 | } 299 | param { 300 | lr_mult: 2 301 | decay_mult: 0 302 | } 303 | inner_product_param { 304 | num_output: 21 305 | weight_filler { 306 | type: "gaussian" 307 | std: 0.01 308 | } 309 | bias_filler { 310 | type: "constant" 311 | value: 0 312 | } 313 | } 314 | } 315 | layer { 316 | name: "bbox_pred" 317 | type: "InnerProduct" 318 | bottom: "fc7_U" 319 | top: "bbox_pred" 320 | param { 321 | lr_mult: 1 322 | decay_mult: 1 323 | } 324 | param { 325 | lr_mult: 2 326 | decay_mult: 0 327 | } 328 | inner_product_param { 329 | num_output: 84 330 | weight_filler { 331 | type: "gaussian" 332 | std: 0.001 333 | } 334 | bias_filler { 335 | type: "constant" 336 | value: 0 337 | } 338 | } 339 | } 340 | layer { 341 | name: "cls_prob" 342 | type: "Softmax" 343 | bottom: "cls_score" 344 | top: "cls_prob" 345 | } 346 | -------------------------------------------------------------------------------- /models/VGG_CNN_M_1024/no_bbox_reg/solver.prototxt: 
-------------------------------------------------------------------------------- 1 | train_net: "models/VGG_CNN_M_1024/no_bbox_reg/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 11 | # function 12 | snapshot: 0 13 | # We still use the snapshot prefix, though 14 | snapshot_prefix: "vgg_cnn_m_1024_fast_rcnn" 15 | #debug_info: true 16 | -------------------------------------------------------------------------------- /models/VGG_CNN_M_1024/no_bbox_reg/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 224 7 | dim: 224 8 | } 9 | input: "rois" 10 | input_shape { 11 | dim: 1 # to be changed on-the-fly to num ROIs 12 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | param { 20 | lr_mult: 0 21 | decay_mult: 0 22 | } 23 | param { 24 | lr_mult: 0 25 | decay_mult: 0 26 | } 27 | convolution_param { 28 | num_output: 96 29 | kernel_size: 7 30 | stride: 2 31 | } 32 | } 33 | layer { 34 | name: "relu1" 35 | type: "ReLU" 36 | bottom: "conv1" 37 | top: "conv1" 38 | } 39 | layer { 40 | name: "norm1" 41 | type: "LRN" 42 | bottom: "conv1" 43 | top: "norm1" 44 | lrn_param { 45 | local_size: 5 46 | alpha: 0.0005 47 | beta: 0.75 48 | k: 2 49 | } 50 | } 51 | layer { 52 | name: "pool1" 53 | type: "Pooling" 54 | bottom: "norm1" 55 | top: "pool1" 56 | pooling_param { 57 | pool: MAX 58 | kernel_size: 3 59 | stride: 2 60 | } 61 | } 62 | layer { 63 | name: "conv2" 64 | type: "Convolution" 65 | bottom: "pool1" 66 | top: "conv2" 67 | param { 68 | lr_mult: 1 69 | decay_mult: 1 70 | } 71 | param { 72 | lr_mult: 2 73 | decay_mult: 0 74 | } 75 | 
convolution_param { 76 | num_output: 256 77 | pad: 1 78 | kernel_size: 5 79 | stride: 2 80 | } 81 | } 82 | layer { 83 | name: "relu2" 84 | type: "ReLU" 85 | bottom: "conv2" 86 | top: "conv2" 87 | } 88 | layer { 89 | name: "norm2" 90 | type: "LRN" 91 | bottom: "conv2" 92 | top: "norm2" 93 | lrn_param { 94 | local_size: 5 95 | alpha: 0.0005 96 | beta: 0.75 97 | k: 2 98 | } 99 | } 100 | layer { 101 | name: "pool2" 102 | type: "Pooling" 103 | bottom: "norm2" 104 | top: "pool2" 105 | pooling_param { 106 | pool: MAX 107 | kernel_size: 3 108 | stride: 2 109 | } 110 | } 111 | layer { 112 | name: "conv3" 113 | type: "Convolution" 114 | bottom: "pool2" 115 | top: "conv3" 116 | param { 117 | lr_mult: 1 118 | decay_mult: 1 119 | } 120 | param { 121 | lr_mult: 2 122 | decay_mult: 0 123 | } 124 | convolution_param { 125 | num_output: 512 126 | pad: 1 127 | kernel_size: 3 128 | } 129 | } 130 | layer { 131 | name: "relu3" 132 | type: "ReLU" 133 | bottom: "conv3" 134 | top: "conv3" 135 | } 136 | layer { 137 | name: "conv4" 138 | type: "Convolution" 139 | bottom: "conv3" 140 | top: "conv4" 141 | param { 142 | lr_mult: 1 143 | decay_mult: 1 144 | } 145 | param { 146 | lr_mult: 2 147 | decay_mult: 0 148 | } 149 | convolution_param { 150 | num_output: 512 151 | pad: 1 152 | kernel_size: 3 153 | } 154 | } 155 | layer { 156 | name: "relu4" 157 | type: "ReLU" 158 | bottom: "conv4" 159 | top: "conv4" 160 | } 161 | layer { 162 | name: "conv5" 163 | type: "Convolution" 164 | bottom: "conv4" 165 | top: "conv5" 166 | param { 167 | lr_mult: 1 168 | decay_mult: 1 169 | } 170 | param { 171 | lr_mult: 2 172 | decay_mult: 0 173 | } 174 | convolution_param { 175 | num_output: 512 176 | pad: 1 177 | kernel_size: 3 178 | } 179 | } 180 | layer { 181 | name: "relu5" 182 | type: "ReLU" 183 | bottom: "conv5" 184 | top: "conv5" 185 | } 186 | layer { 187 | name: "roi_pool5" 188 | type: "ROIPooling" 189 | bottom: "conv5" 190 | bottom: "rois" 191 | top: "pool5" 192 | roi_pooling_param { 193 | pooled_w: 6 194 
| pooled_h: 6 195 | spatial_scale: 0.0625 # 1/16 196 | } 197 | } 198 | layer { 199 | name: "fc6" 200 | type: "InnerProduct" 201 | bottom: "pool5" 202 | top: "fc6" 203 | param { 204 | lr_mult: 1 205 | decay_mult: 1 206 | } 207 | param { 208 | lr_mult: 2 209 | decay_mult: 0 210 | } 211 | inner_product_param { 212 | num_output: 4096 213 | } 214 | } 215 | layer { 216 | name: "relu6" 217 | type: "ReLU" 218 | bottom: "fc6" 219 | top: "fc6" 220 | } 221 | layer { 222 | name: "drop6" 223 | type: "Dropout" 224 | bottom: "fc6" 225 | top: "fc6" 226 | dropout_param { 227 | dropout_ratio: 0.5 228 | } 229 | } 230 | layer { 231 | name: "fc7" 232 | type: "InnerProduct" 233 | bottom: "fc6" 234 | top: "fc7" 235 | param { 236 | lr_mult: 1 237 | decay_mult: 1 238 | } 239 | param { 240 | lr_mult: 2 241 | decay_mult: 0 242 | } 243 | inner_product_param { 244 | num_output: 1024 245 | } 246 | } 247 | layer { 248 | name: "relu7" 249 | type: "ReLU" 250 | bottom: "fc7" 251 | top: "fc7" 252 | } 253 | layer { 254 | name: "drop7" 255 | type: "Dropout" 256 | bottom: "fc7" 257 | top: "fc7" 258 | dropout_param { 259 | dropout_ratio: 0.5 260 | } 261 | } 262 | layer { 263 | name: "cls_score" 264 | type: "InnerProduct" 265 | bottom: "fc7" 266 | top: "cls_score" 267 | param { 268 | lr_mult: 1 269 | decay_mult: 1 270 | } 271 | param { 272 | lr_mult: 2 273 | decay_mult: 0 274 | } 275 | inner_product_param { 276 | num_output: 21 277 | weight_filler { 278 | type: "gaussian" 279 | std: 0.01 280 | } 281 | bias_filler { 282 | type: "constant" 283 | value: 0 284 | } 285 | } 286 | } 287 | layer { 288 | name: "cls_prob" 289 | type: "Softmax" 290 | bottom: "cls_score" 291 | top: "cls_prob" 292 | } 293 | -------------------------------------------------------------------------------- /models/VGG_CNN_M_1024/no_bbox_reg/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | layer { 3 | name: 'data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 
'rois' 7 | top: 'labels' 8 | python_param { 9 | module: 'roi_data_layer.layer' 10 | layer: 'RoIDataLayer' 11 | param_str: "'num_classes': 21" 12 | } 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | param { 20 | lr_mult: 0 21 | decay_mult: 0 22 | } 23 | param { 24 | lr_mult: 0 25 | decay_mult: 0 26 | } 27 | convolution_param { 28 | num_output: 96 29 | kernel_size: 7 30 | stride: 2 31 | } 32 | } 33 | layer { 34 | name: "relu1" 35 | type: "ReLU" 36 | bottom: "conv1" 37 | top: "conv1" 38 | } 39 | layer { 40 | name: "norm1" 41 | type: "LRN" 42 | bottom: "conv1" 43 | top: "norm1" 44 | lrn_param { 45 | local_size: 5 46 | alpha: 0.0005 47 | beta: 0.75 48 | k: 2 49 | } 50 | } 51 | layer { 52 | name: "pool1" 53 | type: "Pooling" 54 | bottom: "norm1" 55 | top: "pool1" 56 | pooling_param { 57 | pool: MAX 58 | kernel_size: 3 59 | stride: 2 60 | } 61 | } 62 | layer { 63 | name: "conv2" 64 | type: "Convolution" 65 | bottom: "pool1" 66 | top: "conv2" 67 | param { 68 | lr_mult: 1 69 | decay_mult: 1 70 | } 71 | param { 72 | lr_mult: 2 73 | decay_mult: 0 74 | } 75 | convolution_param { 76 | num_output: 256 77 | pad: 1 78 | kernel_size: 5 79 | stride: 2 80 | } 81 | } 82 | layer { 83 | name: "relu2" 84 | type: "ReLU" 85 | bottom: "conv2" 86 | top: "conv2" 87 | } 88 | layer { 89 | name: "norm2" 90 | type: "LRN" 91 | bottom: "conv2" 92 | top: "norm2" 93 | lrn_param { 94 | local_size: 5 95 | alpha: 0.0005 96 | beta: 0.75 97 | k: 2 98 | } 99 | } 100 | layer { 101 | name: "pool2" 102 | type: "Pooling" 103 | bottom: "norm2" 104 | top: "pool2" 105 | pooling_param { 106 | pool: MAX 107 | kernel_size: 3 108 | stride: 2 109 | } 110 | } 111 | layer { 112 | name: "conv3" 113 | type: "Convolution" 114 | bottom: "pool2" 115 | top: "conv3" 116 | param { 117 | lr_mult: 1 118 | decay_mult: 1 119 | } 120 | param { 121 | lr_mult: 2 122 | decay_mult: 0 123 | } 124 | convolution_param { 125 | num_output: 512 126 | pad: 1 127 | kernel_size: 3 128 | } 
129 | } 130 | layer { 131 | name: "relu3" 132 | type: "ReLU" 133 | bottom: "conv3" 134 | top: "conv3" 135 | } 136 | layer { 137 | name: "conv4" 138 | type: "Convolution" 139 | bottom: "conv3" 140 | top: "conv4" 141 | param { 142 | lr_mult: 1 143 | decay_mult: 1 144 | } 145 | param { 146 | lr_mult: 2 147 | decay_mult: 0 148 | } 149 | convolution_param { 150 | num_output: 512 151 | pad: 1 152 | kernel_size: 3 153 | } 154 | } 155 | layer { 156 | name: "relu4" 157 | type: "ReLU" 158 | bottom: "conv4" 159 | top: "conv4" 160 | } 161 | layer { 162 | name: "conv5" 163 | type: "Convolution" 164 | bottom: "conv4" 165 | top: "conv5" 166 | param { 167 | lr_mult: 1 168 | decay_mult: 1 169 | } 170 | param { 171 | lr_mult: 2 172 | decay_mult: 0 173 | } 174 | convolution_param { 175 | num_output: 512 176 | pad: 1 177 | kernel_size: 3 178 | } 179 | } 180 | layer { 181 | name: "relu5" 182 | type: "ReLU" 183 | bottom: "conv5" 184 | top: "conv5" 185 | } 186 | layer { 187 | name: "roi_pool5" 188 | type: "ROIPooling" 189 | bottom: "conv5" 190 | bottom: "rois" 191 | top: "pool5" 192 | roi_pooling_param { 193 | pooled_w: 6 194 | pooled_h: 6 195 | spatial_scale: 0.0625 # 1/16 196 | } 197 | } 198 | layer { 199 | name: "fc6" 200 | type: "InnerProduct" 201 | bottom: "pool5" 202 | top: "fc6" 203 | param { 204 | lr_mult: 1 205 | decay_mult: 1 206 | } 207 | param { 208 | lr_mult: 2 209 | decay_mult: 0 210 | } 211 | inner_product_param { 212 | num_output: 4096 213 | } 214 | } 215 | layer { 216 | name: "relu6" 217 | type: "ReLU" 218 | bottom: "fc6" 219 | top: "fc6" 220 | } 221 | layer { 222 | name: "drop6" 223 | type: "Dropout" 224 | bottom: "fc6" 225 | top: "fc6" 226 | dropout_param { 227 | dropout_ratio: 0.5 228 | } 229 | } 230 | layer { 231 | name: "fc7" 232 | type: "InnerProduct" 233 | bottom: "fc6" 234 | top: "fc7" 235 | param { 236 | lr_mult: 1 237 | decay_mult: 1 238 | } 239 | param { 240 | lr_mult: 2 241 | decay_mult: 0 242 | } 243 | inner_product_param { 244 | num_output: 1024 245 | } 246 
| } 247 | layer { 248 | name: "relu7" 249 | type: "ReLU" 250 | bottom: "fc7" 251 | top: "fc7" 252 | } 253 | layer { 254 | name: "drop7" 255 | type: "Dropout" 256 | bottom: "fc7" 257 | top: "fc7" 258 | dropout_param { 259 | dropout_ratio: 0.5 260 | } 261 | } 262 | layer { 263 | name: "cls_score" 264 | type: "InnerProduct" 265 | bottom: "fc7" 266 | top: "cls_score" 267 | param { 268 | lr_mult: 1 269 | decay_mult: 1 270 | } 271 | param { 272 | lr_mult: 2 273 | decay_mult: 0 274 | } 275 | inner_product_param { 276 | num_output: 21 277 | weight_filler { 278 | type: "gaussian" 279 | std: 0.01 280 | } 281 | bias_filler { 282 | type: "constant" 283 | value: 0 284 | } 285 | } 286 | } 287 | layer { 288 | name: "loss_cls" 289 | type: "SoftmaxWithLoss" 290 | bottom: "cls_score" 291 | bottom: "labels" 292 | top: "loss_cls" 293 | loss_weight: 1 294 | } 295 | -------------------------------------------------------------------------------- /models/VGG_CNN_M_1024/piecewise/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/VGG_CNN_M_1024/piecewise/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 11 | # function 12 | snapshot: 0 13 | # We still use the snapshot prefix, though 14 | snapshot_prefix: "vgg_cnn_m_1024_fast_rcnn" 15 | #debug_info: true 16 | -------------------------------------------------------------------------------- /models/VGG_CNN_M_1024/piecewise/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | layer { 3 | name: 'data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'rois' 7 | top: 'labels' 8 | top: 'bbox_targets' 9 | top: 'bbox_loss_weights' 10 | python_param { 11 | module: 'roi_data_layer.layer' 12 | layer: 
'RoIDataLayer' 13 | param_str: "'num_classes': 21" 14 | } 15 | } 16 | layer { 17 | name: "conv1" 18 | type: "Convolution" 19 | bottom: "data" 20 | top: "conv1" 21 | param { 22 | lr_mult: 0 23 | decay_mult: 0 24 | } 25 | param { 26 | lr_mult: 0 27 | decay_mult: 0 28 | } 29 | convolution_param { 30 | num_output: 96 31 | kernel_size: 7 32 | stride: 2 33 | } 34 | } 35 | layer { 36 | name: "relu1" 37 | type: "ReLU" 38 | bottom: "conv1" 39 | top: "conv1" 40 | } 41 | layer { 42 | name: "norm1" 43 | type: "LRN" 44 | bottom: "conv1" 45 | top: "norm1" 46 | lrn_param { 47 | local_size: 5 48 | alpha: 0.0005 49 | beta: 0.75 50 | k: 2 51 | } 52 | } 53 | layer { 54 | name: "pool1" 55 | type: "Pooling" 56 | bottom: "norm1" 57 | top: "pool1" 58 | pooling_param { 59 | pool: MAX 60 | kernel_size: 3 61 | stride: 2 62 | } 63 | } 64 | layer { 65 | name: "conv2" 66 | type: "Convolution" 67 | bottom: "pool1" 68 | top: "conv2" 69 | param { 70 | lr_mult: 0 71 | decay_mult: 0 72 | } 73 | param { 74 | lr_mult: 0 75 | decay_mult: 0 76 | } 77 | convolution_param { 78 | num_output: 256 79 | pad: 1 80 | kernel_size: 5 81 | stride: 2 82 | } 83 | } 84 | layer { 85 | name: "relu2" 86 | type: "ReLU" 87 | bottom: "conv2" 88 | top: "conv2" 89 | } 90 | layer { 91 | name: "norm2" 92 | type: "LRN" 93 | bottom: "conv2" 94 | top: "norm2" 95 | lrn_param { 96 | local_size: 5 97 | alpha: 0.0005 98 | beta: 0.75 99 | k: 2 100 | } 101 | } 102 | layer { 103 | name: "pool2" 104 | type: "Pooling" 105 | bottom: "norm2" 106 | top: "pool2" 107 | pooling_param { 108 | pool: MAX 109 | kernel_size: 3 110 | stride: 2 111 | } 112 | } 113 | layer { 114 | name: "conv3" 115 | type: "Convolution" 116 | bottom: "pool2" 117 | top: "conv3" 118 | param { 119 | lr_mult: 0 120 | decay_mult: 0 121 | } 122 | param { 123 | lr_mult: 0 124 | decay_mult: 0 125 | } 126 | convolution_param { 127 | num_output: 512 128 | pad: 1 129 | kernel_size: 3 130 | } 131 | } 132 | layer { 133 | name: "relu3" 134 | type: "ReLU" 135 | bottom: "conv3" 136 | 
top: "conv3" 137 | } 138 | layer { 139 | name: "conv4" 140 | type: "Convolution" 141 | bottom: "conv3" 142 | top: "conv4" 143 | param { 144 | lr_mult: 0 145 | decay_mult: 0 146 | } 147 | param { 148 | lr_mult: 0 149 | decay_mult: 0 150 | } 151 | convolution_param { 152 | num_output: 512 153 | pad: 1 154 | kernel_size: 3 155 | } 156 | } 157 | layer { 158 | name: "relu4" 159 | type: "ReLU" 160 | bottom: "conv4" 161 | top: "conv4" 162 | } 163 | layer { 164 | name: "conv5" 165 | type: "Convolution" 166 | bottom: "conv4" 167 | top: "conv5" 168 | param { 169 | lr_mult: 0 170 | decay_mult: 0 171 | } 172 | param { 173 | lr_mult: 0 174 | decay_mult: 0 175 | } 176 | convolution_param { 177 | num_output: 512 178 | pad: 1 179 | kernel_size: 3 180 | } 181 | } 182 | layer { 183 | name: "relu5" 184 | type: "ReLU" 185 | bottom: "conv5" 186 | top: "conv5" 187 | } 188 | layer { 189 | name: "roi_pool5" 190 | type: "ROIPooling" 191 | bottom: "conv5" 192 | bottom: "rois" 193 | top: "pool5" 194 | roi_pooling_param { 195 | pooled_w: 6 196 | pooled_h: 6 197 | spatial_scale: 0.0625 # 1/16 198 | } 199 | } 200 | layer { 201 | name: "fc6" 202 | type: "InnerProduct" 203 | bottom: "pool5" 204 | top: "fc6" 205 | param { 206 | lr_mult: 0 207 | decay_mult: 0 208 | } 209 | param { 210 | lr_mult: 0 211 | decay_mult: 0 212 | } 213 | inner_product_param { 214 | num_output: 4096 215 | } 216 | } 217 | layer { 218 | name: "relu6" 219 | type: "ReLU" 220 | bottom: "fc6" 221 | top: "fc6" 222 | } 223 | layer { 224 | name: "drop6" 225 | type: "Dropout" 226 | bottom: "fc6" 227 | top: "fc6" 228 | dropout_param { 229 | dropout_ratio: 0.5 230 | } 231 | } 232 | layer { 233 | name: "fc7" 234 | type: "InnerProduct" 235 | bottom: "fc6" 236 | top: "fc7" 237 | param { 238 | lr_mult: 0 239 | decay_mult: 0 240 | } 241 | param { 242 | lr_mult: 0 243 | decay_mult: 0 244 | } 245 | inner_product_param { 246 | num_output: 1024 247 | } 248 | } 249 | layer { 250 | name: "relu7" 251 | type: "ReLU" 252 | bottom: "fc7" 253 | top: 
"fc7" 254 | } 255 | layer { 256 | name: "drop7" 257 | type: "Dropout" 258 | bottom: "fc7" 259 | top: "fc7" 260 | dropout_param { 261 | dropout_ratio: 0.5 262 | } 263 | } 264 | layer { 265 | name: "cls_score" 266 | type: "InnerProduct" 267 | bottom: "fc7" 268 | top: "cls_score" 269 | param { 270 | lr_mult: 0 271 | decay_mult: 0 272 | } 273 | param { 274 | lr_mult: 0 275 | decay_mult: 0 276 | } 277 | inner_product_param { 278 | num_output: 21 279 | weight_filler { 280 | type: "gaussian" 281 | std: 0.01 282 | } 283 | bias_filler { 284 | type: "constant" 285 | value: 0 286 | } 287 | } 288 | } 289 | layer { 290 | name: "bbox_pred" 291 | type: "InnerProduct" 292 | bottom: "fc7" 293 | top: "bbox_pred" 294 | param { 295 | lr_mult: 1 296 | decay_mult: 1 297 | } 298 | param { 299 | lr_mult: 2 300 | decay_mult: 0 301 | } 302 | inner_product_param { 303 | num_output: 84 304 | weight_filler { 305 | type: "gaussian" 306 | std: 0.001 307 | } 308 | bias_filler { 309 | type: "constant" 310 | value: 0 311 | } 312 | } 313 | } 314 | layer { 315 | name: "loss_cls" 316 | type: "SoftmaxWithLoss" 317 | bottom: "cls_score" 318 | bottom: "labels" 319 | top: "loss_cls" 320 | loss_weight: 0 321 | } 322 | layer { 323 | name: "loss_bbox" 324 | type: "SmoothL1Loss" 325 | bottom: "bbox_pred" 326 | bottom: "bbox_targets" 327 | bottom: "bbox_loss_weights" 328 | top: "loss_bbox" 329 | loss_weight: 1 330 | } 331 | -------------------------------------------------------------------------------- /models/VGG_CNN_M_1024/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/VGG_CNN_M_1024/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 30000 6 | display: 20 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 11 | # function 12 | snapshot: 0 13 | # We still use the snapshot prefix, though 14 | 
snapshot_prefix: "vgg_cnn_m_1024_fast_rcnn" 15 | #debug_info: true 16 | -------------------------------------------------------------------------------- /models/VGG_CNN_M_1024/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 224 7 | dim: 224 8 | } 9 | input: "rois" 10 | input_shape { 11 | dim: 1 # to be changed on-the-fly to num ROIs 12 | dim: 5 # [batch ind, x1, y1, x2, y2] zero-based indexing 13 | } 14 | layer { 15 | name: "conv1" 16 | type: "Convolution" 17 | bottom: "data" 18 | top: "conv1" 19 | param { 20 | lr_mult: 0 21 | decay_mult: 0 22 | } 23 | param { 24 | lr_mult: 0 25 | decay_mult: 0 26 | } 27 | convolution_param { 28 | num_output: 96 29 | kernel_size: 7 30 | stride: 2 31 | } 32 | } 33 | layer { 34 | name: "relu1" 35 | type: "ReLU" 36 | bottom: "conv1" 37 | top: "conv1" 38 | } 39 | layer { 40 | name: "norm1" 41 | type: "LRN" 42 | bottom: "conv1" 43 | top: "norm1" 44 | lrn_param { 45 | local_size: 5 46 | alpha: 0.0005 47 | beta: 0.75 48 | k: 2 49 | } 50 | } 51 | layer { 52 | name: "pool1" 53 | type: "Pooling" 54 | bottom: "norm1" 55 | top: "pool1" 56 | pooling_param { 57 | pool: MAX 58 | kernel_size: 3 59 | stride: 2 60 | } 61 | } 62 | layer { 63 | name: "conv2" 64 | type: "Convolution" 65 | bottom: "pool1" 66 | top: "conv2" 67 | param { 68 | lr_mult: 1 69 | decay_mult: 1 70 | } 71 | param { 72 | lr_mult: 2 73 | decay_mult: 0 74 | } 75 | convolution_param { 76 | num_output: 256 77 | pad: 1 78 | kernel_size: 5 79 | stride: 2 80 | } 81 | } 82 | layer { 83 | name: "relu2" 84 | type: "ReLU" 85 | bottom: "conv2" 86 | top: "conv2" 87 | } 88 | layer { 89 | name: "norm2" 90 | type: "LRN" 91 | bottom: "conv2" 92 | top: "norm2" 93 | lrn_param { 94 | local_size: 5 95 | alpha: 0.0005 96 | beta: 0.75 97 | k: 2 98 | } 99 | } 100 | layer { 101 | name: "pool2" 102 | type: "Pooling" 103 | bottom: "norm2" 104 | top: "pool2" 105 | 
pooling_param { 106 | pool: MAX 107 | kernel_size: 3 108 | stride: 2 109 | } 110 | } 111 | layer { 112 | name: "conv3" 113 | type: "Convolution" 114 | bottom: "pool2" 115 | top: "conv3" 116 | param { 117 | lr_mult: 1 118 | decay_mult: 1 119 | } 120 | param { 121 | lr_mult: 2 122 | decay_mult: 0 123 | } 124 | convolution_param { 125 | num_output: 512 126 | pad: 1 127 | kernel_size: 3 128 | } 129 | } 130 | layer { 131 | name: "relu3" 132 | type: "ReLU" 133 | bottom: "conv3" 134 | top: "conv3" 135 | } 136 | layer { 137 | name: "conv4" 138 | type: "Convolution" 139 | bottom: "conv3" 140 | top: "conv4" 141 | param { 142 | lr_mult: 1 143 | decay_mult: 1 144 | } 145 | param { 146 | lr_mult: 2 147 | decay_mult: 0 148 | } 149 | convolution_param { 150 | num_output: 512 151 | pad: 1 152 | kernel_size: 3 153 | } 154 | } 155 | layer { 156 | name: "relu4" 157 | type: "ReLU" 158 | bottom: "conv4" 159 | top: "conv4" 160 | } 161 | layer { 162 | name: "conv5" 163 | type: "Convolution" 164 | bottom: "conv4" 165 | top: "conv5" 166 | param { 167 | lr_mult: 1 168 | decay_mult: 1 169 | } 170 | param { 171 | lr_mult: 2 172 | decay_mult: 0 173 | } 174 | convolution_param { 175 | num_output: 512 176 | pad: 1 177 | kernel_size: 3 178 | } 179 | } 180 | layer { 181 | name: "relu5" 182 | type: "ReLU" 183 | bottom: "conv5" 184 | top: "conv5" 185 | } 186 | layer { 187 | name: "roi_pool5" 188 | type: "ROIPooling" 189 | bottom: "conv5" 190 | bottom: "rois" 191 | top: "pool5" 192 | roi_pooling_param { 193 | pooled_w: 6 194 | pooled_h: 6 195 | spatial_scale: 0.0625 # 1/16 196 | } 197 | } 198 | layer { 199 | name: "fc6" 200 | type: "InnerProduct" 201 | bottom: "pool5" 202 | top: "fc6" 203 | param { 204 | lr_mult: 1 205 | decay_mult: 1 206 | } 207 | param { 208 | lr_mult: 2 209 | decay_mult: 0 210 | } 211 | inner_product_param { 212 | num_output: 4096 213 | } 214 | } 215 | layer { 216 | name: "relu6" 217 | type: "ReLU" 218 | bottom: "fc6" 219 | top: "fc6" 220 | } 221 | layer { 222 | name: "drop6" 223 
| type: "Dropout" 224 | bottom: "fc6" 225 | top: "fc6" 226 | dropout_param { 227 | dropout_ratio: 0.5 228 | } 229 | } 230 | layer { 231 | name: "fc7" 232 | type: "InnerProduct" 233 | bottom: "fc6" 234 | top: "fc7" 235 | param { 236 | lr_mult: 1 237 | decay_mult: 1 238 | } 239 | param { 240 | lr_mult: 2 241 | decay_mult: 0 242 | } 243 | inner_product_param { 244 | num_output: 1024 245 | } 246 | } 247 | layer { 248 | name: "relu7" 249 | type: "ReLU" 250 | bottom: "fc7" 251 | top: "fc7" 252 | } 253 | layer { 254 | name: "drop7" 255 | type: "Dropout" 256 | bottom: "fc7" 257 | top: "fc7" 258 | dropout_param { 259 | dropout_ratio: 0.5 260 | } 261 | } 262 | layer { 263 | name: "cls_score" 264 | type: "InnerProduct" 265 | bottom: "fc7" 266 | top: "cls_score" 267 | param { 268 | lr_mult: 1 269 | decay_mult: 1 270 | } 271 | param { 272 | lr_mult: 2 273 | decay_mult: 0 274 | } 275 | inner_product_param { 276 | num_output: 21 277 | weight_filler { 278 | type: "gaussian" 279 | std: 0.01 280 | } 281 | bias_filler { 282 | type: "constant" 283 | value: 0 284 | } 285 | } 286 | } 287 | layer { 288 | name: "bbox_pred" 289 | type: "InnerProduct" 290 | bottom: "fc7" 291 | top: "bbox_pred" 292 | param { 293 | lr_mult: 1 294 | decay_mult: 1 295 | } 296 | param { 297 | lr_mult: 2 298 | decay_mult: 0 299 | } 300 | inner_product_param { 301 | num_output: 84 302 | weight_filler { 303 | type: "gaussian" 304 | std: 0.001 305 | } 306 | bias_filler { 307 | type: "constant" 308 | value: 0 309 | } 310 | } 311 | } 312 | layer { 313 | name: "cls_prob" 314 | type: "Softmax" 315 | bottom: "cls_score" 316 | top: "cls_prob" 317 | } 318 | -------------------------------------------------------------------------------- /models/VGG_CNN_M_1024/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_CNN_M_1024" 2 | layer { 3 | name: 'data' 4 | type: 'Python' 5 | top: 'data' 6 | top: 'rois' 7 | top: 'labels' 8 | top: 'bbox_targets' 9 | top: 
'bbox_loss_weights' 10 | python_param { 11 | module: 'roi_data_layer.layer' 12 | layer: 'RoIDataLayer' 13 | param_str: "'num_classes': 21" 14 | } 15 | } 16 | layer { 17 | name: "conv1" 18 | type: "Convolution" 19 | bottom: "data" 20 | top: "conv1" 21 | param { 22 | lr_mult: 0 23 | decay_mult: 0 24 | } 25 | param { 26 | lr_mult: 0 27 | decay_mult: 0 28 | } 29 | convolution_param { 30 | num_output: 96 31 | kernel_size: 7 32 | stride: 2 33 | } 34 | } 35 | layer { 36 | name: "relu1" 37 | type: "ReLU" 38 | bottom: "conv1" 39 | top: "conv1" 40 | } 41 | layer { 42 | name: "norm1" 43 | type: "LRN" 44 | bottom: "conv1" 45 | top: "norm1" 46 | lrn_param { 47 | local_size: 5 48 | alpha: 0.0005 49 | beta: 0.75 50 | k: 2 51 | } 52 | } 53 | layer { 54 | name: "pool1" 55 | type: "Pooling" 56 | bottom: "norm1" 57 | top: "pool1" 58 | pooling_param { 59 | pool: MAX 60 | kernel_size: 3 61 | stride: 2 62 | } 63 | } 64 | layer { 65 | name: "conv2" 66 | type: "Convolution" 67 | bottom: "pool1" 68 | top: "conv2" 69 | param { 70 | lr_mult: 1 71 | decay_mult: 1 72 | } 73 | param { 74 | lr_mult: 2 75 | decay_mult: 0 76 | } 77 | convolution_param { 78 | num_output: 256 79 | pad: 1 80 | kernel_size: 5 81 | stride: 2 82 | } 83 | } 84 | layer { 85 | name: "relu2" 86 | type: "ReLU" 87 | bottom: "conv2" 88 | top: "conv2" 89 | } 90 | layer { 91 | name: "norm2" 92 | type: "LRN" 93 | bottom: "conv2" 94 | top: "norm2" 95 | lrn_param { 96 | local_size: 5 97 | alpha: 0.0005 98 | beta: 0.75 99 | k: 2 100 | } 101 | } 102 | layer { 103 | name: "pool2" 104 | type: "Pooling" 105 | bottom: "norm2" 106 | top: "pool2" 107 | pooling_param { 108 | pool: MAX 109 | kernel_size: 3 110 | stride: 2 111 | } 112 | } 113 | layer { 114 | name: "conv3" 115 | type: "Convolution" 116 | bottom: "pool2" 117 | top: "conv3" 118 | param { 119 | lr_mult: 1 120 | decay_mult: 1 121 | } 122 | param { 123 | lr_mult: 2 124 | decay_mult: 0 125 | } 126 | convolution_param { 127 | num_output: 512 128 | pad: 1 129 | kernel_size: 3 130 | } 
131 | } 132 | layer { 133 | name: "relu3" 134 | type: "ReLU" 135 | bottom: "conv3" 136 | top: "conv3" 137 | } 138 | layer { 139 | name: "conv4" 140 | type: "Convolution" 141 | bottom: "conv3" 142 | top: "conv4" 143 | param { 144 | lr_mult: 1 145 | decay_mult: 1 146 | } 147 | param { 148 | lr_mult: 2 149 | decay_mult: 0 150 | } 151 | convolution_param { 152 | num_output: 512 153 | pad: 1 154 | kernel_size: 3 155 | } 156 | } 157 | layer { 158 | name: "relu4" 159 | type: "ReLU" 160 | bottom: "conv4" 161 | top: "conv4" 162 | } 163 | layer { 164 | name: "conv5" 165 | type: "Convolution" 166 | bottom: "conv4" 167 | top: "conv5" 168 | param { 169 | lr_mult: 1 170 | decay_mult: 1 171 | } 172 | param { 173 | lr_mult: 2 174 | decay_mult: 0 175 | } 176 | convolution_param { 177 | num_output: 512 178 | pad: 1 179 | kernel_size: 3 180 | } 181 | } 182 | layer { 183 | name: "relu5" 184 | type: "ReLU" 185 | bottom: "conv5" 186 | top: "conv5" 187 | } 188 | layer { 189 | name: "roi_pool5" 190 | type: "ROIPooling" 191 | bottom: "conv5" 192 | bottom: "rois" 193 | top: "pool5" 194 | roi_pooling_param { 195 | pooled_w: 6 196 | pooled_h: 6 197 | spatial_scale: 0.0625 # 1/16 198 | } 199 | } 200 | layer { 201 | name: "fc6" 202 | type: "InnerProduct" 203 | bottom: "pool5" 204 | top: "fc6" 205 | param { 206 | lr_mult: 1 207 | decay_mult: 1 208 | } 209 | param { 210 | lr_mult: 2 211 | decay_mult: 0 212 | } 213 | inner_product_param { 214 | num_output: 4096 215 | } 216 | } 217 | layer { 218 | name: "relu6" 219 | type: "ReLU" 220 | bottom: "fc6" 221 | top: "fc6" 222 | } 223 | layer { 224 | name: "drop6" 225 | type: "Dropout" 226 | bottom: "fc6" 227 | top: "fc6" 228 | dropout_param { 229 | dropout_ratio: 0.5 230 | } 231 | } 232 | layer { 233 | name: "fc7" 234 | type: "InnerProduct" 235 | bottom: "fc6" 236 | top: "fc7" 237 | param { 238 | lr_mult: 1 239 | decay_mult: 1 240 | } 241 | param { 242 | lr_mult: 2 243 | decay_mult: 0 244 | } 245 | inner_product_param { 246 | num_output: 1024 247 | } 248 
| } 249 | layer { 250 | name: "relu7" 251 | type: "ReLU" 252 | bottom: "fc7" 253 | top: "fc7" 254 | } 255 | layer { 256 | name: "drop7" 257 | type: "Dropout" 258 | bottom: "fc7" 259 | top: "fc7" 260 | dropout_param { 261 | dropout_ratio: 0.5 262 | } 263 | } 264 | layer { 265 | name: "cls_score" 266 | type: "InnerProduct" 267 | bottom: "fc7" 268 | top: "cls_score" 269 | param { 270 | lr_mult: 1 271 | decay_mult: 1 272 | } 273 | param { 274 | lr_mult: 2 275 | decay_mult: 0 276 | } 277 | inner_product_param { 278 | num_output: 21 279 | weight_filler { 280 | type: "gaussian" 281 | std: 0.01 282 | } 283 | bias_filler { 284 | type: "constant" 285 | value: 0 286 | } 287 | } 288 | } 289 | layer { 290 | name: "bbox_pred" 291 | type: "InnerProduct" 292 | bottom: "fc7" 293 | top: "bbox_pred" 294 | param { 295 | lr_mult: 1 296 | decay_mult: 1 297 | } 298 | param { 299 | lr_mult: 2 300 | decay_mult: 0 301 | } 302 | inner_product_param { 303 | num_output: 84 304 | weight_filler { 305 | type: "gaussian" 306 | std: 0.001 307 | } 308 | bias_filler { 309 | type: "constant" 310 | value: 0 311 | } 312 | } 313 | } 314 | layer { 315 | name: "loss_cls" 316 | type: "SoftmaxWithLoss" 317 | bottom: "cls_score" 318 | bottom: "labels" 319 | top: "loss_cls" 320 | loss_weight: 1 321 | } 322 | layer { 323 | name: "loss_bbox" 324 | type: "SmoothL1Loss" 325 | bottom: "bbox_pred" 326 | bottom: "bbox_targets" 327 | bottom: "bbox_loss_weights" 328 | top: "loss_bbox" 329 | loss_weight: 1 330 | } 331 | -------------------------------------------------------------------------------- /output/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /output/README.md: -------------------------------------------------------------------------------- 1 | Artifacts generated by the scripts in `tools` are written in this directory. 
def add_path(path):
    """Prepend ``path`` to ``sys.path`` unless it is already importable."""
    if path in sys.path:
        return
    sys.path.insert(0, path)

this_dir = osp.dirname(__file__)

# Add caffe to PYTHONPATH so `import caffe` picks up the bundled fork.
caffe_path = osp.join(this_dir, '..', 'caffe-fast-rcnn', 'python')
add_path(caffe_path)

# Add lib to PYTHONPATH so project packages (fast_rcnn, datasets, ...) resolve.
lib_path = osp.join(this_dir, '..', 'lib')
add_path(lib_path)
def compress_weights(W, l):
    r"""Compress the weight matrix W of an inner product (fully connected)
    layer using truncated SVD.

    Parameters:
    W: N x M weights matrix
    l: number of singular values to retain

    Returns:
    Ul, L: matrices such that W \approx Ul*L, where Ul is N x l and
        L is l x M
    """
    # numpy has no truncated SVD, so compute the full thin decomposition and
    # keep only the l largest singular values/vectors; this could be faster.
    U, s, V = np.linalg.svd(W, full_matrices=False)

    Ul = U[:, :l]
    sl = s[:l]
    Vl = V[:l, :]

    # Fold the singular values into the right factor. Scaling the rows of Vl
    # directly avoids materializing the l x l diagonal matrix.
    L = sl[:, np.newaxis] * Vl
    return Ul, L
def main():
    """Factorize the fc6/fc7 weights of a trained net with truncated SVD and
    write the compressed model next to the input caffemodel.

    The compressed prototxt declares fc6_L/fc6_U (and/or fc7_L/fc7_U) factor
    layers; the bottleneck size is read from the factor layer shapes.
    """
    args = parse_args()

    # Source network with the uncompressed (trained) weights.
    net = caffe.Net(args.prototxt, args.caffemodel, caffe.TEST)

    # Target network; layers shared with `net` are initialized from the same
    # caffemodel, and the factor layers are filled in below.
    net_svd = caffe.Net(args.prototxt_svd, args.caffemodel, caffe.TEST)

    print('Uncompressed network {} : {}'.format(args.prototxt, args.caffemodel))
    print('Compressed network prototxt {}'.format(args.prototxt_svd))

    out = os.path.splitext(os.path.basename(args.caffemodel))[0] + '_svd'
    out_dir = os.path.dirname(args.caffemodel)

    # Compress fc6, if the compressed prototxt defines the factor layers.
    # (`in` instead of dict.has_key(), which Python 3 removed.)
    if 'fc6_L' in net_svd.params:
        # The bottleneck size l is given by the factor layer's output count.
        l_fc6 = net_svd.params['fc6_L'][0].data.shape[0]
        print(' fc6_L bottleneck size: {}'.format(l_fc6))

        # uncompressed weights and biases
        W_fc6 = net.params['fc6'][0].data
        B_fc6 = net.params['fc6'][1].data

        print(' compressing fc6...')
        Ul_fc6, L_fc6 = compress_weights(W_fc6, l_fc6)

        # fc6_L must carry no bias term; the original bias lives on fc6_U.
        assert(len(net_svd.params['fc6_L']) == 1)

        # install compressed matrix factors (and original biases)
        net_svd.params['fc6_L'][0].data[...] = L_fc6

        net_svd.params['fc6_U'][0].data[...] = Ul_fc6
        net_svd.params['fc6_U'][1].data[...] = B_fc6

        out += '_fc6_{}'.format(l_fc6)

    # Compress fc7, same procedure as fc6.
    if 'fc7_L' in net_svd.params:
        l_fc7 = net_svd.params['fc7_L'][0].data.shape[0]
        print(' fc7_L bottleneck size: {}'.format(l_fc7))

        W_fc7 = net.params['fc7'][0].data
        B_fc7 = net.params['fc7'][1].data

        print(' compressing fc7...')
        Ul_fc7, L_fc7 = compress_weights(W_fc7, l_fc7)

        assert(len(net_svd.params['fc7_L']) == 1)

        net_svd.params['fc7_L'][0].data[...] = L_fc7

        net_svd.params['fc7_U'][0].data[...] = Ul_fc7
        net_svd.params['fc7_U'][1].data[...] = B_fc7

        out += '_fc7_{}'.format(l_fc7)

    filename = '{}/{}.caffemodel'.format(out_dir, out)
    net_svd.save(filename)
    print('Wrote svd model to: {:s}'.format(filename))
def demo(net, image_name, classes):
    """Detect object classes in an image using pre-computed object proposals.

    Parameters:
    net: a caffe.Net loaded in TEST mode
    image_name: basename (without extension) of an image in data/demo
    classes: iterable of class names to visualize; each must be in CLASSES
    """
    # Load pre-computed Selective Search object proposals
    box_file = os.path.join(cfg.ROOT_DIR, 'data', 'demo',
                            image_name + '_boxes.mat')
    obj_proposals = sio.loadmat(box_file)['boxes']

    # Load the demo image (BGR order, as im_detect expects from cv2.imread)
    im_file = os.path.join(cfg.ROOT_DIR, 'data', 'demo', image_name + '.jpg')
    im = cv2.imread(im_file)

    # Detect all object classes and regress object bounds
    timer = Timer()
    timer.tic()
    scores, boxes = im_detect(net, im, obj_proposals)
    timer.toc()
    # Parenthesized print: valid in both Python 2 and 3, unlike `print '...'`.
    print('Detection took {:.3f}s for '
          '{:d} object proposals'.format(timer.total_time, boxes.shape[0]))

    # Visualize detections for each requested class
    CONF_THRESH = 0.8
    NMS_THRESH = 0.3
    for cls in classes:
        cls_ind = CLASSES.index(cls)
        # `boxes` stores 4 regressed coordinates per class, laid out
        # contiguously, so slice out the 4 columns for this class.
        cls_boxes = boxes[:, 4*cls_ind:4*(cls_ind + 1)]
        cls_scores = scores[:, cls_ind]
        # Keep only confident detections, then suppress near-duplicates.
        keep = np.where(cls_scores >= CONF_THRESH)[0]
        cls_boxes = cls_boxes[keep, :]
        cls_scores = cls_scores[keep]
        dets = np.hstack((cls_boxes,
                          cls_scores[:, np.newaxis])).astype(np.float32)
        keep = nms(dets, NMS_THRESH)
        dets = dets[keep, :]
        print('All {} detections with p({} | box) >= {:.1f}'.format(
            cls, cls, CONF_THRESH))
        vis_detections(im, cls, dets, thresh=CONF_THRESH)
def parse_args():
    """Parse input arguments for the demo script."""
    # Fix copy-paste from train_net.py: this parser belongs to the demo
    # script, not to training, so the --help description says so.
    parser = argparse.ArgumentParser(description='Fast R-CNN demo')
    parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]',
                        default=0, type=int)
    parser.add_argument('--cpu', dest='cpu_mode',
                        help='Use CPU mode (overrides --gpu)',
                        action='store_true')
    parser.add_argument('--net', dest='demo_net', help='Network to use [vgg16]',
                        choices=NETS.keys(), default='vgg16')

    args = parser.parse_args()

    return args
def parse_args():
    """Build the re-evaluation argument parser and return the parsed args.

    When invoked with no arguments at all, print usage and exit with
    status 1.
    """
    ap = argparse.ArgumentParser(description='Re-evaluate results')
    ap.add_argument('output_dir', nargs=1, help='results directory',
                    type=str)
    ap.add_argument('--rerun', dest='rerun',
                    help=('re-run evaluation code '
                          '(otherwise: results are loaded from file)'),
                    action='store_true')
    ap.add_argument('--imdb', dest='imdb_name',
                    help='dataset to re-evaluate',
                    default='voc_2007_test', type=str)
    ap.add_argument('--comp', dest='comp_mode', help='competition mode',
                    action='store_true')

    # Bare invocation: show the help text instead of a terse usage error.
    if len(sys.argv) < 2:
        ap.print_help()
        sys.exit(1)

    return ap.parse_args()
def from_dets(imdb_name, output_dir, comp_mode):
    """Re-run NMS and dataset evaluation on saved detections.

    Loads <output_dir>/detections.pkl (written by fast_rcnn.test.test_net),
    applies NMS with the configured threshold, and hands the result to the
    imdb's evaluation code.
    """
    imdb = get_imdb(imdb_name)
    imdb.competition_mode(comp_mode)
    with open(os.path.join(output_dir, 'detections.pkl'), 'rb') as f:
        dets = cPickle.load(f)

    # Parenthesized prints: valid in both Python 2 and 3.
    print('Applying NMS to all detections')
    nms_dets = apply_nms(dets, cfg.TEST.NMS)

    print('Evaluating detections')
    imdb.evaluate_detections(nms_dets, output_dir)
def _str2bool(v):
    """argparse-friendly boolean conversion.

    `type=bool` is broken for argparse: bool('False') is True, so any value
    passed on the command line is treated as True. Map the common textual
    spellings explicitly instead.
    """
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', '0'):
        return False
    raise argparse.ArgumentTypeError('boolean value expected, got %r' % (v,))

def parse_args():
    """
    Parse input arguments
    """
    parser = argparse.ArgumentParser(description='Test a Fast R-CNN network')
    parser.add_argument('--gpu', dest='gpu_id', help='GPU id to use',
                        default=0, type=int)
    parser.add_argument('--def', dest='prototxt',
                        help='prototxt file defining the network',
                        default=None, type=str)
    parser.add_argument('--net', dest='caffemodel',
                        help='model to test',
                        default=None, type=str)
    parser.add_argument('--cfg', dest='cfg_file',
                        help='optional config file', default=None, type=str)
    # Was `type=bool`, which made `--wait false`/`--wait 0` still yield True;
    # the converter makes the flag actually switchable. Default is unchanged.
    parser.add_argument('--wait', dest='wait',
                        help='wait until net file exists',
                        default=True, type=_str2bool)
    parser.add_argument('--imdb', dest='imdb_name',
                        help='dataset to test',
                        default='voc_2007_test', type=str)
    parser.add_argument('--comp', dest='comp_mode', help='competition mode',
                        action='store_true')
    parser.add_argument('--set', dest='set_cfgs',
                        help='set config keys', default=None,
                        nargs=argparse.REMAINDER)

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()
    return args
def parse_args():
    """Build the training argument parser and return the parsed arguments.

    When invoked with no arguments at all, print usage and exit with
    status 1.
    """
    ap = argparse.ArgumentParser(description='Train a Fast R-CNN network')
    ap.add_argument('--gpu', dest='gpu_id',
                    help='GPU device id to use [0]',
                    default=0, type=int)
    ap.add_argument('--solver', dest='solver',
                    help='solver prototxt',
                    default=None, type=str)
    ap.add_argument('--iters', dest='max_iters',
                    help='number of iterations to train',
                    default=40000, type=int)
    ap.add_argument('--weights', dest='pretrained_model',
                    help='initialize with pretrained model weights',
                    default=None, type=str)
    ap.add_argument('--cfg', dest='cfg_file',
                    help='optional config file',
                    default=None, type=str)
    ap.add_argument('--imdb', dest='imdb_name',
                    help='dataset to train on',
                    default='voc_2007_trainval', type=str)
    ap.add_argument('--rand', dest='randomize',
                    help='randomize (do not use a fixed seed)',
                    action='store_true')
    # REMAINDER swallows everything after --set as raw config key/value pairs.
    ap.add_argument('--set', dest='set_cfgs',
                    help='set config keys', default=None,
                    nargs=argparse.REMAINDER)

    # Bare invocation: show the help text instead of a terse usage error.
    if len(sys.argv) < 2:
        ap.print_help()
        sys.exit(1)

    return ap.parse_args()
if __name__ == '__main__':
    args = parse_args()

    print('Called with args:')
    print(args)

    # Optional config overrides: the file first, then individual --set keys,
    # so command-line values win over the file.
    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)
    if args.set_cfgs is not None:
        cfg_from_list(args.set_cfgs)

    print('Using config:')
    pprint.pprint(cfg)

    if not args.randomize:
        # fix the random seeds (numpy and caffe) for reproducibility
        np.random.seed(cfg.RNG_SEED)
        caffe.set_random_seed(cfg.RNG_SEED)

    # set up caffe
    caffe.set_mode_gpu()
    if args.gpu_id is not None:
        caffe.set_device(args.gpu_id)

    imdb = get_imdb(args.imdb_name)
    # Parenthesized prints: valid in both Python 2 and 3, and consistent with
    # the print(...) calls above.
    print('Loaded dataset `{:s}` for training'.format(imdb.name))
    roidb = get_training_roidb(imdb)

    output_dir = get_output_dir(imdb, None)
    print('Output will be saved to `{:s}`'.format(output_dir))

    train_net(args.solver, roidb, output_dir,
              pretrained_model=args.pretrained_model,
              max_iters=args.max_iters)