├── src
│   ├── dataset
│   │   ├── kitti-eval
│   │   │   ├── cpp
│   │   │   │   ├── .gitignore
│   │   │   │   ├── mail.h
│   │   │   │   └── evaluate_object.cpp
│   │   │   └── Makefile
│   │   ├── __init__.py
│   │   ├── pascal_voc.py
│   │   ├── voc_eval.py
│   │   ├── imdb.py
│   │   └── kitti.py
│   ├── __init__.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── caffemodel2pkl.py
│   │   └── util.py
│   ├── nets
│   │   ├── __init__.py
│   │   ├── vgg16_convDet.py
│   │   ├── squeezeDet.py
│   │   ├── squeezeDetPlus.py
│   │   └── resnet50_convDet.py
│   ├── config
│   │   ├── __init__.py
│   │   ├── kitti_res50_config.py
│   │   ├── kitti_vgg16_config.py
│   │   ├── kitti_squeezeDet_config.py
│   │   ├── kitti_squeezeDetPlus_config.py
│   │   ├── kitti_model_config.py
│   │   └── config.py
│   ├── demo.py
│   ├── eval.py
│   ├── train.py
│   └── nn_skeleton.py
├── .gitignore
├── data
│   ├── sample.png
│   └── random_split_train_val.py
├── README
│   ├── det_img.png
│   ├── graph.png
│   ├── out_sample.png
│   └── detection_analysis.png
├── requirements.txt
├── LICENSE
├── scripts
│   ├── eval.sh
│   └── train.sh
└── README.md

/src/dataset/kitti-eval/cpp/.gitignore:
--------------------------------------------------------------------------------
1 | evaluate_object
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *pyc
2 | data/out/
3 | data/model_checkpoints
4 | 
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
--------------------------------------------------------------------------------
/data/sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dividiti/squeezeDet/master/data/sample.png
--------------------------------------------------------------------------------
/README/det_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dividiti/squeezeDet/master/README/det_img.png
--------------------------------------------------------------------------------
/README/graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dividiti/squeezeDet/master/README/graph.png
--------------------------------------------------------------------------------
/src/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | from kitti import kitti
2 | from pascal_voc import pascal_voc
3 | 
--------------------------------------------------------------------------------
/README/out_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dividiti/squeezeDet/master/README/out_sample.png
--------------------------------------------------------------------------------
/README/detection_analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dividiti/squeezeDet/master/README/detection_analysis.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | easydict==1.6
2 | joblib==0.10.3
3 | numpy==1.12.0
4 | opencv-python==3.2.0.6
5 | Pillow==4.0.0
6 | tensorflow-gpu==1.0.0
7 | 
--------------------------------------------------------------------------------
/src/dataset/kitti-eval/Makefile:
--------------------------------------------------------------------------------
1 | 
2 | cpp/evaluate_object : cpp/evaluate_object.cpp
3 | 	g++ -Wall -Wno-sign-compare -o cpp/evaluate_object cpp/evaluate_object.cpp
4 | 
--------------------------------------------------------------------------------
/src/nets/__init__.py:
--------------------------------------------------------------------------------
1 | from squeezeDet import SqueezeDet
2 | from squeezeDetPlus import SqueezeDetPlus
3 | from resnet50_convDet import ResNet50ConvDet
4 | from vgg16_convDet import VGG16ConvDet
5 | 
--------------------------------------------------------------------------------
/src/config/__init__.py:
--------------------------------------------------------------------------------
1 | from kitti_model_config import kitti_model_config
2 | from kitti_vgg16_config import kitti_vgg16_config
3 | from kitti_res50_config import kitti_res50_config
4 | from kitti_squeezeDet_config import kitti_squeezeDet_config
5 | from kitti_squeezeDetPlus_config import kitti_squeezeDetPlus_config
6 | 
--------------------------------------------------------------------------------
/data/random_split_train_val.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | image_set_dir = './KITTI/ImageSets'
4 | trainval_file = image_set_dir+'/trainval.txt'
5 | train_file = image_set_dir+'/train.txt'
6 | val_file = image_set_dir+'/val.txt'
7 | 
8 | idx = []
9 | with open(trainval_file) as f:
10 |   for line in f:
11 |     idx.append(line.strip())
12 |   f.close()
13 | 
14 | idx = np.random.permutation(idx)
15 | 
16 | train_idx = sorted(idx[:len(idx)/2])
17 | val_idx = sorted(idx[len(idx)/2:])
18 | 
19 | with open(train_file, 'w') as f:
20 |   for i in train_idx:
21 |     f.write('{}\n'.format(i))
22 |   f.close()
23 | 
24 | with open(val_file, 'w') as f:
25 |   for i in val_idx:
26 |     f.write('{}\n'.format(i))
27 |   f.close()
28 | 
29 | print 'Training set is saved to ' + train_file
30 | print 'Validation set is saved to ' + val_file
31 | 
--------------------------------------------------------------------------------
/src/dataset/kitti-eval/cpp/mail.h:
--------------------------------------------------------------------------------
1 | #ifndef MAIL_H
2 | #define MAIL_H
3 | 
4 | #include <stdio.h>
5 | #include <string>
6 | #include <stdarg.h>
7 | 
8 | class Mail {
9 | 
10 | public:
11 | 
12 |   Mail (std::string email = "") {
13 |     if (email.compare("")) {
14 |       mail = popen("/usr/lib/sendmail -t -f noreply@cvlibs.net","w");
15 |       fprintf(mail,"To: %s\n", email.c_str());
16 |       fprintf(mail,"From: noreply@cvlibs.net\n");
17 |       fprintf(mail,"Subject: KITTI Evaluation Benchmark\n");
18 |       fprintf(mail,"\n\n");
19 |     } else {
20 |       mail = 0;
21 |     }
22 |   }
23 | 
24 |   ~Mail() {
25 |     if (mail) {
26 |       pclose(mail);
27 |     }
28 |   }
29 | 
30 |   void msg (const char *format, ...) {
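    // NOTE (editor): `args` is consumed by vfprintf() and then reused by
    // vprintf() below; strictly, ISO C requires va_end() + va_start() (or a
    // va_copy) before a va_list is traversed a second time.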
31 |     va_list args;
32 |     va_start(args,format);
33 |     if (mail) {
34 |       vfprintf(mail,format,args);
35 |       fprintf(mail,"\n");
36 |     }
37 |     vprintf(format,args);
38 |     printf("\n");
39 |     va_end(args);
40 |   }
41 | 
42 | private:
43 | 
44 |   FILE *mail;
45 | 
46 | };
47 | 
48 | #endif
49 | 
--------------------------------------------------------------------------------
/src/utils/caffemodel2pkl.py:
--------------------------------------------------------------------------------
1 | # Merged from https://raw.githubusercontent.com/bgshih/vgg16.tf/master/src/dump_caffemodel_weights.py
2 | 
3 | import sys
4 | import os
5 | 
6 | import argparse
7 | import numpy as np
8 | import joblib
9 | 
10 | import caffe
11 | 
12 | parser = argparse.ArgumentParser(description='')
13 | parser.add_argument('--caffe_root', help='Caffe root directory.')
14 | parser.add_argument('--prototxt_path', help='Model prototxt path.')
15 | parser.add_argument('--caffemodel_path', help='Caffe model weights file (.caffemodel) path.')
16 | parser.add_argument('--caffe_weights_path', default='/tmp/VGG_ILSVRC_16_layers_weights.pkl',
17 |                     help='VGG16 weights dump path.')
18 | args = parser.parse_args()
19 | 
20 | def dump_caffemodel_weights():
21 |   net = caffe.Net(args.prototxt_path, args.caffemodel_path, caffe.TEST)
22 |   weights = {}
23 |   n_layers = len(net.layers)
24 |   for i in range(n_layers):
25 |     layer_name = net._layer_names[i]
26 |     layer = net.layers[i]
27 |     layer_blobs = [o.data for o in layer.blobs]
28 |     weights[layer_name] = layer_blobs
29 |   joblib.dump(weights, args.caffe_weights_path)
30 | 
31 | 
32 | if __name__ == '__main__':
33 |   dump_caffemodel_weights()
34 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 | 
3 | Copyright (c) 2016, Bichen Wu
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
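For reference, a hypothetical invocation of `src/utils/caffemodel2pkl.py` (listed further above). The flags match its argparse definition, but the model paths are illustrative, and the script needs a working `caffe` Python installation:

```Shell
python src/utils/caffemodel2pkl.py \
  --prototxt_path=/path/to/VGG_ILSVRC_16_layers_deploy.prototxt \
  --caffemodel_path=/path/to/VGG_ILSVRC_16_layers.caffemodel \
  --caffe_weights_path=/tmp/VGG_ILSVRC_16_layers_weights.pkl
```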
--------------------------------------------------------------------------------
/scripts/eval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export GPUID=0
4 | export NET="squeezeDet"
5 | export EVAL_DIR="/tmp/bichen/logs/SqueezeDet/"
6 | export IMAGE_SET="val"
7 | 
8 | if [ $# -eq 0 ]
9 | then
10 |   echo "Usage: ./scripts/eval.sh [options]"
11 |   echo " "
12 |   echo "options:"
13 |   echo "-h, --help show brief help"
14 |   echo "-net (squeezeDet|squeezeDet+|vgg16|resnet50)"
15 |   echo "-gpu gpu id"
16 |   echo "-eval_dir directory to save logs"
17 |   echo "-image_set (train|val)"
18 |   exit 0
19 | fi
20 | 
21 | while test $# -gt 0; do
22 |   case "$1" in
23 |     -h|--help)
24 |       echo "Usage: ./scripts/eval.sh [options]"
25 |       echo " "
26 |       echo "options:"
27 |       echo "-h, --help show brief help"
28 |       echo "-net (squeezeDet|squeezeDet+|vgg16|resnet50)"
29 |       echo "-gpu gpu id"
30 |       echo "-eval_dir directory to save logs"
31 |       echo "-image_set (train|val)"
32 |       exit 0
33 |       ;;
34 |     -net)
35 |       export NET="$2"
36 |       shift
37 |       shift
38 |       ;;
39 |     -gpu)
40 |       export GPUID="$2"
41 |       shift
42 |       shift
43 |       ;;
44 |     -eval_dir)
45 |       export EVAL_DIR="$2"
46 |       shift
47 |       shift
48 |       ;;
49 |     -image_set)
50 |       export IMAGE_SET="$2"
51 |       shift
52 |       shift
53 |       ;;
54 |     *)
55 |       break
56 |       ;;
57 |   esac
58 | done
59 | 
60 | # =========================================================================== #
61 | # command for squeezeDet:
62 | # =========================================================================== #
63 | python ./src/eval.py \
64 |   --dataset=KITTI \
65 |   --data_path=./data/KITTI \
66 |   --image_set=$IMAGE_SET \
67 |   --eval_dir="$EVAL_DIR/$IMAGE_SET" \
68 |   --checkpoint_path="$EVAL_DIR/train" \
69 |   --net=$NET \
70 |   --gpu=$GPUID
71 | 
--------------------------------------------------------------------------------
/scripts/train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export GPUID=0
4 | export NET="squeezeDet"
5 | export TRAIN_DIR="/tmp/bichen/logs/SqueezeDet/"
6 | 
7 | if [ $# -eq 0 ]
8 | then
9 |   echo "Usage: ./scripts/train.sh [options]"
10 |   echo " "
11 |   echo "options:"
12 |   echo "-h, --help show brief help"
13 |   echo "-net (squeezeDet|squeezeDet+|vgg16|resnet50)"
14 |   echo "-gpu gpu id"
15 |   echo "-train_dir directory for training logs"
16 |   exit 0
17 | fi
18 | 
19 | while test $# -gt 0; do
20 |   case "$1" in
21 |     -h|--help)
22 |       echo "Usage: ./scripts/train.sh [options]"
23 |       echo " "
24 |       echo "options:"
25 |       echo "-h, --help show brief help"
26 |       echo "-net (squeezeDet|squeezeDet+|vgg16|resnet50)"
27 |       echo "-gpu gpu id"
28 |       echo "-train_dir directory for training logs"
29 |       exit 0
30 |       ;;
31 |     -net)
32 |       export NET="$2"
33 |       shift
34 |       shift
35 |       ;;
36 |     -gpu)
37 |       export GPUID="$2"
38 |       shift
39 |       shift
40 |       ;;
41 |     -train_dir)
42 |       export TRAIN_DIR="$2"
43 |       shift
44 |       shift
45 |       ;;
46 |     *)
47 |       break
48 |       ;;
49 |   esac
50 | done
51 | 
52 | case "$NET" in
53 |   "squeezeDet")
54 |     export PRETRAINED_MODEL_PATH="./data/SqueezeNet/squeezenet_v1.1.pkl"
55 |     ;;
56 |   "squeezeDet+")
57 |     export PRETRAINED_MODEL_PATH="./data/SqueezeNet/squeezenet_v1.0_SR_0.750.pkl"
58 |     ;;
59 |   "resnet50")
60 |     export PRETRAINED_MODEL_PATH="./data/ResNet/ResNet-50-weights.pkl"
61 |     ;;
62 |   "vgg16")
63 |     export PRETRAINED_MODEL_PATH="./data/VGG16/VGG_ILSVRC_16_layers_weights.pkl"
64 |     ;;
65 |   *)
66 |     echo "net architecture not supported."
67 |     exit 1
68 |     ;;
69 | esac
70 | 
71 | 
72 | python ./src/train.py \
73 |   --dataset=KITTI \
74 |   --pretrained_model_path=$PRETRAINED_MODEL_PATH \
75 |   --data_path=./data/KITTI \
76 |   --image_set=train \
77 |   --train_dir="$TRAIN_DIR/train" \
78 |   --net=$NET \
79 |   --summary_step=100 \
80 |   --checkpoint_step=500 \
81 |   --gpu=$GPUID
82 | 
--------------------------------------------------------------------------------
/src/config/kitti_res50_config.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Model configuration for the KITTI dataset"""
4 | 
5 | import numpy as np
6 | 
7 | from config import base_model_config
8 | 
9 | def kitti_res50_config():
10 |   """Specify the parameters to tune below."""
11 |   mc = base_model_config('KITTI')
12 | 
13 |   mc.IMAGE_WIDTH = 1242
14 |   mc.IMAGE_HEIGHT = 375
15 |   mc.BATCH_SIZE = 20
16 | 
17 |   mc.WEIGHT_DECAY = 0.0001
18 |   mc.LEARNING_RATE = 0.01
19 |   mc.DECAY_STEPS = 10000
20 |   mc.MAX_GRAD_NORM = 1.0
21 |   mc.MOMENTUM = 0.9
22 |   mc.LR_DECAY_FACTOR = 0.5
23 | 
24 |   mc.LOSS_COEF_BBOX = 5.0
25 |   mc.LOSS_COEF_CONF_POS = 75.0
26 |   mc.LOSS_COEF_CONF_NEG = 100.0
27 |   mc.LOSS_COEF_CLASS = 1.0
28 | 
29 |   mc.PLOT_PROB_THRESH = 0.4
30 |   mc.NMS_THRESH = 0.4
31 |   mc.PROB_THRESH = 0.005
32 |   mc.TOP_N_DETECTION = 64
33 | 
34 |   mc.DATA_AUGMENTATION = True
35 |   mc.DRIFT_X = 150
36 |   mc.DRIFT_Y = 100
37 |   mc.EXCLUDE_HARD_EXAMPLES = False
38 | 
39 |   mc.ANCHOR_BOX = set_anchors(mc)
40 |   mc.ANCHORS = len(mc.ANCHOR_BOX)
41 |   mc.ANCHOR_PER_GRID = 9
42 | 
43 |   return mc
44 | 
45 | def set_anchors(mc):
46 |   H, W, B = 24, 78, 9
47 |   anchor_shapes = np.reshape(
48 |       [np.array(
49 |           [[  94.,  49.], [ 225., 161.], [ 170.,  91.],
50 |            [ 390., 181.], [  41.,  32.], [ 128.,  64.],
51 |            [ 298., 164.], [ 232.,  99.], [  65.,  42.]])] * H * W,
52 |       (H, W, B, 2)
53 |   )
54 |   center_x = np.reshape(
55 |       np.transpose(
56 |           np.reshape(
57 |               np.array([np.arange(1, W+1)*float(mc.IMAGE_WIDTH)/(W+1)]*H*B),
58 |               (B, H, W)
59 |           ),
60 |           (1, 2, 0)
61 |       ),
62 |       (H, W, B, 1)
63 |   )
64 |   center_y = np.reshape(
65 |       np.transpose(
66 |           np.reshape(
67 |               np.array([np.arange(1, H+1)*float(mc.IMAGE_HEIGHT)/(H+1)]*W*B),
68 |               (B, W, H)
69 |           ),
70 |           (2, 1, 0)
71 |       ),
72 |       (H, W, B, 1)
73 |   )
74 |   anchors = np.reshape(
75 |       np.concatenate((center_x, center_y, anchor_shapes), axis=3),
76 |       (-1, 4)
77 |   )
78 | 
79 |   return anchors
80 | 
--------------------------------------------------------------------------------
/src/config/kitti_vgg16_config.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Model configuration for the KITTI dataset"""
4 | 
5 | import numpy as np
6 | 
7 | from config import base_model_config
8 | 
9 | def kitti_vgg16_config():
10 |   """Specify the parameters to tune below."""
11 |   mc = base_model_config('KITTI')
12 | 
13 |   mc.IMAGE_WIDTH = 1242
14 |   mc.IMAGE_HEIGHT = 375
15 |   mc.BATCH_SIZE = 5
16 | 
17 |   mc.WEIGHT_DECAY = 0.0001
18 |   mc.LEARNING_RATE = 0.01
19 |   mc.DECAY_STEPS = 10000
20 |   mc.MAX_GRAD_NORM = 1.0
21 |   mc.MOMENTUM = 0.9
22 |   mc.LR_DECAY_FACTOR = 0.5
23 | 
24 |   mc.LOSS_COEF_BBOX = 5.0
25 |   mc.LOSS_COEF_CONF_POS = 75.0
26 |   mc.LOSS_COEF_CONF_NEG = 100.0
27 |   mc.LOSS_COEF_CLASS = 1.0
28 | 
29 |   mc.PLOT_PROB_THRESH = 0.4
30 |   mc.NMS_THRESH = 0.4
31 |   mc.PROB_THRESH = 0.005
32 |   mc.TOP_N_DETECTION = 64
33 | 
34 |   mc.DATA_AUGMENTATION = True
35 |   mc.DRIFT_X = 150
36 |   mc.DRIFT_Y = 100
37 |   mc.EXCLUDE_HARD_EXAMPLES = False
38 | 
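  # NOTE (editor): set_anchors() below tiles B=9 prior shapes over a W=78 by
  # H=24 feature-map grid, producing an (H*W*B, 4) array of [cx, cy, w, h]
  # anchors in image coordinates, with centers spaced IMAGE_WIDTH/(W+1)
  # pixels apart horizontally and IMAGE_HEIGHT/(H+1) vertically.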
39 |   mc.ANCHOR_BOX = set_anchors(mc)
40 |   mc.ANCHORS = len(mc.ANCHOR_BOX)
41 |   mc.ANCHOR_PER_GRID = 9
42 | 
43 |   return mc
44 | 
45 | def set_anchors(mc):
46 |   H, W, B = 24, 78, 9
47 |   anchor_shapes = np.reshape(
48 |       [np.array(
49 |           [[  36.,  37.], [ 366., 174.], [ 115.,  59.],
50 |            [ 162.,  87.], [  38.,  90.], [ 258., 173.],
51 |            [ 224., 108.], [  78., 170.], [  72.,  43.]])] * H * W,
52 |       (H, W, B, 2)
53 |   )
54 |   center_x = np.reshape(
55 |       np.transpose(
56 |           np.reshape(
57 |               np.array([np.arange(1, W+1)*float(mc.IMAGE_WIDTH)/(W+1)]*H*B),
58 |               (B, H, W)
59 |           ),
60 |           (1, 2, 0)
61 |       ),
62 |       (H, W, B, 1)
63 |   )
64 |   center_y = np.reshape(
65 |       np.transpose(
66 |           np.reshape(
67 |               np.array([np.arange(1, H+1)*float(mc.IMAGE_HEIGHT)/(H+1)]*W*B),
68 |               (B, W, H)
69 |           ),
70 |           (2, 1, 0)
71 |       ),
72 |       (H, W, B, 1)
73 |   )
74 |   anchors = np.reshape(
75 |       np.concatenate((center_x, center_y, anchor_shapes), axis=3),
76 |       (-1, 4)
77 |   )
78 | 
79 |   return anchors
80 | 
--------------------------------------------------------------------------------
/src/config/kitti_squeezeDet_config.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Model configuration for the KITTI dataset"""
4 | 
5 | import numpy as np
6 | 
7 | from config import base_model_config
8 | 
9 | def kitti_squeezeDet_config():
10 |   """Specify the parameters to tune below."""
11 |   mc = base_model_config('KITTI')
12 | 
13 |   mc.IMAGE_WIDTH = 1248
14 |   mc.IMAGE_HEIGHT = 384
15 |   mc.BATCH_SIZE = 20
16 | 
17 |   mc.WEIGHT_DECAY = 0.0001
18 |   mc.LEARNING_RATE = 0.01
19 |   mc.DECAY_STEPS = 10000
20 |   mc.MAX_GRAD_NORM = 1.0
21 |   mc.MOMENTUM = 0.9
22 |   mc.LR_DECAY_FACTOR = 0.5
23 | 
24 |   mc.LOSS_COEF_BBOX = 5.0
25 |   mc.LOSS_COEF_CONF_POS = 75.0
26 |   mc.LOSS_COEF_CONF_NEG = 100.0
27 |   mc.LOSS_COEF_CLASS = 1.0
28 | 
29 |   mc.PLOT_PROB_THRESH = 0.4
30 |   mc.NMS_THRESH = 0.4
31 |   mc.PROB_THRESH = 0.005
32 |   mc.TOP_N_DETECTION = 64
33 | 
34 |   mc.DATA_AUGMENTATION = True
35 |   mc.DRIFT_X = 150
36 |   mc.DRIFT_Y = 100
37 |   mc.EXCLUDE_HARD_EXAMPLES = False
38 | 
39 |   mc.ANCHOR_BOX = set_anchors(mc)
40 |   mc.ANCHORS = len(mc.ANCHOR_BOX)
41 |   mc.ANCHOR_PER_GRID = 9
42 | 
43 |   return mc
44 | 
45 | def set_anchors(mc):
46 |   H, W, B = 24, 78, 9
47 |   anchor_shapes = np.reshape(
48 |       [np.array(
49 |           [[  36.,  37.], [ 366., 174.], [ 115.,  59.],
50 |            [ 162.,  87.], [  38.,  90.], [ 258., 173.],
51 |            [ 224., 108.], [  78., 170.], [  72.,  43.]])] * H * W,
52 |       (H, W, B, 2)
53 |   )
54 |   center_x = np.reshape(
55 |       np.transpose(
56 |           np.reshape(
57 |               np.array([np.arange(1, W+1)*float(mc.IMAGE_WIDTH)/(W+1)]*H*B),
58 |               (B, H, W)
59 |           ),
60 |           (1, 2, 0)
61 |       ),
62 |       (H, W, B, 1)
63 |   )
64 |   center_y = np.reshape(
65 |       np.transpose(
66 |           np.reshape(
67 |               np.array([np.arange(1, H+1)*float(mc.IMAGE_HEIGHT)/(H+1)]*W*B),
68 |               (B, W, H)
69 |           ),
70 |           (2, 1, 0)
71 |       ),
72 |       (H, W, B, 1)
73 |   )
74 |   anchors = np.reshape(
75 |       np.concatenate((center_x, center_y, anchor_shapes), axis=3),
76 |       (-1, 4)
77 |   )
78 | 
79 |   return anchors
80 | 
--------------------------------------------------------------------------------
/src/config/kitti_squeezeDetPlus_config.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Model configuration for the KITTI dataset"""
4 | 
5 | import numpy as np
6 | 
7 | from config import base_model_config
8 | 
9 | def kitti_squeezeDetPlus_config():
10 |   """Specify the parameters to tune below."""
11 |   mc = base_model_config('KITTI')
12 | 
13 |   mc.IMAGE_WIDTH = 1242
14 |   mc.IMAGE_HEIGHT = 375
15 |   mc.BATCH_SIZE = 20
16 | 
17 |   mc.WEIGHT_DECAY = 0.0001
18 |   mc.LEARNING_RATE = 0.01
19 |   mc.DECAY_STEPS = 10000
20 |   mc.MAX_GRAD_NORM = 1.0
21 |   mc.MOMENTUM = 0.9
22 |   mc.LR_DECAY_FACTOR = 0.5
23 | 
24 |   mc.LOSS_COEF_BBOX = 5.0
25 |   mc.LOSS_COEF_CONF_POS = 75.0
26 |   mc.LOSS_COEF_CONF_NEG = 100.0
27 |   mc.LOSS_COEF_CLASS = 1.0
28 | 
29 |   mc.PLOT_PROB_THRESH = 0.4
30 |   mc.NMS_THRESH = 0.4
31 |   mc.PROB_THRESH = 0.005
32 |   mc.TOP_N_DETECTION = 64
33 | 
34 |   mc.DATA_AUGMENTATION = True
35 |   mc.DRIFT_X = 150
36 |   mc.DRIFT_Y = 100
37 |   mc.EXCLUDE_HARD_EXAMPLES = False
38 | 
39 |   mc.ANCHOR_BOX = set_anchors(mc)
40 |   mc.ANCHORS = len(mc.ANCHOR_BOX)
41 |   mc.ANCHOR_PER_GRID = 9
42 | 
43 |   return mc
44 | 
45 | def set_anchors(mc):
46 |   H, W, B = 22, 76, 9
47 |   anchor_shapes = np.reshape(
48 |       [np.array(
49 |           [[  36.,  37.], [ 366., 174.], [ 115.,  59.],
50 |            [ 162.,  87.], [  38.,  90.], [ 258., 173.],
51 |            [ 224., 108.], [  78., 170.], [  72.,  43.]])] * H * W,
52 |       (H, W, B, 2)
53 |   )
54 |   center_x = np.reshape(
55 |       np.transpose(
56 |           np.reshape(
57 |               np.array([np.arange(1, W+1)*float(mc.IMAGE_WIDTH)/(W+1)]*H*B),
58 |               (B, H, W)
59 |           ),
60 |           (1, 2, 0)
61 |       ),
62 |       (H, W, B, 1)
63 |   )
64 |   center_y = np.reshape(
65 |       np.transpose(
66 |           np.reshape(
67 |               np.array([np.arange(1, H+1)*float(mc.IMAGE_HEIGHT)/(H+1)]*W*B),
68 |               (B, W, H)
69 |           ),
70 |           (2, 1, 0)
71 |       ),
72 |       (H, W, B, 1)
73 |   )
74 |   anchors = np.reshape(
75 |       np.concatenate((center_x, center_y, anchor_shapes), axis=3),
76 |       (-1, 4)
77 |   )
78 | 
79 |   return anchors
80 | 
--------------------------------------------------------------------------------
/src/config/kitti_model_config.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Model configuration for the KITTI dataset"""
4 | 
5 | import numpy as np
6 | 
7 | from config import base_model_config
8 | 
9 | def kitti_model_config():
10 |   """Specify the parameters to tune below."""
11 |   mc = base_model_config('KITTI')
12 |   # mc.IMAGE_WIDTH = 1864 # half width 621
13 |   # mc.IMAGE_HEIGHT = 562 # half height 187
14 |   mc.IMAGE_WIDTH = 1248 # half width 621
15 |   mc.IMAGE_HEIGHT = 384 # half height 187
16 |   # mc.IMAGE_WIDTH = 621
17 |   # mc.IMAGE_HEIGHT = 187
18 | 
19 |   mc.WEIGHT_DECAY = 0.0001
20 |   mc.PROB_THRESH = 0.005
21 |   mc.TOP_N_DETECTION = 64
22 |   mc.PLOT_PROB_THRESH = 0.4
23 |   mc.NMS_THRESH = 0.4
24 |   mc.LEARNING_RATE = 0.01
25 |   mc.MOMENTUM = 0.9
26 |   mc.DECAY_STEPS = 10000
27 |   mc.LR_DECAY_FACTOR = 0.5
28 |   mc.BATCH_SIZE = 20
29 |   mc.LOSS_COEF_BBOX = 5.0
30 |   mc.LOSS_COEF_CONF_POS = 75.0
31 |   mc.LOSS_COEF_CONF_NEG = 100.0
32 |   mc.LOSS_COEF_CLASS = 1.0
33 |   mc.MAX_GRAD_NORM = 1.0
34 |   mc.DATA_AUGMENTATION = True
35 |   mc.DRIFT_X = 150
36 |   mc.DRIFT_Y = 100
37 |   mc.ANCHOR_BOX = set_anchors(mc)
38 |   mc.ANCHORS = len(mc.ANCHOR_BOX)
39 |   mc.ANCHOR_PER_GRID = 9
40 |   mc.USE_DECONV = False
41 |   mc.EXCLUDE_HARD_EXAMPLES = False
42 | 
43 |   return mc
44 | 
45 | def set_anchors(mc):
46 |   H, W, B = 24, 78, 9
47 |   anchor_shapes = np.reshape(
48 |       [np.array(
49 |           [[  36.,  37.], [ 366., 174.], [ 115.,  59.],
50 |            [ 162.,  87.], [  38.,  90.], [ 258., 173.],
51 |            [ 224., 108.], [  78., 170.], [  72.,  43.]])] * H * W,
52 |       (H, W, B, 2)
53 |   )
54 |   center_x = np.reshape(
55 |       np.transpose(
56 |           np.reshape(
57 |               np.array([np.arange(1, W+1)*float(mc.IMAGE_WIDTH)/(W+1)]*H*B),
58 |               (B, H, W)
59 |           ),
60 |           (1, 2, 0)
61 |       ),
62 |       (H, W, B, 1)
63 |   )
64 |   center_y
= np.reshape( 65 | np.transpose( 66 | np.reshape( 67 | np.array([np.arange(1, H+1)*float(mc.IMAGE_HEIGHT)/(H+1)]*W*B), 68 | (B, W, H) 69 | ), 70 | (2, 1, 0) 71 | ), 72 | (H, W, B, 1) 73 | ) 74 | anchors = np.reshape( 75 | np.concatenate((center_x, center_y, anchor_shapes), axis=3), 76 | (-1, 4) 77 | ) 78 | 79 | return anchors 80 | -------------------------------------------------------------------------------- /src/nets/vgg16_convDet.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """VGG16+ConvDet model.""" 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import os 10 | import sys 11 | 12 | import joblib 13 | from utils import util 14 | from easydict import EasyDict as edict 15 | import numpy as np 16 | import tensorflow as tf 17 | from nn_skeleton import ModelSkeleton 18 | 19 | 20 | class VGG16ConvDet(ModelSkeleton): 21 | def __init__(self, mc, gpu_id=0): 22 | with tf.device('/gpu:{}'.format(gpu_id)): 23 | ModelSkeleton.__init__(self, mc) 24 | 25 | self._add_forward_graph() 26 | self._add_interpretation_graph() 27 | self._add_loss_graph() 28 | self._add_train_graph() 29 | self._add_viz_graph() 30 | 31 | def _add_forward_graph(self): 32 | """Build the VGG-16 model.""" 33 | 34 | mc = self.mc 35 | if mc.LOAD_PRETRAINED_MODEL: 36 | assert tf.gfile.Exists(mc.PRETRAINED_MODEL_PATH), \ 37 | 'Cannot find pretrained model at the given path:' \ 38 | ' {}'.format(mc.PRETRAINED_MODEL_PATH) 39 | self.caffemodel_weight = joblib.load(mc.PRETRAINED_MODEL_PATH) 40 | 41 | with tf.variable_scope('conv1') as scope: 42 | conv1_1 = self._conv_layer( 43 | 'conv1_1', self.image_input, filters=64, size=3, stride=1, freeze=True) 44 | conv1_2 = self._conv_layer( 45 | 'conv1_2', conv1_1, filters=64, size=3, stride=1, freeze=True) 46 | pool1 = self._pooling_layer( 47 | 'pool1', conv1_2, size=2, stride=2) 48 | 49 | with tf.variable_scope('conv2') as scope: 50 | conv2_1 = self._conv_layer( 51 | 'conv2_1', pool1, filters=128, size=3, stride=1, freeze=True) 52 | conv2_2 = self._conv_layer( 53 | 'conv2_2', conv2_1, filters=128, size=3, stride=1, freeze=True) 54 | pool2 = self._pooling_layer( 55 | 'pool2', conv2_2, size=2, stride=2) 56 | 57 | with tf.variable_scope('conv3') as scope: 58 | conv3_1 = self._conv_layer( 59 | 'conv3_1', pool2, filters=256, size=3, stride=1) 60 | conv3_2 = self._conv_layer( 61 | 'conv3_2', conv3_1, filters=256, size=3, stride=1) 62 | conv3_3 = self._conv_layer( 63 | 'conv3_3', conv3_2, filters=256, size=3, stride=1) 64 | pool3 = self._pooling_layer( 65 | 'pool3', conv3_3, size=2, stride=2) 66 | 67 | with tf.variable_scope('conv4') as scope: 68 | conv4_1 = self._conv_layer( 69 | 'conv4_1', pool3, filters=512, size=3, stride=1) 70 | conv4_2 = self._conv_layer( 71 | 'conv4_2', conv4_1, filters=512, size=3, stride=1) 72 | conv4_3 = self._conv_layer( 73 | 'conv4_3', conv4_2, filters=512, size=3, stride=1) 74 | pool4 = self._pooling_layer( 75 | 'pool4', conv4_3, size=2, stride=2) 76 | 77 | with tf.variable_scope('conv5') as scope: 78 | conv5_1 = self._conv_layer( 79 | 'conv5_1', pool4, filters=512, size=3, stride=1) 80 | conv5_2 = self._conv_layer( 81 | 'conv5_2', conv5_1, filters=512, size=3, stride=1) 82 | conv5_3 = self._conv_layer( 83 | 'conv5_3', conv5_2, filters=512, size=3, stride=1) 84 | 85 | dropout5 = tf.nn.dropout(conv5_3, self.keep_prob, name='drop6') 86 | 87 | num_output = mc.ANCHOR_PER_GRID * (mc.CLASSES 
+ 1 + 4) 88 | self.preds = self._conv_layer( 89 | 'conv6', dropout5, filters=num_output, size=3, stride=1, 90 | padding='SAME', xavier=False, relu=False, stddev=0.0001) 91 | -------------------------------------------------------------------------------- /src/nets/squeezeDet.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """SqueezeDet model.""" 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import os 10 | import sys 11 | 12 | import joblib 13 | from utils import util 14 | from easydict import EasyDict as edict 15 | import numpy as np 16 | import tensorflow as tf 17 | from nn_skeleton import ModelSkeleton 18 | 19 | class SqueezeDet(ModelSkeleton): 20 | def __init__(self, mc, gpu_id=0): 21 | with tf.device('/gpu:{}'.format(gpu_id)): 22 | ModelSkeleton.__init__(self, mc) 23 | 24 | self._add_forward_graph() 25 | self._add_interpretation_graph() 26 | self._add_loss_graph() 27 | self._add_train_graph() 28 | self._add_viz_graph() 29 | 30 | def _add_forward_graph(self): 31 | """NN architecture.""" 32 | 33 | mc = self.mc 34 | if mc.LOAD_PRETRAINED_MODEL: 35 | assert tf.gfile.Exists(mc.PRETRAINED_MODEL_PATH), \ 36 | 'Cannot find pretrained model at the given path:' \ 37 | ' {}'.format(mc.PRETRAINED_MODEL_PATH) 38 | self.caffemodel_weight = joblib.load(mc.PRETRAINED_MODEL_PATH) 39 | 40 | conv1 = self._conv_layer( 41 | 'conv1', self.image_input, filters=64, size=3, stride=2, 42 | padding='SAME', freeze=True) 43 | pool1 = self._pooling_layer( 44 | 'pool1', conv1, size=3, stride=2, padding='SAME') 45 | 46 | fire2 = self._fire_layer( 47 | 'fire2', pool1, s1x1=16, e1x1=64, e3x3=64, freeze=False) 48 | fire3 = self._fire_layer( 49 | 'fire3', fire2, s1x1=16, e1x1=64, e3x3=64, freeze=False) 50 | pool3 = self._pooling_layer( 51 | 'pool3', fire3, size=3, stride=2, padding='SAME') 52 | 53 | fire4 = self._fire_layer( 54 | 'fire4', pool3, s1x1=32, e1x1=128, e3x3=128, freeze=False) 55 | fire5 = self._fire_layer( 56 | 'fire5', fire4, s1x1=32, e1x1=128, e3x3=128, freeze=False) 57 | pool5 = self._pooling_layer( 58 | 'pool5', fire5, size=3, stride=2, padding='SAME') 59 | 60 | fire6 = self._fire_layer( 61 | 'fire6', pool5, s1x1=48, e1x1=192, e3x3=192, freeze=False) 62 | fire7 = self._fire_layer( 63 | 'fire7', fire6, s1x1=48, e1x1=192, e3x3=192, freeze=False) 64 | fire8 = self._fire_layer( 65 | 'fire8', fire7, s1x1=64, e1x1=256, e3x3=256, freeze=False) 66 | fire9 = self._fire_layer( 67 | 'fire9', fire8, s1x1=64, e1x1=256, e3x3=256, freeze=False) 68 | 69 | # Two extra fire modules that are not trained before 70 | fire10 = self._fire_layer( 71 | 'fire10', fire9, s1x1=96, e1x1=384, e3x3=384, freeze=False) 72 | fire11 = self._fire_layer( 73 | 'fire11', fire10, s1x1=96, e1x1=384, e3x3=384, freeze=False) 74 | dropout11 = tf.nn.dropout(fire11, self.keep_prob, name='drop11') 75 | 76 | num_output = mc.ANCHOR_PER_GRID * (mc.CLASSES + 1 + 4) 77 | self.preds = self._conv_layer( 78 | 'conv12', dropout11, filters=num_output, size=3, stride=1, 79 | padding='SAME', xavier=False, relu=False, stddev=0.0001) 80 | 81 | def _fire_layer(self, layer_name, inputs, s1x1, e1x1, e3x3, stddev=0.01, 82 | freeze=False): 83 | """Fire layer constructor. 84 | 85 | Args: 86 | layer_name: layer name 87 | inputs: input tensor 88 | s1x1: number of 1x1 filters in squeeze layer. 89 | e1x1: number of 1x1 filters in expand layer. 
90 | e3x3: number of 3x3 filters in expand layer. 91 | freeze: if true, do not train parameters in this layer. 92 | Returns: 93 | fire layer operation. 94 | """ 95 | 96 | sq1x1 = self._conv_layer( 97 | layer_name+'/squeeze1x1', inputs, filters=s1x1, size=1, stride=1, 98 | padding='SAME', stddev=stddev, freeze=freeze) 99 | ex1x1 = self._conv_layer( 100 | layer_name+'/expand1x1', sq1x1, filters=e1x1, size=1, stride=1, 101 | padding='SAME', stddev=stddev, freeze=freeze) 102 | ex3x3 = self._conv_layer( 103 | layer_name+'/expand3x3', sq1x1, filters=e3x3, size=3, stride=1, 104 | padding='SAME', stddev=stddev, freeze=freeze) 105 | 106 | return tf.concat([ex1x1, ex3x3], 3, name=layer_name+'/concat') 107 | -------------------------------------------------------------------------------- /src/nets/squeezeDetPlus.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """SqueezeDet+ model.""" 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import os 10 | import sys 11 | 12 | import joblib 13 | from utils import util 14 | from easydict import EasyDict as edict 15 | import numpy as np 16 | import tensorflow as tf 17 | from nn_skeleton import ModelSkeleton 18 | 19 | class SqueezeDetPlus(ModelSkeleton): 20 | def __init__(self, mc, gpu_id=0): 21 | with tf.device('/gpu:{}'.format(gpu_id)): 22 | ModelSkeleton.__init__(self, mc) 23 | 24 | self._add_forward_graph() 25 | self._add_interpretation_graph() 26 | self._add_loss_graph() 27 | self._add_train_graph() 28 | self._add_viz_graph() 29 | 30 | def _add_forward_graph(self): 31 | """NN architecture.""" 32 | 33 | mc = self.mc 34 | if mc.LOAD_PRETRAINED_MODEL: 35 | assert tf.gfile.Exists(mc.PRETRAINED_MODEL_PATH), \ 36 | 'Cannot find pretrained model at the given path:' \ 37 | ' {}'.format(mc.PRETRAINED_MODEL_PATH) 38 | self.caffemodel_weight = joblib.load(mc.PRETRAINED_MODEL_PATH) 39 | 40 | conv1 = self._conv_layer( 41 | 'conv1', self.image_input, filters=96, size=7, stride=2, 42 | padding='VALID', freeze=True) 43 | pool1 = self._pooling_layer( 44 | 'pool1', conv1, size=3, stride=2, padding='VALID') 45 | 46 | fire2 = self._fire_layer( 47 | 'fire2', pool1, s1x1=96, e1x1=64, e3x3=64, freeze=False) 48 | fire3 = self._fire_layer( 49 | 'fire3', fire2, s1x1=96, e1x1=64, e3x3=64, freeze=False) 50 | fire4 = self._fire_layer( 51 | 'fire4', fire3, s1x1=192, e1x1=128, e3x3=128, freeze=False) 52 | pool4 = self._pooling_layer( 53 | 'pool4', fire4, size=3, stride=2, padding='VALID') 54 | 55 | fire5 = self._fire_layer( 56 | 'fire5', pool4, s1x1=192, e1x1=128, e3x3=128, freeze=False) 57 | fire6 = self._fire_layer( 58 | 'fire6', fire5, s1x1=288, e1x1=192, e3x3=192, freeze=False) 59 | fire7 = self._fire_layer( 60 | 'fire7', fire6, s1x1=288, e1x1=192, e3x3=192, freeze=False) 61 | fire8 = self._fire_layer( 62 | 'fire8', fire7, s1x1=384, e1x1=256, e3x3=256, freeze=False) 63 | pool8 = self._pooling_layer( 64 | 'pool8', fire8, size=3, stride=2, padding='VALID') 65 | 66 | fire9 = self._fire_layer( 67 | 'fire9', pool8, s1x1=384, e1x1=256, e3x3=256, freeze=False) 68 | 69 | # Two extra fire modules that are not trained before 70 | fire10 = self._fire_layer( 71 | 'fire10', fire9, s1x1=384, e1x1=256, e3x3=256, freeze=False) 72 | fire11 = self._fire_layer( 73 | 'fire11', fire10, s1x1=384, e1x1=256, e3x3=256, freeze=False) 74 | dropout11 = tf.nn.dropout(fire11, self.keep_prob, name='drop11') 75 | 76 | num_output = 
mc.ANCHOR_PER_GRID * (mc.CLASSES + 1 + 4) 77 | self.preds = self._conv_layer( 78 | 'conv12', dropout11, filters=num_output, size=3, stride=1, 79 | padding='SAME', xavier=False, relu=False, stddev=0.0001) 80 | 81 | def _fire_layer(self, layer_name, inputs, s1x1, e1x1, e3x3, stddev=0.01, 82 | freeze=False): 83 | """Fire layer constructor. 84 | 85 | Args: 86 | layer_name: layer name 87 | inputs: input tensor 88 | s1x1: number of 1x1 filters in squeeze layer. 89 | e1x1: number of 1x1 filters in expand layer. 90 | e3x3: number of 3x3 filters in expand layer. 91 | freeze: if true, do not train parameters in this layer. 92 | Returns: 93 | fire layer operation. 94 | """ 95 | 96 | sq1x1 = self._conv_layer( 97 | layer_name+'/squeeze1x1', inputs, filters=s1x1, size=1, stride=1, 98 | padding='SAME', stddev=stddev, freeze=freeze) 99 | ex1x1 = self._conv_layer( 100 | layer_name+'/expand1x1', sq1x1, filters=e1x1, size=1, stride=1, 101 | padding='SAME', stddev=stddev, freeze=freeze) 102 | ex3x3 = self._conv_layer( 103 | layer_name+'/expand3x3', sq1x1, filters=e3x3, size=3, stride=1, 104 | padding='SAME', stddev=stddev, freeze=freeze) 105 | 106 | return tf.concat([ex1x1, ex3x3], 3, name=layer_name+'/concat') 107 | -------------------------------------------------------------------------------- /src/config/config.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """Base Model configurations""" 4 | 5 | import os 6 | import os.path as osp 7 | import numpy as np 8 | from easydict import EasyDict as edict 9 | 10 | def base_model_config(dataset='PASCAL_VOC'): 11 | assert dataset.upper()=='PASCAL_VOC' or dataset.upper()=='KITTI', \ 12 | 'Currently only support PASCAL_VOC or KITTI dataset' 13 | 14 | cfg = edict() 15 | 16 | # Dataset used to train/val/test model. Now support PASCAL_VOC or KITTI 17 | cfg.DATASET = dataset.upper() 18 | 19 | if cfg.DATASET == 'PASCAL_VOC': 20 | # object categories to classify 21 | cfg.CLASS_NAMES = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 22 | 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 23 | 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 24 | 'sofa', 'train', 'tvmonitor') 25 | elif cfg.DATASET == 'KITTI': 26 | cfg.CLASS_NAMES = ('car', 'pedestrian', 'cyclist') 27 | 28 | # number of categories to classify 29 | cfg.CLASSES = len(cfg.CLASS_NAMES) 30 | 31 | # ROI pooling output width 32 | cfg.GRID_POOL_WIDTH = 7 33 | 34 | # ROI pooling output height 35 | cfg.GRID_POOL_HEIGHT = 7 36 | 37 | # parameter used in leaky ReLU 38 | cfg.LEAKY_COEF = 0.1 39 | 40 | # Probability to keep a node in dropout 41 | cfg.KEEP_PROB = 0.5 42 | 43 | # image width 44 | cfg.IMAGE_WIDTH = 224 45 | 46 | # image height 47 | cfg.IMAGE_HEIGHT = 224 48 | 49 | # anchor box, array of [cx, cy, w, h]. To be defined later 50 | cfg.ANCHOR_BOX = [] 51 | 52 | # number of anchor boxes 53 | cfg.ANCHORS = len(cfg.ANCHOR_BOX) 54 | 55 | # number of anchor boxes per grid 56 | cfg.ANCHOR_PER_GRID = -1 57 | 58 | # batch size 59 | cfg.BATCH_SIZE = 20 60 | 61 | # Only keep boxes with probability higher than this threshold 62 | cfg.PROB_THRESH = 0.005 63 | 64 | # Only plot boxes with probability higher than this threshold 65 | cfg.PLOT_PROB_THRESH = 0.5 66 | 67 | # Bounding boxes with IOU larger than this are going to be removed 68 | cfg.NMS_THRESH = 0.2 69 | 70 | # Pixel mean values (BGR order) as a (1, 1, 3) array. 
Below is the BGR mean
71 |   # of VGG16
72 |   cfg.BGR_MEANS = np.array([[[103.939, 116.779, 123.68]]])
73 | 
74 |   # loss coefficient for confidence regression
75 |   cfg.LOSS_COEF_CONF = 1.0
76 | 
77 |   # loss coefficient for classification regression
78 |   cfg.LOSS_COEF_CLASS = 1.0
79 | 
80 |   # loss coefficient for bounding box regression
81 |   cfg.LOSS_COEF_BBOX = 10.0
82 | 
83 |   # reduce step size after this many steps
84 |   cfg.DECAY_STEPS = 10000
85 | 
86 |   # multiply the learning rate by this factor
87 |   cfg.LR_DECAY_FACTOR = 0.1
88 | 
89 |   # learning rate
90 |   cfg.LEARNING_RATE = 0.005
91 | 
92 |   # momentum
93 |   cfg.MOMENTUM = 0.9
94 | 
95 |   # weight decay
96 |   cfg.WEIGHT_DECAY = 0.0005
97 | 
98 |   # whether to load a pre-trained model
99 |   cfg.LOAD_PRETRAINED_MODEL = True
100 | 
101 |   # path to load the pre-trained model
102 |   cfg.PRETRAINED_MODEL_PATH = ''
103 | 
104 |   # print log to console in debug mode
105 |   cfg.DEBUG_MODE = False
106 | 
107 |   # a small value used to prevent numerical instability
108 |   cfg.EPSILON = 1e-16
109 | 
110 |   # threshold for safe exponential operation
111 |   cfg.EXP_THRESH=1.0
112 | 
113 |   # gradients with norm larger than this are going to be clipped.
114 |   cfg.MAX_GRAD_NORM = 10.0
115 | 
116 |   # Whether to do data augmentation
117 |   cfg.DATA_AUGMENTATION = False
118 | 
119 |   # The range to randomly shift the image width
120 |   cfg.DRIFT_X = 0
121 | 
122 |   # The range to randomly shift the image height
123 |   cfg.DRIFT_Y = 0
124 | 
125 |   # Whether to exclude images harder than hard-category. Only useful for the
126 |   # KITTI dataset.
127 |   cfg.EXCLUDE_HARD_EXAMPLES = True
128 | 
129 |   # small value used in batch normalization to prevent dividing by 0. The
130 |   # default value here is the same as caffe's default value.
131 |   cfg.BATCH_NORM_EPSILON = 1e-5
132 | 
133 |   # number of threads to fetch data
134 |   cfg.NUM_THREAD = 4
135 | 
136 |   # capacity for FIFOQueue
137 |   cfg.QUEUE_CAPACITY = 100
138 | 
139 |   # indicate if the model is in training mode
140 |   cfg.IS_TRAINING = False
141 | 
142 |   return cfg
143 | 
--------------------------------------------------------------------------------
/src/dataset/pascal_voc.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Image database class for pascal voc"""
4 | 
5 | import cv2
6 | import os
7 | import numpy as np
8 | import xml.etree.ElementTree as ET
9 | 
10 | from utils.util import bbox_transform_inv
11 | from dataset.imdb import imdb
12 | from dataset.voc_eval import voc_eval
13 | 
14 | class pascal_voc(imdb):
15 |   def __init__(self, image_set, year, data_path, mc):
16 |     imdb.__init__(self, 'voc_'+year+'_'+image_set, mc)
17 |     self._year = year
18 |     self._image_set = image_set
19 |     self._data_root_path = data_path
20 |     self._data_path = os.path.join(self._data_root_path, 'VOC' + self._year)
21 |     self._classes = self.mc.CLASS_NAMES
22 |     self._class_to_idx = dict(zip(self.classes, xrange(self.num_classes)))
23 | 
24 |     # a list of string indices of images in the directory
25 |     self._image_idx = self._load_image_set_idx()
26 |     # a dict of image_idx -> [[cx, cy, w, h, cls_idx]].
x,y,w,h are not divided by 27 | # the image width and height 28 | self._rois = self._load_pascal_annotation() 29 | 30 | ## batch reader ## 31 | self._perm_idx = None 32 | self._cur_idx = 0 33 | # TODO(bichen): add a random seed as parameter 34 | self._shuffle_image_idx() 35 | 36 | def _load_image_set_idx(self): 37 | image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main', 38 | self._image_set+'.txt') 39 | assert os.path.exists(image_set_file), \ 40 | 'File does not exist: {}'.format(image_set_file) 41 | 42 | with open(image_set_file) as f: 43 | image_idx = [x.strip() for x in f.readlines()] 44 | return image_idx 45 | 46 | def _image_path_at(self, idx): 47 | image_path = os.path.join(self._data_path, 'JPEGImages', idx+'.jpg') 48 | assert os.path.exists(image_path), \ 49 | 'Image does not exist: {}'.format(image_path) 50 | return image_path 51 | 52 | def _load_pascal_annotation(self): 53 | idx2annotation = {} 54 | for index in self._image_idx: 55 | filename = os.path.join(self._data_path, 'Annotations', index+'.xml') 56 | tree = ET.parse(filename) 57 | objs = tree.findall('object') 58 | objs = [obj for obj in objs if int(obj.find('difficult').text) == 0] 59 | bboxes = [] 60 | for obj in objs: 61 | bbox = obj.find('bndbox') 62 | # Make pixel indexes 0-based 63 | xmin = float(bbox.find('xmin').text) - 1 64 | xmax = float(bbox.find('xmax').text) - 1 65 | ymin = float(bbox.find('ymin').text) - 1 66 | ymax = float(bbox.find('ymax').text) - 1 67 | assert xmin >= 0.0 and xmin <= xmax, \ 68 | 'Invalid bounding box x-coord xmin {} or xmax {} at {}.xml' \ 69 | .format(xmin, xmax, index) 70 | assert ymin >= 0.0 and ymin <= ymax, \ 71 | 'Invalid bounding box y-coord ymin {} or ymax {} at {}.xml' \ 72 | .format(ymin, ymax, index) 73 | x, y, w, h = bbox_transform_inv([xmin, ymin, xmax, ymax]) 74 | cls = self._class_to_idx[obj.find('name').text.lower().strip()] 75 | bboxes.append([x, y, w, h, cls]) 76 | 77 | idx2annotation[index] = bboxes 78 | 79 | return idx2annotation 80 | 81 | def evaluate_detections(self, eval_dir, global_step, all_boxes): 82 | """Evaluate detection results. 83 | Args: 84 | eval_dir: directory to write evaluation logs 85 | global_step: step of the checkpoint 86 | all_boxes: all_boxes[cls][image] = N x 5 arrays of 87 | [xmin, ymin, xmax, ymax, score] 88 | Returns: 89 | aps: array of average precisions. 90 | names: class names corresponding to each ap 91 | """ 92 | det_file_dir = os.path.join( 93 | eval_dir, 'detection_files_{:s}'.format(global_step)) 94 | if not os.path.isdir(det_file_dir): 95 | os.mkdir(det_file_dir) 96 | det_file_path_template = os.path.join(det_file_dir, '{:s}.txt') 97 | 98 | for cls_idx, cls in enumerate(self._classes): 99 | det_file_name = det_file_path_template.format(cls) 100 | with open(det_file_name, 'wt') as f: 101 | for im_idx, index in enumerate(self._image_idx): 102 | dets = all_boxes[cls_idx][im_idx] 103 | # VOC expects 1-based indices 104 | for k in xrange(len(dets)): 105 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 
106 | format(index, dets[k][-1], 107 | dets[k][0]+1, dets[k][1]+1, 108 | dets[k][2]+1, dets[k][3]+1) 109 | ) 110 | 111 | # Evaluate detection results 112 | annopath = os.path.join( 113 | self._data_root_path, 114 | 'VOC'+self._year, 115 | 'Annotations', 116 | '{:s}.xml' 117 | ) 118 | imagesetfile = os.path.join( 119 | self._data_root_path, 120 | 'VOC'+self._year, 121 | 'ImageSets', 122 | 'Main', 123 | self._image_set+'.txt' 124 | ) 125 | cachedir = os.path.join(self._data_root_path, 'annotations_cache') 126 | aps = [] 127 | use_07_metric = True if int(self._year) < 2010 else False 128 | for i, cls in enumerate(self._classes): 129 | filename = det_file_path_template.format(cls) 130 | _, _, ap = voc_eval( 131 | filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, 132 | use_07_metric=use_07_metric) 133 | aps += [ap] 134 | print ('{:s}: AP = {:.4f}'.format(cls, ap)) 135 | 136 | print ('Mean AP = {:.4f}'.format(np.mean(aps))) 137 | return aps, self._classes 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## _SqueezeDet:_ Unified, Small, Low Power Fully Convolutional Neural Networks for Real-Time Object Detection for Autonomous Driving 2 | By Bichen Wu, Forrest Iandola, Peter H. Jin, Kurt Keutzer (UC Berkeley & DeepScale) 3 | 4 | This repository contains a tensorflow implementation of SqueezeDet, a convolutional neural network based object detector described in our paper: https://arxiv.org/abs/1612.01051. If you find this work useful for your research, please consider citing: 5 | 6 | @inproceedings{squeezedet, 7 | Author = {Bichen Wu and Forrest Iandola and Peter H. Jin and Kurt Keutzer}, 8 | Title = {SqueezeDet: Unified, Small, Low Power Fully Convolutional Neural Networks for Real-Time Object Detection for Autonomous Driving}, 9 | Journal = {arXiv:1612.01051}, 10 | Year = {2016} 11 | } 12 | 13 | ## Installation: 14 | 15 | The following instructions are written for Linux-based distros. 16 | 17 | - Clone the SqueezeDet repository: 18 | 19 | ```Shell 20 | git clone https://github.com/BichenWuUCB/squeezeDet.git 21 | ``` 22 | Let's call the top level directory of SqueezeDet `$SQDT_ROOT`. 23 | 24 | - (Optional) Setup your own virtual environment. 25 | 26 | 1. The following assumes `python` is the Python2.7 executable. Navigate to your user home directory, and create the virtual environment there. 27 | 28 | ```Shell 29 | cd ~ 30 | virtualenv env --python=python 31 | ``` 32 | 33 | 2. Launch the virtual environment. 34 | 35 | ```Shell 36 | source env/bin/activate 37 | ``` 38 | 39 | - Use pip to install required Python packages: 40 | 41 | ```Shell 42 | pip install -r requirements.txt 43 | ``` 44 | ## Demo: 45 | - Download SqueezeDet model parameters from [here](https://www.dropbox.com/s/a6t3er8f03gdl4z/model_checkpoints.tgz?dl=0), untar it, and put it under `$SQDT_ROOT/data/` If you are using command line, type: 46 | 47 | ```Shell 48 | cd $SQDT_ROOT/data/ 49 | wget https://www.dropbox.com/s/a6t3er8f03gdl4z/model_checkpoints.tgz 50 | tar -xzvf model_checkpoints.tgz 51 | rm model_checkpoints.tgz 52 | ``` 53 | 54 | 55 | - Now we can run the demo. 
To detect the sample image `$SQDT_ROOT/data/sample.png`,
56 | 
57 | ```Shell
58 | cd $SQDT_ROOT/
59 | python ./src/demo.py
60 | ```
61 | If the installation is correct, the detector should generate this image: ![alt text](https://github.com/BichenWuUCB/squeezeDet/blob/master/README/out_sample.png)
62 | 
63 | To detect other image(s), use the flag `--input_path=./data/*.png` to point to the input image(s). Input images will be scaled to a resolution of 1242x375 (the KITTI image resolution), so detection works best when the original resolution is close to that.
64 | 
65 | - SqueezeDet is a real-time object detector, which can also be used to detect videos. The video demo will be released later.
66 | 
67 | ## Training/Validation:
68 | - Download the KITTI object detection dataset: [images](http://www.cvlibs.net/download.php?file=data_object_image_2.zip) and [labels](http://www.cvlibs.net/download.php?file=data_object_label_2.zip). Put them under `$SQDT_ROOT/data/KITTI/`. Unzip them, then you will get two directories: `$SQDT_ROOT/data/KITTI/training/` and `$SQDT_ROOT/data/KITTI/testing/`.
69 | 
70 | - Now we need to split the training data into a training set and a validation set.
71 | 
72 | ```Shell
73 | cd $SQDT_ROOT/data/KITTI/
74 | mkdir ImageSets
75 | cd ./ImageSets
76 | ls ../training/image_2/ | grep ".png" | sed s/.png// > trainval.txt
77 | ```
78 | `trainval.txt` contains indices to all the images in the training data. In our experiments, we randomly split half of the indices in `trainval.txt` into `train.txt` to form a training set, and the rest into `val.txt` to form a validation set. For your convenience, we provide a script to split the train-val set automatically. Simply run
79 | 
80 | ```Shell
81 | cd $SQDT_ROOT/data/
82 | python random_split_train_val.py
83 | ```
84 | 
85 | then you should get `train.txt` and `val.txt` under `$SQDT_ROOT/data/KITTI/ImageSets`.
86 | 
87 | When the above two steps are finished, the structure of `$SQDT_ROOT/data/KITTI/` should at least contain:
88 | 
89 | ```Shell
90 | $SQDT_ROOT/data/KITTI/
91 |   |->training/
92 |   |     |-> image_2/00****.png
93 |   |     L-> label_2/00****.txt
94 |   |->testing/
95 |   |     L-> image_2/00****.png
96 |   L->ImageSets/
97 |         |-> trainval.txt
98 |         |-> train.txt
99 |         L-> val.txt
100 | ```
101 | 
102 | - Next, download the CNN models pretrained for ImageNet classification:
103 | ```Shell
104 | cd $SQDT_ROOT/data/
105 | # SqueezeNet
106 | wget https://www.dropbox.com/s/fzvtkc42hu3xw47/SqueezeNet.tgz
107 | tar -xzvf SqueezeNet.tgz
108 | # ResNet50
109 | wget https://www.dropbox.com/s/p65lktictdq011t/ResNet.tgz
110 | tar -xzvf ResNet.tgz
111 | # VGG16
112 | wget https://www.dropbox.com/s/zxd72nj012lzrlf/VGG16.tgz
113 | tar -xzvf VGG16.tgz
114 | ```
115 | 
116 | - Now we can start training. The training script can be found in `$SQDT_ROOT/scripts/train.sh`, which contains commands to train 4 models: SqueezeDet, SqueezeDet+, VGG16+ConvDet, and ResNet50+ConvDet.
117 | ```Shell
118 | cd $SQDT_ROOT/
119 | ./scripts/train.sh -net (squeezeDet|squeezeDet+|vgg16|resnet50) -train_dir /tmp/bichen/logs/squeezedet -gpu 0
120 | ```
121 | 
122 | Training logs are saved to the directory specified by `-train_dir`. GPU id is specified by `-gpu`. The network to train is specified by `-net`.
123 | 
124 | - Before evaluation, you need to first compile the official evaluation script of the KITTI dataset:
125 | ```Shell
126 | cd $SQDT_ROOT/src/dataset/kitti-eval
127 | make
128 | ```
129 | 
130 | - Then, you can launch the evaluation script (in parallel with training) by
131 | 
132 | ```Shell
133 | cd $SQDT_ROOT/
134 | ./scripts/eval.sh -net (squeezeDet|squeezeDet+|vgg16|resnet50) -eval_dir /tmp/bichen/logs/squeezedet -image_set (train|val) -gpu 1
135 | ```
136 | 
137 | Note that `-train_dir` in the training script should be the same as `-eval_dir` in the evaluation script, to make it easy for tensorboard to load logs.
138 | 
139 | You can run two evaluation scripts to simultaneously evaluate the model on the training and validation sets. The training script keeps dumping checkpoints (model parameters) to the training directory, once every 500 steps as set by `--checkpoint_step` in `train.sh` (the step size can be changed). Once a new checkpoint is saved, the evaluation threads load it and evaluate it on the training and validation sets.
140 | 
141 | - Finally, to monitor the training and evaluation process, you can use tensorboard by
142 | 
143 | ```Shell
144 | tensorboard --logdir=$LOG_DIR
145 | ```
146 | Here, `$LOG_DIR` is the directory where your training and evaluation threads dump log events, which should be the same as `-train_dir` and `-eval_dir` specified in `train.sh` and `eval.sh`. From tensorboard, you should be able to see a lot of information including loss, average precision, error analysis, example detections, model visualization, etc.
147 | 
148 | ![alt text](https://github.com/BichenWuUCB/squeezeDet/blob/master/README/detection_analysis.png)
149 | ![alt text](https://github.com/BichenWuUCB/squeezeDet/blob/master/README/graph.png)
150 | ![alt text](https://github.com/BichenWuUCB/squeezeDet/blob/master/README/det_img.png)
151 | 
--------------------------------------------------------------------------------
/src/nets/resnet50_convDet.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """ResNet50+ConvDet model."""
4 | 
5 | from __future__ import absolute_import
6 | from __future__ import division
7 | from __future__ import print_function
8 | 
9 | import os
10 | import sys
11 | 
12 | import joblib
13 | from utils import util
14 | from easydict import EasyDict as edict
15 | import numpy as np
16 | import tensorflow as tf
17 | from nn_skeleton import ModelSkeleton
18 | 
19 | 
20 | class ResNet50ConvDet(ModelSkeleton):
21 |   def __init__(self, mc, gpu_id=0):
22 |     with tf.device('/gpu:{}'.format(gpu_id)):
23 |       ModelSkeleton.__init__(self, mc)
24 | 
25 |       self._add_forward_graph()
26 |       self._add_interpretation_graph()
27 |       self._add_loss_graph()
28 |       self._add_train_graph()
29 |       self._add_viz_graph()
30 | 
31 |   def _add_forward_graph(self):
32 |     """NN architecture."""
33 | 
34 |     mc = self.mc
35 |     if mc.LOAD_PRETRAINED_MODEL:
36 |       assert tf.gfile.Exists(mc.PRETRAINED_MODEL_PATH), \
37 |           'Cannot find pretrained model at the given path:' \
38 |           '  {}'.format(mc.PRETRAINED_MODEL_PATH)
39 |       self.caffemodel_weight = joblib.load(mc.PRETRAINED_MODEL_PATH)
40 | 
41 |     conv1 = self._conv_bn_layer(
42 |         self.image_input, 'conv1', 'bn_conv1', 'scale_conv1', filters=64,
43 |         size=7, stride=2, freeze=True, conv_with_bias=True)
44 |     pool1 = self._pooling_layer(
45 |         'pool1', conv1, size=3, stride=2, padding='VALID')
46 | 
47 |     with tf.variable_scope('conv2_x') as scope:
48 |       with tf.variable_scope('res2a'):
49 |         branch1 = self._conv_bn_layer(
branch1 = self._conv_bn_layer( 50 | pool1, 'res2a_branch1', 'bn2a_branch1', 'scale2a_branch1', 51 | filters=256, size=1, stride=1, freeze=True, relu=False) 52 | branch2 = self._res_branch( 53 | pool1, layer_name='2a', in_filters=64, out_filters=256, 54 | down_sample=False, freeze=True) 55 | res2a = tf.nn.relu(branch1+branch2, 'relu') 56 | with tf.variable_scope('res2b'): 57 | branch2 = self._res_branch( 58 | res2a, layer_name='2b', in_filters=64, out_filters=256, 59 | down_sample=False, freeze=True) 60 | res2b = tf.nn.relu(res2a+branch2, 'relu') 61 | with tf.variable_scope('res2c'): 62 | branch2 = self._res_branch( 63 | res2b, layer_name='2c', in_filters=64, out_filters=256, 64 | down_sample=False, freeze=True) 65 | res2c = tf.nn.relu(res2b+branch2, 'relu') 66 | 67 | with tf.variable_scope('conv3_x') as scope: 68 | with tf.variable_scope('res3a'): 69 | branch1 = self._conv_bn_layer( 70 | res2c, 'res3a_branch1', 'bn3a_branch1', 'scale3a_branch1', 71 | filters=512, size=1, stride=2, freeze=True, relu=False) 72 | branch2 = self._res_branch( 73 | res2c, layer_name='3a', in_filters=128, out_filters=512, 74 | down_sample=True, freeze=True) 75 | res3a = tf.nn.relu(branch1+branch2, 'relu') 76 | with tf.variable_scope('res3b'): 77 | branch2 = self._res_branch( 78 | res3a, layer_name='3b', in_filters=128, out_filters=512, 79 | down_sample=False, freeze=True) 80 | res3b = tf.nn.relu(res3a+branch2, 'relu') 81 | with tf.variable_scope('res3c'): 82 | branch2 = self._res_branch( 83 | res3b, layer_name='3c', in_filters=128, out_filters=512, 84 | down_sample=False, freeze=True) 85 | res3c = tf.nn.relu(res3b+branch2, 'relu') 86 | with tf.variable_scope('res3d'): 87 | branch2 = self._res_branch( 88 | res3c, layer_name='3d', in_filters=128, out_filters=512, 89 | down_sample=False, freeze=True) 90 | res3d = tf.nn.relu(res3c+branch2, 'relu') 91 | 92 | with tf.variable_scope('conv4_x') as scope: 93 | with tf.variable_scope('res4a'): 94 | branch1 = self._conv_bn_layer( 95 | res3d, 'res4a_branch1', 'bn4a_branch1', 'scale4a_branch1', 96 | filters=1024, size=1, stride=2, relu=False) 97 | branch2 = self._res_branch( 98 | res3d, layer_name='4a', in_filters=256, out_filters=1024, 99 | down_sample=True) 100 | res4a = tf.nn.relu(branch1+branch2, 'relu') 101 | with tf.variable_scope('res4b'): 102 | branch2 = self._res_branch( 103 | res4a, layer_name='4b', in_filters=256, out_filters=1024, 104 | down_sample=False) 105 | res4b = tf.nn.relu(res4a+branch2, 'relu') 106 | with tf.variable_scope('res4c'): 107 | branch2 = self._res_branch( 108 | res4b, layer_name='4c', in_filters=256, out_filters=1024, 109 | down_sample=False) 110 | res4c = tf.nn.relu(res4b+branch2, 'relu') 111 | with tf.variable_scope('res4d'): 112 | branch2 = self._res_branch( 113 | res4c, layer_name='4d', in_filters=256, out_filters=1024, 114 | down_sample=False) 115 | res4d = tf.nn.relu(res4c+branch2, 'relu') 116 | with tf.variable_scope('res4e'): 117 | branch2 = self._res_branch( 118 | res4d, layer_name='4e', in_filters=256, out_filters=1024, 119 | down_sample=False) 120 | res4e = tf.nn.relu(res4d+branch2, 'relu') 121 | with tf.variable_scope('res4f'): 122 | branch2 = self._res_branch( 123 | res4e, layer_name='4f', in_filters=256, out_filters=1024, 124 | down_sample=False) 125 | res4f = tf.nn.relu(res4e+branch2, 'relu') 126 | 127 | dropout4 = tf.nn.dropout(res4f, self.keep_prob, name='drop4') 128 | 129 | num_output = mc.ANCHOR_PER_GRID * (mc.CLASSES + 1 + 4) 130 | self.preds = self._conv_layer( 131 | 'conv5', dropout4, filters=num_output, size=3, stride=1, 
132 |         padding='SAME', xavier=False, relu=False, stddev=0.0001)
133 | 
134 |   def _res_branch(
135 |       self, inputs, layer_name, in_filters, out_filters, down_sample=False,
136 |       freeze=False):
137 |     """Residual branch constructor.
138 | 
139 |     Args:
140 |       inputs: input tensor
141 |       layer_name: layer name
142 |       in_filters: number of filters in the XX_branch2a and XX_branch2b layers.
143 |       out_filters: number of filters in the XX_branch2c layer.
144 |       down_sample: if true, down-sample the input feature map
145 |       freeze: if true, do not change parameters in this layer
146 |     Returns:
147 |       A residual branch output operation.
148 |     """
149 |     with tf.variable_scope('res'+layer_name+'_branch2'):
150 |       stride = 2 if down_sample else 1
151 |       output = self._conv_bn_layer(
152 |           inputs,
153 |           conv_param_name='res'+layer_name+'_branch2a',
154 |           bn_param_name='bn'+layer_name+'_branch2a',
155 |           scale_param_name='scale'+layer_name+'_branch2a',
156 |           filters=in_filters, size=1, stride=stride, freeze=freeze)
157 |       output = self._conv_bn_layer(
158 |           output,
159 |           conv_param_name='res'+layer_name+'_branch2b',
160 |           bn_param_name='bn'+layer_name+'_branch2b',
161 |           scale_param_name='scale'+layer_name+'_branch2b',
162 |           filters=in_filters, size=3, stride=1, freeze=freeze)
163 |       output = self._conv_bn_layer(
164 |           output,
165 |           conv_param_name='res'+layer_name+'_branch2c',
166 |           bn_param_name='bn'+layer_name+'_branch2c',
167 |           scale_param_name='scale'+layer_name+'_branch2c',
168 |           filters=out_filters, size=1, stride=1, freeze=freeze, relu=False)
169 |     return output
--------------------------------------------------------------------------------
/src/utils/util.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Utility functions."""
4 | 
5 | import numpy as np
6 | import time
7 | import tensorflow as tf
8 | 
9 | def iou(box1, box2):
10 |   """Compute the Intersection-Over-Union of two given boxes.
11 | 
12 |   Args:
13 |     box1: array of 4 elements [cx, cy, width, height].
14 |     box2: same as above
15 |   Returns:
16 |     iou: a float number in range [0, 1]. iou of the two boxes.
17 |   """
18 | 
19 |   lr = min(box1[0]+0.5*box1[2], box2[0]+0.5*box2[2]) - \
20 |       max(box1[0]-0.5*box1[2], box2[0]-0.5*box2[2])
21 |   if lr > 0:
22 |     tb = min(box1[1]+0.5*box1[3], box2[1]+0.5*box2[3]) - \
23 |         max(box1[1]-0.5*box1[3], box2[1]-0.5*box2[3])
24 |     if tb > 0:
25 |       intersection = tb*lr
26 |       union = box1[2]*box1[3]+box2[2]*box2[3]-intersection
27 | 
28 |       return intersection/union
29 | 
30 |   return 0
31 | 
32 | def batch_iou(boxes, box):
33 |   """Compute the Intersection-Over-Union of a batch of boxes with another
34 |   box.
35 | 
36 |   Args:
37 |     boxes: 2D array of boxes, each of form [cx, cy, width, height].
38 |     box: a single array of [cx, cy, width, height]
39 |   Returns:
40 |     ious: array of float numbers in range [0, 1], one per box in `boxes`.
41 |   """
42 |   lr = np.maximum(
43 |       np.minimum(boxes[:,0]+0.5*boxes[:,2], box[0]+0.5*box[2]) - \
44 |       np.maximum(boxes[:,0]-0.5*boxes[:,2], box[0]-0.5*box[2]),
45 |       0
46 |   )
47 |   tb = np.maximum(
48 |       np.minimum(boxes[:,1]+0.5*boxes[:,3], box[1]+0.5*box[3]) - \
49 |       np.maximum(boxes[:,1]-0.5*boxes[:,3], box[1]-0.5*box[3]),
50 |       0
51 |   )
52 |   inter = lr*tb
53 |   union = boxes[:,2]*boxes[:,3] + box[2]*box[3] - inter
54 |   return inter/union
55 | 
56 | def nms(boxes, probs, threshold):
57 |   """Non-Maximum suppression.
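
  Boxes are processed in descending score order; any box whose IoU with a
  higher-scoring box exceeds the threshold is discarded.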
58 |   Args:
59 |     boxes: array of [cx, cy, w, h] (center format)
60 |     probs: array of probabilities
61 |     threshold: two boxes are considered overlapping if their IOU is larger
62 |         than this threshold
63 | 
64 |   Returns:
65 |     keep: array of True or False.
66 |   """
67 | 
68 |   order = probs.argsort()[::-1]
69 |   keep = [True]*len(order)
70 | 
71 |   for i in range(len(order)-1):
72 |     ovps = batch_iou(boxes[order[i+1:]], boxes[order[i]])
73 |     for j, ov in enumerate(ovps):
74 |       if ov > threshold:
75 |         keep[order[j+i+1]] = False
76 |   return keep
77 | 
78 | # TODO(bichen): this is not equivalent to full NMS. Need to improve it.
79 | def recursive_nms(boxes, probs, threshold, form='center'):
80 |   """Recursive Non-Maximum suppression.
81 |   Args:
82 |     boxes: array of [cx, cy, w, h] (center format) or [xmin, ymin, xmax, ymax]
83 |     probs: array of probabilities
84 |     threshold: two boxes are considered overlapping if their IOU is larger than
85 |         this threshold
86 |     form: 'center' or 'diagonal'
87 |   Returns:
88 |     keep: array of True or False.
89 |   """
90 | 
91 |   assert form == 'center' or form == 'diagonal', \
92 |       'bounding box format not accepted: {}.'.format(form)
93 | 
94 |   if form == 'center':
95 |     # convert to diagonal format
96 |     boxes = np.array([bbox_transform(b) for b in boxes])
97 | 
98 |   areas = (boxes[:, 2]-boxes[:, 0])*(boxes[:, 3]-boxes[:, 1])
99 |   hidx = boxes[:, 0].argsort()
100 |   keep = [True]*len(hidx)
101 | 
102 |   def _nms(hidx):
103 |     order = probs[hidx].argsort()[::-1]
104 | 
105 |     for idx in range(len(order)):
106 |       if not keep[hidx[order[idx]]]:
107 |         continue
108 |       xx2 = boxes[hidx[order[idx]], 2]
109 |       for jdx in range(idx+1, len(order)):
110 |         if not keep[hidx[order[jdx]]]:
111 |           continue
112 |         xx1 = boxes[hidx[order[jdx]], 0]
113 |         if xx2 < xx1:
114 |           break
115 |         w = xx2 - xx1
116 |         yy1 = max(boxes[hidx[order[idx]], 1], boxes[hidx[order[jdx]], 1])
117 |         yy2 = min(boxes[hidx[order[idx]], 3], boxes[hidx[order[jdx]], 3])
118 |         if yy2 <= yy1:
119 |           continue
120 |         h = yy2-yy1
121 |         inter = w*h
122 |         iou = inter/(areas[hidx[order[idx]]]+areas[hidx[order[jdx]]]-inter)
123 |         if iou > threshold:
124 |           keep[hidx[order[jdx]]] = False
125 | 
126 |   def _recur(hidx):
127 |     if len(hidx) <= 20:
128 |       _nms(hidx)
129 |     else:
130 |       mid = len(hidx)//2  # integer division, so slicing also works on Python 3
131 |       _recur(hidx[:mid])
132 |       _recur(hidx[mid:])
133 |       _nms([idx for idx in hidx if keep[idx]])
134 | 
135 |   _recur(hidx)
136 | 
137 |   return keep
138 | 
139 | def sparse_to_dense(sp_indices, output_shape, values, default_value=0):
140 |   """Build a dense matrix from sparse representations.
141 | 
142 |   Args:
143 |     sp_indices: A [0-2]-D array that contains the index to place values.
144 |     output_shape: shape of the dense matrix.
145 |     values: A {0,1}-D array of values to place at the corresponding rows of
146 |         sp_indices.
147 |     default_value: values to set for indices not specified in sp_indices.
148 |   Return:
149 |     A dense numpy N-D array with shape output_shape.
150 |   """
151 | 
152 |   assert len(sp_indices) == len(values), \
153 |       'Length of sp_indices is not equal to length of values'
154 | 
155 |   array = np.ones(output_shape) * default_value
156 |   for idx, value in zip(sp_indices, values):
157 |     array[tuple(idx)] = value
158 |   return array
159 | 
160 | def bgr_to_rgb(ims):
161 |   """Convert a list of images from BGR format to RGB format."""
162 |   out = []
163 |   for im in ims:
164 |     out.append(im[:,:,::-1])
165 |   return out
166 | 
167 | def bbox_transform(bbox):
168 |   """convert a bbox of form [cx, cy, w, h] to [xmin, ymin, xmax, ymax].
Works 169 | for numpy array or list of tensors. 170 | """ 171 | with tf.variable_scope('bbox_transform') as scope: 172 | cx, cy, w, h = bbox 173 | out_box = [[]]*4 174 | out_box[0] = cx-w/2 175 | out_box[1] = cy-h/2 176 | out_box[2] = cx+w/2 177 | out_box[3] = cy+h/2 178 | 179 | return out_box 180 | 181 | def bbox_transform_inv(bbox): 182 | """convert a bbox of form [xmin, ymin, xmax, ymax] to [cx, cy, w, h]. Works 183 | for numpy array or list of tensors. 184 | """ 185 | with tf.variable_scope('bbox_transform_inv') as scope: 186 | xmin, ymin, xmax, ymax = bbox 187 | out_box = [[]]*4 188 | 189 | width = xmax - xmin + 1.0 190 | height = ymax - ymin + 1.0 191 | out_box[0] = xmin + 0.5*width 192 | out_box[1] = ymin + 0.5*height 193 | out_box[2] = width 194 | out_box[3] = height 195 | 196 | return out_box 197 | 198 | class Timer(object): 199 | def __init__(self): 200 | self.total_time = 0.0 201 | self.calls = 0 202 | self.start_time = 0.0 203 | self.duration = 0.0 204 | self.average_time = 0.0 205 | 206 | def tic(self): 207 | self.start_time = time.time() 208 | 209 | def toc(self, average=True): 210 | self.duration = time.time() - self.start_time 211 | self.total_time += self.duration 212 | self.calls += 1 213 | self.average_time = self.total_time/self.calls 214 | if average: 215 | return self.average_time 216 | else: 217 | return self.duration 218 | 219 | def safe_exp(w, thresh): 220 | """Safe exponential function for tensors.""" 221 | 222 | slope = np.exp(thresh) 223 | with tf.variable_scope('safe_exponential'): 224 | lin_bool = w > thresh 225 | lin_region = tf.to_float(lin_bool) 226 | 227 | lin_out = slope*(w - thresh + 1.) 228 | exp_out = tf.exp(tf.where(lin_bool, tf.zeros_like(w), w)) 229 | 230 | out = lin_region*lin_out + (1.-lin_region)*exp_out 231 | return out 232 | 233 | 234 | -------------------------------------------------------------------------------- /src/dataset/voc_eval.py: -------------------------------------------------------------------------------- 1 | # This file was from 2 | # https://raw.githubusercontent.com/rbgirshick/py-faster-rcnn/master/lib/datasets/voc_eval.py 3 | # -------------------------------------------------------- 4 | # Fast/er R-CNN 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Written by Bharath Hariharan 7 | # -------------------------------------------------------- 8 | 9 | import xml.etree.ElementTree as ET 10 | import os 11 | import cPickle 12 | import numpy as np 13 | 14 | def parse_rec(filename): 15 | """ Parse a PASCAL VOC xml file """ 16 | tree = ET.parse(filename) 17 | objects = [] 18 | for obj in tree.findall('object'): 19 | obj_struct = {} 20 | obj_struct['name'] = obj.find('name').text 21 | obj_struct['pose'] = obj.find('pose').text 22 | obj_struct['truncated'] = int(obj.find('truncated').text) 23 | obj_struct['difficult'] = int(obj.find('difficult').text) 24 | bbox = obj.find('bndbox') 25 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 26 | int(bbox.find('ymin').text), 27 | int(bbox.find('xmax').text), 28 | int(bbox.find('ymax').text)] 29 | objects.append(obj_struct) 30 | 31 | return objects 32 | 33 | def voc_ap(rec, prec, use_07_metric=False): 34 | """ ap = voc_ap(rec, prec, [use_07_metric]) 35 | Compute VOC AP given precision and recall. 36 | If use_07_metric is true, uses the 37 | VOC 07 11 point method (default:False). 38 | """ 39 | if use_07_metric: 40 | # 11 point metric 41 | ap = 0. 
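# VOC07 11-point metric: sample precision at recall levels {0.0, 0.1, ..., 1.0},
# using at each level the maximum precision over all recalls >= that level,
# then average the 11 values.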
42 | for t in np.arange(0., 1.1, 0.1): 43 | if np.sum(rec >= t) == 0: 44 | p = 0 45 | else: 46 | p = np.max(prec[rec >= t]) 47 | ap = ap + p / 11. 48 | else: 49 | # correct AP calculation 50 | # first append sentinel values at the end 51 | mrec = np.concatenate(([0.], rec, [1.])) 52 | mpre = np.concatenate(([0.], prec, [0.])) 53 | 54 | # compute the precision envelope 55 | for i in range(mpre.size - 1, 0, -1): 56 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 57 | 58 | # to calculate area under PR curve, look for points 59 | # where X axis (recall) changes value 60 | i = np.where(mrec[1:] != mrec[:-1])[0] 61 | 62 | # and sum (\Delta recall) * prec 63 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 64 | return ap 65 | 66 | def voc_eval(detpath, 67 | annopath, 68 | imagesetfile, 69 | classname, 70 | cachedir, 71 | ovthresh=0.5, 72 | use_07_metric=False): 73 | """rec, prec, ap = voc_eval(detpath, 74 | annopath, 75 | imagesetfile, 76 | classname, 77 | [ovthresh], 78 | [use_07_metric]) 79 | 80 | Top level function that does the PASCAL VOC evaluation. 81 | 82 | detpath: Path to detections 83 | detpath.format(classname) should produce the detection results file. 84 | annopath: Path to annotations 85 | annopath.format(imagename) should be the xml annotations file. 86 | imagesetfile: Text file containing the list of images, one image per line. 87 | classname: Category name (duh) 88 | cachedir: Directory for caching the annotations 89 | [ovthresh]: Overlap threshold (default = 0.5) 90 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 91 | (default False) 92 | """ 93 | # assumes detections are in detpath.format(classname) 94 | # assumes annotations are in annopath.format(imagename) 95 | # assumes imagesetfile is a text file with each line an image name 96 | # cachedir caches the annotations in a pickle file 97 | 98 | # first load gt 99 | if not os.path.isdir(cachedir): 100 | os.mkdir(cachedir) 101 | cachefile = os.path.join(cachedir, 'annots.pkl') 102 | # read list of images 103 | with open(imagesetfile, 'r') as f: 104 | lines = f.readlines() 105 | imagenames = [x.strip() for x in lines] 106 | 107 | if not os.path.isfile(cachefile): 108 | # load annots 109 | recs = {} 110 | for i, imagename in enumerate(imagenames): 111 | recs[imagename] = parse_rec(annopath.format(imagename)) 112 | if i % 100 == 0: 113 | print 'Reading annotation for {:d}/{:d}'.format( 114 | i + 1, len(imagenames)) 115 | # save 116 | print 'Saving cached annotations to {:s}'.format(cachefile) 117 | with open(cachefile, 'w') as f: 118 | cPickle.dump(recs, f) 119 | else: 120 | # load 121 | with open(cachefile, 'r') as f: 122 | recs = cPickle.load(f) 123 | 124 | # extract gt objects for this class 125 | class_recs = {} 126 | npos = 0 127 | for imagename in imagenames: 128 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 129 | bbox = np.array([x['bbox'] for x in R]) 130 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 131 | det = [False] * len(R) 132 | npos = npos + sum(~difficult) 133 | class_recs[imagename] = {'bbox': bbox, 134 | 'difficult': difficult, 135 | 'det': det} 136 | 137 | # read dets 138 | detfile = detpath.format(classname) 139 | with open(detfile, 'r') as f: 140 | lines = f.readlines() 141 | 142 | splitlines = [x.strip().split(' ') for x in lines] 143 | image_ids = [x[0] for x in splitlines] 144 | confidence = np.array([float(x[1]) for x in splitlines]) 145 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 146 | 147 | if confidence.shape[0] == 0: 
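# No detections at all for this class: return zero recall, precision, and AP.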
148 | return 0, 0, 0 149 | 150 | # sort by confidence 151 | sorted_ind = np.argsort(-confidence) 152 | sorted_scores = np.sort(-confidence) 153 | BB = BB[sorted_ind, :] 154 | image_ids = [image_ids[x] for x in sorted_ind] 155 | 156 | 157 | # go down dets and mark TPs and FPs 158 | nd = len(image_ids) 159 | tp = np.zeros(nd) 160 | fp = np.zeros(nd) 161 | for d in range(nd): 162 | R = class_recs[image_ids[d]] 163 | bb = BB[d, :].astype(float) 164 | ovmax = -np.inf 165 | BBGT = R['bbox'].astype(float) 166 | 167 | if BBGT.size > 0: 168 | # compute overlaps 169 | # intersection 170 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 171 | iymin = np.maximum(BBGT[:, 1], bb[1]) 172 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 173 | iymax = np.minimum(BBGT[:, 3], bb[3]) 174 | iw = np.maximum(ixmax - ixmin + 1., 0.) 175 | ih = np.maximum(iymax - iymin + 1., 0.) 176 | inters = iw * ih 177 | 178 | # union 179 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 180 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 181 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 182 | 183 | overlaps = inters / uni 184 | ovmax = np.max(overlaps) 185 | jmax = np.argmax(overlaps) 186 | 187 | if ovmax > ovthresh: 188 | if not R['difficult'][jmax]: 189 | if not R['det'][jmax]: 190 | tp[d] = 1. 191 | R['det'][jmax] = 1 192 | else: 193 | fp[d] = 1. 194 | else: 195 | fp[d] = 1. 196 | 197 | # compute precision recall 198 | fp = np.cumsum(fp) 199 | tp = np.cumsum(tp) 200 | rec = tp / float(npos) 201 | # avoid divide by zero in case the first detection matches a difficult 202 | # ground truth 203 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 204 | ap = voc_ap(rec, prec, use_07_metric) 205 | 206 | return rec, prec, ap 207 | -------------------------------------------------------------------------------- /src/demo.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """SqueezeDet Demo. 4 | 5 | In image detection mode, for a given image, detect objects and draw bounding 6 | boxes around them. In video detection mode, perform real-time detection on the 7 | video stream. 8 | """ 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import cv2 15 | import time 16 | import sys 17 | import os 18 | import glob 19 | 20 | import numpy as np 21 | import tensorflow as tf 22 | 23 | from config import * 24 | from train import _draw_box 25 | from nets import * 26 | 27 | FLAGS = tf.app.flags.FLAGS 28 | 29 | tf.app.flags.DEFINE_string( 30 | 'mode', 'image', """'image' or 'video'.""") 31 | tf.app.flags.DEFINE_string( 32 | 'checkpoint', './data/model_checkpoints/squeezeDet/model.ckpt-87000', 33 | """Path to the model parameter file.""") 34 | tf.app.flags.DEFINE_string( 35 | 'input_path', './data/sample.png', 36 | """Input image or video to be detected. 
Can process glob input such as """
37 |     """./data/00000*.png.""")
38 | tf.app.flags.DEFINE_string(
39 |     'out_dir', './data/out/', """Directory to dump output image or video.""")
40 | tf.app.flags.DEFINE_string(
41 |     'demo_net', 'squeezeDet', """Neural net architecture.""")
42 | 
43 | 
44 | def video_demo():
45 |   """Detect videos."""
46 | 
47 |   cap = cv2.VideoCapture(FLAGS.input_path)
48 | 
49 |   # Define the codec and create VideoWriter object
50 |   # fourcc = cv2.cv.CV_FOURCC(*'XVID')
51 |   # fourcc = cv2.cv.CV_FOURCC(*'MJPG')
52 |   # in_file_name = os.path.split(FLAGS.input_path)[1]
53 |   # out_file_name = os.path.join(FLAGS.out_dir, 'out_'+in_file_name)
54 |   # out = cv2.VideoWriter(out_file_name, fourcc, 30.0, (375,1242), True)
55 |   # out = VideoWriter(out_file_name, frameSize=(1242, 375))
56 |   # out.open()
57 | 
58 |   assert FLAGS.demo_net == 'squeezeDet' or FLAGS.demo_net == 'squeezeDet+', \
59 |       'Selected neural net architecture not supported: {}'.format(FLAGS.demo_net)
60 | 
61 |   with tf.Graph().as_default():
62 |     # Load model
63 |     if FLAGS.demo_net == 'squeezeDet':
64 |       mc = kitti_squeezeDet_config()
65 |       mc.BATCH_SIZE = 1
66 |       # model parameters will be restored from checkpoint
67 |       mc.LOAD_PRETRAINED_MODEL = False
68 |       model = SqueezeDet(mc, FLAGS.gpu)
69 |     elif FLAGS.demo_net == 'squeezeDet+':
70 |       mc = kitti_squeezeDetPlus_config()
71 |       mc.BATCH_SIZE = 1
72 |       mc.LOAD_PRETRAINED_MODEL = False
73 |       model = SqueezeDetPlus(mc, FLAGS.gpu)
74 | 
75 |     saver = tf.train.Saver(model.model_params)
76 | 
77 |     with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
78 |       saver.restore(sess, FLAGS.checkpoint)
79 | 
80 |       times = {}
81 |       count = 0
82 |       while cap.isOpened():
83 |         t_start = time.time()
84 |         count += 1
85 |         out_im_name = os.path.join(FLAGS.out_dir, str(count).zfill(6)+'.jpg')
86 | 
87 |         # Load images from video and crop
88 |         ret, frame = cap.read()
89 |         if ret:
90 |           # crop frames
91 |           frame = frame[500:-205, 239:-439, :]
92 |           im_input = frame.astype(np.float32) - mc.BGR_MEANS
93 |         else:
94 |           break
95 | 
96 |         t_reshape = time.time()
97 |         times['reshape']= t_reshape - t_start
98 | 
99 |         # Detect
100 |         det_boxes, det_probs, det_class = sess.run(
101 |             [model.det_boxes, model.det_probs, model.det_class],
102 |             feed_dict={model.image_input:[im_input]})
103 | 
104 |         t_detect = time.time()
105 |         times['detect']= t_detect - t_reshape
106 | 
107 |         # Filter
108 |         final_boxes, final_probs, final_class = model.filter_prediction(
109 |             det_boxes[0], det_probs[0], det_class[0])
110 | 
111 |         keep_idx = [idx for idx in range(len(final_probs)) \
112 |                     if final_probs[idx] > mc.PLOT_PROB_THRESH]
113 |         final_boxes = [final_boxes[idx] for idx in keep_idx]
114 |         final_probs = [final_probs[idx] for idx in keep_idx]
115 |         final_class = [final_class[idx] for idx in keep_idx]
116 | 
117 |         t_filter = time.time()
118 |         times['filter']= t_filter - t_detect
119 | 
120 |         # Draw boxes
121 | 
122 |         # TODO(bichen): move this color dict to configuration file
123 |         cls2clr = {
124 |             'car': (255, 191, 0),
125 |             'cyclist': (0, 191, 255),
126 |             'pedestrian':(255, 0, 191)
127 |         }
128 |         _draw_box(
129 |             frame, final_boxes,
130 |             [mc.CLASS_NAMES[idx]+': (%.2f)'% prob \
131 |                 for idx, prob in zip(final_class, final_probs)],
132 |             cdict=cls2clr
133 |         )
134 | 
135 |         t_draw = time.time()
136 |         times['draw']= t_draw - t_filter
137 | 
138 |         cv2.imwrite(out_im_name, frame)
139 |         # out.write(frame)
140 | 
141 |         times['total']= time.time() - t_start
142 | 
143 |         # time_str = ''
144 |         # for t in times:
145 |         #   time_str += '{} time: {:.4f} '.format(t[0], t[1])
146 |         # time_str += '\n'
147 |         time_str = 'Total time: {:.4f}, detection time: {:.4f}, filter time: '\
148 |                    '{:.4f}'. \
149 |             format(times['total'], times['detect'], times['filter'])
150 | 
151 |         print (time_str)
152 | 
153 |         if cv2.waitKey(1) & 0xFF == ord('q'):
154 |           break
155 |     # Release everything if job is finished
156 |     cap.release()
157 |     # out.release()
158 |     cv2.destroyAllWindows()
159 | 
160 | 
161 | def image_demo():
162 |   """Detect image."""
163 | 
164 |   assert FLAGS.demo_net == 'squeezeDet' or FLAGS.demo_net == 'squeezeDet+', \
165 |       'Selected neural net architecture not supported: {}'.format(FLAGS.demo_net)
166 | 
167 |   with tf.Graph().as_default():
168 |     # Load model
169 |     if FLAGS.demo_net == 'squeezeDet':
170 |       mc = kitti_squeezeDet_config()
171 |       mc.BATCH_SIZE = 1
172 |       # model parameters will be restored from checkpoint
173 |       mc.LOAD_PRETRAINED_MODEL = False
174 |       model = SqueezeDet(mc, FLAGS.gpu)
175 |     elif FLAGS.demo_net == 'squeezeDet+':
176 |       mc = kitti_squeezeDetPlus_config()
177 |       mc.BATCH_SIZE = 1
178 |       mc.LOAD_PRETRAINED_MODEL = False
179 |       model = SqueezeDetPlus(mc, FLAGS.gpu)
180 | 
181 |     saver = tf.train.Saver(model.model_params)
182 | 
183 |     with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
184 |       saver.restore(sess, FLAGS.checkpoint)
185 | 
186 |       for f in glob.iglob(FLAGS.input_path):
187 |         im = cv2.imread(f)
188 |         im = im.astype(np.float32, copy=False)
189 |         im = cv2.resize(im, (mc.IMAGE_WIDTH, mc.IMAGE_HEIGHT))
190 |         input_image = im - mc.BGR_MEANS
191 | 
192 |         # Detect
193 |         det_boxes, det_probs, det_class = sess.run(
194 |             [model.det_boxes, model.det_probs, model.det_class],
195 |             feed_dict={model.image_input:[input_image]})
196 | 
197 |         # Filter
198 |         final_boxes, final_probs, final_class = model.filter_prediction(
199 |             det_boxes[0], det_probs[0], det_class[0])
200 | 
201 |         keep_idx = [idx for idx in range(len(final_probs)) \
202 |                     if final_probs[idx] > mc.PLOT_PROB_THRESH]
203 |         final_boxes = [final_boxes[idx] for idx in keep_idx]
204 |         final_probs = [final_probs[idx] for idx in keep_idx]
205 |         final_class = [final_class[idx] for idx in keep_idx]
206 | 
207 |         # TODO(bichen): move this color dict to configuration file
208 |         cls2clr = {
209 |             'car': (255, 191, 0),
210 |             'cyclist': (0, 191, 255),
211 |             'pedestrian':(255, 0, 191)
212 |         }
213 | 
214 |         # Draw boxes
215 |         _draw_box(
216 |             im, final_boxes,
217 |             [mc.CLASS_NAMES[idx]+': (%.2f)'% prob \
218 |                 for idx, prob in zip(final_class, final_probs)],
219 |             cdict=cls2clr,
220 |         )
221 | 
222 |         file_name = os.path.split(f)[1]
223 |         out_file_name = os.path.join(FLAGS.out_dir, 'out_'+file_name)
224 |         cv2.imwrite(out_file_name, im)
225 |         print ('Image detection output saved to {}'.format(out_file_name))
226 | 
227 | 
228 | def main(argv=None):
229 |   if not tf.gfile.Exists(FLAGS.out_dir):
230 |     tf.gfile.MakeDirs(FLAGS.out_dir)
231 |   if FLAGS.mode == 'image':
232 |     image_demo()
233 |   else:
234 |     video_demo()
235 | 
236 | if __name__ == '__main__':
237 |   tf.app.run()
238 | 
--------------------------------------------------------------------------------
/src/eval.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Evaluation"""
4 | 
5 | from __future__ import absolute_import
6 | from __future__ import division
7 | from __future__ import print_function
8 | 
9 | import cv2
10 | from datetime import datetime
11 | import os.path
12 | import sys
13 | import time
14 | 
15 | import numpy as
np 16 | from six.moves import xrange 17 | import tensorflow as tf 18 | 19 | from config import * 20 | from dataset import pascal_voc, kitti 21 | from utils.util import bbox_transform, Timer 22 | from nets import * 23 | 24 | FLAGS = tf.app.flags.FLAGS 25 | 26 | tf.app.flags.DEFINE_string('dataset', 'KITTI', 27 | """Currently support PASCAL_VOC or KITTI dataset.""") 28 | tf.app.flags.DEFINE_string('data_path', '', """Root directory of data""") 29 | tf.app.flags.DEFINE_string('image_set', 'test', 30 | """Only used for VOC data.""" 31 | """Can be train, trainval, val, or test""") 32 | tf.app.flags.DEFINE_string('year', '2007', 33 | """VOC challenge year. 2007 or 2012""" 34 | """Only used for VOC data""") 35 | tf.app.flags.DEFINE_string('eval_dir', '/tmp/bichen/logs/squeezeDet/eval', 36 | """Directory where to write event logs """) 37 | tf.app.flags.DEFINE_string('checkpoint_path', '/tmp/bichen/logs/squeezeDet/train', 38 | """Path to the training checkpoint.""") 39 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 40 | """How often to check if new cpt is saved.""") 41 | tf.app.flags.DEFINE_boolean('run_once', False, 42 | """Whether to run eval only once.""") 43 | tf.app.flags.DEFINE_string('net', 'squeezeDet', 44 | """Neural net architecture.""") 45 | tf.app.flags.DEFINE_string('gpu', '0', """gpu id.""") 46 | 47 | 48 | def eval_once( 49 | saver, ckpt_path, summary_writer, eval_summary_ops, eval_summary_phs, imdb, 50 | model): 51 | 52 | with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 53 | 54 | # Restores from checkpoint 55 | saver.restore(sess, ckpt_path) 56 | # Assuming model_checkpoint_path looks something like: 57 | # /ckpt_dir/model.ckpt-0, 58 | # extract global_step from it. 59 | global_step = ckpt_path.split('/')[-1].split('-')[-1] 60 | 61 | num_images = len(imdb.image_idx) 62 | 63 | all_boxes = [[[] for _ in xrange(num_images)] 64 | for _ in xrange(imdb.num_classes)] 65 | 66 | _t = {'im_detect': Timer(), 'im_read': Timer(), 'misc': Timer()} 67 | 68 | num_detection = 0.0 69 | for i in xrange(num_images): 70 | _t['im_read'].tic() 71 | images, scales = imdb.read_image_batch(shuffle=False) 72 | _t['im_read'].toc() 73 | 74 | _t['im_detect'].tic() 75 | det_boxes, det_probs, det_class = sess.run( 76 | [model.det_boxes, model.det_probs, model.det_class], 77 | feed_dict={model.image_input:images}) 78 | _t['im_detect'].toc() 79 | 80 | _t['misc'].tic() 81 | for j in range(len(det_boxes)): # batch 82 | # rescale 83 | det_boxes[j, :, 0::2] /= scales[j][0] 84 | det_boxes[j, :, 1::2] /= scales[j][1] 85 | 86 | det_bbox, score, det_class = model.filter_prediction( 87 | det_boxes[j], det_probs[j], det_class[j]) 88 | 89 | num_detection += len(det_bbox) 90 | for c, b, s in zip(det_class, det_bbox, score): 91 | all_boxes[c][i].append(bbox_transform(b) + [s]) 92 | _t['misc'].toc() 93 | 94 | print ('im_detect: {:d}/{:d} im_read: {:.3f}s ' 95 | 'detect: {:.3f}s misc: {:.3f}s'.format( 96 | i+1, num_images, _t['im_read'].average_time, 97 | _t['im_detect'].average_time, _t['misc'].average_time)) 98 | 99 | print ('Evaluating detections...') 100 | aps, ap_names = imdb.evaluate_detections( 101 | FLAGS.eval_dir, global_step, all_boxes) 102 | 103 | print ('Evaluation summary:') 104 | print (' Average number of detections per image: {}:'.format( 105 | num_detection/num_images)) 106 | print (' Timing:') 107 | print (' im_read: {:.3f}s detect: {:.3f}s misc: {:.3f}s'.format( 108 | _t['im_read'].average_time, _t['im_detect'].average_time, 109 | _t['misc'].average_time)) 110 | print (' 
Average precisions:') 111 | 112 | feed_dict = {} 113 | for cls, ap in zip(ap_names, aps): 114 | feed_dict[eval_summary_phs['APs/'+cls]] = ap 115 | print (' {}: {:.3f}'.format(cls, ap)) 116 | 117 | print (' Mean average precision: {:.3f}'.format(np.mean(aps))) 118 | feed_dict[eval_summary_phs['APs/mAP']] = np.mean(aps) 119 | feed_dict[eval_summary_phs['timing/im_detect']] = \ 120 | _t['im_detect'].average_time 121 | feed_dict[eval_summary_phs['timing/im_read']] = \ 122 | _t['im_read'].average_time 123 | feed_dict[eval_summary_phs['timing/post_proc']] = \ 124 | _t['misc'].average_time 125 | feed_dict[eval_summary_phs['num_det_per_image']] = \ 126 | num_detection/num_images 127 | 128 | print ('Analyzing detections...') 129 | stats, ims = imdb.do_detection_analysis_in_eval( 130 | FLAGS.eval_dir, global_step) 131 | 132 | eval_summary_str = sess.run(eval_summary_ops, feed_dict=feed_dict) 133 | for sum_str in eval_summary_str: 134 | summary_writer.add_summary(sum_str, global_step) 135 | 136 | def evaluate(): 137 | """Evaluate.""" 138 | assert FLAGS.dataset == 'KITTI', \ 139 | 'Currently only supports KITTI dataset' 140 | 141 | os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu 142 | 143 | with tf.Graph().as_default() as g: 144 | 145 | assert FLAGS.net == 'vgg16' or FLAGS.net == 'resnet50' \ 146 | or FLAGS.net == 'squeezeDet' or FLAGS.net == 'squeezeDet+', \ 147 | 'Selected neural net architecture not supported: {}'.format(FLAGS.net) 148 | if FLAGS.net == 'vgg16': 149 | mc = kitti_vgg16_config() 150 | mc.BATCH_SIZE = 1 # TODO(bichen): allow batch size > 1 151 | mc.LOAD_PRETRAINED_MODEL = False 152 | model = VGG16ConvDet(mc) 153 | elif FLAGS.net == 'resnet50': 154 | mc = kitti_res50_config() 155 | mc.BATCH_SIZE = 1 # TODO(bichen): allow batch size > 1 156 | mc.LOAD_PRETRAINED_MODEL = False 157 | model = ResNet50ConvDet(mc) 158 | elif FLAGS.net == 'squeezeDet': 159 | mc = kitti_squeezeDet_config() 160 | mc.BATCH_SIZE = 1 # TODO(bichen): allow batch size > 1 161 | mc.LOAD_PRETRAINED_MODEL = False 162 | model = SqueezeDet(mc) 163 | elif FLAGS.net == 'squeezeDet+': 164 | mc = kitti_squeezeDetPlus_config() 165 | mc.BATCH_SIZE = 1 # TODO(bichen): allow batch size > 1 166 | mc.LOAD_PRETRAINED_MODEL = False 167 | model = SqueezeDetPlus(mc) 168 | 169 | imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc) 170 | 171 | # add summary ops and placeholders 172 | ap_names = [] 173 | for cls in imdb.classes: 174 | ap_names.append(cls+'_easy') 175 | ap_names.append(cls+'_medium') 176 | ap_names.append(cls+'_hard') 177 | 178 | eval_summary_ops = [] 179 | eval_summary_phs = {} 180 | for ap_name in ap_names: 181 | ph = tf.placeholder(tf.float32) 182 | eval_summary_phs['APs/'+ap_name] = ph 183 | eval_summary_ops.append(tf.summary.scalar('APs/'+ap_name, ph)) 184 | 185 | ph = tf.placeholder(tf.float32) 186 | eval_summary_phs['APs/mAP'] = ph 187 | eval_summary_ops.append(tf.summary.scalar('APs/mAP', ph)) 188 | 189 | ph = tf.placeholder(tf.float32) 190 | eval_summary_phs['timing/im_detect'] = ph 191 | eval_summary_ops.append(tf.summary.scalar('timing/im_detect', ph)) 192 | 193 | ph = tf.placeholder(tf.float32) 194 | eval_summary_phs['timing/im_read'] = ph 195 | eval_summary_ops.append(tf.summary.scalar('timing/im_read', ph)) 196 | 197 | ph = tf.placeholder(tf.float32) 198 | eval_summary_phs['timing/post_proc'] = ph 199 | eval_summary_ops.append(tf.summary.scalar('timing/post_proc', ph)) 200 | 201 | ph = tf.placeholder(tf.float32) 202 | eval_summary_phs['num_det_per_image'] = ph 203 | 
eval_summary_ops.append(tf.summary.scalar('num_det_per_image', ph))
204 | 
205 |     saver = tf.train.Saver(model.model_params)
206 | 
207 |     summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)
208 | 
209 |     ckpts = set()
210 |     while True:
211 |       if FLAGS.run_once:
212 |         # When run_once is true, checkpoint_path should point to the exact
213 |         # checkpoint file.
214 |         eval_once(
215 |             saver, FLAGS.checkpoint_path, summary_writer, eval_summary_ops,
216 |             eval_summary_phs, imdb, model)
217 |         return
218 |       else:
219 |         # When run_once is false, checkpoint_path should point to the directory
220 |         # that stores checkpoint files.
221 |         ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
222 |         if ckpt and ckpt.model_checkpoint_path:
223 |           if ckpt.model_checkpoint_path in ckpts:
224 |             # Do not evaluate the same checkpoint twice
225 |             print ('Wait {:d}s for new checkpoints to be saved ... '
226 |                   .format(FLAGS.eval_interval_secs))
227 |             time.sleep(FLAGS.eval_interval_secs)
228 |           else:
229 |             ckpts.add(ckpt.model_checkpoint_path)
230 |             print ('Evaluating {}...'.format(ckpt.model_checkpoint_path))
231 |             eval_once(
232 |                 saver, ckpt.model_checkpoint_path, summary_writer,
233 |                 eval_summary_ops, eval_summary_phs, imdb, model)
234 |         else:
235 |           print('No checkpoint file found')
236 |           if not FLAGS.run_once:
237 |             print ('Wait {:d}s for new checkpoints to be saved ... '
238 |                   .format(FLAGS.eval_interval_secs))
239 |             time.sleep(FLAGS.eval_interval_secs)
240 | 
241 | 
242 | def main(argv=None):  # pylint: disable=unused-argument
243 |   if tf.gfile.Exists(FLAGS.eval_dir):
244 |     tf.gfile.DeleteRecursively(FLAGS.eval_dir)
245 |   tf.gfile.MakeDirs(FLAGS.eval_dir)
246 |   evaluate()
247 | 
248 | 
249 | if __name__ == '__main__':
250 |   tf.app.run()
251 | 
--------------------------------------------------------------------------------
/src/dataset/imdb.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """The database wrapper class."""
4 | 
5 | import os
6 | import random
7 | import shutil
8 | 
9 | from PIL import Image, ImageFont, ImageDraw
10 | import cv2
11 | import numpy as np
12 | from utils.util import iou, batch_iou
13 | 
14 | class imdb(object):
15 |   """Image database."""
16 | 
17 |   def __init__(self, name, mc):
18 |     self._name = name
19 |     self._classes = []
20 |     self._image_set = []
21 |     self._image_idx = []
22 |     self._data_root_path = []
23 |     self._rois = {}
24 |     self.mc = mc
25 | 
26 |     # batch reader
27 |     self._perm_idx = None
28 |     self._cur_idx = 0
29 | 
30 |   @property
31 |   def name(self):
32 |     return self._name
33 | 
34 |   @property
35 |   def classes(self):
36 |     return self._classes
37 | 
38 |   @property
39 |   def num_classes(self):
40 |     return len(self._classes)
41 | 
42 |   @property
43 |   def image_idx(self):
44 |     return self._image_idx
45 | 
46 |   @property
47 |   def image_set(self):
48 |     return self._image_set
49 | 
50 |   @property
51 |   def data_root_path(self):
52 |     return self._data_root_path
53 | 
54 |   @property
55 |   def year(self):
56 |     return self._year
57 | 
58 |   def _shuffle_image_idx(self):
59 |     self._perm_idx = [self._image_idx[i] for i in
60 |         np.random.permutation(np.arange(len(self._image_idx)))]
61 |     self._cur_idx = 0
62 | 
63 |   def read_image_batch(self, shuffle=True):
64 |     """Only read a batch of images, without annotations.
65 |     Args:
66 |       shuffle: whether or not to shuffle the dataset
67 |     Returns:
68 |       images: length batch_size list of arrays [height, width, 3]
69 |     """
70 |     mc = self.mc
71 |     if shuffle:
72 |       if self._cur_idx + mc.BATCH_SIZE >= len(self._image_idx):
73 |         self._shuffle_image_idx()
74 |       batch_idx = self._perm_idx[self._cur_idx:self._cur_idx+mc.BATCH_SIZE]
75 |       self._cur_idx += mc.BATCH_SIZE
76 |     else:
77 |       if self._cur_idx + mc.BATCH_SIZE >= len(self._image_idx):
78 |         batch_idx = self._image_idx[self._cur_idx:] \
79 |             + self._image_idx[:self._cur_idx + mc.BATCH_SIZE-len(self._image_idx)]
80 |         self._cur_idx += mc.BATCH_SIZE - len(self._image_idx)
81 |       else:
82 |         batch_idx = self._image_idx[self._cur_idx:self._cur_idx+mc.BATCH_SIZE]
83 |         self._cur_idx += mc.BATCH_SIZE
84 | 
85 |     images, scales = [], []
86 |     for i in batch_idx:
87 |       im = cv2.imread(self._image_path_at(i))
88 |       im = im.astype(np.float32, copy=False)
89 |       im -= mc.BGR_MEANS
90 |       orig_h, orig_w, _ = [float(v) for v in im.shape]
91 |       im = cv2.resize(im, (mc.IMAGE_WIDTH, mc.IMAGE_HEIGHT))
92 |       x_scale = mc.IMAGE_WIDTH/orig_w
93 |       y_scale = mc.IMAGE_HEIGHT/orig_h
94 |       images.append(im)
95 |       scales.append((x_scale, y_scale))
96 | 
97 |     return images, scales
98 | 
99 |   def read_batch(self, shuffle=True):
100 |     """Read a batch of image and bounding box annotations.
101 |     Args:
102 |       shuffle: whether or not to shuffle the dataset
103 |     Returns:
104 |       image_per_batch: images. Shape: batch_size x height x width x [b, g, r]
105 |       label_per_batch: labels. Shape: batch_size x object_num
106 |       delta_per_batch: bounding box deltas. Shape: batch_size x object_num x
107 |           [dx, dy, dw, dh]
108 |       aidx_per_batch: index of anchors that are responsible for prediction.
109 |           Shape: batch_size x object_num
110 |       bbox_per_batch: scaled bounding boxes. Shape: batch_size x object_num x
111 |           [cx, cy, w, h]
112 |     """
113 |     mc = self.mc
114 | 
115 |     if shuffle:
116 |       if self._cur_idx + mc.BATCH_SIZE >= len(self._image_idx):
117 |         self._shuffle_image_idx()
118 |       batch_idx = self._perm_idx[self._cur_idx:self._cur_idx+mc.BATCH_SIZE]
119 |       self._cur_idx += mc.BATCH_SIZE
120 |     else:
121 |       if self._cur_idx + mc.BATCH_SIZE >= len(self._image_idx):
122 |         batch_idx = self._image_idx[self._cur_idx:] \
123 |             + self._image_idx[:self._cur_idx + mc.BATCH_SIZE-len(self._image_idx)]
124 |         self._cur_idx += mc.BATCH_SIZE - len(self._image_idx)
125 |       else:
126 |         batch_idx = self._image_idx[self._cur_idx:self._cur_idx+mc.BATCH_SIZE]
127 |         self._cur_idx += mc.BATCH_SIZE
128 | 
129 |     image_per_batch = []
130 |     label_per_batch = []
131 |     bbox_per_batch = []
132 |     delta_per_batch = []
133 |     aidx_per_batch = []
134 |     if mc.DEBUG_MODE:
135 |       avg_ious = 0.
136 |       num_objects = 0.
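# Track IoU statistics between ground-truth boxes and their matched anchors
# (collected only in DEBUG_MODE; printed at the end of the batch).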
137 |       max_iou = 0.0
138 |       min_iou = 1.0
139 |       num_zero_iou_obj = 0
140 | 
141 |     for idx in batch_idx:
142 |       # load the image
143 |       im = cv2.imread(self._image_path_at(idx)).astype(np.float32, copy=False)
144 |       im -= mc.BGR_MEANS
145 |       orig_h, orig_w, _ = [float(v) for v in im.shape]
146 | 
147 |       # load annotations
148 |       label_per_batch.append([b[4] for b in self._rois[idx][:]])
149 |       gt_bbox = np.array([[b[0], b[1], b[2], b[3]] for b in self._rois[idx][:]])
150 | 
151 |       if mc.DATA_AUGMENTATION:
152 |         assert mc.DRIFT_X >= 0 and mc.DRIFT_Y >= 0, \
153 |             'mc.DRIFT_X and mc.DRIFT_Y must be >= 0'
154 | 
155 |         if mc.DRIFT_X > 0 or mc.DRIFT_Y > 0:
156 |           # Ensure that the ground truth bounding box is not cut out of the image
157 |           max_drift_x = min(gt_bbox[:, 0] - gt_bbox[:, 2]/2.0+1)
158 |           max_drift_y = min(gt_bbox[:, 1] - gt_bbox[:, 3]/2.0+1)
159 |           assert max_drift_x >= 0 and max_drift_y >= 0, 'bbox out of image'
160 | 
161 |           dy = np.random.randint(-mc.DRIFT_Y, min(mc.DRIFT_Y+1, max_drift_y))
162 |           dx = np.random.randint(-mc.DRIFT_X, min(mc.DRIFT_X+1, max_drift_x))
163 | 
164 |           # shift bbox
165 |           gt_bbox[:, 0] = gt_bbox[:, 0] - dx
166 |           gt_bbox[:, 1] = gt_bbox[:, 1] - dy
167 | 
168 |           # distort image
169 |           orig_h -= dy
170 |           orig_w -= dx
171 |           orig_x, dist_x = max(dx, 0), max(-dx, 0)
172 |           orig_y, dist_y = max(dy, 0), max(-dy, 0)
173 | 
174 |           distorted_im = np.zeros(
175 |               (int(orig_h), int(orig_w), 3)).astype(np.float32)
176 |           distorted_im[dist_y:, dist_x:, :] = im[orig_y:, orig_x:, :]
177 |           im = distorted_im
178 | 
179 |         # Flip image with 50% probability
180 |         if np.random.randint(2) > 0.5:
181 |           im = im[:, ::-1, :]
182 |           gt_bbox[:, 0] = orig_w - 1 - gt_bbox[:, 0]
183 | 
184 |       # scale image
185 |       im = cv2.resize(im, (mc.IMAGE_WIDTH, mc.IMAGE_HEIGHT))
186 |       image_per_batch.append(im)
187 | 
188 |       # scale annotation
189 |       x_scale = mc.IMAGE_WIDTH/orig_w
190 |       y_scale = mc.IMAGE_HEIGHT/orig_h
191 |       gt_bbox[:, 0::2] = gt_bbox[:, 0::2]*x_scale
192 |       gt_bbox[:, 1::2] = gt_bbox[:, 1::2]*y_scale
193 |       bbox_per_batch.append(gt_bbox)
194 | 
195 |       aidx_per_image, delta_per_image = [], []
196 |       aidx_set = set()
197 |       for i in range(len(gt_bbox)):
198 |         overlaps = batch_iou(mc.ANCHOR_BOX, gt_bbox[i])
199 | 
200 |         aidx = len(mc.ANCHOR_BOX)
201 |         for ov_idx in np.argsort(overlaps)[::-1]:
202 |           if overlaps[ov_idx] <= 0:
203 |             if mc.DEBUG_MODE:
204 |               min_iou = min(overlaps[ov_idx], min_iou)
205 |               num_objects += 1
206 |               num_zero_iou_obj += 1
207 |             break
208 |           if ov_idx not in aidx_set:
209 |             aidx_set.add(ov_idx)
210 |             aidx = ov_idx
211 |             if mc.DEBUG_MODE:
212 |               max_iou = max(overlaps[ov_idx], max_iou)
213 |               min_iou = min(overlaps[ov_idx], min_iou)
214 |               avg_ious += overlaps[ov_idx]
215 |               num_objects += 1
216 |             break
217 | 
218 |         if aidx == len(mc.ANCHOR_BOX):
219 |           # even the largest available overlap is 0; thus, choose the anchor
220 |           # with the smallest squared distance instead
221 |           dist = np.sum(np.square(gt_bbox[i] - mc.ANCHOR_BOX), axis=1)
222 |           for dist_idx in np.argsort(dist):
223 |             if dist_idx not in aidx_set:
224 |               aidx_set.add(dist_idx)
225 |               aidx = dist_idx
226 |               break
227 | 
228 |         box_cx, box_cy, box_w, box_h = gt_bbox[i]
229 |         delta = [0]*4
230 |         delta[0] = (box_cx - mc.ANCHOR_BOX[aidx][0])/mc.ANCHOR_BOX[aidx][2]
231 |         delta[1] = (box_cy - mc.ANCHOR_BOX[aidx][1])/mc.ANCHOR_BOX[aidx][3]
232 |         delta[2] = np.log(box_w/mc.ANCHOR_BOX[aidx][2])
233 |         delta[3] = np.log(box_h/mc.ANCHOR_BOX[aidx][3])
234 | 
235 |         aidx_per_image.append(aidx)
236 |         delta_per_image.append(delta)
237 | 
238 |       delta_per_batch.append(delta_per_image)
239 |
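# Each delta is encoded relative to its matched anchor: (dx, dy) are center
# offsets normalized by the anchor's width and height, and (dw, dh) are
# log-scale size ratios (see the delta computation above).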
aidx_per_batch.append(aidx_per_image) 240 | 241 | if mc.DEBUG_MODE: 242 | print ('max iou: {}'.format(max_iou)) 243 | print ('min iou: {}'.format(min_iou)) 244 | print ('avg iou: {}'.format(avg_ious/num_objects)) 245 | print ('number of objects: {}'.format(num_objects)) 246 | print ('number of objects with 0 iou: {}'.format(num_zero_iou_obj)) 247 | 248 | return image_per_batch, label_per_batch, delta_per_batch, \ 249 | aidx_per_batch, bbox_per_batch 250 | 251 | def evaluate_detections(self): 252 | raise NotImplementedError 253 | 254 | def visualize_detections( 255 | self, image_dir, image_format, det_error_file, output_image_dir, 256 | num_det_per_type=10): 257 | 258 | # load detections 259 | with open(det_error_file) as f: 260 | lines = f.readlines() 261 | random.shuffle(lines) 262 | f.close() 263 | 264 | dets_per_type = {} 265 | for line in lines: 266 | obj = line.strip().split(' ') 267 | error_type = obj[1] 268 | if error_type not in dets_per_type: 269 | dets_per_type[error_type] = [{ 270 | 'im_idx':obj[0], 271 | 'bbox':[float(obj[2]), float(obj[3]), float(obj[4]), float(obj[5])], 272 | 'class':obj[6], 273 | 'score': float(obj[7]) 274 | }] 275 | else: 276 | dets_per_type[error_type].append({ 277 | 'im_idx':obj[0], 278 | 'bbox':[float(obj[2]), float(obj[3]), float(obj[4]), float(obj[5])], 279 | 'class':obj[6], 280 | 'score': float(obj[7]) 281 | }) 282 | 283 | out_ims = [] 284 | # Randomly select some detections and plot them 285 | COLOR = (200, 200, 0) 286 | for error_type, dets in dets_per_type.iteritems(): 287 | det_im_dir = os.path.join(output_image_dir, error_type) 288 | if os.path.exists(det_im_dir): 289 | shutil.rmtree(det_im_dir) 290 | os.makedirs(det_im_dir) 291 | 292 | for i in range(min(num_det_per_type, len(dets))): 293 | det = dets[i] 294 | im = Image.open( 295 | os.path.join(image_dir, det['im_idx']+image_format)) 296 | draw = ImageDraw.Draw(im) 297 | draw.rectangle(det['bbox'], outline=COLOR) 298 | draw.text((det['bbox'][0], det['bbox'][1]), 299 | '{:s} ({:.2f})'.format(det['class'], det['score']), 300 | fill=COLOR) 301 | out_im_path = os.path.join(det_im_dir, str(i)+image_format) 302 | im.save(out_im_path) 303 | im = np.array(im) 304 | out_ims.append(im[:,:,::-1]) # RGB to BGR 305 | return out_ims 306 | 307 | -------------------------------------------------------------------------------- /src/dataset/kitti.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """Image data base class for kitti""" 4 | 5 | import cv2 6 | import os 7 | import numpy as np 8 | import subprocess 9 | 10 | from dataset.imdb import imdb 11 | from utils.util import bbox_transform_inv, batch_iou 12 | 13 | class kitti(imdb): 14 | def __init__(self, image_set, data_path, mc): 15 | imdb.__init__(self, 'kitti_'+image_set, mc) 16 | self._image_set = image_set 17 | self._data_root_path = data_path 18 | self._image_path = os.path.join(self._data_root_path, 'training', 'image_2') 19 | self._label_path = os.path.join(self._data_root_path, 'training', 'label_2') 20 | self._classes = self.mc.CLASS_NAMES 21 | self._class_to_idx = dict(zip(self.classes, xrange(self.num_classes))) 22 | 23 | # a list of string indices of images in the directory 24 | self._image_idx = self._load_image_set_idx() 25 | # a dict of image_idx -> [[cx, cy, w, h, cls_idx]]. 
x,y,w,h are not divided by 26 | # the image width and height 27 | self._rois = self._load_kitti_annotation() 28 | 29 | ## batch reader ## 30 | self._perm_idx = None 31 | self._cur_idx = 0 32 | # TODO(bichen): add a random seed as parameter 33 | self._shuffle_image_idx() 34 | 35 | self._eval_tool = './src/dataset/kitti-eval/cpp/evaluate_object' 36 | 37 | def _load_image_set_idx(self): 38 | image_set_file = os.path.join( 39 | self._data_root_path, 'ImageSets', self._image_set+'.txt') 40 | assert os.path.exists(image_set_file), \ 41 | 'File does not exist: {}'.format(image_set_file) 42 | 43 | with open(image_set_file) as f: 44 | image_idx = [x.strip() for x in f.readlines()] 45 | return image_idx 46 | 47 | def _image_path_at(self, idx): 48 | image_path = os.path.join(self._image_path, idx+'.png') 49 | assert os.path.exists(image_path), \ 50 | 'Image does not exist: {}'.format(image_path) 51 | return image_path 52 | 53 | def _load_kitti_annotation(self): 54 | def _get_obj_level(obj): 55 | height = float(obj[7]) - float(obj[5]) + 1 56 | truncation = float(obj[1]) 57 | occlusion = float(obj[2]) 58 | if height >= 40 and truncation <= 0.15 and occlusion <= 0: 59 | return 1 60 | elif height >= 25 and truncation <= 0.3 and occlusion <= 1: 61 | return 2 62 | elif height >= 25 and truncation <= 0.5 and occlusion <= 2: 63 | return 3 64 | else: 65 | return 4 66 | 67 | idx2annotation = {} 68 | for index in self._image_idx: 69 | filename = os.path.join(self._label_path, index+'.txt') 70 | with open(filename, 'r') as f: 71 | lines = f.readlines() 72 | f.close() 73 | bboxes = [] 74 | for line in lines: 75 | obj = line.strip().split(' ') 76 | try: 77 | cls = self._class_to_idx[obj[0].lower().strip()] 78 | except: 79 | continue 80 | 81 | if self.mc.EXCLUDE_HARD_EXAMPLES and _get_obj_level(obj) > 3: 82 | continue 83 | xmin = float(obj[4]) 84 | ymin = float(obj[5]) 85 | xmax = float(obj[6]) 86 | ymax = float(obj[7]) 87 | assert xmin >= 0.0 and xmin <= xmax, \ 88 | 'Invalid bounding box x-coord xmin {} or xmax {} at {}.txt' \ 89 | .format(xmin, xmax, index) 90 | assert ymin >= 0.0 and ymin <= ymax, \ 91 | 'Invalid bounding box y-coord ymin {} or ymax {} at {}.txt' \ 92 | .format(ymin, ymax, index) 93 | x, y, w, h = bbox_transform_inv([xmin, ymin, xmax, ymax]) 94 | bboxes.append([x, y, w, h, cls]) 95 | 96 | idx2annotation[index] = bboxes 97 | 98 | return idx2annotation 99 | 100 | def evaluate_detections(self, eval_dir, global_step, all_boxes): 101 | """Evaluate detection results. 102 | Args: 103 | eval_dir: directory to write evaluation logs 104 | global_step: step of the checkpoint 105 | all_boxes: all_boxes[cls][image] = N x 5 arrays of 106 | [xmin, ymin, xmax, ymax, score] 107 | Returns: 108 | aps: array of average precisions. 
109 | names: class names corresponding to each ap 110 | """ 111 | det_file_dir = os.path.join( 112 | eval_dir, 'detection_files_{:s}'.format(global_step), 'data') 113 | if not os.path.isdir(det_file_dir): 114 | os.makedirs(det_file_dir) 115 | 116 | for im_idx, index in enumerate(self._image_idx): 117 | filename = os.path.join(det_file_dir, index+'.txt') 118 | with open(filename, 'wt') as f: 119 | for cls_idx, cls in enumerate(self._classes): 120 | dets = all_boxes[cls_idx][im_idx] 121 | for k in xrange(len(dets)): 122 | f.write( 123 | '{:s} -1 -1 0.0 {:.2f} {:.2f} {:.2f} {:.2f} 0.0 0.0 0.0 0.0 0.0 ' 124 | '0.0 0.0 {:.3f}\n'.format( 125 | cls.lower(), dets[k][0], dets[k][1], dets[k][2], dets[k][3], 126 | dets[k][4]) 127 | ) 128 | 129 | cmd = self._eval_tool + ' ' \ 130 | + os.path.join(self._data_root_path, 'training') + ' ' \ 131 | + os.path.join(self._data_root_path, 'ImageSets', 132 | self._image_set+'.txt') + ' ' \ 133 | + os.path.dirname(det_file_dir) + ' ' + str(len(self._image_idx)) 134 | 135 | print('Running: {}'.format(cmd)) 136 | status = subprocess.call(cmd, shell=True) 137 | 138 | aps = [] 139 | names = [] 140 | for cls in self._classes: 141 | det_file_name = os.path.join( 142 | os.path.dirname(det_file_dir), 'stats_{:s}_ap.txt'.format(cls)) 143 | if os.path.exists(det_file_name): 144 | with open(det_file_name, 'r') as f: 145 | lines = f.readlines() 146 | assert len(lines) == 3, \ 147 | 'Line number of {} should be 3'.format(det_file_name) 148 | 149 | aps.append(float(lines[0].split('=')[1].strip())) 150 | aps.append(float(lines[1].split('=')[1].strip())) 151 | aps.append(float(lines[2].split('=')[1].strip())) 152 | else: 153 | aps.extend([0.0, 0.0, 0.0]) 154 | 155 | names.append(cls+'_easy') 156 | names.append(cls+'_medium') 157 | names.append(cls+'_hard') 158 | 159 | return aps, names 160 | 161 | def do_detection_analysis_in_eval(self, eval_dir, global_step): 162 | det_file_dir = os.path.join( 163 | eval_dir, 'detection_files_{:s}'.format(global_step), 'data') 164 | det_error_dir = os.path.join( 165 | eval_dir, 'detection_files_{:s}'.format(global_step), 166 | 'error_analysis') 167 | if not os.path.exists(det_error_dir): 168 | os.makedirs(det_error_dir) 169 | det_error_file = os.path.join(det_error_dir, 'det_error_file.txt') 170 | 171 | stats = self.analyze_detections(det_file_dir, det_error_file) 172 | ims = self.visualize_detections( 173 | image_dir=self._image_path, 174 | image_format='.png', 175 | det_error_file=det_error_file, 176 | output_image_dir=det_error_dir, 177 | num_det_per_type=10 178 | ) 179 | 180 | return stats, ims 181 | 182 | def analyze_detections(self, detection_file_dir, det_error_file): 183 | def _save_detection(f, idx, error_type, det, score): 184 | f.write( 185 | '{:s} {:s} {:.1f} {:.1f} {:.1f} {:.1f} {:s} {:.3f}\n'.format( 186 | idx, error_type, 187 | det[0]-det[2]/2., det[1]-det[3]/2., 188 | det[0]+det[2]/2., det[1]+det[3]/2., 189 | self._classes[int(det[4])], 190 | score 191 | ) 192 | ) 193 | 194 | # load detections 195 | self._det_rois = {} 196 | for idx in self._image_idx: 197 | det_file_name = os.path.join(detection_file_dir, idx+'.txt') 198 | with open(det_file_name) as f: 199 | lines = f.readlines() 200 | f.close() 201 | bboxes = [] 202 | for line in lines: 203 | obj = line.strip().split(' ') 204 | cls = self._class_to_idx[obj[0].lower().strip()] 205 | xmin = float(obj[4]) 206 | ymin = float(obj[5]) 207 | xmax = float(obj[6]) 208 | ymax = float(obj[7]) 209 | score = float(obj[-1]) 210 | 211 | x, y, w, h = bbox_transform_inv([xmin, ymin, xmax, 
ymax]) 212 | bboxes.append([x, y, w, h, cls, score]) 213 | bboxes.sort(key=lambda x: x[-1], reverse=True) 214 | self._det_rois[idx] = bboxes 215 | 216 | # do error analysis 217 | num_objs = 0. 218 | num_dets = 0. 219 | num_correct = 0. 220 | num_loc_error = 0. 221 | num_cls_error = 0. 222 | num_bg_error = 0. 223 | num_repeated_error = 0. 224 | num_detected_obj = 0. 225 | 226 | with open(det_error_file, 'w') as f: 227 | for idx in self._image_idx: 228 | gt_bboxes = np.array(self._rois[idx]) 229 | num_objs += len(gt_bboxes) 230 | detected = [False]*len(gt_bboxes) 231 | 232 | det_bboxes = self._det_rois[idx] 233 | if len(gt_bboxes) < 1: 234 | continue 235 | 236 | for i, det in enumerate(det_bboxes): 237 | if i < len(gt_bboxes): 238 | num_dets += 1 239 | ious = batch_iou(gt_bboxes[:, :4], det[:4]) 240 | max_iou = np.max(ious) 241 | gt_idx = np.argmax(ious) 242 | if max_iou > 0.1: 243 | if gt_bboxes[gt_idx, 4] == det[4]: 244 | if max_iou >= 0.5: 245 | if i < len(gt_bboxes): 246 | if not detected[gt_idx]: 247 | num_correct += 1 248 | detected[gt_idx] = True 249 | else: 250 | num_repeated_error += 1 251 | else: 252 | if i < len(gt_bboxes): 253 | num_loc_error += 1 254 | _save_detection(f, idx, 'loc', det, det[5]) 255 | else: 256 | if i < len(gt_bboxes): 257 | num_cls_error += 1 258 | _save_detection(f, idx, 'cls', det, det[5]) 259 | else: 260 | if i < len(gt_bboxes): 261 | num_bg_error += 1 262 | _save_detection(f, idx, 'bg', det, det[5]) 263 | 264 | for i, gt in enumerate(gt_bboxes): 265 | if not detected[i]: 266 | _save_detection(f, idx, 'missed', gt, -1.0) 267 | num_detected_obj += sum(detected) 268 | f.close() 269 | 270 | print ('Detection Analysis:') 271 | print (' Number of detections: {}'.format(num_dets)) 272 | print (' Number of objects: {}'.format(num_objs)) 273 | print (' Percentage of correct detections: {}'.format( 274 | num_correct/num_dets)) 275 | print (' Percentage of localization error: {}'.format( 276 | num_loc_error/num_dets)) 277 | print (' Percentage of classification error: {}'.format( 278 | num_cls_error/num_dets)) 279 | print (' Percentage of background error: {}'.format( 280 | num_bg_error/num_dets)) 281 | print (' Percentage of repeated detections: {}'.format( 282 | num_repeated_error/num_dets)) 283 | print (' Recall: {}'.format( 284 | num_detected_obj/num_objs)) 285 | 286 | out = {} 287 | out['num of detections'] = num_dets 288 | out['num of objects'] = num_objs 289 | out['% correct detections'] = num_correct/num_dets 290 | out['% localization error'] = num_loc_error/num_dets 291 | out['% classification error'] = num_cls_error/num_dets 292 | out['% background error'] = num_bg_error/num_dets 293 | out['% repeated error'] = num_repeated_error/num_dets 294 | out['% recall'] = num_detected_obj/num_objs 295 | 296 | return out 297 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """Train""" 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import cv2 10 | from datetime import datetime 11 | import os.path 12 | import sys 13 | import time 14 | 15 | import numpy as np 16 | from six.moves import xrange 17 | import tensorflow as tf 18 | import threading 19 | 20 | from config import * 21 | from dataset import pascal_voc, kitti 22 | from utils.util import sparse_to_dense, bgr_to_rgb, bbox_transform 23 | from 
nets import *
24 | 
25 | FLAGS = tf.app.flags.FLAGS
26 | 
27 | tf.app.flags.DEFINE_string('dataset', 'KITTI',
28 |                            """Currently only support KITTI dataset.""")
29 | tf.app.flags.DEFINE_string('data_path', '', """Root directory of data""")
30 | tf.app.flags.DEFINE_string('image_set', 'train',
31 |                            """Can be train, trainval, val, or test""")
32 | tf.app.flags.DEFINE_string('year', '2007',
33 |                            """VOC challenge year. 2007 or 2012"""
34 |                            """Only used for Pascal VOC dataset""")
35 | tf.app.flags.DEFINE_string('train_dir', '/tmp/bichen/logs/squeezeDet/train',
36 |                            """Directory where to write event logs """
37 |                            """and checkpoint.""")
38 | tf.app.flags.DEFINE_integer('max_steps', 1000000,
39 |                             """Maximum number of batches to run.""")
40 | tf.app.flags.DEFINE_string('net', 'squeezeDet',
41 |                            """Neural net architecture. """)
42 | tf.app.flags.DEFINE_string('pretrained_model_path', '',
43 |                            """Path to the pretrained model.""")
44 | tf.app.flags.DEFINE_integer('summary_step', 10,
45 |                             """Number of steps to save summary.""")
46 | tf.app.flags.DEFINE_integer('checkpoint_step', 1000,
47 |                             """Number of steps to save a checkpoint.""")
48 | tf.app.flags.DEFINE_string('gpu', '0', """gpu id.""")
49 | 
50 | 
51 | def _draw_box(im, box_list, label_list, color=(0,255,0), cdict=None, form='center'):
52 |   assert form == 'center' or form == 'diagonal', \
53 |       'bounding box format not accepted: {}.'.format(form)
54 | 
55 |   for bbox, label in zip(box_list, label_list):
56 | 
57 |     if form == 'center':
58 |       bbox = bbox_transform(bbox)
59 | 
60 |     xmin, ymin, xmax, ymax = [int(b) for b in bbox]
61 | 
62 |     l = label.split(':')[0]  # text before "CLASS: (PROB)"
63 |     if cdict and l in cdict:
64 |       c = cdict[l]
65 |     else:
66 |       c = color
67 | 
68 |     # draw box
69 |     cv2.rectangle(im, (xmin, ymin), (xmax, ymax), c, 1)
70 |     # draw label
71 |     font = cv2.FONT_HERSHEY_SIMPLEX
72 |     cv2.putText(im, label, (xmin, ymax), font, 0.3, c, 1)
73 | 
74 | def _viz_prediction_result(model, images, bboxes, labels, batch_det_bbox,
75 |                            batch_det_class, batch_det_prob):
76 |   mc = model.mc
77 | 
78 |   for i in range(len(images)):
79 |     # draw ground truth
80 |     _draw_box(
81 |         images[i], bboxes[i],
82 |         [mc.CLASS_NAMES[idx] for idx in labels[i]],
83 |         (0, 255, 0))
84 | 
85 |     # draw prediction
86 |     det_bbox, det_prob, det_class = model.filter_prediction(
87 |         batch_det_bbox[i], batch_det_prob[i], batch_det_class[i])
88 | 
89 |     keep_idx = [idx for idx in range(len(det_prob)) \
90 |                 if det_prob[idx] > mc.PLOT_PROB_THRESH]
91 |     det_bbox = [det_bbox[idx] for idx in keep_idx]
92 |     det_prob = [det_prob[idx] for idx in keep_idx]
93 |     det_class = [det_class[idx] for idx in keep_idx]
94 | 
95 |     _draw_box(
96 |         images[i], det_bbox,
97 |         [mc.CLASS_NAMES[idx]+': (%.2f)'% prob \
98 |             for idx, prob in zip(det_class, det_prob)],
99 |         (0, 0, 255))
100 | 
101 | 
102 | def train():
103 |   """Train SqueezeDet model"""
104 |   assert FLAGS.dataset == 'KITTI', \
105 |       'Currently only support KITTI dataset'
106 | 
107 |   os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu
108 | 
109 |   with tf.Graph().as_default():
110 | 
111 |     assert FLAGS.net == 'vgg16' or FLAGS.net == 'resnet50' \
112 |         or FLAGS.net == 'squeezeDet' or FLAGS.net == 'squeezeDet+', \
113 |         'Selected neural net architecture not supported: {}'.format(FLAGS.net)
114 |     if FLAGS.net == 'vgg16':
115 |       mc = kitti_vgg16_config()
116 |       mc.IS_TRAINING = True
117 |       mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path
118 |       model = VGG16ConvDet(mc)
119 |     elif FLAGS.net == 'resnet50':
120 |       mc = kitti_res50_config()
121 |       mc.IS_TRAINING = True
122 |
mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path 123 | model = ResNet50ConvDet(mc) 124 | elif FLAGS.net == 'squeezeDet': 125 | mc = kitti_squeezeDet_config() 126 | mc.IS_TRAINING = True 127 | mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path 128 | model = SqueezeDet(mc) 129 | elif FLAGS.net == 'squeezeDet+': 130 | mc = kitti_squeezeDetPlus_config() 131 | mc.IS_TRAINING = True 132 | mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path 133 | model = SqueezeDetPlus(mc) 134 | 135 | imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc) 136 | 137 | # save model size, flops, activations by layers 138 | with open(os.path.join(FLAGS.train_dir, 'model_metrics.txt'), 'w') as f: 139 | f.write('Number of parameter by layer:\n') 140 | count = 0 141 | for c in model.model_size_counter: 142 | f.write('\t{}: {}\n'.format(c[0], c[1])) 143 | count += c[1] 144 | f.write('\ttotal: {}\n'.format(count)) 145 | 146 | count = 0 147 | f.write('\nActivation size by layer:\n') 148 | for c in model.activation_counter: 149 | f.write('\t{}: {}\n'.format(c[0], c[1])) 150 | count += c[1] 151 | f.write('\ttotal: {}\n'.format(count)) 152 | 153 | count = 0 154 | f.write('\nNumber of flops by layer:\n') 155 | for c in model.flop_counter: 156 | f.write('\t{}: {}\n'.format(c[0], c[1])) 157 | count += c[1] 158 | f.write('\ttotal: {}\n'.format(count)) 159 | f.close() 160 | print ('Model statistics saved to {}.'.format( 161 | os.path.join(FLAGS.train_dir, 'model_metrics.txt'))) 162 | 163 | def _load_data(load_to_placeholder=True): 164 | # read batch input 165 | image_per_batch, label_per_batch, box_delta_per_batch, aidx_per_batch, \ 166 | bbox_per_batch = imdb.read_batch() 167 | 168 | label_indices, bbox_indices, box_delta_values, mask_indices, box_values, \ 169 | = [], [], [], [], [] 170 | aidx_set = set() 171 | num_discarded_labels = 0 172 | num_labels = 0 173 | for i in range(len(label_per_batch)): # batch_size 174 | for j in range(len(label_per_batch[i])): # number of annotations 175 | num_labels += 1 176 | if (i, aidx_per_batch[i][j]) not in aidx_set: 177 | aidx_set.add((i, aidx_per_batch[i][j])) 178 | label_indices.append( 179 | [i, aidx_per_batch[i][j], label_per_batch[i][j]]) 180 | mask_indices.append([i, aidx_per_batch[i][j]]) 181 | bbox_indices.extend( 182 | [[i, aidx_per_batch[i][j], k] for k in range(4)]) 183 | box_delta_values.extend(box_delta_per_batch[i][j]) 184 | box_values.extend(bbox_per_batch[i][j]) 185 | else: 186 | num_discarded_labels += 1 187 | 188 | if mc.DEBUG_MODE: 189 | print ('Warning: Discarded {}/({}) labels that are assigned to the same ' 190 | 'anchor'.format(num_discarded_labels, num_labels)) 191 | 192 | if load_to_placeholder: 193 | image_input = model.ph_image_input 194 | input_mask = model.ph_input_mask 195 | box_delta_input = model.ph_box_delta_input 196 | box_input = model.ph_box_input 197 | labels = model.ph_labels 198 | else: 199 | image_input = model.image_input 200 | input_mask = model.input_mask 201 | box_delta_input = model.box_delta_input 202 | box_input = model.box_input 203 | labels = model.labels 204 | 205 | feed_dict = { 206 | image_input: image_per_batch, 207 | input_mask: np.reshape( 208 | sparse_to_dense( 209 | mask_indices, [mc.BATCH_SIZE, mc.ANCHORS], 210 | [1.0]*len(mask_indices)), 211 | [mc.BATCH_SIZE, mc.ANCHORS, 1]), 212 | box_delta_input: sparse_to_dense( 213 | bbox_indices, [mc.BATCH_SIZE, mc.ANCHORS, 4], 214 | box_delta_values), 215 | box_input: sparse_to_dense( 216 | bbox_indices, [mc.BATCH_SIZE, mc.ANCHORS, 4], 217 | box_values), 218 | labels: sparse_to_dense( 
219 | label_indices, 220 | [mc.BATCH_SIZE, mc.ANCHORS, mc.CLASSES], 221 | [1.0]*len(label_indices)), 222 | } 223 | 224 | return feed_dict, image_per_batch, label_per_batch, bbox_per_batch 225 | 226 | def _enqueue(sess, coord): 227 | try: 228 | while not coord.should_stop(): 229 | feed_dict, _, _, _ = _load_data() 230 | sess.run(model.enqueue_op, feed_dict=feed_dict) 231 | if mc.DEBUG_MODE: 232 | print ("added to the queue") 233 | if mc.DEBUG_MODE: 234 | print ("Finished enqueue") 235 | except Exception as e: 236 | coord.request_stop(e) 237 | 238 | sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) 239 | 240 | saver = tf.train.Saver(tf.global_variables()) 241 | summary_op = tf.summary.merge_all() 242 | 243 | init = tf.global_variables_initializer() 244 | sess.run(init) 245 | 246 | ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) 247 | if ckpt and ckpt.model_checkpoint_path: 248 | saver.restore(sess, ckpt.model_checkpoint_path) 249 | 250 | summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) 251 | 252 | coord = tf.train.Coordinator() 253 | 254 | if mc.NUM_THREAD > 0: 255 | enq_threads = [] 256 | for _ in range(mc.NUM_THREAD): 257 | enq_thread = threading.Thread(target=_enqueue, args=[sess, coord]) 258 | # enq_thread.isDaemon() 259 | enq_thread.start() 260 | enq_threads.append(enq_thread) 261 | 262 | threads = tf.train.start_queue_runners(coord=coord, sess=sess) 263 | run_options = tf.RunOptions(timeout_in_ms=60000) 264 | 265 | # try: 266 | for step in xrange(FLAGS.max_steps): 267 | if coord.should_stop(): 268 | sess.run(model.FIFOQueue.close(cancel_pending_enqueues=True)) 269 | coord.request_stop() 270 | coord.join(threads) 271 | break 272 | 273 | start_time = time.time() 274 | 275 | if step % FLAGS.summary_step == 0: 276 | feed_dict, image_per_batch, label_per_batch, bbox_per_batch = \ 277 | _load_data(load_to_placeholder=False) 278 | op_list = [ 279 | model.train_op, model.loss, summary_op, model.det_boxes, 280 | model.det_probs, model.det_class, model.conf_loss, 281 | model.bbox_loss, model.class_loss 282 | ] 283 | _, loss_value, summary_str, det_boxes, det_probs, det_class, \ 284 | conf_loss, bbox_loss, class_loss = sess.run( 285 | op_list, feed_dict=feed_dict) 286 | 287 | _viz_prediction_result( 288 | model, image_per_batch, bbox_per_batch, label_per_batch, det_boxes, 289 | det_class, det_probs) 290 | image_per_batch = bgr_to_rgb(image_per_batch) 291 | viz_summary = sess.run( 292 | model.viz_op, feed_dict={model.image_to_show: image_per_batch}) 293 | 294 | summary_writer.add_summary(summary_str, step) 295 | summary_writer.add_summary(viz_summary, step) 296 | summary_writer.flush() 297 | 298 | print ('conf_loss: {}, bbox_loss: {}, class_loss: {}'. 299 | format(conf_loss, bbox_loss, class_loss)) 300 | else: 301 | if mc.NUM_THREAD > 0: 302 | _, loss_value, conf_loss, bbox_loss, class_loss = sess.run( 303 | [model.train_op, model.loss, model.conf_loss, model.bbox_loss, 304 | model.class_loss], options=run_options) 305 | else: 306 | feed_dict, _, _, _ = _load_data(load_to_placeholder=False) 307 | _, loss_value, conf_loss, bbox_loss, class_loss = sess.run( 308 | [model.train_op, model.loss, model.conf_loss, model.bbox_loss, 309 | model.class_loss], feed_dict=feed_dict) 310 | 311 | duration = time.time() - start_time 312 | 313 | assert not np.isnan(loss_value), \ 314 | 'Model diverged. 
Total loss: {}, conf_loss: {}, bbox_loss: {}, ' \ 315 | 'class_loss: {}'.format(loss_value, conf_loss, bbox_loss, class_loss) 316 | 317 | if step % 10 == 0: 318 | num_images_per_step = mc.BATCH_SIZE 319 | images_per_sec = num_images_per_step / duration 320 | sec_per_batch = float(duration) 321 | format_str = ('%s: step %d, loss = %.2f (%.1f images/sec; %.3f ' 322 | 'sec/batch)') 323 | print (format_str % (datetime.now(), step, loss_value, 324 | images_per_sec, sec_per_batch)) 325 | sys.stdout.flush() 326 | 327 | # Save the model checkpoint periodically. 328 | if step % FLAGS.checkpoint_step == 0 or (step + 1) == FLAGS.max_steps: 329 | checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') 330 | saver.save(sess, checkpoint_path, global_step=step) 331 | # except Exception, e: 332 | # coord.request_stop(e) 333 | # finally: 334 | # coord.request_stop() 335 | # coord.join(threads) 336 | 337 | def main(argv=None): # pylint: disable=unused-argument 338 | if tf.gfile.Exists(FLAGS.train_dir): 339 | tf.gfile.DeleteRecursively(FLAGS.train_dir) 340 | tf.gfile.MakeDirs(FLAGS.train_dir) 341 | train() 342 | 343 | 344 | if __name__ == '__main__': 345 | tf.app.run() 346 | -------------------------------------------------------------------------------- /src/nn_skeleton.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """Neural network model base class.""" 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import os 10 | import sys 11 | 12 | from utils import util 13 | from easydict import EasyDict as edict 14 | import numpy as np 15 | import tensorflow as tf 16 | 17 | 18 | def _add_loss_summaries(total_loss): 19 | """Add summaries for losses 20 | Generates loss summaries for visualizing the performance of the network. 21 | Args: 22 | total_loss: Total loss from loss(). 23 | """ 24 | losses = tf.get_collection('losses') 25 | 26 | # Attach a scalar summary to all individual losses and the total loss; do the 27 | # same for the averaged version of the losses. 28 | for l in losses + [total_loss]: 29 | tf.summary.scalar(l.op.name, l) 30 | 31 | def _variable_on_device(name, shape, initializer, trainable=True): 32 | """Helper to create a Variable. 33 | 34 | Args: 35 | name: name of the variable 36 | shape: list of ints 37 | initializer: initializer for Variable 38 | 39 | Returns: 40 | Variable Tensor 41 | """ 42 | # TODO(bichen): fix the hard-coded data type below 43 | dtype = tf.float32 44 | if not callable(initializer): 45 | var = tf.get_variable(name, initializer=initializer, trainable=trainable) 46 | else: 47 | var = tf.get_variable( 48 | name, shape, initializer=initializer, dtype=dtype, trainable=trainable) 49 | return var 50 | 51 | def _variable_with_weight_decay(name, shape, wd, initializer, trainable=True): 52 | """Helper to create an initialized Variable with weight decay. 53 | 54 | Note that the Variable is initialized with a truncated normal distribution. 55 | A weight decay is added only if one is specified. 56 | 57 | Args: 58 | name: name of the variable 59 | shape: list of ints 60 | wd: add L2Loss weight decay multiplied by this float. If None, weight 61 | decay is not added for this Variable. 
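initializer: initializer for the Variable; either a constant value or a callable initializer. trainable: if true, add the Variable to the TRAINABLE_VARIABLES collection (weight decay is likewise only added for trainable Variables).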
62 | 63 | Returns: 64 | Variable Tensor 65 | """ 66 | var = _variable_on_device(name, shape, initializer, trainable) 67 | if wd is not None and trainable: 68 | weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss') 69 | tf.add_to_collection('losses', weight_decay) 70 | return var 71 | 72 | class ModelSkeleton: 73 | """Base class of NN detection models.""" 74 | def __init__(self, mc): 75 | self.mc = mc 76 | # a scalar value in range (0, 1]. Usually set to 0.5 in training phase and 77 | # 1.0 in evaluation phase 78 | self.keep_prob = 0.5 if mc.IS_TRAINING else 1.0 79 | 80 | # image batch input 81 | self.ph_image_input = tf.placeholder( 82 | tf.float32, [mc.BATCH_SIZE, mc.IMAGE_HEIGHT, mc.IMAGE_WIDTH, 3], 83 | name='image_input' 84 | ) 85 | # A tensor where an element is 1 if the corresponding box is "responsible" 86 | # for detecting an object and 0 otherwise. 87 | self.ph_input_mask = tf.placeholder( 88 | tf.float32, [mc.BATCH_SIZE, mc.ANCHORS, 1], name='box_mask') 89 | # Tensor used to represent bounding box deltas. 90 | self.ph_box_delta_input = tf.placeholder( 91 | tf.float32, [mc.BATCH_SIZE, mc.ANCHORS, 4], name='box_delta_input') 92 | # Tensor used to represent bounding box coordinates. 93 | self.ph_box_input = tf.placeholder( 94 | tf.float32, [mc.BATCH_SIZE, mc.ANCHORS, 4], name='box_input') 95 | # Tensor used to represent labels 96 | self.ph_labels = tf.placeholder( 97 | tf.float32, [mc.BATCH_SIZE, mc.ANCHORS, mc.CLASSES], name='labels') 98 | 99 | # IOU between predicted anchors and ground-truth boxes 100 | self.ious = tf.Variable( 101 | initial_value=np.zeros((mc.BATCH_SIZE, mc.ANCHORS)), trainable=False, 102 | name='iou', dtype=tf.float32 103 | ) 104 | 105 | self.FIFOQueue = tf.FIFOQueue( 106 | capacity=mc.QUEUE_CAPACITY, 107 | dtypes=[tf.float32, tf.float32, tf.float32, 108 | tf.float32, tf.float32], 109 | shapes=[[mc.IMAGE_HEIGHT, mc.IMAGE_WIDTH, 3], 110 | [mc.ANCHORS, 1], 111 | [mc.ANCHORS, 4], 112 | [mc.ANCHORS, 4], 113 | [mc.ANCHORS, mc.CLASSES]], 114 | ) 115 | 116 | self.enqueue_op = self.FIFOQueue.enqueue_many( 117 | [self.ph_image_input, self.ph_input_mask, 118 | self.ph_box_delta_input, self.ph_box_input, self.ph_labels] 119 | ) 120 | 121 | self.image_input, self.input_mask, self.box_delta_input, \ 122 | self.box_input, self.labels = tf.train.batch( 123 | self.FIFOQueue.dequeue(), batch_size=mc.BATCH_SIZE, 124 | capacity=mc.QUEUE_CAPACITY) 125 | 126 | # model parameters 127 | self.model_params = [] 128 | 129 | # model size counter 130 | self.model_size_counter = [] # list of (layer name, parameter size) tuples 131 | # flop counter 132 | self.flop_counter = [] # list of (layer name, flop count) tuples 133 | # activation counter 134 | self.activation_counter = [] # list of (layer name, output activation count) tuples 135 | self.activation_counter.append(('input', mc.IMAGE_WIDTH*mc.IMAGE_HEIGHT*3)) 136 | 137 | 138 | def _add_forward_graph(self): 139 | """NN architecture specification.""" 140 | raise NotImplementedError 141 | 142 | def _add_interpretation_graph(self): 143 | """Interpret NN output.""" 144 | mc = self.mc 145 | 146 | with tf.variable_scope('interpret_output') as scope: 147 | preds = self.preds 148 | 149 | # probability 150 | num_class_probs = mc.ANCHOR_PER_GRID*mc.CLASSES 151 | self.pred_class_probs = tf.reshape( 152 | tf.nn.softmax( 153 | tf.reshape( 154 | preds[:, :, :, :num_class_probs], 155 | [-1, mc.CLASSES] 156 | ) 157 | ), 158 | [mc.BATCH_SIZE, mc.ANCHORS, mc.CLASSES], 159 | name='pred_class_probs' 160 | ) 161 | 162 | # confidence 163 | 
num_confidence_scores = mc.ANCHOR_PER_GRID+num_class_probs 164 | self.pred_conf = tf.sigmoid( 165 | tf.reshape( 166 | preds[:, :, :, num_class_probs:num_confidence_scores], 167 | [mc.BATCH_SIZE, mc.ANCHORS] 168 | ), 169 | name='pred_confidence_score' 170 | ) 171 | 172 | # bbox_delta 173 | self.pred_box_delta = tf.reshape( 174 | preds[:, :, :, num_confidence_scores:], 175 | [mc.BATCH_SIZE, mc.ANCHORS, 4], 176 | name='bbox_delta' 177 | ) 178 | 179 | # number of objects. Used to normalize bbox and classification loss 180 | self.num_objects = tf.reduce_sum(self.input_mask, name='num_objects') 181 | 182 | with tf.variable_scope('bbox') as scope: 183 | with tf.variable_scope('stretching'): 184 | delta_x, delta_y, delta_w, delta_h = tf.unstack( 185 | self.pred_box_delta, axis=2) 186 | 187 | anchor_x = mc.ANCHOR_BOX[:, 0] 188 | anchor_y = mc.ANCHOR_BOX[:, 1] 189 | anchor_w = mc.ANCHOR_BOX[:, 2] 190 | anchor_h = mc.ANCHOR_BOX[:, 3] 191 | 192 | box_center_x = tf.identity( 193 | anchor_x + delta_x * anchor_w, name='bbox_cx') 194 | box_center_y = tf.identity( 195 | anchor_y + delta_y * anchor_h, name='bbox_cy') 196 | box_width = tf.identity( 197 | anchor_w * util.safe_exp(delta_w, mc.EXP_THRESH), 198 | name='bbox_width') 199 | box_height = tf.identity( 200 | anchor_h * util.safe_exp(delta_h, mc.EXP_THRESH), 201 | name='bbox_height') 202 | 203 | self._activation_summary(delta_x, 'delta_x') 204 | self._activation_summary(delta_y, 'delta_y') 205 | self._activation_summary(delta_w, 'delta_w') 206 | self._activation_summary(delta_h, 'delta_h') 207 | 208 | self._activation_summary(box_center_x, 'bbox_cx') 209 | self._activation_summary(box_center_y, 'bbox_cy') 210 | self._activation_summary(box_width, 'bbox_width') 211 | self._activation_summary(box_height, 'bbox_height') 212 | 213 | with tf.variable_scope('trimming'): 214 | xmins, ymins, xmaxs, ymaxs = util.bbox_transform( 215 | [box_center_x, box_center_y, box_width, box_height]) 216 | 217 | # The max x position is mc.IMAGE_WIDTH - 1 since we use zero-based 218 | # pixels. Same for y. 
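# For example, with the KITTI configs in this repo (IMAGE_WIDTH=1248), a raw xmin of -3.2 is clipped to 0.0 and a raw xmax of 1250.7 to 1247.0 by the ops below (illustrative values, not from a real run).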
219 | xmins = tf.minimum( 220 | tf.maximum(0.0, xmins), mc.IMAGE_WIDTH-1.0, name='bbox_xmin') 221 | self._activation_summary(xmins, 'box_xmin') 222 | 223 | ymins = tf.minimum( 224 | tf.maximum(0.0, ymins), mc.IMAGE_HEIGHT-1.0, name='bbox_ymin') 225 | self._activation_summary(ymins, 'box_ymin') 226 | 227 | xmaxs = tf.maximum( 228 | tf.minimum(mc.IMAGE_WIDTH-1.0, xmaxs), 0.0, name='bbox_xmax') 229 | self._activation_summary(xmaxs, 'box_xmax') 230 | 231 | ymaxs = tf.maximum( 232 | tf.minimum(mc.IMAGE_HEIGHT-1.0, ymaxs), 0.0, name='bbox_ymax') 233 | self._activation_summary(ymaxs, 'box_ymax') 234 | 235 | self.det_boxes = tf.transpose( 236 | tf.stack(util.bbox_transform_inv([xmins, ymins, xmaxs, ymaxs])), 237 | (1, 2, 0), name='bbox' 238 | ) 239 | 240 | with tf.variable_scope('IOU'): 241 | def _tensor_iou(box1, box2): 242 | with tf.variable_scope('intersection'): 243 | xmin = tf.maximum(box1[0], box2[0], name='xmin') 244 | ymin = tf.maximum(box1[1], box2[1], name='ymin') 245 | xmax = tf.minimum(box1[2], box2[2], name='xmax') 246 | ymax = tf.minimum(box1[3], box2[3], name='ymax') 247 | 248 | w = tf.maximum(0.0, xmax-xmin, name='inter_w') 249 | h = tf.maximum(0.0, ymax-ymin, name='inter_h') 250 | intersection = tf.multiply(w, h, name='intersection') 251 | 252 | with tf.variable_scope('union'): 253 | w1 = tf.subtract(box1[2], box1[0], name='w1') 254 | h1 = tf.subtract(box1[3], box1[1], name='h1') 255 | w2 = tf.subtract(box2[2], box2[0], name='w2') 256 | h2 = tf.subtract(box2[3], box2[1], name='h2') 257 | 258 | union = w1*h1 + w2*h2 - intersection 259 | 260 | return intersection/(union+mc.EPSILON) \ 261 | * tf.reshape(self.input_mask, [mc.BATCH_SIZE, mc.ANCHORS]) 262 | 263 | self.ious = self.ious.assign( 264 | _tensor_iou( 265 | util.bbox_transform(tf.unstack(self.det_boxes, axis=2)), 266 | util.bbox_transform(tf.unstack(self.box_input, axis=2)) 267 | ) 268 | ) 269 | self._activation_summary(self.ious, 'conf_score') 270 | 271 | with tf.variable_scope('probability') as scope: 272 | self._activation_summary(self.pred_class_probs, 'class_probs') 273 | 274 | probs = tf.multiply( 275 | self.pred_class_probs, 276 | tf.reshape(self.pred_conf, [mc.BATCH_SIZE, mc.ANCHORS, 1]), 277 | name='final_class_prob' 278 | ) 279 | 280 | self._activation_summary(probs, 'final_class_prob') 281 | 282 | self.det_probs = tf.reduce_max(probs, 2, name='score') 283 | self.det_class = tf.argmax(probs, 2, name='class_idx') 284 | 285 | def _add_loss_graph(self): 286 | """Define the loss operation.""" 287 | mc = self.mc 288 | 289 | with tf.variable_scope('class_regression') as scope: 290 | # cross-entropy: q * -log(p) + (1-q) * -log(1-p) 291 | # add a small value into log to prevent blowing up 292 | self.class_loss = tf.truediv( 293 | tf.reduce_sum( 294 | (self.labels*(-tf.log(self.pred_class_probs+mc.EPSILON)) 295 | + (1-self.labels)*(-tf.log(1-self.pred_class_probs+mc.EPSILON))) 296 | * self.input_mask * mc.LOSS_COEF_CLASS), 297 | self.num_objects, 298 | name='class_loss' 299 | ) 300 | tf.add_to_collection('losses', self.class_loss) 301 | 302 | with tf.variable_scope('confidence_score_regression') as scope: 303 | input_mask = tf.reshape(self.input_mask, [mc.BATCH_SIZE, mc.ANCHORS]) 304 | self.conf_loss = tf.reduce_mean( 305 | tf.reduce_sum( 306 | tf.square((self.ious - self.pred_conf)) 307 | * (input_mask*mc.LOSS_COEF_CONF_POS/self.num_objects 308 | +(1-input_mask)*mc.LOSS_COEF_CONF_NEG/(mc.ANCHORS-self.num_objects)), 309 | reduction_indices=[1] 310 | ), 311 | name='confidence_loss' 312 | ) 313 | tf.add_to_collection('losses', 
self.conf_loss) 314 | tf.summary.scalar('mean_iou', tf.reduce_sum(self.ious)/self.num_objects) 315 | 316 | with tf.variable_scope('bounding_box_regression') as scope: 317 | self.bbox_loss = tf.truediv( 318 | tf.reduce_sum( 319 | mc.LOSS_COEF_BBOX * tf.square( 320 | self.input_mask*(self.pred_box_delta-self.box_delta_input))), 321 | self.num_objects, 322 | name='bbox_loss' 323 | ) 324 | tf.add_to_collection('losses', self.bbox_loss) 325 | 326 | # add above losses as well as weight decay losses to form the total loss 327 | self.loss = tf.add_n(tf.get_collection('losses'), name='total_loss') 328 | 329 | def _add_train_graph(self): 330 | """Define the training operation.""" 331 | mc = self.mc 332 | 333 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 334 | lr = tf.train.exponential_decay(mc.LEARNING_RATE, 335 | self.global_step, 336 | mc.DECAY_STEPS, 337 | mc.LR_DECAY_FACTOR, 338 | staircase=True) 339 | 340 | tf.summary.scalar('learning_rate', lr) 341 | 342 | _add_loss_summaries(self.loss) 343 | 344 | opt = tf.train.MomentumOptimizer(learning_rate=lr, momentum=mc.MOMENTUM) 345 | grads_vars = opt.compute_gradients(self.loss, tf.trainable_variables()) 346 | 347 | with tf.variable_scope('clip_gradient') as scope: 348 | for i, (grad, var) in enumerate(grads_vars): 349 | grads_vars[i] = (tf.clip_by_norm(grad, mc.MAX_GRAD_NORM), var) 350 | 351 | apply_gradient_op = opt.apply_gradients(grads_vars, global_step=self.global_step) 352 | 353 | for var in tf.trainable_variables(): 354 | tf.summary.histogram(var.op.name, var) 355 | 356 | for grad, var in grads_vars: 357 | if grad is not None: 358 | tf.summary.histogram(var.op.name + '/gradients', grad) 359 | 360 | with tf.control_dependencies([apply_gradient_op]): 361 | self.train_op = tf.no_op(name='train') 362 | 363 | def _add_viz_graph(self): 364 | """Define the visualization operation.""" 365 | mc = self.mc 366 | self.image_to_show = tf.placeholder( 367 | tf.float32, [None, mc.IMAGE_HEIGHT, mc.IMAGE_WIDTH, 3], 368 | name='image_to_show' 369 | ) 370 | self.viz_op = tf.summary.image('sample_detection_results', 371 | self.image_to_show, collections=['image_summary'], 372 | max_outputs=mc.BATCH_SIZE) 373 | 374 | def _conv_bn_layer( 375 | self, inputs, conv_param_name, bn_param_name, scale_param_name, filters, 376 | size, stride, padding='SAME', freeze=False, relu=True, 377 | conv_with_bias=False, stddev=0.001): 378 | """ Convolution + BatchNorm + [relu] layer. Batch mean and var are treated 379 | as constant. Weights have to be initialized from a pre-trained model or 380 | restored from a checkpoint. 381 | 382 | Args: 383 | inputs: input tensor 384 | conv_param_name: name of the convolution parameters 385 | bn_param_name: name of the batch normalization parameters 386 | scale_param_name: name of the scale parameters 387 | filters: number of output filters. 388 | size: kernel size. 389 | stride: stride 390 | padding: 'SAME' or 'VALID'. See tensorflow doc for detailed description. 391 | freeze: if true, then do not train the parameters in this layer. 392 | relu: whether to use relu or not. 393 | conv_with_bias: whether or not to add a bias term to the convolution output. 394 | stddev: standard deviation used for random weight initializer. 395 | 396 | Returns: 397 | A convolutional layer operation. 
398 | """ 399 | mc = self.mc 400 | 401 | with tf.variable_scope(conv_param_name) as scope: 402 | channels = inputs.get_shape()[3] 403 | 404 | if mc.LOAD_PRETRAINED_MODEL: 405 | cw = self.caffemodel_weight 406 | kernel_val = np.transpose(cw[conv_param_name][0], [2,3,1,0]) 407 | if conv_with_bias: 408 | bias_val = cw[conv_param_name][1] 409 | mean_val = cw[bn_param_name][0] 410 | var_val = cw[bn_param_name][1] 411 | gamma_val = cw[scale_param_name][0] 412 | beta_val = cw[scale_param_name][1] 413 | else: 414 | kernel_val = tf.truncated_normal_initializer( 415 | stddev=stddev, dtype=tf.float32) 416 | if conv_with_bias: 417 | bias_val = tf.constant_initializer(0.0) 418 | mean_val = tf.constant_initializer(0.0) 419 | var_val = tf.constant_initializer(1.0) 420 | gamma_val = tf.constant_initializer(1.0) 421 | beta_val = tf.constant_initializer(0.0) 422 | 423 | # re-order the caffe kernel with shape [out, in, h, w] -> tf kernel with 424 | # shape [h, w, in, out] 425 | kernel = _variable_with_weight_decay( 426 | 'kernels', shape=[size, size, int(channels), filters], 427 | wd=mc.WEIGHT_DECAY, initializer=kernel_val, trainable=(not freeze)) 428 | self.model_params += [kernel] 429 | if conv_with_bias: 430 | biases = _variable_on_device('biases', [filters], bias_val, 431 | trainable=(not freeze)) 432 | self.model_params += [biases] 433 | gamma = _variable_on_device('gamma', [filters], gamma_val, 434 | trainable=(not freeze)) 435 | beta = _variable_on_device('beta', [filters], beta_val, 436 | trainable=(not freeze)) 437 | mean = _variable_on_device('mean', [filters], mean_val, trainable=False) 438 | var = _variable_on_device('var', [filters], var_val, trainable=False) 439 | self.model_params += [gamma, beta, mean, var] 440 | 441 | conv = tf.nn.conv2d( 442 | inputs, kernel, [1, stride, stride, 1], padding=padding, 443 | name='convolution') 444 | if conv_with_bias: 445 | conv = tf.nn.bias_add(conv, biases, name='bias_add') 446 | 447 | conv = tf.nn.batch_normalization( 448 | conv, mean=mean, variance=var, offset=beta, scale=gamma, 449 | variance_epsilon=mc.BATCH_NORM_EPSILON, name='batch_norm') 450 | 451 | self.model_size_counter.append( 452 | (conv_param_name, (1+size*size*int(channels))*filters) 453 | ) 454 | out_shape = conv.get_shape().as_list() 455 | num_flops = \ 456 | (1+2*int(channels)*size*size)*filters*out_shape[1]*out_shape[2] 457 | if relu: 458 | num_flops += 2*filters*out_shape[1]*out_shape[2] 459 | self.flop_counter.append((conv_param_name, num_flops)) 460 | 461 | self.activation_counter.append( 462 | (conv_param_name, out_shape[1]*out_shape[2]*out_shape[3]) 463 | ) 464 | 465 | if relu: 466 | return tf.nn.relu(conv) 467 | else: 468 | return conv 469 | 470 | 471 | def _conv_layer( 472 | self, layer_name, inputs, filters, size, stride, padding='SAME', 473 | freeze=False, xavier=False, relu=True, stddev=0.001): 474 | """Convolutional layer operation constructor. 475 | 476 | Args: 477 | layer_name: layer name. 478 | inputs: input tensor 479 | filters: number of output filters. 480 | size: kernel size. 481 | stride: stride 482 | padding: 'SAME' or 'VALID'. See tensorflow doc for detailed description. 483 | freeze: if true, then do not train the parameters in this layer. 484 | xavier: whether to use xavier weight initializer or not. 485 | relu: whether to use relu or not. 486 | stddev: standard deviation used for random weight initializer. 487 | Returns: 488 | A convolutional layer operation. 
489 | """ 490 | 491 | mc = self.mc 492 | use_pretrained_param = False 493 | if mc.LOAD_PRETRAINED_MODEL: 494 | cw = self.caffemodel_weight 495 | if layer_name in cw: 496 | kernel_val = np.transpose(cw[layer_name][0], [2,3,1,0]) 497 | bias_val = cw[layer_name][1] 498 | # check the shape 499 | if (kernel_val.shape == 500 | (size, size, inputs.get_shape().as_list()[-1], filters)) \ 501 | and (bias_val.shape == (filters, )): 502 | use_pretrained_param = True 503 | else: 504 | print ('Shape of the pretrained parameter of {} does not match, ' 505 | 'use randomly initialized parameter'.format(layer_name)) 506 | else: 507 | print ('Cannot find {} in the pretrained model. Use randomly initialized ' 508 | 'parameters'.format(layer_name)) 509 | 510 | if mc.DEBUG_MODE: 511 | print('Input tensor shape to {}: {}'.format(layer_name, inputs.get_shape())) 512 | 513 | with tf.variable_scope(layer_name) as scope: 514 | channels = inputs.get_shape()[3] 515 | 516 | # re-order the caffe kernel with shape [out, in, h, w] -> tf kernel with 517 | # shape [h, w, in, out] 518 | if use_pretrained_param: 519 | if mc.DEBUG_MODE: 520 | print ('Using pretrained model for {}'.format(layer_name)) 521 | kernel_init = tf.constant(kernel_val , dtype=tf.float32) 522 | bias_init = tf.constant(bias_val, dtype=tf.float32) 523 | elif xavier: 524 | kernel_init = tf.contrib.layers.xavier_initializer_conv2d() 525 | bias_init = tf.constant_initializer(0.0) 526 | else: 527 | kernel_init = tf.truncated_normal_initializer( 528 | stddev=stddev, dtype=tf.float32) 529 | bias_init = tf.constant_initializer(0.0) 530 | 531 | kernel = _variable_with_weight_decay( 532 | 'kernels', shape=[size, size, int(channels), filters], 533 | wd=mc.WEIGHT_DECAY, initializer=kernel_init, trainable=(not freeze)) 534 | 535 | biases = _variable_on_device('biases', [filters], bias_init, 536 | trainable=(not freeze)) 537 | self.model_params += [kernel, biases] 538 | 539 | conv = tf.nn.conv2d( 540 | inputs, kernel, [1, stride, stride, 1], padding=padding, 541 | name='convolution') 542 | conv_bias = tf.nn.bias_add(conv, biases, name='bias_add') 543 | 544 | if relu: 545 | out = tf.nn.relu(conv_bias, 'relu') 546 | else: 547 | out = conv_bias 548 | 549 | self.model_size_counter.append( 550 | (layer_name, (1+size*size*int(channels))*filters) 551 | ) 552 | out_shape = out.get_shape().as_list() 553 | num_flops = \ 554 | (1+2*int(channels)*size*size)*filters*out_shape[1]*out_shape[2] 555 | if relu: 556 | num_flops += 2*filters*out_shape[1]*out_shape[2] 557 | self.flop_counter.append((layer_name, num_flops)) 558 | 559 | self.activation_counter.append( 560 | (layer_name, out_shape[1]*out_shape[2]*out_shape[3]) 561 | ) 562 | 563 | return out 564 | 565 | def _pooling_layer( 566 | self, layer_name, inputs, size, stride, padding='SAME'): 567 | """Pooling layer operation constructor. 568 | 569 | Args: 570 | layer_name: layer name. 571 | inputs: input tensor 572 | size: kernel size. 573 | stride: stride 574 | padding: 'SAME' or 'VALID'. See tensorflow doc for detailed description. 575 | Returns: 576 | A pooling layer operation. 
577 | """ 578 | 579 | with tf.variable_scope(layer_name) as scope: 580 | out = tf.nn.max_pool(inputs, 581 | ksize=[1, size, size, 1], 582 | strides=[1, stride, stride, 1], 583 | padding=padding) 584 | activation_size = np.prod(out.get_shape().as_list()[1:]) 585 | self.activation_counter.append((layer_name, activation_size)) 586 | return out 587 | 588 | 589 | def _fc_layer( 590 | self, layer_name, inputs, hiddens, flatten=False, relu=True, 591 | xavier=False, stddev=0.001): 592 | """Fully connected layer operation constructor. 593 | 594 | Args: 595 | layer_name: layer name. 596 | inputs: input tensor 597 | hiddens: number of (hidden) neurons in this layer. 598 | flatten: if true, reshape the input 4D tensor of shape 599 | (batch, height, weight, channel) into a 2D tensor with shape 600 | (batch, -1). This is used when the input to the fully connected layer 601 | is output of a convolutional layer. 602 | relu: whether to use relu or not. 603 | xavier: whether to use xavier weight initializer or not. 604 | stddev: standard deviation used for random weight initializer. 605 | Returns: 606 | A fully connected layer operation. 607 | """ 608 | mc = self.mc 609 | 610 | use_pretrained_param = False 611 | if mc.LOAD_PRETRAINED_MODEL: 612 | cw = self.caffemodel_weight 613 | if layer_name in cw: 614 | use_pretrained_param = True 615 | kernel_val = cw[layer_name][0] 616 | bias_val = cw[layer_name][1] 617 | 618 | if mc.DEBUG_MODE: 619 | print('Input tensor shape to {}: {}'.format(layer_name, inputs.get_shape())) 620 | 621 | with tf.variable_scope(layer_name) as scope: 622 | input_shape = inputs.get_shape().as_list() 623 | if flatten: 624 | dim = input_shape[1]*input_shape[2]*input_shape[3] 625 | inputs = tf.reshape(inputs, [-1, dim]) 626 | if use_pretrained_param: 627 | try: 628 | # check the size before layout transform 629 | assert kernel_val.shape == (hiddens, dim), \ 630 | 'kernel shape error at {}'.format(layer_name) 631 | kernel_val = np.reshape( 632 | np.transpose( 633 | np.reshape( 634 | kernel_val, # O x (C*H*W) 635 | (hiddens, input_shape[3], input_shape[1], input_shape[2]) 636 | ), # O x C x H x W 637 | (2, 3, 1, 0) 638 | ), # H x W x C x O 639 | (dim, -1) 640 | ) # (H*W*C) x O 641 | # check the size after layout transform 642 | assert kernel_val.shape == (dim, hiddens), \ 643 | 'kernel shape error at {}'.format(layer_name) 644 | except: 645 | # Do not use pretrained parameter if shape doesn't match 646 | use_pretrained_param = False 647 | print ('Shape of the pretrained parameter of {} does not match, ' 648 | 'use randomly initialized parameter'.format(layer_name)) 649 | else: 650 | dim = input_shape[1] 651 | if use_pretrained_param: 652 | try: 653 | kernel_val = np.transpose(kernel_val, (1,0)) 654 | assert kernel_val.shape == (dim, hiddens), \ 655 | 'kernel shape error at {}'.format(layer_name) 656 | except: 657 | use_pretrained_param = False 658 | print ('Shape of the pretrained parameter of {} does not match, ' 659 | 'use randomly initialized parameter'.format(layer_name)) 660 | 661 | if use_pretrained_param: 662 | if mc.DEBUG_MODE: 663 | print ('Using pretrained model for {}'.format(layer_name)) 664 | kernel_init = tf.constant(kernel_val, dtype=tf.float32) 665 | bias_init = tf.constant(bias_val, dtype=tf.float32) 666 | elif xavier: 667 | kernel_init = tf.contrib.layers.xavier_initializer() 668 | bias_init = tf.constant_initializer(0.0) 669 | else: 670 | kernel_init = tf.truncated_normal_initializer( 671 | stddev=stddev, dtype=tf.float32) 672 | bias_init = tf.constant_initializer(0.0) 673 
| 674 | weights = _variable_with_weight_decay( 675 | 'weights', shape=[dim, hiddens], wd=mc.WEIGHT_DECAY, 676 | initializer=kernel_init) 677 | biases = _variable_on_device('biases', [hiddens], bias_init) 678 | self.model_params += [weights, biases] 679 | 680 | outputs = tf.nn.bias_add(tf.matmul(inputs, weights), biases) 681 | if relu: 682 | outputs = tf.nn.relu(outputs, 'relu') 683 | 684 | # count layer stats 685 | self.model_size_counter.append((layer_name, (dim+1)*hiddens)) 686 | 687 | num_flops = 2 * dim * hiddens + hiddens 688 | if relu: 689 | num_flops += 2*hiddens 690 | self.flop_counter.append((layer_name, num_flops)) 691 | 692 | self.activation_counter.append((layer_name, hiddens)) 693 | 694 | return outputs 695 | 696 | def filter_prediction(self, boxes, probs, cls_idx): 697 | """Filter bounding box predictions with probability threshold and 698 | non-maximum suppression. 699 | 700 | Args: 701 | boxes: array of [cx, cy, w, h]. 702 | probs: array of probabilities 703 | cls_idx: array of class indices 704 | Returns: 705 | final_boxes: array of filtered bounding boxes. 706 | final_probs: array of filtered probabilities 707 | final_cls_idx: array of filtered class indices 708 | """ 709 | mc = self.mc 710 | 711 | if mc.TOP_N_DETECTION < len(probs) and mc.TOP_N_DETECTION > 0: 712 | order = probs.argsort()[:-mc.TOP_N_DETECTION-1:-1] 713 | probs = probs[order] 714 | boxes = boxes[order] 715 | cls_idx = cls_idx[order] 716 | else: 717 | filtered_idx = np.nonzero(probs>mc.PROB_THRESH)[0] 718 | probs = probs[filtered_idx] 719 | boxes = boxes[filtered_idx] 720 | cls_idx = cls_idx[filtered_idx] 721 | 722 | final_boxes = [] 723 | final_probs = [] 724 | final_cls_idx = [] 725 | 726 | for c in range(mc.CLASSES): 727 | idx_per_class = [i for i in range(len(probs)) if cls_idx[i] == c] 728 | keep = util.nms(boxes[idx_per_class], probs[idx_per_class], mc.NMS_THRESH) 729 | for i in range(len(keep)): 730 | if keep[i]: 731 | final_boxes.append(boxes[idx_per_class[i]]) 732 | final_probs.append(probs[idx_per_class[i]]) 733 | final_cls_idx.append(c) 734 | return final_boxes, final_probs, final_cls_idx 735 | 736 | def _activation_summary(self, x, layer_name): 737 | """Helper to create summaries for activations. 
738 | 739 | Args: 740 | x: layer output tensor 741 | layer_name: name of the layer 742 | Returns: 743 | nothing 744 | """ 745 | with tf.variable_scope('activation_summary') as scope: 746 | tf.summary.histogram( 747 | 'activation_summary/'+layer_name, x) 748 | tf.summary.scalar( 749 | 'activation_summary/'+layer_name+'/sparsity', tf.nn.zero_fraction(x)) 750 | tf.summary.scalar( 751 | 'activation_summary/'+layer_name+'/average', tf.reduce_mean(x)) 752 | tf.summary.scalar( 753 | 'activation_summary/'+layer_name+'/max', tf.reduce_max(x)) 754 | tf.summary.scalar( 755 | 'activation_summary/'+layer_name+'/min', tf.reduce_min(x)) 756 | -------------------------------------------------------------------------------- /src/dataset/kitti-eval/cpp/evaluate_object.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <stdio.h> 3 | #include <math.h> 4 | #include <vector> 5 | #include <algorithm> 6 | #include <numeric> 7 | #include <strings.h> 8 | #include <assert.h> 9 | // NOTE: angle-bracket contents (header names, template parameters) were stripped from this dump; they are restored from usage below, and spans that could not be recovered are marked inline. 10 | #include "mail.h" 11 | 12 | using namespace std; 13 | 14 | #include <sstream> 15 | template< typename T > inline std::string str(T const & i) { std::stringstream s; s << i; return s.str(); } // T-to-string 16 | /*======================================================================= 17 | STATIC EVALUATION PARAMETERS 18 | =======================================================================*/ 19 | 20 | // path handling 21 | string ospj( string const & a, string const & b ) { return a + "/" + b; } 22 | string ospj( string const & a, string const & b, string const & c ) { return a + "/" + b + "/" + c; } 23 | 24 | // easy, moderate and hard evaluation level 25 | enum DIFFICULTY{EASY=0, MODERATE=1, HARD=2}; 26 | 27 | // evaluation parameter 28 | const int32_t MIN_HEIGHT[3] = {40, 25, 25}; // minimum height for evaluated groundtruth/detections 29 | const int32_t MAX_OCCLUSION[3] = {0, 1, 2}; // maximum occlusion level of the groundtruth used for evaluation 30 | const double MAX_TRUNCATION[3] = {0.15, 0.3, 0.5}; // maximum truncation level of the groundtruth used for evaluation 31 | 32 | // evaluated object classes 33 | enum CLASSES{CAR=0, PEDESTRIAN=1, CYCLIST=2}; 34 | 35 | // parameters varying per class 36 | vector<string> CLASS_NAMES; 37 | const double MIN_OVERLAP[3] = {0.7, 0.5, 0.5}; // the minimum overlap required for evaluation 38 | 39 | // no. of recall steps that should be evaluated (discretized) 40 | const double N_SAMPLE_PTS = 41; 41 | 42 | // initialize class names 43 | void initGlobals () { 44 | CLASS_NAMES.push_back("car"); 45 | CLASS_NAMES.push_back("pedestrian"); 46 | CLASS_NAMES.push_back("cyclist"); 47 | } 48 | 49 | /*======================================================================= 50 | DATA TYPES FOR EVALUATION 51 | =======================================================================*/ 52 | 53 | // holding data needed for precision-recall and precision-aos 54 | struct tPrData { 55 | vector<double> v; // detection score for computing score thresholds 56 | double similarity; // orientation similarity 57 | int32_t tp; // true positives 58 | int32_t fp; // false positives 59 | int32_t fn; // false negatives 60 | tPrData () : 61 | similarity(0), tp(0), fp(0), fn(0) {} 62 | }; 63 | 64 | // holding bounding boxes for ground truth and detections 65 | struct tBox { 66 | string type; // object type as car, pedestrian or cyclist,... 
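// (the 2D box corners below are in image pixel coordinates; alpha is KITTI's observation angle in [-pi..pi], with -10 marking an invalid orientation)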
67 | double x1; // left corner 68 | double y1; // top corner 69 | double x2; // right corner 70 | double y2; // bottom corner 71 | double alpha; // image orientation 72 | tBox (string type, double x1,double y1,double x2,double y2,double alpha) : 73 | type(type),x1(x1),y1(y1),x2(x2),y2(y2),alpha(alpha) {} 74 | }; 75 | 76 | // holding ground truth data 77 | struct tGroundtruth { 78 | tBox box; // object type, box, orientation 79 | double truncation; // truncation 0..1 80 | int32_t occlusion; // occlusion 0,1,2 (non, partly, fully) 81 | tGroundtruth () : 82 | box(tBox("invalid",-1,-1,-1,-1,-10)),truncation(-1),occlusion(-1) {} 83 | tGroundtruth (tBox box,double truncation,int32_t occlusion) : 84 | box(box),truncation(truncation),occlusion(occlusion) {} 85 | tGroundtruth (string type,double x1,double y1,double x2,double y2,double alpha,double truncation,int32_t occlusion) : 86 | box(tBox(type,x1,y1,x2,y2,alpha)),truncation(truncation),occlusion(occlusion) {} 87 | }; 88 | 89 | // holding detection data 90 | struct tDetection { 91 | tBox box; // object type, box, orientation 92 | double thresh; // detection score 93 | tDetection (): 94 | box(tBox("invalid",-1,-1,-1,-1,-10)),thresh(-1000) {} 95 | tDetection (tBox box,double thresh) : 96 | box(box),thresh(thresh) {} 97 | tDetection (string type,double x1,double y1,double x2,double y2,double alpha,double thresh) : 98 | box(tBox(type,x1,y1,x2,y2,alpha)),thresh(thresh) {} 99 | }; 100 | 101 | /*======================================================================= 102 | FUNCTIONS TO LOAD DETECTION AND GROUND TRUTH DATA ONCE, SAVE RESULTS 103 | =======================================================================*/ 104 | 105 | vector<tDetection> loadDetections(string file_name, bool &compute_aos, bool &eval_car, bool &eval_pedestrian, bool &eval_cyclist, bool &success) { 106 | 107 | // holds all detections (ignored detections are indicated by an index vector) 108 | vector<tDetection> detections; 109 | FILE *fp = fopen(file_name.c_str(),"r"); 110 | if (!fp) { 111 | success = false; 112 | return detections; 113 | } 114 | while (!feof(fp)) { 115 | tDetection d; 116 | double trash; 117 | char str[255]; 118 | if (fscanf(fp, "%s %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf", 119 | str, &trash, &trash, &d.box.alpha, 120 | &d.box.x1, &d.box.y1, &d.box.x2, &d.box.y2, 121 | &trash, &trash, &trash, &trash, 122 | &trash, &trash, &trash, &d.thresh )==16) { 123 | d.box.type = str; 124 | detections.push_back(d); 125 | 126 | // orientation=-10 is invalid, AOS is not evaluated if at least one orientation is invalid 127 | if(d.box.alpha==-10) 128 | compute_aos = false; 129 | 130 | // a class is only evaluated if it is detected at least once 131 | if(!eval_car && !strcasecmp(d.box.type.c_str(), "car")) 132 | eval_car = true; 133 | if(!eval_pedestrian && !strcasecmp(d.box.type.c_str(), "pedestrian")) 134 | eval_pedestrian = true; 135 | if(!eval_cyclist && !strcasecmp(d.box.type.c_str(), "cyclist")) 136 | eval_cyclist = true; 137 | } 138 | } 139 | fclose(fp); 140 | success = true; 141 | return detections; 142 | } 143 | 144 | vector<tGroundtruth> loadGroundtruth(string file_name,bool &success) { 145 | 146 | // holds all ground truth (ignored ground truth is indicated by an index vector) 147 | vector<tGroundtruth> groundtruth; 148 | FILE *fp = fopen(file_name.c_str(),"r"); 149 | if (!fp) { 150 | success = false; 151 | return groundtruth; 152 | } 153 | while (!feof(fp)) { 154 | tGroundtruth g; 155 | double trash; 156 | char str[255]; 157 | if (fscanf(fp, "%s %lf %d %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf 
%lf", 158 | str, &g.truncation, &g.occlusion, &g.box.alpha, 159 | &g.box.x1, &g.box.y1, &g.box.x2, &g.box.y2, 160 | &trash, &trash, &trash, &trash, 161 | &trash, &trash, &trash )==15) { 162 | g.box.type = str; 163 | groundtruth.push_back(g); 164 | } 165 | } 166 | fclose(fp); 167 | success = true; 168 | return groundtruth; 169 | } 170 | 171 | void saveStats (const vector &precision, const vector &aos, FILE *fp_det, FILE *fp_ap, FILE *fp_ori) { 172 | 173 | // save precision to file 174 | if(precision.empty()) 175 | return; 176 | double AP=0; 177 | uint AP_cnt = 0; 178 | for (int32_t i=0; i getThresholds(vector &v, double n_groundtruth){ 240 | 241 | // holds scores needed to compute N_SAMPLE_PTS recall values 242 | vector t; 243 | 244 | // sort scores in descending order 245 | // (highest score is assumed to give best/most confident detections) 246 | sort(v.begin(), v.end(), greater()); 247 | 248 | // get scores for linearly spaced recall 249 | double current_recall = 0; 250 | for(int32_t i=0; i >, const vector &det, vector &ignored_gt, vector &dc, vector &ignored_det, int32_t &n_gt, DIFFICULTY difficulty){ 275 | 276 | // extract ground truth bounding boxes for current evaluation class 277 | for(int32_t i=0;iMAX_OCCLUSION[difficulty] || gt[i].truncation>MAX_TRUNCATION[difficulty] || height >, const vector &det, const vector &dc, const vector &ignored_gt, const vector &ignored_det, bool compute_fp, bool compute_aos=false, double thresh=0, bool debug=false){ 346 | 347 | tPrData stat = tPrData(); 348 | const double NO_DETECTION = -10000000; 349 | vector delta; // holds angular difference for TPs (needed for AOS evaluation) 350 | vector assigned_detection; // holds wether a detection was assigned to a valid or ignored ground truth 351 | assigned_detection.assign(det.size(), false); 352 | vector ignored_threshold; 353 | ignored_threshold.assign(det.size(), false); // holds detections with a threshold lower than thresh if FP are computed 354 | 355 | // detections with a low score are ignored for computing precision (needs FP) 356 | if(compute_fp) 357 | for(int32_t i=0; i 0.5) (logical len(det)) 370 | =======================================================================*/ 371 | int32_t det_idx = -1; 372 | double valid_detection = NO_DETECTION; 373 | double max_overlap = 0; 374 | 375 | // search for a possible detection 376 | bool assigned_ignored_det = false; 377 | for(int32_t j=0; jMIN_OVERLAP[current_class] && det[j].thresh>valid_detection){ 392 | det_idx = j; 393 | valid_detection = det[j].thresh; 394 | } 395 | 396 | // for computing pr curve values, the candidate with the greatest overlap is considered 397 | // if the greatest overlap is an ignored detection (min_height), the overlapping detection is used 398 | else if(compute_fp && overlap>MIN_OVERLAP[current_class] && (overlap>max_overlap || assigned_ignored_det) && ignored_det[j]==0){ 399 | max_overlap = overlap; 400 | det_idx = j; 401 | valid_detection = 1; 402 | assigned_ignored_det = false; 403 | } 404 | else if(compute_fp && overlap>MIN_OVERLAP[current_class] && valid_detection==NO_DETECTION && ignored_det[j]==1){ 405 | det_idx = j; 406 | valid_detection = 1; 407 | assigned_ignored_det = true; 408 | } 409 | } 410 | 411 | /*======================================================================= 412 | compute TP, FP and FN 413 | =======================================================================*/ 414 | 415 | // nothing was assigned to this valid ground truth 416 | if(valid_detection==NO_DETECTION && ignored_gt[i]==0) 417 | stat.fn++; 
418 | 419 | // only evaluate valid ground truth <=> detection assignments (considering difficulty level) 420 | else if(valid_detection!=NO_DETECTION && (ignored_gt[i]==1 || ignored_det[det_idx]==1)) 421 | assigned_detection[det_idx] = true; 422 | 423 | // found a valid true positive 424 | else if(valid_detection!=NO_DETECTION){ 425 | 426 | // write highest score to threshold vector 427 | stat.tp++; 428 | stat.v.push_back(det[det_idx].thresh); 429 | 430 | // compute angular difference of detection and ground truth if valid detection orientation was provided 431 | if(compute_aos) 432 | delta.push_back(gt[i].box.alpha - det[det_idx].box.alpha); 433 | 434 | // clean up 435 | assigned_detection[det_idx] = true; 436 | } 437 | } 438 | 439 | // if FP are requested, consider stuff area 440 | if(compute_fp){ 441 | 442 | // count fp 443 | for(int32_t i=0; i<det.size(); i++){ 444 | 445 | // count false positives if required (height smaller than required is ignored) 446 | if(!(assigned_detection[i] || ignored_det[i]==-1 || ignored_det[i]==1 || ignored_threshold[i])) 447 | stat.fp++; 448 | } 449 | 450 | // do not consider detections overlapping with stuff area 451 | int32_t nstuff = 0; 452 | for(int32_t i=0; i<dc.size(); i++){ 453 | for(int32_t j=0; j<det.size(); j++){ 454 | 455 | // detections not of the current class, already assigned or with a low threshold are ignored 456 | if(assigned_detection[j]) 457 | continue; 458 | if(ignored_det[j]==-1 || ignored_det[j]==1) 459 | continue; 460 | if(ignored_threshold[j]) 461 | continue; 462 | 463 | // compute overlap and assign to stuff area, if overlap exceeds class specific value 464 | double overlap = boxoverlap(det[j], dc[i], 0); 465 | if(overlap>MIN_OVERLAP[current_class]){ 466 | assigned_detection[j] = true; 467 | nstuff++; 468 | } 469 | } 470 | } 471 | 472 | // FP = no. of detections not assigned to any ground truth, minus detections assigned to stuff areas 473 | stat.fp -= nstuff; 474 | 475 | // if all orientation values are valid, the AOS is computed 476 | if(compute_aos){ 477 | vector<double> tmp; 478 | 479 | // FP have a similarity of 0, for all TP compute AOS 480 | tmp.assign(stat.fp, 0); 481 | for(int32_t i=0; i<delta.size(); i++) 482 | tmp.push_back((1.0+cos(delta[i]))/2.0); 483 | 484 | // be sure that all orientation deltas are computed 485 | assert(tmp.size()==stat.fp+stat.tp); 486 | assert(delta.size()==stat.tp); 487 | 488 | // get the mean orientation similarity for this image 489 | if(stat.tp>0 || stat.fp>0) 490 | stat.similarity = accumulate(tmp.begin(), tmp.end(), 0.0); 491 | 492 | // there was neither a FP nor a TP, so the similarity is ignored in the evaluation 493 | else 494 | stat.similarity = -1; 495 | } 496 | } 497 | return stat; 498 | } 499 | 500 | /*======================================================================= 501 | EVALUATE CLASS-WISE 502 | =======================================================================*/ 503 | 504 | bool eval_class (FILE *fp_det, FILE *fp_ap, FILE *fp_ori, CLASSES current_class,const vector< vector<tGroundtruth> > &groundtruth,const vector< vector<tDetection> > &detections, bool compute_aos, vector<double> &precision, vector<double> &aos, DIFFICULTY difficulty, int32_t N_TESTIMAGES) { 505 | 506 | // init 507 | int32_t n_gt=0; // total no. 
of gt (denominator of recall) 508 | vector<double> v, thresholds; // detection scores, evaluated for recall discretization 509 | vector< vector<int32_t> > ignored_gt, ignored_det; // index of ignored gt detection for current class/difficulty 510 | vector< vector<tGroundtruth> > dontcare; // index of dontcare areas, included in ground truth 511 | 512 | // for all test images do 513 | for (int32_t i=0; i<N_TESTIMAGES; i++){ 514 | 515 | // holds ignored ground truth, ignored detections and dontcare areas for current frame 516 | vector<int32_t> i_gt, i_det; 517 | vector<tGroundtruth> dc; 518 | 519 | // only evaluate objects of current class and ignore occluded, truncated objects 520 | cleanData(current_class, groundtruth[i], detections[i], i_gt, dc, i_det, n_gt, difficulty); 521 | ignored_gt.push_back(i_gt); 522 | ignored_det.push_back(i_det); 523 | dontcare.push_back(dc); 524 | 525 | // compute statistics to get recall values 526 | tPrData pr_tmp = tPrData(); 527 | pr_tmp = computeStatistics(current_class, groundtruth[i], detections[i], dc, i_gt, i_det, false); 528 | 529 | // add detection scores to vector over all images 530 | for(int32_t j=0; j<pr_tmp.v.size(); j++) 531 | v.push_back(pr_tmp.v[j]); 532 | } 533 | 534 | // get scores that must be evaluated for recall discretization 535 | thresholds = getThresholds(v, n_gt); 536 | 537 | // compute TP, FP and FN for all evaluation thresholds 538 | vector<tPrData> pr; 539 | pr.assign(thresholds.size(),tPrData()); 540 | for (int32_t i=0; i<thresholds.size(); i++){ // [lines 541-557: per-threshold accumulation of TP/FP/FN/AOS via computeStatistics lost in extraction] 558 | vector<double> recall; 559 | precision.assign(N_SAMPLE_PTS, 0); 560 | if(compute_aos) 561 | aos.assign(N_SAMPLE_PTS, 0); 562 | double r=0; 563 | for (int32_t i=0; i<thresholds.size(); i++){ // [lines 564-582: recall/precision/AOS computation, saveStats call, function close and section header lost in extraction] 583 | void saveAndPlotPlots(string dir_name,string file_name,string obj_type,vector<double> vals[],bool is_aos){ 584 | 585 | char command[1024]; 586 | 587 | // save plot data to file 588 | FILE *fp = fopen((dir_name + "/" + file_name + ".txt").c_str(),"w"); 589 | for (int32_t i=0; i<(int)N_SAMPLE_PTS; i++) 590 | fprintf(fp,"%f %f %f %f\n",(double)i/(N_SAMPLE_PTS-1.0),vals[0][i],vals[1][i],vals[2][i]); 591 | fclose(fp); 592 | 593 | // create png + eps 594 | for (int32_t j=0; j<2; j++) { 595 | 596 | // open file 597 | FILE *fp = fopen((dir_name + "/" + file_name + ".gp").c_str(),"w"); 598 | 599 | // save gnuplot instructions 600 | if (j==0) { 601 | fprintf(fp,"set term png size 450,315 font \"Helvetica\" 11\n"); 602 | fprintf(fp,"set output \"%s.png\"\n",file_name.c_str()); 603 | } else { 604 | fprintf(fp,"set term postscript eps enhanced color font \"Helvetica\" 20\n"); 605 | fprintf(fp,"set output \"%s.eps\"\n",file_name.c_str()); 606 | } 607 | 608 | // set labels and ranges 609 | fprintf(fp,"set size ratio 0.7\n"); 610 | fprintf(fp,"set xrange [0:1]\n"); 611 | fprintf(fp,"set yrange [0:1]\n"); 612 | fprintf(fp,"set xlabel \"Recall\"\n"); 613 | if (!is_aos) fprintf(fp,"set ylabel \"Precision\"\n"); 614 | else fprintf(fp,"set ylabel \"Orientation Similarity\"\n"); 615 | obj_type[0] = toupper(obj_type[0]); 616 | fprintf(fp,"set title \"%s\"\n",obj_type.c_str()); 617 | 618 | // line width 619 | int32_t lw = 5; 620 | if (j==0) lw = 3; 621 | 622 | // plot error curve 623 | fprintf(fp,"plot "); 624 | fprintf(fp,"\"%s.txt\" using 1:2 title 'Easy' with lines ls 1 lw %d,",file_name.c_str(),lw); 625 | fprintf(fp,"\"%s.txt\" using 1:3 title 'Moderate' with lines ls 2 lw %d,",file_name.c_str(),lw); 626 | fprintf(fp,"\"%s.txt\" using 1:4 title 'Hard' with lines ls 3 lw %d",file_name.c_str(),lw); 627 | 628 | // close file 629 | fclose(fp); 630 | 631 | // run gnuplot => create png + eps 632 | sprintf(command,"cd %s; gnuplot %s",dir_name.c_str(),(file_name + ".gp").c_str()); 633 | system(command); 634 | } 635 | 636 | // create pdf and crop 637 | sprintf(command,"cd %s; ps2pdf %s.eps %s_large.pdf",dir_name.c_str(),file_name.c_str(),file_name.c_str()); 638 | system(command); 639 | sprintf(command,"cd %s; pdfcrop %s_large.pdf %s.pdf",dir_name.c_str(),file_name.c_str(),file_name.c_str()); 640 | system(command); 641 | sprintf(command,"cd %s; rm %s_large.pdf",dir_name.c_str(),file_name.c_str()); 642 | system(command); 643 
| } 644 | 645 | bool eval(string const & result_dir, string const & image_set_filename, string const & gt_dir, Mail* mail, int32_t N_TESTIMAGES){ 646 | 647 | // set some global parameters 648 | initGlobals(); 649 | 650 | // ground truth and result directories 651 | // string result_dir = "results/" + result_sha; 652 | string plot_dir = result_dir + "/plot"; 653 | 654 | // create output directories 655 | system(("mkdir " + plot_dir).c_str()); 656 | 657 | // hold detections and ground truth in memory 658 | vector< vector<tGroundtruth> > groundtruth; 659 | vector< vector<tDetection> > detections; 660 | 661 | // holds whether orientation similarity shall be computed (might be set to false while loading detections) 662 | // and which labels were provided by this submission 663 | bool compute_aos=true, eval_car=false, eval_pedestrian=false, eval_cyclist=false; 664 | 665 | // get image names 666 | FILE *fp = fopen( image_set_filename.c_str(),"r" ); 667 | if (!fp) { 668 | mail->msg("ERROR: Couldn't read: image set file %s!", image_set_filename.c_str() ); 669 | return false; 670 | } 671 | vector< string > image_set; 672 | while (!feof(fp)) { 673 | char str[255]; 674 | if (fscanf(fp, "%s", str) == 1){ 675 | image_set.push_back(str); 676 | } 677 | } 678 | fclose(fp); 679 | if( image_set.size() != N_TESTIMAGES ) { 680 | printf( "image_set.size()=%s N_TESTIMAGES=%s\n", str(image_set.size()).c_str(), str(N_TESTIMAGES).c_str() ); 681 | } 682 | assert(image_set.size() == N_TESTIMAGES); 683 | 684 | // for all images read groundtruth and detections 685 | mail->msg("Loading detections..."); 686 | for (int32_t i=0; i<N_TESTIMAGES; i++) { 687 | 688 | // file name 689 | char file_name[256]; 690 | sprintf(file_name,"%s.txt",image_set[i].c_str()); 691 | 692 | // read ground truth and result poses 693 | bool gt_success,det_success; 694 | vector<tGroundtruth> gt = loadGroundtruth(ospj(gt_dir,file_name),gt_success); 695 | vector<tDetection> det = loadDetections(ospj(result_dir,"data",file_name), compute_aos, eval_car, eval_pedestrian, eval_cyclist,det_success); 696 | groundtruth.push_back(gt); 697 | detections.push_back(det); 698 | 699 | // check for errors 700 | if (!gt_success) { 701 | mail->msg("ERROR: Couldn't read: %s of ground truth. 
Please write me an email!", file_name); 702 | return false; 703 | } 704 | if (!det_success) { 705 | mail->msg("ERROR: Couldn't read: %s", file_name); 706 | return false; 707 | } 708 | } 709 | mail->msg(" done."); 710 | 711 | // holds pointers for result files 712 | FILE *fp_det=0, *fp_ap=0, *fp_ori=0; 713 | 714 | // eval cars 715 | if(eval_car){ 716 | fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[CAR] + "_detection.txt").c_str(),"w"); 717 | fp_ap = fopen((result_dir + "/stats_" + CLASS_NAMES[CAR] + "_ap.txt").c_str(),"w"); 718 | if(compute_aos) 719 | fp_ori = fopen((result_dir + "/stats_" + CLASS_NAMES[CAR] + "_orientation.txt").c_str(),"w"); 720 | vector<double> precision[3], aos[3]; 721 | if( !eval_class(fp_det,fp_ap,fp_ori,CAR,groundtruth,detections,compute_aos,precision[0],aos[0],EASY,N_TESTIMAGES) 722 | || !eval_class(fp_det,fp_ap,fp_ori,CAR,groundtruth,detections,compute_aos,precision[1],aos[1],MODERATE, N_TESTIMAGES) 723 | || !eval_class(fp_det,fp_ap,fp_ori,CAR,groundtruth,detections,compute_aos,precision[2],aos[2],HARD, N_TESTIMAGES)){ 724 | mail->msg("Car evaluation failed."); 725 | return false; 726 | } 727 | fclose(fp_det); 728 | fclose(fp_ap); 729 | saveAndPlotPlots(plot_dir,CLASS_NAMES[CAR] + "_detection",CLASS_NAMES[CAR],precision,0); 730 | if(compute_aos){ 731 | saveAndPlotPlots(plot_dir,CLASS_NAMES[CAR] + "_orientation",CLASS_NAMES[CAR],aos,1); 732 | fclose(fp_ori); 733 | } 734 | } 735 | 736 | // eval pedestrians 737 | if(eval_pedestrian){ 738 | fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[PEDESTRIAN] + "_detection.txt").c_str(),"w"); 739 | fp_ap = fopen((result_dir + "/stats_" + CLASS_NAMES[PEDESTRIAN] + "_ap.txt").c_str(),"w"); 740 | if(compute_aos) 741 | fp_ori = fopen((result_dir + "/stats_" + CLASS_NAMES[PEDESTRIAN] + "_orientation.txt").c_str(),"w"); 742 | vector<double> precision[3], aos[3]; 743 | if( !eval_class(fp_det,fp_ap,fp_ori,PEDESTRIAN,groundtruth,detections,compute_aos,precision[0],aos[0],EASY, N_TESTIMAGES) 744 | || !eval_class(fp_det,fp_ap,fp_ori,PEDESTRIAN,groundtruth,detections,compute_aos,precision[1],aos[1],MODERATE,N_TESTIMAGES) 745 | || !eval_class(fp_det,fp_ap,fp_ori,PEDESTRIAN,groundtruth,detections,compute_aos,precision[2],aos[2],HARD,N_TESTIMAGES)){ 746 | mail->msg("Pedestrian evaluation failed."); 747 | return false; 748 | } 749 | fclose(fp_det); 750 | fclose(fp_ap); 751 | saveAndPlotPlots(plot_dir,CLASS_NAMES[PEDESTRIAN] + "_detection",CLASS_NAMES[PEDESTRIAN],precision,0); 752 | if(compute_aos){ 753 | fclose(fp_ori); 754 | saveAndPlotPlots(plot_dir,CLASS_NAMES[PEDESTRIAN] + "_orientation",CLASS_NAMES[PEDESTRIAN],aos,1); 755 | } 756 | } 757 | 758 | // eval cyclists 759 | if(eval_cyclist){ 760 | fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[CYCLIST] + "_detection.txt").c_str(),"w"); 761 | fp_ap = fopen((result_dir + "/stats_" + CLASS_NAMES[CYCLIST] + "_ap.txt").c_str(),"w"); 762 | if(compute_aos) 763 | fp_ori = fopen((result_dir + "/stats_" + CLASS_NAMES[CYCLIST] + "_orientation.txt").c_str(),"w"); 764 | vector<double> precision[3], aos[3]; 765 | if( !eval_class(fp_det,fp_ap,fp_ori,CYCLIST,groundtruth,detections,compute_aos,precision[0],aos[0],EASY, N_TESTIMAGES) 766 | || !eval_class(fp_det,fp_ap,fp_ori,CYCLIST,groundtruth,detections,compute_aos,precision[1],aos[1],MODERATE, N_TESTIMAGES) 767 | || !eval_class(fp_det,fp_ap,fp_ori,CYCLIST,groundtruth,detections,compute_aos,precision[2],aos[2],HARD, N_TESTIMAGES)){ 768 | mail->msg("Cyclist evaluation failed."); 769 | return false; 770 | } 771 | fclose(fp_det); 772 | fclose(fp_ap); 773 | 
saveAndPlotPlots(plot_dir,CLASS_NAMES[CYCLIST] + "_detection",CLASS_NAMES[CYCLIST],precision,0); 774 | if(compute_aos){ 775 | fclose(fp_ori); 776 | saveAndPlotPlots(plot_dir,CLASS_NAMES[CYCLIST] + "_orientation",CLASS_NAMES[CYCLIST],aos,1); 777 | } 778 | } 779 | 780 | // success 781 | return true; 782 | } 783 | 784 | int32_t main (int32_t argc,char *argv[]) { 785 | 786 | // we need 4 arguments! 787 | if (argc!=5) { 788 | cout << "Usage: ./eval_detection kitti_dir image_set_filename result_dir num_test_images" << endl; 789 | return 1; 790 | } 791 | 792 | // read arguments 793 | string const kitti_dir = argv[1]; 794 | string const gt_dir = ospj( kitti_dir, "label_2" ); // FIXME_MWM: should be part of input? configurable? 795 | string const image_set_filename = argv[2]; 796 | string const result_dir = argv[3]; 797 | int32_t const N_TESTIMAGES = atoi(argv[4]); 798 | 799 | // init notification mail 800 | Mail *mail = new Mail(); 801 | mail->msg("Thank you for participating in our evaluation!"); 802 | 803 | // run evaluation 804 | if (eval( result_dir, image_set_filename, gt_dir, mail, N_TESTIMAGES )) { 805 | mail->msg( ("Your evaluation results are available in " + result_dir).c_str() ); 806 | } else { 807 | mail->msg("An error occurred while processing your results."); 808 | mail->msg("Please make sure that the data in your zip archive has the right format!"); 809 | } 810 | 811 | // send mail and exit 812 | delete mail; 813 | 814 | return 0; 815 | } 816 | 817 | --------------------------------------------------------------------------------
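A minimal sketch (not part of the repo) of the dense-tensor assembly that train.py's _load_data() performs before feeding the network. The sparse_to_dense() stand-in below is written from how train.py calls it (indices, output shape, values -> dense array); the repo's own helper lives in src/utils/util.py and may differ in detail.

import numpy as np

def sparse_to_dense(sp_indices, output_shape, values, default_value=0):
  # write each value at its multi-dimensional index; all other entries keep the default
  array = np.full(output_shape, default_value, dtype=np.float32)
  for idx, value in zip(sp_indices, values):
    array[tuple(idx)] = value
  return array

# toy setup: a batch of 2 images with 4 anchors each; one labeled object per
# image, assigned to anchor 1 and anchor 3 respectively (hypothetical numbers)
BATCH_SIZE, ANCHORS = 2, 4
mask_indices = [[0, 1], [1, 3]]

input_mask = np.reshape(
    sparse_to_dense(mask_indices, [BATCH_SIZE, ANCHORS],
                    [1.0]*len(mask_indices)),
    [BATCH_SIZE, ANCHORS, 1])
print(input_mask[:, :, 0])
# [[0. 1. 0. 0.]
#  [0. 0. 0. 1.]]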