├── src
│   ├── dataset
│   │   ├── kitti-eval
│   │   │   ├── cpp
│   │   │   │   ├── .gitignore
│   │   │   │   ├── mail.h
│   │   │   │   └── evaluate_object.cpp
│   │   │   └── Makefile
│   │   ├── __init__.py
│   │   ├── pascal_voc.py
│   │   ├── voc_eval.py
│   │   ├── imdb.py
│   │   └── kitti.py
│   ├── __init__.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── caffemodel2pkl.py
│   │   └── util.py
│   ├── nets
│   │   ├── __init__.py
│   │   ├── vgg16_convDet.py
│   │   ├── squeezeDet.py
│   │   ├── squeezeDetPlus.py
│   │   └── resnet50_convDet.py
│   ├── config
│   │   ├── __init__.py
│   │   ├── kitti_res50_config.py
│   │   ├── kitti_vgg16_config.py
│   │   ├── kitti_squeezeDet_config.py
│   │   ├── kitti_squeezeDetPlus_config.py
│   │   ├── kitti_model_config.py
│   │   └── config.py
│   ├── demo.py
│   ├── eval.py
│   ├── train.py
│   └── nn_skeleton.py
├── .gitignore
├── data
│   ├── sample.png
│   └── random_split_train_val.py
├── README
│   ├── det_img.png
│   ├── graph.png
│   ├── out_sample.png
│   └── detection_analysis.png
├── requirements.txt
├── LICENSE
├── scripts
│   ├── eval.sh
│   └── train.sh
└── README.md

/src/dataset/kitti-eval/cpp/.gitignore:
--------------------------------------------------------------------------------
1 | evaluate_object
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *pyc
2 | data/out/
3 | data/model_checkpoints
4 | 
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
--------------------------------------------------------------------------------
/src/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
--------------------------------------------------------------------------------
/data/sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dividiti/squeezeDet/master/data/sample.png
--------------------------------------------------------------------------------
/README/det_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dividiti/squeezeDet/master/README/det_img.png
--------------------------------------------------------------------------------
/README/graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dividiti/squeezeDet/master/README/graph.png
--------------------------------------------------------------------------------
/src/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | from kitti import kitti
2 | from pascal_voc import pascal_voc
3 | 
--------------------------------------------------------------------------------
/README/out_sample.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dividiti/squeezeDet/master/README/out_sample.png
--------------------------------------------------------------------------------
/README/detection_analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dividiti/squeezeDet/master/README/detection_analysis.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | easydict==1.6
2 | joblib==0.10.3
3 | numpy==1.12.0
4 | opencv-python==3.2.0.6
5 | Pillow==4.0.0
6 | tensorflow-gpu==1.0.0
7 | 
--------------------------------------------------------------------------------
/src/dataset/kitti-eval/Makefile:
--------------------------------------------------------------------------------
1 | 
2 | cpp/evaluate_object : cpp/evaluate_object.cpp
3 | 	g++ -Wall -Wno-sign-compare -o cpp/evaluate_object cpp/evaluate_object.cpp
4 | 
--------------------------------------------------------------------------------
/src/nets/__init__.py:
--------------------------------------------------------------------------------
1 | from squeezeDet import SqueezeDet
2 | from squeezeDetPlus import SqueezeDetPlus
3 | from resnet50_convDet import ResNet50ConvDet
4 | from vgg16_convDet import VGG16ConvDet
5 | 
--------------------------------------------------------------------------------
/src/config/__init__.py:
--------------------------------------------------------------------------------
1 | from kitti_model_config import kitti_model_config
2 | from kitti_vgg16_config import kitti_vgg16_config
3 | from kitti_res50_config import kitti_res50_config
4 | from kitti_squeezeDet_config import kitti_squeezeDet_config
5 | from kitti_squeezeDetPlus_config import kitti_squeezeDetPlus_config
6 | 
--------------------------------------------------------------------------------
/data/random_split_train_val.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | image_set_dir = './KITTI/ImageSets'
4 | trainval_file = image_set_dir+'/trainval.txt'
5 | train_file = image_set_dir+'/train.txt'
6 | val_file = image_set_dir+'/val.txt'
7 | 
8 | idx = []
9 | with open(trainval_file) as f:
10 |   for line in f:
11 |     idx.append(line.strip())
12 |   f.close()
13 | 
14 | idx = np.random.permutation(idx)
15 | 
16 | train_idx = sorted(idx[:len(idx)/2])
17 | val_idx = sorted(idx[len(idx)/2:])
18 | 
19 | with open(train_file, 'w') as f:
20 |   for i in train_idx:
21 |     f.write('{}\n'.format(i))
22 |   f.close()
23 | 
24 | with open(val_file, 'w') as f:
25 |   for i in val_idx:
26 |     f.write('{}\n'.format(i))
27 |   f.close()
28 | 
29 | print 'Training set is saved to ' + train_file
30 | print 'Validation set is saved to ' + val_file
31 | 
--------------------------------------------------------------------------------
/src/dataset/kitti-eval/cpp/mail.h:
--------------------------------------------------------------------------------
1 | #ifndef MAIL_H
2 | #define MAIL_H
3 | 
4 | #include <stdio.h>
5 | #include <string>
6 | #include <stdarg.h>
7 | 
8 | class Mail {
9 | 
10 | public:
11 | 
12 |   Mail (std::string email = "") {
13 |     if (email.compare("")) {
14 |       mail = popen("/usr/lib/sendmail -t -f noreply@cvlibs.net","w");
15 |       fprintf(mail,"To: %s\n", email.c_str());
16 |       fprintf(mail,"From: noreply@cvlibs.net\n");
17 |       fprintf(mail,"Subject: KITTI Evaluation Benchmark\n");
18 |       fprintf(mail,"\n\n");
19 |     } else {
20 |       mail = 0;
21 |     }
22 |   }
23 | 
24 |   ~Mail() {
25 |     if (mail) {
26 |       pclose(mail);
27 |     }
28 |   }
29 | 
30 |   void msg (const char *format, ...) {
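    // NOTE (editor): `args` is consumed by vfprintf() and then reused by
    // vprintf() below; strictly, ISO C requires va_end() + va_start() (or a
    // va_copy) before a va_list is traversed a second time.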
31 |     va_list args;
32 |     va_start(args,format);
33 |     if (mail) {
34 |       vfprintf(mail,format,args);
35 |       fprintf(mail,"\n");
36 |     }
37 |     vprintf(format,args);
38 |     printf("\n");
39 |     va_end(args);
40 |   }
41 | 
42 | private:
43 | 
44 |   FILE *mail;
45 | 
46 | };
47 | 
48 | #endif
49 | 
--------------------------------------------------------------------------------
/src/utils/caffemodel2pkl.py:
--------------------------------------------------------------------------------
1 | # Merged from https://raw.githubusercontent.com/bgshih/vgg16.tf/master/src/dump_caffemodel_weights.py
2 | 
3 | import sys
4 | import os
5 | 
6 | import argparse
7 | import numpy as np
8 | import joblib
9 | 
10 | import caffe
11 | 
12 | parser = argparse.ArgumentParser(description='')
13 | parser.add_argument('--caffe_root', help='Caffe root directory.')
14 | parser.add_argument('--prototxt_path', help='Model prototxt path.')
15 | parser.add_argument('--caffemodel_path', help='Caffe model weights file (.caffemodel) path.')
16 | parser.add_argument('--caffe_weights_path', default='/tmp/VGG_ILSVRC_16_layers_weights.pkl',
17 |                     help='VGG16 weights dump path.')
18 | args = parser.parse_args()
19 | 
20 | def dump_caffemodel_weights():
21 |   net = caffe.Net(args.prototxt_path, args.caffemodel_path, caffe.TEST)
22 |   weights = {}
23 |   n_layers = len(net.layers)
24 |   for i in range(n_layers):
25 |     layer_name = net._layer_names[i]
26 |     layer = net.layers[i]
27 |     layer_blobs = [o.data for o in layer.blobs]
28 |     weights[layer_name] = layer_blobs
29 |   joblib.dump(weights, args.caffe_weights_path)
30 | 
31 | 
32 | if __name__ == '__main__':
33 |   dump_caffemodel_weights()
34 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 2-Clause License
2 | 
3 | Copyright (c) 2016, Bichen Wu
4 | All rights reserved.
5 | 
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 | 
9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
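For reference, a hypothetical invocation of `src/utils/caffemodel2pkl.py` (listed further above). The flags match its argparse definition, but the model paths are illustrative, and the script needs a working `caffe` Python installation:

```Shell
python src/utils/caffemodel2pkl.py \
  --prototxt_path=/path/to/VGG_ILSVRC_16_layers_deploy.prototxt \
  --caffemodel_path=/path/to/VGG_ILSVRC_16_layers.caffemodel \
  --caffe_weights_path=/tmp/VGG_ILSVRC_16_layers_weights.pkl
```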
--------------------------------------------------------------------------------
/scripts/eval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export GPUID=0
4 | export NET="squeezeDet"
5 | export EVAL_DIR="/tmp/bichen/logs/SqueezeDet/"
6 | export IMAGE_SET="val"
7 | 
8 | if [ $# -eq 0 ]
9 | then
10 |   echo "Usage: ./scripts/eval.sh [options]"
11 |   echo " "
12 |   echo "options:"
13 |   echo "-h, --help show brief help"
14 |   echo "-net (squeezeDet|squeezeDet+|vgg16|resnet50)"
15 |   echo "-gpu gpu id"
16 |   echo "-eval_dir directory to save logs"
17 |   echo "-image_set (train|val)"
18 |   exit 0
19 | fi
20 | 
21 | while test $# -gt 0; do
22 |   case "$1" in
23 |     -h|--help)
24 |       echo "Usage: ./scripts/eval.sh [options]"
25 |       echo " "
26 |       echo "options:"
27 |       echo "-h, --help show brief help"
28 |       echo "-net (squeezeDet|squeezeDet+|vgg16|resnet50)"
29 |       echo "-gpu gpu id"
30 |       echo "-eval_dir directory to save logs"
31 |       echo "-image_set (train|val)"
32 |       exit 0
33 |       ;;
34 |     -net)
35 |       export NET="$2"
36 |       shift
37 |       shift
38 |       ;;
39 |     -gpu)
40 |       export GPUID="$2"
41 |       shift
42 |       shift
43 |       ;;
44 |     -eval_dir)
45 |       export EVAL_DIR="$2"
46 |       shift
47 |       shift
48 |       ;;
49 |     -image_set)
50 |       export IMAGE_SET="$2"
51 |       shift
52 |       shift
53 |       ;;
54 |     *)
55 |       break
56 |       ;;
57 |   esac
58 | done
59 | 
60 | # =========================================================================== #
61 | # command for squeezeDet:
62 | # =========================================================================== #
63 | python ./src/eval.py \
64 |   --dataset=KITTI \
65 |   --data_path=./data/KITTI \
66 |   --image_set=$IMAGE_SET \
67 |   --eval_dir="$EVAL_DIR/$IMAGE_SET" \
68 |   --checkpoint_path="$EVAL_DIR/train" \
69 |   --net=$NET \
70 |   --gpu=$GPUID
71 | 
--------------------------------------------------------------------------------
/scripts/train.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | export GPUID=0
4 | export NET="squeezeDet"
5 | export TRAIN_DIR="/tmp/bichen/logs/SqueezeDet/"
6 | 
7 | if [ $# -eq 0 ]
8 | then
9 |   echo "Usage: ./scripts/train.sh [options]"
10 |   echo " "
11 |   echo "options:"
12 |   echo "-h, --help show brief help"
13 |   echo "-net (squeezeDet|squeezeDet+|vgg16|resnet50)"
14 |   echo "-gpu gpu id"
15 |   echo "-train_dir directory for training logs"
16 |   exit 0
17 | fi
18 | 
19 | while test $# -gt 0; do
20 |   case "$1" in
21 |     -h|--help)
22 |       echo "Usage: ./scripts/train.sh [options]"
23 |       echo " "
24 |       echo "options:"
25 |       echo "-h, --help show brief help"
26 |       echo "-net (squeezeDet|squeezeDet+|vgg16|resnet50)"
27 |       echo "-gpu gpu id"
28 |       echo "-train_dir directory for training logs"
29 |       exit 0
30 |       ;;
31 |     -net)
32 |       export NET="$2"
33 |       shift
34 |       shift
35 |       ;;
36 |     -gpu)
37 |       export GPUID="$2"
38 |       shift
39 |       shift
40 |       ;;
41 |     -train_dir)
42 |       export TRAIN_DIR="$2"
43 |       shift
44 |       shift
45 |       ;;
46 |     *)
47 |       break
48 |       ;;
49 |   esac
50 | done
51 | 
52 | case "$NET" in
53 |   "squeezeDet")
54 |     export PRETRAINED_MODEL_PATH="./data/SqueezeNet/squeezenet_v1.1.pkl"
55 |     ;;
56 |   "squeezeDet+")
57 |     export PRETRAINED_MODEL_PATH="./data/SqueezeNet/squeezenet_v1.0_SR_0.750.pkl"
58 |     ;;
59 |   "resnet50")
60 |     export PRETRAINED_MODEL_PATH="./data/ResNet/ResNet-50-weights.pkl"
61 |     ;;
62 |   "vgg16")
63 |     export PRETRAINED_MODEL_PATH="./data/VGG16/VGG_ILSVRC_16_layers_weights.pkl"
64 |     ;;
65 |   *)
66 |     echo "net architecture not supported."
67 |     exit 1
68 |     ;;
69 | esac
70 | 
71 | 
72 | python ./src/train.py \
73 |   --dataset=KITTI \
74 |   --pretrained_model_path=$PRETRAINED_MODEL_PATH \
75 |   --data_path=./data/KITTI \
76 |   --image_set=train \
77 |   --train_dir="$TRAIN_DIR/train" \
78 |   --net=$NET \
79 |   --summary_step=100 \
80 |   --checkpoint_step=500 \
81 |   --gpu=$GPUID
82 | 
--------------------------------------------------------------------------------
/src/config/kitti_res50_config.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Model configuration for the KITTI dataset"""
4 | 
5 | import numpy as np
6 | 
7 | from config import base_model_config
8 | 
9 | def kitti_res50_config():
10 |   """Specify the parameters to tune below."""
11 |   mc = base_model_config('KITTI')
12 | 
13 |   mc.IMAGE_WIDTH = 1242
14 |   mc.IMAGE_HEIGHT = 375
15 |   mc.BATCH_SIZE = 20
16 | 
17 |   mc.WEIGHT_DECAY = 0.0001
18 |   mc.LEARNING_RATE = 0.01
19 |   mc.DECAY_STEPS = 10000
20 |   mc.MAX_GRAD_NORM = 1.0
21 |   mc.MOMENTUM = 0.9
22 |   mc.LR_DECAY_FACTOR = 0.5
23 | 
24 |   mc.LOSS_COEF_BBOX = 5.0
25 |   mc.LOSS_COEF_CONF_POS = 75.0
26 |   mc.LOSS_COEF_CONF_NEG = 100.0
27 |   mc.LOSS_COEF_CLASS = 1.0
28 | 
29 |   mc.PLOT_PROB_THRESH = 0.4
30 |   mc.NMS_THRESH = 0.4
31 |   mc.PROB_THRESH = 0.005
32 |   mc.TOP_N_DETECTION = 64
33 | 
34 |   mc.DATA_AUGMENTATION = True
35 |   mc.DRIFT_X = 150
36 |   mc.DRIFT_Y = 100
37 |   mc.EXCLUDE_HARD_EXAMPLES = False
38 | 
39 |   mc.ANCHOR_BOX = set_anchors(mc)
40 |   mc.ANCHORS = len(mc.ANCHOR_BOX)
41 |   mc.ANCHOR_PER_GRID = 9
42 | 
43 |   return mc
44 | 
45 | def set_anchors(mc):
46 |   H, W, B = 24, 78, 9
47 |   anchor_shapes = np.reshape(
48 |       [np.array(
49 |           [[  94.,  49.], [ 225., 161.], [ 170.,  91.],
50 |            [ 390., 181.], [  41.,  32.], [ 128.,  64.],
51 |            [ 298., 164.], [ 232.,  99.], [  65.,  42.]])] * H * W,
52 |       (H, W, B, 2)
53 |   )
54 |   center_x = np.reshape(
55 |       np.transpose(
56 |           np.reshape(
57 |               np.array([np.arange(1, W+1)*float(mc.IMAGE_WIDTH)/(W+1)]*H*B),
58 |               (B, H, W)
59 |           ),
60 |           (1, 2, 0)
61 |       ),
62 |       (H, W, B, 1)
63 |   )
64 |   center_y = np.reshape(
65 |       np.transpose(
66 |           np.reshape(
67 |               np.array([np.arange(1, H+1)*float(mc.IMAGE_HEIGHT)/(H+1)]*W*B),
68 |               (B, W, H)
69 |           ),
70 |           (2, 1, 0)
71 |       ),
72 |       (H, W, B, 1)
73 |   )
74 |   anchors = np.reshape(
75 |       np.concatenate((center_x, center_y, anchor_shapes), axis=3),
76 |       (-1, 4)
77 |   )
78 | 
79 |   return anchors
80 | 
--------------------------------------------------------------------------------
/src/config/kitti_vgg16_config.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Model configuration for the KITTI dataset"""
4 | 
5 | import numpy as np
6 | 
7 | from config import base_model_config
8 | 
9 | def kitti_vgg16_config():
10 |   """Specify the parameters to tune below."""
11 |   mc = base_model_config('KITTI')
12 | 
13 |   mc.IMAGE_WIDTH = 1242
14 |   mc.IMAGE_HEIGHT = 375
15 |   mc.BATCH_SIZE = 5
16 | 
17 |   mc.WEIGHT_DECAY = 0.0001
18 |   mc.LEARNING_RATE = 0.01
19 |   mc.DECAY_STEPS = 10000
20 |   mc.MAX_GRAD_NORM = 1.0
21 |   mc.MOMENTUM = 0.9
22 |   mc.LR_DECAY_FACTOR = 0.5
23 | 
24 |   mc.LOSS_COEF_BBOX = 5.0
25 |   mc.LOSS_COEF_CONF_POS = 75.0
26 |   mc.LOSS_COEF_CONF_NEG = 100.0
27 |   mc.LOSS_COEF_CLASS = 1.0
28 | 
29 |   mc.PLOT_PROB_THRESH = 0.4
30 |   mc.NMS_THRESH = 0.4
31 |   mc.PROB_THRESH = 0.005
32 |   mc.TOP_N_DETECTION = 64
33 | 
34 |   mc.DATA_AUGMENTATION = True
35 |   mc.DRIFT_X = 150
36 |   mc.DRIFT_Y = 100
37 |   mc.EXCLUDE_HARD_EXAMPLES = False
38 | 
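  # NOTE (editor): set_anchors() below tiles B=9 prior shapes over a W=78 by
  # H=24 feature-map grid, producing an (H*W*B, 4) array of [cx, cy, w, h]
  # anchors in image coordinates, with centers spaced IMAGE_WIDTH/(W+1)
  # pixels apart horizontally and IMAGE_HEIGHT/(H+1) vertically.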
39 |   mc.ANCHOR_BOX = set_anchors(mc)
40 |   mc.ANCHORS = len(mc.ANCHOR_BOX)
41 |   mc.ANCHOR_PER_GRID = 9
42 | 
43 |   return mc
44 | 
45 | def set_anchors(mc):
46 |   H, W, B = 24, 78, 9
47 |   anchor_shapes = np.reshape(
48 |       [np.array(
49 |           [[  36.,  37.], [ 366., 174.], [ 115.,  59.],
50 |            [ 162.,  87.], [  38.,  90.], [ 258., 173.],
51 |            [ 224., 108.], [  78., 170.], [  72.,  43.]])] * H * W,
52 |       (H, W, B, 2)
53 |   )
54 |   center_x = np.reshape(
55 |       np.transpose(
56 |           np.reshape(
57 |               np.array([np.arange(1, W+1)*float(mc.IMAGE_WIDTH)/(W+1)]*H*B),
58 |               (B, H, W)
59 |           ),
60 |           (1, 2, 0)
61 |       ),
62 |       (H, W, B, 1)
63 |   )
64 |   center_y = np.reshape(
65 |       np.transpose(
66 |           np.reshape(
67 |               np.array([np.arange(1, H+1)*float(mc.IMAGE_HEIGHT)/(H+1)]*W*B),
68 |               (B, W, H)
69 |           ),
70 |           (2, 1, 0)
71 |       ),
72 |       (H, W, B, 1)
73 |   )
74 |   anchors = np.reshape(
75 |       np.concatenate((center_x, center_y, anchor_shapes), axis=3),
76 |       (-1, 4)
77 |   )
78 | 
79 |   return anchors
80 | 
--------------------------------------------------------------------------------
/src/config/kitti_squeezeDet_config.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Model configuration for the KITTI dataset"""
4 | 
5 | import numpy as np
6 | 
7 | from config import base_model_config
8 | 
9 | def kitti_squeezeDet_config():
10 |   """Specify the parameters to tune below."""
11 |   mc = base_model_config('KITTI')
12 | 
13 |   mc.IMAGE_WIDTH = 1248
14 |   mc.IMAGE_HEIGHT = 384
15 |   mc.BATCH_SIZE = 20
16 | 
17 |   mc.WEIGHT_DECAY = 0.0001
18 |   mc.LEARNING_RATE = 0.01
19 |   mc.DECAY_STEPS = 10000
20 |   mc.MAX_GRAD_NORM = 1.0
21 |   mc.MOMENTUM = 0.9
22 |   mc.LR_DECAY_FACTOR = 0.5
23 | 
24 |   mc.LOSS_COEF_BBOX = 5.0
25 |   mc.LOSS_COEF_CONF_POS = 75.0
26 |   mc.LOSS_COEF_CONF_NEG = 100.0
27 |   mc.LOSS_COEF_CLASS = 1.0
28 | 
29 |   mc.PLOT_PROB_THRESH = 0.4
30 |   mc.NMS_THRESH = 0.4
31 |   mc.PROB_THRESH = 0.005
32 |   mc.TOP_N_DETECTION = 64
33 | 
34 |   mc.DATA_AUGMENTATION = True
35 |   mc.DRIFT_X = 150
36 |   mc.DRIFT_Y = 100
37 |   mc.EXCLUDE_HARD_EXAMPLES = False
38 | 
39 |   mc.ANCHOR_BOX = set_anchors(mc)
40 |   mc.ANCHORS = len(mc.ANCHOR_BOX)
41 |   mc.ANCHOR_PER_GRID = 9
42 | 
43 |   return mc
44 | 
45 | def set_anchors(mc):
46 |   H, W, B = 24, 78, 9
47 |   anchor_shapes = np.reshape(
48 |       [np.array(
49 |           [[  36.,  37.], [ 366., 174.], [ 115.,  59.],
50 |            [ 162.,  87.], [  38.,  90.], [ 258., 173.],
51 |            [ 224., 108.], [  78., 170.], [  72.,  43.]])] * H * W,
52 |       (H, W, B, 2)
53 |   )
54 |   center_x = np.reshape(
55 |       np.transpose(
56 |           np.reshape(
57 |               np.array([np.arange(1, W+1)*float(mc.IMAGE_WIDTH)/(W+1)]*H*B),
58 |               (B, H, W)
59 |           ),
60 |           (1, 2, 0)
61 |       ),
62 |       (H, W, B, 1)
63 |   )
64 |   center_y = np.reshape(
65 |       np.transpose(
66 |           np.reshape(
67 |               np.array([np.arange(1, H+1)*float(mc.IMAGE_HEIGHT)/(H+1)]*W*B),
68 |               (B, W, H)
69 |           ),
70 |           (2, 1, 0)
71 |       ),
72 |       (H, W, B, 1)
73 |   )
74 |   anchors = np.reshape(
75 |       np.concatenate((center_x, center_y, anchor_shapes), axis=3),
76 |       (-1, 4)
77 |   )
78 | 
79 |   return anchors
80 | 
--------------------------------------------------------------------------------
/src/config/kitti_squeezeDetPlus_config.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Model configuration for the KITTI dataset"""
4 | 
5 | import numpy as np
6 | 
7 | from config import base_model_config
8 | 
9 | def kitti_squeezeDetPlus_config():
10 |   """Specify the parameters to tune below."""
11 |   mc = base_model_config('KITTI')
12 | 
13 |   mc.IMAGE_WIDTH = 1242
14 |   mc.IMAGE_HEIGHT = 375
15 |   mc.BATCH_SIZE = 20
16 | 
17 |   mc.WEIGHT_DECAY = 0.0001
18 |   mc.LEARNING_RATE = 0.01
19 |   mc.DECAY_STEPS = 10000
20 |   mc.MAX_GRAD_NORM = 1.0
21 |   mc.MOMENTUM = 0.9
22 |   mc.LR_DECAY_FACTOR = 0.5
23 | 
24 |   mc.LOSS_COEF_BBOX = 5.0
25 |   mc.LOSS_COEF_CONF_POS = 75.0
26 |   mc.LOSS_COEF_CONF_NEG = 100.0
27 |   mc.LOSS_COEF_CLASS = 1.0
28 | 
29 |   mc.PLOT_PROB_THRESH = 0.4
30 |   mc.NMS_THRESH = 0.4
31 |   mc.PROB_THRESH = 0.005
32 |   mc.TOP_N_DETECTION = 64
33 | 
34 |   mc.DATA_AUGMENTATION = True
35 |   mc.DRIFT_X = 150
36 |   mc.DRIFT_Y = 100
37 |   mc.EXCLUDE_HARD_EXAMPLES = False
38 | 
39 |   mc.ANCHOR_BOX = set_anchors(mc)
40 |   mc.ANCHORS = len(mc.ANCHOR_BOX)
41 |   mc.ANCHOR_PER_GRID = 9
42 | 
43 |   return mc
44 | 
45 | def set_anchors(mc):
46 |   H, W, B = 22, 76, 9
47 |   anchor_shapes = np.reshape(
48 |       [np.array(
49 |           [[  36.,  37.], [ 366., 174.], [ 115.,  59.],
50 |            [ 162.,  87.], [  38.,  90.], [ 258., 173.],
51 |            [ 224., 108.], [  78., 170.], [  72.,  43.]])] * H * W,
52 |       (H, W, B, 2)
53 |   )
54 |   center_x = np.reshape(
55 |       np.transpose(
56 |           np.reshape(
57 |               np.array([np.arange(1, W+1)*float(mc.IMAGE_WIDTH)/(W+1)]*H*B),
58 |               (B, H, W)
59 |           ),
60 |           (1, 2, 0)
61 |       ),
62 |       (H, W, B, 1)
63 |   )
64 |   center_y = np.reshape(
65 |       np.transpose(
66 |           np.reshape(
67 |               np.array([np.arange(1, H+1)*float(mc.IMAGE_HEIGHT)/(H+1)]*W*B),
68 |               (B, W, H)
69 |           ),
70 |           (2, 1, 0)
71 |       ),
72 |       (H, W, B, 1)
73 |   )
74 |   anchors = np.reshape(
75 |       np.concatenate((center_x, center_y, anchor_shapes), axis=3),
76 |       (-1, 4)
77 |   )
78 | 
79 |   return anchors
80 | 
--------------------------------------------------------------------------------
/src/config/kitti_model_config.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Model configuration for the KITTI dataset"""
4 | 
5 | import numpy as np
6 | 
7 | from config import base_model_config
8 | 
9 | def kitti_model_config():
10 |   """Specify the parameters to tune below."""
11 |   mc = base_model_config('KITTI')
12 |   # mc.IMAGE_WIDTH = 1864 # half width 621
13 |   # mc.IMAGE_HEIGHT = 562 # half height 187
14 |   mc.IMAGE_WIDTH = 1248 # half width 621
15 |   mc.IMAGE_HEIGHT = 384 # half height 187
16 |   # mc.IMAGE_WIDTH = 621
17 |   # mc.IMAGE_HEIGHT = 187
18 | 
19 |   mc.WEIGHT_DECAY = 0.0001
20 |   mc.PROB_THRESH = 0.005
21 |   mc.TOP_N_DETECTION = 64
22 |   mc.PLOT_PROB_THRESH = 0.4
23 |   mc.NMS_THRESH = 0.4
24 |   mc.LEARNING_RATE = 0.01
25 |   mc.MOMENTUM = 0.9
26 |   mc.DECAY_STEPS = 10000
27 |   mc.LR_DECAY_FACTOR = 0.5
28 |   mc.BATCH_SIZE = 20
29 |   mc.LOSS_COEF_BBOX = 5.0
30 |   mc.LOSS_COEF_CONF_POS = 75.0
31 |   mc.LOSS_COEF_CONF_NEG = 100.0
32 |   mc.LOSS_COEF_CLASS = 1.0
33 |   mc.MAX_GRAD_NORM = 1.0
34 |   mc.DATA_AUGMENTATION = True
35 |   mc.DRIFT_X = 150
36 |   mc.DRIFT_Y = 100
37 |   mc.ANCHOR_BOX = set_anchors(mc)
38 |   mc.ANCHORS = len(mc.ANCHOR_BOX)
39 |   mc.ANCHOR_PER_GRID = 9
40 |   mc.USE_DECONV = False
41 |   mc.EXCLUDE_HARD_EXAMPLES = False
42 | 
43 |   return mc
44 | 
45 | def set_anchors(mc):
46 |   H, W, B = 24, 78, 9
47 |   anchor_shapes = np.reshape(
48 |       [np.array(
49 |           [[  36.,  37.], [ 366., 174.], [ 115.,  59.],
50 |            [ 162.,  87.], [  38.,  90.], [ 258., 173.],
51 |            [ 224., 108.], [  78., 170.], [  72.,  43.]])] * H * W,
52 |       (H, W, B, 2)
53 |   )
54 |   center_x = np.reshape(
55 |       np.transpose(
56 |           np.reshape(
57 |               np.array([np.arange(1, W+1)*float(mc.IMAGE_WIDTH)/(W+1)]*H*B),
58 |               (B, H, W)
59 |           ),
60 |           (1, 2, 0)
61 |       ),
62 |       (H, W, B, 1)
63 |   )
64 |   center_y
= np.reshape( 65 | np.transpose( 66 | np.reshape( 67 | np.array([np.arange(1, H+1)*float(mc.IMAGE_HEIGHT)/(H+1)]*W*B), 68 | (B, W, H) 69 | ), 70 | (2, 1, 0) 71 | ), 72 | (H, W, B, 1) 73 | ) 74 | anchors = np.reshape( 75 | np.concatenate((center_x, center_y, anchor_shapes), axis=3), 76 | (-1, 4) 77 | ) 78 | 79 | return anchors 80 | -------------------------------------------------------------------------------- /src/nets/vgg16_convDet.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """VGG16+ConvDet model.""" 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import os 10 | import sys 11 | 12 | import joblib 13 | from utils import util 14 | from easydict import EasyDict as edict 15 | import numpy as np 16 | import tensorflow as tf 17 | from nn_skeleton import ModelSkeleton 18 | 19 | 20 | class VGG16ConvDet(ModelSkeleton): 21 | def __init__(self, mc, gpu_id=0): 22 | with tf.device('/gpu:{}'.format(gpu_id)): 23 | ModelSkeleton.__init__(self, mc) 24 | 25 | self._add_forward_graph() 26 | self._add_interpretation_graph() 27 | self._add_loss_graph() 28 | self._add_train_graph() 29 | self._add_viz_graph() 30 | 31 | def _add_forward_graph(self): 32 | """Build the VGG-16 model.""" 33 | 34 | mc = self.mc 35 | if mc.LOAD_PRETRAINED_MODEL: 36 | assert tf.gfile.Exists(mc.PRETRAINED_MODEL_PATH), \ 37 | 'Cannot find pretrained model at the given path:' \ 38 | ' {}'.format(mc.PRETRAINED_MODEL_PATH) 39 | self.caffemodel_weight = joblib.load(mc.PRETRAINED_MODEL_PATH) 40 | 41 | with tf.variable_scope('conv1') as scope: 42 | conv1_1 = self._conv_layer( 43 | 'conv1_1', self.image_input, filters=64, size=3, stride=1, freeze=True) 44 | conv1_2 = self._conv_layer( 45 | 'conv1_2', conv1_1, filters=64, size=3, stride=1, freeze=True) 46 | pool1 = self._pooling_layer( 47 | 'pool1', conv1_2, size=2, stride=2) 48 | 49 | with tf.variable_scope('conv2') as scope: 50 | conv2_1 = self._conv_layer( 51 | 'conv2_1', pool1, filters=128, size=3, stride=1, freeze=True) 52 | conv2_2 = self._conv_layer( 53 | 'conv2_2', conv2_1, filters=128, size=3, stride=1, freeze=True) 54 | pool2 = self._pooling_layer( 55 | 'pool2', conv2_2, size=2, stride=2) 56 | 57 | with tf.variable_scope('conv3') as scope: 58 | conv3_1 = self._conv_layer( 59 | 'conv3_1', pool2, filters=256, size=3, stride=1) 60 | conv3_2 = self._conv_layer( 61 | 'conv3_2', conv3_1, filters=256, size=3, stride=1) 62 | conv3_3 = self._conv_layer( 63 | 'conv3_3', conv3_2, filters=256, size=3, stride=1) 64 | pool3 = self._pooling_layer( 65 | 'pool3', conv3_3, size=2, stride=2) 66 | 67 | with tf.variable_scope('conv4') as scope: 68 | conv4_1 = self._conv_layer( 69 | 'conv4_1', pool3, filters=512, size=3, stride=1) 70 | conv4_2 = self._conv_layer( 71 | 'conv4_2', conv4_1, filters=512, size=3, stride=1) 72 | conv4_3 = self._conv_layer( 73 | 'conv4_3', conv4_2, filters=512, size=3, stride=1) 74 | pool4 = self._pooling_layer( 75 | 'pool4', conv4_3, size=2, stride=2) 76 | 77 | with tf.variable_scope('conv5') as scope: 78 | conv5_1 = self._conv_layer( 79 | 'conv5_1', pool4, filters=512, size=3, stride=1) 80 | conv5_2 = self._conv_layer( 81 | 'conv5_2', conv5_1, filters=512, size=3, stride=1) 82 | conv5_3 = self._conv_layer( 83 | 'conv5_3', conv5_2, filters=512, size=3, stride=1) 84 | 85 | dropout5 = tf.nn.dropout(conv5_3, self.keep_prob, name='drop6') 86 | 87 | num_output = mc.ANCHOR_PER_GRID * (mc.CLASSES 
+ 1 + 4) 88 | self.preds = self._conv_layer( 89 | 'conv6', dropout5, filters=num_output, size=3, stride=1, 90 | padding='SAME', xavier=False, relu=False, stddev=0.0001) 91 | -------------------------------------------------------------------------------- /src/nets/squeezeDet.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """SqueezeDet model.""" 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import os 10 | import sys 11 | 12 | import joblib 13 | from utils import util 14 | from easydict import EasyDict as edict 15 | import numpy as np 16 | import tensorflow as tf 17 | from nn_skeleton import ModelSkeleton 18 | 19 | class SqueezeDet(ModelSkeleton): 20 | def __init__(self, mc, gpu_id=0): 21 | with tf.device('/gpu:{}'.format(gpu_id)): 22 | ModelSkeleton.__init__(self, mc) 23 | 24 | self._add_forward_graph() 25 | self._add_interpretation_graph() 26 | self._add_loss_graph() 27 | self._add_train_graph() 28 | self._add_viz_graph() 29 | 30 | def _add_forward_graph(self): 31 | """NN architecture.""" 32 | 33 | mc = self.mc 34 | if mc.LOAD_PRETRAINED_MODEL: 35 | assert tf.gfile.Exists(mc.PRETRAINED_MODEL_PATH), \ 36 | 'Cannot find pretrained model at the given path:' \ 37 | ' {}'.format(mc.PRETRAINED_MODEL_PATH) 38 | self.caffemodel_weight = joblib.load(mc.PRETRAINED_MODEL_PATH) 39 | 40 | conv1 = self._conv_layer( 41 | 'conv1', self.image_input, filters=64, size=3, stride=2, 42 | padding='SAME', freeze=True) 43 | pool1 = self._pooling_layer( 44 | 'pool1', conv1, size=3, stride=2, padding='SAME') 45 | 46 | fire2 = self._fire_layer( 47 | 'fire2', pool1, s1x1=16, e1x1=64, e3x3=64, freeze=False) 48 | fire3 = self._fire_layer( 49 | 'fire3', fire2, s1x1=16, e1x1=64, e3x3=64, freeze=False) 50 | pool3 = self._pooling_layer( 51 | 'pool3', fire3, size=3, stride=2, padding='SAME') 52 | 53 | fire4 = self._fire_layer( 54 | 'fire4', pool3, s1x1=32, e1x1=128, e3x3=128, freeze=False) 55 | fire5 = self._fire_layer( 56 | 'fire5', fire4, s1x1=32, e1x1=128, e3x3=128, freeze=False) 57 | pool5 = self._pooling_layer( 58 | 'pool5', fire5, size=3, stride=2, padding='SAME') 59 | 60 | fire6 = self._fire_layer( 61 | 'fire6', pool5, s1x1=48, e1x1=192, e3x3=192, freeze=False) 62 | fire7 = self._fire_layer( 63 | 'fire7', fire6, s1x1=48, e1x1=192, e3x3=192, freeze=False) 64 | fire8 = self._fire_layer( 65 | 'fire8', fire7, s1x1=64, e1x1=256, e3x3=256, freeze=False) 66 | fire9 = self._fire_layer( 67 | 'fire9', fire8, s1x1=64, e1x1=256, e3x3=256, freeze=False) 68 | 69 | # Two extra fire modules that are not trained before 70 | fire10 = self._fire_layer( 71 | 'fire10', fire9, s1x1=96, e1x1=384, e3x3=384, freeze=False) 72 | fire11 = self._fire_layer( 73 | 'fire11', fire10, s1x1=96, e1x1=384, e3x3=384, freeze=False) 74 | dropout11 = tf.nn.dropout(fire11, self.keep_prob, name='drop11') 75 | 76 | num_output = mc.ANCHOR_PER_GRID * (mc.CLASSES + 1 + 4) 77 | self.preds = self._conv_layer( 78 | 'conv12', dropout11, filters=num_output, size=3, stride=1, 79 | padding='SAME', xavier=False, relu=False, stddev=0.0001) 80 | 81 | def _fire_layer(self, layer_name, inputs, s1x1, e1x1, e3x3, stddev=0.01, 82 | freeze=False): 83 | """Fire layer constructor. 84 | 85 | Args: 86 | layer_name: layer name 87 | inputs: input tensor 88 | s1x1: number of 1x1 filters in squeeze layer. 89 | e1x1: number of 1x1 filters in expand layer. 
90 | e3x3: number of 3x3 filters in expand layer. 91 | freeze: if true, do not train parameters in this layer. 92 | Returns: 93 | fire layer operation. 94 | """ 95 | 96 | sq1x1 = self._conv_layer( 97 | layer_name+'/squeeze1x1', inputs, filters=s1x1, size=1, stride=1, 98 | padding='SAME', stddev=stddev, freeze=freeze) 99 | ex1x1 = self._conv_layer( 100 | layer_name+'/expand1x1', sq1x1, filters=e1x1, size=1, stride=1, 101 | padding='SAME', stddev=stddev, freeze=freeze) 102 | ex3x3 = self._conv_layer( 103 | layer_name+'/expand3x3', sq1x1, filters=e3x3, size=3, stride=1, 104 | padding='SAME', stddev=stddev, freeze=freeze) 105 | 106 | return tf.concat([ex1x1, ex3x3], 3, name=layer_name+'/concat') 107 | -------------------------------------------------------------------------------- /src/nets/squeezeDetPlus.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """SqueezeDet+ model.""" 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import os 10 | import sys 11 | 12 | import joblib 13 | from utils import util 14 | from easydict import EasyDict as edict 15 | import numpy as np 16 | import tensorflow as tf 17 | from nn_skeleton import ModelSkeleton 18 | 19 | class SqueezeDetPlus(ModelSkeleton): 20 | def __init__(self, mc, gpu_id=0): 21 | with tf.device('/gpu:{}'.format(gpu_id)): 22 | ModelSkeleton.__init__(self, mc) 23 | 24 | self._add_forward_graph() 25 | self._add_interpretation_graph() 26 | self._add_loss_graph() 27 | self._add_train_graph() 28 | self._add_viz_graph() 29 | 30 | def _add_forward_graph(self): 31 | """NN architecture.""" 32 | 33 | mc = self.mc 34 | if mc.LOAD_PRETRAINED_MODEL: 35 | assert tf.gfile.Exists(mc.PRETRAINED_MODEL_PATH), \ 36 | 'Cannot find pretrained model at the given path:' \ 37 | ' {}'.format(mc.PRETRAINED_MODEL_PATH) 38 | self.caffemodel_weight = joblib.load(mc.PRETRAINED_MODEL_PATH) 39 | 40 | conv1 = self._conv_layer( 41 | 'conv1', self.image_input, filters=96, size=7, stride=2, 42 | padding='VALID', freeze=True) 43 | pool1 = self._pooling_layer( 44 | 'pool1', conv1, size=3, stride=2, padding='VALID') 45 | 46 | fire2 = self._fire_layer( 47 | 'fire2', pool1, s1x1=96, e1x1=64, e3x3=64, freeze=False) 48 | fire3 = self._fire_layer( 49 | 'fire3', fire2, s1x1=96, e1x1=64, e3x3=64, freeze=False) 50 | fire4 = self._fire_layer( 51 | 'fire4', fire3, s1x1=192, e1x1=128, e3x3=128, freeze=False) 52 | pool4 = self._pooling_layer( 53 | 'pool4', fire4, size=3, stride=2, padding='VALID') 54 | 55 | fire5 = self._fire_layer( 56 | 'fire5', pool4, s1x1=192, e1x1=128, e3x3=128, freeze=False) 57 | fire6 = self._fire_layer( 58 | 'fire6', fire5, s1x1=288, e1x1=192, e3x3=192, freeze=False) 59 | fire7 = self._fire_layer( 60 | 'fire7', fire6, s1x1=288, e1x1=192, e3x3=192, freeze=False) 61 | fire8 = self._fire_layer( 62 | 'fire8', fire7, s1x1=384, e1x1=256, e3x3=256, freeze=False) 63 | pool8 = self._pooling_layer( 64 | 'pool8', fire8, size=3, stride=2, padding='VALID') 65 | 66 | fire9 = self._fire_layer( 67 | 'fire9', pool8, s1x1=384, e1x1=256, e3x3=256, freeze=False) 68 | 69 | # Two extra fire modules that are not trained before 70 | fire10 = self._fire_layer( 71 | 'fire10', fire9, s1x1=384, e1x1=256, e3x3=256, freeze=False) 72 | fire11 = self._fire_layer( 73 | 'fire11', fire10, s1x1=384, e1x1=256, e3x3=256, freeze=False) 74 | dropout11 = tf.nn.dropout(fire11, self.keep_prob, name='drop11') 75 | 76 | num_output = 
mc.ANCHOR_PER_GRID * (mc.CLASSES + 1 + 4) 77 | self.preds = self._conv_layer( 78 | 'conv12', dropout11, filters=num_output, size=3, stride=1, 79 | padding='SAME', xavier=False, relu=False, stddev=0.0001) 80 | 81 | def _fire_layer(self, layer_name, inputs, s1x1, e1x1, e3x3, stddev=0.01, 82 | freeze=False): 83 | """Fire layer constructor. 84 | 85 | Args: 86 | layer_name: layer name 87 | inputs: input tensor 88 | s1x1: number of 1x1 filters in squeeze layer. 89 | e1x1: number of 1x1 filters in expand layer. 90 | e3x3: number of 3x3 filters in expand layer. 91 | freeze: if true, do not train parameters in this layer. 92 | Returns: 93 | fire layer operation. 94 | """ 95 | 96 | sq1x1 = self._conv_layer( 97 | layer_name+'/squeeze1x1', inputs, filters=s1x1, size=1, stride=1, 98 | padding='SAME', stddev=stddev, freeze=freeze) 99 | ex1x1 = self._conv_layer( 100 | layer_name+'/expand1x1', sq1x1, filters=e1x1, size=1, stride=1, 101 | padding='SAME', stddev=stddev, freeze=freeze) 102 | ex3x3 = self._conv_layer( 103 | layer_name+'/expand3x3', sq1x1, filters=e3x3, size=3, stride=1, 104 | padding='SAME', stddev=stddev, freeze=freeze) 105 | 106 | return tf.concat([ex1x1, ex3x3], 3, name=layer_name+'/concat') 107 | -------------------------------------------------------------------------------- /src/config/config.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """Base Model configurations""" 4 | 5 | import os 6 | import os.path as osp 7 | import numpy as np 8 | from easydict import EasyDict as edict 9 | 10 | def base_model_config(dataset='PASCAL_VOC'): 11 | assert dataset.upper()=='PASCAL_VOC' or dataset.upper()=='KITTI', \ 12 | 'Currently only support PASCAL_VOC or KITTI dataset' 13 | 14 | cfg = edict() 15 | 16 | # Dataset used to train/val/test model. Now support PASCAL_VOC or KITTI 17 | cfg.DATASET = dataset.upper() 18 | 19 | if cfg.DATASET == 'PASCAL_VOC': 20 | # object categories to classify 21 | cfg.CLASS_NAMES = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 22 | 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 23 | 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 24 | 'sofa', 'train', 'tvmonitor') 25 | elif cfg.DATASET == 'KITTI': 26 | cfg.CLASS_NAMES = ('car', 'pedestrian', 'cyclist') 27 | 28 | # number of categories to classify 29 | cfg.CLASSES = len(cfg.CLASS_NAMES) 30 | 31 | # ROI pooling output width 32 | cfg.GRID_POOL_WIDTH = 7 33 | 34 | # ROI pooling output height 35 | cfg.GRID_POOL_HEIGHT = 7 36 | 37 | # parameter used in leaky ReLU 38 | cfg.LEAKY_COEF = 0.1 39 | 40 | # Probability to keep a node in dropout 41 | cfg.KEEP_PROB = 0.5 42 | 43 | # image width 44 | cfg.IMAGE_WIDTH = 224 45 | 46 | # image height 47 | cfg.IMAGE_HEIGHT = 224 48 | 49 | # anchor box, array of [cx, cy, w, h]. To be defined later 50 | cfg.ANCHOR_BOX = [] 51 | 52 | # number of anchor boxes 53 | cfg.ANCHORS = len(cfg.ANCHOR_BOX) 54 | 55 | # number of anchor boxes per grid 56 | cfg.ANCHOR_PER_GRID = -1 57 | 58 | # batch size 59 | cfg.BATCH_SIZE = 20 60 | 61 | # Only keep boxes with probability higher than this threshold 62 | cfg.PROB_THRESH = 0.005 63 | 64 | # Only plot boxes with probability higher than this threshold 65 | cfg.PLOT_PROB_THRESH = 0.5 66 | 67 | # Bounding boxes with IOU larger than this are going to be removed 68 | cfg.NMS_THRESH = 0.2 69 | 70 | # Pixel mean values (BGR order) as a (1, 1, 3) array. 
Below is the BGR mean
71 |   # of VGG16
72 |   cfg.BGR_MEANS = np.array([[[103.939, 116.779, 123.68]]])
73 | 
74 |   # loss coefficient for confidence regression
75 |   cfg.LOSS_COEF_CONF = 1.0
76 | 
77 |   # loss coefficient for classification regression
78 |   cfg.LOSS_COEF_CLASS = 1.0
79 | 
80 |   # loss coefficient for bounding box regression
81 |   cfg.LOSS_COEF_BBOX = 10.0
82 | 
83 |   # reduce step size after this many steps
84 |   cfg.DECAY_STEPS = 10000
85 | 
86 |   # multiply the learning rate by this factor
87 |   cfg.LR_DECAY_FACTOR = 0.1
88 | 
89 |   # learning rate
90 |   cfg.LEARNING_RATE = 0.005
91 | 
92 |   # momentum
93 |   cfg.MOMENTUM = 0.9
94 | 
95 |   # weight decay
96 |   cfg.WEIGHT_DECAY = 0.0005
97 | 
98 |   # whether to load a pre-trained model
99 |   cfg.LOAD_PRETRAINED_MODEL = True
100 | 
101 |   # path to load the pre-trained model
102 |   cfg.PRETRAINED_MODEL_PATH = ''
103 | 
104 |   # print log to console in debug mode
105 |   cfg.DEBUG_MODE = False
106 | 
107 |   # a small value used to prevent numerical instability
108 |   cfg.EPSILON = 1e-16
109 | 
110 |   # threshold for safe exponential operation
111 |   cfg.EXP_THRESH=1.0
112 | 
113 |   # gradients with norm larger than this are going to be clipped.
114 |   cfg.MAX_GRAD_NORM = 10.0
115 | 
116 |   # Whether to do data augmentation
117 |   cfg.DATA_AUGMENTATION = False
118 | 
119 |   # The range to randomly shift the image width
120 |   cfg.DRIFT_X = 0
121 | 
122 |   # The range to randomly shift the image height
123 |   cfg.DRIFT_Y = 0
124 | 
125 |   # Whether to exclude images harder than hard-category. Only useful for the
126 |   # KITTI dataset.
127 |   cfg.EXCLUDE_HARD_EXAMPLES = True
128 | 
129 |   # small value used in batch normalization to prevent dividing by 0. The
130 |   # default value here is the same as caffe's default value.
131 |   cfg.BATCH_NORM_EPSILON = 1e-5
132 | 
133 |   # number of threads to fetch data
134 |   cfg.NUM_THREAD = 4
135 | 
136 |   # capacity for FIFOQueue
137 |   cfg.QUEUE_CAPACITY = 100
138 | 
139 |   # indicate if the model is in training mode
140 |   cfg.IS_TRAINING = False
141 | 
142 |   return cfg
143 | 
--------------------------------------------------------------------------------
/src/dataset/pascal_voc.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Image database class for pascal voc"""
4 | 
5 | import cv2
6 | import os
7 | import numpy as np
8 | import xml.etree.ElementTree as ET
9 | 
10 | from utils.util import bbox_transform_inv
11 | from dataset.imdb import imdb
12 | from dataset.voc_eval import voc_eval
13 | 
14 | class pascal_voc(imdb):
15 |   def __init__(self, image_set, year, data_path, mc):
16 |     imdb.__init__(self, 'voc_'+year+'_'+image_set, mc)
17 |     self._year = year
18 |     self._image_set = image_set
19 |     self._data_root_path = data_path
20 |     self._data_path = os.path.join(self._data_root_path, 'VOC' + self._year)
21 |     self._classes = self.mc.CLASS_NAMES
22 |     self._class_to_idx = dict(zip(self.classes, xrange(self.num_classes)))
23 | 
24 |     # a list of string indices of images in the directory
25 |     self._image_idx = self._load_image_set_idx()
26 |     # a dict of image_idx -> [[cx, cy, w, h, cls_idx]].
x,y,w,h are not divided by 27 | # the image width and height 28 | self._rois = self._load_pascal_annotation() 29 | 30 | ## batch reader ## 31 | self._perm_idx = None 32 | self._cur_idx = 0 33 | # TODO(bichen): add a random seed as parameter 34 | self._shuffle_image_idx() 35 | 36 | def _load_image_set_idx(self): 37 | image_set_file = os.path.join(self._data_path, 'ImageSets', 'Main', 38 | self._image_set+'.txt') 39 | assert os.path.exists(image_set_file), \ 40 | 'File does not exist: {}'.format(image_set_file) 41 | 42 | with open(image_set_file) as f: 43 | image_idx = [x.strip() for x in f.readlines()] 44 | return image_idx 45 | 46 | def _image_path_at(self, idx): 47 | image_path = os.path.join(self._data_path, 'JPEGImages', idx+'.jpg') 48 | assert os.path.exists(image_path), \ 49 | 'Image does not exist: {}'.format(image_path) 50 | return image_path 51 | 52 | def _load_pascal_annotation(self): 53 | idx2annotation = {} 54 | for index in self._image_idx: 55 | filename = os.path.join(self._data_path, 'Annotations', index+'.xml') 56 | tree = ET.parse(filename) 57 | objs = tree.findall('object') 58 | objs = [obj for obj in objs if int(obj.find('difficult').text) == 0] 59 | bboxes = [] 60 | for obj in objs: 61 | bbox = obj.find('bndbox') 62 | # Make pixel indexes 0-based 63 | xmin = float(bbox.find('xmin').text) - 1 64 | xmax = float(bbox.find('xmax').text) - 1 65 | ymin = float(bbox.find('ymin').text) - 1 66 | ymax = float(bbox.find('ymax').text) - 1 67 | assert xmin >= 0.0 and xmin <= xmax, \ 68 | 'Invalid bounding box x-coord xmin {} or xmax {} at {}.xml' \ 69 | .format(xmin, xmax, index) 70 | assert ymin >= 0.0 and ymin <= ymax, \ 71 | 'Invalid bounding box y-coord ymin {} or ymax {} at {}.xml' \ 72 | .format(ymin, ymax, index) 73 | x, y, w, h = bbox_transform_inv([xmin, ymin, xmax, ymax]) 74 | cls = self._class_to_idx[obj.find('name').text.lower().strip()] 75 | bboxes.append([x, y, w, h, cls]) 76 | 77 | idx2annotation[index] = bboxes 78 | 79 | return idx2annotation 80 | 81 | def evaluate_detections(self, eval_dir, global_step, all_boxes): 82 | """Evaluate detection results. 83 | Args: 84 | eval_dir: directory to write evaluation logs 85 | global_step: step of the checkpoint 86 | all_boxes: all_boxes[cls][image] = N x 5 arrays of 87 | [xmin, ymin, xmax, ymax, score] 88 | Returns: 89 | aps: array of average precisions. 90 | names: class names corresponding to each ap 91 | """ 92 | det_file_dir = os.path.join( 93 | eval_dir, 'detection_files_{:s}'.format(global_step)) 94 | if not os.path.isdir(det_file_dir): 95 | os.mkdir(det_file_dir) 96 | det_file_path_template = os.path.join(det_file_dir, '{:s}.txt') 97 | 98 | for cls_idx, cls in enumerate(self._classes): 99 | det_file_name = det_file_path_template.format(cls) 100 | with open(det_file_name, 'wt') as f: 101 | for im_idx, index in enumerate(self._image_idx): 102 | dets = all_boxes[cls_idx][im_idx] 103 | # VOC expects 1-based indices 104 | for k in xrange(len(dets)): 105 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 
106 | format(index, dets[k][-1], 107 | dets[k][0]+1, dets[k][1]+1, 108 | dets[k][2]+1, dets[k][3]+1) 109 | ) 110 | 111 | # Evaluate detection results 112 | annopath = os.path.join( 113 | self._data_root_path, 114 | 'VOC'+self._year, 115 | 'Annotations', 116 | '{:s}.xml' 117 | ) 118 | imagesetfile = os.path.join( 119 | self._data_root_path, 120 | 'VOC'+self._year, 121 | 'ImageSets', 122 | 'Main', 123 | self._image_set+'.txt' 124 | ) 125 | cachedir = os.path.join(self._data_root_path, 'annotations_cache') 126 | aps = [] 127 | use_07_metric = True if int(self._year) < 2010 else False 128 | for i, cls in enumerate(self._classes): 129 | filename = det_file_path_template.format(cls) 130 | _, _, ap = voc_eval( 131 | filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5, 132 | use_07_metric=use_07_metric) 133 | aps += [ap] 134 | print ('{:s}: AP = {:.4f}'.format(cls, ap)) 135 | 136 | print ('Mean AP = {:.4f}'.format(np.mean(aps))) 137 | return aps, self._classes 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## _SqueezeDet:_ Unified, Small, Low Power Fully Convolutional Neural Networks for Real-Time Object Detection for Autonomous Driving 2 | By Bichen Wu, Forrest Iandola, Peter H. Jin, Kurt Keutzer (UC Berkeley & DeepScale) 3 | 4 | This repository contains a tensorflow implementation of SqueezeDet, a convolutional neural network based object detector described in our paper: https://arxiv.org/abs/1612.01051. If you find this work useful for your research, please consider citing: 5 | 6 | @inproceedings{squeezedet, 7 | Author = {Bichen Wu and Forrest Iandola and Peter H. Jin and Kurt Keutzer}, 8 | Title = {SqueezeDet: Unified, Small, Low Power Fully Convolutional Neural Networks for Real-Time Object Detection for Autonomous Driving}, 9 | Journal = {arXiv:1612.01051}, 10 | Year = {2016} 11 | } 12 | 13 | ## Installation: 14 | 15 | The following instructions are written for Linux-based distros. 16 | 17 | - Clone the SqueezeDet repository: 18 | 19 | ```Shell 20 | git clone https://github.com/BichenWuUCB/squeezeDet.git 21 | ``` 22 | Let's call the top level directory of SqueezeDet `$SQDT_ROOT`. 23 | 24 | - (Optional) Setup your own virtual environment. 25 | 26 | 1. The following assumes `python` is the Python2.7 executable. Navigate to your user home directory, and create the virtual environment there. 27 | 28 | ```Shell 29 | cd ~ 30 | virtualenv env --python=python 31 | ``` 32 | 33 | 2. Launch the virtual environment. 34 | 35 | ```Shell 36 | source env/bin/activate 37 | ``` 38 | 39 | - Use pip to install required Python packages: 40 | 41 | ```Shell 42 | pip install -r requirements.txt 43 | ``` 44 | ## Demo: 45 | - Download SqueezeDet model parameters from [here](https://www.dropbox.com/s/a6t3er8f03gdl4z/model_checkpoints.tgz?dl=0), untar it, and put it under `$SQDT_ROOT/data/` If you are using command line, type: 46 | 47 | ```Shell 48 | cd $SQDT_ROOT/data/ 49 | wget https://www.dropbox.com/s/a6t3er8f03gdl4z/model_checkpoints.tgz 50 | tar -xzvf model_checkpoints.tgz 51 | rm model_checkpoints.tgz 52 | ``` 53 | 54 | 55 | - Now we can run the demo. 
To detect the sample image `$SQDT_ROOT/data/sample.png`,
56 | 
57 | ```Shell
58 | cd $SQDT_ROOT/
59 | python ./src/demo.py
60 | ```
61 | If the installation is correct, the detector should generate this image: ![alt text](https://github.com/BichenWuUCB/squeezeDet/blob/master/README/out_sample.png)
62 | 
63 | To detect other image(s), use the flag `--input_path=./data/*.png` to point to the input image(s). Input images will be scaled to a resolution of 1242x375 (the KITTI image resolution), so detection works best when the original resolution is close to that.
64 | 
65 | - SqueezeDet is a real-time object detector, which can also be used to detect videos. The video demo will be released later.
66 | 
67 | ## Training/Validation:
68 | - Download the KITTI object detection dataset: [images](http://www.cvlibs.net/download.php?file=data_object_image_2.zip) and [labels](http://www.cvlibs.net/download.php?file=data_object_label_2.zip). Put them under `$SQDT_ROOT/data/KITTI/`. Unzip them, then you will get two directories: `$SQDT_ROOT/data/KITTI/training/` and `$SQDT_ROOT/data/KITTI/testing/`.
69 | 
70 | - Now we need to split the training data into a training set and a validation set.
71 | 
72 | ```Shell
73 | cd $SQDT_ROOT/data/KITTI/
74 | mkdir ImageSets
75 | cd ./ImageSets
76 | ls ../training/image_2/ | grep ".png" | sed s/.png// > trainval.txt
77 | ```
78 | `trainval.txt` contains indices to all the images in the training data. In our experiments, we randomly split half of the indices in `trainval.txt` into `train.txt` to form a training set, and the rest into `val.txt` to form a validation set. For your convenience, we provide a script to split the train-val set automatically. Simply run
79 | 
80 | ```Shell
81 | cd $SQDT_ROOT/data/
82 | python random_split_train_val.py
83 | ```
84 | 
85 | then you should get `train.txt` and `val.txt` under `$SQDT_ROOT/data/KITTI/ImageSets`.
86 | 
87 | When the above two steps are finished, the structure of `$SQDT_ROOT/data/KITTI/` should at least contain:
88 | 
89 | ```Shell
90 | $SQDT_ROOT/data/KITTI/
91 |   |->training/
92 |   |     |-> image_2/00****.png
93 |   |     L-> label_2/00****.txt
94 |   |->testing/
95 |   |     L-> image_2/00****.png
96 |   L->ImageSets/
97 |         |-> trainval.txt
98 |         |-> train.txt
99 |         L-> val.txt
100 | ```
101 | 
102 | - Next, download the CNN models pretrained for ImageNet classification:
103 | ```Shell
104 | cd $SQDT_ROOT/data/
105 | # SqueezeNet
106 | wget https://www.dropbox.com/s/fzvtkc42hu3xw47/SqueezeNet.tgz
107 | tar -xzvf SqueezeNet.tgz
108 | # ResNet50
109 | wget https://www.dropbox.com/s/p65lktictdq011t/ResNet.tgz
110 | tar -xzvf ResNet.tgz
111 | # VGG16
112 | wget https://www.dropbox.com/s/zxd72nj012lzrlf/VGG16.tgz
113 | tar -xzvf VGG16.tgz
114 | ```
115 | 
116 | - Now we can start training. The training script can be found in `$SQDT_ROOT/scripts/train.sh`, which contains commands to train 4 models: SqueezeDet, SqueezeDet+, VGG16+ConvDet, and ResNet50+ConvDet.
117 | ```Shell
118 | cd $SQDT_ROOT/
119 | ./scripts/train.sh -net (squeezeDet|squeezeDet+|vgg16|resnet50) -train_dir /tmp/bichen/logs/squeezedet -gpu 0
120 | ```
121 | 
122 | Training logs are saved to the directory specified by `-train_dir`. GPU id is specified by `-gpu`. The network to train is specified by `-net`.
123 | 
124 | - Before evaluation, you need to first compile the official evaluation script of the KITTI dataset:
125 | ```Shell
126 | cd $SQDT_ROOT/src/dataset/kitti-eval
127 | make
128 | ```
129 | 
130 | - Then, you can launch the evaluation script (in parallel with training) by
131 | 
132 | ```Shell
133 | cd $SQDT_ROOT/
134 | ./scripts/eval.sh -net (squeezeDet|squeezeDet+|vgg16|resnet50) -eval_dir /tmp/bichen/logs/squeezedet -image_set (train|val) -gpu 1
135 | ```
136 | 
137 | Note that `-train_dir` in the training script should be the same as `-eval_dir` in the evaluation script, to make it easy for tensorboard to load logs.
138 | 
139 | You can run two evaluation scripts to simultaneously evaluate the model on the training and validation sets. The training script keeps dumping checkpoints (model parameters) to the training directory, once every 500 steps as set by `--checkpoint_step` in `train.sh` (the step size can be changed). Once a new checkpoint is saved, the evaluation threads load it and evaluate it on the training and validation sets.
140 | 
141 | - Finally, to monitor the training and evaluation process, you can use tensorboard by
142 | 
143 | ```Shell
144 | tensorboard --logdir=$LOG_DIR
145 | ```
146 | Here, `$LOG_DIR` is the directory where your training and evaluation threads dump log events, which should be the same as `-train_dir` and `-eval_dir` specified in `train.sh` and `eval.sh`. From tensorboard, you should be able to see a lot of information including loss, average precision, error analysis, example detections, model visualization, etc.
147 | 
148 | ![alt text](https://github.com/BichenWuUCB/squeezeDet/blob/master/README/detection_analysis.png)
149 | ![alt text](https://github.com/BichenWuUCB/squeezeDet/blob/master/README/graph.png)
150 | ![alt text](https://github.com/BichenWuUCB/squeezeDet/blob/master/README/det_img.png)
151 | 
--------------------------------------------------------------------------------
/src/nets/resnet50_convDet.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """ResNet50+ConvDet model."""
4 | 
5 | from __future__ import absolute_import
6 | from __future__ import division
7 | from __future__ import print_function
8 | 
9 | import os
10 | import sys
11 | 
12 | import joblib
13 | from utils import util
14 | from easydict import EasyDict as edict
15 | import numpy as np
16 | import tensorflow as tf
17 | from nn_skeleton import ModelSkeleton
18 | 
19 | 
20 | class ResNet50ConvDet(ModelSkeleton):
21 |   def __init__(self, mc, gpu_id=0):
22 |     with tf.device('/gpu:{}'.format(gpu_id)):
23 |       ModelSkeleton.__init__(self, mc)
24 | 
25 |       self._add_forward_graph()
26 |       self._add_interpretation_graph()
27 |       self._add_loss_graph()
28 |       self._add_train_graph()
29 |       self._add_viz_graph()
30 | 
31 |   def _add_forward_graph(self):
32 |     """NN architecture."""
33 | 
34 |     mc = self.mc
35 |     if mc.LOAD_PRETRAINED_MODEL:
36 |       assert tf.gfile.Exists(mc.PRETRAINED_MODEL_PATH), \
37 |           'Cannot find pretrained model at the given path:' \
38 |           '  {}'.format(mc.PRETRAINED_MODEL_PATH)
39 |       self.caffemodel_weight = joblib.load(mc.PRETRAINED_MODEL_PATH)
40 | 
41 |     conv1 = self._conv_bn_layer(
42 |         self.image_input, 'conv1', 'bn_conv1', 'scale_conv1', filters=64,
43 |         size=7, stride=2, freeze=True, conv_with_bias=True)
44 |     pool1 = self._pooling_layer(
45 |         'pool1', conv1, size=3, stride=2, padding='VALID')
46 | 
47 |     with tf.variable_scope('conv2_x') as scope:
48 |       with tf.variable_scope('res2a'):
49 |         branch1 = self._conv_bn_layer(
branch1 = self._conv_bn_layer( 50 | pool1, 'res2a_branch1', 'bn2a_branch1', 'scale2a_branch1', 51 | filters=256, size=1, stride=1, freeze=True, relu=False) 52 | branch2 = self._res_branch( 53 | pool1, layer_name='2a', in_filters=64, out_filters=256, 54 | down_sample=False, freeze=True) 55 | res2a = tf.nn.relu(branch1+branch2, 'relu') 56 | with tf.variable_scope('res2b'): 57 | branch2 = self._res_branch( 58 | res2a, layer_name='2b', in_filters=64, out_filters=256, 59 | down_sample=False, freeze=True) 60 | res2b = tf.nn.relu(res2a+branch2, 'relu') 61 | with tf.variable_scope('res2c'): 62 | branch2 = self._res_branch( 63 | res2b, layer_name='2c', in_filters=64, out_filters=256, 64 | down_sample=False, freeze=True) 65 | res2c = tf.nn.relu(res2b+branch2, 'relu') 66 | 67 | with tf.variable_scope('conv3_x') as scope: 68 | with tf.variable_scope('res3a'): 69 | branch1 = self._conv_bn_layer( 70 | res2c, 'res3a_branch1', 'bn3a_branch1', 'scale3a_branch1', 71 | filters=512, size=1, stride=2, freeze=True, relu=False) 72 | branch2 = self._res_branch( 73 | res2c, layer_name='3a', in_filters=128, out_filters=512, 74 | down_sample=True, freeze=True) 75 | res3a = tf.nn.relu(branch1+branch2, 'relu') 76 | with tf.variable_scope('res3b'): 77 | branch2 = self._res_branch( 78 | res3a, layer_name='3b', in_filters=128, out_filters=512, 79 | down_sample=False, freeze=True) 80 | res3b = tf.nn.relu(res3a+branch2, 'relu') 81 | with tf.variable_scope('res3c'): 82 | branch2 = self._res_branch( 83 | res3b, layer_name='3c', in_filters=128, out_filters=512, 84 | down_sample=False, freeze=True) 85 | res3c = tf.nn.relu(res3b+branch2, 'relu') 86 | with tf.variable_scope('res3d'): 87 | branch2 = self._res_branch( 88 | res3c, layer_name='3d', in_filters=128, out_filters=512, 89 | down_sample=False, freeze=True) 90 | res3d = tf.nn.relu(res3c+branch2, 'relu') 91 | 92 | with tf.variable_scope('conv4_x') as scope: 93 | with tf.variable_scope('res4a'): 94 | branch1 = self._conv_bn_layer( 95 | res3d, 'res4a_branch1', 'bn4a_branch1', 'scale4a_branch1', 96 | filters=1024, size=1, stride=2, relu=False) 97 | branch2 = self._res_branch( 98 | res3d, layer_name='4a', in_filters=256, out_filters=1024, 99 | down_sample=True) 100 | res4a = tf.nn.relu(branch1+branch2, 'relu') 101 | with tf.variable_scope('res4b'): 102 | branch2 = self._res_branch( 103 | res4a, layer_name='4b', in_filters=256, out_filters=1024, 104 | down_sample=False) 105 | res4b = tf.nn.relu(res4a+branch2, 'relu') 106 | with tf.variable_scope('res4c'): 107 | branch2 = self._res_branch( 108 | res4b, layer_name='4c', in_filters=256, out_filters=1024, 109 | down_sample=False) 110 | res4c = tf.nn.relu(res4b+branch2, 'relu') 111 | with tf.variable_scope('res4d'): 112 | branch2 = self._res_branch( 113 | res4c, layer_name='4d', in_filters=256, out_filters=1024, 114 | down_sample=False) 115 | res4d = tf.nn.relu(res4c+branch2, 'relu') 116 | with tf.variable_scope('res4e'): 117 | branch2 = self._res_branch( 118 | res4d, layer_name='4e', in_filters=256, out_filters=1024, 119 | down_sample=False) 120 | res4e = tf.nn.relu(res4d+branch2, 'relu') 121 | with tf.variable_scope('res4f'): 122 | branch2 = self._res_branch( 123 | res4e, layer_name='4f', in_filters=256, out_filters=1024, 124 | down_sample=False) 125 | res4f = tf.nn.relu(res4e+branch2, 'relu') 126 | 127 | dropout4 = tf.nn.dropout(res4f, self.keep_prob, name='drop4') 128 | 129 | num_output = mc.ANCHOR_PER_GRID * (mc.CLASSES + 1 + 4) 130 | self.preds = self._conv_layer( 131 | 'conv5', dropout4, filters=num_output, size=3, stride=1, 
132 |         padding='SAME', xavier=False, relu=False, stddev=0.0001)
133 | 
134 |   def _res_branch(
135 |       self, inputs, layer_name, in_filters, out_filters, down_sample=False,
136 |       freeze=False):
137 |     """Residual branch constructor.
138 | 
139 |     Args:
140 |       inputs: input tensor
141 |       layer_name: layer name
142 |       in_filters: number of filters in the XX_branch2a and XX_branch2b layers.
143 |       out_filters: number of filters in the XX_branch2c layer.
144 |       down_sample: if true, down-sample the input feature map
145 |       freeze: if true, do not change parameters in this layer
146 |     Returns:
147 |       A residual branch output operation.
148 |     """
149 |     with tf.variable_scope('res'+layer_name+'_branch2'):
150 |       stride = 2 if down_sample else 1
151 |       output = self._conv_bn_layer(
152 |           inputs,
153 |           conv_param_name='res'+layer_name+'_branch2a',
154 |           bn_param_name='bn'+layer_name+'_branch2a',
155 |           scale_param_name='scale'+layer_name+'_branch2a',
156 |           filters=in_filters, size=1, stride=stride, freeze=freeze)
157 |       output = self._conv_bn_layer(
158 |           output,
159 |           conv_param_name='res'+layer_name+'_branch2b',
160 |           bn_param_name='bn'+layer_name+'_branch2b',
161 |           scale_param_name='scale'+layer_name+'_branch2b',
162 |           filters=in_filters, size=3, stride=1, freeze=freeze)
163 |       output = self._conv_bn_layer(
164 |           output,
165 |           conv_param_name='res'+layer_name+'_branch2c',
166 |           bn_param_name='bn'+layer_name+'_branch2c',
167 |           scale_param_name='scale'+layer_name+'_branch2c',
168 |           filters=out_filters, size=1, stride=1, freeze=freeze, relu=False)
169 |     return output
--------------------------------------------------------------------------------
/src/utils/util.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Utility functions."""
4 | 
5 | import numpy as np
6 | import time
7 | import tensorflow as tf
8 | 
9 | def iou(box1, box2):
10 |   """Compute the Intersection-Over-Union of two given boxes.
11 | 
12 |   Args:
13 |     box1: array of 4 elements [cx, cy, width, height].
14 |     box2: same as above
15 |   Returns:
16 |     iou: a float number in range [0, 1]. iou of the two boxes.
17 |   """
18 | 
19 |   lr = min(box1[0]+0.5*box1[2], box2[0]+0.5*box2[2]) - \
20 |       max(box1[0]-0.5*box1[2], box2[0]-0.5*box2[2])
21 |   if lr > 0:
22 |     tb = min(box1[1]+0.5*box1[3], box2[1]+0.5*box2[3]) - \
23 |         max(box1[1]-0.5*box1[3], box2[1]-0.5*box2[3])
24 |     if tb > 0:
25 |       intersection = tb*lr
26 |       union = box1[2]*box1[3]+box2[2]*box2[3]-intersection
27 | 
28 |       return intersection/union
29 | 
30 |   return 0
31 | 
32 | def batch_iou(boxes, box):
33 |   """Compute the Intersection-Over-Union of a batch of boxes with another
34 |   box.
35 | 
36 |   Args:
37 |     boxes: 2D array of boxes, each of form [cx, cy, width, height].
38 |     box: a single array of [cx, cy, width, height]
39 |   Returns:
40 |     ious: array of float numbers in range [0, 1], one per box in `boxes`.
41 |   """
42 |   lr = np.maximum(
43 |       np.minimum(boxes[:,0]+0.5*boxes[:,2], box[0]+0.5*box[2]) - \
44 |       np.maximum(boxes[:,0]-0.5*boxes[:,2], box[0]-0.5*box[2]),
45 |       0
46 |   )
47 |   tb = np.maximum(
48 |       np.minimum(boxes[:,1]+0.5*boxes[:,3], box[1]+0.5*box[3]) - \
49 |       np.maximum(boxes[:,1]-0.5*boxes[:,3], box[1]-0.5*box[3]),
50 |       0
51 |   )
52 |   inter = lr*tb
53 |   union = boxes[:,2]*boxes[:,3] + box[2]*box[3] - inter
54 |   return inter/union
55 | 
56 | def nms(boxes, probs, threshold):
57 |   """Non-Maximum suppression.
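
  Boxes are processed in descending score order; any box whose IoU with a
  higher-scoring box exceeds the threshold is discarded.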
58 |   Args:
59 |     boxes: array of [cx, cy, w, h] (center format)
60 |     probs: array of probabilities
61 |     threshold: two boxes are considered overlapping if their IOU is larger
62 |         than this threshold
63 | 
64 |   Returns:
65 |     keep: array of True or False.
66 |   """
67 | 
68 |   order = probs.argsort()[::-1]
69 |   keep = [True]*len(order)
70 | 
71 |   for i in range(len(order)-1):
72 |     ovps = batch_iou(boxes[order[i+1:]], boxes[order[i]])
73 |     for j, ov in enumerate(ovps):
74 |       if ov > threshold:
75 |         keep[order[j+i+1]] = False
76 |   return keep
77 | 
78 | # TODO(bichen): this is not equivalent to full NMS. Need to improve it.
79 | def recursive_nms(boxes, probs, threshold, form='center'):
80 |   """Recursive Non-Maximum suppression.
81 |   Args:
82 |     boxes: array of [cx, cy, w, h] (center format) or [xmin, ymin, xmax, ymax]
83 |     probs: array of probabilities
84 |     threshold: two boxes are considered overlapping if their IOU is larger than
85 |         this threshold
86 |     form: 'center' or 'diagonal'
87 |   Returns:
88 |     keep: array of True or False.
89 |   """
90 | 
91 |   assert form == 'center' or form == 'diagonal', \
92 |       'bounding box format not accepted: {}.'.format(form)
93 | 
94 |   if form == 'center':
95 |     # convert to diagonal format
96 |     boxes = np.array([bbox_transform(b) for b in boxes])
97 | 
98 |   areas = (boxes[:, 2]-boxes[:, 0])*(boxes[:, 3]-boxes[:, 1])
99 |   hidx = boxes[:, 0].argsort()
100 |   keep = [True]*len(hidx)
101 | 
102 |   def _nms(hidx):
103 |     order = probs[hidx].argsort()[::-1]
104 | 
105 |     for idx in range(len(order)):
106 |       if not keep[hidx[order[idx]]]:
107 |         continue
108 |       xx2 = boxes[hidx[order[idx]], 2]
109 |       for jdx in range(idx+1, len(order)):
110 |         if not keep[hidx[order[jdx]]]:
111 |           continue
112 |         xx1 = boxes[hidx[order[jdx]], 0]
113 |         if xx2 < xx1:
114 |           break
115 |         w = xx2 - xx1
116 |         yy1 = max(boxes[hidx[order[idx]], 1], boxes[hidx[order[jdx]], 1])
117 |         yy2 = min(boxes[hidx[order[idx]], 3], boxes[hidx[order[jdx]], 3])
118 |         if yy2 <= yy1:
119 |           continue
120 |         h = yy2-yy1
121 |         inter = w*h
122 |         iou = inter/(areas[hidx[order[idx]]]+areas[hidx[order[jdx]]]-inter)
123 |         if iou > threshold:
124 |           keep[hidx[order[jdx]]] = False
125 | 
126 |   def _recur(hidx):
127 |     if len(hidx) <= 20:
128 |       _nms(hidx)
129 |     else:
130 |       mid = len(hidx)//2  # integer division, so slicing also works on Python 3
131 |       _recur(hidx[:mid])
132 |       _recur(hidx[mid:])
133 |       _nms([idx for idx in hidx if keep[idx]])
134 | 
135 |   _recur(hidx)
136 | 
137 |   return keep
138 | 
139 | def sparse_to_dense(sp_indices, output_shape, values, default_value=0):
140 |   """Build a dense matrix from sparse representations.
141 | 
142 |   Args:
143 |     sp_indices: A [0-2]-D array that contains the index to place values.
144 |     output_shape: shape of the dense matrix.
145 |     values: A {0,1}-D array of values to place at the corresponding rows of
146 |         sp_indices.
147 |     default_value: values to set for indices not specified in sp_indices.
148 |   Return:
149 |     A dense numpy N-D array with shape output_shape.
150 |   """
151 | 
152 |   assert len(sp_indices) == len(values), \
153 |       'Length of sp_indices is not equal to length of values'
154 | 
155 |   array = np.ones(output_shape) * default_value
156 |   for idx, value in zip(sp_indices, values):
157 |     array[tuple(idx)] = value
158 |   return array
159 | 
160 | def bgr_to_rgb(ims):
161 |   """Convert a list of images from BGR format to RGB format."""
162 |   out = []
163 |   for im in ims:
164 |     out.append(im[:,:,::-1])
165 |   return out
166 | 
167 | def bbox_transform(bbox):
168 |   """convert a bbox of form [cx, cy, w, h] to [xmin, ymin, xmax, ymax].
Works 169 | for numpy array or list of tensors. 170 | """ 171 | with tf.variable_scope('bbox_transform') as scope: 172 | cx, cy, w, h = bbox 173 | out_box = [[]]*4 174 | out_box[0] = cx-w/2 175 | out_box[1] = cy-h/2 176 | out_box[2] = cx+w/2 177 | out_box[3] = cy+h/2 178 | 179 | return out_box 180 | 181 | def bbox_transform_inv(bbox): 182 | """convert a bbox of form [xmin, ymin, xmax, ymax] to [cx, cy, w, h]. Works 183 | for numpy array or list of tensors. 184 | """ 185 | with tf.variable_scope('bbox_transform_inv') as scope: 186 | xmin, ymin, xmax, ymax = bbox 187 | out_box = [[]]*4 188 | 189 | width = xmax - xmin + 1.0 190 | height = ymax - ymin + 1.0 191 | out_box[0] = xmin + 0.5*width 192 | out_box[1] = ymin + 0.5*height 193 | out_box[2] = width 194 | out_box[3] = height 195 | 196 | return out_box 197 | 198 | class Timer(object): 199 | def __init__(self): 200 | self.total_time = 0.0 201 | self.calls = 0 202 | self.start_time = 0.0 203 | self.duration = 0.0 204 | self.average_time = 0.0 205 | 206 | def tic(self): 207 | self.start_time = time.time() 208 | 209 | def toc(self, average=True): 210 | self.duration = time.time() - self.start_time 211 | self.total_time += self.duration 212 | self.calls += 1 213 | self.average_time = self.total_time/self.calls 214 | if average: 215 | return self.average_time 216 | else: 217 | return self.duration 218 | 219 | def safe_exp(w, thresh): 220 | """Safe exponential function for tensors.""" 221 | 222 | slope = np.exp(thresh) 223 | with tf.variable_scope('safe_exponential'): 224 | lin_bool = w > thresh 225 | lin_region = tf.to_float(lin_bool) 226 | 227 | lin_out = slope*(w - thresh + 1.) 228 | exp_out = tf.exp(tf.where(lin_bool, tf.zeros_like(w), w)) 229 | 230 | out = lin_region*lin_out + (1.-lin_region)*exp_out 231 | return out 232 | 233 | 234 | -------------------------------------------------------------------------------- /src/dataset/voc_eval.py: -------------------------------------------------------------------------------- 1 | # This file was from 2 | # https://raw.githubusercontent.com/rbgirshick/py-faster-rcnn/master/lib/datasets/voc_eval.py 3 | # -------------------------------------------------------- 4 | # Fast/er R-CNN 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Written by Bharath Hariharan 7 | # -------------------------------------------------------- 8 | 9 | import xml.etree.ElementTree as ET 10 | import os 11 | import cPickle 12 | import numpy as np 13 | 14 | def parse_rec(filename): 15 | """ Parse a PASCAL VOC xml file """ 16 | tree = ET.parse(filename) 17 | objects = [] 18 | for obj in tree.findall('object'): 19 | obj_struct = {} 20 | obj_struct['name'] = obj.find('name').text 21 | obj_struct['pose'] = obj.find('pose').text 22 | obj_struct['truncated'] = int(obj.find('truncated').text) 23 | obj_struct['difficult'] = int(obj.find('difficult').text) 24 | bbox = obj.find('bndbox') 25 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 26 | int(bbox.find('ymin').text), 27 | int(bbox.find('xmax').text), 28 | int(bbox.find('ymax').text)] 29 | objects.append(obj_struct) 30 | 31 | return objects 32 | 33 | def voc_ap(rec, prec, use_07_metric=False): 34 | """ ap = voc_ap(rec, prec, [use_07_metric]) 35 | Compute VOC AP given precision and recall. 36 | If use_07_metric is true, uses the 37 | VOC 07 11 point method (default:False). 38 | """ 39 | if use_07_metric: 40 | # 11 point metric 41 | ap = 0. 
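# VOC07 11-point metric: sample precision at recall levels {0.0, 0.1, ..., 1.0},
# using at each level the maximum precision over all recalls >= that level,
# then average the 11 values.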
42 | for t in np.arange(0., 1.1, 0.1): 43 | if np.sum(rec >= t) == 0: 44 | p = 0 45 | else: 46 | p = np.max(prec[rec >= t]) 47 | ap = ap + p / 11. 48 | else: 49 | # correct AP calculation 50 | # first append sentinel values at the end 51 | mrec = np.concatenate(([0.], rec, [1.])) 52 | mpre = np.concatenate(([0.], prec, [0.])) 53 | 54 | # compute the precision envelope 55 | for i in range(mpre.size - 1, 0, -1): 56 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 57 | 58 | # to calculate area under PR curve, look for points 59 | # where X axis (recall) changes value 60 | i = np.where(mrec[1:] != mrec[:-1])[0] 61 | 62 | # and sum (\Delta recall) * prec 63 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 64 | return ap 65 | 66 | def voc_eval(detpath, 67 | annopath, 68 | imagesetfile, 69 | classname, 70 | cachedir, 71 | ovthresh=0.5, 72 | use_07_metric=False): 73 | """rec, prec, ap = voc_eval(detpath, 74 | annopath, 75 | imagesetfile, 76 | classname, 77 | [ovthresh], 78 | [use_07_metric]) 79 | 80 | Top level function that does the PASCAL VOC evaluation. 81 | 82 | detpath: Path to detections 83 | detpath.format(classname) should produce the detection results file. 84 | annopath: Path to annotations 85 | annopath.format(imagename) should be the xml annotations file. 86 | imagesetfile: Text file containing the list of images, one image per line. 87 | classname: Category name (duh) 88 | cachedir: Directory for caching the annotations 89 | [ovthresh]: Overlap threshold (default = 0.5) 90 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 91 | (default False) 92 | """ 93 | # assumes detections are in detpath.format(classname) 94 | # assumes annotations are in annopath.format(imagename) 95 | # assumes imagesetfile is a text file with each line an image name 96 | # cachedir caches the annotations in a pickle file 97 | 98 | # first load gt 99 | if not os.path.isdir(cachedir): 100 | os.mkdir(cachedir) 101 | cachefile = os.path.join(cachedir, 'annots.pkl') 102 | # read list of images 103 | with open(imagesetfile, 'r') as f: 104 | lines = f.readlines() 105 | imagenames = [x.strip() for x in lines] 106 | 107 | if not os.path.isfile(cachefile): 108 | # load annots 109 | recs = {} 110 | for i, imagename in enumerate(imagenames): 111 | recs[imagename] = parse_rec(annopath.format(imagename)) 112 | if i % 100 == 0: 113 | print 'Reading annotation for {:d}/{:d}'.format( 114 | i + 1, len(imagenames)) 115 | # save 116 | print 'Saving cached annotations to {:s}'.format(cachefile) 117 | with open(cachefile, 'w') as f: 118 | cPickle.dump(recs, f) 119 | else: 120 | # load 121 | with open(cachefile, 'r') as f: 122 | recs = cPickle.load(f) 123 | 124 | # extract gt objects for this class 125 | class_recs = {} 126 | npos = 0 127 | for imagename in imagenames: 128 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 129 | bbox = np.array([x['bbox'] for x in R]) 130 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 131 | det = [False] * len(R) 132 | npos = npos + sum(~difficult) 133 | class_recs[imagename] = {'bbox': bbox, 134 | 'difficult': difficult, 135 | 'det': det} 136 | 137 | # read dets 138 | detfile = detpath.format(classname) 139 | with open(detfile, 'r') as f: 140 | lines = f.readlines() 141 | 142 | splitlines = [x.strip().split(' ') for x in lines] 143 | image_ids = [x[0] for x in splitlines] 144 | confidence = np.array([float(x[1]) for x in splitlines]) 145 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 146 | 147 | if confidence.shape[0] == 0: 
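# No detections at all for this class: return zero recall, precision, and AP.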
148 | return 0, 0, 0 149 | 150 | # sort by confidence 151 | sorted_ind = np.argsort(-confidence) 152 | sorted_scores = np.sort(-confidence) 153 | BB = BB[sorted_ind, :] 154 | image_ids = [image_ids[x] for x in sorted_ind] 155 | 156 | 157 | # go down dets and mark TPs and FPs 158 | nd = len(image_ids) 159 | tp = np.zeros(nd) 160 | fp = np.zeros(nd) 161 | for d in range(nd): 162 | R = class_recs[image_ids[d]] 163 | bb = BB[d, :].astype(float) 164 | ovmax = -np.inf 165 | BBGT = R['bbox'].astype(float) 166 | 167 | if BBGT.size > 0: 168 | # compute overlaps 169 | # intersection 170 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 171 | iymin = np.maximum(BBGT[:, 1], bb[1]) 172 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 173 | iymax = np.minimum(BBGT[:, 3], bb[3]) 174 | iw = np.maximum(ixmax - ixmin + 1., 0.) 175 | ih = np.maximum(iymax - iymin + 1., 0.) 176 | inters = iw * ih 177 | 178 | # union 179 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 180 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 181 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 182 | 183 | overlaps = inters / uni 184 | ovmax = np.max(overlaps) 185 | jmax = np.argmax(overlaps) 186 | 187 | if ovmax > ovthresh: 188 | if not R['difficult'][jmax]: 189 | if not R['det'][jmax]: 190 | tp[d] = 1. 191 | R['det'][jmax] = 1 192 | else: 193 | fp[d] = 1. 194 | else: 195 | fp[d] = 1. 196 | 197 | # compute precision recall 198 | fp = np.cumsum(fp) 199 | tp = np.cumsum(tp) 200 | rec = tp / float(npos) 201 | # avoid divide by zero in case the first detection matches a difficult 202 | # ground truth 203 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 204 | ap = voc_ap(rec, prec, use_07_metric) 205 | 206 | return rec, prec, ap 207 | -------------------------------------------------------------------------------- /src/demo.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """SqueezeDet Demo. 4 | 5 | In image detection mode, for a given image, detect objects and draw bounding 6 | boxes around them. In video detection mode, perform real-time detection on the 7 | video stream. 8 | """ 9 | 10 | from __future__ import absolute_import 11 | from __future__ import division 12 | from __future__ import print_function 13 | 14 | import cv2 15 | import time 16 | import sys 17 | import os 18 | import glob 19 | 20 | import numpy as np 21 | import tensorflow as tf 22 | 23 | from config import * 24 | from train import _draw_box 25 | from nets import * 26 | 27 | FLAGS = tf.app.flags.FLAGS 28 | 29 | tf.app.flags.DEFINE_string( 30 | 'mode', 'image', """'image' or 'video'.""") 31 | tf.app.flags.DEFINE_string( 32 | 'checkpoint', './data/model_checkpoints/squeezeDet/model.ckpt-87000', 33 | """Path to the model parameter file.""") 34 | tf.app.flags.DEFINE_string( 35 | 'input_path', './data/sample.png', 36 | """Input image or video to be detected. 
Can process glob input such as """
37 |     """./data/00000*.png.""")
38 | tf.app.flags.DEFINE_string(
39 |     'out_dir', './data/out/', """Directory to dump output image or video.""")
40 | tf.app.flags.DEFINE_string(
41 |     'demo_net', 'squeezeDet', """Neural net architecture.""")
42 | 
43 | 
44 | def video_demo():
45 |   """Detect videos."""
46 | 
47 |   cap = cv2.VideoCapture(FLAGS.input_path)
48 | 
49 |   # Define the codec and create VideoWriter object
50 |   # fourcc = cv2.cv.CV_FOURCC(*'XVID')
51 |   # fourcc = cv2.cv.CV_FOURCC(*'MJPG')
52 |   # in_file_name = os.path.split(FLAGS.input_path)[1]
53 |   # out_file_name = os.path.join(FLAGS.out_dir, 'out_'+in_file_name)
54 |   # out = cv2.VideoWriter(out_file_name, fourcc, 30.0, (375,1242), True)
55 |   # out = VideoWriter(out_file_name, frameSize=(1242, 375))
56 |   # out.open()
57 | 
58 |   assert FLAGS.demo_net == 'squeezeDet' or FLAGS.demo_net == 'squeezeDet+', \
59 |       'Selected neural net architecture not supported: {}'.format(FLAGS.demo_net)
60 | 
61 |   with tf.Graph().as_default():
62 |     # Load model
63 |     if FLAGS.demo_net == 'squeezeDet':
64 |       mc = kitti_squeezeDet_config()
65 |       mc.BATCH_SIZE = 1
66 |       # model parameters will be restored from checkpoint
67 |       mc.LOAD_PRETRAINED_MODEL = False
68 |       model = SqueezeDet(mc, FLAGS.gpu)
69 |     elif FLAGS.demo_net == 'squeezeDet+':
70 |       mc = kitti_squeezeDetPlus_config()
71 |       mc.BATCH_SIZE = 1
72 |       mc.LOAD_PRETRAINED_MODEL = False
73 |       model = SqueezeDetPlus(mc, FLAGS.gpu)
74 | 
75 |     saver = tf.train.Saver(model.model_params)
76 | 
77 |     with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
78 |       saver.restore(sess, FLAGS.checkpoint)
79 | 
80 |       times = {}
81 |       count = 0
82 |       while cap.isOpened():
83 |         t_start = time.time()
84 |         count += 1
85 |         out_im_name = os.path.join(FLAGS.out_dir, str(count).zfill(6)+'.jpg')
86 | 
87 |         # Load images from video and crop
88 |         ret, frame = cap.read()
89 |         if ret:
90 |           # crop frames
91 |           frame = frame[500:-205, 239:-439, :]
92 |           im_input = frame.astype(np.float32) - mc.BGR_MEANS
93 |         else:
94 |           break
95 | 
96 |         t_reshape = time.time()
97 |         times['reshape']= t_reshape - t_start
98 | 
99 |         # Detect
100 |         det_boxes, det_probs, det_class = sess.run(
101 |             [model.det_boxes, model.det_probs, model.det_class],
102 |             feed_dict={model.image_input:[im_input]})
103 | 
104 |         t_detect = time.time()
105 |         times['detect']= t_detect - t_reshape
106 | 
107 |         # Filter
108 |         final_boxes, final_probs, final_class = model.filter_prediction(
109 |             det_boxes[0], det_probs[0], det_class[0])
110 | 
111 |         keep_idx = [idx for idx in range(len(final_probs)) \
112 |                     if final_probs[idx] > mc.PLOT_PROB_THRESH]
113 |         final_boxes = [final_boxes[idx] for idx in keep_idx]
114 |         final_probs = [final_probs[idx] for idx in keep_idx]
115 |         final_class = [final_class[idx] for idx in keep_idx]
116 | 
117 |         t_filter = time.time()
118 |         times['filter']= t_filter - t_detect
119 | 
120 |         # Draw boxes
121 | 
122 |         # TODO(bichen): move this color dict to configuration file
123 |         cls2clr = {
124 |             'car': (255, 191, 0),
125 |             'cyclist': (0, 191, 255),
126 |             'pedestrian':(255, 0, 191)
127 |         }
128 |         _draw_box(
129 |             frame, final_boxes,
130 |             [mc.CLASS_NAMES[idx]+': (%.2f)'% prob \
131 |                 for idx, prob in zip(final_class, final_probs)],
132 |             cdict=cls2clr
133 |         )
134 | 
135 |         t_draw = time.time()
136 |         times['draw']= t_draw - t_filter
137 | 
138 |         cv2.imwrite(out_im_name, frame)
139 |         # out.write(frame)
140 | 
141 |         times['total']= time.time() - t_start
142 | 
143 |         # time_str = ''
144 |         # for t in times:
145 |         #   time_str += '{} time: {:.4f} '.format(t[0], t[1])
146 |         # time_str += '\n'
147 |         time_str = 'Total time: {:.4f}, detection time: {:.4f}, filter time: '\
148 |                    '{:.4f}'. \
149 |             format(times['total'], times['detect'], times['filter'])
150 | 
151 |         print (time_str)
152 | 
153 |         if cv2.waitKey(1) & 0xFF == ord('q'):
154 |           break
155 |     # Release everything if job is finished
156 |     cap.release()
157 |     # out.release()
158 |     cv2.destroyAllWindows()
159 | 
160 | 
161 | def image_demo():
162 |   """Detect image."""
163 | 
164 |   assert FLAGS.demo_net == 'squeezeDet' or FLAGS.demo_net == 'squeezeDet+', \
165 |       'Selected neural net architecture not supported: {}'.format(FLAGS.demo_net)
166 | 
167 |   with tf.Graph().as_default():
168 |     # Load model
169 |     if FLAGS.demo_net == 'squeezeDet':
170 |       mc = kitti_squeezeDet_config()
171 |       mc.BATCH_SIZE = 1
172 |       # model parameters will be restored from checkpoint
173 |       mc.LOAD_PRETRAINED_MODEL = False
174 |       model = SqueezeDet(mc, FLAGS.gpu)
175 |     elif FLAGS.demo_net == 'squeezeDet+':
176 |       mc = kitti_squeezeDetPlus_config()
177 |       mc.BATCH_SIZE = 1
178 |       mc.LOAD_PRETRAINED_MODEL = False
179 |       model = SqueezeDetPlus(mc, FLAGS.gpu)
180 | 
181 |     saver = tf.train.Saver(model.model_params)
182 | 
183 |     with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
184 |       saver.restore(sess, FLAGS.checkpoint)
185 | 
186 |       for f in glob.iglob(FLAGS.input_path):
187 |         im = cv2.imread(f)
188 |         im = im.astype(np.float32, copy=False)
189 |         im = cv2.resize(im, (mc.IMAGE_WIDTH, mc.IMAGE_HEIGHT))
190 |         input_image = im - mc.BGR_MEANS
191 | 
192 |         # Detect
193 |         det_boxes, det_probs, det_class = sess.run(
194 |             [model.det_boxes, model.det_probs, model.det_class],
195 |             feed_dict={model.image_input:[input_image]})
196 | 
197 |         # Filter
198 |         final_boxes, final_probs, final_class = model.filter_prediction(
199 |             det_boxes[0], det_probs[0], det_class[0])
200 | 
201 |         keep_idx = [idx for idx in range(len(final_probs)) \
202 |                     if final_probs[idx] > mc.PLOT_PROB_THRESH]
203 |         final_boxes = [final_boxes[idx] for idx in keep_idx]
204 |         final_probs = [final_probs[idx] for idx in keep_idx]
205 |         final_class = [final_class[idx] for idx in keep_idx]
206 | 
207 |         # TODO(bichen): move this color dict to configuration file
208 |         cls2clr = {
209 |             'car': (255, 191, 0),
210 |             'cyclist': (0, 191, 255),
211 |             'pedestrian':(255, 0, 191)
212 |         }
213 | 
214 |         # Draw boxes
215 |         _draw_box(
216 |             im, final_boxes,
217 |             [mc.CLASS_NAMES[idx]+': (%.2f)'% prob \
218 |                 for idx, prob in zip(final_class, final_probs)],
219 |             cdict=cls2clr,
220 |         )
221 | 
222 |         file_name = os.path.split(f)[1]
223 |         out_file_name = os.path.join(FLAGS.out_dir, 'out_'+file_name)
224 |         cv2.imwrite(out_file_name, im)
225 |         print ('Image detection output saved to {}'.format(out_file_name))
226 | 
227 | 
228 | def main(argv=None):
229 |   if not tf.gfile.Exists(FLAGS.out_dir):
230 |     tf.gfile.MakeDirs(FLAGS.out_dir)
231 |   if FLAGS.mode == 'image':
232 |     image_demo()
233 |   else:
234 |     video_demo()
235 | 
236 | if __name__ == '__main__':
237 |   tf.app.run()
238 | 
--------------------------------------------------------------------------------
/src/eval.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """Evaluation"""
4 | 
5 | from __future__ import absolute_import
6 | from __future__ import division
7 | from __future__ import print_function
8 | 
9 | import cv2
10 | from datetime import datetime
11 | import os.path
12 | import sys
13 | import time
14 | 
15 | import numpy as
np 16 | from six.moves import xrange 17 | import tensorflow as tf 18 | 19 | from config import * 20 | from dataset import pascal_voc, kitti 21 | from utils.util import bbox_transform, Timer 22 | from nets import * 23 | 24 | FLAGS = tf.app.flags.FLAGS 25 | 26 | tf.app.flags.DEFINE_string('dataset', 'KITTI', 27 | """Currently support PASCAL_VOC or KITTI dataset.""") 28 | tf.app.flags.DEFINE_string('data_path', '', """Root directory of data""") 29 | tf.app.flags.DEFINE_string('image_set', 'test', 30 | """Only used for VOC data.""" 31 | """Can be train, trainval, val, or test""") 32 | tf.app.flags.DEFINE_string('year', '2007', 33 | """VOC challenge year. 2007 or 2012""" 34 | """Only used for VOC data""") 35 | tf.app.flags.DEFINE_string('eval_dir', '/tmp/bichen/logs/squeezeDet/eval', 36 | """Directory where to write event logs """) 37 | tf.app.flags.DEFINE_string('checkpoint_path', '/tmp/bichen/logs/squeezeDet/train', 38 | """Path to the training checkpoint.""") 39 | tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 1, 40 | """How often to check if new cpt is saved.""") 41 | tf.app.flags.DEFINE_boolean('run_once', False, 42 | """Whether to run eval only once.""") 43 | tf.app.flags.DEFINE_string('net', 'squeezeDet', 44 | """Neural net architecture.""") 45 | tf.app.flags.DEFINE_string('gpu', '0', """gpu id.""") 46 | 47 | 48 | def eval_once( 49 | saver, ckpt_path, summary_writer, eval_summary_ops, eval_summary_phs, imdb, 50 | model): 51 | 52 | with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess: 53 | 54 | # Restores from checkpoint 55 | saver.restore(sess, ckpt_path) 56 | # Assuming model_checkpoint_path looks something like: 57 | # /ckpt_dir/model.ckpt-0, 58 | # extract global_step from it. 59 | global_step = ckpt_path.split('/')[-1].split('-')[-1] 60 | 61 | num_images = len(imdb.image_idx) 62 | 63 | all_boxes = [[[] for _ in xrange(num_images)] 64 | for _ in xrange(imdb.num_classes)] 65 | 66 | _t = {'im_detect': Timer(), 'im_read': Timer(), 'misc': Timer()} 67 | 68 | num_detection = 0.0 69 | for i in xrange(num_images): 70 | _t['im_read'].tic() 71 | images, scales = imdb.read_image_batch(shuffle=False) 72 | _t['im_read'].toc() 73 | 74 | _t['im_detect'].tic() 75 | det_boxes, det_probs, det_class = sess.run( 76 | [model.det_boxes, model.det_probs, model.det_class], 77 | feed_dict={model.image_input:images}) 78 | _t['im_detect'].toc() 79 | 80 | _t['misc'].tic() 81 | for j in range(len(det_boxes)): # batch 82 | # rescale 83 | det_boxes[j, :, 0::2] /= scales[j][0] 84 | det_boxes[j, :, 1::2] /= scales[j][1] 85 | 86 | det_bbox, score, det_class = model.filter_prediction( 87 | det_boxes[j], det_probs[j], det_class[j]) 88 | 89 | num_detection += len(det_bbox) 90 | for c, b, s in zip(det_class, det_bbox, score): 91 | all_boxes[c][i].append(bbox_transform(b) + [s]) 92 | _t['misc'].toc() 93 | 94 | print ('im_detect: {:d}/{:d} im_read: {:.3f}s ' 95 | 'detect: {:.3f}s misc: {:.3f}s'.format( 96 | i+1, num_images, _t['im_read'].average_time, 97 | _t['im_detect'].average_time, _t['misc'].average_time)) 98 | 99 | print ('Evaluating detections...') 100 | aps, ap_names = imdb.evaluate_detections( 101 | FLAGS.eval_dir, global_step, all_boxes) 102 | 103 | print ('Evaluation summary:') 104 | print (' Average number of detections per image: {}:'.format( 105 | num_detection/num_images)) 106 | print (' Timing:') 107 | print (' im_read: {:.3f}s detect: {:.3f}s misc: {:.3f}s'.format( 108 | _t['im_read'].average_time, _t['im_detect'].average_time, 109 | _t['misc'].average_time)) 110 | print (' 
Average precisions:') 111 | 112 | feed_dict = {} 113 | for cls, ap in zip(ap_names, aps): 114 | feed_dict[eval_summary_phs['APs/'+cls]] = ap 115 | print (' {}: {:.3f}'.format(cls, ap)) 116 | 117 | print (' Mean average precision: {:.3f}'.format(np.mean(aps))) 118 | feed_dict[eval_summary_phs['APs/mAP']] = np.mean(aps) 119 | feed_dict[eval_summary_phs['timing/im_detect']] = \ 120 | _t['im_detect'].average_time 121 | feed_dict[eval_summary_phs['timing/im_read']] = \ 122 | _t['im_read'].average_time 123 | feed_dict[eval_summary_phs['timing/post_proc']] = \ 124 | _t['misc'].average_time 125 | feed_dict[eval_summary_phs['num_det_per_image']] = \ 126 | num_detection/num_images 127 | 128 | print ('Analyzing detections...') 129 | stats, ims = imdb.do_detection_analysis_in_eval( 130 | FLAGS.eval_dir, global_step) 131 | 132 | eval_summary_str = sess.run(eval_summary_ops, feed_dict=feed_dict) 133 | for sum_str in eval_summary_str: 134 | summary_writer.add_summary(sum_str, global_step) 135 | 136 | def evaluate(): 137 | """Evaluate.""" 138 | assert FLAGS.dataset == 'KITTI', \ 139 | 'Currently only supports KITTI dataset' 140 | 141 | os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu 142 | 143 | with tf.Graph().as_default() as g: 144 | 145 | assert FLAGS.net == 'vgg16' or FLAGS.net == 'resnet50' \ 146 | or FLAGS.net == 'squeezeDet' or FLAGS.net == 'squeezeDet+', \ 147 | 'Selected neural net architecture not supported: {}'.format(FLAGS.net) 148 | if FLAGS.net == 'vgg16': 149 | mc = kitti_vgg16_config() 150 | mc.BATCH_SIZE = 1 # TODO(bichen): allow batch size > 1 151 | mc.LOAD_PRETRAINED_MODEL = False 152 | model = VGG16ConvDet(mc) 153 | elif FLAGS.net == 'resnet50': 154 | mc = kitti_res50_config() 155 | mc.BATCH_SIZE = 1 # TODO(bichen): allow batch size > 1 156 | mc.LOAD_PRETRAINED_MODEL = False 157 | model = ResNet50ConvDet(mc) 158 | elif FLAGS.net == 'squeezeDet': 159 | mc = kitti_squeezeDet_config() 160 | mc.BATCH_SIZE = 1 # TODO(bichen): allow batch size > 1 161 | mc.LOAD_PRETRAINED_MODEL = False 162 | model = SqueezeDet(mc) 163 | elif FLAGS.net == 'squeezeDet+': 164 | mc = kitti_squeezeDetPlus_config() 165 | mc.BATCH_SIZE = 1 # TODO(bichen): allow batch size > 1 166 | mc.LOAD_PRETRAINED_MODEL = False 167 | model = SqueezeDetPlus(mc) 168 | 169 | imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc) 170 | 171 | # add summary ops and placeholders 172 | ap_names = [] 173 | for cls in imdb.classes: 174 | ap_names.append(cls+'_easy') 175 | ap_names.append(cls+'_medium') 176 | ap_names.append(cls+'_hard') 177 | 178 | eval_summary_ops = [] 179 | eval_summary_phs = {} 180 | for ap_name in ap_names: 181 | ph = tf.placeholder(tf.float32) 182 | eval_summary_phs['APs/'+ap_name] = ph 183 | eval_summary_ops.append(tf.summary.scalar('APs/'+ap_name, ph)) 184 | 185 | ph = tf.placeholder(tf.float32) 186 | eval_summary_phs['APs/mAP'] = ph 187 | eval_summary_ops.append(tf.summary.scalar('APs/mAP', ph)) 188 | 189 | ph = tf.placeholder(tf.float32) 190 | eval_summary_phs['timing/im_detect'] = ph 191 | eval_summary_ops.append(tf.summary.scalar('timing/im_detect', ph)) 192 | 193 | ph = tf.placeholder(tf.float32) 194 | eval_summary_phs['timing/im_read'] = ph 195 | eval_summary_ops.append(tf.summary.scalar('timing/im_read', ph)) 196 | 197 | ph = tf.placeholder(tf.float32) 198 | eval_summary_phs['timing/post_proc'] = ph 199 | eval_summary_ops.append(tf.summary.scalar('timing/post_proc', ph)) 200 | 201 | ph = tf.placeholder(tf.float32) 202 | eval_summary_phs['num_det_per_image'] = ph 203 | 
eval_summary_ops.append(tf.summary.scalar('num_det_per_image', ph))
204 | 
205 |     saver = tf.train.Saver(model.model_params)
206 | 
207 |     summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)
208 | 
209 |     ckpts = set()
210 |     while True:
211 |       if FLAGS.run_once:
212 |         # When run_once is true, checkpoint_path should point to the exact
213 |         # checkpoint file.
214 |         eval_once(
215 |             saver, FLAGS.checkpoint_path, summary_writer, eval_summary_ops,
216 |             eval_summary_phs, imdb, model)
217 |         return
218 |       else:
219 |         # When run_once is false, checkpoint_path should point to the directory
220 |         # that stores checkpoint files.
221 |         ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
222 |         if ckpt and ckpt.model_checkpoint_path:
223 |           if ckpt.model_checkpoint_path in ckpts:
224 |             # Do not evaluate the same checkpoint twice
225 |             print ('Wait {:d}s for new checkpoints to be saved ... '
226 |                   .format(FLAGS.eval_interval_secs))
227 |             time.sleep(FLAGS.eval_interval_secs)
228 |           else:
229 |             ckpts.add(ckpt.model_checkpoint_path)
230 |             print ('Evaluating {}...'.format(ckpt.model_checkpoint_path))
231 |             eval_once(
232 |                 saver, ckpt.model_checkpoint_path, summary_writer,
233 |                 eval_summary_ops, eval_summary_phs, imdb, model)
234 |         else:
235 |           print('No checkpoint file found')
236 |           if not FLAGS.run_once:
237 |             print ('Wait {:d}s for new checkpoints to be saved ... '
238 |                   .format(FLAGS.eval_interval_secs))
239 |             time.sleep(FLAGS.eval_interval_secs)
240 | 
241 | 
242 | def main(argv=None):  # pylint: disable=unused-argument
243 |   if tf.gfile.Exists(FLAGS.eval_dir):
244 |     tf.gfile.DeleteRecursively(FLAGS.eval_dir)
245 |   tf.gfile.MakeDirs(FLAGS.eval_dir)
246 |   evaluate()
247 | 
248 | 
249 | if __name__ == '__main__':
250 |   tf.app.run()
251 | 
--------------------------------------------------------------------------------
/src/dataset/imdb.py:
--------------------------------------------------------------------------------
1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016
2 | 
3 | """The database wrapper class."""
4 | 
5 | import os
6 | import random
7 | import shutil
8 | 
9 | from PIL import Image, ImageFont, ImageDraw
10 | import cv2
11 | import numpy as np
12 | from utils.util import iou, batch_iou
13 | 
14 | class imdb(object):
15 |   """Image database."""
16 | 
17 |   def __init__(self, name, mc):
18 |     self._name = name
19 |     self._classes = []
20 |     self._image_set = []
21 |     self._image_idx = []
22 |     self._data_root_path = []
23 |     self._rois = {}
24 |     self.mc = mc
25 | 
26 |     # batch reader
27 |     self._perm_idx = None
28 |     self._cur_idx = 0
29 | 
30 |   @property
31 |   def name(self):
32 |     return self._name
33 | 
34 |   @property
35 |   def classes(self):
36 |     return self._classes
37 | 
38 |   @property
39 |   def num_classes(self):
40 |     return len(self._classes)
41 | 
42 |   @property
43 |   def image_idx(self):
44 |     return self._image_idx
45 | 
46 |   @property
47 |   def image_set(self):
48 |     return self._image_set
49 | 
50 |   @property
51 |   def data_root_path(self):
52 |     return self._data_root_path
53 | 
54 |   @property
55 |   def year(self):
56 |     return self._year
57 | 
58 |   def _shuffle_image_idx(self):
59 |     self._perm_idx = [self._image_idx[i] for i in
60 |         np.random.permutation(np.arange(len(self._image_idx)))]
61 |     self._cur_idx = 0
62 | 
63 |   def read_image_batch(self, shuffle=True):
64 |     """Only read a batch of images, without annotations.
65 |     Args:
66 |       shuffle: whether or not to shuffle the dataset
67 |     Returns:
68 |       images: length batch_size list of arrays [height, width, 3]
69 |     """
70 |     mc = self.mc
71 |     if shuffle:
72 |       if self._cur_idx + mc.BATCH_SIZE >= len(self._image_idx):
73 |         self._shuffle_image_idx()
74 |       batch_idx = self._perm_idx[self._cur_idx:self._cur_idx+mc.BATCH_SIZE]
75 |       self._cur_idx += mc.BATCH_SIZE
76 |     else:
77 |       if self._cur_idx + mc.BATCH_SIZE >= len(self._image_idx):
78 |         batch_idx = self._image_idx[self._cur_idx:] \
79 |             + self._image_idx[:self._cur_idx + mc.BATCH_SIZE-len(self._image_idx)]
80 |         self._cur_idx += mc.BATCH_SIZE - len(self._image_idx)
81 |       else:
82 |         batch_idx = self._image_idx[self._cur_idx:self._cur_idx+mc.BATCH_SIZE]
83 |         self._cur_idx += mc.BATCH_SIZE
84 | 
85 |     images, scales = [], []
86 |     for i in batch_idx:
87 |       im = cv2.imread(self._image_path_at(i))
88 |       im = im.astype(np.float32, copy=False)
89 |       im -= mc.BGR_MEANS
90 |       orig_h, orig_w, _ = [float(v) for v in im.shape]
91 |       im = cv2.resize(im, (mc.IMAGE_WIDTH, mc.IMAGE_HEIGHT))
92 |       x_scale = mc.IMAGE_WIDTH/orig_w
93 |       y_scale = mc.IMAGE_HEIGHT/orig_h
94 |       images.append(im)
95 |       scales.append((x_scale, y_scale))
96 | 
97 |     return images, scales
98 | 
99 |   def read_batch(self, shuffle=True):
100 |     """Read a batch of image and bounding box annotations.
101 |     Args:
102 |       shuffle: whether or not to shuffle the dataset
103 |     Returns:
104 |       image_per_batch: images. Shape: batch_size x height x width x [b, g, r]
105 |       label_per_batch: labels. Shape: batch_size x object_num
106 |       delta_per_batch: bounding box deltas. Shape: batch_size x object_num x
107 |           [dx, dy, dw, dh]
108 |       aidx_per_batch: index of anchors that are responsible for prediction.
109 |           Shape: batch_size x object_num
110 |       bbox_per_batch: scaled bounding boxes. Shape: batch_size x object_num x
111 |           [cx, cy, w, h]
112 |     """
113 |     mc = self.mc
114 | 
115 |     if shuffle:
116 |       if self._cur_idx + mc.BATCH_SIZE >= len(self._image_idx):
117 |         self._shuffle_image_idx()
118 |       batch_idx = self._perm_idx[self._cur_idx:self._cur_idx+mc.BATCH_SIZE]
119 |       self._cur_idx += mc.BATCH_SIZE
120 |     else:
121 |       if self._cur_idx + mc.BATCH_SIZE >= len(self._image_idx):
122 |         batch_idx = self._image_idx[self._cur_idx:] \
123 |             + self._image_idx[:self._cur_idx + mc.BATCH_SIZE-len(self._image_idx)]
124 |         self._cur_idx += mc.BATCH_SIZE - len(self._image_idx)
125 |       else:
126 |         batch_idx = self._image_idx[self._cur_idx:self._cur_idx+mc.BATCH_SIZE]
127 |         self._cur_idx += mc.BATCH_SIZE
128 | 
129 |     image_per_batch = []
130 |     label_per_batch = []
131 |     bbox_per_batch = []
132 |     delta_per_batch = []
133 |     aidx_per_batch = []
134 |     if mc.DEBUG_MODE:
135 |       avg_ious = 0.
136 |       num_objects = 0.
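# Track IoU statistics between ground-truth boxes and their matched anchors
# (collected only in DEBUG_MODE; printed at the end of the batch).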
137 |       max_iou = 0.0
138 |       min_iou = 1.0
139 |       num_zero_iou_obj = 0
140 | 
141 |     for idx in batch_idx:
142 |       # load the image
143 |       im = cv2.imread(self._image_path_at(idx)).astype(np.float32, copy=False)
144 |       im -= mc.BGR_MEANS
145 |       orig_h, orig_w, _ = [float(v) for v in im.shape]
146 | 
147 |       # load annotations
148 |       label_per_batch.append([b[4] for b in self._rois[idx][:]])
149 |       gt_bbox = np.array([[b[0], b[1], b[2], b[3]] for b in self._rois[idx][:]])
150 | 
151 |       if mc.DATA_AUGMENTATION:
152 |         assert mc.DRIFT_X >= 0 and mc.DRIFT_Y >= 0, \
153 |             'mc.DRIFT_X and mc.DRIFT_Y must be >= 0'
154 | 
155 |         if mc.DRIFT_X > 0 or mc.DRIFT_Y > 0:
156 |           # Ensure that the ground truth bounding box is not cut out of the image
157 |           max_drift_x = min(gt_bbox[:, 0] - gt_bbox[:, 2]/2.0+1)
158 |           max_drift_y = min(gt_bbox[:, 1] - gt_bbox[:, 3]/2.0+1)
159 |           assert max_drift_x >= 0 and max_drift_y >= 0, 'bbox out of image'
160 | 
161 |           dy = np.random.randint(-mc.DRIFT_Y, min(mc.DRIFT_Y+1, max_drift_y))
162 |           dx = np.random.randint(-mc.DRIFT_X, min(mc.DRIFT_X+1, max_drift_x))
163 | 
164 |           # shift bbox
165 |           gt_bbox[:, 0] = gt_bbox[:, 0] - dx
166 |           gt_bbox[:, 1] = gt_bbox[:, 1] - dy
167 | 
168 |           # distort image
169 |           orig_h -= dy
170 |           orig_w -= dx
171 |           orig_x, dist_x = max(dx, 0), max(-dx, 0)
172 |           orig_y, dist_y = max(dy, 0), max(-dy, 0)
173 | 
174 |           distorted_im = np.zeros(
175 |               (int(orig_h), int(orig_w), 3)).astype(np.float32)
176 |           distorted_im[dist_y:, dist_x:, :] = im[orig_y:, orig_x:, :]
177 |           im = distorted_im
178 | 
179 |         # Flip image with 50% probability
180 |         if np.random.randint(2) > 0.5:
181 |           im = im[:, ::-1, :]
182 |           gt_bbox[:, 0] = orig_w - 1 - gt_bbox[:, 0]
183 | 
184 |       # scale image
185 |       im = cv2.resize(im, (mc.IMAGE_WIDTH, mc.IMAGE_HEIGHT))
186 |       image_per_batch.append(im)
187 | 
188 |       # scale annotation
189 |       x_scale = mc.IMAGE_WIDTH/orig_w
190 |       y_scale = mc.IMAGE_HEIGHT/orig_h
191 |       gt_bbox[:, 0::2] = gt_bbox[:, 0::2]*x_scale
192 |       gt_bbox[:, 1::2] = gt_bbox[:, 1::2]*y_scale
193 |       bbox_per_batch.append(gt_bbox)
194 | 
195 |       aidx_per_image, delta_per_image = [], []
196 |       aidx_set = set()
197 |       for i in range(len(gt_bbox)):
198 |         overlaps = batch_iou(mc.ANCHOR_BOX, gt_bbox[i])
199 | 
200 |         aidx = len(mc.ANCHOR_BOX)
201 |         for ov_idx in np.argsort(overlaps)[::-1]:
202 |           if overlaps[ov_idx] <= 0:
203 |             if mc.DEBUG_MODE:
204 |               min_iou = min(overlaps[ov_idx], min_iou)
205 |               num_objects += 1
206 |               num_zero_iou_obj += 1
207 |             break
208 |           if ov_idx not in aidx_set:
209 |             aidx_set.add(ov_idx)
210 |             aidx = ov_idx
211 |             if mc.DEBUG_MODE:
212 |               max_iou = max(overlaps[ov_idx], max_iou)
213 |               min_iou = min(overlaps[ov_idx], min_iou)
214 |               avg_ious += overlaps[ov_idx]
215 |               num_objects += 1
216 |             break
217 | 
218 |         if aidx == len(mc.ANCHOR_BOX):
219 |           # even the largest available overlap is 0; thus, choose the anchor
220 |           # with the smallest squared distance instead
221 |           dist = np.sum(np.square(gt_bbox[i] - mc.ANCHOR_BOX), axis=1)
222 |           for dist_idx in np.argsort(dist):
223 |             if dist_idx not in aidx_set:
224 |               aidx_set.add(dist_idx)
225 |               aidx = dist_idx
226 |               break
227 | 
228 |         box_cx, box_cy, box_w, box_h = gt_bbox[i]
229 |         delta = [0]*4
230 |         delta[0] = (box_cx - mc.ANCHOR_BOX[aidx][0])/mc.ANCHOR_BOX[aidx][2]
231 |         delta[1] = (box_cy - mc.ANCHOR_BOX[aidx][1])/mc.ANCHOR_BOX[aidx][3]
232 |         delta[2] = np.log(box_w/mc.ANCHOR_BOX[aidx][2])
233 |         delta[3] = np.log(box_h/mc.ANCHOR_BOX[aidx][3])
234 | 
235 |         aidx_per_image.append(aidx)
236 |         delta_per_image.append(delta)
237 | 
238 |       delta_per_batch.append(delta_per_image)
239 |
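# Each delta is encoded relative to its matched anchor: (dx, dy) are center
# offsets normalized by the anchor's width and height, and (dw, dh) are
# log-scale size ratios (see the delta computation above).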
aidx_per_batch.append(aidx_per_image) 240 | 241 | if mc.DEBUG_MODE: 242 | print ('max iou: {}'.format(max_iou)) 243 | print ('min iou: {}'.format(min_iou)) 244 | print ('avg iou: {}'.format(avg_ious/num_objects)) 245 | print ('number of objects: {}'.format(num_objects)) 246 | print ('number of objects with 0 iou: {}'.format(num_zero_iou_obj)) 247 | 248 | return image_per_batch, label_per_batch, delta_per_batch, \ 249 | aidx_per_batch, bbox_per_batch 250 | 251 | def evaluate_detections(self): 252 | raise NotImplementedError 253 | 254 | def visualize_detections( 255 | self, image_dir, image_format, det_error_file, output_image_dir, 256 | num_det_per_type=10): 257 | 258 | # load detections 259 | with open(det_error_file) as f: 260 | lines = f.readlines() 261 | random.shuffle(lines) 262 | f.close() 263 | 264 | dets_per_type = {} 265 | for line in lines: 266 | obj = line.strip().split(' ') 267 | error_type = obj[1] 268 | if error_type not in dets_per_type: 269 | dets_per_type[error_type] = [{ 270 | 'im_idx':obj[0], 271 | 'bbox':[float(obj[2]), float(obj[3]), float(obj[4]), float(obj[5])], 272 | 'class':obj[6], 273 | 'score': float(obj[7]) 274 | }] 275 | else: 276 | dets_per_type[error_type].append({ 277 | 'im_idx':obj[0], 278 | 'bbox':[float(obj[2]), float(obj[3]), float(obj[4]), float(obj[5])], 279 | 'class':obj[6], 280 | 'score': float(obj[7]) 281 | }) 282 | 283 | out_ims = [] 284 | # Randomly select some detections and plot them 285 | COLOR = (200, 200, 0) 286 | for error_type, dets in dets_per_type.iteritems(): 287 | det_im_dir = os.path.join(output_image_dir, error_type) 288 | if os.path.exists(det_im_dir): 289 | shutil.rmtree(det_im_dir) 290 | os.makedirs(det_im_dir) 291 | 292 | for i in range(min(num_det_per_type, len(dets))): 293 | det = dets[i] 294 | im = Image.open( 295 | os.path.join(image_dir, det['im_idx']+image_format)) 296 | draw = ImageDraw.Draw(im) 297 | draw.rectangle(det['bbox'], outline=COLOR) 298 | draw.text((det['bbox'][0], det['bbox'][1]), 299 | '{:s} ({:.2f})'.format(det['class'], det['score']), 300 | fill=COLOR) 301 | out_im_path = os.path.join(det_im_dir, str(i)+image_format) 302 | im.save(out_im_path) 303 | im = np.array(im) 304 | out_ims.append(im[:,:,::-1]) # RGB to BGR 305 | return out_ims 306 | 307 | -------------------------------------------------------------------------------- /src/dataset/kitti.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """Image data base class for kitti""" 4 | 5 | import cv2 6 | import os 7 | import numpy as np 8 | import subprocess 9 | 10 | from dataset.imdb import imdb 11 | from utils.util import bbox_transform_inv, batch_iou 12 | 13 | class kitti(imdb): 14 | def __init__(self, image_set, data_path, mc): 15 | imdb.__init__(self, 'kitti_'+image_set, mc) 16 | self._image_set = image_set 17 | self._data_root_path = data_path 18 | self._image_path = os.path.join(self._data_root_path, 'training', 'image_2') 19 | self._label_path = os.path.join(self._data_root_path, 'training', 'label_2') 20 | self._classes = self.mc.CLASS_NAMES 21 | self._class_to_idx = dict(zip(self.classes, xrange(self.num_classes))) 22 | 23 | # a list of string indices of images in the directory 24 | self._image_idx = self._load_image_set_idx() 25 | # a dict of image_idx -> [[cx, cy, w, h, cls_idx]]. 
x,y,w,h are not divided by 26 | # the image width and height 27 | self._rois = self._load_kitti_annotation() 28 | 29 | ## batch reader ## 30 | self._perm_idx = None 31 | self._cur_idx = 0 32 | # TODO(bichen): add a random seed as parameter 33 | self._shuffle_image_idx() 34 | 35 | self._eval_tool = './src/dataset/kitti-eval/cpp/evaluate_object' 36 | 37 | def _load_image_set_idx(self): 38 | image_set_file = os.path.join( 39 | self._data_root_path, 'ImageSets', self._image_set+'.txt') 40 | assert os.path.exists(image_set_file), \ 41 | 'File does not exist: {}'.format(image_set_file) 42 | 43 | with open(image_set_file) as f: 44 | image_idx = [x.strip() for x in f.readlines()] 45 | return image_idx 46 | 47 | def _image_path_at(self, idx): 48 | image_path = os.path.join(self._image_path, idx+'.png') 49 | assert os.path.exists(image_path), \ 50 | 'Image does not exist: {}'.format(image_path) 51 | return image_path 52 | 53 | def _load_kitti_annotation(self): 54 | def _get_obj_level(obj): 55 | height = float(obj[7]) - float(obj[5]) + 1 56 | truncation = float(obj[1]) 57 | occlusion = float(obj[2]) 58 | if height >= 40 and truncation <= 0.15 and occlusion <= 0: 59 | return 1 60 | elif height >= 25 and truncation <= 0.3 and occlusion <= 1: 61 | return 2 62 | elif height >= 25 and truncation <= 0.5 and occlusion <= 2: 63 | return 3 64 | else: 65 | return 4 66 | 67 | idx2annotation = {} 68 | for index in self._image_idx: 69 | filename = os.path.join(self._label_path, index+'.txt') 70 | with open(filename, 'r') as f: 71 | lines = f.readlines() 72 | f.close() 73 | bboxes = [] 74 | for line in lines: 75 | obj = line.strip().split(' ') 76 | try: 77 | cls = self._class_to_idx[obj[0].lower().strip()] 78 | except: 79 | continue 80 | 81 | if self.mc.EXCLUDE_HARD_EXAMPLES and _get_obj_level(obj) > 3: 82 | continue 83 | xmin = float(obj[4]) 84 | ymin = float(obj[5]) 85 | xmax = float(obj[6]) 86 | ymax = float(obj[7]) 87 | assert xmin >= 0.0 and xmin <= xmax, \ 88 | 'Invalid bounding box x-coord xmin {} or xmax {} at {}.txt' \ 89 | .format(xmin, xmax, index) 90 | assert ymin >= 0.0 and ymin <= ymax, \ 91 | 'Invalid bounding box y-coord ymin {} or ymax {} at {}.txt' \ 92 | .format(ymin, ymax, index) 93 | x, y, w, h = bbox_transform_inv([xmin, ymin, xmax, ymax]) 94 | bboxes.append([x, y, w, h, cls]) 95 | 96 | idx2annotation[index] = bboxes 97 | 98 | return idx2annotation 99 | 100 | def evaluate_detections(self, eval_dir, global_step, all_boxes): 101 | """Evaluate detection results. 102 | Args: 103 | eval_dir: directory to write evaluation logs 104 | global_step: step of the checkpoint 105 | all_boxes: all_boxes[cls][image] = N x 5 arrays of 106 | [xmin, ymin, xmax, ymax, score] 107 | Returns: 108 | aps: array of average precisions. 
109 | names: class names corresponding to each ap 110 | """ 111 | det_file_dir = os.path.join( 112 | eval_dir, 'detection_files_{:s}'.format(global_step), 'data') 113 | if not os.path.isdir(det_file_dir): 114 | os.makedirs(det_file_dir) 115 | 116 | for im_idx, index in enumerate(self._image_idx): 117 | filename = os.path.join(det_file_dir, index+'.txt') 118 | with open(filename, 'wt') as f: 119 | for cls_idx, cls in enumerate(self._classes): 120 | dets = all_boxes[cls_idx][im_idx] 121 | for k in xrange(len(dets)): 122 | f.write( 123 | '{:s} -1 -1 0.0 {:.2f} {:.2f} {:.2f} {:.2f} 0.0 0.0 0.0 0.0 0.0 ' 124 | '0.0 0.0 {:.3f}\n'.format( 125 | cls.lower(), dets[k][0], dets[k][1], dets[k][2], dets[k][3], 126 | dets[k][4]) 127 | ) 128 | 129 | cmd = self._eval_tool + ' ' \ 130 | + os.path.join(self._data_root_path, 'training') + ' ' \ 131 | + os.path.join(self._data_root_path, 'ImageSets', 132 | self._image_set+'.txt') + ' ' \ 133 | + os.path.dirname(det_file_dir) + ' ' + str(len(self._image_idx)) 134 | 135 | print('Running: {}'.format(cmd)) 136 | status = subprocess.call(cmd, shell=True) 137 | 138 | aps = [] 139 | names = [] 140 | for cls in self._classes: 141 | det_file_name = os.path.join( 142 | os.path.dirname(det_file_dir), 'stats_{:s}_ap.txt'.format(cls)) 143 | if os.path.exists(det_file_name): 144 | with open(det_file_name, 'r') as f: 145 | lines = f.readlines() 146 | assert len(lines) == 3, \ 147 | 'Line number of {} should be 3'.format(det_file_name) 148 | 149 | aps.append(float(lines[0].split('=')[1].strip())) 150 | aps.append(float(lines[1].split('=')[1].strip())) 151 | aps.append(float(lines[2].split('=')[1].strip())) 152 | else: 153 | aps.extend([0.0, 0.0, 0.0]) 154 | 155 | names.append(cls+'_easy') 156 | names.append(cls+'_medium') 157 | names.append(cls+'_hard') 158 | 159 | return aps, names 160 | 161 | def do_detection_analysis_in_eval(self, eval_dir, global_step): 162 | det_file_dir = os.path.join( 163 | eval_dir, 'detection_files_{:s}'.format(global_step), 'data') 164 | det_error_dir = os.path.join( 165 | eval_dir, 'detection_files_{:s}'.format(global_step), 166 | 'error_analysis') 167 | if not os.path.exists(det_error_dir): 168 | os.makedirs(det_error_dir) 169 | det_error_file = os.path.join(det_error_dir, 'det_error_file.txt') 170 | 171 | stats = self.analyze_detections(det_file_dir, det_error_file) 172 | ims = self.visualize_detections( 173 | image_dir=self._image_path, 174 | image_format='.png', 175 | det_error_file=det_error_file, 176 | output_image_dir=det_error_dir, 177 | num_det_per_type=10 178 | ) 179 | 180 | return stats, ims 181 | 182 | def analyze_detections(self, detection_file_dir, det_error_file): 183 | def _save_detection(f, idx, error_type, det, score): 184 | f.write( 185 | '{:s} {:s} {:.1f} {:.1f} {:.1f} {:.1f} {:s} {:.3f}\n'.format( 186 | idx, error_type, 187 | det[0]-det[2]/2., det[1]-det[3]/2., 188 | det[0]+det[2]/2., det[1]+det[3]/2., 189 | self._classes[int(det[4])], 190 | score 191 | ) 192 | ) 193 | 194 | # load detections 195 | self._det_rois = {} 196 | for idx in self._image_idx: 197 | det_file_name = os.path.join(detection_file_dir, idx+'.txt') 198 | with open(det_file_name) as f: 199 | lines = f.readlines() 200 | f.close() 201 | bboxes = [] 202 | for line in lines: 203 | obj = line.strip().split(' ') 204 | cls = self._class_to_idx[obj[0].lower().strip()] 205 | xmin = float(obj[4]) 206 | ymin = float(obj[5]) 207 | xmax = float(obj[6]) 208 | ymax = float(obj[7]) 209 | score = float(obj[-1]) 210 | 211 | x, y, w, h = bbox_transform_inv([xmin, ymin, xmax, 
ymax]) 212 | bboxes.append([x, y, w, h, cls, score]) 213 | bboxes.sort(key=lambda x: x[-1], reverse=True) 214 | self._det_rois[idx] = bboxes 215 | 216 | # do error analysis 217 | num_objs = 0. 218 | num_dets = 0. 219 | num_correct = 0. 220 | num_loc_error = 0. 221 | num_cls_error = 0. 222 | num_bg_error = 0. 223 | num_repeated_error = 0. 224 | num_detected_obj = 0. 225 | 226 | with open(det_error_file, 'w') as f: 227 | for idx in self._image_idx: 228 | gt_bboxes = np.array(self._rois[idx]) 229 | num_objs += len(gt_bboxes) 230 | detected = [False]*len(gt_bboxes) 231 | 232 | det_bboxes = self._det_rois[idx] 233 | if len(gt_bboxes) < 1: 234 | continue 235 | 236 | for i, det in enumerate(det_bboxes): 237 | if i < len(gt_bboxes): 238 | num_dets += 1 239 | ious = batch_iou(gt_bboxes[:, :4], det[:4]) 240 | max_iou = np.max(ious) 241 | gt_idx = np.argmax(ious) 242 | if max_iou > 0.1: 243 | if gt_bboxes[gt_idx, 4] == det[4]: 244 | if max_iou >= 0.5: 245 | if i < len(gt_bboxes): 246 | if not detected[gt_idx]: 247 | num_correct += 1 248 | detected[gt_idx] = True 249 | else: 250 | num_repeated_error += 1 251 | else: 252 | if i < len(gt_bboxes): 253 | num_loc_error += 1 254 | _save_detection(f, idx, 'loc', det, det[5]) 255 | else: 256 | if i < len(gt_bboxes): 257 | num_cls_error += 1 258 | _save_detection(f, idx, 'cls', det, det[5]) 259 | else: 260 | if i < len(gt_bboxes): 261 | num_bg_error += 1 262 | _save_detection(f, idx, 'bg', det, det[5]) 263 | 264 | for i, gt in enumerate(gt_bboxes): 265 | if not detected[i]: 266 | _save_detection(f, idx, 'missed', gt, -1.0) 267 | num_detected_obj += sum(detected) 268 | f.close() 269 | 270 | print ('Detection Analysis:') 271 | print (' Number of detections: {}'.format(num_dets)) 272 | print (' Number of objects: {}'.format(num_objs)) 273 | print (' Percentage of correct detections: {}'.format( 274 | num_correct/num_dets)) 275 | print (' Percentage of localization error: {}'.format( 276 | num_loc_error/num_dets)) 277 | print (' Percentage of classification error: {}'.format( 278 | num_cls_error/num_dets)) 279 | print (' Percentage of background error: {}'.format( 280 | num_bg_error/num_dets)) 281 | print (' Percentage of repeated detections: {}'.format( 282 | num_repeated_error/num_dets)) 283 | print (' Recall: {}'.format( 284 | num_detected_obj/num_objs)) 285 | 286 | out = {} 287 | out['num of detections'] = num_dets 288 | out['num of objects'] = num_objs 289 | out['% correct detections'] = num_correct/num_dets 290 | out['% localization error'] = num_loc_error/num_dets 291 | out['% classification error'] = num_cls_error/num_dets 292 | out['% background error'] = num_bg_error/num_dets 293 | out['% repeated error'] = num_repeated_error/num_dets 294 | out['% recall'] = num_detected_obj/num_objs 295 | 296 | return out 297 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """Train""" 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import cv2 10 | from datetime import datetime 11 | import os.path 12 | import sys 13 | import time 14 | 15 | import numpy as np 16 | from six.moves import xrange 17 | import tensorflow as tf 18 | import threading 19 | 20 | from config import * 21 | from dataset import pascal_voc, kitti 22 | from utils.util import sparse_to_dense, bgr_to_rgb, bbox_transform 23 | from 
nets import *
24 | 
25 | FLAGS = tf.app.flags.FLAGS
26 | 
27 | tf.app.flags.DEFINE_string('dataset', 'KITTI',
28 |                            """Currently only support KITTI dataset.""")
29 | tf.app.flags.DEFINE_string('data_path', '', """Root directory of data""")
30 | tf.app.flags.DEFINE_string('image_set', 'train',
31 |                            """Can be train, trainval, val, or test""")
32 | tf.app.flags.DEFINE_string('year', '2007',
33 |                            """VOC challenge year. 2007 or 2012"""
34 |                            """Only used for Pascal VOC dataset""")
35 | tf.app.flags.DEFINE_string('train_dir', '/tmp/bichen/logs/squeezeDet/train',
36 |                            """Directory where to write event logs """
37 |                            """and checkpoint.""")
38 | tf.app.flags.DEFINE_integer('max_steps', 1000000,
39 |                             """Maximum number of batches to run.""")
40 | tf.app.flags.DEFINE_string('net', 'squeezeDet',
41 |                            """Neural net architecture. """)
42 | tf.app.flags.DEFINE_string('pretrained_model_path', '',
43 |                            """Path to the pretrained model.""")
44 | tf.app.flags.DEFINE_integer('summary_step', 10,
45 |                             """Number of steps to save summary.""")
46 | tf.app.flags.DEFINE_integer('checkpoint_step', 1000,
47 |                             """Number of steps to save a checkpoint.""")
48 | tf.app.flags.DEFINE_string('gpu', '0', """gpu id.""")
49 | 
50 | 
51 | def _draw_box(im, box_list, label_list, color=(0,255,0), cdict=None, form='center'):
52 |   assert form == 'center' or form == 'diagonal', \
53 |       'bounding box format not accepted: {}.'.format(form)
54 | 
55 |   for bbox, label in zip(box_list, label_list):
56 | 
57 |     if form == 'center':
58 |       bbox = bbox_transform(bbox)
59 | 
60 |     xmin, ymin, xmax, ymax = [int(b) for b in bbox]
61 | 
62 |     l = label.split(':')[0]  # text before "CLASS: (PROB)"
63 |     if cdict and l in cdict:
64 |       c = cdict[l]
65 |     else:
66 |       c = color
67 | 
68 |     # draw box
69 |     cv2.rectangle(im, (xmin, ymin), (xmax, ymax), c, 1)
70 |     # draw label
71 |     font = cv2.FONT_HERSHEY_SIMPLEX
72 |     cv2.putText(im, label, (xmin, ymax), font, 0.3, c, 1)
73 | 
74 | def _viz_prediction_result(model, images, bboxes, labels, batch_det_bbox,
75 |                            batch_det_class, batch_det_prob):
76 |   mc = model.mc
77 | 
78 |   for i in range(len(images)):
79 |     # draw ground truth
80 |     _draw_box(
81 |         images[i], bboxes[i],
82 |         [mc.CLASS_NAMES[idx] for idx in labels[i]],
83 |         (0, 255, 0))
84 | 
85 |     # draw prediction
86 |     det_bbox, det_prob, det_class = model.filter_prediction(
87 |         batch_det_bbox[i], batch_det_prob[i], batch_det_class[i])
88 | 
89 |     keep_idx = [idx for idx in range(len(det_prob)) \
90 |                 if det_prob[idx] > mc.PLOT_PROB_THRESH]
91 |     det_bbox = [det_bbox[idx] for idx in keep_idx]
92 |     det_prob = [det_prob[idx] for idx in keep_idx]
93 |     det_class = [det_class[idx] for idx in keep_idx]
94 | 
95 |     _draw_box(
96 |         images[i], det_bbox,
97 |         [mc.CLASS_NAMES[idx]+': (%.2f)'% prob \
98 |             for idx, prob in zip(det_class, det_prob)],
99 |         (0, 0, 255))
100 | 
101 | 
102 | def train():
103 |   """Train SqueezeDet model"""
104 |   assert FLAGS.dataset == 'KITTI', \
105 |       'Currently only support KITTI dataset'
106 | 
107 |   os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu
108 | 
109 |   with tf.Graph().as_default():
110 | 
111 |     assert FLAGS.net == 'vgg16' or FLAGS.net == 'resnet50' \
112 |         or FLAGS.net == 'squeezeDet' or FLAGS.net == 'squeezeDet+', \
113 |         'Selected neural net architecture not supported: {}'.format(FLAGS.net)
114 |     if FLAGS.net == 'vgg16':
115 |       mc = kitti_vgg16_config()
116 |       mc.IS_TRAINING = True
117 |       mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path
118 |       model = VGG16ConvDet(mc)
119 |     elif FLAGS.net == 'resnet50':
120 |       mc = kitti_res50_config()
121 |       mc.IS_TRAINING = True
122 |
mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path 123 | model = ResNet50ConvDet(mc) 124 | elif FLAGS.net == 'squeezeDet': 125 | mc = kitti_squeezeDet_config() 126 | mc.IS_TRAINING = True 127 | mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path 128 | model = SqueezeDet(mc) 129 | elif FLAGS.net == 'squeezeDet+': 130 | mc = kitti_squeezeDetPlus_config() 131 | mc.IS_TRAINING = True 132 | mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path 133 | model = SqueezeDetPlus(mc) 134 | 135 | imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc) 136 | 137 | # save model size, flops, activations by layers 138 | with open(os.path.join(FLAGS.train_dir, 'model_metrics.txt'), 'w') as f: 139 | f.write('Number of parameter by layer:\n') 140 | count = 0 141 | for c in model.model_size_counter: 142 | f.write('\t{}: {}\n'.format(c[0], c[1])) 143 | count += c[1] 144 | f.write('\ttotal: {}\n'.format(count)) 145 | 146 | count = 0 147 | f.write('\nActivation size by layer:\n') 148 | for c in model.activation_counter: 149 | f.write('\t{}: {}\n'.format(c[0], c[1])) 150 | count += c[1] 151 | f.write('\ttotal: {}\n'.format(count)) 152 | 153 | count = 0 154 | f.write('\nNumber of flops by layer:\n') 155 | for c in model.flop_counter: 156 | f.write('\t{}: {}\n'.format(c[0], c[1])) 157 | count += c[1] 158 | f.write('\ttotal: {}\n'.format(count)) 159 | f.close() 160 | print ('Model statistics saved to {}.'.format( 161 | os.path.join(FLAGS.train_dir, 'model_metrics.txt'))) 162 | 163 | def _load_data(load_to_placeholder=True): 164 | # read batch input 165 | image_per_batch, label_per_batch, box_delta_per_batch, aidx_per_batch, \ 166 | bbox_per_batch = imdb.read_batch() 167 | 168 | label_indices, bbox_indices, box_delta_values, mask_indices, box_values, \ 169 | = [], [], [], [], [] 170 | aidx_set = set() 171 | num_discarded_labels = 0 172 | num_labels = 0 173 | for i in range(len(label_per_batch)): # batch_size 174 | for j in range(len(label_per_batch[i])): # number of annotations 175 | num_labels += 1 176 | if (i, aidx_per_batch[i][j]) not in aidx_set: 177 | aidx_set.add((i, aidx_per_batch[i][j])) 178 | label_indices.append( 179 | [i, aidx_per_batch[i][j], label_per_batch[i][j]]) 180 | mask_indices.append([i, aidx_per_batch[i][j]]) 181 | bbox_indices.extend( 182 | [[i, aidx_per_batch[i][j], k] for k in range(4)]) 183 | box_delta_values.extend(box_delta_per_batch[i][j]) 184 | box_values.extend(bbox_per_batch[i][j]) 185 | else: 186 | num_discarded_labels += 1 187 | 188 | if mc.DEBUG_MODE: 189 | print ('Warning: Discarded {}/({}) labels that are assigned to the same ' 190 | 'anchor'.format(num_discarded_labels, num_labels)) 191 | 192 | if load_to_placeholder: 193 | image_input = model.ph_image_input 194 | input_mask = model.ph_input_mask 195 | box_delta_input = model.ph_box_delta_input 196 | box_input = model.ph_box_input 197 | labels = model.ph_labels 198 | else: 199 | image_input = model.image_input 200 | input_mask = model.input_mask 201 | box_delta_input = model.box_delta_input 202 | box_input = model.box_input 203 | labels = model.labels 204 | 205 | feed_dict = { 206 | image_input: image_per_batch, 207 | input_mask: np.reshape( 208 | sparse_to_dense( 209 | mask_indices, [mc.BATCH_SIZE, mc.ANCHORS], 210 | [1.0]*len(mask_indices)), 211 | [mc.BATCH_SIZE, mc.ANCHORS, 1]), 212 | box_delta_input: sparse_to_dense( 213 | bbox_indices, [mc.BATCH_SIZE, mc.ANCHORS, 4], 214 | box_delta_values), 215 | box_input: sparse_to_dense( 216 | bbox_indices, [mc.BATCH_SIZE, mc.ANCHORS, 4], 217 | box_values), 218 | labels: sparse_to_dense( 
219 | label_indices, 220 | [mc.BATCH_SIZE, mc.ANCHORS, mc.CLASSES], 221 | [1.0]*len(label_indices)), 222 | } 223 | 224 | return feed_dict, image_per_batch, label_per_batch, bbox_per_batch 225 | 226 | def _enqueue(sess, coord): 227 | try: 228 | while not coord.should_stop(): 229 | feed_dict, _, _, _ = _load_data() 230 | sess.run(model.enqueue_op, feed_dict=feed_dict) 231 | if mc.DEBUG_MODE: 232 | print ("added to the queue") 233 | if mc.DEBUG_MODE: 234 | print ("Finished enqueue") 235 | except Exception as e: 236 | coord.request_stop(e) 237 | 238 | sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) 239 | 240 | saver = tf.train.Saver(tf.global_variables()) 241 | summary_op = tf.summary.merge_all() 242 | 243 | init = tf.global_variables_initializer() 244 | sess.run(init) 245 | 246 | ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) 247 | if ckpt and ckpt.model_checkpoint_path: 248 | saver.restore(sess, ckpt.model_checkpoint_path) 249 | 250 | summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph) 251 | 252 | coord = tf.train.Coordinator() 253 | 254 | if mc.NUM_THREAD > 0: 255 | enq_threads = [] 256 | for _ in range(mc.NUM_THREAD): 257 | enq_thread = threading.Thread(target=_enqueue, args=[sess, coord]) 258 | # enq_thread.isDaemon() 259 | enq_thread.start() 260 | enq_threads.append(enq_thread) 261 | 262 | threads = tf.train.start_queue_runners(coord=coord, sess=sess) 263 | run_options = tf.RunOptions(timeout_in_ms=60000) 264 | 265 | # try: 266 | for step in xrange(FLAGS.max_steps): 267 | if coord.should_stop(): 268 | sess.run(model.FIFOQueue.close(cancel_pending_enqueues=True)) 269 | coord.request_stop() 270 | coord.join(threads) 271 | break 272 | 273 | start_time = time.time() 274 | 275 | if step % FLAGS.summary_step == 0: 276 | feed_dict, image_per_batch, label_per_batch, bbox_per_batch = \ 277 | _load_data(load_to_placeholder=False) 278 | op_list = [ 279 | model.train_op, model.loss, summary_op, model.det_boxes, 280 | model.det_probs, model.det_class, model.conf_loss, 281 | model.bbox_loss, model.class_loss 282 | ] 283 | _, loss_value, summary_str, det_boxes, det_probs, det_class, \ 284 | conf_loss, bbox_loss, class_loss = sess.run( 285 | op_list, feed_dict=feed_dict) 286 | 287 | _viz_prediction_result( 288 | model, image_per_batch, bbox_per_batch, label_per_batch, det_boxes, 289 | det_class, det_probs) 290 | image_per_batch = bgr_to_rgb(image_per_batch) 291 | viz_summary = sess.run( 292 | model.viz_op, feed_dict={model.image_to_show: image_per_batch}) 293 | 294 | summary_writer.add_summary(summary_str, step) 295 | summary_writer.add_summary(viz_summary, step) 296 | summary_writer.flush() 297 | 298 | print ('conf_loss: {}, bbox_loss: {}, class_loss: {}'. 299 | format(conf_loss, bbox_loss, class_loss)) 300 | else: 301 | if mc.NUM_THREAD > 0: 302 | _, loss_value, conf_loss, bbox_loss, class_loss = sess.run( 303 | [model.train_op, model.loss, model.conf_loss, model.bbox_loss, 304 | model.class_loss], options=run_options) 305 | else: 306 | feed_dict, _, _, _ = _load_data(load_to_placeholder=False) 307 | _, loss_value, conf_loss, bbox_loss, class_loss = sess.run( 308 | [model.train_op, model.loss, model.conf_loss, model.bbox_loss, 309 | model.class_loss], feed_dict=feed_dict) 310 | 311 | duration = time.time() - start_time 312 | 313 | assert not np.isnan(loss_value), \ 314 | 'Model diverged. 
Total loss: {}, conf_loss: {}, bbox_loss: {}, ' \ 315 | 'class_loss: {}'.format(loss_value, conf_loss, bbox_loss, class_loss) 316 | 317 | if step % 10 == 0: 318 | num_images_per_step = mc.BATCH_SIZE 319 | images_per_sec = num_images_per_step / duration 320 | sec_per_batch = float(duration) 321 | format_str = ('%s: step %d, loss = %.2f (%.1f images/sec; %.3f ' 322 | 'sec/batch)') 323 | print (format_str % (datetime.now(), step, loss_value, 324 | images_per_sec, sec_per_batch)) 325 | sys.stdout.flush() 326 | 327 | # Save the model checkpoint periodically. 328 | if step % FLAGS.checkpoint_step == 0 or (step + 1) == FLAGS.max_steps: 329 | checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt') 330 | saver.save(sess, checkpoint_path, global_step=step) 331 | # except Exception, e: 332 | # coord.request_stop(e) 333 | # finally: 334 | # coord.request_stop() 335 | # coord.join(threads) 336 | 337 | def main(argv=None): # pylint: disable=unused-argument 338 | if tf.gfile.Exists(FLAGS.train_dir): 339 | tf.gfile.DeleteRecursively(FLAGS.train_dir) 340 | tf.gfile.MakeDirs(FLAGS.train_dir) 341 | train() 342 | 343 | 344 | if __name__ == '__main__': 345 | tf.app.run() 346 | -------------------------------------------------------------------------------- /src/nn_skeleton.py: -------------------------------------------------------------------------------- 1 | # Author: Bichen Wu (bichen@berkeley.edu) 08/25/2016 2 | 3 | """Neural network model base class.""" 4 | 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import os 10 | import sys 11 | 12 | from utils import util 13 | from easydict import EasyDict as edict 14 | import numpy as np 15 | import tensorflow as tf 16 | 17 | 18 | def _add_loss_summaries(total_loss): 19 | """Add summaries for losses 20 | Generates loss summaries for visualizing the performance of the network. 21 | Args: 22 | total_loss: Total loss from loss(). 23 | """ 24 | losses = tf.get_collection('losses') 25 | 26 | # Attach a scalar summary to all individual losses and the total loss; do the 27 | # same for the averaged version of the losses. 28 | for l in losses + [total_loss]: 29 | tf.summary.scalar(l.op.name, l) 30 | 31 | def _variable_on_device(name, shape, initializer, trainable=True): 32 | """Helper to create a Variable. 33 | 34 | Args: 35 | name: name of the variable 36 | shape: list of ints 37 | initializer: initializer for Variable 38 | 39 | Returns: 40 | Variable Tensor 41 | """ 42 | # TODO(bichen): fix the hard-coded data type below 43 | dtype = tf.float32 44 | if not callable(initializer): 45 | var = tf.get_variable(name, initializer=initializer, trainable=trainable) 46 | else: 47 | var = tf.get_variable( 48 | name, shape, initializer=initializer, dtype=dtype, trainable=trainable) 49 | return var 50 | 51 | def _variable_with_weight_decay(name, shape, wd, initializer, trainable=True): 52 | """Helper to create an initialized Variable with weight decay. 53 | 54 | Note that the Variable is initialized with a truncated normal distribution. 55 | A weight decay is added only if one is specified. 56 | 57 | Args: 58 | name: name of the variable 59 | shape: list of ints 60 | wd: add L2Loss weight decay multiplied by this float. If None, weight 61 | decay is not added for this Variable. 
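initializer: initializer for the Variable; either a constant value or a callable initializer. trainable: if true, add the Variable to the TRAINABLE_VARIABLES collection (weight decay is likewise only added for trainable Variables).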
62 | 63 | Returns: 64 | Variable Tensor 65 | """ 66 | var = _variable_on_device(name, shape, initializer, trainable) 67 | if wd is not None and trainable: 68 | weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss') 69 | tf.add_to_collection('losses', weight_decay) 70 | return var 71 | 72 | class ModelSkeleton: 73 | """Base class of NN detection models.""" 74 | def __init__(self, mc): 75 | self.mc = mc 76 | # a scalar value in range (0, 1]. Usually set to 0.5 in training phase and 77 | # 1.0 in evaluation phase 78 | self.keep_prob = 0.5 if mc.IS_TRAINING else 1.0 79 | 80 | # image batch input 81 | self.ph_image_input = tf.placeholder( 82 | tf.float32, [mc.BATCH_SIZE, mc.IMAGE_HEIGHT, mc.IMAGE_WIDTH, 3], 83 | name='image_input' 84 | ) 85 | # A tensor where an element is 1 if the corresponding box is "responsible" 86 | # for detecting an object and 0 otherwise. 87 | self.ph_input_mask = tf.placeholder( 88 | tf.float32, [mc.BATCH_SIZE, mc.ANCHORS, 1], name='box_mask') 89 | # Tensor used to represent bounding box deltas. 90 | self.ph_box_delta_input = tf.placeholder( 91 | tf.float32, [mc.BATCH_SIZE, mc.ANCHORS, 4], name='box_delta_input') 92 | # Tensor used to represent bounding box coordinates. 93 | self.ph_box_input = tf.placeholder( 94 | tf.float32, [mc.BATCH_SIZE, mc.ANCHORS, 4], name='box_input') 95 | # Tensor used to represent labels 96 | self.ph_labels = tf.placeholder( 97 | tf.float32, [mc.BATCH_SIZE, mc.ANCHORS, mc.CLASSES], name='labels') 98 | 99 | # IOU between predicted anchors and ground-truth boxes 100 | self.ious = tf.Variable( 101 | initial_value=np.zeros((mc.BATCH_SIZE, mc.ANCHORS)), trainable=False, 102 | name='iou', dtype=tf.float32 103 | ) 104 | 105 | self.FIFOQueue = tf.FIFOQueue( 106 | capacity=mc.QUEUE_CAPACITY, 107 | dtypes=[tf.float32, tf.float32, tf.float32, 108 | tf.float32, tf.float32], 109 | shapes=[[mc.IMAGE_HEIGHT, mc.IMAGE_WIDTH, 3], 110 | [mc.ANCHORS, 1], 111 | [mc.ANCHORS, 4], 112 | [mc.ANCHORS, 4], 113 | [mc.ANCHORS, mc.CLASSES]], 114 | ) 115 | 116 | self.enqueue_op = self.FIFOQueue.enqueue_many( 117 | [self.ph_image_input, self.ph_input_mask, 118 | self.ph_box_delta_input, self.ph_box_input, self.ph_labels] 119 | ) 120 | 121 | self.image_input, self.input_mask, self.box_delta_input, \ 122 | self.box_input, self.labels = tf.train.batch( 123 | self.FIFOQueue.dequeue(), batch_size=mc.BATCH_SIZE, 124 | capacity=mc.QUEUE_CAPACITY) 125 | 126 | # model parameters 127 | self.model_params = [] 128 | 129 | # model size counter 130 | self.model_size_counter = [] # list of (layer name, parameter size) tuples 131 | # flop counter 132 | self.flop_counter = [] # list of (layer name, flop count) tuples 133 | # activation counter 134 | self.activation_counter = [] # list of (layer name, output activation count) tuples 135 | self.activation_counter.append(('input', mc.IMAGE_WIDTH*mc.IMAGE_HEIGHT*3)) 136 | 137 | 138 | def _add_forward_graph(self): 139 | """NN architecture specification.""" 140 | raise NotImplementedError 141 | 142 | def _add_interpretation_graph(self): 143 | """Interpret NN output.""" 144 | mc = self.mc 145 | 146 | with tf.variable_scope('interpret_output') as scope: 147 | preds = self.preds 148 | 149 | # probability 150 | num_class_probs = mc.ANCHOR_PER_GRID*mc.CLASSES 151 | self.pred_class_probs = tf.reshape( 152 | tf.nn.softmax( 153 | tf.reshape( 154 | preds[:, :, :, :num_class_probs], 155 | [-1, mc.CLASSES] 156 | ) 157 | ), 158 | [mc.BATCH_SIZE, mc.ANCHORS, mc.CLASSES], 159 | name='pred_class_probs' 160 | ) 161 | 162 | # confidence 163 | 
num_confidence_scores = mc.ANCHOR_PER_GRID+num_class_probs 164 | self.pred_conf = tf.sigmoid( 165 | tf.reshape( 166 | preds[:, :, :, num_class_probs:num_confidence_scores], 167 | [mc.BATCH_SIZE, mc.ANCHORS] 168 | ), 169 | name='pred_confidence_score' 170 | ) 171 | 172 | # bbox_delta 173 | self.pred_box_delta = tf.reshape( 174 | preds[:, :, :, num_confidence_scores:], 175 | [mc.BATCH_SIZE, mc.ANCHORS, 4], 176 | name='bbox_delta' 177 | ) 178 | 179 | # number of objects. Used to normalize bbox and classification loss 180 | self.num_objects = tf.reduce_sum(self.input_mask, name='num_objects') 181 | 182 | with tf.variable_scope('bbox') as scope: 183 | with tf.variable_scope('stretching'): 184 | delta_x, delta_y, delta_w, delta_h = tf.unstack( 185 | self.pred_box_delta, axis=2) 186 | 187 | anchor_x = mc.ANCHOR_BOX[:, 0] 188 | anchor_y = mc.ANCHOR_BOX[:, 1] 189 | anchor_w = mc.ANCHOR_BOX[:, 2] 190 | anchor_h = mc.ANCHOR_BOX[:, 3] 191 | 192 | box_center_x = tf.identity( 193 | anchor_x + delta_x * anchor_w, name='bbox_cx') 194 | box_center_y = tf.identity( 195 | anchor_y + delta_y * anchor_h, name='bbox_cy') 196 | box_width = tf.identity( 197 | anchor_w * util.safe_exp(delta_w, mc.EXP_THRESH), 198 | name='bbox_width') 199 | box_height = tf.identity( 200 | anchor_h * util.safe_exp(delta_h, mc.EXP_THRESH), 201 | name='bbox_height') 202 | 203 | self._activation_summary(delta_x, 'delta_x') 204 | self._activation_summary(delta_y, 'delta_y') 205 | self._activation_summary(delta_w, 'delta_w') 206 | self._activation_summary(delta_h, 'delta_h') 207 | 208 | self._activation_summary(box_center_x, 'bbox_cx') 209 | self._activation_summary(box_center_y, 'bbox_cy') 210 | self._activation_summary(box_width, 'bbox_width') 211 | self._activation_summary(box_height, 'bbox_height') 212 | 213 | with tf.variable_scope('trimming'): 214 | xmins, ymins, xmaxs, ymaxs = util.bbox_transform( 215 | [box_center_x, box_center_y, box_width, box_height]) 216 | 217 | # The max x position is mc.IMAGE_WIDTH - 1 since we use zero-based 218 | # pixels. Same for y. 
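# For example, with the KITTI configs in this repo (IMAGE_WIDTH=1248), a raw xmin of -3.2 is clipped to 0.0 and a raw xmax of 1250.7 to 1247.0 by the ops below (illustrative values, not from a real run).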
219 | xmins = tf.minimum( 220 | tf.maximum(0.0, xmins), mc.IMAGE_WIDTH-1.0, name='bbox_xmin') 221 | self._activation_summary(xmins, 'box_xmin') 222 | 223 | ymins = tf.minimum( 224 | tf.maximum(0.0, ymins), mc.IMAGE_HEIGHT-1.0, name='bbox_ymin') 225 | self._activation_summary(ymins, 'box_ymin') 226 | 227 | xmaxs = tf.maximum( 228 | tf.minimum(mc.IMAGE_WIDTH-1.0, xmaxs), 0.0, name='bbox_xmax') 229 | self._activation_summary(xmaxs, 'box_xmax') 230 | 231 | ymaxs = tf.maximum( 232 | tf.minimum(mc.IMAGE_HEIGHT-1.0, ymaxs), 0.0, name='bbox_ymax') 233 | self._activation_summary(ymaxs, 'box_ymax') 234 | 235 | self.det_boxes = tf.transpose( 236 | tf.stack(util.bbox_transform_inv([xmins, ymins, xmaxs, ymaxs])), 237 | (1, 2, 0), name='bbox' 238 | ) 239 | 240 | with tf.variable_scope('IOU'): 241 | def _tensor_iou(box1, box2): 242 | with tf.variable_scope('intersection'): 243 | xmin = tf.maximum(box1[0], box2[0], name='xmin') 244 | ymin = tf.maximum(box1[1], box2[1], name='ymin') 245 | xmax = tf.minimum(box1[2], box2[2], name='xmax') 246 | ymax = tf.minimum(box1[3], box2[3], name='ymax') 247 | 248 | w = tf.maximum(0.0, xmax-xmin, name='inter_w') 249 | h = tf.maximum(0.0, ymax-ymin, name='inter_h') 250 | intersection = tf.multiply(w, h, name='intersection') 251 | 252 | with tf.variable_scope('union'): 253 | w1 = tf.subtract(box1[2], box1[0], name='w1') 254 | h1 = tf.subtract(box1[3], box1[1], name='h1') 255 | w2 = tf.subtract(box2[2], box2[0], name='w2') 256 | h2 = tf.subtract(box2[3], box2[1], name='h2') 257 | 258 | union = w1*h1 + w2*h2 - intersection 259 | 260 | return intersection/(union+mc.EPSILON) \ 261 | * tf.reshape(self.input_mask, [mc.BATCH_SIZE, mc.ANCHORS]) 262 | 263 | self.ious = self.ious.assign( 264 | _tensor_iou( 265 | util.bbox_transform(tf.unstack(self.det_boxes, axis=2)), 266 | util.bbox_transform(tf.unstack(self.box_input, axis=2)) 267 | ) 268 | ) 269 | self._activation_summary(self.ious, 'conf_score') 270 | 271 | with tf.variable_scope('probability') as scope: 272 | self._activation_summary(self.pred_class_probs, 'class_probs') 273 | 274 | probs = tf.multiply( 275 | self.pred_class_probs, 276 | tf.reshape(self.pred_conf, [mc.BATCH_SIZE, mc.ANCHORS, 1]), 277 | name='final_class_prob' 278 | ) 279 | 280 | self._activation_summary(probs, 'final_class_prob') 281 | 282 | self.det_probs = tf.reduce_max(probs, 2, name='score') 283 | self.det_class = tf.argmax(probs, 2, name='class_idx') 284 | 285 | def _add_loss_graph(self): 286 | """Define the loss operation.""" 287 | mc = self.mc 288 | 289 | with tf.variable_scope('class_regression') as scope: 290 | # cross-entropy: q * -log(p) + (1-q) * -log(1-p) 291 | # add a small value into log to prevent blowing up 292 | self.class_loss = tf.truediv( 293 | tf.reduce_sum( 294 | (self.labels*(-tf.log(self.pred_class_probs+mc.EPSILON)) 295 | + (1-self.labels)*(-tf.log(1-self.pred_class_probs+mc.EPSILON))) 296 | * self.input_mask * mc.LOSS_COEF_CLASS), 297 | self.num_objects, 298 | name='class_loss' 299 | ) 300 | tf.add_to_collection('losses', self.class_loss) 301 | 302 | with tf.variable_scope('confidence_score_regression') as scope: 303 | input_mask = tf.reshape(self.input_mask, [mc.BATCH_SIZE, mc.ANCHORS]) 304 | self.conf_loss = tf.reduce_mean( 305 | tf.reduce_sum( 306 | tf.square((self.ious - self.pred_conf)) 307 | * (input_mask*mc.LOSS_COEF_CONF_POS/self.num_objects 308 | +(1-input_mask)*mc.LOSS_COEF_CONF_NEG/(mc.ANCHORS-self.num_objects)), 309 | reduction_indices=[1] 310 | ), 311 | name='confidence_loss' 312 | ) 313 | tf.add_to_collection('losses', 
self.conf_loss) 314 | tf.summary.scalar('mean_iou', tf.reduce_sum(self.ious)/self.num_objects) 315 | 316 | with tf.variable_scope('bounding_box_regression') as scope: 317 | self.bbox_loss = tf.truediv( 318 | tf.reduce_sum( 319 | mc.LOSS_COEF_BBOX * tf.square( 320 | self.input_mask*(self.pred_box_delta-self.box_delta_input))), 321 | self.num_objects, 322 | name='bbox_loss' 323 | ) 324 | tf.add_to_collection('losses', self.bbox_loss) 325 | 326 | # add above losses as well as weight decay losses to form the total loss 327 | self.loss = tf.add_n(tf.get_collection('losses'), name='total_loss') 328 | 329 | def _add_train_graph(self): 330 | """Define the training operation.""" 331 | mc = self.mc 332 | 333 | self.global_step = tf.Variable(0, name='global_step', trainable=False) 334 | lr = tf.train.exponential_decay(mc.LEARNING_RATE, 335 | self.global_step, 336 | mc.DECAY_STEPS, 337 | mc.LR_DECAY_FACTOR, 338 | staircase=True) 339 | 340 | tf.summary.scalar('learning_rate', lr) 341 | 342 | _add_loss_summaries(self.loss) 343 | 344 | opt = tf.train.MomentumOptimizer(learning_rate=lr, momentum=mc.MOMENTUM) 345 | grads_vars = opt.compute_gradients(self.loss, tf.trainable_variables()) 346 | 347 | with tf.variable_scope('clip_gradient') as scope: 348 | for i, (grad, var) in enumerate(grads_vars): 349 | grads_vars[i] = (tf.clip_by_norm(grad, mc.MAX_GRAD_NORM), var) 350 | 351 | apply_gradient_op = opt.apply_gradients(grads_vars, global_step=self.global_step) 352 | 353 | for var in tf.trainable_variables(): 354 | tf.summary.histogram(var.op.name, var) 355 | 356 | for grad, var in grads_vars: 357 | if grad is not None: 358 | tf.summary.histogram(var.op.name + '/gradients', grad) 359 | 360 | with tf.control_dependencies([apply_gradient_op]): 361 | self.train_op = tf.no_op(name='train') 362 | 363 | def _add_viz_graph(self): 364 | """Define the visualization operation.""" 365 | mc = self.mc 366 | self.image_to_show = tf.placeholder( 367 | tf.float32, [None, mc.IMAGE_HEIGHT, mc.IMAGE_WIDTH, 3], 368 | name='image_to_show' 369 | ) 370 | self.viz_op = tf.summary.image('sample_detection_results', 371 | self.image_to_show, collections=['image_summary'], 372 | max_outputs=mc.BATCH_SIZE) 373 | 374 | def _conv_bn_layer( 375 | self, inputs, conv_param_name, bn_param_name, scale_param_name, filters, 376 | size, stride, padding='SAME', freeze=False, relu=True, 377 | conv_with_bias=False, stddev=0.001): 378 | """ Convolution + BatchNorm + [relu] layer. Batch mean and var are treated 379 | as constant. Weights have to be initialized from a pre-trained model or 380 | restored from a checkpoint. 381 | 382 | Args: 383 | inputs: input tensor 384 | conv_param_name: name of the convolution parameters 385 | bn_param_name: name of the batch normalization parameters 386 | scale_param_name: name of the scale parameters 387 | filters: number of output filters. 388 | size: kernel size. 389 | stride: stride 390 | padding: 'SAME' or 'VALID'. See tensorflow doc for detailed description. 391 | freeze: if true, then do not train the parameters in this layer. 392 | relu: whether to use relu or not. 393 | conv_with_bias: whether or not to add a bias term to the convolution output. 394 | stddev: standard deviation used for random weight initializer. 395 | 396 | Returns: 397 | A convolutional layer operation. 
398 | """ 399 | mc = self.mc 400 | 401 | with tf.variable_scope(conv_param_name) as scope: 402 | channels = inputs.get_shape()[3] 403 | 404 | if mc.LOAD_PRETRAINED_MODEL: 405 | cw = self.caffemodel_weight 406 | kernel_val = np.transpose(cw[conv_param_name][0], [2,3,1,0]) 407 | if conv_with_bias: 408 | bias_val = cw[conv_param_name][1] 409 | mean_val = cw[bn_param_name][0] 410 | var_val = cw[bn_param_name][1] 411 | gamma_val = cw[scale_param_name][0] 412 | beta_val = cw[scale_param_name][1] 413 | else: 414 | kernel_val = tf.truncated_normal_initializer( 415 | stddev=stddev, dtype=tf.float32) 416 | if conv_with_bias: 417 | bias_val = tf.constant_initializer(0.0) 418 | mean_val = tf.constant_initializer(0.0) 419 | var_val = tf.constant_initializer(1.0) 420 | gamma_val = tf.constant_initializer(1.0) 421 | beta_val = tf.constant_initializer(0.0) 422 | 423 | # re-order the caffe kernel with shape [out, in, h, w] -> tf kernel with 424 | # shape [h, w, in, out] 425 | kernel = _variable_with_weight_decay( 426 | 'kernels', shape=[size, size, int(channels), filters], 427 | wd=mc.WEIGHT_DECAY, initializer=kernel_val, trainable=(not freeze)) 428 | self.model_params += [kernel] 429 | if conv_with_bias: 430 | biases = _variable_on_device('biases', [filters], bias_val, 431 | trainable=(not freeze)) 432 | self.model_params += [biases] 433 | gamma = _variable_on_device('gamma', [filters], gamma_val, 434 | trainable=(not freeze)) 435 | beta = _variable_on_device('beta', [filters], beta_val, 436 | trainable=(not freeze)) 437 | mean = _variable_on_device('mean', [filters], mean_val, trainable=False) 438 | var = _variable_on_device('var', [filters], var_val, trainable=False) 439 | self.model_params += [gamma, beta, mean, var] 440 | 441 | conv = tf.nn.conv2d( 442 | inputs, kernel, [1, stride, stride, 1], padding=padding, 443 | name='convolution') 444 | if conv_with_bias: 445 | conv = tf.nn.bias_add(conv, biases, name='bias_add') 446 | 447 | conv = tf.nn.batch_normalization( 448 | conv, mean=mean, variance=var, offset=beta, scale=gamma, 449 | variance_epsilon=mc.BATCH_NORM_EPSILON, name='batch_norm') 450 | 451 | self.model_size_counter.append( 452 | (conv_param_name, (1+size*size*int(channels))*filters) 453 | ) 454 | out_shape = conv.get_shape().as_list() 455 | num_flops = \ 456 | (1+2*int(channels)*size*size)*filters*out_shape[1]*out_shape[2] 457 | if relu: 458 | num_flops += 2*filters*out_shape[1]*out_shape[2] 459 | self.flop_counter.append((conv_param_name, num_flops)) 460 | 461 | self.activation_counter.append( 462 | (conv_param_name, out_shape[1]*out_shape[2]*out_shape[3]) 463 | ) 464 | 465 | if relu: 466 | return tf.nn.relu(conv) 467 | else: 468 | return conv 469 | 470 | 471 | def _conv_layer( 472 | self, layer_name, inputs, filters, size, stride, padding='SAME', 473 | freeze=False, xavier=False, relu=True, stddev=0.001): 474 | """Convolutional layer operation constructor. 475 | 476 | Args: 477 | layer_name: layer name. 478 | inputs: input tensor 479 | filters: number of output filters. 480 | size: kernel size. 481 | stride: stride 482 | padding: 'SAME' or 'VALID'. See tensorflow doc for detailed description. 483 | freeze: if true, then do not train the parameters in this layer. 484 | xavier: whether to use xavier weight initializer or not. 485 | relu: whether to use relu or not. 486 | stddev: standard deviation used for random weight initializer. 487 | Returns: 488 | A convolutional layer operation. 
489 | """ 490 | 491 | mc = self.mc 492 | use_pretrained_param = False 493 | if mc.LOAD_PRETRAINED_MODEL: 494 | cw = self.caffemodel_weight 495 | if layer_name in cw: 496 | kernel_val = np.transpose(cw[layer_name][0], [2,3,1,0]) 497 | bias_val = cw[layer_name][1] 498 | # check the shape 499 | if (kernel_val.shape == 500 | (size, size, inputs.get_shape().as_list()[-1], filters)) \ 501 | and (bias_val.shape == (filters, )): 502 | use_pretrained_param = True 503 | else: 504 | print ('Shape of the pretrained parameter of {} does not match, ' 505 | 'use randomly initialized parameter'.format(layer_name)) 506 | else: 507 | print ('Cannot find {} in the pretrained model. Use randomly initialized ' 508 | 'parameters'.format(layer_name)) 509 | 510 | if mc.DEBUG_MODE: 511 | print('Input tensor shape to {}: {}'.format(layer_name, inputs.get_shape())) 512 | 513 | with tf.variable_scope(layer_name) as scope: 514 | channels = inputs.get_shape()[3] 515 | 516 | # re-order the caffe kernel with shape [out, in, h, w] -> tf kernel with 517 | # shape [h, w, in, out] 518 | if use_pretrained_param: 519 | if mc.DEBUG_MODE: 520 | print ('Using pretrained model for {}'.format(layer_name)) 521 | kernel_init = tf.constant(kernel_val , dtype=tf.float32) 522 | bias_init = tf.constant(bias_val, dtype=tf.float32) 523 | elif xavier: 524 | kernel_init = tf.contrib.layers.xavier_initializer_conv2d() 525 | bias_init = tf.constant_initializer(0.0) 526 | else: 527 | kernel_init = tf.truncated_normal_initializer( 528 | stddev=stddev, dtype=tf.float32) 529 | bias_init = tf.constant_initializer(0.0) 530 | 531 | kernel = _variable_with_weight_decay( 532 | 'kernels', shape=[size, size, int(channels), filters], 533 | wd=mc.WEIGHT_DECAY, initializer=kernel_init, trainable=(not freeze)) 534 | 535 | biases = _variable_on_device('biases', [filters], bias_init, 536 | trainable=(not freeze)) 537 | self.model_params += [kernel, biases] 538 | 539 | conv = tf.nn.conv2d( 540 | inputs, kernel, [1, stride, stride, 1], padding=padding, 541 | name='convolution') 542 | conv_bias = tf.nn.bias_add(conv, biases, name='bias_add') 543 | 544 | if relu: 545 | out = tf.nn.relu(conv_bias, 'relu') 546 | else: 547 | out = conv_bias 548 | 549 | self.model_size_counter.append( 550 | (layer_name, (1+size*size*int(channels))*filters) 551 | ) 552 | out_shape = out.get_shape().as_list() 553 | num_flops = \ 554 | (1+2*int(channels)*size*size)*filters*out_shape[1]*out_shape[2] 555 | if relu: 556 | num_flops += 2*filters*out_shape[1]*out_shape[2] 557 | self.flop_counter.append((layer_name, num_flops)) 558 | 559 | self.activation_counter.append( 560 | (layer_name, out_shape[1]*out_shape[2]*out_shape[3]) 561 | ) 562 | 563 | return out 564 | 565 | def _pooling_layer( 566 | self, layer_name, inputs, size, stride, padding='SAME'): 567 | """Pooling layer operation constructor. 568 | 569 | Args: 570 | layer_name: layer name. 571 | inputs: input tensor 572 | size: kernel size. 573 | stride: stride 574 | padding: 'SAME' or 'VALID'. See tensorflow doc for detailed description. 575 | Returns: 576 | A pooling layer operation. 
577 | """ 578 | 579 | with tf.variable_scope(layer_name) as scope: 580 | out = tf.nn.max_pool(inputs, 581 | ksize=[1, size, size, 1], 582 | strides=[1, stride, stride, 1], 583 | padding=padding) 584 | activation_size = np.prod(out.get_shape().as_list()[1:]) 585 | self.activation_counter.append((layer_name, activation_size)) 586 | return out 587 | 588 | 589 | def _fc_layer( 590 | self, layer_name, inputs, hiddens, flatten=False, relu=True, 591 | xavier=False, stddev=0.001): 592 | """Fully connected layer operation constructor. 593 | 594 | Args: 595 | layer_name: layer name. 596 | inputs: input tensor 597 | hiddens: number of (hidden) neurons in this layer. 598 | flatten: if true, reshape the input 4D tensor of shape 599 | (batch, height, weight, channel) into a 2D tensor with shape 600 | (batch, -1). This is used when the input to the fully connected layer 601 | is output of a convolutional layer. 602 | relu: whether to use relu or not. 603 | xavier: whether to use xavier weight initializer or not. 604 | stddev: standard deviation used for random weight initializer. 605 | Returns: 606 | A fully connected layer operation. 607 | """ 608 | mc = self.mc 609 | 610 | use_pretrained_param = False 611 | if mc.LOAD_PRETRAINED_MODEL: 612 | cw = self.caffemodel_weight 613 | if layer_name in cw: 614 | use_pretrained_param = True 615 | kernel_val = cw[layer_name][0] 616 | bias_val = cw[layer_name][1] 617 | 618 | if mc.DEBUG_MODE: 619 | print('Input tensor shape to {}: {}'.format(layer_name, inputs.get_shape())) 620 | 621 | with tf.variable_scope(layer_name) as scope: 622 | input_shape = inputs.get_shape().as_list() 623 | if flatten: 624 | dim = input_shape[1]*input_shape[2]*input_shape[3] 625 | inputs = tf.reshape(inputs, [-1, dim]) 626 | if use_pretrained_param: 627 | try: 628 | # check the size before layout transform 629 | assert kernel_val.shape == (hiddens, dim), \ 630 | 'kernel shape error at {}'.format(layer_name) 631 | kernel_val = np.reshape( 632 | np.transpose( 633 | np.reshape( 634 | kernel_val, # O x (C*H*W) 635 | (hiddens, input_shape[3], input_shape[1], input_shape[2]) 636 | ), # O x C x H x W 637 | (2, 3, 1, 0) 638 | ), # H x W x C x O 639 | (dim, -1) 640 | ) # (H*W*C) x O 641 | # check the size after layout transform 642 | assert kernel_val.shape == (dim, hiddens), \ 643 | 'kernel shape error at {}'.format(layer_name) 644 | except: 645 | # Do not use pretrained parameter if shape doesn't match 646 | use_pretrained_param = False 647 | print ('Shape of the pretrained parameter of {} does not match, ' 648 | 'use randomly initialized parameter'.format(layer_name)) 649 | else: 650 | dim = input_shape[1] 651 | if use_pretrained_param: 652 | try: 653 | kernel_val = np.transpose(kernel_val, (1,0)) 654 | assert kernel_val.shape == (dim, hiddens), \ 655 | 'kernel shape error at {}'.format(layer_name) 656 | except: 657 | use_pretrained_param = False 658 | print ('Shape of the pretrained parameter of {} does not match, ' 659 | 'use randomly initialized parameter'.format(layer_name)) 660 | 661 | if use_pretrained_param: 662 | if mc.DEBUG_MODE: 663 | print ('Using pretrained model for {}'.format(layer_name)) 664 | kernel_init = tf.constant(kernel_val, dtype=tf.float32) 665 | bias_init = tf.constant(bias_val, dtype=tf.float32) 666 | elif xavier: 667 | kernel_init = tf.contrib.layers.xavier_initializer() 668 | bias_init = tf.constant_initializer(0.0) 669 | else: 670 | kernel_init = tf.truncated_normal_initializer( 671 | stddev=stddev, dtype=tf.float32) 672 | bias_init = tf.constant_initializer(0.0) 673 
| 674 | weights = _variable_with_weight_decay( 675 | 'weights', shape=[dim, hiddens], wd=mc.WEIGHT_DECAY, 676 | initializer=kernel_init) 677 | biases = _variable_on_device('biases', [hiddens], bias_init) 678 | self.model_params += [weights, biases] 679 | 680 | outputs = tf.nn.bias_add(tf.matmul(inputs, weights), biases) 681 | if relu: 682 | outputs = tf.nn.relu(outputs, 'relu') 683 | 684 | # count layer stats 685 | self.model_size_counter.append((layer_name, (dim+1)*hiddens)) 686 | 687 | num_flops = 2 * dim * hiddens + hiddens 688 | if relu: 689 | num_flops += 2*hiddens 690 | self.flop_counter.append((layer_name, num_flops)) 691 | 692 | self.activation_counter.append((layer_name, hiddens)) 693 | 694 | return outputs 695 | 696 | def filter_prediction(self, boxes, probs, cls_idx): 697 | """Filter bounding box predictions with probability threshold and 698 | non-maximum suppression. 699 | 700 | Args: 701 | boxes: array of [cx, cy, w, h]. 702 | probs: array of probabilities 703 | cls_idx: array of class indices 704 | Returns: 705 | final_boxes: array of filtered bounding boxes. 706 | final_probs: array of filtered probabilities 707 | final_cls_idx: array of filtered class indices 708 | """ 709 | mc = self.mc 710 | 711 | if mc.TOP_N_DETECTION < len(probs) and mc.TOP_N_DETECTION > 0: 712 | order = probs.argsort()[:-mc.TOP_N_DETECTION-1:-1] 713 | probs = probs[order] 714 | boxes = boxes[order] 715 | cls_idx = cls_idx[order] 716 | else: 717 | filtered_idx = np.nonzero(probs>mc.PROB_THRESH)[0] 718 | probs = probs[filtered_idx] 719 | boxes = boxes[filtered_idx] 720 | cls_idx = cls_idx[filtered_idx] 721 | 722 | final_boxes = [] 723 | final_probs = [] 724 | final_cls_idx = [] 725 | 726 | for c in range(mc.CLASSES): 727 | idx_per_class = [i for i in range(len(probs)) if cls_idx[i] == c] 728 | keep = util.nms(boxes[idx_per_class], probs[idx_per_class], mc.NMS_THRESH) 729 | for i in range(len(keep)): 730 | if keep[i]: 731 | final_boxes.append(boxes[idx_per_class[i]]) 732 | final_probs.append(probs[idx_per_class[i]]) 733 | final_cls_idx.append(c) 734 | return final_boxes, final_probs, final_cls_idx 735 | 736 | def _activation_summary(self, x, layer_name): 737 | """Helper to create summaries for activations. 
738 | 739 | Args: 740 | x: layer output tensor 741 | layer_name: name of the layer 742 | Returns: 743 | nothing 744 | """ 745 | with tf.variable_scope('activation_summary') as scope: 746 | tf.summary.histogram( 747 | 'activation_summary/'+layer_name, x) 748 | tf.summary.scalar( 749 | 'activation_summary/'+layer_name+'/sparsity', tf.nn.zero_fraction(x)) 750 | tf.summary.scalar( 751 | 'activation_summary/'+layer_name+'/average', tf.reduce_mean(x)) 752 | tf.summary.scalar( 753 | 'activation_summary/'+layer_name+'/max', tf.reduce_max(x)) 754 | tf.summary.scalar( 755 | 'activation_summary/'+layer_name+'/min', tf.reduce_min(x)) 756 | -------------------------------------------------------------------------------- /src/dataset/kitti-eval/cpp/evaluate_object.cpp: -------------------------------------------------------------------------------- 1 | #include <iostream> 2 | #include <stdio.h> 3 | #include <math.h> 4 | #include <vector> 5 | #include <algorithm> 6 | #include <numeric> 7 | #include <strings.h> 8 | #include <assert.h> 9 | // NOTE: angle-bracket contents (header names, template parameters) were stripped from this dump; they are restored from usage below, and spans that could not be recovered are marked inline. 10 | #include "mail.h" 11 | 12 | using namespace std; 13 | 14 | #include <sstream> 15 | template< typename T > inline std::string str(T const & i) { std::stringstream s; s << i; return s.str(); } // T-to-string 16 | /*======================================================================= 17 | STATIC EVALUATION PARAMETERS 18 | =======================================================================*/ 19 | 20 | // path handling 21 | string ospj( string const & a, string const & b ) { return a + "/" + b; } 22 | string ospj( string const & a, string const & b, string const & c ) { return a + "/" + b + "/" + c; } 23 | 24 | // easy, moderate and hard evaluation level 25 | enum DIFFICULTY{EASY=0, MODERATE=1, HARD=2}; 26 | 27 | // evaluation parameter 28 | const int32_t MIN_HEIGHT[3] = {40, 25, 25}; // minimum height for evaluated groundtruth/detections 29 | const int32_t MAX_OCCLUSION[3] = {0, 1, 2}; // maximum occlusion level of the groundtruth used for evaluation 30 | const double MAX_TRUNCATION[3] = {0.15, 0.3, 0.5}; // maximum truncation level of the groundtruth used for evaluation 31 | 32 | // evaluated object classes 33 | enum CLASSES{CAR=0, PEDESTRIAN=1, CYCLIST=2}; 34 | 35 | // parameters varying per class 36 | vector<string> CLASS_NAMES; 37 | const double MIN_OVERLAP[3] = {0.7, 0.5, 0.5}; // the minimum overlap required for evaluation 38 | 39 | // no. of recall steps that should be evaluated (discretized) 40 | const double N_SAMPLE_PTS = 41; 41 | 42 | // initialize class names 43 | void initGlobals () { 44 | CLASS_NAMES.push_back("car"); 45 | CLASS_NAMES.push_back("pedestrian"); 46 | CLASS_NAMES.push_back("cyclist"); 47 | } 48 | 49 | /*======================================================================= 50 | DATA TYPES FOR EVALUATION 51 | =======================================================================*/ 52 | 53 | // holding data needed for precision-recall and precision-aos 54 | struct tPrData { 55 | vector<double> v; // detection score for computing score thresholds 56 | double similarity; // orientation similarity 57 | int32_t tp; // true positives 58 | int32_t fp; // false positives 59 | int32_t fn; // false negatives 60 | tPrData () : 61 | similarity(0), tp(0), fp(0), fn(0) {} 62 | }; 63 | 64 | // holding bounding boxes for ground truth and detections 65 | struct tBox { 66 | string type; // object type as car, pedestrian or cyclist,... 
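// (the 2D box corners below are in image pixel coordinates; alpha is KITTI's observation angle in [-pi..pi], with -10 marking an invalid orientation)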
67 | double x1; // left corner 68 | double y1; // top corner 69 | double x2; // right corner 70 | double y2; // bottom corner 71 | double alpha; // image orientation 72 | tBox (string type, double x1,double y1,double x2,double y2,double alpha) : 73 | type(type),x1(x1),y1(y1),x2(x2),y2(y2),alpha(alpha) {} 74 | }; 75 | 76 | // holding ground truth data 77 | struct tGroundtruth { 78 | tBox box; // object type, box, orientation 79 | double truncation; // truncation 0..1 80 | int32_t occlusion; // occlusion 0,1,2 (non, partly, fully) 81 | tGroundtruth () : 82 | box(tBox("invalid",-1,-1,-1,-1,-10)),truncation(-1),occlusion(-1) {} 83 | tGroundtruth (tBox box,double truncation,int32_t occlusion) : 84 | box(box),truncation(truncation),occlusion(occlusion) {} 85 | tGroundtruth (string type,double x1,double y1,double x2,double y2,double alpha,double truncation,int32_t occlusion) : 86 | box(tBox(type,x1,y1,x2,y2,alpha)),truncation(truncation),occlusion(occlusion) {} 87 | }; 88 | 89 | // holding detection data 90 | struct tDetection { 91 | tBox box; // object type, box, orientation 92 | double thresh; // detection score 93 | tDetection (): 94 | box(tBox("invalid",-1,-1,-1,-1,-10)),thresh(-1000) {} 95 | tDetection (tBox box,double thresh) : 96 | box(box),thresh(thresh) {} 97 | tDetection (string type,double x1,double y1,double x2,double y2,double alpha,double thresh) : 98 | box(tBox(type,x1,y1,x2,y2,alpha)),thresh(thresh) {} 99 | }; 100 | 101 | /*======================================================================= 102 | FUNCTIONS TO LOAD DETECTION AND GROUND TRUTH DATA ONCE, SAVE RESULTS 103 | =======================================================================*/ 104 | 105 | vector<tDetection> loadDetections(string file_name, bool &compute_aos, bool &eval_car, bool &eval_pedestrian, bool &eval_cyclist, bool &success) { 106 | 107 | // holds all detections (ignored detections are indicated by an index vector) 108 | vector<tDetection> detections; 109 | FILE *fp = fopen(file_name.c_str(),"r"); 110 | if (!fp) { 111 | success = false; 112 | return detections; 113 | } 114 | while (!feof(fp)) { 115 | tDetection d; 116 | double trash; 117 | char str[255]; 118 | if (fscanf(fp, "%s %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf", 119 | str, &trash, &trash, &d.box.alpha, 120 | &d.box.x1, &d.box.y1, &d.box.x2, &d.box.y2, 121 | &trash, &trash, &trash, &trash, 122 | &trash, &trash, &trash, &d.thresh )==16) { 123 | d.box.type = str; 124 | detections.push_back(d); 125 | 126 | // orientation=-10 is invalid, AOS is not evaluated if at least one orientation is invalid 127 | if(d.box.alpha==-10) 128 | compute_aos = false; 129 | 130 | // a class is only evaluated if it is detected at least once 131 | if(!eval_car && !strcasecmp(d.box.type.c_str(), "car")) 132 | eval_car = true; 133 | if(!eval_pedestrian && !strcasecmp(d.box.type.c_str(), "pedestrian")) 134 | eval_pedestrian = true; 135 | if(!eval_cyclist && !strcasecmp(d.box.type.c_str(), "cyclist")) 136 | eval_cyclist = true; 137 | } 138 | } 139 | fclose(fp); 140 | success = true; 141 | return detections; 142 | } 143 | 144 | vector<tGroundtruth> loadGroundtruth(string file_name,bool &success) { 145 | 146 | // holds all ground truth (ignored ground truth is indicated by an index vector) 147 | vector<tGroundtruth> groundtruth; 148 | FILE *fp = fopen(file_name.c_str(),"r"); 149 | if (!fp) { 150 | success = false; 151 | return groundtruth; 152 | } 153 | while (!feof(fp)) { 154 | tGroundtruth g; 155 | double trash; 156 | char str[255]; 157 | if (fscanf(fp, "%s %lf %d %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf 
%lf", 158 | str, &g.truncation, &g.occlusion, &g.box.alpha, 159 | &g.box.x1, &g.box.y1, &g.box.x2, &g.box.y2, 160 | &trash, &trash, &trash, &trash, 161 | &trash, &trash, &trash )==15) { 162 | g.box.type = str; 163 | groundtruth.push_back(g); 164 | } 165 | } 166 | fclose(fp); 167 | success = true; 168 | return groundtruth; 169 | } 170 | 171 | void saveStats (const vector &precision, const vector &aos, FILE *fp_det, FILE *fp_ap, FILE *fp_ori) { 172 | 173 | // save precision to file 174 | if(precision.empty()) 175 | return; 176 | double AP=0; 177 | uint AP_cnt = 0; 178 | for (int32_t i=0; i getThresholds(vector &v, double n_groundtruth){ 240 | 241 | // holds scores needed to compute N_SAMPLE_PTS recall values 242 | vector t; 243 | 244 | // sort scores in descending order 245 | // (highest score is assumed to give best/most confident detections) 246 | sort(v.begin(), v.end(), greater()); 247 | 248 | // get scores for linearly spaced recall 249 | double current_recall = 0; 250 | for(int32_t i=0; i >, const vector &det, vector &ignored_gt, vector &dc, vector &ignored_det, int32_t &n_gt, DIFFICULTY difficulty){ 275 | 276 | // extract ground truth bounding boxes for current evaluation class 277 | for(int32_t i=0;iMAX_OCCLUSION[difficulty] || gt[i].truncation>MAX_TRUNCATION[difficulty] || height >, const vector &det, const vector &dc, const vector &ignored_gt, const vector &ignored_det, bool compute_fp, bool compute_aos=false, double thresh=0, bool debug=false){ 346 | 347 | tPrData stat = tPrData(); 348 | const double NO_DETECTION = -10000000; 349 | vector delta; // holds angular difference for TPs (needed for AOS evaluation) 350 | vector assigned_detection; // holds wether a detection was assigned to a valid or ignored ground truth 351 | assigned_detection.assign(det.size(), false); 352 | vector ignored_threshold; 353 | ignored_threshold.assign(det.size(), false); // holds detections with a threshold lower than thresh if FP are computed 354 | 355 | // detections with a low score are ignored for computing precision (needs FP) 356 | if(compute_fp) 357 | for(int32_t i=0; i 0.5) (logical len(det)) 370 | =======================================================================*/ 371 | int32_t det_idx = -1; 372 | double valid_detection = NO_DETECTION; 373 | double max_overlap = 0; 374 | 375 | // search for a possible detection 376 | bool assigned_ignored_det = false; 377 | for(int32_t j=0; jMIN_OVERLAP[current_class] && det[j].thresh>valid_detection){ 392 | det_idx = j; 393 | valid_detection = det[j].thresh; 394 | } 395 | 396 | // for computing pr curve values, the candidate with the greatest overlap is considered 397 | // if the greatest overlap is an ignored detection (min_height), the overlapping detection is used 398 | else if(compute_fp && overlap>MIN_OVERLAP[current_class] && (overlap>max_overlap || assigned_ignored_det) && ignored_det[j]==0){ 399 | max_overlap = overlap; 400 | det_idx = j; 401 | valid_detection = 1; 402 | assigned_ignored_det = false; 403 | } 404 | else if(compute_fp && overlap>MIN_OVERLAP[current_class] && valid_detection==NO_DETECTION && ignored_det[j]==1){ 405 | det_idx = j; 406 | valid_detection = 1; 407 | assigned_ignored_det = true; 408 | } 409 | } 410 | 411 | /*======================================================================= 412 | compute TP, FP and FN 413 | =======================================================================*/ 414 | 415 | // nothing was assigned to this valid ground truth 416 | if(valid_detection==NO_DETECTION && ignored_gt[i]==0) 417 | stat.fn++; 
418 | 419 | // only evaluate valid ground truth <=> detection assignments (considering difficulty level) 420 | else if(valid_detection!=NO_DETECTION && (ignored_gt[i]==1 || ignored_det[det_idx]==1)) 421 | assigned_detection[det_idx] = true; 422 | 423 | // found a valid true positive 424 | else if(valid_detection!=NO_DETECTION){ 425 | 426 | // write highest score to threshold vector 427 | stat.tp++; 428 | stat.v.push_back(det[det_idx].thresh); 429 | 430 | // compute angular difference of detection and ground truth if valid detection orientation was provided 431 | if(compute_aos) 432 | delta.push_back(gt[i].box.alpha - det[det_idx].box.alpha); 433 | 434 | // clean up 435 | assigned_detection[det_idx] = true; 436 | } 437 | } 438 | 439 | // if FP are requested, consider stuff area 440 | if(compute_fp){ 441 | 442 | // count fp 443 | for(int32_t i=0; i<det.size(); i++){ 444 | 445 | // count false positives if required (height smaller than required is ignored) 446 | if(!(assigned_detection[i] || ignored_det[i]==-1 || ignored_det[i]==1 || ignored_threshold[i])) 447 | stat.fp++; 448 | } 449 | 450 | // do not consider detections overlapping with stuff area 451 | int32_t nstuff = 0; 452 | for(int32_t i=0; i<dc.size(); i++){ 453 | for(int32_t j=0; j<det.size(); j++){ 454 | 455 | // detections not of the current class, already assigned or with a low threshold are ignored 456 | if(assigned_detection[j]) 457 | continue; 458 | if(ignored_det[j]==-1 || ignored_det[j]==1) 459 | continue; 460 | if(ignored_threshold[j]) 461 | continue; 462 | 463 | // compute overlap and assign to stuff area, if overlap exceeds class specific value 464 | double overlap = boxoverlap(det[j], dc[i], 0); 465 | if(overlap>MIN_OVERLAP[current_class]){ 466 | assigned_detection[j] = true; 467 | nstuff++; 468 | } 469 | } 470 | } 471 | 472 | // FP = no. of detections not assigned to any ground truth, minus detections assigned to stuff areas 473 | stat.fp -= nstuff; 474 | 475 | // if all orientation values are valid, the AOS is computed 476 | if(compute_aos){ 477 | vector<double> tmp; 478 | 479 | // FP have a similarity of 0, for all TP compute AOS 480 | tmp.assign(stat.fp, 0); 481 | for(int32_t i=0; i<delta.size(); i++) 482 | tmp.push_back((1.0+cos(delta[i]))/2.0); 483 | 484 | // be sure that all orientation deltas are computed 485 | assert(tmp.size()==stat.fp+stat.tp); 486 | assert(delta.size()==stat.tp); 487 | 488 | // get the mean orientation similarity for this image 489 | if(stat.tp>0 || stat.fp>0) 490 | stat.similarity = accumulate(tmp.begin(), tmp.end(), 0.0); 491 | 492 | // there was neither a FP nor a TP, so the similarity is ignored in the evaluation 493 | else 494 | stat.similarity = -1; 495 | } 496 | } 497 | return stat; 498 | } 499 | 500 | /*======================================================================= 501 | EVALUATE CLASS-WISE 502 | =======================================================================*/ 503 | 504 | bool eval_class (FILE *fp_det, FILE *fp_ap, FILE *fp_ori, CLASSES current_class,const vector< vector<tGroundtruth> > &groundtruth,const vector< vector<tDetection> > &detections, bool compute_aos, vector<double> &precision, vector<double> &aos, DIFFICULTY difficulty, int32_t N_TESTIMAGES) { 505 | 506 | // init 507 | int32_t n_gt=0; // total no. 
of gt (denominator of recall) 508 | vector<double> v, thresholds; // detection scores, evaluated for recall discretization 509 | vector< vector<int32_t> > ignored_gt, ignored_det; // index of ignored gt detection for current class/difficulty 510 | vector< vector<tGroundtruth> > dontcare; // index of dontcare areas, included in ground truth 511 | 512 | // for all test images do 513 | for (int32_t i=0; i<N_TESTIMAGES; i++){ 514 | 515 | // holds ignored ground truth, ignored detections and dontcare areas for current frame 516 | vector<int32_t> i_gt, i_det; 517 | vector<tGroundtruth> dc; 518 | 519 | // only evaluate objects of current class and ignore occluded, truncated objects 520 | cleanData(current_class, groundtruth[i], detections[i], i_gt, dc, i_det, n_gt, difficulty); 521 | ignored_gt.push_back(i_gt); 522 | ignored_det.push_back(i_det); 523 | dontcare.push_back(dc); 524 | 525 | // compute statistics to get recall values 526 | tPrData pr_tmp = tPrData(); 527 | pr_tmp = computeStatistics(current_class, groundtruth[i], detections[i], dc, i_gt, i_det, false); 528 | 529 | // add detection scores to vector over all images 530 | for(int32_t j=0; j<pr_tmp.v.size(); j++) 531 | v.push_back(pr_tmp.v[j]); 532 | } 533 | 534 | // get scores that must be evaluated for recall discretization 535 | thresholds = getThresholds(v, n_gt); 536 | 537 | // compute TP, FP and FN for all evaluation thresholds 538 | vector<tPrData> pr; 539 | pr.assign(thresholds.size(),tPrData()); 540 | for (int32_t i=0; i<thresholds.size(); i++){ // [lines 541-557: per-threshold accumulation of TP/FP/FN/AOS via computeStatistics lost in extraction] 558 | vector<double> recall; 559 | precision.assign(N_SAMPLE_PTS, 0); 560 | if(compute_aos) 561 | aos.assign(N_SAMPLE_PTS, 0); 562 | double r=0; 563 | for (int32_t i=0; i<thresholds.size(); i++){ // [lines 564-582: recall/precision/AOS computation, saveStats call, function close and section header lost in extraction] 583 | void saveAndPlotPlots(string dir_name,string file_name,string obj_type,vector<double> vals[],bool is_aos){ 584 | 585 | char command[1024]; 586 | 587 | // save plot data to file 588 | FILE *fp = fopen((dir_name + "/" + file_name + ".txt").c_str(),"w"); 589 | for (int32_t i=0; i<(int)N_SAMPLE_PTS; i++) 590 | fprintf(fp,"%f %f %f %f\n",(double)i/(N_SAMPLE_PTS-1.0),vals[0][i],vals[1][i],vals[2][i]); 591 | fclose(fp); 592 | 593 | // create png + eps 594 | for (int32_t j=0; j<2; j++) { 595 | 596 | // open file 597 | FILE *fp = fopen((dir_name + "/" + file_name + ".gp").c_str(),"w"); 598 | 599 | // save gnuplot instructions 600 | if (j==0) { 601 | fprintf(fp,"set term png size 450,315 font \"Helvetica\" 11\n"); 602 | fprintf(fp,"set output \"%s.png\"\n",file_name.c_str()); 603 | } else { 604 | fprintf(fp,"set term postscript eps enhanced color font \"Helvetica\" 20\n"); 605 | fprintf(fp,"set output \"%s.eps\"\n",file_name.c_str()); 606 | } 607 | 608 | // set labels and ranges 609 | fprintf(fp,"set size ratio 0.7\n"); 610 | fprintf(fp,"set xrange [0:1]\n"); 611 | fprintf(fp,"set yrange [0:1]\n"); 612 | fprintf(fp,"set xlabel \"Recall\"\n"); 613 | if (!is_aos) fprintf(fp,"set ylabel \"Precision\"\n"); 614 | else fprintf(fp,"set ylabel \"Orientation Similarity\"\n"); 615 | obj_type[0] = toupper(obj_type[0]); 616 | fprintf(fp,"set title \"%s\"\n",obj_type.c_str()); 617 | 618 | // line width 619 | int32_t lw = 5; 620 | if (j==0) lw = 3; 621 | 622 | // plot error curve 623 | fprintf(fp,"plot "); 624 | fprintf(fp,"\"%s.txt\" using 1:2 title 'Easy' with lines ls 1 lw %d,",file_name.c_str(),lw); 625 | fprintf(fp,"\"%s.txt\" using 1:3 title 'Moderate' with lines ls 2 lw %d,",file_name.c_str(),lw); 626 | fprintf(fp,"\"%s.txt\" using 1:4 title 'Hard' with lines ls 3 lw %d",file_name.c_str(),lw); 627 | 628 | // close file 629 | fclose(fp); 630 | 631 | // run gnuplot => create png + eps 632 | sprintf(command,"cd %s; gnuplot %s",dir_name.c_str(),(file_name + ".gp").c_str()); 633 | system(command); 634 | } 635 | 636 | // create pdf and crop 637 | sprintf(command,"cd %s; ps2pdf %s.eps %s_large.pdf",dir_name.c_str(),file_name.c_str(),file_name.c_str()); 638 | system(command); 639 | sprintf(command,"cd %s; pdfcrop %s_large.pdf %s.pdf",dir_name.c_str(),file_name.c_str(),file_name.c_str()); 640 | system(command); 641 | sprintf(command,"cd %s; rm %s_large.pdf",dir_name.c_str(),file_name.c_str()); 642 | system(command); 643 
| } 644 | 645 | bool eval(string const & result_dir, string const & image_set_filename, string const & gt_dir, Mail* mail, int32_t N_TESTIMAGES){ 646 | 647 | // set some global parameters 648 | initGlobals(); 649 | 650 | // ground truth and result directories 651 | // string result_dir = "results/" + result_sha; 652 | string plot_dir = result_dir + "/plot"; 653 | 654 | // create output directories 655 | system(("mkdir " + plot_dir).c_str()); 656 | 657 | // hold detections and ground truth in memory 658 | vector< vector<tGroundtruth> > groundtruth; 659 | vector< vector<tDetection> > detections; 660 | 661 | // holds whether orientation similarity shall be computed (might be set to false while loading detections) 662 | // and which labels were provided by this submission 663 | bool compute_aos=true, eval_car=false, eval_pedestrian=false, eval_cyclist=false; 664 | 665 | // get image names 666 | FILE *fp = fopen( image_set_filename.c_str(),"r" ); 667 | if (!fp) { 668 | mail->msg("ERROR: Couldn't read: image set file %s!", image_set_filename.c_str() ); 669 | return false; 670 | } 671 | vector< string > image_set; 672 | while (!feof(fp)) { 673 | char str[255]; 674 | if (fscanf(fp, "%s", str) == 1){ 675 | image_set.push_back(str); 676 | } 677 | } 678 | fclose(fp); 679 | if( image_set.size() != N_TESTIMAGES ) { 680 | printf( "image_set.size()=%s N_TESTIMAGES=%s\n", str(image_set.size()).c_str(), str(N_TESTIMAGES).c_str() ); 681 | } 682 | assert(image_set.size() == N_TESTIMAGES); 683 | 684 | // for all images read groundtruth and detections 685 | mail->msg("Loading detections..."); 686 | for (int32_t i=0; i<N_TESTIMAGES; i++) { 687 | 688 | // file name 689 | char file_name[256]; 690 | sprintf(file_name,"%s.txt",image_set[i].c_str()); 691 | 692 | // read ground truth and result poses 693 | bool gt_success,det_success; 694 | vector<tGroundtruth> gt = loadGroundtruth(ospj(gt_dir,file_name),gt_success); 695 | vector<tDetection> det = loadDetections(ospj(result_dir,"data",file_name), compute_aos, eval_car, eval_pedestrian, eval_cyclist,det_success); 696 | groundtruth.push_back(gt); 697 | detections.push_back(det); 698 | 699 | // check for errors 700 | if (!gt_success) { 701 | mail->msg("ERROR: Couldn't read: %s of ground truth. 
Please write me an email!", file_name); 702 | return false; 703 | } 704 | if (!det_success) { 705 | mail->msg("ERROR: Couldn't read: %s", file_name); 706 | return false; 707 | } 708 | } 709 | mail->msg(" done."); 710 | 711 | // holds pointers for result files 712 | FILE *fp_det=0, *fp_ap=0, *fp_ori=0; 713 | 714 | // eval cars 715 | if(eval_car){ 716 | fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[CAR] + "_detection.txt").c_str(),"w"); 717 | fp_ap = fopen((result_dir + "/stats_" + CLASS_NAMES[CAR] + "_ap.txt").c_str(),"w"); 718 | if(compute_aos) 719 | fp_ori = fopen((result_dir + "/stats_" + CLASS_NAMES[CAR] + "_orientation.txt").c_str(),"w"); 720 | vector<double> precision[3], aos[3]; 721 | if( !eval_class(fp_det,fp_ap,fp_ori,CAR,groundtruth,detections,compute_aos,precision[0],aos[0],EASY,N_TESTIMAGES) 722 | || !eval_class(fp_det,fp_ap,fp_ori,CAR,groundtruth,detections,compute_aos,precision[1],aos[1],MODERATE, N_TESTIMAGES) 723 | || !eval_class(fp_det,fp_ap,fp_ori,CAR,groundtruth,detections,compute_aos,precision[2],aos[2],HARD, N_TESTIMAGES)){ 724 | mail->msg("Car evaluation failed."); 725 | return false; 726 | } 727 | fclose(fp_det); 728 | fclose(fp_ap); 729 | saveAndPlotPlots(plot_dir,CLASS_NAMES[CAR] + "_detection",CLASS_NAMES[CAR],precision,0); 730 | if(compute_aos){ 731 | saveAndPlotPlots(plot_dir,CLASS_NAMES[CAR] + "_orientation",CLASS_NAMES[CAR],aos,1); 732 | fclose(fp_ori); 733 | } 734 | } 735 | 736 | // eval pedestrians 737 | if(eval_pedestrian){ 738 | fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[PEDESTRIAN] + "_detection.txt").c_str(),"w"); 739 | fp_ap = fopen((result_dir + "/stats_" + CLASS_NAMES[PEDESTRIAN] + "_ap.txt").c_str(),"w"); 740 | if(compute_aos) 741 | fp_ori = fopen((result_dir + "/stats_" + CLASS_NAMES[PEDESTRIAN] + "_orientation.txt").c_str(),"w"); 742 | vector<double> precision[3], aos[3]; 743 | if( !eval_class(fp_det,fp_ap,fp_ori,PEDESTRIAN,groundtruth,detections,compute_aos,precision[0],aos[0],EASY, N_TESTIMAGES) 744 | || !eval_class(fp_det,fp_ap,fp_ori,PEDESTRIAN,groundtruth,detections,compute_aos,precision[1],aos[1],MODERATE,N_TESTIMAGES) 745 | || !eval_class(fp_det,fp_ap,fp_ori,PEDESTRIAN,groundtruth,detections,compute_aos,precision[2],aos[2],HARD,N_TESTIMAGES)){ 746 | mail->msg("Pedestrian evaluation failed."); 747 | return false; 748 | } 749 | fclose(fp_det); 750 | fclose(fp_ap); 751 | saveAndPlotPlots(plot_dir,CLASS_NAMES[PEDESTRIAN] + "_detection",CLASS_NAMES[PEDESTRIAN],precision,0); 752 | if(compute_aos){ 753 | fclose(fp_ori); 754 | saveAndPlotPlots(plot_dir,CLASS_NAMES[PEDESTRIAN] + "_orientation",CLASS_NAMES[PEDESTRIAN],aos,1); 755 | } 756 | } 757 | 758 | // eval cyclists 759 | if(eval_cyclist){ 760 | fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[CYCLIST] + "_detection.txt").c_str(),"w"); 761 | fp_ap = fopen((result_dir + "/stats_" + CLASS_NAMES[CYCLIST] + "_ap.txt").c_str(),"w"); 762 | if(compute_aos) 763 | fp_ori = fopen((result_dir + "/stats_" + CLASS_NAMES[CYCLIST] + "_orientation.txt").c_str(),"w"); 764 | vector<double> precision[3], aos[3]; 765 | if( !eval_class(fp_det,fp_ap,fp_ori,CYCLIST,groundtruth,detections,compute_aos,precision[0],aos[0],EASY, N_TESTIMAGES) 766 | || !eval_class(fp_det,fp_ap,fp_ori,CYCLIST,groundtruth,detections,compute_aos,precision[1],aos[1],MODERATE, N_TESTIMAGES) 767 | || !eval_class(fp_det,fp_ap,fp_ori,CYCLIST,groundtruth,detections,compute_aos,precision[2],aos[2],HARD, N_TESTIMAGES)){ 768 | mail->msg("Cyclist evaluation failed."); 769 | return false; 770 | } 771 | fclose(fp_det); 772 | fclose(fp_ap); 773 | 
saveAndPlotPlots(plot_dir,CLASS_NAMES[CYCLIST] + "_detection",CLASS_NAMES[CYCLIST],precision,0); 774 | if(compute_aos){ 775 | fclose(fp_ori); 776 | saveAndPlotPlots(plot_dir,CLASS_NAMES[CYCLIST] + "_orientation",CLASS_NAMES[CYCLIST],aos,1); 777 | } 778 | } 779 | 780 | // success 781 | return true; 782 | } 783 | 784 | int32_t main (int32_t argc,char *argv[]) { 785 | 786 | // we need 4 arguments! 787 | if (argc!=5) { 788 | cout << "Usage: ./eval_detection kitti_dir image_set_filename result_dir num_test_images" << endl; 789 | return 1; 790 | } 791 | 792 | // read arguments 793 | string const kitti_dir = argv[1]; 794 | string const gt_dir = ospj( kitti_dir, "label_2" ); // FIXME_MWM: should be part of input? configurable? 795 | string const image_set_filename = argv[2]; 796 | string const result_dir = argv[3]; 797 | int32_t const N_TESTIMAGES = atoi(argv[4]); 798 | 799 | // init notification mail 800 | Mail *mail = new Mail(); 801 | mail->msg("Thank you for participating in our evaluation!"); 802 | 803 | // run evaluation 804 | if (eval( result_dir, image_set_filename, gt_dir, mail, N_TESTIMAGES )) { 805 | mail->msg( ("Your evaluation results are available in " + result_dir).c_str() ); 806 | } else { 807 | mail->msg("An error occurred while processing your results."); 808 | mail->msg("Please make sure that the data in your zip archive has the right format!"); 809 | } 810 | 811 | // send mail and exit 812 | delete mail; 813 | 814 | return 0; 815 | } 816 | 817 | --------------------------------------------------------------------------------
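A minimal sketch (not part of the repo) of the dense-tensor assembly that train.py's _load_data() performs before feeding the network. The sparse_to_dense() stand-in below is written from how train.py calls it (indices, output shape, values -> dense array); the repo's own helper lives in src/utils/util.py and may differ in detail.

import numpy as np

def sparse_to_dense(sp_indices, output_shape, values, default_value=0):
  # write each value at its multi-dimensional index; all other entries keep the default
  array = np.full(output_shape, default_value, dtype=np.float32)
  for idx, value in zip(sp_indices, values):
    array[tuple(idx)] = value
  return array

# toy setup: a batch of 2 images with 4 anchors each; one labeled object per
# image, assigned to anchor 1 and anchor 3 respectively (hypothetical numbers)
BATCH_SIZE, ANCHORS = 2, 4
mask_indices = [[0, 1], [1, 3]]

input_mask = np.reshape(
    sparse_to_dense(mask_indices, [BATCH_SIZE, ANCHORS],
                    [1.0]*len(mask_indices)),
    [BATCH_SIZE, ANCHORS, 1])
print(input_mask[:, :, 0])
# [[0. 1. 0. 0.]
#  [0. 0. 0. 1.]]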