├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── configs ├── faster_rcnn_inception_resnet_v2_atrous_egohands.config ├── faster_rcnn_inception_v2_egohands.config ├── faster_rcnn_resnet101_egohands.config ├── faster_rcnn_resnet50_egohands.config ├── gcp-run.sh ├── rfcn_resnet101_egohands.config ├── ssd_inception_v2_egohands.config ├── ssd_mobilenet_v1_egohands.config ├── ssd_mobilenet_v2_egohands.config ├── ssd_resnet50_v1_fpn_egohands.config ├── ssdlite_mobilenet_v2_egohands.config └── training_results.txt ├── create_kitti_tf_record.py ├── create_tfrecords.sh ├── data ├── egohands_label_map.pbtxt └── jk-son-hands.jpg ├── detect_image.py ├── detect_image.sh ├── doc ├── eval.png ├── loss_curve_1.png ├── ssdlite_mobilenet_v2_result.jpg └── training.log ├── download_pretrained_models.sh ├── eval.sh ├── export.sh ├── install.sh ├── prepare_egohands.py ├── requirements.txt └── train.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # TensorFlow object detection installation stuffs 10 | *.tar.gz 11 | cocoapi/ 12 | protoc-3.5.1/ 13 | ssd_*/ 14 | ssdlite_*/ 15 | rfcn_*/ 16 | faster_rcnn_*/ 17 | 18 | # Project specifics 19 | egohands_data.zip 20 | egohands/ 21 | egohands_kitti_formatted/ 22 | *.tfrecord 23 | model_exported/ 24 | detection_output.jpg 25 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "models"] 2 | path = models 3 | url = https://github.com/tensorflow/models 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 JK Jung 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Hand Detection Tutorial 2 | ======================= 3 | 4 | This is a tutorial on how to train a 'hand detector' with TensorFlow object detection API. This README outlines how to set up everything and train the object detection model locally. 
You could refer to the following blog post for more detailed description about the steps within. 5 | 6 | * [Training a Hand Detector with TensorFlow Object Detection API](https://jkjung-avt.github.io/hand-detection-tutorial/) 7 | * [Adapting the Hand Detector Tutorial to Your Own Data](https://jkjung-avt.github.io/object-detection-tutorial/) 8 | 9 | 10 | Table of contents 11 | ----------------- 12 | 13 | * [Setup](#setup) 14 | * [Training](#training) 15 | * [Evaluating the trained model](#evluating) 16 | * [Testing the trained model with an image](#testing) 17 | * [Deploying the trained model onto Jetson TX2](#deploying) 18 | 19 | 20 | 21 | Setup 22 | ----- 23 | 24 | Just for reference, the code in this repository has been tested on a desktop PC with: 25 | 26 | * NVIDIA GeForce GTX-1080Ti 27 | * Ubuntu 16.04.5 LTS (x86_64) 28 | * CUDA 9.2 29 | * cuDNN 7.1.4 30 | * TensorFlow 1.10.0 31 | 32 | This tutorial uses python3 for training and testing the TensorFlow object detection models. Follow the steps below to set up the environment for training the models. Make sure `tensorflow-gpu` or `tensorflow` (python3 packages) has been installed on the system already. 33 | 34 | 1. Clone this repository. 35 | 36 | ```shell 37 | $ cd ~/project 38 | $ git clone https://github.com/jkjung-avt/hand-detection-tutorial.git 39 | $ cd hand-detection-tutorial 40 | ``` 41 | 42 | 2. Install required python3 packages. 43 | 44 | ```shell 45 | $ sudo pip3 install -r requirements.txt 46 | ``` 47 | 48 | In case you are having trouble with `sudo`, you can do `pip3 install --user -r requirements.txt` instead. 49 | 50 | 3. Run the installation script. Make sure the last step in the script, `Running model_builder_test.py`, finishes without error, before continuing on. 51 | 52 | ```shell 53 | $ ./install.sh 54 | ``` 55 | 56 | 4. Download pretrained models from TensorFlow Object Detection Model Zoo. 57 | 58 | ```shell 59 | $ ./download_pretrained_models.sh 60 | ``` 61 | 62 | 63 | 64 | Training 65 | -------- 66 | 67 | 1. Prepare the 'egohands' dataset. 68 | 69 | ```shell 70 | $ python3 prepare_egohands.py 71 | ``` 72 | 73 | The `prepare_egohands.py` script downloads the 'egohands' dataset and convert its annotations to KITTI format. When finished, the following files should be present in the folder. Note there are totally 4,800 jpg images in the 'egohands' dataset. 74 | 75 | ``` 76 | ./egohands_data.zip 77 | ./egohands 78 | ├── (egohands dataset unzipped) 79 | └── ...... 80 | ./egohands_kitti_formatted 81 | ├── images 82 | │ ├── CARDS_COURTYARD_B_T_frame_0011.jpg 83 | │ ├── ...... 84 | │ └── PUZZLE_OFFICE_T_S_frame_2697.jpg 85 | └── labels 86 | ├── CARDS_COURTYARD_B_T_frame_0011.txt 87 | ├── ...... 88 | └── PUZZLE_OFFICE_T_S_frame_2697.txt 89 | ``` 90 | 91 | 2. Create the TFRecord files (train/val) needed to train the object detection model. The `create_tfrecords.py` script would split the jpg images into 'train' (4,300) and 'val' (500) sets, and then generate `data/egohands_train.tfrecord` and `data/egohands_val.tfrecord`. This process might take a few minutes. The resulting TFRecord files are roughly 1.1GB and 132MB in size. 92 | 93 | ```shell 94 | $ ./create_tfrecords.sh 95 | ``` 96 | 97 | 3. (Optional) Review and modify the model config file if necessary. For example, open the file `configs/ssd_mobilenet_v1_egohands.config` with an editor and do some editing. 98 | 99 | 4. Start training the model by invoking `./train.sh `. For example, to train the detector based on ssd_mobilenet_v1. 
Do this: 100 | 101 | ```shell 102 | $ ./train.sh ssd_mobilenet_v1_egohands 103 | ``` 104 | 105 | The training is set to run for 20,000 iterations. It takes roughly 2 hours to finish on the desktop PC listed above. 106 | 107 | If you have multiple GPUs, you could specify which GPU to use for the training with the [`CUDA_VISIBLE_DEVICES`](https://www.tensorflow.org/guide/using_gpu) environment variable. For example, the following command starts a training session for the `faster_rcnn_inception_v2_egohands` model on the 2nd GPU (GPU #1). 108 | 109 | ```shell 110 | $ CUDA_VISIBLE_DEVICES=1 ./train.sh faster_rcnn_inception_v2_egohands 111 | ``` 112 | 113 | 5. Monitor the progress of training with TensorBoard, by executing `tensorboard` in another terminal. 114 | 115 | ```shell 116 | $ cd ~/project/hand-detection-tutorial 117 | $ tensorboard --logdir=ssd_mobilenet_v1_egohands 118 | ``` 119 | 120 | Then open `http://localhost:6006` with a browser locally. (You could also replace `localhost` with IP address of the training PC, and do the monitoring remotely.) 121 | 122 |

123 | *TensorBoard showing learning rate and loss curve of ssd_mobilenet_v1_egohands* (see `doc/loss_curve_1.png`) 124 |
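(Optional) If the loss curve looks wrong or training misbehaves, a quick sanity check on the TFRecord files generated in step 2 can rule out a bad train/val split. The snippet below is a minimal sketch and not one of this repo's scripts; it assumes the default output paths named in step 2 and uses the TF 1.x `tf.python_io.tf_record_iterator` API, which matches the TensorFlow 1.10 environment listed in the Setup section.

```python
# count_tfrecords.py -- sanity-check the TFRecord files created by create_tfrecords.sh
# (illustrative sketch only, not part of this repository)
import tensorflow as tf

def count_records(path):
    """Return the number of serialized examples in a TFRecord file."""
    return sum(1 for _ in tf.python_io.tf_record_iterator(path))

if __name__ == '__main__':
    for name in ('data/egohands_train.tfrecord', 'data/egohands_val.tfrecord'):
        print('%s: %d examples' % (name, count_records(name)))
    # Expected counts, per the split described in step 2: 4,300 (train) and 500 (val).
```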

125 | 126 | 127 | 128 | Evaluating the trained model 129 | ---------------------------- 130 | 131 | * The trained model could be evaluated by simply executing the `./eval.sh` script. For example, 132 | 133 | ```shell 134 | # similar to train.sh, use 'CUDA_VISIBLE_DEVICES' to specify GPU 135 | $ ./eval.sh ssd_mobilenet_v1_egohands 136 | ``` 137 | 138 | Here's an example of the evaluation output. Among all the numbers, the author would pay most attention to the 'AP @ IoU=0.50' value (0.967). 139 | 140 | ``` 141 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.681 142 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.967 143 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.809 144 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.079 145 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.313 146 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.717 147 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.258 148 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.736 149 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.742 150 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.118 151 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.466 152 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.774 153 | ``` 154 | 155 | In addition, you could run `tensorboard` to inspect details of the evaluation. Note `logdir` points to the 'eval' subdirectory below. 156 | 157 | ```shell 158 | $ cd ~/project/hand-detection-tutorial 159 | $ tensorboard --logdir=ssd_mobilenet_v1_egohands_eval 160 | ``` 161 | 162 | Again, open `http://localhost:6006` or `http://<IP address of the training PC>:6006` with a browser. Click on the 'IMAGES' tab. You can then browse through all images in the validation set and check how well your trained model performs on those images. 163 | 164 |

165 | *TensorBoard showing evaluation result of ssd_mobilenet_v1_egohands* (see `doc/eval.png`) 166 |
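Every AP and AR number in the evaluation output above is gated on IoU (intersection over union) between a detected box and a ground-truth box, so 'AP @ IoU=0.50' means a detection only counts as correct if it overlaps a labeled hand by at least 50%. If the metric is unfamiliar, the following self-contained sketch (illustrative only, not code from this repo) shows how IoU is computed for two axis-aligned boxes.

```python
# iou.py -- illustrative only: how the IoU threshold in the COCO metrics above is defined
def iou(box_a, box_b):
    """Intersection-over-union of two boxes given as (xmin, ymin, xmax, ymax)."""
    ix1, iy1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    ix2, iy2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0

# A detection that overlaps a ground-truth hand box reasonably well:
print(iou((100, 100, 200, 200), (120, 110, 210, 215)))  # ~0.59 -> a hit at IoU=0.50, a miss at IoU=0.75
```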

167 | 168 | 169 | 170 | Testing the trained model with an image 171 | --------------------------------------- 172 | 173 | * This repo also includes scripts to test the trained model with your own image file. For example, the following commands would convert a trained `ssdlite_mobilenet_v2_egohands` model into a frozen graph (saved under `model_exported/`), and then use the graph to detect hands in `data/jk-son-hands.jpg`. The output image, with bounding boxes overlaid, would be saved as `detection_output.jpg`. 174 | 175 | ```shell 176 | $ CUDA_VISIBLE_DEVICES=0 ./export.sh ssdlite_mobilenet_v2_egohands 177 | $ CUDA_VISIBLE_DEVICES=0 ./detect_image.sh data/jk-son-hands.jpg 178 | ``` 179 | 180 | You can then check out the output image by, say, 181 | 182 | ```shell 183 | $ display detection_output.jpg 184 | ``` 185 | 186 |

187 | *Detection result with ssdlite_mobilenet_v2_egohands* (see `doc/ssdlite_mobilenet_v2_result.jpg`) 188 |
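If you would rather call the exported detector from your own Python code instead of going through `detect_image.sh`, the sketch below shows one way to do it with the TF 1.x API. It is a hedged example, not this repo's `detect_image.py`: it assumes the export step produced `model_exported/frozen_inference_graph.pb` (the usual file name written by the object detection API's export script) and that the graph exposes the standard TensorFlow Object Detection API tensor names (`image_tensor`, `detection_boxes`, `detection_scores`, `num_detections`); adjust the path and names if your export differs.

```python
# run_frozen_graph.py -- minimal sketch (not part of this repo) for running the
# exported hand detector from Python with the TF 1.x API.
import numpy as np
import tensorflow as tf
from PIL import Image

PB_PATH = 'model_exported/frozen_inference_graph.pb'   # assumed output of ./export.sh

# Load the frozen graph.
graph = tf.Graph()
with graph.as_default():
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(PB_PATH, 'rb') as f:
        graph_def.ParseFromString(f.read())
    tf.import_graph_def(graph_def, name='')

# Run detection on a single image.
with tf.Session(graph=graph) as sess:
    image = np.array(Image.open('data/jk-son-hands.jpg'))      # HxWx3, RGB, uint8
    boxes, scores, num = sess.run(
        ['detection_boxes:0', 'detection_scores:0', 'num_detections:0'],
        feed_dict={'image_tensor:0': image[np.newaxis, ...]})  # add a batch dimension
    # Boxes come back as normalized (ymin, xmin, ymax, xmax) coordinates in [0, 1].
    for i in range(int(num[0])):
        if scores[0][i] >= 0.3:   # arbitrary confidence threshold for display
            print('hand:', boxes[0][i], 'score: %.2f' % scores[0][i])
```

For deployment-oriented inference (e.g. TensorRT on Jetson), see the links in the next section instead.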

189 | 190 | 191 | 192 | Deploying the trained model onto Jetson TX2/Nano 193 | ------------------------------------------------ 194 | 195 | Please refer to the following GitHub repos and blog posts. 196 | 197 | * Demo #3 ('ssd') of [jkjung-avt/tensorrt_demos](https://github.com/jkjung-avt/tensorrt_demos) -> The trained 'ssd_mobilenet_v1_egohands' model could run as fast as ~31 frames per seconds (FPS) on Jetson Nano using 'trt_ssd_async.py'! 198 | * [jkjung-avt/tf_trt_models](https://github.com/jkjung-avt/tf_trt_models) 199 | * [Deploying the Hand Detector onto Jetson TX2](https://jkjung-avt.github.io/hand-detection-on-tx2/) 200 | * [TensorFlow/TensorRT (TF-TRT) Revisited](https://jkjung-avt.github.io/tf-trt-revisited/) 201 | * [Testing TF-TRT Object Detectors on Jetson Nano](https://jkjung-avt.github.io/tf-trt-on-nano/) 202 | -------------------------------------------------------------------------------- /configs/faster_rcnn_inception_resnet_v2_atrous_egohands.config: -------------------------------------------------------------------------------- 1 | # Faster R-CNN with Inception Resnet v2, Atrous version; 2 | # Configured for egohands dataset. 3 | # Users should configure the fine_tune_checkpoint field in the train config as 4 | # well as the label_map_path and input_path fields in the train_input_reader and 5 | # eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that 6 | # should be configured. 7 | 8 | model { 9 | faster_rcnn { 10 | num_classes: 1 11 | image_resizer { 12 | keep_aspect_ratio_resizer { 13 | min_dimension: 600 14 | max_dimension: 1024 15 | } 16 | } 17 | feature_extractor { 18 | type: 'faster_rcnn_inception_resnet_v2' 19 | first_stage_features_stride: 8 20 | } 21 | first_stage_anchor_generator { 22 | grid_anchor_generator { 23 | scales: [0.25, 0.5, 1.0, 2.0] 24 | aspect_ratios: [0.5, 1.0, 2.0] 25 | height_stride: 8 26 | width_stride: 8 27 | } 28 | } 29 | first_stage_atrous_rate: 2 30 | first_stage_box_predictor_conv_hyperparams { 31 | op: CONV 32 | regularizer { 33 | l2_regularizer { 34 | weight: 0.0 35 | } 36 | } 37 | initializer { 38 | truncated_normal_initializer { 39 | stddev: 0.01 40 | } 41 | } 42 | } 43 | first_stage_nms_score_threshold: 0.0 44 | first_stage_nms_iou_threshold: 0.7 45 | first_stage_max_proposals: 300 46 | first_stage_localization_loss_weight: 2.0 47 | first_stage_objectness_loss_weight: 1.0 48 | initial_crop_size: 17 49 | maxpool_kernel_size: 1 50 | maxpool_stride: 1 51 | second_stage_box_predictor { 52 | mask_rcnn_box_predictor { 53 | use_dropout: false 54 | dropout_keep_probability: 1.0 55 | fc_hyperparams { 56 | op: FC 57 | regularizer { 58 | l2_regularizer { 59 | weight: 0.0 60 | } 61 | } 62 | initializer { 63 | variance_scaling_initializer { 64 | factor: 1.0 65 | uniform: true 66 | mode: FAN_AVG 67 | } 68 | } 69 | } 70 | } 71 | } 72 | second_stage_post_processing { 73 | batch_non_max_suppression { 74 | score_threshold: 0.0 75 | iou_threshold: 0.6 76 | max_detections_per_class: 100 77 | max_total_detections: 100 78 | } 79 | score_converter: SOFTMAX 80 | } 81 | second_stage_localization_loss_weight: 2.0 82 | second_stage_classification_loss_weight: 1.0 83 | } 84 | } 85 | 86 | train_config: { 87 | batch_size: 1 88 | optimizer { 89 | momentum_optimizer: { 90 | learning_rate: { 91 | manual_step_learning_rate { 92 | initial_learning_rate: 0.0003 93 | schedule { 94 | step: 30000 95 | learning_rate: .00003 96 | } 97 | schedule { 98 | step: 48000 99 | learning_rate: .000003 100 | } 101 | } 102 | } 103 | momentum_optimizer_value: 
0.9 104 | } 105 | use_moving_average: false 106 | } 107 | gradient_clipping_by_norm: 10.0 108 | fine_tune_checkpoint: "faster_rcnn_inception_resnet_v2_atrous_coco_2018_01_28/model.ckpt" 109 | from_detection_checkpoint: true 110 | # Note: The below line limits the training process to 200K steps, which we 111 | # empirically found to be sufficient enough to train the pets dataset. This 112 | # effectively bypasses the learning rate schedule (the learning rate will 113 | # never decay). Remove the below line to train indefinitely. 114 | num_steps: 50000 115 | data_augmentation_options { 116 | random_horizontal_flip { 117 | } 118 | } 119 | } 120 | 121 | train_input_reader: { 122 | tf_record_input_reader { 123 | input_path: "data/egohands_train.tfrecord" 124 | } 125 | label_map_path: "data/egohands_label_map.pbtxt" 126 | } 127 | 128 | eval_config: { 129 | num_examples: 500 130 | # Note: The below line limits the evaluation process to 10 evaluations. 131 | # Remove the below line to evaluate indefinitely. 132 | max_evals: 10 133 | } 134 | 135 | eval_input_reader: { 136 | tf_record_input_reader { 137 | input_path: "data/egohands_val.tfrecord" 138 | } 139 | label_map_path: "data/egohands_label_map.pbtxt" 140 | shuffle: false 141 | num_readers: 1 142 | } 143 | -------------------------------------------------------------------------------- /configs/faster_rcnn_inception_v2_egohands.config: -------------------------------------------------------------------------------- 1 | # Faster R-CNN with Inception v2, configured for egohands dataset. 2 | # Users should configure the fine_tune_checkpoint field in the train config as 3 | # well as the label_map_path and input_path fields in the train_input_reader and 4 | # eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that 5 | # should be configured. 
6 | 7 | model { 8 | faster_rcnn { 9 | num_classes: 1 10 | image_resizer { 11 | keep_aspect_ratio_resizer { 12 | min_dimension: 600 13 | max_dimension: 1024 14 | } 15 | } 16 | feature_extractor { 17 | type: 'faster_rcnn_inception_v2' 18 | first_stage_features_stride: 16 19 | } 20 | first_stage_anchor_generator { 21 | grid_anchor_generator { 22 | scales: [0.25, 0.5, 1.0, 2.0] 23 | aspect_ratios: [0.5, 1.0, 2.0] 24 | height_stride: 16 25 | width_stride: 16 26 | } 27 | } 28 | first_stage_box_predictor_conv_hyperparams { 29 | op: CONV 30 | regularizer { 31 | l2_regularizer { 32 | weight: 0.0 33 | } 34 | } 35 | initializer { 36 | truncated_normal_initializer { 37 | stddev: 0.01 38 | } 39 | } 40 | } 41 | first_stage_nms_score_threshold: 0.0 42 | first_stage_nms_iou_threshold: 0.7 43 | first_stage_max_proposals: 300 44 | first_stage_localization_loss_weight: 2.0 45 | first_stage_objectness_loss_weight: 1.0 46 | initial_crop_size: 14 47 | maxpool_kernel_size: 2 48 | maxpool_stride: 2 49 | second_stage_box_predictor { 50 | mask_rcnn_box_predictor { 51 | use_dropout: false 52 | dropout_keep_probability: 1.0 53 | fc_hyperparams { 54 | op: FC 55 | regularizer { 56 | l2_regularizer { 57 | weight: 0.0 58 | } 59 | } 60 | initializer { 61 | variance_scaling_initializer { 62 | factor: 1.0 63 | uniform: true 64 | mode: FAN_AVG 65 | } 66 | } 67 | } 68 | } 69 | } 70 | second_stage_post_processing { 71 | batch_non_max_suppression { 72 | score_threshold: 0.0 73 | iou_threshold: 0.6 74 | max_detections_per_class: 100 75 | max_total_detections: 300 76 | } 77 | score_converter: SOFTMAX 78 | } 79 | second_stage_localization_loss_weight: 2.0 80 | second_stage_classification_loss_weight: 1.0 81 | } 82 | } 83 | 84 | train_config: { 85 | batch_size: 1 86 | optimizer { 87 | momentum_optimizer: { 88 | learning_rate: { 89 | manual_step_learning_rate { 90 | initial_learning_rate: 0.0002 91 | schedule { 92 | step: 30000 93 | learning_rate: .00002 94 | } 95 | schedule { 96 | step: 48000 97 | learning_rate: .000002 98 | } 99 | } 100 | } 101 | momentum_optimizer_value: 0.9 102 | } 103 | use_moving_average: false 104 | } 105 | gradient_clipping_by_norm: 10.0 106 | fine_tune_checkpoint: "faster_rcnn_inception_v2_coco_2018_01_28/model.ckpt" 107 | from_detection_checkpoint: true 108 | # Note: The below line limits the training process to 200K steps, which we 109 | # empirically found to be sufficient enough to train the pets dataset. This 110 | # effectively bypasses the learning rate schedule (the learning rate will 111 | # never decay). Remove the below line to train indefinitely. 112 | num_steps: 50000 113 | data_augmentation_options { 114 | random_horizontal_flip { 115 | } 116 | } 117 | } 118 | 119 | 120 | train_input_reader: { 121 | tf_record_input_reader { 122 | input_path: "data/egohands_train.tfrecord" 123 | } 124 | label_map_path: "data/egohands_label_map.pbtxt" 125 | } 126 | 127 | eval_config: { 128 | num_examples: 500 129 | # Note: The below line limits the evaluation process to 10 evaluations. 130 | # Remove the below line to evaluate indefinitely. 
131 | max_evals: 10 132 | } 133 | 134 | eval_input_reader: { 135 | tf_record_input_reader { 136 | input_path: "data/egohands_val.tfrecord" 137 | } 138 | label_map_path: "data/egohands_label_map.pbtxt" 139 | shuffle: false 140 | num_readers: 1 141 | } 142 | -------------------------------------------------------------------------------- /configs/faster_rcnn_resnet101_egohands.config: -------------------------------------------------------------------------------- 1 | # Faster R-CNN with Resnet-101 (v1), configuration for egohands dataset. 2 | # Users should configure the fine_tune_checkpoint field in the train config as 3 | # well as the label_map_path and input_path fields in the train_input_reader and 4 | # eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that 5 | # should be configured. 6 | 7 | model { 8 | faster_rcnn { 9 | num_classes: 1 10 | image_resizer { 11 | keep_aspect_ratio_resizer { 12 | min_dimension: 600 13 | max_dimension: 1024 14 | } 15 | } 16 | feature_extractor { 17 | type: 'faster_rcnn_resnet101' 18 | first_stage_features_stride: 16 19 | } 20 | first_stage_anchor_generator { 21 | grid_anchor_generator { 22 | scales: [0.25, 0.5, 1.0, 2.0] 23 | aspect_ratios: [0.5, 1.0, 2.0] 24 | height_stride: 16 25 | width_stride: 16 26 | } 27 | } 28 | first_stage_box_predictor_conv_hyperparams { 29 | op: CONV 30 | regularizer { 31 | l2_regularizer { 32 | weight: 0.0 33 | } 34 | } 35 | initializer { 36 | truncated_normal_initializer { 37 | stddev: 0.01 38 | } 39 | } 40 | } 41 | first_stage_nms_score_threshold: 0.0 42 | first_stage_nms_iou_threshold: 0.7 43 | first_stage_max_proposals: 300 44 | first_stage_localization_loss_weight: 2.0 45 | first_stage_objectness_loss_weight: 1.0 46 | initial_crop_size: 14 47 | maxpool_kernel_size: 2 48 | maxpool_stride: 2 49 | second_stage_box_predictor { 50 | mask_rcnn_box_predictor { 51 | use_dropout: false 52 | dropout_keep_probability: 1.0 53 | fc_hyperparams { 54 | op: FC 55 | regularizer { 56 | l2_regularizer { 57 | weight: 0.0 58 | } 59 | } 60 | initializer { 61 | variance_scaling_initializer { 62 | factor: 1.0 63 | uniform: true 64 | mode: FAN_AVG 65 | } 66 | } 67 | } 68 | } 69 | } 70 | second_stage_post_processing { 71 | batch_non_max_suppression { 72 | score_threshold: 0.0 73 | iou_threshold: 0.6 74 | max_detections_per_class: 100 75 | max_total_detections: 300 76 | } 77 | score_converter: SOFTMAX 78 | } 79 | second_stage_localization_loss_weight: 2.0 80 | second_stage_classification_loss_weight: 1.0 81 | } 82 | } 83 | 84 | train_config: { 85 | batch_size: 1 86 | optimizer { 87 | momentum_optimizer: { 88 | learning_rate: { 89 | manual_step_learning_rate { 90 | initial_learning_rate: 0.0003 91 | schedule { 92 | step: 30000 93 | learning_rate: .00003 94 | } 95 | schedule { 96 | step: 48000 97 | learning_rate: .000003 98 | } 99 | } 100 | } 101 | momentum_optimizer_value: 0.9 102 | } 103 | use_moving_average: false 104 | } 105 | gradient_clipping_by_norm: 10.0 106 | fine_tune_checkpoint: "faster_rcnn_resnet101_coco_2018_01_28/model.ckpt" 107 | from_detection_checkpoint: true 108 | num_steps: 50000 109 | data_augmentation_options { 110 | random_horizontal_flip { 111 | } 112 | } 113 | } 114 | 115 | train_input_reader: { 116 | tf_record_input_reader { 117 | input_path: "data/egohands_train.tfrecord" 118 | } 119 | label_map_path: "data/egohands_label_map.pbtxt" 120 | } 121 | 122 | eval_config: { 123 | num_examples: 500 124 | # Note: The below line limits the evaluation process to 10 evaluations. 
125 | # Remove the below line to evaluate indefinitely. 126 | max_evals: 10 127 | } 128 | 129 | eval_input_reader: { 130 | tf_record_input_reader { 131 | input_path: "data/egohands_val.tfrecord" 132 | } 133 | label_map_path: "data/egohands_label_map.pbtxt" 134 | shuffle: false 135 | num_readers: 1 136 | } 137 | -------------------------------------------------------------------------------- /configs/faster_rcnn_resnet50_egohands.config: -------------------------------------------------------------------------------- 1 | # Faster R-CNN with Resnet-50 (v1), configuration for egohands dataset. 2 | # Users should configure the fine_tune_checkpoint field in the train config as 3 | # well as the label_map_path and input_path fields in the train_input_reader and 4 | # eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that 5 | # should be configured. 6 | 7 | model { 8 | faster_rcnn { 9 | num_classes: 1 10 | image_resizer { 11 | keep_aspect_ratio_resizer { 12 | min_dimension: 600 13 | max_dimension: 1024 14 | } 15 | } 16 | feature_extractor { 17 | type: 'faster_rcnn_resnet50' 18 | first_stage_features_stride: 16 19 | } 20 | first_stage_anchor_generator { 21 | grid_anchor_generator { 22 | scales: [0.25, 0.5, 1.0, 2.0] 23 | aspect_ratios: [0.5, 1.0, 2.0] 24 | height_stride: 16 25 | width_stride: 16 26 | } 27 | } 28 | first_stage_box_predictor_conv_hyperparams { 29 | op: CONV 30 | regularizer { 31 | l2_regularizer { 32 | weight: 0.0 33 | } 34 | } 35 | initializer { 36 | truncated_normal_initializer { 37 | stddev: 0.01 38 | } 39 | } 40 | } 41 | first_stage_nms_score_threshold: 0.0 42 | first_stage_nms_iou_threshold: 0.7 43 | first_stage_max_proposals: 300 44 | first_stage_localization_loss_weight: 2.0 45 | first_stage_objectness_loss_weight: 1.0 46 | initial_crop_size: 14 47 | maxpool_kernel_size: 2 48 | maxpool_stride: 2 49 | second_stage_box_predictor { 50 | mask_rcnn_box_predictor { 51 | use_dropout: false 52 | dropout_keep_probability: 1.0 53 | fc_hyperparams { 54 | op: FC 55 | regularizer { 56 | l2_regularizer { 57 | weight: 0.0 58 | } 59 | } 60 | initializer { 61 | variance_scaling_initializer { 62 | factor: 1.0 63 | uniform: true 64 | mode: FAN_AVG 65 | } 66 | } 67 | } 68 | } 69 | } 70 | second_stage_post_processing { 71 | batch_non_max_suppression { 72 | score_threshold: 0.0 73 | iou_threshold: 0.6 74 | max_detections_per_class: 100 75 | max_total_detections: 300 76 | } 77 | score_converter: SOFTMAX 78 | } 79 | second_stage_localization_loss_weight: 2.0 80 | second_stage_classification_loss_weight: 1.0 81 | } 82 | } 83 | 84 | train_config: { 85 | batch_size: 1 86 | optimizer { 87 | momentum_optimizer: { 88 | learning_rate: { 89 | manual_step_learning_rate { 90 | initial_learning_rate: 0.0003 91 | schedule { 92 | step: 30000 93 | learning_rate: .00003 94 | } 95 | schedule { 96 | step: 48000 97 | learning_rate: .000003 98 | } 99 | } 100 | } 101 | momentum_optimizer_value: 0.9 102 | } 103 | use_moving_average: false 104 | } 105 | gradient_clipping_by_norm: 10.0 106 | fine_tune_checkpoint: "faster_rcnn_resnet50_coco_2018_01_28/model.ckpt" 107 | from_detection_checkpoint: true 108 | # Note: The below line limits the training process to 200K steps, which we 109 | # empirically found to be sufficient enough to train the pets dataset. This 110 | # effectively bypasses the learning rate schedule (the learning rate will 111 | # never decay). Remove the below line to train indefinitely. 
112 | num_steps: 50000 113 | data_augmentation_options { 114 | random_horizontal_flip { 115 | } 116 | } 117 | } 118 | 119 | train_input_reader: { 120 | tf_record_input_reader { 121 | input_path: "data/egohands_train.tfrecord" 122 | } 123 | label_map_path: "data/egohands_label_map.pbtxt" 124 | } 125 | 126 | eval_config: { 127 | num_examples: 500 128 | # Note: The below line limits the evaluation process to 10 evaluations. 129 | # Remove the below line to evaluate indefinitely. 130 | max_evals: 10 131 | } 132 | 133 | eval_input_reader: { 134 | tf_record_input_reader { 135 | input_path: "data/egohands_val.tfrecord" 136 | } 137 | label_map_path: "data/egohands_label_map.pbtxt" 138 | shuffle: false 139 | num_readers: 1 140 | } 141 | -------------------------------------------------------------------------------- /configs/gcp-run.sh: -------------------------------------------------------------------------------- 1 | # export YOUR_GCS_BUCKET=oxford-iiit-pets-dataset 2 | 3 | gcloud ml-engine jobs submit training `whoami`_object_detection_pets_`date +%m_%d_%Y_%H_%M_%S` \ 4 | --runtime-version 1.9 \ 5 | --job-dir=gs://${YOUR_GCS_BUCKET}/model_dir \ 6 | --packages /home/jkjung/src/tensorflow/models/research/dist/object_detection-0.1.tar.gz,/home/jkjung/src/tensorflow/models/research/slim/dist/slim-0.1.tar.gz,/tmp/pycocotools/pycocotools-2.0.tar.gz \ 7 | --module-name object_detection.model_main \ 8 | --region asia-east1 \ 9 | --config cloud-ssd_mobilenet_v1_pets.yml \ 10 | -- \ 11 | --model_dir=gs://${YOUR_GCS_BUCKET}/model_dir \ 12 | --pipeline_config_path=gs://${YOUR_GCS_BUCKET}/data/ssd_mobilenet_v1_pets.config 13 | -------------------------------------------------------------------------------- /configs/rfcn_resnet101_egohands.config: -------------------------------------------------------------------------------- 1 | # R-FCN with Resnet-101 (v1), configuration for egohands dataset. 2 | # Users should configure the fine_tune_checkpoint field in the train config as 3 | # well as the label_map_path and input_path fields in the train_input_reader and 4 | # eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that 5 | # should be configured. 
6 | 7 | model { 8 | faster_rcnn { 9 | num_classes: 1 10 | image_resizer { 11 | keep_aspect_ratio_resizer { 12 | min_dimension: 600 13 | max_dimension: 1024 14 | } 15 | } 16 | feature_extractor { 17 | type: 'faster_rcnn_resnet101' 18 | first_stage_features_stride: 16 19 | } 20 | first_stage_anchor_generator { 21 | grid_anchor_generator { 22 | scales: [0.25, 0.5, 1.0, 2.0] 23 | aspect_ratios: [0.5, 1.0, 2.0] 24 | height_stride: 16 25 | width_stride: 16 26 | } 27 | } 28 | first_stage_box_predictor_conv_hyperparams { 29 | op: CONV 30 | regularizer { 31 | l2_regularizer { 32 | weight: 0.0 33 | } 34 | } 35 | initializer { 36 | truncated_normal_initializer { 37 | stddev: 0.01 38 | } 39 | } 40 | } 41 | first_stage_nms_score_threshold: 0.0 42 | first_stage_nms_iou_threshold: 0.7 43 | first_stage_max_proposals: 300 44 | first_stage_localization_loss_weight: 2.0 45 | first_stage_objectness_loss_weight: 1.0 46 | second_stage_box_predictor { 47 | rfcn_box_predictor { 48 | conv_hyperparams { 49 | op: CONV 50 | regularizer { 51 | l2_regularizer { 52 | weight: 0.0 53 | } 54 | } 55 | initializer { 56 | truncated_normal_initializer { 57 | stddev: 0.01 58 | } 59 | } 60 | } 61 | crop_height: 18 62 | crop_width: 18 63 | num_spatial_bins_height: 3 64 | num_spatial_bins_width: 3 65 | } 66 | } 67 | second_stage_post_processing { 68 | batch_non_max_suppression { 69 | score_threshold: 0.0 70 | iou_threshold: 0.6 71 | max_detections_per_class: 100 72 | max_total_detections: 300 73 | } 74 | score_converter: SOFTMAX 75 | } 76 | second_stage_localization_loss_weight: 2.0 77 | second_stage_classification_loss_weight: 1.0 78 | } 79 | } 80 | 81 | train_config: { 82 | batch_size: 1 83 | optimizer { 84 | momentum_optimizer: { 85 | learning_rate: { 86 | manual_step_learning_rate { 87 | initial_learning_rate: 0.0003 88 | schedule { 89 | step: 30000 90 | learning_rate: .00003 91 | } 92 | schedule { 93 | step: 48000 94 | learning_rate: .000003 95 | } 96 | } 97 | } 98 | momentum_optimizer_value: 0.9 99 | } 100 | use_moving_average: false 101 | } 102 | gradient_clipping_by_norm: 10.0 103 | fine_tune_checkpoint: "rfcn_resnet101_coco_2018_01_28/model.ckpt" 104 | from_detection_checkpoint: true 105 | # Note: The below line limits the training process to 200K steps, which we 106 | # empirically found to be sufficient enough to train the pets dataset. This 107 | # effectively bypasses the learning rate schedule (the learning rate will 108 | # never decay). Remove the below line to train indefinitely. 109 | num_steps: 50000 110 | data_augmentation_options { 111 | random_horizontal_flip { 112 | } 113 | } 114 | } 115 | 116 | train_input_reader: { 117 | tf_record_input_reader { 118 | input_path: "data/egohands_train.tfrecord" 119 | } 120 | label_map_path: "data/egohands_label_map.pbtxt" 121 | } 122 | 123 | eval_config: { 124 | num_examples: 500 125 | # Note: The below line limits the evaluation process to 10 evaluations. 126 | # Remove the below line to evaluate indefinitely. 127 | max_evals: 10 128 | } 129 | 130 | eval_input_reader: { 131 | tf_record_input_reader { 132 | input_path: "data/egohands_val.tfrecord" 133 | } 134 | label_map_path: "data/egohands_label_map.pbtxt" 135 | shuffle: false 136 | num_readers: 1 137 | } 138 | -------------------------------------------------------------------------------- /configs/ssd_inception_v2_egohands.config: -------------------------------------------------------------------------------- 1 | # SSD with Inception v2, configured for egohands dataset. 
2 | # Users should configure the fine_tune_checkpoint field in the train config as 3 | # well as the label_map_path and input_path fields in the train_input_reader and 4 | # eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that 5 | # should be configured. 6 | 7 | model { 8 | ssd { 9 | num_classes: 1 10 | box_coder { 11 | faster_rcnn_box_coder { 12 | y_scale: 10.0 13 | x_scale: 10.0 14 | height_scale: 5.0 15 | width_scale: 5.0 16 | } 17 | } 18 | matcher { 19 | argmax_matcher { 20 | matched_threshold: 0.5 21 | unmatched_threshold: 0.5 22 | ignore_thresholds: false 23 | negatives_lower_than_unmatched: true 24 | force_match_for_each_row: true 25 | } 26 | } 27 | similarity_calculator { 28 | iou_similarity { 29 | } 30 | } 31 | anchor_generator { 32 | ssd_anchor_generator { 33 | num_layers: 6 34 | min_scale: 0.1 # Use a smaller min_scale so that our trained model might be able to detect smaller objects (hands) more accurately 35 | max_scale: 0.9 36 | aspect_ratios: 1.0 37 | aspect_ratios: 2.0 38 | aspect_ratios: 0.5 39 | aspect_ratios: 3.0 40 | aspect_ratios: 0.3333 41 | reduce_boxes_in_lowest_layer: true 42 | } 43 | } 44 | image_resizer { 45 | fixed_shape_resizer { 46 | height: 300 47 | width: 300 48 | } 49 | } 50 | box_predictor { 51 | convolutional_box_predictor { 52 | min_depth: 0 53 | max_depth: 0 54 | num_layers_before_predictor: 0 55 | use_dropout: false 56 | dropout_keep_probability: 0.8 57 | kernel_size: 3 58 | box_code_size: 4 59 | apply_sigmoid_to_scores: false 60 | conv_hyperparams { 61 | activation: RELU_6, 62 | regularizer { 63 | l2_regularizer { 64 | weight: 0.00004 65 | } 66 | } 67 | initializer { 68 | truncated_normal_initializer { 69 | stddev: 0.03 70 | mean: 0.0 71 | } 72 | } 73 | } 74 | } 75 | } 76 | feature_extractor { 77 | type: 'ssd_inception_v2' 78 | min_depth: 16 79 | depth_multiplier: 1.0 80 | conv_hyperparams { 81 | activation: RELU_6, 82 | regularizer { 83 | l2_regularizer { 84 | weight: 0.00004 85 | } 86 | } 87 | initializer { 88 | truncated_normal_initializer { 89 | stddev: 0.03 90 | mean: 0.0 91 | } 92 | } 93 | batch_norm { 94 | train: true, 95 | scale: true, 96 | center: true, 97 | decay: 0.9997, 98 | epsilon: 0.001, 99 | } 100 | } 101 | override_base_feature_extractor_hyperparams: true 102 | } 103 | loss { 104 | classification_loss { 105 | weighted_sigmoid { 106 | } 107 | } 108 | localization_loss { 109 | weighted_smooth_l1 { 110 | } 111 | } 112 | hard_example_miner { 113 | num_hard_examples: 3000 114 | iou_threshold: 0.99 115 | loss_type: CLASSIFICATION 116 | max_negatives_per_positive: 3 117 | min_negatives_per_image: 0 118 | } 119 | classification_weight: 1.0 120 | localization_weight: 1.0 121 | } 122 | normalize_loss_by_num_matches: true 123 | post_processing { 124 | batch_non_max_suppression { 125 | score_threshold: 1e-8 126 | iou_threshold: 0.6 127 | max_detections_per_class: 100 128 | max_total_detections: 100 129 | } 130 | score_converter: SIGMOID 131 | } 132 | } 133 | } 134 | 135 | train_config: { 136 | batch_size: 24 137 | optimizer { 138 | rms_prop_optimizer: { 139 | learning_rate: { 140 | exponential_decay_learning_rate { 141 | initial_learning_rate: 0.004 142 | decay_steps: 1000 143 | decay_factor: 0.8 144 | } 145 | } 146 | momentum_optimizer_value: 0.9 147 | decay: 0.9 148 | epsilon: 1.0 149 | } 150 | } 151 | fine_tune_checkpoint: "ssd_inception_v2_coco_2018_01_28/model.ckpt" 152 | from_detection_checkpoint: true 153 | # Note: The below line limits the training process to 200K steps, which we 154 | # empirically found to be 
sufficient enough to train the pets dataset. This 155 | # effectively bypasses the learning rate schedule (the learning rate will 156 | # never decay). Remove the below line to train indefinitely. 157 | num_steps: 20000 158 | data_augmentation_options { 159 | random_horizontal_flip { 160 | } 161 | } 162 | data_augmentation_options { 163 | ssd_random_crop { 164 | } 165 | } 166 | } 167 | 168 | train_input_reader: { 169 | tf_record_input_reader { 170 | input_path: "data/egohands_train.tfrecord" 171 | } 172 | label_map_path: "data/egohands_label_map.pbtxt" 173 | } 174 | 175 | eval_config: { 176 | num_examples: 500 177 | # Note: The below line limits the evaluation process to 10 evaluations. 178 | # Remove the below line to evaluate indefinitely. 179 | max_evals: 10 180 | } 181 | 182 | eval_input_reader: { 183 | tf_record_input_reader { 184 | input_path: "data/egohands_val.tfrecord" 185 | } 186 | label_map_path: "data/egohands_label_map.pbtxt" 187 | shuffle: false 188 | num_readers: 1 189 | } 190 | -------------------------------------------------------------------------------- /configs/ssd_mobilenet_v1_egohands.config: -------------------------------------------------------------------------------- 1 | # SSD with Mobilenet v1, configured for egohands dataset. 2 | # Users should configure the fine_tune_checkpoint field in the train config as 3 | # well as the label_map_path and input_path fields in the train_input_reader and 4 | # eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that 5 | # should be configured. 6 | 7 | model { 8 | ssd { 9 | num_classes: 1 10 | box_coder { 11 | faster_rcnn_box_coder { 12 | y_scale: 10.0 13 | x_scale: 10.0 14 | height_scale: 5.0 15 | width_scale: 5.0 16 | } 17 | } 18 | matcher { 19 | argmax_matcher { 20 | matched_threshold: 0.5 21 | unmatched_threshold: 0.5 22 | ignore_thresholds: false 23 | negatives_lower_than_unmatched: true 24 | force_match_for_each_row: true 25 | } 26 | } 27 | similarity_calculator { 28 | iou_similarity { 29 | } 30 | } 31 | anchor_generator { 32 | ssd_anchor_generator { 33 | num_layers: 6 34 | min_scale: 0.1 # Use a smaller min_scale so that our trained model might be able to detect smaller objects (hands) more accurately 35 | max_scale: 0.9 36 | aspect_ratios: 1.0 37 | aspect_ratios: 2.0 38 | aspect_ratios: 0.5 39 | aspect_ratios: 3.0 40 | aspect_ratios: 0.3333 41 | } 42 | } 43 | image_resizer { 44 | fixed_shape_resizer { 45 | height: 300 46 | width: 300 47 | } 48 | } 49 | box_predictor { 50 | convolutional_box_predictor { 51 | min_depth: 0 52 | max_depth: 0 53 | num_layers_before_predictor: 0 54 | use_dropout: false 55 | dropout_keep_probability: 0.8 56 | kernel_size: 1 57 | box_code_size: 4 58 | apply_sigmoid_to_scores: false 59 | conv_hyperparams { 60 | activation: RELU_6, 61 | regularizer { 62 | l2_regularizer { 63 | weight: 0.00004 64 | } 65 | } 66 | initializer { 67 | truncated_normal_initializer { 68 | stddev: 0.03 69 | mean: 0.0 70 | } 71 | } 72 | batch_norm { 73 | train: true, 74 | scale: true, 75 | center: true, 76 | decay: 0.9997, 77 | epsilon: 0.001, 78 | } 79 | } 80 | } 81 | } 82 | feature_extractor { 83 | type: 'ssd_mobilenet_v1' 84 | min_depth: 16 85 | depth_multiplier: 1.0 86 | conv_hyperparams { 87 | activation: RELU_6, 88 | regularizer { 89 | l2_regularizer { 90 | weight: 0.00004 91 | } 92 | } 93 | initializer { 94 | truncated_normal_initializer { 95 | stddev: 0.03 96 | mean: 0.0 97 | } 98 | } 99 | batch_norm { 100 | train: true, 101 | scale: true, 102 | center: true, 103 | decay: 0.9997, 104 | 
epsilon: 0.001, 105 | } 106 | } 107 | } 108 | loss { 109 | classification_loss { 110 | weighted_sigmoid { 111 | } 112 | } 113 | localization_loss { 114 | weighted_smooth_l1 { 115 | } 116 | } 117 | hard_example_miner { 118 | num_hard_examples: 3000 119 | iou_threshold: 0.99 120 | loss_type: CLASSIFICATION 121 | max_negatives_per_positive: 3 122 | min_negatives_per_image: 0 123 | } 124 | classification_weight: 1.0 125 | localization_weight: 1.0 126 | } 127 | normalize_loss_by_num_matches: true 128 | post_processing { 129 | batch_non_max_suppression { 130 | score_threshold: 1e-8 131 | iou_threshold: 0.6 132 | max_detections_per_class: 100 133 | max_total_detections: 100 134 | } 135 | score_converter: SIGMOID 136 | } 137 | } 138 | } 139 | 140 | train_config: { 141 | batch_size: 24 142 | optimizer { 143 | rms_prop_optimizer: { 144 | learning_rate: { 145 | exponential_decay_learning_rate { 146 | initial_learning_rate: 0.004 147 | decay_steps: 1000 148 | decay_factor: 0.8 149 | } 150 | } 151 | momentum_optimizer_value: 0.9 152 | decay: 0.9 153 | epsilon: 1.0 154 | } 155 | } 156 | fine_tune_checkpoint: "ssd_mobilenet_v1_coco_2018_01_28/model.ckpt" 157 | from_detection_checkpoint: true 158 | load_all_detection_checkpoint_vars: true 159 | # Note: The below line limits the training process to 200K steps, which we 160 | # empirically found to be sufficient enough to train the pets dataset. This 161 | # effectively bypasses the learning rate schedule (the learning rate will 162 | # never decay). Remove the below line to train indefinitely. 163 | num_steps: 20000 164 | data_augmentation_options { 165 | random_horizontal_flip { 166 | } 167 | } 168 | data_augmentation_options { 169 | ssd_random_crop { 170 | } 171 | } 172 | } 173 | 174 | train_input_reader: { 175 | tf_record_input_reader { 176 | input_path: "data/egohands_train.tfrecord" 177 | } 178 | label_map_path: "data/egohands_label_map.pbtxt" 179 | } 180 | 181 | eval_config: { 182 | metrics_set: "coco_detection_metrics" 183 | num_examples: 500 184 | } 185 | 186 | eval_input_reader: { 187 | tf_record_input_reader { 188 | input_path: "data/egohands_val.tfrecord" 189 | } 190 | label_map_path: "data/egohands_label_map.pbtxt" 191 | shuffle: false 192 | num_readers: 1 193 | } 194 | -------------------------------------------------------------------------------- /configs/ssd_mobilenet_v2_egohands.config: -------------------------------------------------------------------------------- 1 | # SSD with Mobilenet v2, configured for egohands dataset. 
2 | # This file was extracted modified from 'pipeline.config' in 3 | # http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v2_coco_2018_03_29.tar.gz 4 | 5 | model { 6 | ssd { 7 | num_classes: 1 8 | box_coder { 9 | faster_rcnn_box_coder { 10 | y_scale: 10.0 11 | x_scale: 10.0 12 | height_scale: 5.0 13 | width_scale: 5.0 14 | } 15 | } 16 | matcher { 17 | argmax_matcher { 18 | matched_threshold: 0.5 19 | unmatched_threshold: 0.5 20 | ignore_thresholds: false 21 | negatives_lower_than_unmatched: true 22 | force_match_for_each_row: true 23 | } 24 | } 25 | similarity_calculator { 26 | iou_similarity { 27 | } 28 | } 29 | anchor_generator { 30 | ssd_anchor_generator { 31 | num_layers: 6 32 | min_scale: 0.05 33 | max_scale: 0.95 34 | aspect_ratios: 1.0 35 | aspect_ratios: 2.0 36 | aspect_ratios: 0.5 37 | aspect_ratios: 3.0 38 | aspect_ratios: 0.3333 39 | } 40 | } 41 | image_resizer { 42 | fixed_shape_resizer { 43 | height: 300 44 | width: 300 45 | } 46 | } 47 | box_predictor { 48 | convolutional_box_predictor { 49 | min_depth: 0 50 | max_depth: 0 51 | num_layers_before_predictor: 0 52 | use_dropout: false 53 | dropout_keep_probability: 0.8 54 | kernel_size: 3 55 | box_code_size: 4 56 | apply_sigmoid_to_scores: false 57 | conv_hyperparams { 58 | activation: RELU_6 59 | regularizer { 60 | l2_regularizer { 61 | weight: 0.00004 62 | } 63 | } 64 | initializer { 65 | truncated_normal_initializer { 66 | stddev: 0.03 67 | mean: 0.0 68 | } 69 | } 70 | batch_norm { 71 | train: true 72 | scale: true 73 | center: true 74 | decay: 0.9997 75 | epsilon: 0.001 76 | } 77 | } 78 | } 79 | } 80 | feature_extractor { 81 | type: "ssd_mobilenet_v2" 82 | min_depth: 16 83 | depth_multiplier: 1.0 84 | conv_hyperparams { 85 | activation: RELU_6 86 | regularizer { 87 | l2_regularizer { 88 | weight: 4e-05 89 | } 90 | } 91 | initializer { 92 | truncated_normal_initializer { 93 | stddev: 0.03 94 | mean: 0.0 95 | } 96 | } 97 | batch_norm { 98 | train: true 99 | scale: true 100 | center: true 101 | decay: 0.9997 102 | epsilon: 0.001 103 | } 104 | } 105 | #batch_norm_trainable: true 106 | use_depthwise: true 107 | } 108 | loss { 109 | classification_loss { 110 | weighted_sigmoid { 111 | } 112 | } 113 | localization_loss { 114 | weighted_smooth_l1 { 115 | } 116 | } 117 | hard_example_miner { 118 | num_hard_examples: 3000 119 | iou_threshold: 0.99 120 | loss_type: CLASSIFICATION 121 | max_negatives_per_positive: 3 122 | min_negatives_per_image: 3 123 | } 124 | classification_weight: 1.0 125 | localization_weight: 1.0 126 | } 127 | normalize_loss_by_num_matches: true 128 | post_processing { 129 | batch_non_max_suppression { 130 | score_threshold: 1e-8 131 | iou_threshold: 0.6 132 | max_detections_per_class: 100 133 | max_total_detections: 100 134 | } 135 | score_converter: SIGMOID 136 | } 137 | } 138 | } 139 | 140 | train_config { 141 | batch_size: 24 142 | optimizer { 143 | rms_prop_optimizer { 144 | learning_rate { 145 | exponential_decay_learning_rate { 146 | initial_learning_rate: 0.004 147 | decay_steps: 1000 148 | decay_factor: 0.8 149 | } 150 | } 151 | momentum_optimizer_value: 0.9 152 | decay: 0.9 153 | epsilon: 1.0 154 | } 155 | } 156 | fine_tune_checkpoint: "ssd_mobilenet_v2_coco_2018_03_29/model.ckpt" 157 | fine_tune_checkpoint_type: "detection" 158 | num_steps: 20000 159 | data_augmentation_options { 160 | random_horizontal_flip { 161 | } 162 | } 163 | data_augmentation_options { 164 | ssd_random_crop { 165 | } 166 | } 167 | } 168 | 169 | train_input_reader { 170 | tf_record_input_reader { 171 | 
input_path: "data/egohands_train.tfrecord" 172 | } 173 | label_map_path: "data/egohands_label_map.pbtxt" 174 | } 175 | 176 | eval_config { 177 | num_examples: 500 178 | max_evals: 10 179 | use_moving_averages: false 180 | } 181 | 182 | eval_input_reader { 183 | tf_record_input_reader { 184 | input_path: "data/egohands_val.tfrecord" 185 | } 186 | label_map_path: "data/egohands_label_map.pbtxt" 187 | shuffle: false 188 | num_readers: 1 189 | } 190 | -------------------------------------------------------------------------------- /configs/ssd_resnet50_v1_fpn_egohands.config: -------------------------------------------------------------------------------- 1 | # SSD with Resnet-50 (v1) and FPN, configured for egohands dataset. 2 | # This file was extracted modified from 'pipeline.config' in 3 | # http://download.tensorflow.org/models/object_detection/ssd_resnet50_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03.tar.gz 4 | 5 | model { 6 | ssd { 7 | num_classes: 1 8 | image_resizer { 9 | fixed_shape_resizer { 10 | height: 640 11 | width: 640 12 | } 13 | } 14 | feature_extractor { 15 | type: "ssd_resnet50_v1_fpn" 16 | depth_multiplier: 1.0 17 | min_depth: 16 18 | conv_hyperparams { 19 | regularizer { 20 | l2_regularizer { 21 | weight: 0.0004 22 | } 23 | } 24 | initializer { 25 | truncated_normal_initializer { 26 | mean: 0.0 27 | stddev: 0.03 28 | } 29 | } 30 | activation: RELU_6 31 | batch_norm { 32 | decay: 0.997 33 | scale: true 34 | epsilon: 0.001 35 | } 36 | } 37 | override_base_feature_extractor_hyperparams: true 38 | } 39 | box_coder { 40 | faster_rcnn_box_coder { 41 | y_scale: 10.0 42 | x_scale: 10.0 43 | height_scale: 5.0 44 | width_scale: 5.0 45 | } 46 | } 47 | matcher { 48 | argmax_matcher { 49 | matched_threshold: 0.5 50 | unmatched_threshold: 0.5 51 | ignore_thresholds: false 52 | negatives_lower_than_unmatched: true 53 | force_match_for_each_row: true 54 | use_matmul_gather: true 55 | } 56 | } 57 | similarity_calculator { 58 | iou_similarity { 59 | } 60 | } 61 | box_predictor { 62 | weight_shared_convolutional_box_predictor { 63 | conv_hyperparams { 64 | regularizer { 65 | l2_regularizer { 66 | weight: 0.0004 67 | } 68 | } 69 | initializer { 70 | random_normal_initializer { 71 | mean: 0.0 72 | stddev: 0.01 73 | } 74 | } 75 | activation: RELU_6 76 | batch_norm { 77 | decay: 0.997 78 | scale: true 79 | epsilon: 0.001 80 | } 81 | } 82 | depth: 256 83 | num_layers_before_predictor: 4 84 | kernel_size: 3 85 | class_prediction_bias_init: -4.6 86 | } 87 | } 88 | anchor_generator { 89 | multiscale_anchor_generator { 90 | min_level: 3 91 | max_level: 7 92 | anchor_scale: 4.0 93 | aspect_ratios: 1.0 94 | aspect_ratios: 2.0 95 | aspect_ratios: 0.5 96 | scales_per_octave: 2 97 | } 98 | } 99 | post_processing { 100 | batch_non_max_suppression { 101 | score_threshold: 0.3 102 | iou_threshold: 0.6 103 | max_detections_per_class: 100 104 | max_total_detections: 100 105 | } 106 | score_converter: SIGMOID 107 | } 108 | normalize_loss_by_num_matches: true 109 | loss { 110 | localization_loss { 111 | weighted_smooth_l1 { 112 | } 113 | } 114 | classification_loss { 115 | weighted_sigmoid_focal { 116 | gamma: 2.0 117 | alpha: 0.25 118 | } 119 | } 120 | classification_weight: 1.0 121 | localization_weight: 1.0 122 | } 123 | encode_background_as_zeros: true 124 | normalize_loc_loss_by_codesize: true 125 | inplace_batchnorm_update: true 126 | freeze_batchnorm: false 127 | } 128 | } 129 | train_config { 130 | batch_size: 8 131 | data_augmentation_options { 132 | random_horizontal_flip { 133 | } 134 | 
} 135 | data_augmentation_options { 136 | random_crop_image { 137 | min_object_covered: 0.0 138 | min_aspect_ratio: 0.75 139 | max_aspect_ratio: 3.0 140 | min_area: 0.75 141 | max_area: 1.0 142 | overlap_thresh: 0.0 143 | } 144 | } 145 | sync_replicas: true 146 | optimizer { 147 | momentum_optimizer { 148 | learning_rate { 149 | cosine_decay_learning_rate { 150 | learning_rate_base: 0.04 151 | total_steps: 25000 152 | warmup_learning_rate: 0.013333 153 | warmup_steps: 2000 154 | } 155 | } 156 | momentum_optimizer_value: 0.9 157 | } 158 | use_moving_average: false 159 | } 160 | fine_tune_checkpoint: "ssd_resnet50_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03/model.ckpt" 161 | num_steps: 25000 162 | startup_delay_steps: 0.0 163 | replicas_to_aggregate: 8 164 | max_number_of_boxes: 100 165 | unpad_groundtruth_tensors: false 166 | } 167 | 168 | train_input_reader { 169 | tf_record_input_reader { 170 | input_path: "data/egohands_train.tfrecord" 171 | } 172 | label_map_path: "data/egohands_label_map.pbtxt" 173 | } 174 | 175 | eval_config { 176 | num_examples: 500 177 | metrics_set: "coco_detection_metrics" 178 | use_moving_averages: false 179 | } 180 | 181 | eval_input_reader { 182 | tf_record_input_reader { 183 | input_path: "data/egohands_val.tfrecord" 184 | } 185 | label_map_path: "data/egohands_label_map.pbtxt" 186 | shuffle: false 187 | num_readers: 1 188 | } 189 | -------------------------------------------------------------------------------- /configs/ssdlite_mobilenet_v2_egohands.config: -------------------------------------------------------------------------------- 1 | # SSDLite with Mobilenet v2 configuration for egohands dataset. 2 | # Users should configure the fine_tune_checkpoint field in the train config as 3 | # well as the label_map_path and input_path fields in the train_input_reader and 4 | # eval_input_reader. Search for "PATH_TO_BE_CONFIGURED" to find the fields that 5 | # should be configured. 
6 | 7 | model { 8 | ssd { 9 | num_classes: 1 10 | box_coder { 11 | faster_rcnn_box_coder { 12 | y_scale: 10.0 13 | x_scale: 10.0 14 | height_scale: 5.0 15 | width_scale: 5.0 16 | } 17 | } 18 | matcher { 19 | argmax_matcher { 20 | matched_threshold: 0.5 21 | unmatched_threshold: 0.5 22 | ignore_thresholds: false 23 | negatives_lower_than_unmatched: true 24 | force_match_for_each_row: true 25 | } 26 | } 27 | similarity_calculator { 28 | iou_similarity { 29 | } 30 | } 31 | anchor_generator { 32 | ssd_anchor_generator { 33 | num_layers: 6 34 | min_scale: 0.05 35 | max_scale: 0.95 36 | aspect_ratios: 1.0 37 | aspect_ratios: 2.0 38 | aspect_ratios: 0.5 39 | aspect_ratios: 3.0 40 | aspect_ratios: 0.3333 41 | } 42 | } 43 | image_resizer { 44 | fixed_shape_resizer { 45 | height: 300 46 | width: 300 47 | } 48 | } 49 | box_predictor { 50 | convolutional_box_predictor { 51 | min_depth: 0 52 | max_depth: 0 53 | num_layers_before_predictor: 0 54 | use_dropout: false 55 | dropout_keep_probability: 0.8 56 | kernel_size: 3 57 | use_depthwise: true 58 | box_code_size: 4 59 | apply_sigmoid_to_scores: false 60 | conv_hyperparams { 61 | activation: RELU_6, 62 | regularizer { 63 | l2_regularizer { 64 | weight: 0.00004 65 | } 66 | } 67 | initializer { 68 | truncated_normal_initializer { 69 | stddev: 0.03 70 | mean: 0.0 71 | } 72 | } 73 | batch_norm { 74 | train: true, 75 | scale: true, 76 | center: true, 77 | decay: 0.9997, 78 | epsilon: 0.001, 79 | } 80 | } 81 | } 82 | } 83 | feature_extractor { 84 | type: 'ssd_mobilenet_v2' 85 | min_depth: 16 86 | depth_multiplier: 1.0 87 | use_depthwise: true 88 | conv_hyperparams { 89 | activation: RELU_6, 90 | regularizer { 91 | l2_regularizer { 92 | weight: 0.00004 93 | } 94 | } 95 | initializer { 96 | truncated_normal_initializer { 97 | stddev: 0.03 98 | mean: 0.0 99 | } 100 | } 101 | batch_norm { 102 | train: true, 103 | scale: true, 104 | center: true, 105 | decay: 0.9997, 106 | epsilon: 0.001, 107 | } 108 | } 109 | } 110 | loss { 111 | classification_loss { 112 | weighted_sigmoid { 113 | } 114 | } 115 | localization_loss { 116 | weighted_smooth_l1 { 117 | } 118 | } 119 | hard_example_miner { 120 | num_hard_examples: 3000 121 | iou_threshold: 0.99 122 | loss_type: CLASSIFICATION 123 | max_negatives_per_positive: 3 124 | min_negatives_per_image: 3 125 | } 126 | classification_weight: 1.0 127 | localization_weight: 1.0 128 | } 129 | normalize_loss_by_num_matches: true 130 | post_processing { 131 | batch_non_max_suppression { 132 | score_threshold: 1e-8 133 | iou_threshold: 0.6 134 | max_detections_per_class: 100 135 | max_total_detections: 100 136 | } 137 | score_converter: SIGMOID 138 | } 139 | } 140 | } 141 | 142 | train_config: { 143 | batch_size: 24 144 | optimizer { 145 | rms_prop_optimizer: { 146 | learning_rate: { 147 | exponential_decay_learning_rate { 148 | initial_learning_rate: 0.004 149 | decay_steps: 1000 150 | decay_factor: 0.8 151 | } 152 | } 153 | momentum_optimizer_value: 0.9 154 | decay: 0.9 155 | epsilon: 1.0 156 | } 157 | } 158 | fine_tune_checkpoint: "ssdlite_mobilenet_v2_coco_2018_05_09/model.ckpt" 159 | fine_tune_checkpoint_type: "detection" 160 | # Note: The below line limits the training process to 200K steps, which we 161 | # empirically found to be sufficient enough to train the pets dataset. This 162 | # effectively bypasses the learning rate schedule (the learning rate will 163 | # never decay). Remove the below line to train indefinitely. 
164 | num_steps: 20000 165 | data_augmentation_options { 166 | random_horizontal_flip { 167 | } 168 | } 169 | data_augmentation_options { 170 | ssd_random_crop { 171 | } 172 | } 173 | } 174 | 175 | train_input_reader: { 176 | tf_record_input_reader { 177 | input_path: "data/egohands_train.tfrecord" 178 | } 179 | label_map_path: "data/egohands_label_map.pbtxt" 180 | } 181 | 182 | eval_config: { 183 | num_examples: 500 184 | # Note: The below line limits the evaluation process to 10 evaluations. 185 | # Remove the below line to evaluate indefinitely. 186 | max_evals: 10 187 | } 188 | 189 | eval_input_reader: { 190 | tf_record_input_reader { 191 | input_path: "data/egohands_val.tfrecord" 192 | } 193 | label_map_path: "data/egohands_label_map.pbtxt" 194 | shuffle: false 195 | num_readers: 1 196 | } 197 | -------------------------------------------------------------------------------- /configs/training_results.txt: -------------------------------------------------------------------------------- 1 | ssd_mobilenet_v1_egohands: (training time: 2h 3m) 2 | 3 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.680 4 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.968 5 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.813 6 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.118 7 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.329 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.713 9 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.253 10 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.739 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.744 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.250 13 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.471 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.773 15 | 16 | 17 | ssd_mobilenet_v2_egohands: (training time: 2h 4m 16s) 18 | 19 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.675 20 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.970 21 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.813 22 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.303 23 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.337 24 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.706 25 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.253 26 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.735 27 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.738 28 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.300 29 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.452 30 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.769 31 | 32 | 33 | ssdlite_mobilenet_v2_egohands (training time: 2h 13m 51s) 34 | 35 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.573 36 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.959 37 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.661 38 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.252 39 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.302 40 | Average Precision (AP) @[ 
IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.597 41 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.220 42 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.632 43 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.637 44 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.250 45 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.427 46 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.660 47 | 48 | 49 | ssd_inception_v2_egohands (training time: 2h 15m 6s) 50 | 51 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.669 52 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.965 53 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.811 54 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.101 55 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.305 56 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.704 57 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.252 58 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.726 59 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.731 60 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.200 61 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.443 62 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.762 63 | 64 | 65 | rfcn_resnet101_egohands (training time: 3h 4m 54s) 66 | 67 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.751 68 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.971 69 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.905 70 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.025 71 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.452 72 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.780 73 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.271 74 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.791 75 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.794 76 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.100 77 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.550 78 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.821 79 | 80 | 81 | faster_rcnn_resnet50_egohands (training time: 2h 6m 5s) 82 | 83 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.751 84 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.977 85 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.906 86 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.252 87 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.476 88 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.776 89 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.270 90 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.789 91 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.793 92 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.250 93 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.576 94 | Average Recall (AR) 
@[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.817 95 | 96 | 97 | faster_rcnn_resnet101_egohands (training time: 3h 33m 5s) 98 | 99 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.762 100 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.980 101 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.909 102 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.126 103 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.479 104 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.787 105 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.274 106 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.797 107 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.800 108 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.150 109 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.577 110 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.824 111 | 112 | 113 | faster_rcnn_inception_v2_egohands (training time: 1h 21m 5s) 114 | 115 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.739 116 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.978 117 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.889 118 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.050 119 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.443 120 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.766 121 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.268 122 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.779 123 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.784 124 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.150 125 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.552 126 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.809 127 | 128 | 129 | faster_rcnn_inception_resnet_v2_atrous_egohands (training time: 18h 29m 57s) 130 | 131 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.772 132 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.979 133 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.911 134 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.126 135 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.504 136 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.796 137 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.277 138 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.805 139 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.809 140 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.250 141 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.590 142 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.832 143 | 144 | 145 | * All training time was measured on GTX-1080Ti 146 | 147 | -------------------------------------------------------------------------------- /create_kitti_tf_record.py: -------------------------------------------------------------------------------- 1 | # 
https://github.com/tensorflow/models/blob/master/research/object_detection/dataset_tools/create_kitti_tf_record.py 2 | 3 | # ======================================================================== 4 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | # ======================================================================== 18 | 19 | 20 | """Convert KITTI detection dataset to TFRecord for object detection. 21 | 22 | Example usage: 23 | 24 | python3 create_kitti_tf_record.py \ 25 | --data_dir=egohands_kitti_formatted \ 26 | --output_path=data/egohands \ 27 | --classes_to_use=hand \ 28 | --label_map_path=data/egohands_label_map.pbtxt \ 29 | --validation_set_size=500 30 | """ 31 | 32 | 33 | from __future__ import absolute_import 34 | from __future__ import division 35 | from __future__ import print_function 36 | 37 | 38 | import hashlib 39 | import io 40 | import os 41 | import random 42 | 43 | import numpy as np 44 | import PIL.Image as pil 45 | import tensorflow as tf 46 | 47 | from object_detection.utils import dataset_util 48 | from object_detection.utils import label_map_util 49 | from object_detection.utils.np_box_ops import iou 50 | 51 | 52 | tf.app.flags.DEFINE_string('data_dir', '', 'Location for the data. ' 53 | 'All jpg and txt files are assumed to be ' 54 | 'present at this location.') 55 | tf.app.flags.DEFINE_string('output_path', '', 'Path to which TFRecord ' 56 | 'files will be written. The TFRecord with ' 57 | 'the training set will be located at: ' 58 | '_train.tfrecord. And the ' 59 | 'TFRecord with the validation set will be ' 60 | 'located at: _val.tfrecord.') 61 | tf.app.flags.DEFINE_string('classes_to_use', 'hand,dontcare', 62 | 'Comma separated list of class names that ' 63 | 'will be used. Adding the dontcare class ' 64 | 'will remove all bboxs in the dontcare ' 65 | 'regions.') 66 | tf.app.flags.DEFINE_string('label_map_path', 67 | 'data/egohands_label_map.pbtxt', 68 | 'Path to label map proto.') 69 | tf.app.flags.DEFINE_integer('validation_set_size', '500', 70 | 'Number of images to be used as a ' 71 | 'validation set.') 72 | FLAGS = tf.app.flags.FLAGS 73 | 74 | 75 | def convert_kitti_to_tfrecords(data_dir, output_path, classes_to_use, 76 | label_map_path, validation_set_size): 77 | """Convert the KITTI detection dataset to TFRecords. 78 | 79 | Args: 80 | data_dir: The full path containing KITTI formatted data, with all 81 | jpg files located in /images/ and all txt files in 82 | /labels/. 83 | output_path: The path to which TFRecord files will be written. 84 | The TFRecord with the training set will be located at: 85 | _train.tfrecord 86 | And the TFRecord with the validation set will be located at: 87 | _val.tfrecord 88 | classes_to_use: List of strings naming the classes for which 89 | data should be converted. Use the same names as presented in 90 | the KIITI README file. 
Adding dontcare class will remove all 91 | other bounding boxes that overlap with areas marked as dontcare 92 | regions. 93 | label_map_path: Path to label map proto 94 | validation_set_size: How many images should be left as the 95 | validation set. (First `validation_set_size` examples are 96 | selected to be in the validation set). 97 | """ 98 | label_map_dict = label_map_util.get_label_map_dict(label_map_path) 99 | train_count = 0 100 | val_count = 0 101 | 102 | annotation_dir = os.path.join(data_dir, 'labels') 103 | image_dir = os.path.join(data_dir, 'images') 104 | 105 | train_writer = tf.python_io.TFRecordWriter('%s_train.tfrecord'% 106 | output_path) 107 | val_writer = tf.python_io.TFRecordWriter('%s_val.tfrecord'% 108 | output_path) 109 | 110 | images = sorted(tf.gfile.ListDirectory(image_dir)) 111 | images = [f for f in images if f.endswith('jpg')] # only keep jpg files 112 | assert len(images) > 0 113 | random.shuffle(images) 114 | for idx, img_name in enumerate(images): 115 | img_num = img_name.split('.')[0] 116 | is_validation_img = idx < validation_set_size 117 | img_anno = read_annotation_file(os.path.join(annotation_dir, 118 | img_num+'.txt')) 119 | 120 | image_path = os.path.join(image_dir, img_name) 121 | 122 | # Filter all bounding boxes of this frame that are of a legal class, 123 | # and don't overlap with a dontcare region. 124 | # TODO(talremez) filter out targets that are truncated or heavily 125 | # occluded. 126 | annotation_for_image = filter_annotations(img_anno, classes_to_use) 127 | 128 | example = prepare_example(image_path, annotation_for_image, label_map_dict) 129 | if is_validation_img: 130 | val_writer.write(example.SerializeToString()) 131 | val_count += 1 132 | else: 133 | train_writer.write(example.SerializeToString()) 134 | train_count += 1 135 | 136 | train_writer.close() 137 | val_writer.close() 138 | 139 | 140 | def prepare_example(image_path, annotations, label_map_dict): 141 | """Converts a dictionary with annotations for an image to tf.Example proto. 142 | 143 | Args: 144 | image_path: The complete path to image. 145 | annotations: A dictionary representing the annotation of a single object 146 | that appears in the image. 147 | label_map_dict: A map from string label names to integer ids. 148 | 149 | Returns: 150 | example: The converted tf.Example. 
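    Illustration with made-up numbers: for a 1280x720 frame, a hand box with
    2d_bbox_left=320, 2d_bbox_top=180, 2d_bbox_right=640 and 2d_bbox_bottom=540
    would be stored with the normalized coordinates xmin=320/1280=0.25,
    ymin=180/720=0.25, xmax=640/1280=0.50 and ymax=540/720=0.75, as computed below.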
151 | """ 152 | with tf.gfile.GFile(image_path, 'rb') as fid: 153 | encoded_png = fid.read() 154 | encoded_png_io = io.BytesIO(encoded_png) 155 | image = pil.open(encoded_png_io) 156 | image = np.asarray(image) 157 | 158 | key = hashlib.sha256(encoded_png).hexdigest() 159 | 160 | width = int(image.shape[1]) 161 | height = int(image.shape[0]) 162 | 163 | xmin_norm = annotations['2d_bbox_left'] / float(width) 164 | ymin_norm = annotations['2d_bbox_top'] / float(height) 165 | xmax_norm = annotations['2d_bbox_right'] / float(width) 166 | ymax_norm = annotations['2d_bbox_bottom'] / float(height) 167 | 168 | difficult_obj = [0]*len(xmin_norm) 169 | 170 | example = tf.train.Example(features=tf.train.Features(feature={ 171 | 'image/height': dataset_util.int64_feature(height), 172 | 'image/width': dataset_util.int64_feature(width), 173 | 'image/filename': dataset_util.bytes_feature(image_path.encode('utf8')), 174 | 'image/source_id': dataset_util.bytes_feature(image_path.encode('utf8')), 175 | 'image/key/sha256': dataset_util.bytes_feature(key.encode('utf8')), 176 | 'image/encoded': dataset_util.bytes_feature(encoded_png), 177 | 'image/format': dataset_util.bytes_feature('png'.encode('utf8')), 178 | 'image/object/bbox/xmin': dataset_util.float_list_feature(xmin_norm), 179 | 'image/object/bbox/xmax': dataset_util.float_list_feature(xmax_norm), 180 | 'image/object/bbox/ymin': dataset_util.float_list_feature(ymin_norm), 181 | 'image/object/bbox/ymax': dataset_util.float_list_feature(ymax_norm), 182 | 'image/object/class/text': dataset_util.bytes_list_feature( 183 | [x.encode('utf8') for x in annotations['type']]), 184 | 'image/object/class/label': dataset_util.int64_list_feature( 185 | [label_map_dict[x] for x in annotations['type']]), 186 | 'image/object/difficult': dataset_util.int64_list_feature(difficult_obj), 187 | 'image/object/truncated': dataset_util.float_list_feature( 188 | annotations['truncated']), 189 | 'image/object/alpha': dataset_util.float_list_feature( 190 | annotations['alpha']), 191 | 'image/object/3d_bbox/height': dataset_util.float_list_feature( 192 | annotations['3d_bbox_height']), 193 | 'image/object/3d_bbox/width': dataset_util.float_list_feature( 194 | annotations['3d_bbox_width']), 195 | 'image/object/3d_bbox/length': dataset_util.float_list_feature( 196 | annotations['3d_bbox_length']), 197 | 'image/object/3d_bbox/x': dataset_util.float_list_feature( 198 | annotations['3d_bbox_x']), 199 | 'image/object/3d_bbox/y': dataset_util.float_list_feature( 200 | annotations['3d_bbox_y']), 201 | 'image/object/3d_bbox/z': dataset_util.float_list_feature( 202 | annotations['3d_bbox_z']), 203 | 'image/object/3d_bbox/rot_y': dataset_util.float_list_feature( 204 | annotations['3d_bbox_rot_y']), 205 | })) 206 | 207 | return example 208 | 209 | 210 | def filter_annotations(img_all_annotations, used_classes): 211 | """Filters out annotations from the unused classes and dontcare regions. 212 | 213 | Filters out the annotations that belong to classes we do now wish to use and 214 | (optionally) also removes all boxes that overlap with dontcare regions. 215 | 216 | Args: 217 | img_all_annotations: A list of annotation dictionaries. See documentation of 218 | read_annotation_file for more details about the format of the annotations. 219 | used_classes: A list of strings listing the classes we want to keep, if the 220 | list contains "dontcare", all bounding boxes with overlapping with dont 221 | care regions will also be filtered out. 
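      In this repository, create_tfrecords.sh runs the converter with
      --classes_to_use=hand, so only 'hand' boxes are kept and the dontcare
      IoU-filtering branch below is not exercised.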
222 | 223 | Returns: 224 | img_filtered_annotations: A list of annotation dictionaries that have passed 225 | the filtering. 226 | """ 227 | 228 | img_filtered_annotations = {} 229 | 230 | # Filter the type of the objects. 231 | relevant_annotation_indices = [ 232 | i for i, x in enumerate(img_all_annotations['type']) if x in used_classes 233 | ] 234 | 235 | for key in img_all_annotations.keys(): 236 | img_filtered_annotations[key] = ( 237 | img_all_annotations[key][relevant_annotation_indices]) 238 | 239 | if 'dontcare' in used_classes: 240 | dont_care_indices = [i for i, 241 | x in enumerate(img_filtered_annotations['type']) 242 | if x == 'dontcare'] 243 | 244 | # bounding box format [y_min, x_min, y_max, x_max] 245 | all_boxes = np.stack([img_filtered_annotations['2d_bbox_top'], 246 | img_filtered_annotations['2d_bbox_left'], 247 | img_filtered_annotations['2d_bbox_bottom'], 248 | img_filtered_annotations['2d_bbox_right']], 249 | axis=1) 250 | 251 | ious = iou(boxes1=all_boxes, 252 | boxes2=all_boxes[dont_care_indices]) 253 | 254 | # Remove all bounding boxes that overlap with a dontcare region. 255 | if ious.size > 0: 256 | boxes_to_remove = np.amax(ious, axis=1) > 0.0 257 | for key in img_all_annotations.keys(): 258 | img_filtered_annotations[key] = ( 259 | img_filtered_annotations[key][np.logical_not(boxes_to_remove)]) 260 | 261 | return img_filtered_annotations 262 | 263 | 264 | def read_annotation_file(filename): 265 | """Reads a KITTI annotation file. 266 | 267 | Converts a KITTI annotation file into a dictionary containing all the 268 | relevant information. 269 | 270 | Args: 271 | filename: the path to the annotataion text file. 272 | 273 | Returns: 274 | anno: A dictionary with the converted annotation information. See annotation 275 | README file for details on the different fields. 
276 | """ 277 | with open(filename) as f: 278 | content = f.readlines() 279 | content = [x.strip().split(' ') for x in content] 280 | 281 | anno = {} 282 | anno['type'] = np.array([x[0].lower() for x in content]) 283 | anno['truncated'] = np.array([float(x[1]) for x in content]) 284 | anno['occluded'] = np.array([int(x[2]) for x in content]) 285 | anno['alpha'] = np.array([float(x[3]) for x in content]) 286 | 287 | anno['2d_bbox_left'] = np.array([float(x[4]) for x in content]) 288 | anno['2d_bbox_top'] = np.array([float(x[5]) for x in content]) 289 | anno['2d_bbox_right'] = np.array([float(x[6]) for x in content]) 290 | anno['2d_bbox_bottom'] = np.array([float(x[7]) for x in content]) 291 | 292 | anno['3d_bbox_height'] = np.array([float(x[8]) for x in content]) 293 | anno['3d_bbox_width'] = np.array([float(x[9]) for x in content]) 294 | anno['3d_bbox_length'] = np.array([float(x[10]) for x in content]) 295 | anno['3d_bbox_x'] = np.array([float(x[11]) for x in content]) 296 | anno['3d_bbox_y'] = np.array([float(x[12]) for x in content]) 297 | anno['3d_bbox_z'] = np.array([float(x[13]) for x in content]) 298 | anno['3d_bbox_rot_y'] = np.array([float(x[14]) for x in content]) 299 | 300 | return anno 301 | 302 | 303 | def main(_): 304 | convert_kitti_to_tfrecords( 305 | data_dir=FLAGS.data_dir, 306 | output_path=FLAGS.output_path, 307 | classes_to_use=FLAGS.classes_to_use.split(','), 308 | label_map_path=FLAGS.label_map_path, 309 | validation_set_size=FLAGS.validation_set_size) 310 | 311 | if __name__ == '__main__': 312 | tf.app.run() 313 | -------------------------------------------------------------------------------- /create_tfrecords.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHONPATH=`pwd`/models/research:`pwd`/models/research/slim \ 4 | python3 create_kitti_tf_record.py \ 5 | --data_dir=egohands_kitti_formatted \ 6 | --output_path=data/egohands \ 7 | --classes_to_use=hand \ 8 | --label_map_path=data/egohands_label_map.pbtxt \ 9 | --validation_set_size=500 10 | -------------------------------------------------------------------------------- /data/egohands_label_map.pbtxt: -------------------------------------------------------------------------------- 1 | item { 2 | id: 1 3 | name: 'hand' 4 | } 5 | -------------------------------------------------------------------------------- /data/jk-son-hands.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jkjung-avt/hand-detection-tutorial/e5ef4208abfa7eb89aec4ec96dda21582cde463e/data/jk-son-hands.jpg -------------------------------------------------------------------------------- /detect_image.py: -------------------------------------------------------------------------------- 1 | """detect_image.py 2 | 3 | This script is used to test my trained egohands (hand detector) models. 
It is modified from the following example from TensorFlow Object Detection API: 4 | 5 | https://github.com/tensorflow/models/blob/master/research/object_detection/object_detection_tutorial.ipynb 6 | """ 7 | 8 | 9 | import sys 10 | 11 | import numpy as np 12 | import cv2 13 | import tensorflow as tf 14 | 15 | from object_detection.utils import label_map_util 16 | from object_detection.utils import visualization_utils as vis_util 17 | 18 | 19 | PATH_TO_FROZEN_GRAPH = 'model_exported/frozen_inference_graph.pb' 20 | PATH_TO_LABELS = 'data/egohands_label_map.pbtxt' 21 | OUTPUT_PATH = 'detection_output.jpg' 22 | 23 | 24 | def detect_image(image_path): 25 | # load label map 26 | category_index = label_map_util.create_category_index_from_labelmap( 27 | PATH_TO_LABELS) 28 | 29 | # load detection graph 30 | detection_graph = tf.Graph() 31 | with detection_graph.as_default(): 32 | od_graph_def = tf.GraphDef() 33 | with tf.gfile.GFile(PATH_TO_FROZEN_GRAPH, 'rb') as fid: 34 | serialized_graph = fid.read() 35 | od_graph_def.ParseFromString(serialized_graph) 36 | tf.import_graph_def(od_graph_def, name='') 37 | 38 | # define input/output tensors 39 | image_tensor = detection_graph.get_tensor_by_name('image_tensor:0') 40 | detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0') 41 | detection_scores = detection_graph.get_tensor_by_name('detection_scores:0') 42 | detection_classes = detection_graph.get_tensor_by_name('detection_classes:0') 43 | num_detections = detection_graph.get_tensor_by_name('num_detections:0') 44 | 45 | # load input image 46 | img = cv2.imread(image_path) 47 | if img is None: 48 | sys.exit('failed to load image: %s' % image_path) 49 | img = img[..., ::-1] # BGR to RGB 50 | 51 | # run inference 52 | with detection_graph.as_default(): 53 | with tf.Session() as sess: 54 | boxes, scores, classes, _ = sess.run( 55 | [detection_boxes, detection_scores, detection_classes, num_detections], 56 | feed_dict={image_tensor: np.expand_dims(img, 0)}) 57 | 58 | # draw the results of the detection 59 | vis_util.visualize_boxes_and_labels_on_image_array( 60 | img, 61 | np.squeeze(boxes), 62 | np.squeeze(classes).astype(np.int32), 63 | np.squeeze(scores), 64 | category_index, 65 | use_normalized_coordinates=True, 66 | line_thickness=6, 67 | min_score_thresh=0.3) 68 | 69 | # save the output image 70 | img = img[..., ::-1] # RGB to BGR 71 | cv2.imwrite(OUTPUT_PATH, img) 72 | 73 | print('Output has been written to %s\n' % OUTPUT_PATH) 74 | 75 | 76 | def main(): 77 | if len(sys.argv) != 2: 78 | sys.exit('Usage: %s ' % sys.argv[0]) 79 | detect_image(image_path=sys.argv[1]) 80 | 81 | 82 | if __name__ == '__main__': 83 | main() 84 | -------------------------------------------------------------------------------- /detect_image.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PYTHONPATH=`pwd`/models/research:`pwd`/models/research/slim \ 4 | python3 ./detect_image.py $@ 5 | -------------------------------------------------------------------------------- /doc/eval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jkjung-avt/hand-detection-tutorial/e5ef4208abfa7eb89aec4ec96dda21582cde463e/doc/eval.png -------------------------------------------------------------------------------- /doc/loss_curve_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jkjung-avt/hand-detection-tutorial/e5ef4208abfa7eb89aec4ec96dda21582cde463e/doc/loss_curve_1.png -------------------------------------------------------------------------------- /doc/ssdlite_mobilenet_v2_result.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jkjung-avt/hand-detection-tutorial/e5ef4208abfa7eb89aec4ec96dda21582cde463e/doc/ssdlite_mobilenet_v2_result.jpg -------------------------------------------------------------------------------- /doc/training.log: -------------------------------------------------------------------------------- 1 | ssd_mobilenet_v1_egohands: (training time: 2h 3m) 2 | 3 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.680 4 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.968 5 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.813 6 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.118 7 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.329 8 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.713 9 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.253 10 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.739 11 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.744 12 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.250 13 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.471 14 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.773 15 | 16 | 17 | ssd_mobilenet_v2_egohands: (training time: 2h 4m 16s) 18 | 19 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.675 20 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.970 21 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.813 22 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.303 23 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.337 24 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.706 25 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.253 26 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.735 27 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.738 28 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.300 29 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.452 30 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.769 31 | 32 | 33 | ssdlite_mobilenet_v2_egohands: (training time: 2h 13m 51s) 34 | 35 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.573 36 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.959 37 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.661 38 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.252 39 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.302 40 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.597 41 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.220 42 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.632 43 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.637 44 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | 
maxDets=100 ] = 0.250 45 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.427 46 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.660 47 | 48 | 49 | ssd_inception_v2_egohands: (training time: 2h 15m 6s) 50 | 51 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.669 52 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.965 53 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.811 54 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.101 55 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.305 56 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.704 57 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.252 58 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.726 59 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.731 60 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.200 61 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.443 62 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.762 63 | 64 | 65 | rfcn_resnet101_egohands: (training time: 3h 4m 54s) 66 | 67 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.751 68 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.971 69 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.905 70 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.025 71 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.452 72 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.780 73 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.271 74 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.791 75 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.794 76 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.100 77 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.550 78 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.821 79 | 80 | 81 | faster_rcnn_resnet50_egohands: (training time: 2h 6m 5s) 82 | 83 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.751 84 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.977 85 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.906 86 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.252 87 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.476 88 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.776 89 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.270 90 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.789 91 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.793 92 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.250 93 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.576 94 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.817 95 | 96 | 97 | faster_rcnn_resnet101_egohands: (training time: 3h 33m 5s) 98 | 99 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.762 100 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.980 101 | Average Precision (AP) @[ IoU=0.75 | 
area= all | maxDets=100 ] = 0.909 102 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.126 103 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.479 104 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.787 105 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.274 106 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.797 107 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.800 108 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.150 109 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.577 110 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.824 111 | 112 | 113 | faster_rcnn_inception_v2_egohands: (training time: 1h 21m 5s) 114 | 115 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.739 116 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.978 117 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.889 118 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.050 119 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.443 120 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.766 121 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.268 122 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.779 123 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.784 124 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.150 125 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.552 126 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.809 127 | 128 | 129 | faster_rcnn_inception_resnet_v2_atrous_egohands: (training time: 18h 29m 57s) 130 | 131 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.772 132 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.979 133 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.911 134 | Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.126 135 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.504 136 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.796 137 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.277 138 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.805 139 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.809 140 | Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.250 141 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.590 142 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.832 143 | -------------------------------------------------------------------------------- /download_pretrained_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BASE_URL="http://download.tensorflow.org/models/object_detection/" 4 | 5 | for model in ssd_mobilenet_v1_coco_2018_01_28 \ 6 | ssd_mobilenet_v2_coco_2018_03_29 \ 7 | ssdlite_mobilenet_v2_coco_2018_05_09 \ 8 | ssd_inception_v2_coco_2018_01_28 \ 9 | ssd_resnet50_v1_fpn_shared_box_predictor_640x640_coco14_sync_2018_07_03 \ 10 | rfcn_resnet101_coco_2018_01_28 \ 11 | faster_rcnn_resnet50_coco_2018_01_28 \ 12 | 
faster_rcnn_resnet101_coco_2018_01_28 \ 13 | faster_rcnn_inception_v2_coco_2018_01_28 \ 14 | faster_rcnn_inception_resnet_v2_atrous_coco_2018_01_28; do 15 | wget --no-check-certificate \ 16 | ${BASE_URL}${model}.tar.gz \ 17 | -O /tmp/${model}.tar.gz 18 | tar xzvf /tmp/${model}.tar.gz 19 | done 20 | -------------------------------------------------------------------------------- /eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() 4 | { 5 | echo 6 | echo "Usage: ./eval.sh <model_name>" 7 | echo 8 | echo "where <model_name> could be one of the following:" 9 | echo " ssd_mobilenet_v1_egohands" 10 | echo " ssd_mobilenet_v2_egohands" 11 | echo " ssdlite_mobilenet_v2_egohands" 12 | echo " ssd_inception_v2_egohands" 13 | echo " ssd_resnet50_v1_fpn_egohands" 14 | echo " rfcn_resnet101_egohands" 15 | echo " faster_rcnn_resnet50_egohands" 16 | echo " faster_rcnn_resnet101_egohands" 17 | echo " faster_rcnn_inception_v2_egohands" 18 | echo " faster_rcnn_inception_resnet_v2_atrous_egohands" 19 | echo 20 | exit 21 | } 22 | 23 | if [ $# -ne 1 ]; then 24 | usage 25 | fi 26 | 27 | case $1 in 28 | ssd_mobilenet_v1_egohands | \ 29 | ssd_mobilenet_v2_egohands | \ 30 | ssdlite_mobilenet_v2_egohands | \ 31 | ssd_inception_v2_egohands | \ 32 | ssd_resnet50_v1_fpn_egohands | \ 33 | rfcn_resnet101_egohands | \ 34 | faster_rcnn_resnet50_egohands | \ 35 | faster_rcnn_resnet101_egohands | \ 36 | faster_rcnn_inception_v2_egohands | \ 37 | faster_rcnn_inception_resnet_v2_atrous_egohands ) 38 | ;; 39 | * ) 40 | usage 41 | esac 42 | 43 | MODEL_DIR=$1 44 | PIPELINE_CONFIG_PATH=configs/${MODEL_DIR}.config 45 | EVAL_DIR=${MODEL_DIR}_eval 46 | 47 | # clear old eval results 48 | rm -rf ${EVAL_DIR} 49 | 50 | PYTHONPATH=`pwd`/models/research:`pwd`/models/research/slim \ 51 | python3 ./models/research/object_detection/model_main.py \ 52 | --pipeline_config_path=${PIPELINE_CONFIG_PATH} \ 53 | --checkpoint_dir=${MODEL_DIR} \ 54 | --model_dir=${EVAL_DIR} \ 55 | --run_once \ 56 | --alsologtostderr 57 | -------------------------------------------------------------------------------- /export.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() 4 | { 5 | echo 6 | echo "Usage: ./export.sh <model_name>" 7 | echo 8 | echo "where <model_name> could be one of the following:" 9 | echo " ssd_mobilenet_v1_egohands" 10 | echo " ssd_mobilenet_v2_egohands" 11 | echo " ssdlite_mobilenet_v2_egohands" 12 | echo " ssd_inception_v2_egohands" 13 | echo " rfcn_resnet101_egohands" 14 | echo " faster_rcnn_resnet50_egohands" 15 | echo " faster_rcnn_resnet101_egohands" 16 | echo " faster_rcnn_inception_v2_egohands" 17 | echo " faster_rcnn_inception_resnet_v2_atrous_egohands" 18 | echo 19 | exit 20 | } 21 | 22 | if [ $# -ne 1 ]; then 23 | usage 24 | fi 25 | 26 | case $1 in 27 | ssd_mobilenet_v1_egohands | \ 28 | ssd_mobilenet_v2_egohands | \ 29 | ssdlite_mobilenet_v2_egohands | \ 30 | ssd_inception_v2_egohands ) 31 | MODEL_DIR=$1 32 | NUM_TRAIN_STEPS=20000 33 | ;; 34 | rfcn_resnet101_egohands | \ 35 | faster_rcnn_resnet50_egohands | \ 36 | faster_rcnn_resnet101_egohands | \ 37 | faster_rcnn_inception_v2_egohands | \ 38 | faster_rcnn_inception_resnet_v2_atrous_egohands ) 39 | MODEL_DIR=$1 40 | NUM_TRAIN_STEPS=50000 41 | ;; 42 | * ) 43 | usage 44 | esac 45 | 46 | PIPELINE_CONFIG_PATH=configs/${MODEL_DIR}.config 47 | CHECKPOINT_PREFIX=${MODEL_DIR}/model.ckpt-${NUM_TRAIN_STEPS} 48 | OUTPUT_DIR=model_exported 49 | 50 | # clear old exported model 51 | rm -rf ${OUTPUT_DIR} 52 | 
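# The command below freezes the checkpoint at model.ckpt-${NUM_TRAIN_STEPS} into
# ${OUTPUT_DIR}/frozen_inference_graph.pb, which detect_image.py then loads for inference.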
PYTHONPATH=`pwd`/models/research:`pwd`/models/research/slim \ 54 | python3 ./models/research/object_detection/export_inference_graph.py \ 55 | --input_type=image_tensor \ 56 | --pipeline_config_path=${PIPELINE_CONFIG_PATH} \ 57 | --trained_checkpoint_prefix=${CHECKPOINT_PREFIX} \ 58 | --output_directory=${OUTPUT_DIR} 59 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ROOT_DIR=`pwd` 4 | MODELS_DIR=$ROOT_DIR/models 5 | PYTHON=python3 6 | PIP=pip3 7 | 8 | # make sure tensorflow has been installed 9 | $PIP list | grep tensorflow 10 | if [ $? -ne 0 ]; then 11 | echo "TensorFlow doesn't seem to be installed!" 12 | exit 13 | fi 14 | 15 | # download protoc-3.5.1 16 | BASE_URL="https://github.com/google/protobuf/releases/download/v3.5.1/" 17 | filename="protoc-3.5.1-linux-x86_64.zip" 18 | wget --no-check-certificate ${BASE_URL}${filename} -O /tmp/${filename} 19 | unzip /tmp/${filename} -d protoc-3.5.1 20 | 21 | # install tensorflow models 22 | # (and also fix some code so that it could be run with pyhton3) 23 | git submodule update --init 24 | cd $MODELS_DIR 25 | cd research 26 | sed -i "157s/print '--annotation_type expected value is 1 or 2.'/print('--annotation_type expected value is 1 or 2.')/" \ 27 | object_detection/dataset_tools/oid_hierarchical_labels_expansion.py 28 | sed -i "516s/print num_classes, num_anchors/print(num_classes, num_anchors)/" \ 29 | object_detection/meta_architectures/ssd_meta_arch_test.py 30 | sed -i "282s/losses_dict.itervalues()/losses_dict.values()/" \ 31 | object_detection/model_lib.py 32 | sed -i "381s/category_index.values(),/list(category_index.values()),/" \ 33 | object_detection/model_lib.py 34 | sed -i "391s/eval_metric_ops.iteritems()/eval_metric_ops.items()/" \ 35 | object_detection/model_lib.py 36 | sed -i "225s/reversed(zip(output_feature_map_keys, output_feature_maps_list)))/reversed(list(zip(output_feature_map_keys, output_feature_maps_list))))/" \ 37 | object_detection/models/feature_map_generators.py 38 | sed -i "842s/print 'Scores and tpfp per class label: {}'.format(class_index)/print('Scores and tpfp per class label: {}'.format(class_index))/" \ 39 | object_detection/utils/object_detection_evaluation.py 40 | sed -i "843s/print tp_fp_labels/print(tp_fp_labels)/" \ 41 | object_detection/utils/object_detection_evaluation.py 42 | sed -i "844s/print scores/print(scores)/" \ 43 | object_detection/utils/object_detection_evaluation.py 44 | sed -n '31p' object_detection/eval_util.py | grep -q vis_utils && 45 | ex -s -c 31m23 -c w -c q object_detection/eval_util.py 46 | 47 | $ROOT_DIR/protoc-3.5.1/bin/protoc object_detection/protos/*.proto --python_out=. 
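# (the protoc invocation above compiles object_detection/protos/*.proto into the
# *_pb2.py modules that the object detection API imports at runtime)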
48 | cd $ROOT_DIR 49 | 50 | # add pycocotools 51 | git clone https://github.com/cocodataset/cocoapi.git 52 | cd cocoapi/PythonAPI/ 53 | sed -i '3s/python /python3 /' Makefile 54 | sed -i '8s/python /python3 /' Makefile 55 | make 56 | cp -r pycocotools $MODELS_DIR/research/ 57 | cd $ROOT_DIR 58 | 59 | # run a basic test to make sure tensorflow object detection is working 60 | echo 61 | echo 62 | echo Running model_builder_test.py 63 | CUDA_VISIBLE_DEVICES=0 \ 64 | PYTHONPATH=$MODELS_DIR/research:$MODELS_DIR/research/slim \ 65 | $PYTHON $MODELS_DIR/research/object_detection/builders/model_builder_test.py 66 | -------------------------------------------------------------------------------- /prepare_egohands.py: -------------------------------------------------------------------------------- 1 | """prepare_egohands.py 2 | 3 | This script downloads the 'egohands' dataset and convert its annotations 4 | into bounding boxes in KITTI format. 5 | 6 | Output of this script: 7 | 8 | ./egohands_data.zip 9 | ./egohands 10 | ├── (egohands dataset unzipped) 11 | └── ...... 12 | ./egohands_kitti_formatted 13 | ├── images 14 | │ ├── CARDS_COURTYARD_B_T_frame_0011.jpg 15 | │ ├── ...... 16 | │ └── PUZZLE_OFFICE_T_S_frame_2697.jpg 17 | └── labels 18 | ├── CARDS_COURTYARD_B_T_frame_0011.txt 19 | ├── ...... 20 | └── PUZZLE_OFFICE_T_S_frame_2697.txt 21 | """ 22 | 23 | 24 | import os 25 | import sys 26 | import math 27 | import logging 28 | import argparse 29 | from zipfile import ZipFile 30 | from shutil import rmtree, copyfile 31 | 32 | import numpy as np 33 | from scipy.io import loadmat 34 | import cv2 35 | 36 | 37 | EGOHANDS_DATASET_URL = \ 38 | 'http://vision.soic.indiana.edu/egohands_files/egohands_data.zip' 39 | EGOHANDS_DIR = './egohands' 40 | EGOHANDS_DATA_DIR = './egohands/_LABELLED_SAMPLES' 41 | CONVERTED_DIR = './egohands_kitti_formatted' 42 | CONVERTED_IMG_DIR = './egohands_kitti_formatted/images' 43 | CONVERTED_LBL_DIR = './egohands_kitti_formatted/labels' 44 | 45 | VISUALIZE = False # visualize each image (for debugging) 46 | 47 | 48 | def parse_args(): 49 | """Parse input arguments.""" 50 | desc = ('This script downloads the egohands dataset and convert' 51 | 'the annotations into bounding boxes in KITTI format.') 52 | parser = argparse.ArgumentParser(description=desc) 53 | parser.add_argument('--verify', dest='do_verify', 54 | help='show and verify each images', 55 | action='store_true') 56 | args = parser.parse_args() 57 | return args 58 | 59 | 60 | def download_file(url, dest=None): 61 | """Download file from an URL.""" 62 | from tqdm import tqdm 63 | import requests 64 | 65 | if not dest: 66 | dest = url.split('/')[-1] 67 | 68 | # Streaming, so we can iterate over the response. 69 | r = requests.get(url, stream=True) 70 | 71 | # Total size in bytes. 72 | total_size = int(r.headers.get('content-length', 0)) 73 | assert total_size != 0 74 | block_size = 1024 75 | wrote = 0 76 | with open(dest, 'wb') as f: 77 | for data in tqdm(r.iter_content(block_size), 78 | total=math.ceil(total_size//block_size), 79 | unit='KB', unit_scale=True): 80 | wrote = wrote + len(data) 81 | f.write(data) 82 | assert wrote == total_size 83 | 84 | 85 | def polygon_to_box(polygon): 86 | """Convert 1 polygon into a bounding box. 
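    For illustration, hypothetical vertices [(10.2, 20.7), (30.0, 5.1), (18.4, 40.9)]
    yield the box [10, 5, 30, 41] (floor of the minima, ceil of the maxima).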
87 | 88 | # Arguments 89 | polygon: a numpy array of shape (N, 2) representing N vertices 90 | of the hand segmentation label (polygon); each vertex 91 | is a point: (x, y) 92 | """ 93 | if len(polygon) < 3: # a polygon has at least 3 vertices 94 | return None 95 | 96 | x_min = np.min(polygon[:, 0]) 97 | y_min = np.min(polygon[:, 1]) 98 | x_max = np.max(polygon[:, 0]) 99 | y_max = np.max(polygon[:, 1]) 100 | 101 | x_min = int(math.floor(x_min)) 102 | y_min = int(math.floor(y_min)) 103 | x_max = int(math.ceil(x_max)) 104 | y_max = int(math.ceil(y_max)) 105 | 106 | return [x_min, y_min, x_max, y_max] 107 | 108 | 109 | def box_to_line(box): 110 | """Convert 1 bounding box into 1 line in the KITTI txt file. 111 | 112 | # Arguments 113 | box: [x_min, y_min, x_max, y_max]. 114 | 115 | KITTI format: 116 | Values Name Description 117 | -------------------------------------------------------------------- 118 | 1 type Describes the type of object: 'Car', 'Van', 119 | 'Truck', 'Pedestrian', 'Person_sitting', 120 | 'Cyclist', 'Tram', 'Misc' or 'DontCare' 121 | 1 truncated Float from 0 (non-truncated) to 1 (truncated), 122 | where truncated refers to the object leaving 123 | image boundaries 124 | 1 occluded Integer (0,1,2,3) indicating occlusion state: 125 | 0 = fully visible, 1 = partly occluded 126 | 2 = largely occluded, 3 = unknown 127 | 1 alpha Observation angle of object, ranging [-pi..pi] 128 | 4 bbox 2D bounding box of object in the image 129 | (0-based index): contains left, top, right, 130 | bottom pixel coordinates 131 | 3 dimensions 3D object dimensions: height, width, length 132 | 3 location 3D object location x,y,z in camera coordinates 133 | 1 rotation_y Rotation ry around Y-axis in camera coordinates 134 | [-pi..pi] 135 | 1 score Only for results: Float, indicating confidence 136 | in detection, needed for p/r curves, higher is 137 | better. 138 | """ 139 | return ' '.join(['hand', 140 | '0', 141 | '0', 142 | '0', 143 | '{} {} {} {}'.format(*box), 144 | '0 0 0', 145 | '0 0 0', 146 | '0', 147 | '0']) 148 | 149 | 150 | def convert_one_folder(folder): 151 | """Convert egohands to KITTI for 1 data folder (100 images). 152 | 153 | Refer to README.txt in the egohands folder for the format of the 154 | MATLAB annotation files and how jpg image files are organized. 155 | The code in this function loads the 'video' struct from the 156 | MATLAB file, converts polygons into bounding boxes and write 157 | annotation into KITTI format. 
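    As an illustration (coordinates are hypothetical), a hand polygon whose bounding
    box is [x_min, y_min, x_max, y_max] = [126, 78, 340, 250] is written by
    box_to_line() as the single label line:
        hand 0 0 0 126 78 340 250 0 0 0 0 0 0 0 0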
158 | """ 159 | folder_path = os.path.join(EGOHANDS_DATA_DIR, folder) 160 | logging.debug('Converting %s' % folder_path) 161 | frames = [os.path.splitext(f)[0] 162 | for f in os.listdir(folder_path) if f.endswith('jpg')] 163 | frames.sort() 164 | assert len(frames) == 100 165 | video = loadmat(os.path.join(folder_path, 'polygons.mat')) 166 | polygons = video['polygons'][0] # there are 100*4 entries in polygons 167 | for i, frame in enumerate(frames): 168 | # copy and rename jpg file to the 'converted' folder 169 | src_jpg = frame + '.jpg' 170 | dst_jpg = folder + '_' + src_jpg 171 | copyfile(os.path.join(folder_path, src_jpg), 172 | os.path.join(CONVERTED_IMG_DIR, dst_jpg)) 173 | # generate txt (the KITTI annotation corresponding to the jpg) 174 | dst_txt = folder + '_' + frame + '.txt' 175 | boxes = [] 176 | with open(os.path.join(CONVERTED_LBL_DIR, dst_txt), 'w') as f: 177 | for polygon in polygons[i]: 178 | box = polygon_to_box(polygon) 179 | if box: 180 | boxes.append(box) 181 | f.write(box_to_line(box) + '\n') 182 | 183 | if VISUALIZE: 184 | img = cv2.imread(os.path.join(CONVERTED_IMG_DIR, dst_jpg)) 185 | for box in boxes: 186 | cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), 187 | (0, 224, 0), 2) 188 | cv2.imshow('Visualization', img) 189 | if cv2.waitKey(0) == 27: 190 | sys.exit() 191 | 192 | 193 | def egohands_to_kitti(): 194 | """Convert egohands data and annotations to KITTI format. 195 | 196 | Steps: 197 | 1. walk through each sub-directory in egohands' data folder. 198 | 2. copy each jpg file to the 'converted' image folder and give 199 | each file a unique name. 200 | 3. convert the original annotations ('polygon.mat') into 201 | bounding boxes and write a KITTI txt file for each image. 202 | """ 203 | rmtree(CONVERTED_DIR, ignore_errors=True) 204 | os.makedirs(CONVERTED_IMG_DIR) 205 | os.makedirs(CONVERTED_LBL_DIR) 206 | for folder in os.listdir(EGOHANDS_DATA_DIR): 207 | convert_one_folder(folder) 208 | 209 | 210 | def main(): 211 | """main""" 212 | logging.basicConfig(level=logging.DEBUG) 213 | 214 | egohands_zip_path = EGOHANDS_DATASET_URL.split('/')[-1] 215 | if not os.path.isfile(egohands_zip_path): 216 | logging.info('Downloading %s...' 
% egohands_zip_path) 217 | download_file(EGOHANDS_DATASET_URL, egohands_zip_path) 218 | 219 | if not os.path.exists(EGOHANDS_DIR): 220 | with ZipFile(egohands_zip_path, 'r') as zf: 221 | logging.info('Extracting egohands dataset files...') 222 | zf.extractall(EGOHANDS_DIR) 223 | 224 | logging.info('Copying jpg files and converting annotations...') 225 | egohands_to_kitti() 226 | 227 | logging.info('All done.') 228 | 229 | 230 | if __name__ == '__main__': 231 | main() 232 | sys.exit() 233 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib 2 | Pillow 3 | scipy 4 | Cython 5 | contextlib2 6 | lxml 7 | jupyter 8 | tqdm 9 | requests 10 | -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | usage() 4 | { 5 | echo 6 | echo "Usage: ./train.sh <model_name>" 7 | echo 8 | echo "where <model_name> could be one of the following:" 9 | echo " ssd_mobilenet_v1_egohands" 10 | echo " ssd_mobilenet_v2_egohands" 11 | echo " ssdlite_mobilenet_v2_egohands" 12 | echo " ssd_inception_v2_egohands" 13 | echo " ssd_resnet50_v1_fpn_egohands" 14 | echo " rfcn_resnet101_egohands" 15 | echo " faster_rcnn_resnet50_egohands" 16 | echo " faster_rcnn_resnet101_egohands" 17 | echo " faster_rcnn_inception_v2_egohands" 18 | echo " faster_rcnn_inception_resnet_v2_atrous_egohands" 19 | echo 20 | exit 21 | } 22 | 23 | if [ $# -ne 1 ]; then 24 | usage 25 | fi 26 | 27 | case $1 in 28 | ssd_mobilenet_v1_egohands | \ 29 | ssd_mobilenet_v2_egohands | \ 30 | ssdlite_mobilenet_v2_egohands | \ 31 | ssd_inception_v2_egohands ) 32 | NUM_TRAIN_STEPS=20000 33 | ;; 34 | ssd_resnet50_v1_fpn_egohands ) 35 | NUM_TRAIN_STEPS=25000 36 | ;; 37 | rfcn_resnet101_egohands | \ 38 | faster_rcnn_resnet50_egohands | \ 39 | faster_rcnn_resnet101_egohands | \ 40 | faster_rcnn_inception_v2_egohands | \ 41 | faster_rcnn_inception_resnet_v2_atrous_egohands ) 42 | NUM_TRAIN_STEPS=50000 43 | ;; 44 | * ) 45 | usage 46 | esac 47 | 48 | MODEL_DIR=$1 49 | PIPELINE_CONFIG_PATH=configs/${MODEL_DIR}.config 50 | 51 | # clear old training logs 52 | rm -rf ${MODEL_DIR} 53 | 54 | PYTHONPATH=`pwd`/models/research:`pwd`/models/research/slim \ 55 | python3 ./models/research/object_detection/model_main.py \ 56 | --pipeline_config_path=${PIPELINE_CONFIG_PATH} \ 57 | --model_dir=${MODEL_DIR} \ 58 | --num_train_steps=${NUM_TRAIN_STEPS} \ 59 | --sample_1_of_n_eval_samples=1 \ 60 | --alsologtostderr 61 | --------------------------------------------------------------------------------