├── signnames.csv ├── inference_out ├── stop_1323822840.avi_image5.png ├── stop_1323804419.avi_image31.png ├── stop_1323896809.avi_image12.png ├── pedestrian_1323896918.avi_image9.png ├── pedestrianCrossing_1330547304.avi_image1.png └── pedestrianCrossing_1333395817.avi_image21.png ├── sample_images ├── stop_1323822840.avi_image5.png ├── stop_1323803184.avi_image16.png ├── stop_1323804419.avi_image31.png ├── stop_1323804592.avi_image12.png ├── stop_1323896809.avi_image12.png ├── pedestrian_1323896918.avi_image9.png ├── pedestrianCrossing_1330547304.avi_image1.png ├── pedestrianCrossing_1333395693.avi_image8.png └── pedestrianCrossing_1333395817.avi_image21.png ├── viz_model.py ├── LICENSE ├── settings.py ├── data_gathering └── create_pickle.py ├── README.md ├── data_prep.py ├── inference.py ├── train.py └── model.py /signnames.csv: -------------------------------------------------------------------------------- 1 | 1,stop 2 | 2,pedestrianCrossing 3 | -------------------------------------------------------------------------------- /inference_out/stop_1323822840.avi_image5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/inference_out/stop_1323822840.avi_image5.png -------------------------------------------------------------------------------- /sample_images/stop_1323822840.avi_image5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/stop_1323822840.avi_image5.png -------------------------------------------------------------------------------- /inference_out/stop_1323804419.avi_image31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/inference_out/stop_1323804419.avi_image31.png -------------------------------------------------------------------------------- /inference_out/stop_1323896809.avi_image12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/inference_out/stop_1323896809.avi_image12.png -------------------------------------------------------------------------------- /sample_images/stop_1323803184.avi_image16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/stop_1323803184.avi_image16.png -------------------------------------------------------------------------------- /sample_images/stop_1323804419.avi_image31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/stop_1323804419.avi_image31.png -------------------------------------------------------------------------------- /sample_images/stop_1323804592.avi_image12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/stop_1323804592.avi_image12.png -------------------------------------------------------------------------------- /sample_images/stop_1323896809.avi_image12.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/stop_1323896809.avi_image12.png -------------------------------------------------------------------------------- /inference_out/pedestrian_1323896918.avi_image9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/inference_out/pedestrian_1323896918.avi_image9.png -------------------------------------------------------------------------------- /sample_images/pedestrian_1323896918.avi_image9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/pedestrian_1323896918.avi_image9.png -------------------------------------------------------------------------------- /inference_out/pedestrianCrossing_1330547304.avi_image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/inference_out/pedestrianCrossing_1330547304.avi_image1.png -------------------------------------------------------------------------------- /inference_out/pedestrianCrossing_1333395817.avi_image21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/inference_out/pedestrianCrossing_1333395817.avi_image21.png -------------------------------------------------------------------------------- /sample_images/pedestrianCrossing_1330547304.avi_image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/pedestrianCrossing_1330547304.avi_image1.png -------------------------------------------------------------------------------- /sample_images/pedestrianCrossing_1333395693.avi_image8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/pedestrianCrossing_1333395693.avi_image8.png -------------------------------------------------------------------------------- /sample_images/pedestrianCrossing_1333395817.avi_image21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/pedestrianCrossing_1333395817.avi_image21.png -------------------------------------------------------------------------------- /viz_model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Visualize the model using TensorBoard 3 | ''' 4 | import tensorflow as tf 5 | from settings import * 6 | from model import SSDModel 7 | 8 | FM_ONLY = False # Only want to see feature map sizes? 9 | 10 | with tf.Graph().as_default(), tf.Session() as sess: 11 | if FM_ONLY: 12 | # Only want to see feature map sizes (e.g. 
loss function and vector concatenation not yet set up) 13 | if MODEL == 'AlexNet': 14 | from model import AlexNet as MyModel 15 | else: 16 | raise NotImplementedError('Model %s not supported' % MODEL) 17 | _ = MyModel() 18 | else: 19 | # This includes the entire graph, e.g. loss function, optimizer, etc. 20 | _ = SSDModel() 21 | 22 | tf.summary.merge_all() 23 | writer = tf.summary.FileWriter('./tensorboard_out', sess.graph) 24 | tf.global_variables_initializer().run() 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Jou-ching Sung 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Global settings 3 | ''' 4 | import tensorflow as tf 5 | 6 | 7 | # Default boxes 8 | # DEFAULT_BOXES = ((x1_offset, y1_offset, x2_offset, y2_offset), (...), ...) 
9 | # Offset is relative to upper-left-corner and lower-right-corner of the feature map cell 10 | DEFAULT_BOXES = ((-0.5, -0.5, 0.5, 0.5), (0.2, 0.2, -0.2, -0.2), (-0.8, -0.2, 0.8, 0.2), (-0.2, -0.8, 0.2, 0.8)) 11 | NUM_DEFAULT_BOXES = len(DEFAULT_BOXES) 12 | 13 | # Constants (TODO: Keep this updated as we go along) 14 | NUM_CLASSES = 3 # 2 signs + 1 background class 15 | NUM_CHANNELS = 1 # grayscale->1, RGB->3 16 | NUM_PRED_CONF = NUM_DEFAULT_BOXES * NUM_CLASSES # number of class predictions per feature map cell 17 | NUM_PRED_LOC = NUM_DEFAULT_BOXES * 4 # number of localization regression predictions per feature map cell 18 | 19 | # Bounding box parameters 20 | IOU_THRESH = 0.5 # match ground-truth box to default boxes exceeding this IOU threshold, during data prep 21 | NMS_IOU_THRESH = 0.2 # IOU threshold for non-max suppression 22 | 23 | # Negatives-to-positives ratio used to filter training data 24 | NEG_POS_RATIO = 5 # negative:positive = NEG_POS_RATIO:1 25 | 26 | # Class confidence threshold to count as detection 27 | CONF_THRESH = 0.9 28 | 29 | # Model selection and dependent parameters 30 | MODEL = 'AlexNet' # AlexNet/VGG16/ResNet50 31 | if MODEL == 'AlexNet': 32 | #IMG_H, IMG_W = 300, 300 33 | #FM_SIZES = [[36, 36], [17, 17], [9, 9], [5, 5]] # feature map sizes for SSD hooks via TensorBoard visualization (HxW) 34 | 35 | IMG_H, IMG_W = 260, 400 36 | FM_SIZES = [[31, 48], [15, 23], [8, 12], [4, 6]] 37 | else: 38 | raise NotImplementedError('Model not implemented') 39 | 40 | # Model hyper-parameters 41 | OPT = tf.train.AdadeltaOptimizer() 42 | REG_SCALE = 1e-2 # L2 regularization strength 43 | LOC_LOSS_WEIGHT = 1. # weight of localization loss: loss = conf_loss + LOC_LOSS_WEIGHT * loc_loss 44 | 45 | # Training process 46 | RESUME = False # resume training from previously saved model? 47 | NUM_EPOCH = 200 48 | BATCH_SIZE = 32 # batch size for training (relatively small) 49 | VALIDATION_SIZE = 0.05 # fraction of total training set to use as validation set 50 | SAVE_MODEL = True # save trained model to disk? 51 | MODEL_SAVE_PATH = './model.ckpt' # where to save trained model 52 | -------------------------------------------------------------------------------- /data_gathering/create_pickle.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Create raw data pickle file 3 | data_raw is a dict mapping image_filename -> [{'class': class_int, 'box_coords': (x1, y1, x2, y2)}, {...}, ...] 4 | ''' 5 | import numpy as np 6 | import pickle 7 | import re 8 | import os 9 | from PIL import Image 10 | 11 | # Script config 12 | RESIZE_IMAGE = True # resize the images and write to 'resized_images/' 13 | GRAYSCALE = True # convert image to grayscale? 
this option is only valid if RESIZE_IMAGE==True (FIXME) 14 | TARGET_W, TARGET_H = 400, 260 # 1.74 is weighted avg ratio, but 1.65 aspect ratio is close enough (1.65 was for stop signs) 15 | 16 | ########################### 17 | # Execute main script 18 | ########################### 19 | 20 | # First get mapping from sign name string to integer label 21 | sign_map = {'stop': 1, 'pedestrianCrossing': 2} # only 2 sign classes (background class is 0) 22 | ''' 23 | sign_map = {} # sign_name -> integer_label 24 | with open('signnames.csv', 'r') as f: 25 | for line in f: 26 | line = line[:-1] # strip newline at the end 27 | integer_label, sign_name = line.split(',') 28 | sign_map[sign_name] = int(integer_label) 29 | ''' 30 | 31 | # Create raw data pickle file 32 | data_raw = {} 33 | 34 | # For speed, put entire contents of mergedAnnotations.csv in memory 35 | merged_annotations = [] 36 | with open('mergedAnnotations.csv', 'r') as f: 37 | for line in f: 38 | line = line[:-1] # strip trailing newline 39 | merged_annotations.append(line) 40 | 41 | # Create pickle file to represent dataset 42 | image_files = os.listdir('annotations') 43 | for image_file in image_files: 44 | # Find box coordinates for all signs in this image 45 | class_list = [] 46 | box_coords_list = [] 47 | for line in merged_annotations: 48 | if re.search(image_file, line): 49 | fields = line.split(';') 50 | 51 | # Get sign name and assign class label 52 | sign_name = fields[1] 53 | if sign_name != 'stop' and sign_name != 'pedestrianCrossing': 54 | continue # ignore signs that are neither stop nor pedestrianCrossing signs 55 | sign_class = sign_map[sign_name] 56 | class_list.append(sign_class) 57 | 58 | # Resize image, get rescaled box coordinates 59 | box_coords = np.array([int(x) for x in fields[2:6]]) 60 | 61 | if RESIZE_IMAGE: 62 | # Resize the images and write to 'resized_images/' 63 | image = Image.open('annotations/' + image_file) 64 | orig_w, orig_h = image.size 65 | 66 | if GRAYSCALE: 67 | image = image.convert('L') # 8-bit grayscale 68 | image = image.resize((TARGET_W, TARGET_H), Image.LANCZOS) # high-quality downsampling filter 69 | 70 | resized_dir = 'resized_images_%dx%d/' % (TARGET_W, TARGET_H) 71 | if not os.path.exists(resized_dir): 72 | os.makedirs(resized_dir) 73 | 74 | image.save(os.path.join(resized_dir, image_file)) 75 | 76 | # Rescale box coordinates 77 | x_scale = TARGET_W / orig_w 78 | y_scale = TARGET_H / orig_h 79 | 80 | ulc_x, ulc_y, lrc_x, lrc_y = box_coords 81 | new_box_coords = (ulc_x * x_scale, ulc_y * y_scale, lrc_x * x_scale, lrc_y * y_scale) 82 | new_box_coords = [round(x) for x in new_box_coords] 83 | box_coords = np.array(new_box_coords) 84 | 85 | box_coords_list.append(box_coords) 86 | 87 | if len(class_list) == 0: 88 | continue # ignore images with no signs-of-interest 89 | class_list = np.array(class_list) 90 | box_coords_list = np.array(box_coords_list) 91 | 92 | # Create the list of dicts 93 | the_list = [] 94 | for i in range(len(box_coords_list)): 95 | d = {'class': class_list[i], 'box_coords': box_coords_list[i]} 96 | the_list.append(d) 97 | 98 | data_raw[image_file] = the_list 99 | 100 | with open('data_raw_%dx%d.p' % (TARGET_W, TARGET_H), 'wb') as f: 101 | pickle.dump(data_raw, f) 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SSD in TensorFlow: Traffic Sign Detection and Classification 2 | ## Overview 3 | Implementation of [Single Shot MultiBox 
Detector (SSD)](https://arxiv.org/abs/1512.02325) in TensorFlow, to detect and classify traffic signs. This implementation was able to achieve 40-45 fps on a GTX 1080 with an Intel Core i7-6700K. 4 | 5 | *Note this project is still work-in-progress*. The main issue now is model overfitting. I am currently working on pre-training on VOC2012 first, then performing transfer learning over to traffic sign detection. 6 | 7 | Currently only stop signs and pedestrian crossing signs are detected. Example detection images are below. 8 | 9 | ![example1](inference_out/stop_1323896809.avi_image12.png) 10 | ![example2](inference_out/pedestrian_1323896918.avi_image9.png) 11 | ![example3](inference_out/stop_1323804419.avi_image31.png) 12 | ![example4](inference_out/stop_1323822840.avi_image5.png) 13 | ![example5](inference_out/pedestrianCrossing_1330547304.avi_image1.png) 14 | ![example6](inference_out/pedestrianCrossing_1333395817.avi_image21.png) 15 | 16 | The model was trained on the [LISA Traffic Sign Dataset](http://cvrr.ucsd.edu/LISA/lisa-traffic-sign-dataset.html), a dataset of US traffic signs. 17 | 18 | ## Dependencies 19 | * Python 3.5+ 20 | * TensorFlow v0.12.0 21 | * Pickle 22 | * OpenCV-Python 23 | * Matplotlib (optional) 24 | 25 | ## How to run 26 | Clone this repository somewhere; let's refer to it as `$ROOT` 27 | 28 | Training the model from scratch: 29 | * Download the [LISA Traffic Sign Dataset](http://cvrr.ucsd.edu/LISA/lisa-traffic-sign-dataset.html), and store it in a directory `$LISA_DATA` 30 | * `cd $LISA_DATA` 31 | * Follow the instructions in the LISA Traffic Sign Dataset to create 'mergedAnnotations.csv' such that only stop signs and pedestrian crossing signs are shown 32 | * `cp $ROOT/data_gathering/create_pickle.py $LISA_DATA` 33 | * `python create_pickle.py` 34 | * `cd $ROOT` 35 | * `ln -s $LISA_DATA/resized_images_* .` 36 | * `ln -s $LISA_DATA/data_raw_*.p .` 37 | * `python data_prep.py` 38 | * This performs box matching between ground-truth boxes and default boxes, and packages the data into a format used later in the pipeline 39 | * `python train.py` 40 | * This trains the SSD model 41 | * `python inference.py -m demo` 42 | * This will take the images from sample_images, annotate them, and display them on screen 43 | * To run predictions on your own images and/or videos, use the `-i` flag in inference.py (see the code for more details) 44 | * Note the model severely overfits at this time 45 | 46 | ## Differences from the original SSD implementation 47 | Obviously, we are only detecting certain traffic signs in this implementation, whereas the original SSD implementation detected a greater number of object classes in the PASCAL VOC and MS COCO datasets. Other notable differences are: 48 | * Uses AlexNet as the base network 49 | * Input image resolution is 400x260 50 | * Uses a dynamic scaling factor based on the dimensions of the feature map relative to original image dimensions 51 | 52 | ## Performance 53 | As mentioned above, this SSD implementation was able to achieve 40-45 fps on a GTX 1080 with an Intel Core i7 6700K. 54 | 55 | The inference time is the sum of the neural network inference time and the Non-Maximum Suppression (NMS) time. Overall, the neural network inference time is significantly less than the NMS time: the neural network inference time is generally between 7 and 8 ms, whereas the NMS time is between 15 and 16 ms. The NMS algorithm implemented here has not been optimized, and runs on CPU only, so further effort to improve performance can be done there.
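One possible direction for that optimization (a rough sketch, not code from this repo): the pairwise IoU computations inside NMS can be vectorized with NumPy instead of comparing boxes one at a time in Python loops. The helper names below (`iou_matrix`, `greedy_nms`) are illustrative, and the snippet only assumes that boxes arrive as an `(N, 6)` array of `[x1, y1, x2, y2, class, probability]` rows, the same layout `nms()` in model.py returns:

```python
import numpy as np

def iou_matrix(boxes):
    # Pairwise IoU for an (N, 6) array of [x1, y1, x2, y2, class, prob] rows
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    area = (x2 - x1) * (y2 - y1)
    xx1 = np.maximum(x1[:, None], x1[None, :])
    yy1 = np.maximum(y1[:, None], y1[None, :])
    xx2 = np.minimum(x2[:, None], x2[None, :])
    yy2 = np.minimum(y2[:, None], y2[None, :])
    inter = np.clip(xx2 - xx1, 0, None) * np.clip(yy2 - yy1, 0, None)
    return inter / (area[:, None] + area[None, :] - inter)

def greedy_nms(boxes, iou_thresh=0.2):
    # Keep the highest-confidence box of each overlapping same-class group
    # (0.2 mirrors NMS_IOU_THRESH in settings.py)
    order = np.argsort(-boxes[:, 5])      # highest confidence first
    boxes = boxes[order]
    ious = iou_matrix(boxes)
    keep = np.ones(len(boxes), dtype=bool)
    for i in range(len(boxes)):
        if not keep[i]:
            continue
        same_class = boxes[:, 4] == boxes[i, 4]
        later = np.arange(len(boxes)) > i
        keep[(ious[i] > iou_thresh) & same_class & later] = False
    return boxes[keep]
```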
56 | 57 | ## Dataset characteristics 58 | The entire LISA Traffic Sign Dataset consists of 47 distinct traffic sign classes. Since we are only concerned with a subset of those classes, we only use a subset of the LISA dataset. Also, we ignore all training samples where we do not find a matching default box, further reducing our dataset's size. Due to this process, we end up with very little data to work with. 59 | 60 | To mitigate this, we can perform image data augmentation and/or pre-train the model on a larger dataset (e.g. VOC2012, ILSVRC). 61 | 62 | ## Training process 63 | Given the small size of our pruned dataset, I chose a train/validation split of 95/5. The model was trained with the Adadelta optimizer, using the default parameters provided by TensorFlow. The model was trained over 200 epochs, with a batch size of 32. 64 | 65 | ## Areas of improvement 66 | There are multiple potential areas of improvement in this project: 67 | 68 | * Pre-train the model on VOC2012 and/or ILSVRC 69 | * Image data augmentation 70 | * Hyper-parameter tuning 71 | * Optimize the NMS algorithm, or leverage an existing optimized NMS implementation 72 | * Implement and report the mAP metric 73 | * Try different base networks 74 | * Expand to more traffic sign classes 75 | -------------------------------------------------------------------------------- /data_prep.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Data preparation 3 | ''' 4 | from settings import * 5 | import numpy as np 6 | import pickle 7 | 8 | 9 | def calc_iou(box_a, box_b): 10 | """ 11 | Calculate the Intersection Over Union of two boxes 12 | Each box is specified by its upper left corner and lower right corner: 13 | (x1, y1, x2, y2), where 1 denotes upper left corner, 2 denotes lower right corner 14 | 15 | Returns IOU value 16 | """ 17 | # Calculate intersection, i.e.
area of overlap between the 2 boxes (could be 0) 18 | # http://math.stackexchange.com/a/99576 19 | x_overlap = max(0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0])) 20 | y_overlap = max(0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1])) 21 | intersection = x_overlap * y_overlap 22 | 23 | # Calculate union 24 | area_box_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]) 25 | area_box_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]) 26 | union = area_box_a + area_box_b - intersection 27 | 28 | iou = intersection / union 29 | return iou 30 | 31 | 32 | def find_gt_boxes(data_raw, image_file): 33 | """ 34 | Given the (global) feature map sizes and a single training example, 35 | find all default boxes that exceed the Jaccard overlap threshold 36 | 37 | Returns y_true array that flags the matching default boxes with the class ID (0 means background, i.e. no match) 38 | """ 39 | # Pre-process ground-truth data 40 | # Convert absolute coordinates to relative coordinates ranging from 0 to 1 41 | # Read the sign class label (note background class label is 0, sign labels are ints >=1) 42 | signs_data = data_raw[image_file] 43 | 44 | signs_class = [] 45 | signs_box_coords = [] # relative coordinates 46 | for sign_data in signs_data: 47 | # Find class label 48 | sign_class = sign_data['class'] 49 | signs_class.append(sign_class) 50 | 51 | # Calculate relative coordinates 52 | # (x1, y1, x2, y2), where 1 denotes upper left corner, 2 denotes lower right corner 53 | abs_box_coords = sign_data['box_coords'] 54 | scale = np.array([IMG_W, IMG_H, IMG_W, IMG_H]) 55 | box_coords = np.array(abs_box_coords) / scale 56 | signs_box_coords.append(box_coords) 57 | 58 | # Initialize y_true to all 0s (0 -> background) 59 | y_true_len = 0 60 | for fm_size in FM_SIZES: 61 | y_true_len += fm_size[0] * fm_size[1] * NUM_DEFAULT_BOXES 62 | y_true_conf = np.zeros(y_true_len) 63 | y_true_loc = np.zeros(y_true_len * 4) 64 | 65 | # For each GT box, for each feature map, for each feature map cell, for each default box: 66 | # 1) Calculate the Jaccard overlap (IOU) and annotate the class label 67 | # 2) Count how many box matches we got 68 | # 3) If we got a match, calculate normalized box coordinates and update y_true_loc 69 | match_counter = 0 70 | for i, gt_box_coords in enumerate(signs_box_coords): 71 | y_true_idx = 0 72 | #for fm_idx, fm_size in enumerate(FM_SIZES): 73 | for fm_size in FM_SIZES: 74 | fm_h, fm_w = fm_size # feature map height and width 75 | for row in range(fm_h): 76 | for col in range(fm_w): 77 | for db in DEFAULT_BOXES: 78 | # Calculate relative box coordinates for this default box 79 | x1_offset, y1_offset, x2_offset, y2_offset = db 80 | abs_db_box_coords = np.array([ 81 | max(0, col + x1_offset), 82 | max(0, row + y1_offset), 83 | min(fm_w, col+1 + x2_offset), 84 | min(fm_h, row+1 + y2_offset) 85 | ]) 86 | scale = np.array([fm_w, fm_h, fm_w, fm_h]) 87 | db_box_coords = abs_db_box_coords / scale 88 | 89 | # Calculate Jaccard overlap (i.e. Intersection Over Union, IOU) of GT box and default box 90 | iou = calc_iou(gt_box_coords, db_box_coords) 91 | 92 | # If box matches, i.e. IOU threshold met 93 | if iou >= IOU_THRESH: 94 | # Update y_true_conf to reflect we found a match, and increment match_counter 95 | y_true_conf[y_true_idx] = signs_class[i] 96 | match_counter += 1 97 | 98 | # Calculate normalized box coordinates and update y_true_loc 99 | abs_box_center = np.array([col + 0.5, row + 0.5]) # absolute coordinates of center of feature map cell 100 | abs_gt_box_coords = gt_box_coords * scale # absolute ground truth box coordinates (in feature map grid) 101 | norm_box_coords = abs_gt_box_coords - np.concatenate((abs_box_center, abs_box_center)) 102 | y_true_loc[y_true_idx*4 : y_true_idx*4 + 4] = norm_box_coords 103 | 104 | y_true_idx += 1 105 | 106 | return y_true_conf, y_true_loc, match_counter 107 | 108 | 109 | def do_data_prep(data_raw): 110 | """ 111 | Create the y_true array 112 | data_raw is the dict mapping image_file -> [{'class': class_int, 'box_coords': (x1, y1, x2, y2)}, {...}, ...] 113 | 114 | Return a dict {image_file1: {'y_true_conf': y_true_conf, 'y_true_loc': y_true_loc}, image_file2: ...} 115 | """ 116 | # Prepare the data by populating y_true appropriately 117 | data_prep = {} 118 | for image_file in data_raw.keys(): 119 | # Find ground-truth boxes based on Jaccard overlap, 120 | # populate y_true_conf (class labels) and y_true_loc (normalized box coordinates) 121 | y_true_conf, y_true_loc, match_counter = find_gt_boxes(data_raw, image_file) 122 | 123 | # Only want data points where we have matching default boxes 124 | if match_counter > 0: 125 | data_prep[image_file] = {'y_true_conf': y_true_conf, 'y_true_loc': y_true_loc} 126 | 127 | return data_prep 128 | 129 | 130 | if __name__ == '__main__': 131 | with open('data_raw_%sx%s.p' % (IMG_W, IMG_H), 'rb') as f: 132 | data_raw = pickle.load(f) 133 | 134 | print('Preparing data (i.e. matching boxes)') 135 | data_prep = do_data_prep(data_raw) 136 | 137 | with open('data_prep_%sx%s.p' % (IMG_W, IMG_H), 'wb') as f: 138 | pickle.dump(data_prep, f) 139 | 140 | print('Done.
Saved prepared data to data_prep_%sx%s.p' % (IMG_W, IMG_H)) 141 | print('Total images with >=1 matching box: %d' % len(data_prep.keys())) 142 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Run inference using trained model 3 | ''' 4 | import tensorflow as tf 5 | from settings import * 6 | from model import SSDModel 7 | from model import ModelHelper 8 | from model import nms 9 | import numpy as np 10 | from sklearn.model_selection import train_test_split 11 | import cv2 12 | import math 13 | import os 14 | import time 15 | import pickle 16 | from PIL import Image 17 | import matplotlib.pyplot as plt 18 | from moviepy.editor import VideoFileClip 19 | from optparse import OptionParser 20 | import glob 21 | 22 | 23 | def run_inference(image, model, sess, mode, sign_map): 24 | """ 25 | Run inference on a given image 26 | 27 | Arguments: 28 | * image: Numpy array representing a single RGB image 29 | * model: Dict of tensor references returned by SSDModel() 30 | * sess: TensorFlow session reference 31 | * mode: String of either "image", "video", or "demo" 32 | 33 | Returns: 34 | * Numpy array representing annotated image 35 | """ 36 | # Save original image in memory 37 | image = np.array(image) 38 | image_orig = np.copy(image) 39 | 40 | # Get relevant tensors 41 | x = model['x'] 42 | is_training = model['is_training'] 43 | preds_conf = model['preds_conf'] 44 | preds_loc = model['preds_loc'] 45 | probs = model['probs'] 46 | 47 | # Convert image to PIL Image, resize it, convert to grayscale (if necessary), convert back to numpy array 48 | image = Image.fromarray(image) 49 | orig_w, orig_h = image.size 50 | if NUM_CHANNELS == 1: 51 | image = image.convert('L') # 8-bit grayscale 52 | image = image.resize((IMG_W, IMG_H), Image.LANCZOS) # high-quality downsampling filter 53 | image = np.asarray(image) 54 | 55 | images = np.array([image]) # create a "batch" of 1 image 56 | if NUM_CHANNELS == 1: 57 | images = np.expand_dims(images, axis=-1) # need extra dimension of size 1 for grayscale 58 | 59 | # Perform object detection 60 | t0 = time.time() # keep track of duration of object detection + NMS 61 | preds_conf_val, preds_loc_val, probs_val = sess.run([preds_conf, preds_loc, probs], feed_dict={x: images, is_training: False}) 62 | if mode != 'video': 63 | print('Inference took %.1f ms (%.2f fps)' % ((time.time() - t0)*1000, 1/(time.time() - t0))) 64 | 65 | # Gather class predictions and confidence values 66 | y_pred_conf = preds_conf_val[0] # batch size of 1, so just take [0] 67 | y_pred_conf = y_pred_conf.astype('float32') 68 | prob = probs_val[0] 69 | 70 | # Gather localization predictions 71 | y_pred_loc = preds_loc_val[0] 72 | 73 | # Perform NMS 74 | boxes = nms(y_pred_conf, y_pred_loc, prob) 75 | if mode != 'video': 76 | print('Inference + NMS took %.1f ms (%.2f fps)' % ((time.time() - t0)*1000, 1/(time.time() - t0))) 77 | 78 | # Rescale boxes' coordinates back to original image's dimensions 79 | # Recall boxes = [[x1, y1, x2, y2, cls, cls_prob], [...], ...] 
80 | scale = np.array([orig_w/IMG_W, orig_h/IMG_H, orig_w/IMG_W, orig_h/IMG_H]) 81 | if len(boxes) > 0: 82 | boxes[:, :4] = boxes[:, :4] * scale 83 | 84 | # Draw and annotate boxes over original image, and return annotated image 85 | image = image_orig 86 | for box in boxes: 87 | # Get box parameters 88 | box_coords = [int(round(x)) for x in box[:4]] 89 | cls = int(box[4]) 90 | cls_prob = box[5] 91 | 92 | # Annotate image 93 | image = cv2.rectangle(image, tuple(box_coords[:2]), tuple(box_coords[2:]), (0,255,0)) 94 | label_str = '%s %.2f' % (sign_map[cls], cls_prob) 95 | image = cv2.putText(image, label_str, (box_coords[0], box_coords[1]), 0, 0.5, (0,255,0), 1, cv2.LINE_AA) 96 | 97 | return image 98 | 99 | 100 | def generate_output(input_files, mode): 101 | """ 102 | Generate annotated images, videos, or sample images, based on mode 103 | """ 104 | # First, load mapping from integer class ID to sign name string 105 | sign_map = {} 106 | with open('signnames.csv', 'r') as f: 107 | for line in f: 108 | line = line[:-1] # strip newline at the end 109 | sign_id, sign_name = line.split(',') 110 | sign_map[int(sign_id)] = sign_name 111 | sign_map[0] = 'background' # class ID 0 reserved for background class 112 | 113 | # Create output directory 'inference_out/' if needed 114 | if mode == 'image' or mode == 'video': 115 | if not os.path.isdir('./inference_out'): 116 | try: 117 | os.mkdir('./inference_out') 118 | except FileExistsError: 119 | print('Error: Cannot mkdir ./inference_out') 120 | return 121 | 122 | # Launch the graph 123 | with tf.Graph().as_default(), tf.Session() as sess: 124 | # "Instantiate" neural network, get relevant tensors 125 | model = SSDModel() 126 | 127 | # Load trained model 128 | saver = tf.train.Saver() 129 | print('Restoring previously trained model at %s' % MODEL_SAVE_PATH) 130 | saver.restore(sess, MODEL_SAVE_PATH) 131 | 132 | if mode == 'image': 133 | for image_file in input_files: 134 | print('Running inference on %s' % image_file) 135 | image_orig = np.asarray(Image.open(image_file)) 136 | image = run_inference(image_orig, model, sess, mode, sign_map) 137 | 138 | head, tail = os.path.split(image_file) 139 | plt.imsave('./inference_out/%s' % tail, image) 140 | print('Output saved in inference_out/') 141 | 142 | elif mode == 'video': 143 | for video_file in input_files: 144 | print('Running inference on %s' % video_file) 145 | video = VideoFileClip(video_file) 146 | video = video.fl_image(lambda x: run_inference(x, model, sess, mode, sign_map)) 147 | 148 | head, tail = os.path.split(video_file) 149 | video.write_videofile('./inference_out/%s' % tail, audio=False) 150 | print('Output saved in inference_out/') 151 | 152 | elif mode == 'demo': 153 | print('Demo mode: Running inference on images in sample_images/') 154 | image_files = os.listdir('sample_images/') 155 | 156 | for image_file in image_files: 157 | print('Running inference on sample_images/%s' % image_file) 158 | image_orig = np.asarray(Image.open('sample_images/' + image_file)) 159 | image = run_inference(image_orig, model, sess, mode, sign_map) 160 | plt.imshow(image) 161 | plt.show() 162 | 163 | else: 164 | raise ValueError('Invalid mode: %s' % mode) 165 | 166 | 167 | if __name__ == '__main__': 168 | # Configure command line options 169 | parser = OptionParser() 170 | parser.add_option('-i', '--input_dir', dest='input_dir', 171 | help='Directory of input videos/images (ignored for "demo" mode). 
Will run inference on all videos/images in that dir') 172 | parser.add_option('-m', '--mode', dest='mode', default='image', 173 | help='Operating mode, could be "image", "video", or "demo"; "demo" mode displays annotated images from sample_images/') 174 | 175 | # Get and parse command line options 176 | options, args = parser.parse_args() 177 | 178 | input_dir = options.input_dir 179 | mode = options.mode 180 | 181 | if mode != 'video' and mode != 'image' and mode != 'demo': 182 | raise ValueError('Invalid mode: %s' % mode) 183 | 184 | if mode != 'demo': 185 | input_files = glob.glob(input_dir + '/*.*') 186 | else: 187 | input_files = [] 188 | 189 | generate_output(input_files, mode) 190 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Train the model on dataset 3 | ''' 4 | import tensorflow as tf 5 | from settings import * 6 | from model import SSDModel 7 | from model import ModelHelper 8 | import numpy as np 9 | from sklearn.model_selection import train_test_split 10 | import cv2 11 | import math 12 | import os 13 | import time 14 | import pickle 15 | from PIL import Image 16 | 17 | 18 | def next_batch(X, y_conf, y_loc, batch_size): 19 | """ 20 | Next batch generator 21 | Arguments: 22 | * X: List of image file names 23 | * y_conf: List of ground-truth vectors for class labels 24 | * y_loc: List of ground-truth vectors for localization 25 | * batch_size: Batch size 26 | 27 | Yields: 28 | * images: Batch numpy array representation of batch of images 29 | * y_true_conf: Batch numpy array of ground-truth class labels 30 | * y_true_loc: Batch numpy array of ground-truth localization 31 | * conf_loss_mask: Loss mask for confidence loss, to set NEG_POS_RATIO 32 | """ 33 | start_idx = 0 34 | while True: 35 | image_files = X[start_idx : start_idx + batch_size] 36 | y_true_conf = np.array(y_conf[start_idx : start_idx + batch_size]) 37 | y_true_loc = np.array(y_loc[start_idx : start_idx + batch_size]) 38 | 39 | # Read images from image_files 40 | images = [] 41 | for image_file in image_files: 42 | image = Image.open('resized_images_%sx%s/%s' % (IMG_W, IMG_H, image_file)) 43 | image = np.asarray(image) 44 | images.append(image) 45 | 46 | images = np.array(images, dtype='float32') 47 | 48 | # Grayscale images have array shape (H, W), but we want shape (H, W, 1) 49 | if NUM_CHANNELS == 1: 50 | images = np.expand_dims(images, axis=-1) 51 | 52 | # Normalize pixel values (scale them between -1 and 1) 53 | images = images/127.5 - 1. 54 | 55 | # For y_true_conf, calculate how many negative examples we need to satisfy NEG_POS_RATIO 56 | num_pos = np.where(y_true_conf > 0)[0].shape[0] 57 | num_neg = NEG_POS_RATIO * num_pos 58 | y_true_conf_size = np.sum(y_true_conf.shape) 59 | 60 | # Create confidence loss mask to satisfy NEG_POS_RATIO 61 | if num_pos + num_neg < y_true_conf_size: 62 | conf_loss_mask = np.copy(y_true_conf) 63 | conf_loss_mask[np.where(conf_loss_mask > 0)] = 1. 64 | 65 | # Find all (i,j) tuples where y_true_conf[i][j]==0 66 | zero_indices = np.where(conf_loss_mask == 0.) # ([i1, i2, ...], [j1, j2, ...]) 67 | zero_indices = np.transpose(zero_indices) # [[i1, j1], [i2, j2], ...]
68 | 69 | # Randomly choose num_neg rows from zero_indices, w/o replacement 70 | chosen_zero_indices = zero_indices[np.random.choice(zero_indices.shape[0], int(num_neg), False)] 71 | 72 | # "Enable" chosen negative examples, specified by chosen_zero_indices 73 | for zero_idx in chosen_zero_indices: 74 | i, j = zero_idx 75 | conf_loss_mask[i][j] = 1. 76 | 77 | else: 78 | # If we have so many positive examples such that num_pos+num_neg >= y_true_conf_size, 79 | # no need to prune negative data 80 | conf_loss_mask = np.ones_like(y_true_conf) 81 | 82 | yield (images, y_true_conf, y_true_loc, conf_loss_mask) 83 | 84 | # Update start index for the next batch 85 | start_idx += batch_size 86 | if start_idx >= X.shape[0]: 87 | start_idx = 0 88 | 89 | 90 | def run_training(): 91 | """ 92 | Load training and test data 93 | Run training process 94 | Plot train/validation losses 95 | Report test loss 96 | Save model 97 | """ 98 | # Load training and test data 99 | with open('data_prep_%sx%s.p' % (IMG_W, IMG_H), mode='rb') as f: 100 | train = pickle.load(f) 101 | #with open('test.p', mode='rb') as f: 102 | # test = pickle.load(f) 103 | 104 | # Format the data 105 | X_train = [] 106 | y_train_conf = [] 107 | y_train_loc = [] 108 | for image_file in train.keys(): 109 | X_train.append(image_file) 110 | y_train_conf.append(train[image_file]['y_true_conf']) 111 | y_train_loc.append(train[image_file]['y_true_loc']) 112 | X_train = np.array(X_train) 113 | y_train_conf = np.array(y_train_conf) 114 | y_train_loc = np.array(y_train_loc) 115 | 116 | # Train/validation split 117 | X_train, X_valid, y_train_conf, y_valid_conf, y_train_loc, y_valid_loc = train_test_split(\ 118 | X_train, y_train_conf, y_train_loc, test_size=VALIDATION_SIZE, random_state=1) 119 | 120 | # Launch the graph 121 | with tf.Graph().as_default(), tf.Session() as sess: 122 | # "Instantiate" neural network, get relevant tensors 123 | model = SSDModel() 124 | x = model['x'] 125 | y_true_conf = model['y_true_conf'] 126 | y_true_loc = model['y_true_loc'] 127 | conf_loss_mask = model['conf_loss_mask'] 128 | is_training = model['is_training'] 129 | optimizer = model['optimizer'] 130 | reported_loss = model['loss'] 131 | 132 | # Training process 133 | # TF saver to save/restore trained model 134 | saver = tf.train.Saver() 135 | 136 | if RESUME: 137 | print('Restoring previously trained model at %s' % MODEL_SAVE_PATH) 138 | saver.restore(sess, MODEL_SAVE_PATH) 139 | 140 | # Restore previous loss history 141 | with open('loss_history.p', 'rb') as f: 142 | loss_history = pickle.load(f) 143 | else: 144 | print('Training model from scratch') 145 | # Variable initialization 146 | sess.run(tf.global_variables_initializer()) 147 | 148 | # For book-keeping, keep track of training and validation loss over epochs, like such: 149 | # [(train_acc_epoch1, valid_acc_epoch1), (train_acc_epoch2, valid_acc_epoch2), ...] 
150 | loss_history = [] 151 | 152 | # Record time elapsed for performance check 153 | last_time = time.time() 154 | train_start_time = time.time() 155 | 156 | # Run NUM_EPOCH epochs of training 157 | for epoch in range(NUM_EPOCH): 158 | train_gen = next_batch(X_train, y_train_conf, y_train_loc, BATCH_SIZE) 159 | num_batches_train = math.ceil(X_train.shape[0] / BATCH_SIZE) 160 | losses = [] # list of loss values for book-keeping 161 | 162 | # Run training on each batch 163 | for _ in range(num_batches_train): 164 | # Obtain the training data and labels from generator 165 | images, y_true_conf_gen, y_true_loc_gen, conf_loss_mask_gen = next(train_gen) 166 | 167 | # Perform gradient update (i.e. training step) on current batch 168 | _, loss = sess.run([optimizer, reported_loss], feed_dict={ 169 | #_, loss, loc_loss_dbg, loc_loss_mask, loc_loss = sess.run([optimizer, reported_loss, model['loc_loss_dbg'], model['loc_loss_mask'], model['loc_loss']],feed_dict={ # DEBUG 170 | x: images, 171 | y_true_conf: y_true_conf_gen, 172 | y_true_loc: y_true_loc_gen, 173 | conf_loss_mask: conf_loss_mask_gen, 174 | is_training: True 175 | }) 176 | 177 | losses.append(loss) # TODO: Need mAP metric instead of raw loss 178 | 179 | # A rough estimate of loss for this epoch (overweights the last batch) 180 | train_loss = np.mean(losses) 181 | 182 | # Calculate validation loss at the end of the epoch 183 | valid_gen = next_batch(X_valid, y_valid_conf, y_valid_loc, BATCH_SIZE) 184 | num_batches_valid = math.ceil(X_valid.shape[0] / BATCH_SIZE) 185 | losses = [] 186 | for _ in range(num_batches_valid): 187 | images, y_true_conf_gen, y_true_loc_gen, conf_loss_mask_gen = next(valid_gen) 188 | 189 | # Perform forward pass and calculate loss 190 | loss = sess.run(reported_loss, feed_dict={ 191 | x: images, 192 | y_true_conf: y_true_conf_gen, 193 | y_true_loc: y_true_loc_gen, 194 | conf_loss_mask: conf_loss_mask_gen, 195 | is_training: False 196 | }) 197 | losses.append(loss) 198 | valid_loss = np.mean(losses) 199 | 200 | # Record and report train/validation/test losses for this epoch 201 | loss_history.append((train_loss, valid_loss)) 202 | 203 | # Print accuracy every epoch 204 | print('Epoch %d -- Train loss: %.4f, Validation loss: %.4f, Elapsed time: %.2f sec' %\ 205 | (epoch+1, train_loss, valid_loss, time.time() - last_time)) 206 | last_time = time.time() 207 | 208 | total_time = time.time() - train_start_time 209 | print('Total elapsed time: %d min %d sec' % (total_time/60, total_time%60)) 210 | 211 | test_loss = 0. 
# TODO: Add test set 212 | ''' 213 | # After training is complete, evaluate accuracy on test set 214 | print('Calculating test accuracy...') 215 | test_gen = next_batch(X_test, y_test, BATCH_SIZE) 216 | test_size = X_test.shape[0] 217 | test_acc = calculate_accuracy(test_gen, test_size, BATCH_SIZE, accuracy, x, y, keep_prob, sess) 218 | print('Test acc.: %.4f' % (test_acc,)) 219 | ''' 220 | 221 | if SAVE_MODEL: 222 | # Save model to disk 223 | save_path = saver.save(sess, MODEL_SAVE_PATH) 224 | print('Trained model saved at: %s' % save_path) 225 | 226 | # Also save loss history 227 | print('Loss history saved at loss_history.p') 228 | with open('loss_history.p', 'wb') as f: 229 | pickle.dump(loss_history, f) 230 | 231 | # Return final test loss and loss history 232 | return test_loss, loss_history 233 | 234 | 235 | if __name__ == '__main__': 236 | run_training() 237 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Model definition 3 | ''' 4 | import numpy as np 5 | import tensorflow as tf 6 | import tensorflow.contrib.slim as slim 7 | from settings import * 8 | from data_prep import calc_iou 9 | 10 | 11 | def SSDHook(feature_map, hook_id): 12 | """ 13 | Takes an input feature map, outputs the prediction tensors 14 | hook_id is a unique string ID for the variable_scope 15 | """ 16 | with tf.variable_scope('ssd_hook_' + hook_id): 17 | # Note we have linear activation (i.e. no activation function) 18 | net_conf = slim.conv2d(feature_map, NUM_PRED_CONF, [3, 3], activation_fn=None, scope='conv_conf') 19 | net_conf = tf.contrib.layers.flatten(net_conf) 20 | 21 | net_loc = slim.conv2d(feature_map, NUM_PRED_LOC, [3, 3], activation_fn=None, scope='conv_loc') 22 | net_loc = tf.contrib.layers.flatten(net_loc) 23 | 24 | return net_conf, net_loc 25 | 26 | 27 | def ModelHelper(y_pred_conf, y_pred_loc): 28 | """ 29 | Define loss function, optimizer, and predictions 30 | Loss includes confidence loss and localization loss 31 | 32 | conf_loss_mask is created at batch generation time, to mask the confidence losses 33 | It has 1 at locations w/ positives, and 1 at select negative locations 34 | such that negative-to-positive ratio of NEG_POS_RATIO is satisfied 35 | 36 | Arguments: 37 | * y_pred_conf: Class predictions from model, 38 | a tensor of shape [batch_size, num_feature_map_cells * num_default_boxes * num_classes] 39 | * y_pred_loc: Localization predictions from model, 40 | a tensor of shape [batch_size, num_feature_map_cells * num_default_boxes * 4] 41 | 42 | Returns relevant tensor references 43 | """ 44 | num_total_preds = 0 45 | for fm_size in FM_SIZES: 46 | num_total_preds += fm_size[0] * fm_size[1] * NUM_DEFAULT_BOXES 47 | num_total_preds_conf = num_total_preds * NUM_CLASSES 48 | num_total_preds_loc = num_total_preds * 4 49 | 50 | # Input tensors 51 | y_true_conf = tf.placeholder(tf.int32, [None, num_total_preds], name='y_true_conf') # classification ground-truth labels 52 | y_true_loc = tf.placeholder(tf.float32, [None, num_total_preds_loc], name='y_true_loc') # localization ground-truth labels 53 | conf_loss_mask = tf.placeholder(tf.float32, [None, num_total_preds], name='conf_loss_mask') # 1 mask "bit" per def. box 54 | 55 | # Confidence loss 56 | logits = tf.reshape(y_pred_conf, [-1, num_total_preds, NUM_CLASSES]) 57 | conf_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y_true_conf) 58 | conf_loss = conf_loss_mask * conf_loss # "zero-out" the loss for don't-care negatives 59 | conf_loss = tf.reduce_sum(conf_loss) 60 | 61 | # Localization loss (smooth L1 loss) 62 | # loc_loss_mask is analogous to conf_loss_mask, except 4 times the size 63 | diff = y_true_loc - y_pred_loc 64 | 65 | loc_loss_l2 = 0.5 * (diff**2.0) 66 | loc_loss_l1 = tf.abs(diff) - 0.5 67 | smooth_l1_condition = tf.less(tf.abs(diff), 1.0) 68 | loc_loss = tf.select(smooth_l1_condition, loc_loss_l2, loc_loss_l1) 69 | 70 | loc_loss_mask = tf.minimum(y_true_conf, 1) # have non-zero localization loss only where we have matching ground-truth box 71 | loc_loss_mask = tf.to_float(loc_loss_mask) 72 | loc_loss_mask = tf.stack([loc_loss_mask] * 4, axis=2) # [0, 1, 1] -> [[[0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]], ...] 73 | loc_loss_mask = tf.reshape(loc_loss_mask, [-1, num_total_preds_loc]) # removing the inner-most dimension of above 74 | loc_loss = loc_loss_mask * loc_loss 75 | loc_loss = tf.reduce_sum(loc_loss) 76 | 77 | # Weighted sum of confidence loss and localization loss 78 | # Also add regularization loss 79 | loss = conf_loss + LOC_LOSS_WEIGHT * loc_loss + tf.reduce_sum(slim.losses.get_regularization_losses()) 80 | optimizer = OPT.minimize(loss) 81 | 82 | #reported_loss = loss #tf.reduce_sum(loss, 1) # DEBUG 83 | 84 | # Class probabilities and predictions 85 | probs_all = tf.nn.softmax(logits) 86 | probs, preds_conf = tf.nn.top_k(probs_all) # take top-1 probability, and the index is the predicted class 87 | probs = tf.reshape(probs, [-1, num_total_preds]) 88 | preds_conf = tf.reshape(preds_conf, [-1, num_total_preds]) 89 | 90 | # Return a dictionary of {tensor_name: tensor_reference} 91 | ret_dict = { 92 | 'y_true_conf': y_true_conf, 93 | 'y_true_loc': y_true_loc, 94 | 'conf_loss_mask': conf_loss_mask, 95 | 'optimizer': optimizer, 96 | 'conf_loss': conf_loss, 97 | 'loc_loss': loc_loss, 98 | 'loss': loss, 99 | 'probs': probs, 100 | 'preds_conf': preds_conf, 101 | 'preds_loc': y_pred_loc, 102 | } 103 | return ret_dict 104 | 105 | 106 | def AlexNet(): 107 | """ 108 | AlexNet 109 | """ 110 | # Image batch tensor and dropout keep prob placeholders 111 | x = tf.placeholder(tf.float32, [None, IMG_H, IMG_W, NUM_CHANNELS], name='x') 112 | is_training = tf.placeholder(tf.bool, name='is_training') 113 | 114 | # Classification and localization predictions 115 | preds_conf = [] # conf -> classification b/c confidence loss -> classification loss 116 | preds_loc = [] 117 | 118 | # Use batch normalization for all convolution layers 119 | # FIXME: Not sure why setting is_training is not working well 120 | #with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm, normalizer_params={'is_training': is_training}): 121 | with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm, normalizer_params={'is_training': True},\ 122 | weights_regularizer=slim.l2_regularizer(scale=REG_SCALE)): 123 | net = slim.conv2d(x, 64, [11, 11], 4, padding='VALID', scope='conv1') 124 | net = slim.max_pool2d(net, [3, 3], 2, scope='pool1') 125 | net = slim.conv2d(net, 192, [5, 5], scope='conv2') 126 | 127 | net_conf, net_loc = SSDHook(net, 'conv2') 128 | preds_conf.append(net_conf) 129 | preds_loc.append(net_loc) 130 | 131 | net = slim.max_pool2d(net, [3, 3], 2, scope='pool2') 132 | net = slim.conv2d(net, 384, [3, 3], scope='conv3') 133 | net = slim.conv2d(net, 384, [3, 3], scope='conv4') 134 | net = slim.conv2d(net, 256, [3, 3], scope='conv5') 135 | 136 | # The following layers added for SSD 137 | net = slim.conv2d(net, 1024, [3, 3], scope='conv6') 138 | net = slim.conv2d(net, 1024, [1, 1], scope='conv7') 139 | 140 | net_conf, net_loc = SSDHook(net, 'conv7') 141 | preds_conf.append(net_conf) 142 | preds_loc.append(net_loc) 143 | 144 | net = slim.conv2d(net, 256, [1, 1], scope='conv8') 145 | net = slim.conv2d(net, 512, [3, 3], 2, scope='conv8_2') 146 | 147 | net_conf, net_loc = SSDHook(net, 'conv8_2') 148 | preds_conf.append(net_conf) 149 | preds_loc.append(net_loc) 150 | 151 | net = slim.conv2d(net, 128, [1, 1], scope='conv9') 152 | net = slim.conv2d(net, 256, [3, 3], 2, scope='conv9_2') 153 | 154 | net_conf, net_loc = SSDHook(net, 'conv9_2') 155 | preds_conf.append(net_conf) 156 | preds_loc.append(net_loc) 157 | 158 | # Concatenate all preds together into 1 vector, for both classification and localization predictions 159 | final_pred_conf = tf.concat(1, preds_conf) 160 | final_pred_loc = tf.concat(1, preds_loc) 161 | 162 | # Return a dictionary of {tensor_name: tensor_reference} 163 | ret_dict = { 164 | 'x': x, 165 | 'y_pred_conf': final_pred_conf, 166 | 'y_pred_loc': final_pred_loc, 167 | 'is_training': is_training, 168 | } 169 | return ret_dict 170 | 171 | 172 | def SSDModel(): 173 | """ 174 | Wrapper around the model and model helper 175 | Returns dict of relevant tensor references 176 | """ 177 | if MODEL == 'AlexNet': 178 | model = AlexNet() 179 | else: 180 | raise NotImplementedError('Model %s not supported' % MODEL) 181 | 182 | model_helper = ModelHelper(model['y_pred_conf'], model['y_pred_loc']) 183 | 184 | ssd_model = {} 185 | for k in model.keys(): 186 | ssd_model[k] = model[k] 187 | for k in model_helper.keys(): 188 | ssd_model[k] = model_helper[k] 189 | 190 | return ssd_model 191 | 192 | 193 | def nms(y_pred_conf, y_pred_loc, prob): 194 | """ 195 | Non-Maximum Suppression (NMS) 196 | Performs NMS on all boxes of each class where predicted probability > CONF_THRESH 197 | For all boxes exceeding IOU threshold, select the box with highest confidence 198 | Returns a list of box coordinates post-NMS 199 | 200 | Arguments: 201 | * y_pred_conf: Class predictions, numpy array of shape (num_feature_map_cells * num_default_boxes,) 202 | * y_pred_loc: Bounding box coordinates, numpy array of shape (num_feature_map_cells * num_default_boxes * 4,) 203 | These coordinates are normalized coordinates relative to center of feature map cell 204 | * prob: Class probabilities, numpy array of shape (num_feature_map_cells * num_default_boxes,) 205 | 206 | Returns: 207 | * boxes: Numpy array of boxes, with shape (num_boxes, 6). Each row is interpreted as: 208 | [x1, y1, x2, y2, class, probability], where x1/y1/x2/y2 are the coordinates of the 209 | upper-left and lower-right corners. Box coordinates assume the image size is IMG_W x IMG_H. 210 | Remember to rescale box coordinates if your target image has different dimensions. 211 | """ 212 | # Keep track of boxes for each class 213 | class_boxes = {} # class -> [(x1, y1, x2, y2, cls, cls_prob), (...), ...]
214 | with open('signnames.csv', 'r') as f: 215 | for line in f: 216 | cls, _ = line.split(',') 217 | class_boxes[float(cls)] = [] 218 | 219 | # Go through all possible boxes and perform class-based greedy NMS (greedy based on class prediction confidence) 220 | y_idx = 0 221 | for fm_size in FM_SIZES: 222 | fm_h, fm_w = fm_size # feature map height and width 223 | for row in range(fm_h): 224 | for col in range(fm_w): 225 | for db in DEFAULT_BOXES: 226 | # Only perform calculations if class confidence > CONF_THRESH and not background class 227 | if prob[y_idx] > CONF_THRESH and y_pred_conf[y_idx] > 0.: 228 | # Calculate absolute coordinates of predicted bounding box 229 | xc, yc = col + 0.5, row + 0.5 # center of current feature map cell 230 | center_coords = np.array([xc, yc, xc, yc]) 231 | abs_box_coords = center_coords + y_pred_loc[y_idx*4 : y_idx*4 + 4] # predictions are offsets to center of fm cell 232 | 233 | # Calculate predicted box coordinates in actual image 234 | scale = np.array([IMG_W/fm_w, IMG_H/fm_h, IMG_W/fm_w, IMG_H/fm_h]) 235 | box_coords = abs_box_coords * scale 236 | box_coords = [int(round(x)) for x in box_coords] 237 | 238 | # Compare this box to all previous boxes of this class 239 | cls = y_pred_conf[y_idx] 240 | cls_prob = prob[y_idx] 241 | box = (*box_coords, cls, cls_prob) 242 | if len(class_boxes[cls]) == 0: 243 | class_boxes[cls].append(box) 244 | else: 245 | suppressed = False # did this box suppress other box(es)? 246 | overlapped = False # did this box overlap with other box(es)? 247 | for other_box in class_boxes[cls]: 248 | iou = calc_iou(box[:4], other_box[:4]) 249 | if iou > NMS_IOU_THRESH: 250 | overlapped = True 251 | # If current box has higher confidence than other box 252 | if box[5] > other_box[5]: 253 | class_boxes[cls].remove(other_box) 254 | suppressed = True 255 | if suppressed or not overlapped: 256 | class_boxes[cls].append(box) 257 | 258 | y_idx += 1 259 | 260 | # Gather all the pruned boxes and return them 261 | boxes = [] 262 | for cls in class_boxes.keys(): 263 | for class_box in class_boxes[cls]: 264 | boxes.append(class_box) 265 | boxes = np.array(boxes) 266 | 267 | return boxes 268 | --------------------------------------------------------------------------------
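Illustrative note (not a file in this repository): the default-box offsets in settings.py are interpreted relative to a feature-map cell, and both find_gt_boxes() in data_prep.py and nms() in model.py rely on this geometry. A minimal standalone sketch of that arithmetic, assuming the first default box and the first AlexNet feature map size (31x48) from settings.py, with an arbitrarily chosen cell:

import numpy as np

IMG_H, IMG_W = 260, 400                                    # values from settings.py
fm_h, fm_w = 31, 48                                        # first entry of FM_SIZES for AlexNet
x1_off, y1_off, x2_off, y2_off = (-0.5, -0.5, 0.5, 0.5)    # first entry of DEFAULT_BOXES
row, col = 10, 20                                          # an arbitrary feature-map cell (assumed for this example)

# Same clipping as find_gt_boxes() in data_prep.py, in feature-map units
abs_db = np.array([max(0, col + x1_off), max(0, row + y1_off),
                   min(fm_w, col + 1 + x2_off), min(fm_h, row + 1 + y2_off)])

# Normalize to [0, 1] -- this is what find_gt_boxes() compares against the ground-truth box
rel_db = abs_db / np.array([fm_w, fm_h, fm_w, fm_h])

# Scale feature-map units back to pixels, using the same per-axis factors as nms()
pixel_db = abs_db * np.array([IMG_W / fm_w, IMG_H / fm_h, IMG_W / fm_w, IMG_H / fm_h])

print(rel_db)    # approx. [0.41, 0.31, 0.45, 0.37]
print(pixel_db)  # approx. [162.5, 79.7, 179.2, 96.5]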