├── signnames.csv ├── inference_out ├── stop_1323822840.avi_image5.png ├── stop_1323804419.avi_image31.png ├── stop_1323896809.avi_image12.png ├── pedestrian_1323896918.avi_image9.png ├── pedestrianCrossing_1330547304.avi_image1.png └── pedestrianCrossing_1333395817.avi_image21.png ├── sample_images ├── stop_1323822840.avi_image5.png ├── stop_1323803184.avi_image16.png ├── stop_1323804419.avi_image31.png ├── stop_1323804592.avi_image12.png ├── stop_1323896809.avi_image12.png ├── pedestrian_1323896918.avi_image9.png ├── pedestrianCrossing_1330547304.avi_image1.png ├── pedestrianCrossing_1333395693.avi_image8.png └── pedestrianCrossing_1333395817.avi_image21.png ├── viz_model.py ├── LICENSE ├── settings.py ├── data_gathering └── create_pickle.py ├── README.md ├── data_prep.py ├── inference.py ├── train.py └── model.py /signnames.csv: -------------------------------------------------------------------------------- 1 | 1,stop 2 | 2,pedestrianCrossing 3 | -------------------------------------------------------------------------------- /inference_out/stop_1323822840.avi_image5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/inference_out/stop_1323822840.avi_image5.png -------------------------------------------------------------------------------- /sample_images/stop_1323822840.avi_image5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/stop_1323822840.avi_image5.png -------------------------------------------------------------------------------- /inference_out/stop_1323804419.avi_image31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/inference_out/stop_1323804419.avi_image31.png -------------------------------------------------------------------------------- /inference_out/stop_1323896809.avi_image12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/inference_out/stop_1323896809.avi_image12.png -------------------------------------------------------------------------------- /sample_images/stop_1323803184.avi_image16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/stop_1323803184.avi_image16.png -------------------------------------------------------------------------------- /sample_images/stop_1323804419.avi_image31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/stop_1323804419.avi_image31.png -------------------------------------------------------------------------------- /sample_images/stop_1323804592.avi_image12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/stop_1323804592.avi_image12.png -------------------------------------------------------------------------------- /sample_images/stop_1323896809.avi_image12.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/stop_1323896809.avi_image12.png -------------------------------------------------------------------------------- /inference_out/pedestrian_1323896918.avi_image9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/inference_out/pedestrian_1323896918.avi_image9.png -------------------------------------------------------------------------------- /sample_images/pedestrian_1323896918.avi_image9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/pedestrian_1323896918.avi_image9.png -------------------------------------------------------------------------------- /inference_out/pedestrianCrossing_1330547304.avi_image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/inference_out/pedestrianCrossing_1330547304.avi_image1.png -------------------------------------------------------------------------------- /inference_out/pedestrianCrossing_1333395817.avi_image21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/inference_out/pedestrianCrossing_1333395817.avi_image21.png -------------------------------------------------------------------------------- /sample_images/pedestrianCrossing_1330547304.avi_image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/pedestrianCrossing_1330547304.avi_image1.png -------------------------------------------------------------------------------- /sample_images/pedestrianCrossing_1333395693.avi_image8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/pedestrianCrossing_1333395693.avi_image8.png -------------------------------------------------------------------------------- /sample_images/pedestrianCrossing_1333395817.avi_image21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/georgesung/ssd_tensorflow_traffic_sign_detection/HEAD/sample_images/pedestrianCrossing_1333395817.avi_image21.png -------------------------------------------------------------------------------- /viz_model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Visualize the model using TensorBoard 3 | ''' 4 | import tensorflow as tf 5 | from settings import * 6 | from model import SSDModel 7 | 8 | FM_ONLY = False # Only want to see feature map sizes? 9 | 10 | with tf.Graph().as_default(), tf.Session() as sess: 11 | if FM_ONLY: 12 | # Only want to see feature map sizes (e.g. 
loss function and vector concatenation not yet set up) 13 | if MODEL == 'AlexNet': 14 | from model import AlexNet as MyModel 15 | else: 16 | raise NotImplementedError('Model %s not supported' % MODEL) 17 | _ = MyModel() 18 | else: 19 | # This includes the entire graph, e.g. loss function, optimizer, etc. 20 | _ = SSDModel() 21 | 22 | tf.summary.merge_all() 23 | writer = tf.summary.FileWriter('./tensorboard_out', sess.graph) 24 | tf.global_variables_initializer().run() 25 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Jou-ching Sung 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Global settings 3 | ''' 4 | import tensorflow as tf 5 | 6 | 7 | # Default boxes 8 | # DEFAULT_BOXES = ((x1_offset, y1_offset, x2_offset, y2_offset), (...), ...) 
9 | # Offset is relative to upper-left-corner and lower-right-corner of the feature map cell 10 | DEFAULT_BOXES = ((-0.5, -0.5, 0.5, 0.5), (0.2, 0.2, -0.2, -0.2), (-0.8, -0.2, 0.8, 0.2), (-0.2, -0.8, 0.2, 0.8)) 11 | NUM_DEFAULT_BOXES = len(DEFAULT_BOXES) 12 | 13 | # Constants (TODO: Keep this updated as we go along) 14 | NUM_CLASSES = 3 # 2 signs + 1 background class 15 | NUM_CHANNELS = 1 # grayscale->1, RGB->3 16 | NUM_PRED_CONF = NUM_DEFAULT_BOXES * NUM_CLASSES # number of class predictions per feature map cell 17 | NUM_PRED_LOC = NUM_DEFAULT_BOXES * 4 # number of localization regression predictions per feature map cell 18 | 19 | # Bounding box parameters 20 | IOU_THRESH = 0.5 # match ground-truth box to default boxes exceeding this IOU threshold, during data prep 21 | NMS_IOU_THRESH = 0.2 # IOU threshold for non-max suppression 22 | 23 | # Negatives-to-positives ratio used to filter training data 24 | NEG_POS_RATIO = 5 # negative:positive = NEG_POS_RATIO:1 25 | 26 | # Class confidence threshold to count as detection 27 | CONF_THRESH = 0.9 28 | 29 | # Model selection and dependent parameters 30 | MODEL = 'AlexNet' # AlexNet/VGG16/ResNet50 31 | if MODEL == 'AlexNet': 32 | #IMG_H, IMG_W = 300, 300 33 | #FM_SIZES = [[36, 36], [17, 17], [9, 9], [5, 5]] # feature map sizes for SSD hooks via TensorBoard visualization (HxW) 34 | 35 | IMG_H, IMG_W = 260, 400 36 | FM_SIZES = [[31, 48], [15, 23], [8, 12], [4, 6]] 37 | else: 38 | raise NotImplementedError('Model not implemented') 39 | 40 | # Model hyper-parameters 41 | OPT = tf.train.AdadeltaOptimizer() 42 | REG_SCALE = 1e-2 # L2 regularization strength 43 | LOC_LOSS_WEIGHT = 1. # weight of localization loss: loss = conf_loss + LOC_LOSS_WEIGHT * loc_loss 44 | 45 | # Training process 46 | RESUME = False # resume training from previously saved model? 47 | NUM_EPOCH = 200 48 | BATCH_SIZE = 32 # batch size for training (relatively small) 49 | VALIDATION_SIZE = 0.05 # fraction of total training set to use as validation set 50 | SAVE_MODEL = True # save trained model to disk? 51 | MODEL_SAVE_PATH = './model.ckpt' # where to save trained model 52 | -------------------------------------------------------------------------------- /data_gathering/create_pickle.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Create raw data pickle file 3 | data_raw is a dict mapping image_filename -> [{'class': class_int, 'box_coords': (x1, y1, x2, y2)}, {...}, ...] 4 | ''' 5 | import numpy as np 6 | import pickle 7 | import re 8 | import os 9 | from PIL import Image 10 | 11 | # Script config 12 | RESIZE_IMAGE = True # resize the images and write to 'resized_images/' 13 | GRAYSCALE = True # convert image to grayscale? 
this option is only valid if RESIZE_IMAGE==True (FIXME) 14 | TARGET_W, TARGET_H = 400, 260 # 1.74 is weighted avg ratio, but 1.65 aspect ratio is close enough (1.65 was for stop signs) 15 | 16 | ########################### 17 | # Execute main script 18 | ########################### 19 | 20 | # First get mapping from sign name string to integer label 21 | sign_map = {'stop': 1, 'pedestrianCrossing': 2} # only 2 sign classes (background class is 0) 22 | ''' 23 | sign_map = {} # sign_name -> integer_label 24 | with open('signnames.csv', 'r') as f: 25 | for line in f: 26 | line = line[:-1] # strip newline at the end 27 | integer_label, sign_name = line.split(',') 28 | sign_map[sign_name] = int(integer_label) 29 | ''' 30 | 31 | # Create raw data pickle file 32 | data_raw = {} 33 | 34 | # For speed, put entire contents of mergedAnnotations.csv in memory 35 | merged_annotations = [] 36 | with open('mergedAnnotations.csv', 'r') as f: 37 | for line in f: 38 | line = line[:-1] # strip trailing newline 39 | merged_annotations.append(line) 40 | 41 | # Create pickle file to represent dataset 42 | image_files = os.listdir('annotations') 43 | for image_file in image_files: 44 | # Find box coordinates for all signs in this image 45 | class_list = [] 46 | box_coords_list = [] 47 | for line in merged_annotations: 48 | if re.search(image_file, line): 49 | fields = line.split(';') 50 | 51 | # Get sign name and assign class label 52 | sign_name = fields[1] 53 | if sign_name != 'stop' and sign_name != 'pedestrianCrossing': 54 | continue # ignore signs that are neither stop nor pedestrianCrossing signs 55 | sign_class = sign_map[sign_name] 56 | class_list.append(sign_class) 57 | 58 | # Resize image, get rescaled box coordinates 59 | box_coords = np.array([int(x) for x in fields[2:6]]) 60 | 61 | if RESIZE_IMAGE: 62 | # Resize the images and write to 'resized_images/' 63 | image = Image.open('annotations/' + image_file) 64 | orig_w, orig_h = image.size 65 | 66 | if GRAYSCALE: 67 | image = image.convert('L') # 8-bit grayscale 68 | image = image.resize((TARGET_W, TARGET_H), Image.LANCZOS) # high-quality downsampling filter 69 | 70 | resized_dir = 'resized_images_%dx%d/' % (TARGET_W, TARGET_H) 71 | if not os.path.exists(resized_dir): 72 | os.makedirs(resized_dir) 73 | 74 | image.save(os.path.join(resized_dir, image_file)) 75 | 76 | # Rescale box coordinates 77 | x_scale = TARGET_W / orig_w 78 | y_scale = TARGET_H / orig_h 79 | 80 | ulc_x, ulc_y, lrc_x, lrc_y = box_coords 81 | new_box_coords = (ulc_x * x_scale, ulc_y * y_scale, lrc_x * x_scale, lrc_y * y_scale) 82 | new_box_coords = [round(x) for x in new_box_coords] 83 | box_coords = np.array(new_box_coords) 84 | 85 | box_coords_list.append(box_coords) 86 | 87 | if len(class_list) == 0: 88 | continue # ignore images with no signs-of-interest 89 | class_list = np.array(class_list) 90 | box_coords_list = np.array(box_coords_list) 91 | 92 | # Create the list of dicts 93 | the_list = [] 94 | for i in range(len(box_coords_list)): 95 | d = {'class': class_list[i], 'box_coords': box_coords_list[i]} 96 | the_list.append(d) 97 | 98 | data_raw[image_file] = the_list 99 | 100 | with open('data_raw_%dx%d.p' % (TARGET_W, TARGET_H), 'wb') as f: 101 | pickle.dump(data_raw, f) 102 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SSD in TensorFlow: Traffic Sign Detection and Classification 2 | ## Overview 3 | Implementation of [Single Shot MultiBox 
Detector (SSD)](https://arxiv.org/abs/1512.02325) in TensorFlow, to detect and classify traffic signs. This implementation was able to achieve 40-45 fps on a GTX 1080 with an Intel Core i7-6700K. 4 | 5 | *Note this project is still work-in-progress*. The main issue now is model overfitting. I am currently working on pre-training on VOC2012 first, then performing transfer learning over to traffic sign detection. 6 | 7 | Currently only stop signs and pedestrian crossing signs are detected. Example detection images are below. 8 | 9 | ![example1](inference_out/stop_1323896809.avi_image12.png) 10 | ![example2](inference_out/pedestrian_1323896918.avi_image9.png) 11 | ![example3](inference_out/stop_1323804419.avi_image31.png) 12 | ![example4](inference_out/stop_1323822840.avi_image5.png) 13 | ![example5](inference_out/pedestrianCrossing_1330547304.avi_image1.png) 14 | ![example6](inference_out/pedestrianCrossing_1333395817.avi_image21.png) 15 | 16 | The model was trained on the [LISA Traffic Sign Dataset](http://cvrr.ucsd.edu/LISA/lisa-traffic-sign-dataset.html), a dataset of US traffic signs. 17 | 18 | ## Dependencies 19 | * Python 3.5+ 20 | * TensorFlow v0.12.0 21 | * Pickle 22 | * OpenCV-Python 23 | * Matplotlib (optional) 24 | 25 | ## How to run 26 | Clone this repository somewhere; let's refer to it as `$ROOT` 27 | 28 | Training the model from scratch: 29 | * Download the [LISA Traffic Sign Dataset](http://cvrr.ucsd.edu/LISA/lisa-traffic-sign-dataset.html), and store it in a directory `$LISA_DATA` 30 | * `cd $LISA_DATA` 31 | * Follow the instructions in the LISA Traffic Sign Dataset to create 'mergedAnnotations.csv' such that only stop signs and pedestrian crossing signs are shown 32 | * `cp $ROOT/data_gathering/create_pickle.py $LISA_DATA` 33 | * `python create_pickle.py` 34 | * `cd $ROOT` 35 | * `ln -s $LISA_DATA/resized_images_* .` 36 | * `ln -s $LISA_DATA/data_raw_*.p .` 37 | * `python data_prep.py` 38 | * This performs box matching between ground-truth boxes and default boxes, and packages the data into a format used later in the pipeline 39 | * `python train.py` 40 | * This trains the SSD model 41 | * `python inference.py -m demo` 42 | * This will take the images from sample_images, annotate them, and display them on screen 43 | * To run predictions on your own images and/or videos, use the `-i` flag in inference.py (see the code for more details) 44 | * Note the model severely overfits at this time 45 | 46 | ## Differences from the original SSD implementation 47 | Obviously, we are only detecting certain traffic signs in this implementation, whereas the original SSD implementation detected a greater number of object classes in the PASCAL VOC and MS COCO datasets. Other notable differences are: 48 | * Uses AlexNet as the base network 49 | * Input image resolution is 400x260 50 | * Uses a dynamic scaling factor based on the dimensions of the feature map relative to original image dimensions 51 | 52 | ## Performance 53 | As mentioned above, this SSD implementation was able to achieve 40-45 fps on a GTX 1080 with an Intel Core i7 6700K. 54 | 55 | The inference time is the sum of the neural network inference time and the Non-Maximum Suppression (NMS) time. Overall, the neural network inference time is significantly less than the NMS time: the neural network inference time is generally between 7 and 8 ms, whereas the NMS time is between 15 and 16 ms. The NMS algorithm implemented here has not been optimized, and runs on CPU only, so further effort to improve performance can be done there.
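One possible direction for that optimization (a rough sketch, not code from this repo): the pairwise IoU computations inside NMS can be vectorized with NumPy instead of comparing boxes one at a time in Python loops. The helper names below (`iou_matrix`, `greedy_nms`) are illustrative, and the snippet only assumes that boxes arrive as an `(N, 6)` array of `[x1, y1, x2, y2, class, probability]` rows, the same layout `nms()` in model.py returns:

```python
import numpy as np

def iou_matrix(boxes):
    # Pairwise IoU for an (N, 6) array of [x1, y1, x2, y2, class, prob] rows
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    area = (x2 - x1) * (y2 - y1)
    xx1 = np.maximum(x1[:, None], x1[None, :])
    yy1 = np.maximum(y1[:, None], y1[None, :])
    xx2 = np.minimum(x2[:, None], x2[None, :])
    yy2 = np.minimum(y2[:, None], y2[None, :])
    inter = np.clip(xx2 - xx1, 0, None) * np.clip(yy2 - yy1, 0, None)
    return inter / (area[:, None] + area[None, :] - inter)

def greedy_nms(boxes, iou_thresh=0.2):
    # Keep the highest-confidence box of each overlapping same-class group
    # (0.2 mirrors NMS_IOU_THRESH in settings.py)
    order = np.argsort(-boxes[:, 5])      # highest confidence first
    boxes = boxes[order]
    ious = iou_matrix(boxes)
    keep = np.ones(len(boxes), dtype=bool)
    for i in range(len(boxes)):
        if not keep[i]:
            continue
        same_class = boxes[:, 4] == boxes[i, 4]
        later = np.arange(len(boxes)) > i
        keep[(ious[i] > iou_thresh) & same_class & later] = False
    return boxes[keep]
```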
56 | 57 | ## Dataset characteristics 58 | The entire LISA Traffic Sign Dataset consists of 47 distinct traffic sign classes. Since we are only concerned with a subset of those classes, we only use a subset of the LISA dataset. Also, we ignore all training samples where we do not find a matching default box, further reducing our dataset's size. Due to this process, we end up with very little data to work with. 59 | 60 | To mitigate this, we can perform image data augmentation and/or pre-train the model on a larger dataset (e.g. VOC2012, ILSVRC). 61 | 62 | ## Training process 63 | Given the small size of our pruned dataset, I chose a train/validation split of 95/5. The model was trained with the Adadelta optimizer, using the default parameters provided by TensorFlow. The model was trained over 200 epochs, with a batch size of 32. 64 | 65 | ## Areas of improvement 66 | There are multiple potential areas of improvement in this project: 67 | 68 | * Pre-train the model on VOC2012 and/or ILSVRC 69 | * Image data augmentation 70 | * Hyper-parameter tuning 71 | * Optimize the NMS algorithm, or leverage an existing optimized NMS implementation 72 | * Implement and report the mAP metric 73 | * Try different base networks 74 | * Expand to more traffic sign classes 75 | -------------------------------------------------------------------------------- /data_prep.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Data preparation 3 | ''' 4 | from settings import * 5 | import numpy as np 6 | import pickle 7 | 8 | 9 | def calc_iou(box_a, box_b): 10 | """ 11 | Calculate the Intersection Over Union of two boxes 12 | Each box is specified by its upper left corner and lower right corner: 13 | (x1, y1, x2, y2), where 1 denotes upper left corner, 2 denotes lower right corner 14 | 15 | Returns IOU value 16 | """ 17 | # Calculate intersection, i.e.
area of overlap between the 2 boxes (could be 0) 18 | # http://math.stackexchange.com/a/99576 19 | x_overlap = max(0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0])) 20 | y_overlap = max(0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1])) 21 | intersection = x_overlap * y_overlap 22 | 23 | # Calculate union 24 | area_box_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]) 25 | area_box_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]) 26 | union = area_box_a + area_box_b - intersection 27 | 28 | iou = intersection / union 29 | return iou 30 | 31 | 32 | def find_gt_boxes(data_raw, image_file): 33 | """ 34 | Given the (global) feature map sizes and a single training example, 35 | find all default boxes that exceed the Jaccard overlap threshold 36 | 37 | Returns y_true array that flags the matching default boxes with the class ID (0 means background, i.e. no match) 38 | """ 39 | # Pre-process ground-truth data 40 | # Convert absolute coordinates to relative coordinates ranging from 0 to 1 41 | # Read the sign class label (note background class label is 0, sign labels are ints >=1) 42 | signs_data = data_raw[image_file] 43 | 44 | signs_class = [] 45 | signs_box_coords = [] # relative coordinates 46 | for sign_data in signs_data: 47 | # Find class label 48 | sign_class = sign_data['class'] 49 | signs_class.append(sign_class) 50 | 51 | # Calculate relative coordinates 52 | # (x1, y1, x2, y2), where 1 denotes upper left corner, 2 denotes lower right corner 53 | abs_box_coords = sign_data['box_coords'] 54 | scale = np.array([IMG_W, IMG_H, IMG_W, IMG_H]) 55 | box_coords = np.array(abs_box_coords) / scale 56 | signs_box_coords.append(box_coords) 57 | 58 | # Initialize y_true to all 0s (0 -> background) 59 | y_true_len = 0 60 | for fm_size in FM_SIZES: 61 | y_true_len += fm_size[0] * fm_size[1] * NUM_DEFAULT_BOXES 62 | y_true_conf = np.zeros(y_true_len) 63 | y_true_loc = np.zeros(y_true_len * 4) 64 | 65 | # For each GT box, for each feature map, for each feature map cell, for each default box: 66 | # 1) Calculate the Jaccard overlap (IOU) and annotate the class label 67 | # 2) Count how many box matches we got 68 | # 3) If we got a match, calculate normalized box coordinates and update y_true_loc 69 | match_counter = 0 70 | for i, gt_box_coords in enumerate(signs_box_coords): 71 | y_true_idx = 0 72 | #for fm_idx, fm_size in enumerate(FM_SIZES): 73 | for fm_size in FM_SIZES: 74 | fm_h, fm_w = fm_size # feature map height and width 75 | for row in range(fm_h): 76 | for col in range(fm_w): 77 | for db in DEFAULT_BOXES: 78 | # Calculate relative box coordinates for this default box 79 | x1_offset, y1_offset, x2_offset, y2_offset = db 80 | abs_db_box_coords = np.array([ 81 | max(0, col + x1_offset), 82 | max(0, row + y1_offset), 83 | min(fm_w, col+1 + x2_offset), 84 | min(fm_h, row+1 + y2_offset) 85 | ]) 86 | scale = np.array([fm_w, fm_h, fm_w, fm_h]) 87 | db_box_coords = abs_db_box_coords / scale 88 | 89 | # Calculate Jaccard overlap (i.e. Intersection Over Union, IOU) of GT box and default box 90 | iou = calc_iou(gt_box_coords, db_box_coords) 91 | 92 | # If box matches, i.e. IOU threshold met 93 | if iou >= IOU_THRESH: 94 | # Update y_true_conf to reflect we found a match, and increment match_counter 95 | y_true_conf[y_true_idx] = signs_class[i] 96 | match_counter += 1 97 | 98 | # Calculate normalized box coordinates and update y_true_loc 99 | abs_box_center = np.array([col + 0.5, row + 0.5]) # absolute coordinates of center of feature map cell 100 | abs_gt_box_coords = gt_box_coords * scale # absolute ground truth box coordinates (in feature map grid) 101 | norm_box_coords = abs_gt_box_coords - np.concatenate((abs_box_center, abs_box_center)) 102 | y_true_loc[y_true_idx*4 : y_true_idx*4 + 4] = norm_box_coords 103 | 104 | y_true_idx += 1 105 | 106 | return y_true_conf, y_true_loc, match_counter 107 | 108 | 109 | def do_data_prep(data_raw): 110 | """ 111 | Create the y_true array 112 | data_raw is the dict mapping image_file -> [{'class': class_int, 'box_coords': (x1, y1, x2, y2)}, {...}, ...] 113 | 114 | Return a dict {image_file1: {'y_true_conf': y_true_conf, 'y_true_loc': y_true_loc}, image_file2: ...} 115 | """ 116 | # Prepare the data by populating y_true appropriately 117 | data_prep = {} 118 | for image_file in data_raw.keys(): 119 | # Find ground-truth boxes based on Jaccard overlap, 120 | # populate y_true_conf (class labels) and y_true_loc (normalized box coordinates) 121 | y_true_conf, y_true_loc, match_counter = find_gt_boxes(data_raw, image_file) 122 | 123 | # Only want data points where we have matching default boxes 124 | if match_counter > 0: 125 | data_prep[image_file] = {'y_true_conf': y_true_conf, 'y_true_loc': y_true_loc} 126 | 127 | return data_prep 128 | 129 | 130 | if __name__ == '__main__': 131 | with open('data_raw_%sx%s.p' % (IMG_W, IMG_H), 'rb') as f: 132 | data_raw = pickle.load(f) 133 | 134 | print('Preparing data (i.e. matching boxes)') 135 | data_prep = do_data_prep(data_raw) 136 | 137 | with open('data_prep_%sx%s.p' % (IMG_W, IMG_H), 'wb') as f: 138 | pickle.dump(data_prep, f) 139 | 140 | print('Done.
Saved prepared data to data_prep_%sx%s.p' % (IMG_W, IMG_H)) 141 | print('Total images with >=1 matching box: %d' % len(data_prep.keys())) 142 | -------------------------------------------------------------------------------- /inference.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Run inference using trained model 3 | ''' 4 | import tensorflow as tf 5 | from settings import * 6 | from model import SSDModel 7 | from model import ModelHelper 8 | from model import nms 9 | import numpy as np 10 | from sklearn.model_selection import train_test_split 11 | import cv2 12 | import math 13 | import os 14 | import time 15 | import pickle 16 | from PIL import Image 17 | import matplotlib.pyplot as plt 18 | from moviepy.editor import VideoFileClip 19 | from optparse import OptionParser 20 | import glob 21 | 22 | 23 | def run_inference(image, model, sess, mode, sign_map): 24 | """ 25 | Run inference on a given image 26 | 27 | Arguments: 28 | * image: Numpy array representing a single RGB image 29 | * model: Dict of tensor references returned by SSDModel() 30 | * sess: TensorFlow session reference 31 | * mode: String of either "image", "video", or "demo" 32 | 33 | Returns: 34 | * Numpy array representing annotated image 35 | """ 36 | # Save original image in memory 37 | image = np.array(image) 38 | image_orig = np.copy(image) 39 | 40 | # Get relevant tensors 41 | x = model['x'] 42 | is_training = model['is_training'] 43 | preds_conf = model['preds_conf'] 44 | preds_loc = model['preds_loc'] 45 | probs = model['probs'] 46 | 47 | # Convert image to PIL Image, resize it, convert to grayscale (if necessary), convert back to numpy array 48 | image = Image.fromarray(image) 49 | orig_w, orig_h = image.size 50 | if NUM_CHANNELS == 1: 51 | image = image.convert('L') # 8-bit grayscale 52 | image = image.resize((IMG_W, IMG_H), Image.LANCZOS) # high-quality downsampling filter 53 | image = np.asarray(image) 54 | 55 | images = np.array([image]) # create a "batch" of 1 image 56 | if NUM_CHANNELS == 1: 57 | images = np.expand_dims(images, axis=-1) # need extra dimension of size 1 for grayscale 58 | 59 | # Perform object detection 60 | t0 = time.time() # keep track of duration of object detection + NMS 61 | preds_conf_val, preds_loc_val, probs_val = sess.run([preds_conf, preds_loc, probs], feed_dict={x: images, is_training: False}) 62 | if mode != 'video': 63 | print('Inference took %.1f ms (%.2f fps)' % ((time.time() - t0)*1000, 1/(time.time() - t0))) 64 | 65 | # Gather class predictions and confidence values 66 | y_pred_conf = preds_conf_val[0] # batch size of 1, so just take [0] 67 | y_pred_conf = y_pred_conf.astype('float32') 68 | prob = probs_val[0] 69 | 70 | # Gather localization predictions 71 | y_pred_loc = preds_loc_val[0] 72 | 73 | # Perform NMS 74 | boxes = nms(y_pred_conf, y_pred_loc, prob) 75 | if mode != 'video': 76 | print('Inference + NMS took %.1f ms (%.2f fps)' % ((time.time() - t0)*1000, 1/(time.time() - t0))) 77 | 78 | # Rescale boxes' coordinates back to original image's dimensions 79 | # Recall boxes = [[x1, y1, x2, y2, cls, cls_prob], [...], ...] 
80 | scale = np.array([orig_w/IMG_W, orig_h/IMG_H, orig_w/IMG_W, orig_h/IMG_H]) 81 | if len(boxes) > 0: 82 | boxes[:, :4] = boxes[:, :4] * scale 83 | 84 | # Draw and annotate boxes over original image, and return annotated image 85 | image = image_orig 86 | for box in boxes: 87 | # Get box parameters 88 | box_coords = [int(round(x)) for x in box[:4]] 89 | cls = int(box[4]) 90 | cls_prob = box[5] 91 | 92 | # Annotate image 93 | image = cv2.rectangle(image, tuple(box_coords[:2]), tuple(box_coords[2:]), (0,255,0)) 94 | label_str = '%s %.2f' % (sign_map[cls], cls_prob) 95 | image = cv2.putText(image, label_str, (box_coords[0], box_coords[1]), 0, 0.5, (0,255,0), 1, cv2.LINE_AA) 96 | 97 | return image 98 | 99 | 100 | def generate_output(input_files, mode): 101 | """ 102 | Generate annotated images, videos, or sample images, based on mode 103 | """ 104 | # First, load mapping from integer class ID to sign name string 105 | sign_map = {} 106 | with open('signnames.csv', 'r') as f: 107 | for line in f: 108 | line = line[:-1] # strip newline at the end 109 | sign_id, sign_name = line.split(',') 110 | sign_map[int(sign_id)] = sign_name 111 | sign_map[0] = 'background' # class ID 0 reserved for background class 112 | 113 | # Create output directory 'inference_out/' if needed 114 | if mode == 'image' or mode == 'video': 115 | if not os.path.isdir('./inference_out'): 116 | try: 117 | os.mkdir('./inference_out') 118 | except FileExistsError: 119 | print('Error: Cannot mkdir ./inference_out') 120 | return 121 | 122 | # Launch the graph 123 | with tf.Graph().as_default(), tf.Session() as sess: 124 | # "Instantiate" neural network, get relevant tensors 125 | model = SSDModel() 126 | 127 | # Load trained model 128 | saver = tf.train.Saver() 129 | print('Restoring previously trained model at %s' % MODEL_SAVE_PATH) 130 | saver.restore(sess, MODEL_SAVE_PATH) 131 | 132 | if mode == 'image': 133 | for image_file in input_files: 134 | print('Running inference on %s' % image_file) 135 | image_orig = np.asarray(Image.open(image_file)) 136 | image = run_inference(image_orig, model, sess, mode, sign_map) 137 | 138 | head, tail = os.path.split(image_file) 139 | plt.imsave('./inference_out/%s' % tail, image) 140 | print('Output saved in inference_out/') 141 | 142 | elif mode == 'video': 143 | for video_file in input_files: 144 | print('Running inference on %s' % video_file) 145 | video = VideoFileClip(video_file) 146 | video = video.fl_image(lambda x: run_inference(x, model, sess, mode, sign_map)) 147 | 148 | head, tail = os.path.split(video_file) 149 | video.write_videofile('./inference_out/%s' % tail, audio=False) 150 | print('Output saved in inference_out/') 151 | 152 | elif mode == 'demo': 153 | print('Demo mode: Running inference on images in sample_images/') 154 | image_files = os.listdir('sample_images/') 155 | 156 | for image_file in image_files: 157 | print('Running inference on sample_images/%s' % image_file) 158 | image_orig = np.asarray(Image.open('sample_images/' + image_file)) 159 | image = run_inference(image_orig, model, sess, mode, sign_map) 160 | plt.imshow(image) 161 | plt.show() 162 | 163 | else: 164 | raise ValueError('Invalid mode: %s' % mode) 165 | 166 | 167 | if __name__ == '__main__': 168 | # Configure command line options 169 | parser = OptionParser() 170 | parser.add_option('-i', '--input_dir', dest='input_dir', 171 | help='Directory of input videos/images (ignored for "demo" mode). 
Will run inference on all videos/images in that dir') 172 | parser.add_option('-m', '--mode', dest='mode', default='image', 173 | help='Operating mode, could be "image", "video", or "demo"; "demo" mode displays annotated images from sample_images/') 174 | 175 | # Get and parse command line options 176 | options, args = parser.parse_args() 177 | 178 | input_dir = options.input_dir 179 | mode = options.mode 180 | 181 | if mode != 'video' and mode != 'image' and mode != 'demo': 182 | raise ValueError('Invalid mode: %s' % mode) 183 | 184 | if mode != 'demo': 185 | input_files = glob.glob(input_dir + '/*.*') 186 | else: 187 | input_files = [] 188 | 189 | generate_output(input_files, mode) 190 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Train the model on dataset 3 | ''' 4 | import tensorflow as tf 5 | from settings import * 6 | from model import SSDModel 7 | from model import ModelHelper 8 | import numpy as np 9 | from sklearn.model_selection import train_test_split 10 | import cv2 11 | import math 12 | import os 13 | import time 14 | import pickle 15 | from PIL import Image 16 | 17 | 18 | def next_batch(X, y_conf, y_loc, batch_size): 19 | """ 20 | Next batch generator 21 | Arguments: 22 | * X: List of image file names 23 | * y_conf: List of ground-truth vectors for class labels 24 | * y_loc: List of ground-truth vectors for localization 25 | * batch_size: Batch size 26 | 27 | Yields: 28 | * images: Batch numpy array representation of batch of images 29 | * y_true_conf: Batch numpy array of ground-truth class labels 30 | * y_true_loc: Batch numpy array of ground-truth localization 31 | * conf_loss_mask: Loss mask for confidence loss, to set NEG_POS_RATIO 32 | """ 33 | start_idx = 0 34 | while True: 35 | image_files = X[start_idx : start_idx + batch_size] 36 | y_true_conf = np.array(y_conf[start_idx : start_idx + batch_size]) 37 | y_true_loc = np.array(y_loc[start_idx : start_idx + batch_size]) 38 | 39 | # Read images from image_files 40 | images = [] 41 | for image_file in image_files: 42 | image = Image.open('resized_images_%sx%s/%s' % (IMG_W, IMG_H, image_file)) 43 | image = np.asarray(image) 44 | images.append(image) 45 | 46 | images = np.array(images, dtype='float32') 47 | 48 | # Grayscale images have array shape (H, W), but we want shape (H, W, 1) 49 | if NUM_CHANNELS == 1: 50 | images = np.expand_dims(images, axis=-1) 51 | 52 | # Normalize pixel values (scale them between -1 and 1) 53 | images = images/127.5 - 1. 54 | 55 | # For y_true_conf, calculate how many negative examples we need to satisfy NEG_POS_RATIO 56 | num_pos = np.where(y_true_conf > 0)[0].shape[0] 57 | num_neg = NEG_POS_RATIO * num_pos 58 | y_true_conf_size = np.sum(y_true_conf.shape) 59 | 60 | # Create confidence loss mask to satisfy NEG_POS_RATIO 61 | if num_pos + num_neg < y_true_conf_size: 62 | conf_loss_mask = np.copy(y_true_conf) 63 | conf_loss_mask[np.where(conf_loss_mask > 0)] = 1. 64 | 65 | # Find all (i,j) tuples where y_true_conf[i][j]==0 66 | zero_indices = np.where(conf_loss_mask == 0.) # ([i1, i2, ...], [j1, j2, ...]) 67 | zero_indices = np.transpose(zero_indices) # [[i1, j1], [i2, j2], ...]
68 | 69 | # Randomly choose num_neg rows from zero_indices, w/o replacement 70 | chosen_zero_indices = zero_indices[np.random.choice(zero_indices.shape[0], int(num_neg), False)] 71 | 72 | # "Enable" chosen negative examples, specified by chosen_zero_indices 73 | for zero_idx in chosen_zero_indices: 74 | i, j = zero_idx 75 | conf_loss_mask[i][j] = 1. 76 | 77 | else: 78 | # If we have so many positive examples such that num_pos+num_neg >= y_true_conf_size, 79 | # no need to prune negative data 80 | conf_loss_mask = np.ones_like(y_true_conf) 81 | 82 | yield (images, y_true_conf, y_true_loc, conf_loss_mask) 83 | 84 | # Update start index for the next batch 85 | start_idx += batch_size 86 | if start_idx >= X.shape[0]: 87 | start_idx = 0 88 | 89 | 90 | def run_training(): 91 | """ 92 | Load training and test data 93 | Run training process 94 | Plot train/validation losses 95 | Report test loss 96 | Save model 97 | """ 98 | # Load training and test data 99 | with open('data_prep_%sx%s.p' % (IMG_W, IMG_H), mode='rb') as f: 100 | train = pickle.load(f) 101 | #with open('test.p', mode='rb') as f: 102 | # test = pickle.load(f) 103 | 104 | # Format the data 105 | X_train = [] 106 | y_train_conf = [] 107 | y_train_loc = [] 108 | for image_file in train.keys(): 109 | X_train.append(image_file) 110 | y_train_conf.append(train[image_file]['y_true_conf']) 111 | y_train_loc.append(train[image_file]['y_true_loc']) 112 | X_train = np.array(X_train) 113 | y_train_conf = np.array(y_train_conf) 114 | y_train_loc = np.array(y_train_loc) 115 | 116 | # Train/validation split 117 | X_train, X_valid, y_train_conf, y_valid_conf, y_train_loc, y_valid_loc = train_test_split(\ 118 | X_train, y_train_conf, y_train_loc, test_size=VALIDATION_SIZE, random_state=1) 119 | 120 | # Launch the graph 121 | with tf.Graph().as_default(), tf.Session() as sess: 122 | # "Instantiate" neural network, get relevant tensors 123 | model = SSDModel() 124 | x = model['x'] 125 | y_true_conf = model['y_true_conf'] 126 | y_true_loc = model['y_true_loc'] 127 | conf_loss_mask = model['conf_loss_mask'] 128 | is_training = model['is_training'] 129 | optimizer = model['optimizer'] 130 | reported_loss = model['loss'] 131 | 132 | # Training process 133 | # TF saver to save/restore trained model 134 | saver = tf.train.Saver() 135 | 136 | if RESUME: 137 | print('Restoring previously trained model at %s' % MODEL_SAVE_PATH) 138 | saver.restore(sess, MODEL_SAVE_PATH) 139 | 140 | # Restore previous loss history 141 | with open('loss_history.p', 'rb') as f: 142 | loss_history = pickle.load(f) 143 | else: 144 | print('Training model from scratch') 145 | # Variable initialization 146 | sess.run(tf.global_variables_initializer()) 147 | 148 | # For book-keeping, keep track of training and validation loss over epochs, like such: 149 | # [(train_acc_epoch1, valid_acc_epoch1), (train_acc_epoch2, valid_acc_epoch2), ...] 
150 | loss_history = [] 151 | 152 | # Record time elapsed for performance check 153 | last_time = time.time() 154 | train_start_time = time.time() 155 | 156 | # Run NUM_EPOCH epochs of training 157 | for epoch in range(NUM_EPOCH): 158 | train_gen = next_batch(X_train, y_train_conf, y_train_loc, BATCH_SIZE) 159 | num_batches_train = math.ceil(X_train.shape[0] / BATCH_SIZE) 160 | losses = [] # list of loss values for book-keeping 161 | 162 | # Run training on each batch 163 | for _ in range(num_batches_train): 164 | # Obtain the training data and labels from generator 165 | images, y_true_conf_gen, y_true_loc_gen, conf_loss_mask_gen = next(train_gen) 166 | 167 | # Perform gradient update (i.e. training step) on current batch 168 | _, loss = sess.run([optimizer, reported_loss], feed_dict={ 169 | #_, loss, loc_loss_dbg, loc_loss_mask, loc_loss = sess.run([optimizer, reported_loss, model['loc_loss_dbg'], model['loc_loss_mask'], model['loc_loss']],feed_dict={ # DEBUG 170 | x: images, 171 | y_true_conf: y_true_conf_gen, 172 | y_true_loc: y_true_loc_gen, 173 | conf_loss_mask: conf_loss_mask_gen, 174 | is_training: True 175 | }) 176 | 177 | losses.append(loss) # TODO: Need mAP metric instead of raw loss 178 | 179 | # A rough estimate of loss for this epoch (overweights the last batch) 180 | train_loss = np.mean(losses) 181 | 182 | # Calculate validation loss at the end of the epoch 183 | valid_gen = next_batch(X_valid, y_valid_conf, y_valid_loc, BATCH_SIZE) 184 | num_batches_valid = math.ceil(X_valid.shape[0] / BATCH_SIZE) 185 | losses = [] 186 | for _ in range(num_batches_valid): 187 | images, y_true_conf_gen, y_true_loc_gen, conf_loss_mask_gen = next(valid_gen) 188 | 189 | # Perform forward pass and calculate loss 190 | loss = sess.run(reported_loss, feed_dict={ 191 | x: images, 192 | y_true_conf: y_true_conf_gen, 193 | y_true_loc: y_true_loc_gen, 194 | conf_loss_mask: conf_loss_mask_gen, 195 | is_training: False 196 | }) 197 | losses.append(loss) 198 | valid_loss = np.mean(losses) 199 | 200 | # Record and report train/validation/test losses for this epoch 201 | loss_history.append((train_loss, valid_loss)) 202 | 203 | # Print accuracy every epoch 204 | print('Epoch %d -- Train loss: %.4f, Validation loss: %.4f, Elapsed time: %.2f sec' %\ 205 | (epoch+1, train_loss, valid_loss, time.time() - last_time)) 206 | last_time = time.time() 207 | 208 | total_time = time.time() - train_start_time 209 | print('Total elapsed time: %d min %d sec' % (total_time/60, total_time%60)) 210 | 211 | test_loss = 0. 
# TODO: Add test set 212 | ''' 213 | # After training is complete, evaluate accuracy on test set 214 | print('Calculating test accuracy...') 215 | test_gen = next_batch(X_test, y_test, BATCH_SIZE) 216 | test_size = X_test.shape[0] 217 | test_acc = calculate_accuracy(test_gen, test_size, BATCH_SIZE, accuracy, x, y, keep_prob, sess) 218 | print('Test acc.: %.4f' % (test_acc,)) 219 | ''' 220 | 221 | if SAVE_MODEL: 222 | # Save model to disk 223 | save_path = saver.save(sess, MODEL_SAVE_PATH) 224 | print('Trained model saved at: %s' % save_path) 225 | 226 | # Also save loss history 227 | print('Loss history saved at loss_history.p') 228 | with open('loss_history.p', 'wb') as f: 229 | pickle.dump(loss_history, f) 230 | 231 | # Return final test loss and loss history 232 | return test_loss, loss_history 233 | 234 | 235 | if __name__ == '__main__': 236 | run_training() 237 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Model definition 3 | ''' 4 | import numpy as np 5 | import tensorflow as tf 6 | import tensorflow.contrib.slim as slim 7 | from settings import * 8 | from data_prep import calc_iou 9 | 10 | 11 | def SSDHook(feature_map, hook_id): 12 | """ 13 | Takes an input feature map, outputs the prediction tensors 14 | hook_id is a unique string ID for the variable_scope 15 | """ 16 | with tf.variable_scope('ssd_hook_' + hook_id): 17 | # Note we have linear activation (i.e. no activation function) 18 | net_conf = slim.conv2d(feature_map, NUM_PRED_CONF, [3, 3], activation_fn=None, scope='conv_conf') 19 | net_conf = tf.contrib.layers.flatten(net_conf) 20 | 21 | net_loc = slim.conv2d(feature_map, NUM_PRED_LOC, [3, 3], activation_fn=None, scope='conv_loc') 22 | net_loc = tf.contrib.layers.flatten(net_loc) 23 | 24 | return net_conf, net_loc 25 | 26 | 27 | def ModelHelper(y_pred_conf, y_pred_loc): 28 | """ 29 | Define loss function, optimizer, and predictions 30 | Loss includes confidence loss and localization loss 31 | 32 | conf_loss_mask is created at batch generation time, to mask the confidence losses 33 | It has 1 at locations w/ positives, and 1 at select negative locations 34 | such that negative-to-positive ratio of NEG_POS_RATIO is satisfied 35 | 36 | Arguments: 37 | * y_pred_conf: Class predictions from model, 38 | a tensor of shape [batch_size, num_feature_map_cells * num_default_boxes * num_classes] 39 | * y_pred_loc: Localization predictions from model, 40 | a tensor of shape [batch_size, num_feature_map_cells * num_default_boxes * 4] 41 | 42 | Returns relevant tensor references 43 | """ 44 | num_total_preds = 0 45 | for fm_size in FM_SIZES: 46 | num_total_preds += fm_size[0] * fm_size[1] * NUM_DEFAULT_BOXES 47 | num_total_preds_conf = num_total_preds * NUM_CLASSES 48 | num_total_preds_loc = num_total_preds * 4 49 | 50 | # Input tensors 51 | y_true_conf = tf.placeholder(tf.int32, [None, num_total_preds], name='y_true_conf') # classification ground-truth labels 52 | y_true_loc = tf.placeholder(tf.float32, [None, num_total_preds_loc], name='y_true_loc') # localization ground-truth labels 53 | conf_loss_mask = tf.placeholder(tf.float32, [None, num_total_preds], name='conf_loss_mask') # 1 mask "bit" per def. box 54 | 55 | # Confidence loss 56 | logits = tf.reshape(y_pred_conf, [-1, num_total_preds, NUM_CLASSES]) 57 | conf_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, y_true_conf) 58 | conf_loss = conf_loss_mask * conf_loss # "zero-out" the loss for don't-care negatives 59 | conf_loss = tf.reduce_sum(conf_loss) 60 | 61 | # Localization loss (smooth L1 loss) 62 | # loc_loss_mask is analogous to conf_loss_mask, except 4 times the size 63 | diff = y_true_loc - y_pred_loc 64 | 65 | loc_loss_l2 = 0.5 * (diff**2.0) 66 | loc_loss_l1 = tf.abs(diff) - 0.5 67 | smooth_l1_condition = tf.less(tf.abs(diff), 1.0) 68 | loc_loss = tf.select(smooth_l1_condition, loc_loss_l2, loc_loss_l1) 69 | 70 | loc_loss_mask = tf.minimum(y_true_conf, 1) # have non-zero localization loss only where we have matching ground-truth box 71 | loc_loss_mask = tf.to_float(loc_loss_mask) 72 | loc_loss_mask = tf.stack([loc_loss_mask] * 4, axis=2) # [0, 1, 1] -> [[[0, 0, 0, 0], [1, 1, 1, 1], [1, 1, 1, 1]], ...] 73 | loc_loss_mask = tf.reshape(loc_loss_mask, [-1, num_total_preds_loc]) # removing the inner-most dimension of above 74 | loc_loss = loc_loss_mask * loc_loss 75 | loc_loss = tf.reduce_sum(loc_loss) 76 | 77 | # Weighted sum of confidence loss and localization loss 78 | # Also add regularization loss 79 | loss = conf_loss + LOC_LOSS_WEIGHT * loc_loss + tf.reduce_sum(slim.losses.get_regularization_losses()) 80 | optimizer = OPT.minimize(loss) 81 | 82 | #reported_loss = loss #tf.reduce_sum(loss, 1) # DEBUG 83 | 84 | # Class probabilities and predictions 85 | probs_all = tf.nn.softmax(logits) 86 | probs, preds_conf = tf.nn.top_k(probs_all) # take top-1 probability, and the index is the predicted class 87 | probs = tf.reshape(probs, [-1, num_total_preds]) 88 | preds_conf = tf.reshape(preds_conf, [-1, num_total_preds]) 89 | 90 | # Return a dictionary of {tensor_name: tensor_reference} 91 | ret_dict = { 92 | 'y_true_conf': y_true_conf, 93 | 'y_true_loc': y_true_loc, 94 | 'conf_loss_mask': conf_loss_mask, 95 | 'optimizer': optimizer, 96 | 'conf_loss': conf_loss, 97 | 'loc_loss': loc_loss, 98 | 'loss': loss, 99 | 'probs': probs, 100 | 'preds_conf': preds_conf, 101 | 'preds_loc': y_pred_loc, 102 | } 103 | return ret_dict 104 | 105 | 106 | def AlexNet(): 107 | """ 108 | AlexNet 109 | """ 110 | # Image batch tensor and dropout keep prob placeholders 111 | x = tf.placeholder(tf.float32, [None, IMG_H, IMG_W, NUM_CHANNELS], name='x') 112 | is_training = tf.placeholder(tf.bool, name='is_training') 113 | 114 | # Classification and localization predictions 115 | preds_conf = [] # conf -> classification b/c confidence loss -> classification loss 116 | preds_loc = [] 117 | 118 | # Use batch normalization for all convolution layers 119 | # FIXME: Not sure why setting is_training is not working well 120 | #with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm, normalizer_params={'is_training': is_training}): 121 | with slim.arg_scope([slim.conv2d], normalizer_fn=slim.batch_norm, normalizer_params={'is_training': True},\ 122 | weights_regularizer=slim.l2_regularizer(scale=REG_SCALE)): 123 | net = slim.conv2d(x, 64, [11, 11], 4, padding='VALID', scope='conv1') 124 | net = slim.max_pool2d(net, [3, 3], 2, scope='pool1') 125 | net = slim.conv2d(net, 192, [5, 5], scope='conv2') 126 | 127 | net_conf, net_loc = SSDHook(net, 'conv2') 128 | preds_conf.append(net_conf) 129 | preds_loc.append(net_loc) 130 | 131 | net = slim.max_pool2d(net, [3, 3], 2, scope='pool2') 132 | net = slim.conv2d(net, 384, [3, 3], scope='conv3') 133 | net = slim.conv2d(net, 384, [3, 3], scope='conv4') 134 | net = slim.conv2d(net, 256, [3, 3], scope='conv5') 135 | 136 | # The following layers added for SSD 137 | net = slim.conv2d(net, 1024, [3, 3], scope='conv6') 138 | net = slim.conv2d(net, 1024, [1, 1], scope='conv7') 139 | 140 | net_conf, net_loc = SSDHook(net, 'conv7') 141 | preds_conf.append(net_conf) 142 | preds_loc.append(net_loc) 143 | 144 | net = slim.conv2d(net, 256, [1, 1], scope='conv8') 145 | net = slim.conv2d(net, 512, [3, 3], 2, scope='conv8_2') 146 | 147 | net_conf, net_loc = SSDHook(net, 'conv8_2') 148 | preds_conf.append(net_conf) 149 | preds_loc.append(net_loc) 150 | 151 | net = slim.conv2d(net, 128, [1, 1], scope='conv9') 152 | net = slim.conv2d(net, 256, [3, 3], 2, scope='conv9_2') 153 | 154 | net_conf, net_loc = SSDHook(net, 'conv9_2') 155 | preds_conf.append(net_conf) 156 | preds_loc.append(net_loc) 157 | 158 | # Concatenate all preds together into 1 vector, for both classification and localization predictions 159 | final_pred_conf = tf.concat(1, preds_conf) 160 | final_pred_loc = tf.concat(1, preds_loc) 161 | 162 | # Return a dictionary of {tensor_name: tensor_reference} 163 | ret_dict = { 164 | 'x': x, 165 | 'y_pred_conf': final_pred_conf, 166 | 'y_pred_loc': final_pred_loc, 167 | 'is_training': is_training, 168 | } 169 | return ret_dict 170 | 171 | 172 | def SSDModel(): 173 | """ 174 | Wrapper around the model and model helper 175 | Returns dict of relevant tensor references 176 | """ 177 | if MODEL == 'AlexNet': 178 | model = AlexNet() 179 | else: 180 | raise NotImplementedError('Model %s not supported' % MODEL) 181 | 182 | model_helper = ModelHelper(model['y_pred_conf'], model['y_pred_loc']) 183 | 184 | ssd_model = {} 185 | for k in model.keys(): 186 | ssd_model[k] = model[k] 187 | for k in model_helper.keys(): 188 | ssd_model[k] = model_helper[k] 189 | 190 | return ssd_model 191 | 192 | 193 | def nms(y_pred_conf, y_pred_loc, prob): 194 | """ 195 | Non-Maximum Suppression (NMS) 196 | Performs NMS on all boxes of each class where predicted probability > CONF_THRESH 197 | For all boxes exceeding IOU threshold, select the box with highest confidence 198 | Returns a list of box coordinates post-NMS 199 | 200 | Arguments: 201 | * y_pred_conf: Class predictions, numpy array of shape (num_feature_map_cells * num_default_boxes,) 202 | * y_pred_loc: Bounding box coordinates, numpy array of shape (num_feature_map_cells * num_default_boxes * 4,) 203 | These coordinates are normalized coordinates relative to center of feature map cell 204 | * prob: Class probabilities, numpy array of shape (num_feature_map_cells * num_default_boxes,) 205 | 206 | Returns: 207 | * boxes: Numpy array of boxes, with shape (num_boxes, 6). Each row is interpreted as: 208 | [x1, y1, x2, y2, class, probability], where x1/y1/x2/y2 are the coordinates of the 209 | upper-left and lower-right corners. Box coordinates assume the image size is IMG_W x IMG_H. 210 | Remember to rescale box coordinates if your target image has different dimensions. 211 | """ 212 | # Keep track of boxes for each class 213 | class_boxes = {} # class -> [(x1, y1, x2, y2, cls, cls_prob), (...), ...]
214 | with open('signnames.csv', 'r') as f: 215 | for line in f: 216 | cls, _ = line.split(',') 217 | class_boxes[float(cls)] = [] 218 | 219 | # Go through all possible boxes and perform class-based greedy NMS (greedy based on class prediction confidence) 220 | y_idx = 0 221 | for fm_size in FM_SIZES: 222 | fm_h, fm_w = fm_size # feature map height and width 223 | for row in range(fm_h): 224 | for col in range(fm_w): 225 | for db in DEFAULT_BOXES: 226 | # Only perform calculations if class confidence > CONF_THRESH and not background class 227 | if prob[y_idx] > CONF_THRESH and y_pred_conf[y_idx] > 0.: 228 | # Calculate absolute coordinates of predicted bounding box 229 | xc, yc = col + 0.5, row + 0.5 # center of current feature map cell 230 | center_coords = np.array([xc, yc, xc, yc]) 231 | abs_box_coords = center_coords + y_pred_loc[y_idx*4 : y_idx*4 + 4] # predictions are offsets to center of fm cell 232 | 233 | # Calculate predicted box coordinates in actual image 234 | scale = np.array([IMG_W/fm_w, IMG_H/fm_h, IMG_W/fm_w, IMG_H/fm_h]) 235 | box_coords = abs_box_coords * scale 236 | box_coords = [int(round(x)) for x in box_coords] 237 | 238 | # Compare this box to all previous boxes of this class 239 | cls = y_pred_conf[y_idx] 240 | cls_prob = prob[y_idx] 241 | box = (*box_coords, cls, cls_prob) 242 | if len(class_boxes[cls]) == 0: 243 | class_boxes[cls].append(box) 244 | else: 245 | suppressed = False # did this box suppress other box(es)? 246 | overlapped = False # did this box overlap with other box(es)? 247 | for other_box in class_boxes[cls]: 248 | iou = calc_iou(box[:4], other_box[:4]) 249 | if iou > NMS_IOU_THRESH: 250 | overlapped = True 251 | # If current box has higher confidence than other box 252 | if box[5] > other_box[5]: 253 | class_boxes[cls].remove(other_box) 254 | suppressed = True 255 | if suppressed or not overlapped: 256 | class_boxes[cls].append(box) 257 | 258 | y_idx += 1 259 | 260 | # Gather all the pruned boxes and return them 261 | boxes = [] 262 | for cls in class_boxes.keys(): 263 | for class_box in class_boxes[cls]: 264 | boxes.append(class_box) 265 | boxes = np.array(boxes) 266 | 267 | return boxes 268 | --------------------------------------------------------------------------------
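Illustrative note (not a file in this repository): the default-box offsets in settings.py are interpreted relative to a feature-map cell, and both find_gt_boxes() in data_prep.py and nms() in model.py rely on this geometry. A minimal standalone sketch of that arithmetic, assuming the first default box and the first AlexNet feature map size (31x48) from settings.py, with an arbitrarily chosen cell:

import numpy as np

IMG_H, IMG_W = 260, 400                                    # values from settings.py
fm_h, fm_w = 31, 48                                        # first entry of FM_SIZES for AlexNet
x1_off, y1_off, x2_off, y2_off = (-0.5, -0.5, 0.5, 0.5)    # first entry of DEFAULT_BOXES
row, col = 10, 20                                          # an arbitrary feature-map cell (assumed for this example)

# Same clipping as find_gt_boxes() in data_prep.py, in feature-map units
abs_db = np.array([max(0, col + x1_off), max(0, row + y1_off),
                   min(fm_w, col + 1 + x2_off), min(fm_h, row + 1 + y2_off)])

# Normalize to [0, 1] -- this is what find_gt_boxes() compares against the ground-truth box
rel_db = abs_db / np.array([fm_w, fm_h, fm_w, fm_h])

# Scale feature-map units back to pixels, using the same per-axis factors as nms()
pixel_db = abs_db * np.array([IMG_W / fm_w, IMG_H / fm_h, IMG_W / fm_w, IMG_H / fm_h])

print(rel_db)    # approx. [0.41, 0.31, 0.45, 0.37]
print(pixel_db)  # approx. [162.5, 79.7, 179.2, 96.5]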