├── requirements.txt ├── test_images ├── testytest.jpg └── testytest_detected.jpg ├── .gitignore ├── config.json ├── LICENSE ├── predict.py ├── train.py ├── gen_anchors.py ├── README.md ├── utils.py ├── backend.py ├── preprocessing.py ├── frontend.py └── Yolo Step-by-Step.ipynb /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-gpu==1.3 2 | keras==2.0.8 3 | imgaug 4 | opencv-python 5 | h5py 6 | -------------------------------------------------------------------------------- /test_images/testytest.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/department-for-transport/dftlab-yolo-vehiclecounting/master/test_images/testytest.jpg -------------------------------------------------------------------------------- /test_images/testytest_detected.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/department-for-transport/dftlab-yolo-vehiclecounting/master/test_images/testytest_detected.jpg -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #ignore weights 2 | *.h5 3 | 4 | #ignore training images 5 | /train_image 6 | 7 | #ignore training labels 8 | /train_labels 9 | 10 | #ignore pycache 11 | /__pycache__ 12 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model" : { 3 | "backend": "Full Yolo", 4 | "input_size": 416, 5 | "anchors": [0.77,1.62, 1.11,1.74, 1.17,0.69, 1.73,1.60, 1.73,0.88], 6 | "max_box_per_image": 10, 7 | "labels": ["car"] 8 | }, 9 | 10 | "train": { 11 | "train_image_folder": "train_image/", 12 | "train_annot_folder": "test_labels/", 13 | 14 | "train_times": 1, 15 | "pretrained_weights": "", 16 | "batch_size": 16, 17 | "learning_rate": 1e-4, 18 | "nb_epochs": 30, 19 | "warmup_epochs": 3, 20 | 21 | "object_scale": 5.0, 22 | "no_object_scale": 1.0, 23 | "coord_scale": 1.0, 24 | "class_scale": 0.0, 25 | 26 | "saved_weights_name": "monday3.h5", 27 | "debug": true 28 | }, 29 | 30 | "valid": { 31 | "valid_image_folder": "", 32 | "valid_annot_folder": "", 33 | 34 | "valid_times": 1 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Ngoc Anh Huynh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | 
3 | import argparse
4 | import os
5 | import cv2
6 | import numpy as np
7 | from tqdm import tqdm
8 | from preprocessing import parse_annotation
9 | from utils import draw_boxes, count_boxes
10 | from frontend import YOLO
11 | import json
12 | 
13 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
14 | os.environ["CUDA_VISIBLE_DEVICES"]="0"
15 | 
16 | argparser = argparse.ArgumentParser(
17 |     description='Run a trained YOLO_v2 model on an image or a video')
18 | 
19 | argparser.add_argument(
20 |     '-c',
21 |     '--conf',
22 |     help='path to configuration file')
23 | 
24 | argparser.add_argument(
25 |     '-w',
26 |     '--weights',
27 |     help='path to pretrained weights')
28 | 
29 | argparser.add_argument(
30 |     '-i',
31 |     '--input',
32 |     help='path to an image or a video (mp4 format)')
33 | 
34 | def _main_(args):
35 |     config_path  = args.conf
36 |     weights_path = args.weights
37 |     image_path   = args.input
38 | 
39 |     with open(config_path) as config_buffer:
40 |         config = json.load(config_buffer)
41 | 
42 |     ###############################
43 |     # Make the model
44 |     ###############################
45 | 
46 |     yolo = YOLO(backend             = config['model']['backend'],
47 |                 input_size          = config['model']['input_size'],
48 |                 labels              = config['model']['labels'],
49 |                 max_box_per_image   = config['model']['max_box_per_image'],
50 |                 anchors             = config['model']['anchors'])
51 | 
52 |     ###############################
53 |     # Load trained weights
54 |     ###############################
55 | 
56 |     yolo.load_weights(weights_path)
57 | 
58 |     ###############################
59 |     # Predict bounding boxes
60 |     ###############################
61 | 
62 |     if image_path[-4:] == '.mp4':
63 |         video_out = image_path[:-4] + '_detected' + image_path[-4:]
64 |         video_reader = cv2.VideoCapture(image_path)
65 | 
66 |         nb_frames = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
67 |         frame_h = int(video_reader.get(cv2.CAP_PROP_FRAME_HEIGHT))
68 |         frame_w = int(video_reader.get(cv2.CAP_PROP_FRAME_WIDTH))
69 | 
70 |         video_writer = cv2.VideoWriter(video_out,
71 |                                cv2.VideoWriter_fourcc(*'MPEG'),
72 |                                50.0,
73 |                                (frame_w, frame_h))
74 |         count = 0
75 |         for i in tqdm(range(nb_frames)):
76 |             _, image = video_reader.read()
77 | 
78 |             boxes = yolo.predict(image)
79 |             if i > 0:
80 |                 image, count = count_boxes(image, boxes, config['model']['labels'], oldboxes, i, count)
81 | 
82 |             video_writer.write(np.uint8(image))
83 | 
84 |             oldboxes = boxes
85 | 
86 |         video_reader.release()
87 |         video_writer.release()
88 |     else:
89 |         image = cv2.imread(image_path)
90 |         boxes = yolo.predict(image)
91 |         image = draw_boxes(image, boxes, config['model']['labels'])
92 | 
93 |         print(len(boxes), 'boxes found')
94 | 
95 |         cv2.imwrite(image_path[:-4] + '_detected' + image_path[-4:], image)
96 | 
97 | if __name__ == '__main__':
98 |     args = argparser.parse_args()
99 |     _main_(args)
100 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | #!
/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | import numpy as np 6 | from preprocessing import parse_annotation 7 | from frontend import YOLO 8 | import json 9 | 10 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 11 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 12 | 13 | argparser = argparse.ArgumentParser( 14 | description='Train and validate YOLO_v2 model on any dataset') 15 | 16 | argparser.add_argument( 17 | '-c', 18 | '--conf', 19 | help='path to configuration file') 20 | 21 | def _main_(args): 22 | config_path = args.conf 23 | 24 | with open(config_path) as config_buffer: 25 | config = json.loads(config_buffer.read()) 26 | 27 | ############################### 28 | # Parse the annotations 29 | ############################### 30 | 31 | # parse annotations of the training set 32 | train_imgs, train_labels = parse_annotation(config['train']['train_annot_folder'], 33 | config['train']['train_image_folder'], 34 | config['model']['labels']) 35 | 36 | # parse annotations of the validation set, if any, otherwise split the training set 37 | if os.path.exists(config['valid']['valid_annot_folder']): 38 | valid_imgs, valid_labels = parse_annotation(config['valid']['valid_annot_folder'], 39 | config['valid']['valid_image_folder'], 40 | config['model']['labels']) 41 | else: 42 | train_valid_split = int(0.8*len(train_imgs)) 43 | np.random.shuffle(train_imgs) 44 | 45 | valid_imgs = train_imgs[train_valid_split:] 46 | train_imgs = train_imgs[:train_valid_split] 47 | 48 | if len(config['model']['labels']) > 0: 49 | overlap_labels = set(config['model']['labels']).intersection(set(train_labels.keys())) 50 | 51 | print('Seen labels:\t', train_labels) 52 | print('Given labels:\t', config['model']['labels']) 53 | print('Overlap labels:\t', overlap_labels) 54 | 55 | if len(overlap_labels) < len(config['model']['labels']): 56 | print('Some labels have no annotations! Please revise the list of labels in the config.json file!') 57 | return 58 | else: 59 | print('No labels are provided. 
Train on all seen labels.') 60 | config['model']['labels'] = train_labels.keys() 61 | 62 | ############################### 63 | # Construct the model 64 | ############################### 65 | 66 | yolo = YOLO(backend = config['model']['backend'], 67 | input_size = config['model']['input_size'], 68 | labels = config['model']['labels'], 69 | max_box_per_image = config['model']['max_box_per_image'], 70 | anchors = config['model']['anchors']) 71 | 72 | ############################### 73 | # Load the pretrained weights (if any) 74 | ############################### 75 | 76 | if os.path.exists(config['train']['pretrained_weights']): 77 | print("Loading pre-trained weights in", config['train']['pretrained_weights']) 78 | yolo.load_weights(config['train']['pretrained_weights']) 79 | 80 | ############################### 81 | # Start the training process 82 | ############################### 83 | 84 | yolo.train(train_imgs = train_imgs, 85 | valid_imgs = valid_imgs, 86 | train_times = config['train']['train_times'], 87 | valid_times = config['valid']['valid_times'], 88 | nb_epochs = config['train']['nb_epochs'], 89 | learning_rate = config['train']['learning_rate'], 90 | batch_size = config['train']['batch_size'], 91 | warmup_epochs = config['train']['warmup_epochs'], 92 | object_scale = config['train']['object_scale'], 93 | no_object_scale = config['train']['no_object_scale'], 94 | coord_scale = config['train']['coord_scale'], 95 | class_scale = config['train']['class_scale'], 96 | saved_weights_name = config['train']['saved_weights_name'], 97 | debug = config['train']['debug']) 98 | 99 | if __name__ == '__main__': 100 | args = argparser.parse_args() 101 | _main_(args) 102 | -------------------------------------------------------------------------------- /gen_anchors.py: -------------------------------------------------------------------------------- 1 | import random 2 | import argparse 3 | import numpy as np 4 | 5 | from preprocessing import parse_annotation 6 | import json 7 | 8 | argparser = argparse.ArgumentParser() 9 | 10 | argparser.add_argument( 11 | '-c', 12 | '--conf', 13 | default='config.json', 14 | help='path to configuration file') 15 | 16 | argparser.add_argument( 17 | '-a', 18 | '--anchors', 19 | default=5, 20 | help='number of anchors to use') 21 | 22 | def IOU(ann, centroids): 23 | w, h = ann 24 | similarities = [] 25 | 26 | for centroid in centroids: 27 | c_w, c_h = centroid 28 | 29 | if c_w >= w and c_h >= h: 30 | similarity = w*h/(c_w*c_h) 31 | elif c_w >= w and c_h <= h: 32 | similarity = w*c_h/(w*h + (c_w-w)*c_h) 33 | elif c_w <= w and c_h >= h: 34 | similarity = c_w*h/(w*h + c_w*(c_h-h)) 35 | else: #means both w,h are bigger than c_w and c_h respectively 36 | similarity = (c_w*c_h)/(w*h) 37 | similarities.append(similarity) # will become (k,) shape 38 | 39 | return np.array(similarities) 40 | 41 | def avg_IOU(anns, centroids): 42 | n,d = anns.shape 43 | sum = 0. 
44 | 45 | for i in range(anns.shape[0]): 46 | sum+= max(IOU(anns[i], centroids)) 47 | 48 | return sum/n 49 | 50 | def print_anchors(centroids): 51 | anchors = centroids.copy() 52 | 53 | widths = anchors[:, 0] 54 | sorted_indices = np.argsort(widths) 55 | 56 | r = "anchors: [" 57 | for i in sorted_indices[:-1]: 58 | r += '%0.2f,%0.2f, ' % (anchors[i,0], anchors[i,1]) 59 | 60 | #there should not be comma after last anchor, that's why 61 | r += '%0.2f,%0.2f' % (anchors[sorted_indices[-1:],0], anchors[sorted_indices[-1:],1]) 62 | r += "]" 63 | 64 | print(r) 65 | 66 | def run_kmeans(ann_dims, anchor_num): 67 | ann_num = ann_dims.shape[0] 68 | iterations = 0 69 | prev_assignments = np.ones(ann_num)*(-1) 70 | iteration = 0 71 | old_distances = np.zeros((ann_num, anchor_num)) 72 | 73 | indices = [random.randrange(ann_dims.shape[0]) for i in range(anchor_num)] 74 | centroids = ann_dims[indices] 75 | anchor_dim = ann_dims.shape[1] 76 | 77 | while True: 78 | distances = [] 79 | iteration += 1 80 | for i in range(ann_num): 81 | d = 1 - IOU(ann_dims[i], centroids) 82 | distances.append(d) 83 | distances = np.array(distances) # distances.shape = (ann_num, anchor_num) 84 | 85 | print("iteration {}: dists = {}".format(iteration, np.sum(np.abs(old_distances-distances)))) 86 | 87 | #assign samples to centroids 88 | assignments = np.argmin(distances,axis=1) 89 | 90 | if (assignments == prev_assignments).all() : 91 | return centroids 92 | 93 | #calculate new centroids 94 | centroid_sums=np.zeros((anchor_num, anchor_dim), np.float) 95 | for i in range(ann_num): 96 | centroid_sums[assignments[i]]+=ann_dims[i] 97 | for j in range(anchor_num): 98 | centroids[j] = centroid_sums[j]/(np.sum(assignments==j) + 1e-6) 99 | 100 | prev_assignments = assignments.copy() 101 | old_distances = distances.copy() 102 | 103 | def main(argv): 104 | config_path = args.conf 105 | num_anchors = args.anchors 106 | 107 | with open(config_path) as config_buffer: 108 | config = json.loads(config_buffer.read()) 109 | 110 | train_imgs, train_labels = parse_annotation(config['train']['train_annot_folder'], 111 | config['train']['train_image_folder'], 112 | config['model']['labels']) 113 | 114 | grid_w = config['model']['input_size']/32 115 | grid_h = config['model']['input_size']/32 116 | 117 | # run k_mean to find the anchors 118 | annotation_dims = [] 119 | for image in train_imgs: 120 | cell_w = image['width']/grid_w 121 | cell_h = image['height']/grid_h 122 | 123 | for obj in image['object']: 124 | relative_w = (float(obj['xmax']) - float(obj['xmin']))/cell_w 125 | relatice_h = (float(obj["ymax"]) - float(obj['ymin']))/cell_h 126 | annotation_dims.append(tuple(map(float, (relative_w,relatice_h)))) 127 | 128 | annotation_dims = np.array(annotation_dims) 129 | centroids = run_kmeans(annotation_dims, num_anchors) 130 | 131 | # write anchors to file 132 | print('\naverage IOU for', num_anchors, 'anchors:', '%0.2f' % avg_IOU(annotation_dims, centroids)) 133 | print_anchors(centroids) 134 | 135 | if __name__ == '__main__': 136 | args = argparser.parse_args() 137 | main(args) 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #DfT Lab - Counting Vehicles from satellite/aerial imagery/video 2 | 3 | 4 | This repo contains the implementation of YOLOv2 in Keras with Tensorflow backend. It supports training YOLOv2 network with various backends such as MobileNet and InceptionV3. 
Thanks to Experiencor for the excellent implementation; the original repo is at https://github.com/experiencor/keras-yolo2
5 | 
6 | Links to our training set and trained weights are below.
7 | 
8 | You can see it working on video at https://list.ly/list/2B7T-a-list-of-everything-the-dft-lab-does-and-has-done
9 | 
10 | and at https://www.youtube.com/watch?v=iOcHr77708E
11 | 
12 | 
13 | ## Usage for Python code
14 | 
15 | ### 0. Requirements
16 | 
17 | Check out requirements.txt
18 | 
19 | WARNING - if you're going to train this, you need a good Nvidia GPU with CUDA and cuDNN installed (https://www.tensorflow.org/install/install_linux). Note that we're using tensorflow-gpu 1.3!
20 | 
21 | It should predict on most machines though!
22 | 
23 | ### 1. Data preparation
24 | Download the VEDAI dataset from https://github.com/nikitalpopov/vedai
25 | 
26 | Organize the dataset into 4 folders:
27 | 
28 | + train_image_folder <= the folder that contains the training images.
29 | 
30 | + train_annot_folder <= the folder that contains the training annotations in VOC format.
31 | 
32 | + valid_image_folder <= the folder that contains the validation images.
33 | 
34 | + valid_annot_folder <= the folder that contains the validation annotations in VOC format.
35 | 
36 | There is a one-to-one correspondence by file name between images and annotations (a minimal example annotation is shown in the appendix at the end of this README). If the validation set is empty, the training set will automatically be split into training and validation sets using a ratio of 0.8.
37 | 
38 | ### 2. Edit the configuration file
39 | The configuration file is a JSON file, which looks like this:
40 | 
41 | ```python
42 | {
43 |     "model" : {
44 |         "backend":              "Full Yolo",    # "Tiny Yolo" or "Full Yolo" or "MobileNet" or "SqueezeNet" or "Inception3"
45 |         "input_size":           416,
46 |         "anchors":              [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828],
47 |         "max_box_per_image":    10,
48 |         "labels":               ["vehicle"]
49 |     },
50 | 
51 |     "train": {
52 |         "train_image_folder":   "/home/andy/data/raccoon_dataset/images/",
53 |         "train_annot_folder":   "/home/andy/data/raccoon_dataset/anns/",
54 | 
55 |         "train_times":          10,     # the number of times to cycle through the training set, useful for small datasets
56 |         "pretrained_weights":   "",     # specify the path of the pretrained weights, but it's fine to start from scratch
57 |         "batch_size":           16,     # the number of images to read in each batch
58 |         "learning_rate":        1e-4,   # the base learning rate of the default Adam rate scheduler
59 |         "nb_epochs":            50,     # number of epochs
60 |         "warmup_epochs":        3,      # the number of initial epochs during which the sizes of the 5 boxes in each cell are forced to match the sizes of the 5 anchors; this trick seems to improve precision empirically
61 | 
62 |         "object_scale":         5.0,    # determines how much to penalize wrong prediction of confidence of object predictors
63 |         "no_object_scale":      1.0,    # determines how much to penalize wrong prediction of confidence of non-object predictors
64 |         "coord_scale":          1.0,    # determines how much to penalize wrong position and size predictions (x, y, w, h)
65 |         "class_scale":          1.0,    # determines how much to penalize wrong class prediction
66 | 
67 |         "debug":                true    # turn on/off the line that prints current confidence, position, size, class losses and recall
68 |     },
69 | 
70 |     "valid": {
71 |         "valid_image_folder":   "",
72 |         "valid_annot_folder":   "",
73 | 
74 |         "valid_times":          1
75 |     }
76 | }
77 | 
78 | ```
79 | 
80 | The model section defines the type of model to construct, as well as other model parameters such as the input image size and the list of anchors. The ```labels``` setting lists the labels to train on; only images containing the listed labels are fed to the network.
81 | 
82 | Download pretrained weights for the backend (Tiny Yolo, Full Yolo, SqueezeNet, MobileNet, and InceptionV3) at:
83 | 
84 | https://1drv.ms/f/s!ApLdDEW3ut5fec2OzK4S4RpT-SU
85 | 
86 | **These weights must be put in the root folder of the repository if you want to train the network. They are the pretrained weights for the backend only and will be loaded during model creation. The code does not work without these weights.**
87 | 
88 | The pretrained weights for the whole model (both frontend and backend) of the vehicle detector can be downloaded from:
89 | 
90 | https://storage.googleapis.com/cudnnfreight/trainedweights.h5
91 | 
92 | ### 3. Generate anchors for your dataset (optional)
93 | 
94 | `python gen_anchors.py -c config.json`
95 | 
96 | Copy the generated anchors printed on the terminal to the ```anchors``` setting in ```config.json```.
97 | 
98 | ### 4. Start the training process
99 | 
100 | `python train.py -c config.json`
101 | 
102 | 
103 | 
104 | By the end of this process, the code will write the weights of the best model to the file best_weights.h5 (or whatever name is specified in the "saved_weights_name" setting in config.json). The training process stops when the loss on the validation set has not improved for 3 consecutive epochs.
105 | 
106 | ### 5. Perform detection using trained weights on an image by running
107 | `python predict.py -c config.json -w /path/to/best_weights.h5 -i /path/to/image/or/video`
108 | 
109 | It carries out detection on the image and writes a copy with the detected bounding boxes to the same folder. If you're feeding it videos, it will endeavour to count the unique vehicles (it does this with a slightly crude collision check; the code is in utils.py). A short programmatic example of running detection from your own Python code is given in the appendix at the end of this README.
110 | 
111 | Note that the model resizes images to 416x416 (you could change this, but you would need to alter the network architecture too), so don't feed it big images in which each vehicle ends up as a little smudge of pixels after resizing - it won't detect these! If it's not making predictions, try tinkering with the level of zoom on each image, or with the threshold values in utils.py.
112 | 
113 | ## Usage for the Jupyter notebook
114 | 
115 | Refer to the notebook (https://github.com/experiencor/basic-yolo-keras/blob/master/Yolo%20Step-by-Step.ipynb) for a complete walk-through implementation of YOLOv2 from scratch (training, testing, and scoring).
116 | 
117 | 
118 | ## Copyright
119 | 
120 | See [LICENSE](LICENSE) for details.
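## Appendix: examples

The annotation parser (`parse_annotation` in preprocessing.py) only needs a handful of VOC fields per image: the file name, the image width and height, and one `<object>` block per vehicle. A minimal annotation might look like the sketch below (the file name, size and coordinates are made-up placeholder values):

```xml
<annotation>
    <filename>00000001.png</filename>
    <size>
        <width>1024</width>
        <height>1024</height>
        <depth>3</depth>
    </size>
    <object>
        <name>vehicle</name>
        <bndbox>
            <xmin>412</xmin>
            <ymin>300</ymin>
            <xmax>440</xmax>
            <ymax>326</ymax>
        </bndbox>
    </object>
</annotation>
```

If you would rather run detection from your own Python code than via predict.py, the sketch below mirrors what predict.py does for a single image; the weights file and image path are placeholders, so substitute your own:

```python
import json
import cv2
from frontend import YOLO
from utils import draw_boxes

with open('config.json') as config_buffer:
    config = json.load(config_buffer)

# build the model from the same settings predict.py uses
yolo = YOLO(backend           = config['model']['backend'],
            input_size        = config['model']['input_size'],
            labels            = config['model']['labels'],
            max_box_per_image = config['model']['max_box_per_image'],
            anchors           = config['model']['anchors'])

yolo.load_weights('trainedweights.h5')            # placeholder: path to your trained weights

image = cv2.imread('test_images/testytest.jpg')   # placeholder: your test image
boxes = yolo.predict(image)
print(len(boxes), 'boxes found')

# draw_boxes returns the image with the detected boxes and labels drawn on it
cv2.imwrite('testytest_detected.jpg', draw_boxes(image, boxes, config['model']['labels']))
```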
121 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import xml.etree.ElementTree as ET 4 | import tensorflow as tf 5 | import copy 6 | import cv2 7 | 8 | class BoundBox: 9 | def __init__(self, xmin, ymin, xmax, ymax, c = None, classes = None): 10 | self.xmin = xmin 11 | self.ymin = ymin 12 | self.xmax = xmax 13 | self.ymax = ymax 14 | 15 | self.c = c 16 | self.classes = classes 17 | 18 | self.label = -1 19 | self.score = -1 20 | 21 | def get_label(self): 22 | if self.label == -1: 23 | self.label = np.argmax(self.classes) 24 | 25 | return self.label 26 | 27 | def get_score(self): 28 | if self.score == -1: 29 | self.score = self.classes[self.get_label()] 30 | 31 | return self.score 32 | 33 | class WeightReader: 34 | def __init__(self, weight_file): 35 | self.offset = 4 36 | self.all_weights = np.fromfile(weight_file, dtype='float32') 37 | 38 | def read_bytes(self, size): 39 | self.offset = self.offset + size 40 | return self.all_weights[self.offset-size:self.offset] 41 | 42 | def reset(self): 43 | self.offset = 4 44 | 45 | def bbox_iou(box1, box2): 46 | intersect_w = _interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax]) 47 | intersect_h = _interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax]) 48 | 49 | intersect = intersect_w * intersect_h 50 | 51 | w1, h1 = box1.xmax-box1.xmin, box1.ymax-box1.ymin 52 | w2, h2 = box2.xmax-box2.xmin, box2.ymax-box2.ymin 53 | 54 | union = w1*h1 + w2*h2 - intersect 55 | 56 | return float(intersect) / union 57 | 58 | def count_boxes(image, boxes, labels, oldboxes, framenumber, count): 59 | image_h, image_w, _ = image.shape 60 | 61 | for box in boxes: 62 | xmin = int(box.xmin*image_w) 63 | ymin = int(box.ymin*image_h) 64 | xmax = int(box.xmax*image_w) 65 | ymax = int(box.ymax*image_h) 66 | 67 | 68 | cv2.rectangle(image, (xmin,ymin), (xmax,ymax), (0,255,0), 3) 69 | 70 | 71 | if framenumber % 5 == 0: 72 | collisions = 0 73 | for oldbox in oldboxes: 74 | 75 | xmin2 = int(oldbox.xmin*image_w) 76 | ymin2 = int(oldbox.ymin*image_h) 77 | xmax2 = int(oldbox.xmax*image_w) 78 | ymax2 = int(oldbox.ymax*image_h) 79 | 80 | if xmax > xmin2 and xmin < xmax2 and ymax > ymin2 and ymin obj_threshold 117 | 118 | for row in range(grid_h): 119 | for col in range(grid_w): 120 | for b in range(nb_box): 121 | # from 4th element onwards are confidence and class classes 122 | classes = netout[row,col,b,5:] 123 | 124 | if np.sum(classes) > 0: 125 | # first 4 elements are x, y, w, and h 126 | x, y, w, h = netout[row,col,b,:4] 127 | 128 | x = (col + _sigmoid(x)) / grid_w # center position, unit: image width 129 | y = (row + _sigmoid(y)) / grid_h # center position, unit: image height 130 | w = anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width 131 | h = anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height 132 | confidence = netout[row,col,b,4] 133 | 134 | box = BoundBox(x-w/2, y-h/2, x+w/2, y+h/2, confidence, classes) 135 | 136 | boxes.append(box) 137 | 138 | # suppress non-maximal boxes 139 | for c in range(nb_class): 140 | sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes]))) 141 | 142 | for i in range(len(sorted_indices)): 143 | index_i = sorted_indices[i] 144 | 145 | if boxes[index_i].classes[c] == 0: 146 | continue 147 | else: 148 | for j in range(i+1, len(sorted_indices)): 149 | index_j = sorted_indices[j] 150 | 151 | if bbox_iou(boxes[index_i], 
boxes[index_j]) >= nms_threshold: 152 | boxes[index_j].classes[c] = 0 153 | 154 | # remove the boxes which are less likely than a obj_threshold 155 | boxes = [box for box in boxes if box.get_score() > obj_threshold] 156 | 157 | return boxes 158 | 159 | def compute_overlap(a, b): 160 | """ 161 | Code originally from https://github.com/rbgirshick/py-faster-rcnn. 162 | Parameters 163 | ---------- 164 | a: (N, 4) ndarray of float 165 | b: (K, 4) ndarray of float 166 | Returns 167 | ------- 168 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 169 | """ 170 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 171 | 172 | iw = np.minimum(np.expand_dims(a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0]) 173 | ih = np.minimum(np.expand_dims(a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1]) 174 | 175 | iw = np.maximum(iw, 0) 176 | ih = np.maximum(ih, 0) 177 | 178 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 179 | 180 | ua = np.maximum(ua, np.finfo(float).eps) 181 | 182 | intersection = iw * ih 183 | 184 | return intersection / ua 185 | 186 | def compute_ap(recall, precision): 187 | """ Compute the average precision, given the recall and precision curves. 188 | Code originally from https://github.com/rbgirshick/py-faster-rcnn. 189 | 190 | # Arguments 191 | recall: The recall curve (list). 192 | precision: The precision curve (list). 193 | # Returns 194 | The average precision as computed in py-faster-rcnn. 195 | """ 196 | # correct AP calculation 197 | # first append sentinel values at the end 198 | mrec = np.concatenate(([0.], recall, [1.])) 199 | mpre = np.concatenate(([0.], precision, [0.])) 200 | 201 | # compute the precision envelope 202 | for i in range(mpre.size - 1, 0, -1): 203 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 204 | 205 | # to calculate area under PR curve, look for points 206 | # where X axis (recall) changes value 207 | i = np.where(mrec[1:] != mrec[:-1])[0] 208 | 209 | # and sum (\Delta recall) * prec 210 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 211 | return ap 212 | 213 | def _interval_overlap(interval_a, interval_b): 214 | x1, x2 = interval_a 215 | x3, x4 = interval_b 216 | 217 | if x3 < x1: 218 | if x4 < x1: 219 | return 0 220 | else: 221 | return min(x2,x4) - x1 222 | else: 223 | if x2 < x3: 224 | return 0 225 | else: 226 | return min(x2,x4) - x3 227 | 228 | def _sigmoid(x): 229 | return 1. / (1. 
+ np.exp(-x)) 230 | 231 | def _softmax(x, axis=-1, t=-100.): 232 | x = x - np.max(x) 233 | 234 | if np.min(x) < t: 235 | x = x/np.min(x)*t 236 | 237 | e_x = np.exp(x) 238 | 239 | return e_x / e_x.sum(axis, keepdims=True) 240 | -------------------------------------------------------------------------------- /backend.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | import tensorflow as tf 3 | from keras.layers import Reshape, Activation, Conv2D, Input, MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda 4 | from keras.layers.advanced_activations import LeakyReLU 5 | from keras.layers.merge import concatenate 6 | from keras.applications.mobilenet import MobileNet 7 | from keras.applications import InceptionV3 8 | from keras.applications.vgg16 import VGG16 9 | from keras.applications.resnet50 import ResNet50 10 | 11 | FULL_YOLO_BACKEND_PATH = "full_yolo_backend.h5" # should be hosted on a server 12 | TINY_YOLO_BACKEND_PATH = "tiny_yolo_backend.h5" # should be hosted on a server 13 | SQUEEZENET_BACKEND_PATH = "squeezenet_backend.h5" # should be hosted on a server 14 | MOBILENET_BACKEND_PATH = "mobilenet_backend.h5" # should be hosted on a server 15 | INCEPTION3_BACKEND_PATH = "inception_backend.h5" # should be hosted on a server 16 | VGG16_BACKEND_PATH = "vgg16_backend.h5" # should be hosted on a server 17 | RESNET50_BACKEND_PATH = "resnet50_backend.h5" # should be hosted on a server 18 | 19 | class BaseFeatureExtractor(object): 20 | """docstring for ClassName""" 21 | 22 | # to be defined in each subclass 23 | def __init__(self, input_size): 24 | raise NotImplementedError("error message") 25 | 26 | # to be defined in each subclass 27 | def normalize(self, image): 28 | raise NotImplementedError("error message") 29 | 30 | def get_output_shape(self): 31 | return self.feature_extractor.get_output_shape_at(-1)[1:3] 32 | 33 | def extract(self, input_image): 34 | return self.feature_extractor(input_image) 35 | 36 | class FullYoloFeature(BaseFeatureExtractor): 37 | """docstring for ClassName""" 38 | def __init__(self, input_size): 39 | input_image = Input(shape=(input_size, input_size, 3)) 40 | 41 | # the function to implement the orgnization layer (thanks to github.com/allanzelener/YAD2K) 42 | def space_to_depth_x2(x): 43 | return tf.space_to_depth(x, block_size=2) 44 | 45 | # Layer 1 46 | x = Conv2D(32, (3,3), strides=(1,1), padding='same', name='conv_1', use_bias=False)(input_image) 47 | x = BatchNormalization(name='norm_1')(x) 48 | x = LeakyReLU(alpha=0.1)(x) 49 | x = MaxPooling2D(pool_size=(2, 2))(x) 50 | 51 | # Layer 2 52 | x = Conv2D(64, (3,3), strides=(1,1), padding='same', name='conv_2', use_bias=False)(x) 53 | x = BatchNormalization(name='norm_2')(x) 54 | x = LeakyReLU(alpha=0.1)(x) 55 | x = MaxPooling2D(pool_size=(2, 2))(x) 56 | 57 | # Layer 3 58 | x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_3', use_bias=False)(x) 59 | x = BatchNormalization(name='norm_3')(x) 60 | x = LeakyReLU(alpha=0.1)(x) 61 | 62 | # Layer 4 63 | x = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_4', use_bias=False)(x) 64 | x = BatchNormalization(name='norm_4')(x) 65 | x = LeakyReLU(alpha=0.1)(x) 66 | 67 | # Layer 5 68 | x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_5', use_bias=False)(x) 69 | x = BatchNormalization(name='norm_5')(x) 70 | x = LeakyReLU(alpha=0.1)(x) 71 | x = MaxPooling2D(pool_size=(2, 2))(x) 72 | 73 | # Layer 6 74 | x = Conv2D(256, (3,3), strides=(1,1), padding='same', 
name='conv_6', use_bias=False)(x) 75 | x = BatchNormalization(name='norm_6')(x) 76 | x = LeakyReLU(alpha=0.1)(x) 77 | 78 | # Layer 7 79 | x = Conv2D(128, (1,1), strides=(1,1), padding='same', name='conv_7', use_bias=False)(x) 80 | x = BatchNormalization(name='norm_7')(x) 81 | x = LeakyReLU(alpha=0.1)(x) 82 | 83 | # Layer 8 84 | x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_8', use_bias=False)(x) 85 | x = BatchNormalization(name='norm_8')(x) 86 | x = LeakyReLU(alpha=0.1)(x) 87 | x = MaxPooling2D(pool_size=(2, 2))(x) 88 | 89 | # Layer 9 90 | x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_9', use_bias=False)(x) 91 | x = BatchNormalization(name='norm_9')(x) 92 | x = LeakyReLU(alpha=0.1)(x) 93 | 94 | # Layer 10 95 | x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_10', use_bias=False)(x) 96 | x = BatchNormalization(name='norm_10')(x) 97 | x = LeakyReLU(alpha=0.1)(x) 98 | 99 | # Layer 11 100 | x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_11', use_bias=False)(x) 101 | x = BatchNormalization(name='norm_11')(x) 102 | x = LeakyReLU(alpha=0.1)(x) 103 | 104 | # Layer 12 105 | x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_12', use_bias=False)(x) 106 | x = BatchNormalization(name='norm_12')(x) 107 | x = LeakyReLU(alpha=0.1)(x) 108 | 109 | # Layer 13 110 | x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_13', use_bias=False)(x) 111 | x = BatchNormalization(name='norm_13')(x) 112 | x = LeakyReLU(alpha=0.1)(x) 113 | 114 | skip_connection = x 115 | 116 | x = MaxPooling2D(pool_size=(2, 2))(x) 117 | 118 | # Layer 14 119 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_14', use_bias=False)(x) 120 | x = BatchNormalization(name='norm_14')(x) 121 | x = LeakyReLU(alpha=0.1)(x) 122 | 123 | # Layer 15 124 | x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_15', use_bias=False)(x) 125 | x = BatchNormalization(name='norm_15')(x) 126 | x = LeakyReLU(alpha=0.1)(x) 127 | 128 | # Layer 16 129 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_16', use_bias=False)(x) 130 | x = BatchNormalization(name='norm_16')(x) 131 | x = LeakyReLU(alpha=0.1)(x) 132 | 133 | # Layer 17 134 | x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_17', use_bias=False)(x) 135 | x = BatchNormalization(name='norm_17')(x) 136 | x = LeakyReLU(alpha=0.1)(x) 137 | 138 | # Layer 18 139 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_18', use_bias=False)(x) 140 | x = BatchNormalization(name='norm_18')(x) 141 | x = LeakyReLU(alpha=0.1)(x) 142 | 143 | # Layer 19 144 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_19', use_bias=False)(x) 145 | x = BatchNormalization(name='norm_19')(x) 146 | x = LeakyReLU(alpha=0.1)(x) 147 | 148 | # Layer 20 149 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_20', use_bias=False)(x) 150 | x = BatchNormalization(name='norm_20')(x) 151 | x = LeakyReLU(alpha=0.1)(x) 152 | 153 | # Layer 21 154 | skip_connection = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_21', use_bias=False)(skip_connection) 155 | skip_connection = BatchNormalization(name='norm_21')(skip_connection) 156 | skip_connection = LeakyReLU(alpha=0.1)(skip_connection) 157 | skip_connection = Lambda(space_to_depth_x2)(skip_connection) 158 | 159 | x = concatenate([skip_connection, x]) 160 | 161 | # Layer 22 162 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_22', use_bias=False)(x) 163 | x = 
BatchNormalization(name='norm_22')(x) 164 | x = LeakyReLU(alpha=0.1)(x) 165 | 166 | self.feature_extractor = Model(input_image, x) 167 | self.feature_extractor.load_weights(FULL_YOLO_BACKEND_PATH) 168 | 169 | def normalize(self, image): 170 | return image / 255. 171 | 172 | class TinyYoloFeature(BaseFeatureExtractor): 173 | """docstring for ClassName""" 174 | def __init__(self, input_size): 175 | input_image = Input(shape=(input_size, input_size, 3)) 176 | 177 | # Layer 1 178 | x = Conv2D(16, (3,3), strides=(1,1), padding='same', name='conv_1', use_bias=False)(input_image) 179 | x = BatchNormalization(name='norm_1')(x) 180 | x = LeakyReLU(alpha=0.1)(x) 181 | x = MaxPooling2D(pool_size=(2, 2))(x) 182 | 183 | # Layer 2 - 5 184 | for i in range(0,4): 185 | x = Conv2D(32*(2**i), (3,3), strides=(1,1), padding='same', name='conv_' + str(i+2), use_bias=False)(x) 186 | x = BatchNormalization(name='norm_' + str(i+2))(x) 187 | x = LeakyReLU(alpha=0.1)(x) 188 | x = MaxPooling2D(pool_size=(2, 2))(x) 189 | 190 | # Layer 6 191 | x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_6', use_bias=False)(x) 192 | x = BatchNormalization(name='norm_6')(x) 193 | x = LeakyReLU(alpha=0.1)(x) 194 | x = MaxPooling2D(pool_size=(2, 2), strides=(1,1), padding='same')(x) 195 | 196 | # Layer 7 - 8 197 | for i in range(0,2): 198 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_' + str(i+7), use_bias=False)(x) 199 | x = BatchNormalization(name='norm_' + str(i+7))(x) 200 | x = LeakyReLU(alpha=0.1)(x) 201 | 202 | self.feature_extractor = Model(input_image, x) 203 | self.feature_extractor.load_weights(TINY_YOLO_BACKEND_PATH) 204 | 205 | def normalize(self, image): 206 | return image / 255. 207 | 208 | class MobileNetFeature(BaseFeatureExtractor): 209 | """docstring for ClassName""" 210 | def __init__(self, input_size): 211 | input_image = Input(shape=(input_size, input_size, 3)) 212 | 213 | mobilenet = MobileNet(input_shape=(224,224,3), include_top=False) 214 | mobilenet.load_weights(MOBILENET_BACKEND_PATH) 215 | 216 | x = mobilenet(input_image) 217 | 218 | self.feature_extractor = Model(input_image, x) 219 | 220 | def normalize(self, image): 221 | image = image / 255. 222 | image = image - 0.5 223 | image = image * 2. 
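        # the three lines above rescale pixel values from [0, 255] to [-1, 1], the input range the pretrained MobileNet expects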
224 | 225 | return image 226 | 227 | class SqueezeNetFeature(BaseFeatureExtractor): 228 | """docstring for ClassName""" 229 | def __init__(self, input_size): 230 | 231 | # define some auxiliary variables and the fire module 232 | sq1x1 = "squeeze1x1" 233 | exp1x1 = "expand1x1" 234 | exp3x3 = "expand3x3" 235 | relu = "relu_" 236 | 237 | def fire_module(x, fire_id, squeeze=16, expand=64): 238 | s_id = 'fire' + str(fire_id) + '/' 239 | 240 | x = Conv2D(squeeze, (1, 1), padding='valid', name=s_id + sq1x1)(x) 241 | x = Activation('relu', name=s_id + relu + sq1x1)(x) 242 | 243 | left = Conv2D(expand, (1, 1), padding='valid', name=s_id + exp1x1)(x) 244 | left = Activation('relu', name=s_id + relu + exp1x1)(left) 245 | 246 | right = Conv2D(expand, (3, 3), padding='same', name=s_id + exp3x3)(x) 247 | right = Activation('relu', name=s_id + relu + exp3x3)(right) 248 | 249 | x = concatenate([left, right], axis=3, name=s_id + 'concat') 250 | 251 | return x 252 | 253 | # define the model of SqueezeNet 254 | input_image = Input(shape=(input_size, input_size, 3)) 255 | 256 | x = Conv2D(64, (3, 3), strides=(2, 2), padding='valid', name='conv1')(input_image) 257 | x = Activation('relu', name='relu_conv1')(x) 258 | x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool1')(x) 259 | 260 | x = fire_module(x, fire_id=2, squeeze=16, expand=64) 261 | x = fire_module(x, fire_id=3, squeeze=16, expand=64) 262 | x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool3')(x) 263 | 264 | x = fire_module(x, fire_id=4, squeeze=32, expand=128) 265 | x = fire_module(x, fire_id=5, squeeze=32, expand=128) 266 | x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool5')(x) 267 | 268 | x = fire_module(x, fire_id=6, squeeze=48, expand=192) 269 | x = fire_module(x, fire_id=7, squeeze=48, expand=192) 270 | x = fire_module(x, fire_id=8, squeeze=64, expand=256) 271 | x = fire_module(x, fire_id=9, squeeze=64, expand=256) 272 | 273 | self.feature_extractor = Model(input_image, x) 274 | self.feature_extractor.load_weights(SQUEEZENET_BACKEND_PATH) 275 | 276 | def normalize(self, image): 277 | image = image[..., ::-1] 278 | image = image.astype('float') 279 | 280 | image[..., 0] -= 103.939 281 | image[..., 1] -= 116.779 282 | image[..., 2] -= 123.68 283 | 284 | return image 285 | 286 | class Inception3Feature(BaseFeatureExtractor): 287 | """docstring for ClassName""" 288 | def __init__(self, input_size): 289 | input_image = Input(shape=(input_size, input_size, 3)) 290 | 291 | inception = InceptionV3(input_shape=(input_size,input_size,3), include_top=False) 292 | inception.load_weights(INCEPTION3_BACKEND_PATH) 293 | 294 | x = inception(input_image) 295 | 296 | self.feature_extractor = Model(input_image, x) 297 | 298 | def normalize(self, image): 299 | image = image / 255. 300 | image = image - 0.5 301 | image = image * 2. 
302 | 303 | return image 304 | 305 | class VGG16Feature(BaseFeatureExtractor): 306 | """docstring for ClassName""" 307 | def __init__(self, input_size): 308 | vgg16 = VGG16(input_shape=(input_size, input_size, 3), include_top=False) 309 | #vgg16.load_weights(VGG16_BACKEND_PATH) 310 | 311 | self.feature_extractor = vgg16 312 | 313 | def normalize(self, image): 314 | image = image[..., ::-1] 315 | image = image.astype('float') 316 | 317 | image[..., 0] -= 103.939 318 | image[..., 1] -= 116.779 319 | image[..., 2] -= 123.68 320 | 321 | return image 322 | 323 | class ResNet50Feature(BaseFeatureExtractor): 324 | """docstring for ClassName""" 325 | def __init__(self, input_size): 326 | resnet50 = ResNet50(input_shape=(input_size, input_size, 3), include_top=False) 327 | resnet50.layers.pop() # remove the average pooling layer 328 | #resnet50.load_weights(RESNET50_BACKEND_PATH) 329 | 330 | self.feature_extractor = Model(resnet50.layers[0].input, resnet50.layers[-1].output) 331 | 332 | def normalize(self, image): 333 | image = image[..., ::-1] 334 | image = image.astype('float') 335 | 336 | image[..., 0] -= 103.939 337 | image[..., 1] -= 116.779 338 | image[..., 2] -= 123.68 339 | 340 | return image 341 | -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import copy 4 | import numpy as np 5 | import imgaug as ia 6 | from imgaug import augmenters as iaa 7 | from keras.utils import Sequence 8 | import xml.etree.ElementTree as ET 9 | from utils import BoundBox, bbox_iou 10 | 11 | def parse_annotation(ann_dir, img_dir, labels=[]): 12 | all_imgs = [] 13 | seen_labels = {} 14 | 15 | for ann in sorted(os.listdir(ann_dir)): 16 | img = {'object':[]} 17 | 18 | tree = ET.parse(ann_dir + ann) 19 | 20 | for elem in tree.iter(): 21 | if 'filename' in elem.tag: 22 | img['filename'] = img_dir + elem.text 23 | if 'width' in elem.tag: 24 | img['width'] = int(elem.text) 25 | if 'height' in elem.tag: 26 | img['height'] = int(elem.text) 27 | if 'object' in elem.tag or 'part' in elem.tag: 28 | obj = {} 29 | 30 | for attr in list(elem): 31 | if 'name' in attr.tag: 32 | obj['name'] = attr.text 33 | 34 | if obj['name'] in seen_labels: 35 | seen_labels[obj['name']] += 1 36 | else: 37 | seen_labels[obj['name']] = 1 38 | 39 | if len(labels) > 0 and obj['name'] not in labels: 40 | break 41 | else: 42 | img['object'] += [obj] 43 | 44 | if 'bndbox' in attr.tag: 45 | for dim in list(attr): 46 | if 'xmin' in dim.tag: 47 | obj['xmin'] = int(round(float(dim.text))) 48 | if 'ymin' in dim.tag: 49 | obj['ymin'] = int(round(float(dim.text))) 50 | if 'xmax' in dim.tag: 51 | obj['xmax'] = int(round(float(dim.text))) 52 | if 'ymax' in dim.tag: 53 | obj['ymax'] = int(round(float(dim.text))) 54 | 55 | if len(img['object']) > 0: 56 | all_imgs += [img] 57 | 58 | return all_imgs, seen_labels 59 | 60 | class BatchGenerator(Sequence): 61 | def __init__(self, images, 62 | config, 63 | shuffle=True, 64 | jitter=True, 65 | norm=None): 66 | self.generator = None 67 | 68 | self.images = images 69 | self.config = config 70 | 71 | self.shuffle = shuffle 72 | self.jitter = jitter 73 | self.norm = norm 74 | 75 | self.anchors = [BoundBox(0, 0, config['ANCHORS'][2*i], config['ANCHORS'][2*i+1]) for i in range(int(len(config['ANCHORS'])//2))] 76 | 77 | ### augmentors by https://github.com/aleju/imgaug 78 | sometimes = lambda aug: iaa.Sometimes(0.5, aug) 79 | 80 | # Define our sequence of 
augmentation steps that will be applied to every image 81 | # All augmenters with per_channel=0.5 will sample one value _per image_ 82 | # in 50% of all cases. In all other cases they will sample new values 83 | # _per channel_. 84 | self.aug_pipe = iaa.Sequential( 85 | [ 86 | # apply the following augmenters to most images 87 | #iaa.Fliplr(0.5), # horizontally flip 50% of all images 88 | #iaa.Flipud(0.2), # vertically flip 20% of all images 89 | #sometimes(iaa.Crop(percent=(0, 0.1))), # crop images by 0-10% of their height/width 90 | sometimes(iaa.Affine( 91 | #scale={"x": (0.8, 1.2), "y": (0.8, 1.2)}, # scale images to 80-120% of their size, individually per axis 92 | #translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)}, # translate by -20 to +20 percent (per axis) 93 | #rotate=(-5, 5), # rotate by -45 to +45 degrees 94 | #shear=(-5, 5), # shear by -16 to +16 degrees 95 | #order=[0, 1], # use nearest neighbour or bilinear interpolation (fast) 96 | #cval=(0, 255), # if mode is constant, use a cval between 0 and 255 97 | #mode=ia.ALL # use any of scikit-image's warping modes (see 2nd image from the top for examples) 98 | )), 99 | # execute 0 to 5 of the following (less important) augmenters per image 100 | # don't execute all of them, as that would often be way too strong 101 | iaa.SomeOf((0, 5), 102 | [ 103 | #sometimes(iaa.Superpixels(p_replace=(0, 1.0), n_segments=(20, 200))), # convert images into their superpixel representation 104 | iaa.OneOf([ 105 | iaa.GaussianBlur((0, 3.0)), # blur images with a sigma between 0 and 3.0 106 | iaa.AverageBlur(k=(2, 7)), # blur image using local means with kernel sizes between 2 and 7 107 | iaa.MedianBlur(k=(3, 11)), # blur image using local medians with kernel sizes between 2 and 7 108 | ]), 109 | iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)), # sharpen images 110 | #iaa.Emboss(alpha=(0, 1.0), strength=(0, 2.0)), # emboss images 111 | # search either for all edges or for directed edges 112 | #sometimes(iaa.OneOf([ 113 | # iaa.EdgeDetect(alpha=(0, 0.7)), 114 | # iaa.DirectedEdgeDetect(alpha=(0, 0.7), direction=(0.0, 1.0)), 115 | #])), 116 | iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5), # add gaussian noise to images 117 | iaa.OneOf([ 118 | iaa.Dropout((0.01, 0.1), per_channel=0.5), # randomly remove up to 10% of the pixels 119 | #iaa.CoarseDropout((0.03, 0.15), size_percent=(0.02, 0.05), per_channel=0.2), 120 | ]), 121 | #iaa.Invert(0.05, per_channel=True), # invert color channels 122 | iaa.Add((-10, 10), per_channel=0.5), # change brightness of images (by -10 to 10 of original value) 123 | iaa.Multiply((0.5, 1.5), per_channel=0.5), # change brightness of images (50-150% of original value) 124 | iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5), # improve or worsen the contrast 125 | #iaa.Grayscale(alpha=(0.0, 1.0)), 126 | #sometimes(iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25)), # move pixels locally around (with random strengths) 127 | #sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05))) # sometimes move parts of the image around 128 | ], 129 | random_order=True 130 | ) 131 | ], 132 | random_order=True 133 | ) 134 | 135 | if shuffle: np.random.shuffle(self.images) 136 | 137 | def __len__(self): 138 | return int(np.ceil(float(len(self.images))/self.config['BATCH_SIZE'])) 139 | 140 | def num_classes(self): 141 | return len(self.config['LABELS']) 142 | 143 | def size(self): 144 | return len(self.images) 145 | 146 | def load_annotation(self, i): 147 | annots = [] 148 | 149 | for obj in 
self.images[i]['object']: 150 | annot = [obj['xmin'], obj['ymin'], obj['xmax'], obj['ymax'], self.config['LABELS'].index(obj['name'])] 151 | annots += [annot] 152 | 153 | if len(annots) == 0: annots = [[]] 154 | 155 | return np.array(annots) 156 | 157 | def load_image(self, i): 158 | return cv2.imread(self.images[i]['filename']) 159 | 160 | def __getitem__(self, idx): 161 | l_bound = idx*self.config['BATCH_SIZE'] 162 | r_bound = (idx+1)*self.config['BATCH_SIZE'] 163 | 164 | if r_bound > len(self.images): 165 | r_bound = len(self.images) 166 | l_bound = r_bound - self.config['BATCH_SIZE'] 167 | 168 | instance_count = 0 169 | 170 | x_batch = np.zeros((r_bound - l_bound, self.config['IMAGE_H'], self.config['IMAGE_W'], 3)) # input images 171 | b_batch = np.zeros((r_bound - l_bound, 1 , 1 , 1 , self.config['TRUE_BOX_BUFFER'], 4)) # list of self.config['TRUE_self.config['BOX']_BUFFER'] GT boxes 172 | y_batch = np.zeros((r_bound - l_bound, self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 4+1+len(self.config['LABELS']))) # desired network output 173 | 174 | for train_instance in self.images[l_bound:r_bound]: 175 | # augment input image and fix object's position and size 176 | img, all_objs = self.aug_image(train_instance, jitter=self.jitter) 177 | 178 | # construct output from object's x, y, w, h 179 | true_box_index = 0 180 | 181 | for obj in all_objs: 182 | if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in self.config['LABELS']: 183 | center_x = .5*(obj['xmin'] + obj['xmax']) 184 | center_x = center_x / (float(self.config['IMAGE_W']) / self.config['GRID_W']) 185 | center_y = .5*(obj['ymin'] + obj['ymax']) 186 | center_y = center_y / (float(self.config['IMAGE_H']) / self.config['GRID_H']) 187 | 188 | grid_x = int(np.floor(center_x)) 189 | grid_y = int(np.floor(center_y)) 190 | 191 | if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']: 192 | obj_indx = self.config['LABELS'].index(obj['name']) 193 | 194 | center_w = (obj['xmax'] - obj['xmin']) / (float(self.config['IMAGE_W']) / self.config['GRID_W']) # unit: grid cell 195 | center_h = (obj['ymax'] - obj['ymin']) / (float(self.config['IMAGE_H']) / self.config['GRID_H']) # unit: grid cell 196 | 197 | box = [center_x, center_y, center_w, center_h] 198 | 199 | # find the anchor that best predicts this box 200 | best_anchor = -1 201 | max_iou = -1 202 | 203 | shifted_box = BoundBox(0, 204 | 0, 205 | center_w, 206 | center_h) 207 | 208 | for i in range(len(self.anchors)): 209 | anchor = self.anchors[i] 210 | iou = bbox_iou(shifted_box, anchor) 211 | 212 | if max_iou < iou: 213 | best_anchor = i 214 | max_iou = iou 215 | 216 | # assign ground truth x, y, w, h, confidence and class probs to y_batch 217 | y_batch[instance_count, grid_y, grid_x, best_anchor, 0:4] = box 218 | y_batch[instance_count, grid_y, grid_x, best_anchor, 4 ] = 1. 
219 | y_batch[instance_count, grid_y, grid_x, best_anchor, 5+obj_indx] = 1 220 | 221 | # assign the true box to b_batch 222 | b_batch[instance_count, 0, 0, 0, true_box_index] = box 223 | 224 | true_box_index += 1 225 | true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER'] 226 | 227 | # assign input image to x_batch 228 | if self.norm != None: 229 | x_batch[instance_count] = self.norm(img) 230 | else: 231 | # plot image and bounding boxes for sanity check 232 | for obj in all_objs: 233 | if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']: 234 | cv2.rectangle(img[:,:,::-1], (obj['xmin'],obj['ymin']), (obj['xmax'],obj['ymax']), (255,0,0), 3) 235 | cv2.putText(img[:,:,::-1], obj['name'], 236 | (obj['xmin']+2, obj['ymin']+12), 237 | 0, 1.2e-3 * img.shape[0], 238 | (0,255,0), 2) 239 | 240 | x_batch[instance_count] = img 241 | 242 | # increase instance counter in current batch 243 | instance_count += 1 244 | 245 | #print(' new batch created', idx) 246 | 247 | return [x_batch, b_batch], y_batch 248 | 249 | def on_epoch_end(self): 250 | if self.shuffle: np.random.shuffle(self.images) 251 | 252 | def aug_image(self, train_instance, jitter): 253 | image_name = train_instance['filename'] 254 | image = cv2.imread(image_name) 255 | 256 | if image is None: print('Cannot find ', image_name) 257 | 258 | h, w, c = image.shape 259 | all_objs = copy.deepcopy(train_instance['object']) 260 | 261 | if jitter: 262 | ### scale the image 263 | scale = np.random.uniform() / 10. + 1. 264 | image = cv2.resize(image, (0,0), fx = scale, fy = scale) 265 | 266 | ### translate the image 267 | max_offx = (scale-1.) * w 268 | max_offy = (scale-1.) * h 269 | offx = int(np.random.uniform() * max_offx) 270 | offy = int(np.random.uniform() * max_offy) 271 | 272 | image = image[offy : (offy + h), offx : (offx + w)] 273 | 274 | ### flip the image 275 | flip = np.random.binomial(1, .5) 276 | if flip > 0.5: image = cv2.flip(image, 1) 277 | 278 | image = self.aug_pipe.augment_image(image) 279 | 280 | # resize the image to standard size 281 | image = cv2.resize(image, (self.config['IMAGE_H'], self.config['IMAGE_W'])) 282 | image = image[:,:,::-1] 283 | 284 | # fix object's position and size 285 | for obj in all_objs: 286 | for attr in ['xmin', 'xmax']: 287 | if jitter: obj[attr] = int(obj[attr] * scale - offx) 288 | 289 | obj[attr] = int(obj[attr] * float(self.config['IMAGE_W']) / w) 290 | obj[attr] = max(min(obj[attr], self.config['IMAGE_W']), 0) 291 | 292 | for attr in ['ymin', 'ymax']: 293 | if jitter: obj[attr] = int(obj[attr] * scale - offy) 294 | 295 | obj[attr] = int(obj[attr] * float(self.config['IMAGE_H']) / h) 296 | obj[attr] = max(min(obj[attr], self.config['IMAGE_H']), 0) 297 | 298 | if jitter and flip > 0.5: 299 | xmin = obj['xmin'] 300 | obj['xmin'] = self.config['IMAGE_W'] - obj['xmax'] 301 | obj['xmax'] = self.config['IMAGE_W'] - xmin 302 | 303 | return image, all_objs 304 | -------------------------------------------------------------------------------- /frontend.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | from keras.layers import Reshape, Activation, Conv2D, Input, MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda 3 | from keras.layers.advanced_activations import LeakyReLU 4 | import tensorflow as tf 5 | import numpy as np 6 | import os 7 | import cv2 8 | from utils import decode_netout, compute_overlap, compute_ap 9 | from keras.applications.mobilenet import MobileNet 10 | from keras.layers.merge import 
concatenate 11 | from keras.optimizers import SGD, Adam, RMSprop 12 | from preprocessing import BatchGenerator 13 | from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard 14 | from backend import TinyYoloFeature, FullYoloFeature, MobileNetFeature, SqueezeNetFeature, Inception3Feature, VGG16Feature, ResNet50Feature 15 | 16 | class YOLO(object): 17 | def __init__(self, backend, 18 | input_size, 19 | labels, 20 | max_box_per_image, 21 | anchors): 22 | 23 | self.input_size = input_size 24 | 25 | self.labels = list(labels) 26 | self.nb_class = len(self.labels) 27 | self.nb_box = len(anchors)//2 28 | self.class_wt = np.ones(self.nb_class, dtype='float32') 29 | self.anchors = anchors 30 | 31 | self.max_box_per_image = max_box_per_image 32 | 33 | ########################## 34 | # Make the model 35 | ########################## 36 | 37 | # make the feature extractor layers 38 | input_image = Input(shape=(self.input_size, self.input_size, 3)) 39 | self.true_boxes = Input(shape=(1, 1, 1, max_box_per_image , 4)) 40 | 41 | if backend == 'Inception3': 42 | self.feature_extractor = Inception3Feature(self.input_size) 43 | elif backend == 'SqueezeNet': 44 | self.feature_extractor = SqueezeNetFeature(self.input_size) 45 | elif backend == 'MobileNet': 46 | self.feature_extractor = MobileNetFeature(self.input_size) 47 | elif backend == 'Full Yolo': 48 | self.feature_extractor = FullYoloFeature(self.input_size) 49 | elif backend == 'Tiny Yolo': 50 | self.feature_extractor = TinyYoloFeature(self.input_size) 51 | elif backend == 'VGG16': 52 | self.feature_extractor = VGG16Feature(self.input_size) 53 | elif backend == 'ResNet50': 54 | self.feature_extractor = ResNet50Feature(self.input_size) 55 | else: 56 | raise Exception('Architecture not supported! Only support Full Yolo, Tiny Yolo, MobileNet, SqueezeNet, VGG16, ResNet50, and Inception3 at the moment!') 57 | 58 | print(self.feature_extractor.get_output_shape()) 59 | self.grid_h, self.grid_w = self.feature_extractor.get_output_shape() 60 | features = self.feature_extractor.extract(input_image) 61 | 62 | # make the object detection layer 63 | output = Conv2D(self.nb_box * (4 + 1 + self.nb_class), 64 | (1,1), strides=(1,1), 65 | padding='same', 66 | name='DetectionLayer', 67 | kernel_initializer='lecun_normal')(features) 68 | output = Reshape((self.grid_h, self.grid_w, self.nb_box, 4 + 1 + self.nb_class))(output) 69 | output = Lambda(lambda args: args[0])([output, self.true_boxes]) 70 | 71 | self.model = Model([input_image, self.true_boxes], output) 72 | 73 | 74 | # initialize the weights of the detection layer 75 | layer = self.model.layers[-4] 76 | weights = layer.get_weights() 77 | 78 | new_kernel = np.random.normal(size=weights[0].shape)/(self.grid_h*self.grid_w) 79 | new_bias = np.random.normal(size=weights[1].shape)/(self.grid_h*self.grid_w) 80 | 81 | layer.set_weights([new_kernel, new_bias]) 82 | 83 | # print a summary of the whole model 84 | self.model.summary() 85 | 86 | def custom_loss(self, y_true, y_pred): 87 | mask_shape = tf.shape(y_true)[:4] 88 | 89 | cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(self.grid_w), [self.grid_h]), (1, self.grid_h, self.grid_w, 1, 1))) 90 | cell_y = tf.transpose(cell_x, (0,2,1,3,4)) 91 | 92 | cell_grid = tf.tile(tf.concat([cell_x,cell_y], -1), [self.batch_size, 1, 1, self.nb_box, 1]) 93 | 94 | coord_mask = tf.zeros(mask_shape) 95 | conf_mask = tf.zeros(mask_shape) 96 | class_mask = tf.zeros(mask_shape) 97 | 98 | seen = tf.Variable(0.) 99 | total_recall = tf.Variable(0.) 
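        # note: y_true and y_pred share the layout (batch, grid_h, grid_w, nb_box, 4 + 1 + nb_class),
        # i.e. x, y, w, h, objectness confidence and then the class scores, as built by BatchGenerator
        # and by the Reshape layer at the end of the model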
100 | 101 | """ 102 | Adjust prediction 103 | """ 104 | ### adjust x and y 105 | pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid 106 | 107 | ### adjust w and h 108 | pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(self.anchors, [1,1,1,self.nb_box,2]) 109 | 110 | ### adjust confidence 111 | pred_box_conf = tf.sigmoid(y_pred[..., 4]) 112 | 113 | ### adjust class probabilities 114 | pred_box_class = y_pred[..., 5:] 115 | 116 | """ 117 | Adjust ground truth 118 | """ 119 | ### adjust x and y 120 | true_box_xy = y_true[..., 0:2] # relative position to the containing cell 121 | 122 | ### adjust w and h 123 | true_box_wh = y_true[..., 2:4] # number of cells accross, horizontally and vertically 124 | 125 | ### adjust confidence 126 | true_wh_half = true_box_wh / 2. 127 | true_mins = true_box_xy - true_wh_half 128 | true_maxes = true_box_xy + true_wh_half 129 | 130 | pred_wh_half = pred_box_wh / 2. 131 | pred_mins = pred_box_xy - pred_wh_half 132 | pred_maxes = pred_box_xy + pred_wh_half 133 | 134 | intersect_mins = tf.maximum(pred_mins, true_mins) 135 | intersect_maxes = tf.minimum(pred_maxes, true_maxes) 136 | intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.) 137 | intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1] 138 | 139 | true_areas = true_box_wh[..., 0] * true_box_wh[..., 1] 140 | pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1] 141 | 142 | union_areas = pred_areas + true_areas - intersect_areas 143 | iou_scores = tf.truediv(intersect_areas, union_areas) 144 | 145 | true_box_conf = iou_scores * y_true[..., 4] 146 | 147 | ### adjust class probabilities 148 | true_box_class = tf.argmax(y_true[..., 5:], -1) 149 | 150 | """ 151 | Determine the masks 152 | """ 153 | ### coordinate mask: simply the position of the ground truth boxes (the predictors) 154 | coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * self.coord_scale 155 | 156 | ### confidence mask: penelize predictors + penalize boxes with low IOU 157 | # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6 158 | true_xy = self.true_boxes[..., 0:2] 159 | true_wh = self.true_boxes[..., 2:4] 160 | 161 | true_wh_half = true_wh / 2. 162 | true_mins = true_xy - true_wh_half 163 | true_maxes = true_xy + true_wh_half 164 | 165 | pred_xy = tf.expand_dims(pred_box_xy, 4) 166 | pred_wh = tf.expand_dims(pred_box_wh, 4) 167 | 168 | pred_wh_half = pred_wh / 2. 169 | pred_mins = pred_xy - pred_wh_half 170 | pred_maxes = pred_xy + pred_wh_half 171 | 172 | intersect_mins = tf.maximum(pred_mins, true_mins) 173 | intersect_maxes = tf.minimum(pred_maxes, true_maxes) 174 | intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.) 
175 | intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1] 176 | 177 | true_areas = true_wh[..., 0] * true_wh[..., 1] 178 | pred_areas = pred_wh[..., 0] * pred_wh[..., 1] 179 | 180 | union_areas = pred_areas + true_areas - intersect_areas 181 | iou_scores = tf.truediv(intersect_areas, union_areas) 182 | 183 | best_ious = tf.reduce_max(iou_scores, axis=4) 184 | conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * self.no_object_scale 185 | 186 | # penalize the confidence of the boxes, which are reponsible for corresponding ground truth box 187 | conf_mask = conf_mask + y_true[..., 4] * self.object_scale 188 | 189 | ### class mask: simply the position of the ground truth boxes (the predictors) 190 | class_mask = y_true[..., 4] * tf.gather(self.class_wt, true_box_class) * self.class_scale 191 | 192 | """ 193 | Warm-up training 194 | """ 195 | no_boxes_mask = tf.to_float(coord_mask < self.coord_scale/2.) 196 | seen = tf.assign_add(seen, 1.) 197 | 198 | true_box_xy, true_box_wh, coord_mask = tf.cond(tf.less(seen, self.warmup_batches+1), 199 | lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask, 200 | true_box_wh + tf.ones_like(true_box_wh) * \ 201 | np.reshape(self.anchors, [1,1,1,self.nb_box,2]) * \ 202 | no_boxes_mask, 203 | tf.ones_like(coord_mask)], 204 | lambda: [true_box_xy, 205 | true_box_wh, 206 | coord_mask]) 207 | 208 | """ 209 | Finalize the loss 210 | """ 211 | nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0)) 212 | nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0)) 213 | nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0)) 214 | 215 | loss_xy = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2. 216 | loss_wh = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2. 217 | loss_conf = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2. 
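        # Editorial note (added): the class term below scores the raw logits with
        # softmax cross-entropy rather than the squared-error class term written
        # in the loss formula of the Yolo Step-by-Step notebook; class_mask still
        # restricts it to cells that actually contain an object.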
218 | loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class) 219 | loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6) 220 | 221 | loss = tf.cond(tf.less(seen, self.warmup_batches+1), 222 | lambda: loss_xy + loss_wh + loss_conf + loss_class + 10, 223 | lambda: loss_xy + loss_wh + loss_conf + loss_class) 224 | 225 | if self.debug: 226 | nb_true_box = tf.reduce_sum(y_true[..., 4]) 227 | nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > 0.3)) 228 | 229 | current_recall = nb_pred_box/(nb_true_box + 1e-6) 230 | total_recall = tf.assign_add(total_recall, current_recall) 231 | 232 | loss = tf.Print(loss, [loss_xy], message='Loss XY \t', summarize=1000) 233 | loss = tf.Print(loss, [loss_wh], message='Loss WH \t', summarize=1000) 234 | loss = tf.Print(loss, [loss_conf], message='Loss Conf \t', summarize=1000) 235 | loss = tf.Print(loss, [loss_class], message='Loss Class \t', summarize=1000) 236 | loss = tf.Print(loss, [loss], message='Total Loss \t', summarize=1000) 237 | loss = tf.Print(loss, [current_recall], message='Current Recall \t', summarize=1000) 238 | loss = tf.Print(loss, [total_recall/seen], message='Average Recall \t', summarize=1000) 239 | 240 | return loss 241 | 242 | def load_weights(self, weight_path): 243 | self.model.load_weights(weight_path) 244 | 245 | def train(self, train_imgs, # the list of images to train the model 246 | valid_imgs, # the list of images used to validate the model 247 | train_times, # the number of time to repeat the training set, often used for small datasets 248 | valid_times, # the number of times to repeat the validation set, often used for small datasets 249 | nb_epochs, # number of epoches 250 | learning_rate, # the learning rate 251 | batch_size, # the size of the batch 252 | warmup_epochs, # number of initial batches to let the model familiarize with the new dataset 253 | object_scale, 254 | no_object_scale, 255 | coord_scale, 256 | class_scale, 257 | saved_weights_name='best_weights.h5', 258 | debug=False): 259 | 260 | self.batch_size = batch_size 261 | 262 | self.object_scale = object_scale 263 | self.no_object_scale = no_object_scale 264 | self.coord_scale = coord_scale 265 | self.class_scale = class_scale 266 | 267 | self.debug = debug 268 | 269 | ############################################ 270 | # Make train and validation generators 271 | ############################################ 272 | 273 | generator_config = { 274 | 'IMAGE_H' : self.input_size, 275 | 'IMAGE_W' : self.input_size, 276 | 'GRID_H' : self.grid_h, 277 | 'GRID_W' : self.grid_w, 278 | 'BOX' : self.nb_box, 279 | 'LABELS' : self.labels, 280 | 'CLASS' : len(self.labels), 281 | 'ANCHORS' : self.anchors, 282 | 'BATCH_SIZE' : self.batch_size, 283 | 'TRUE_BOX_BUFFER' : self.max_box_per_image, 284 | } 285 | 286 | train_generator = BatchGenerator(train_imgs, 287 | generator_config, 288 | norm=self.feature_extractor.normalize) 289 | valid_generator = BatchGenerator(valid_imgs, 290 | generator_config, 291 | norm=self.feature_extractor.normalize, 292 | jitter=False) 293 | 294 | self.warmup_batches = warmup_epochs * (train_times*len(train_generator) + valid_times*len(valid_generator)) 295 | 296 | ############################################ 297 | # Compile the model 298 | ############################################ 299 | 300 | optimizer = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0) 301 | self.model.compile(loss=self.custom_loss, 
optimizer=optimizer) 302 | 303 | ############################################ 304 | # Make a few callbacks 305 | ############################################ 306 | 307 | early_stop = EarlyStopping(monitor='val_loss', 308 | min_delta=0.001, 309 | patience=3, 310 | mode='min', 311 | verbose=1) 312 | checkpoint = ModelCheckpoint(saved_weights_name, 313 | monitor='val_loss', 314 | verbose=1, 315 | save_best_only=True, 316 | mode='min', 317 | period=1) 318 | tensorboard = TensorBoard(log_dir=os.path.expanduser('~/logs/'), 319 | histogram_freq=0, 320 | #write_batch_performance=True, 321 | write_graph=True, 322 | write_images=False) 323 | 324 | ############################################ 325 | # Start the training process 326 | ############################################ 327 | 328 | self.model.fit_generator(generator = train_generator, 329 | steps_per_epoch = len(train_generator) * train_times, 330 | epochs = warmup_epochs + nb_epochs, 331 | verbose = 2 if debug else 1, 332 | validation_data = valid_generator, 333 | validation_steps = len(valid_generator) * valid_times, 334 | callbacks = [early_stop, checkpoint, tensorboard], 335 | workers = 3, 336 | max_queue_size = 8) 337 | 338 | ############################################ 339 | # Compute mAP on the validation set 340 | ############################################ 341 | average_precisions = self.evaluate(valid_generator) 342 | 343 | # print evaluation 344 | for label, average_precision in average_precisions.items(): 345 | print(self.labels[label], '{:.4f}'.format(average_precision)) 346 | print('mAP: {:.4f}'.format(sum(average_precisions.values()) / len(average_precisions))) 347 | 348 | def evaluate(self, 349 | generator, 350 | iou_threshold=0.3, 351 | score_threshold=0.3, 352 | max_detections=100, 353 | save_path=None): 354 | """ Evaluate a given dataset using a given model. 355 | code originally from https://github.com/fizyr/keras-retinanet 356 | 357 | # Arguments 358 | generator : The generator that represents the dataset to evaluate. 359 | model : The model to evaluate. 360 | iou_threshold : The threshold used to consider when a detection is positive or negative. 361 | score_threshold : The score confidence threshold to use for detections. 362 | max_detections : The maximum number of detections to use per image. 363 | save_path : The path to save images with visualized detections to. 364 | # Returns 365 | A dict mapping class names to mAP scores. 
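            Note (added): in this implementation the returned dict is keyed by
            integer label index rather than by class name; use self.labels[label]
            to recover the name, as train() does when printing the evaluation.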
366 | """ 367 | # gather all detections and annotations 368 | all_detections = [[None for i in range(generator.num_classes())] for j in range(generator.size())] 369 | all_annotations = [[None for i in range(generator.num_classes())] for j in range(generator.size())] 370 | 371 | for i in range(generator.size()): 372 | raw_image = generator.load_image(i) 373 | 374 | # make the boxes and the labels 375 | pred_boxes = self.predict(raw_image) 376 | 377 | score = np.array([box.score for box in pred_boxes]) 378 | pred_labels = np.array([box.label for box in pred_boxes]) 379 | 380 | if len(pred_boxes) > 0: 381 | pred_boxes = np.array([[box.xmin, box.ymin, box.xmax, box.ymax, box.score] for box in pred_boxes]) 382 | else: 383 | pred_boxes = np.array([[]]) 384 | 385 | # sort the boxes and the labels according to scores 386 | score_sort = np.argsort(-score) 387 | pred_labels = pred_labels[score_sort] 388 | pred_boxes = pred_boxes[score_sort] 389 | 390 | # copy detections to all_detections 391 | for label in range(generator.num_classes()): 392 | all_detections[i][label] = pred_boxes[pred_labels == label, :] 393 | 394 | annotations = generator.load_annotation(i) 395 | 396 | # copy detections to all_annotations 397 | for label in range(generator.num_classes()): 398 | all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy() 399 | 400 | # compute mAP by comparing all detections and all annotations 401 | average_precisions = {} 402 | 403 | for label in range(generator.num_classes()): 404 | false_positives = np.zeros((0,)) 405 | true_positives = np.zeros((0,)) 406 | scores = np.zeros((0,)) 407 | num_annotations = 0.0 408 | 409 | for i in range(generator.size()): 410 | detections = all_detections[i][label] 411 | annotations = all_annotations[i][label] 412 | num_annotations += annotations.shape[0] 413 | detected_annotations = [] 414 | 415 | for d in detections: 416 | scores = np.append(scores, d[4]) 417 | 418 | if annotations.shape[0] == 0: 419 | false_positives = np.append(false_positives, 1) 420 | true_positives = np.append(true_positives, 0) 421 | continue 422 | 423 | overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations) 424 | assigned_annotation = np.argmax(overlaps, axis=1) 425 | max_overlap = overlaps[0, assigned_annotation] 426 | 427 | if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations: 428 | false_positives = np.append(false_positives, 0) 429 | true_positives = np.append(true_positives, 1) 430 | detected_annotations.append(assigned_annotation) 431 | else: 432 | false_positives = np.append(false_positives, 1) 433 | true_positives = np.append(true_positives, 0) 434 | 435 | # no annotations -> AP for this class is 0 (is this correct?) 
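            # Editorial note (added): returning AP = 0 for a label with no
            # ground-truth annotations pulls the mean down; another common
            # convention is to skip such labels when averaging, so mAP values are
            # only comparable across runs that use the same convention.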
436 | if num_annotations == 0: 437 | average_precisions[label] = 0 438 | continue 439 | 440 | # sort by score 441 | indices = np.argsort(-scores) 442 | false_positives = false_positives[indices] 443 | true_positives = true_positives[indices] 444 | 445 | # compute false positives and true positives 446 | false_positives = np.cumsum(false_positives) 447 | true_positives = np.cumsum(true_positives) 448 | 449 | # compute recall and precision 450 | recall = true_positives / num_annotations 451 | precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps) 452 | 453 | # compute average precision 454 | average_precision = compute_ap(recall, precision) 455 | average_precisions[label] = average_precision 456 | 457 | return average_precisions 458 | 459 | def predict(self, image): 460 | image_h, image_w, _ = image.shape 461 | image = cv2.resize(image, (self.input_size, self.input_size)) 462 | image = self.feature_extractor.normalize(image) 463 | 464 | input_image = image[:,:,::-1] 465 | input_image = np.expand_dims(input_image, 0) 466 | dummy_array = np.zeros((1,1,1,1,self.max_box_per_image,4)) 467 | 468 | netout = self.model.predict([input_image, dummy_array])[0] 469 | boxes = decode_netout(netout, self.anchors, self.nb_class) 470 | 471 | return boxes -------------------------------------------------------------------------------- /Yolo Step-by-Step.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**Outline of Steps**\n", 8 | " + Initialization\n", 9 | " + Download COCO detection data from http://cocodataset.org/#download\n", 10 | " + http://images.cocodataset.org/zips/train2014.zip <= train images\n", 11 | " + http://images.cocodataset.org/zips/val2014.zip <= validation images\n", 12 | " + http://images.cocodataset.org/annotations/annotations_trainval2014.zip <= train and validation annotations\n", 13 | " + Run this script to convert annotations in COCO format to VOC format\n", 14 | " + https://gist.github.com/chicham/6ed3842d0d2014987186#file-coco2pascal-py\n", 15 | " + Download pre-trained weights from https://pjreddie.com/darknet/yolo/\n", 16 | " + https://pjreddie.com/media/files/yolo.weights\n", 17 | " + Specify the directory of train annotations (train_annot_folder) and train images (train_image_folder)\n", 18 | " + Specify the directory of validation annotations (valid_annot_folder) and validation images (valid_image_folder)\n", 19 | " + Specity the path of pre-trained weights by setting variable *wt_path*\n", 20 | " + Construct equivalent network in Keras\n", 21 | " + Network arch from https://github.com/pjreddie/darknet/blob/master/cfg/yolo-voc.cfg\n", 22 | " + Load the pretrained weights\n", 23 | " + Perform training \n", 24 | " + Perform detection on an image with newly trained weights\n", 25 | " + Perform detection on an video with newly trained weights" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "# Initialization" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 1, 45 | "metadata": { 46 | "ExecuteTime": { 47 | "end_time": "2018-04-04T00:18:52.056478", 48 | "start_time": "2018-04-04T00:18:50.879887" 49 | }, 50 | "code_folding": [], 51 | "scrolled": true 52 | }, 53 | "outputs": [ 54 | { 55 | "name": "stderr", 56 | 
"output_type": "stream", 57 | "text": [ 58 | "/home/zachary_arundel/freightkeras/env/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 59 | " from ._conv import register_converters as _register_converters\n", 60 | "Using TensorFlow backend.\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "from keras.models import Sequential, Model\n", 66 | "from keras.layers import Reshape, Activation, Conv2D, Input, MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda\n", 67 | "from keras.layers.advanced_activations import LeakyReLU\n", 68 | "from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard\n", 69 | "from keras.optimizers import SGD, Adam, RMSprop\n", 70 | "from keras.layers.merge import concatenate\n", 71 | "import matplotlib.pyplot as plt\n", 72 | "import keras.backend as K\n", 73 | "import tensorflow as tf\n", 74 | "import imgaug as ia\n", 75 | "from tqdm import tqdm\n", 76 | "from imgaug import augmenters as iaa\n", 77 | "import numpy as np\n", 78 | "import pickle\n", 79 | "import os, cv2\n", 80 | "from preprocessing import parse_annotation, BatchGenerator\n", 81 | "from utils import WeightReader, decode_netout, draw_boxes\n", 82 | "\n", 83 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", 84 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", 85 | "\n", 86 | "%matplotlib inline" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 2, 92 | "metadata": { 93 | "ExecuteTime": { 94 | "end_time": "2018-04-04T00:18:52.075535", 95 | "start_time": "2018-04-04T00:18:52.057712" 96 | }, 97 | "scrolled": true 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "LABELS = [\"car\", \"truck\", \"pickup\", \"tractor\", \"camping car\", \"boat\",\"motorcycle\", \"van\", \"other\", \"plane\"] \n", 102 | "\n", 103 | "IMAGE_H, IMAGE_W = 416, 416\n", 104 | "GRID_H, GRID_W = 13 , 13\n", 105 | "BOX = 5\n", 106 | "CLASS = len(LABELS)\n", 107 | "CLASS_WEIGHTS = np.ones(CLASS, dtype='float32')\n", 108 | "OBJ_THRESHOLD = 0.3#0.5\n", 109 | "NMS_THRESHOLD = 0.3#0.45\n", 110 | "ANCHORS = [0.88,1.69, 1.18,0.7, 1.65,1.77,1.77,0.9, 3.75, 3.57],\n", 111 | "\n", 112 | "NO_OBJECT_SCALE = 1.0\n", 113 | "OBJECT_SCALE = 5.0\n", 114 | "COORD_SCALE = 1.0\n", 115 | "CLASS_SCALE = 1.0\n", 116 | "\n", 117 | "BATCH_SIZE = 16\n", 118 | "WARM_UP_BATCHES = 0\n", 119 | "TRUE_BOX_BUFFER = 50" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 3, 125 | "metadata": { 126 | "ExecuteTime": { 127 | "end_time": "2018-04-04T00:18:52.981155", 128 | "start_time": "2018-04-04T00:18:52.978076" 129 | } 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "wt_path = 'full_yolo_backend.h5' \n", 134 | "train_image_folder = 'train_image_folder/'\n", 135 | "train_annot_folder = 'train_annot_folder/'\n", 136 | "valid_image_folder = 'valid_image_folder/'\n", 137 | "valid_annot_folder = 'valid_annot_folder/'" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "# Construct the network" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 4, 150 | "metadata": { 151 | "ExecuteTime": { 152 | "end_time": "2018-04-04T00:18:53.978220", 153 | "start_time": "2018-04-04T00:18:53.967537" 154 | } 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "# the function to implement the orgnization layer (thanks to github.com/allanzelener/YAD2K)\n", 159 | 
"def space_to_depth_x2(x):\n", 160 | " return tf.space_to_depth(x, block_size=2)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 5, 166 | "metadata": { 167 | "ExecuteTime": { 168 | "end_time": "2018-04-04T00:18:58.022959", 169 | "start_time": "2018-04-04T00:18:55.740759" 170 | }, 171 | "code_folding": [] 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "input_image = Input(shape=(IMAGE_H, IMAGE_W, 3))\n", 176 | "true_boxes = Input(shape=(1, 1, 1, TRUE_BOX_BUFFER , 4))\n", 177 | "\n", 178 | "# Layer 1\n", 179 | "x = Conv2D(32, (3,3), strides=(1,1), padding='same', name='conv_1', use_bias=False)(input_image)\n", 180 | "x = BatchNormalization(name='norm_1')(x)\n", 181 | "x = LeakyReLU(alpha=0.1)(x)\n", 182 | "x = MaxPooling2D(pool_size=(2, 2))(x)\n", 183 | "\n", 184 | "# Layer 2\n", 185 | "x = Conv2D(64, (3,3), strides=(1,1), padding='same', name='conv_2', use_bias=False)(x)\n", 186 | "x = BatchNormalization(name='norm_2')(x)\n", 187 | "x = LeakyReLU(alpha=0.1)(x)\n", 188 | "x = MaxPooling2D(pool_size=(2, 2))(x)\n", 189 | "\n", 190 | "# Layer 3\n", 191 | "x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_3', use_bias=False)(x)\n", 192 | "x = BatchNormalization(name='norm_3')(x)\n", 193 | "x = LeakyReLU(alpha=0.1)(x)\n", 194 | "\n", 195 | "# Layer 4\n", 196 | "x = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_4', use_bias=False)(x)\n", 197 | "x = BatchNormalization(name='norm_4')(x)\n", 198 | "x = LeakyReLU(alpha=0.1)(x)\n", 199 | "\n", 200 | "# Layer 5\n", 201 | "x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_5', use_bias=False)(x)\n", 202 | "x = BatchNormalization(name='norm_5')(x)\n", 203 | "x = LeakyReLU(alpha=0.1)(x)\n", 204 | "x = MaxPooling2D(pool_size=(2, 2))(x)\n", 205 | "\n", 206 | "# Layer 6\n", 207 | "x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_6', use_bias=False)(x)\n", 208 | "x = BatchNormalization(name='norm_6')(x)\n", 209 | "x = LeakyReLU(alpha=0.1)(x)\n", 210 | "\n", 211 | "# Layer 7\n", 212 | "x = Conv2D(128, (1,1), strides=(1,1), padding='same', name='conv_7', use_bias=False)(x)\n", 213 | "x = BatchNormalization(name='norm_7')(x)\n", 214 | "x = LeakyReLU(alpha=0.1)(x)\n", 215 | "\n", 216 | "# Layer 8\n", 217 | "x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_8', use_bias=False)(x)\n", 218 | "x = BatchNormalization(name='norm_8')(x)\n", 219 | "x = LeakyReLU(alpha=0.1)(x)\n", 220 | "x = MaxPooling2D(pool_size=(2, 2))(x)\n", 221 | "\n", 222 | "# Layer 9\n", 223 | "x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_9', use_bias=False)(x)\n", 224 | "x = BatchNormalization(name='norm_9')(x)\n", 225 | "x = LeakyReLU(alpha=0.1)(x)\n", 226 | "\n", 227 | "# Layer 10\n", 228 | "x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_10', use_bias=False)(x)\n", 229 | "x = BatchNormalization(name='norm_10')(x)\n", 230 | "x = LeakyReLU(alpha=0.1)(x)\n", 231 | "\n", 232 | "# Layer 11\n", 233 | "x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_11', use_bias=False)(x)\n", 234 | "x = BatchNormalization(name='norm_11')(x)\n", 235 | "x = LeakyReLU(alpha=0.1)(x)\n", 236 | "\n", 237 | "# Layer 12\n", 238 | "x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_12', use_bias=False)(x)\n", 239 | "x = BatchNormalization(name='norm_12')(x)\n", 240 | "x = LeakyReLU(alpha=0.1)(x)\n", 241 | "\n", 242 | "# Layer 13\n", 243 | "x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_13', use_bias=False)(x)\n", 244 | "x = 
BatchNormalization(name='norm_13')(x)\n", 245 | "x = LeakyReLU(alpha=0.1)(x)\n", 246 | "\n", 247 | "skip_connection = x\n", 248 | "\n", 249 | "x = MaxPooling2D(pool_size=(2, 2))(x)\n", 250 | "\n", 251 | "# Layer 14\n", 252 | "x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_14', use_bias=False)(x)\n", 253 | "x = BatchNormalization(name='norm_14')(x)\n", 254 | "x = LeakyReLU(alpha=0.1)(x)\n", 255 | "\n", 256 | "# Layer 15\n", 257 | "x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_15', use_bias=False)(x)\n", 258 | "x = BatchNormalization(name='norm_15')(x)\n", 259 | "x = LeakyReLU(alpha=0.1)(x)\n", 260 | "\n", 261 | "# Layer 16\n", 262 | "x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_16', use_bias=False)(x)\n", 263 | "x = BatchNormalization(name='norm_16')(x)\n", 264 | "x = LeakyReLU(alpha=0.1)(x)\n", 265 | "\n", 266 | "# Layer 17\n", 267 | "x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_17', use_bias=False)(x)\n", 268 | "x = BatchNormalization(name='norm_17')(x)\n", 269 | "x = LeakyReLU(alpha=0.1)(x)\n", 270 | "\n", 271 | "# Layer 18\n", 272 | "x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_18', use_bias=False)(x)\n", 273 | "x = BatchNormalization(name='norm_18')(x)\n", 274 | "x = LeakyReLU(alpha=0.1)(x)\n", 275 | "\n", 276 | "# Layer 19\n", 277 | "x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_19', use_bias=False)(x)\n", 278 | "x = BatchNormalization(name='norm_19')(x)\n", 279 | "x = LeakyReLU(alpha=0.1)(x)\n", 280 | "\n", 281 | "# Layer 20\n", 282 | "x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_20', use_bias=False)(x)\n", 283 | "x = BatchNormalization(name='norm_20')(x)\n", 284 | "x = LeakyReLU(alpha=0.1)(x)\n", 285 | "\n", 286 | "# Layer 21\n", 287 | "skip_connection = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_21', use_bias=False)(skip_connection)\n", 288 | "skip_connection = BatchNormalization(name='norm_21')(skip_connection)\n", 289 | "skip_connection = LeakyReLU(alpha=0.1)(skip_connection)\n", 290 | "skip_connection = Lambda(space_to_depth_x2)(skip_connection)\n", 291 | "\n", 292 | "x = concatenate([skip_connection, x])\n", 293 | "\n", 294 | "# Layer 22\n", 295 | "x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_22', use_bias=False)(x)\n", 296 | "x = BatchNormalization(name='norm_22')(x)\n", 297 | "x = LeakyReLU(alpha=0.1)(x)\n", 298 | "\n", 299 | "# Layer 23\n", 300 | "x = Conv2D(BOX * (4 + 1 + CLASS), (1,1), strides=(1,1), padding='same', name='conv_23')(x)\n", 301 | "output = Reshape((GRID_H, GRID_W, BOX, 4 + 1 + CLASS))(x)\n", 302 | "\n", 303 | "# small hack to allow true_boxes to be registered when Keras build the model \n", 304 | "# for more information: https://github.com/fchollet/keras/issues/2790\n", 305 | "output = Lambda(lambda args: args[0])([output, true_boxes])\n", 306 | "\n", 307 | "model = Model([input_image, true_boxes], output)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 6, 313 | "metadata": { 314 | "ExecuteTime": { 315 | "end_time": "2017-11-26T12:34:03.819802Z", 316 | "start_time": "2017-11-26T12:34:03.786125Z" 317 | }, 318 | "scrolled": false 319 | }, 320 | "outputs": [ 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "____________________________________________________________________________________________________\n", 326 | "Layer (type) Output Shape Param # Connected to \n", 327 | 
"====================================================================================================\n", 328 | "input_1 (InputLayer) (None, 416, 416, 3) 0 \n", 329 | "____________________________________________________________________________________________________\n", 330 | "conv_1 (Conv2D) (None, 416, 416, 32) 864 input_1[0][0] \n", 331 | "____________________________________________________________________________________________________\n", 332 | "norm_1 (BatchNormalization) (None, 416, 416, 32) 128 conv_1[0][0] \n", 333 | "____________________________________________________________________________________________________\n", 334 | "leaky_re_lu_1 (LeakyReLU) (None, 416, 416, 32) 0 norm_1[0][0] \n", 335 | "____________________________________________________________________________________________________\n", 336 | "max_pooling2d_1 (MaxPooling2D) (None, 208, 208, 32) 0 leaky_re_lu_1[0][0] \n", 337 | "____________________________________________________________________________________________________\n", 338 | "conv_2 (Conv2D) (None, 208, 208, 64) 18432 max_pooling2d_1[0][0] \n", 339 | "____________________________________________________________________________________________________\n", 340 | "norm_2 (BatchNormalization) (None, 208, 208, 64) 256 conv_2[0][0] \n", 341 | "____________________________________________________________________________________________________\n", 342 | "leaky_re_lu_2 (LeakyReLU) (None, 208, 208, 64) 0 norm_2[0][0] \n", 343 | "____________________________________________________________________________________________________\n", 344 | "max_pooling2d_2 (MaxPooling2D) (None, 104, 104, 64) 0 leaky_re_lu_2[0][0] \n", 345 | "____________________________________________________________________________________________________\n", 346 | "conv_3 (Conv2D) (None, 104, 104, 128) 73728 max_pooling2d_2[0][0] \n", 347 | "____________________________________________________________________________________________________\n", 348 | "norm_3 (BatchNormalization) (None, 104, 104, 128) 512 conv_3[0][0] \n", 349 | "____________________________________________________________________________________________________\n", 350 | "leaky_re_lu_3 (LeakyReLU) (None, 104, 104, 128) 0 norm_3[0][0] \n", 351 | "____________________________________________________________________________________________________\n", 352 | "conv_4 (Conv2D) (None, 104, 104, 64) 8192 leaky_re_lu_3[0][0] \n", 353 | "____________________________________________________________________________________________________\n", 354 | "norm_4 (BatchNormalization) (None, 104, 104, 64) 256 conv_4[0][0] \n", 355 | "____________________________________________________________________________________________________\n", 356 | "leaky_re_lu_4 (LeakyReLU) (None, 104, 104, 64) 0 norm_4[0][0] \n", 357 | "____________________________________________________________________________________________________\n", 358 | "conv_5 (Conv2D) (None, 104, 104, 128) 73728 leaky_re_lu_4[0][0] \n", 359 | "____________________________________________________________________________________________________\n", 360 | "norm_5 (BatchNormalization) (None, 104, 104, 128) 512 conv_5[0][0] \n", 361 | "____________________________________________________________________________________________________\n", 362 | "leaky_re_lu_5 (LeakyReLU) (None, 104, 104, 128) 0 norm_5[0][0] \n", 363 | "____________________________________________________________________________________________________\n", 364 | "max_pooling2d_3 (MaxPooling2D) (None, 52, 52, 128) 0 
leaky_re_lu_5[0][0] \n", 365 | "____________________________________________________________________________________________________\n", 366 | "conv_6 (Conv2D) (None, 52, 52, 256) 294912 max_pooling2d_3[0][0] \n", 367 | "____________________________________________________________________________________________________\n", 368 | "norm_6 (BatchNormalization) (None, 52, 52, 256) 1024 conv_6[0][0] \n", 369 | "____________________________________________________________________________________________________\n", 370 | "leaky_re_lu_6 (LeakyReLU) (None, 52, 52, 256) 0 norm_6[0][0] \n", 371 | "____________________________________________________________________________________________________\n", 372 | "conv_7 (Conv2D) (None, 52, 52, 128) 32768 leaky_re_lu_6[0][0] \n", 373 | "____________________________________________________________________________________________________\n", 374 | "norm_7 (BatchNormalization) (None, 52, 52, 128) 512 conv_7[0][0] \n", 375 | "____________________________________________________________________________________________________\n", 376 | "leaky_re_lu_7 (LeakyReLU) (None, 52, 52, 128) 0 norm_7[0][0] \n", 377 | "____________________________________________________________________________________________________\n", 378 | "conv_8 (Conv2D) (None, 52, 52, 256) 294912 leaky_re_lu_7[0][0] \n", 379 | "____________________________________________________________________________________________________\n", 380 | "norm_8 (BatchNormalization) (None, 52, 52, 256) 1024 conv_8[0][0] \n", 381 | "____________________________________________________________________________________________________\n", 382 | "leaky_re_lu_8 (LeakyReLU) (None, 52, 52, 256) 0 norm_8[0][0] \n", 383 | "____________________________________________________________________________________________________\n", 384 | "max_pooling2d_4 (MaxPooling2D) (None, 26, 26, 256) 0 leaky_re_lu_8[0][0] \n", 385 | "____________________________________________________________________________________________________\n", 386 | "conv_9 (Conv2D) (None, 26, 26, 512) 1179648 max_pooling2d_4[0][0] \n", 387 | "____________________________________________________________________________________________________\n", 388 | "norm_9 (BatchNormalization) (None, 26, 26, 512) 2048 conv_9[0][0] \n", 389 | "____________________________________________________________________________________________________\n", 390 | "leaky_re_lu_9 (LeakyReLU) (None, 26, 26, 512) 0 norm_9[0][0] \n", 391 | "____________________________________________________________________________________________________\n", 392 | "conv_10 (Conv2D) (None, 26, 26, 256) 131072 leaky_re_lu_9[0][0] \n", 393 | "____________________________________________________________________________________________________\n", 394 | "norm_10 (BatchNormalization) (None, 26, 26, 256) 1024 conv_10[0][0] \n", 395 | "____________________________________________________________________________________________________\n", 396 | "leaky_re_lu_10 (LeakyReLU) (None, 26, 26, 256) 0 norm_10[0][0] \n", 397 | "____________________________________________________________________________________________________\n", 398 | "conv_11 (Conv2D) (None, 26, 26, 512) 1179648 leaky_re_lu_10[0][0] \n", 399 | "____________________________________________________________________________________________________\n", 400 | "norm_11 (BatchNormalization) (None, 26, 26, 512) 2048 conv_11[0][0] \n", 401 | "____________________________________________________________________________________________________\n", 402 | 
"leaky_re_lu_11 (LeakyReLU) (None, 26, 26, 512) 0 norm_11[0][0] \n", 403 | "____________________________________________________________________________________________________\n", 404 | "conv_12 (Conv2D) (None, 26, 26, 256) 131072 leaky_re_lu_11[0][0] \n", 405 | "____________________________________________________________________________________________________\n", 406 | "norm_12 (BatchNormalization) (None, 26, 26, 256) 1024 conv_12[0][0] \n", 407 | "____________________________________________________________________________________________________\n", 408 | "leaky_re_lu_12 (LeakyReLU) (None, 26, 26, 256) 0 norm_12[0][0] \n", 409 | "____________________________________________________________________________________________________\n", 410 | "conv_13 (Conv2D) (None, 26, 26, 512) 1179648 leaky_re_lu_12[0][0] \n", 411 | "____________________________________________________________________________________________________\n", 412 | "norm_13 (BatchNormalization) (None, 26, 26, 512) 2048 conv_13[0][0] \n", 413 | "____________________________________________________________________________________________________\n", 414 | "leaky_re_lu_13 (LeakyReLU) (None, 26, 26, 512) 0 norm_13[0][0] \n", 415 | "____________________________________________________________________________________________________\n", 416 | "max_pooling2d_5 (MaxPooling2D) (None, 13, 13, 512) 0 leaky_re_lu_13[0][0] \n", 417 | "____________________________________________________________________________________________________\n", 418 | "conv_14 (Conv2D) (None, 13, 13, 1024) 4718592 max_pooling2d_5[0][0] \n", 419 | "____________________________________________________________________________________________________\n", 420 | "norm_14 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_14[0][0] \n", 421 | "____________________________________________________________________________________________________\n", 422 | "leaky_re_lu_14 (LeakyReLU) (None, 13, 13, 1024) 0 norm_14[0][0] \n", 423 | "____________________________________________________________________________________________________\n", 424 | "conv_15 (Conv2D) (None, 13, 13, 512) 524288 leaky_re_lu_14[0][0] \n", 425 | "____________________________________________________________________________________________________\n", 426 | "norm_15 (BatchNormalization) (None, 13, 13, 512) 2048 conv_15[0][0] \n", 427 | "____________________________________________________________________________________________________\n", 428 | "leaky_re_lu_15 (LeakyReLU) (None, 13, 13, 512) 0 norm_15[0][0] \n", 429 | "____________________________________________________________________________________________________\n", 430 | "conv_16 (Conv2D) (None, 13, 13, 1024) 4718592 leaky_re_lu_15[0][0] \n", 431 | "____________________________________________________________________________________________________\n", 432 | "norm_16 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_16[0][0] \n", 433 | "____________________________________________________________________________________________________\n", 434 | "leaky_re_lu_16 (LeakyReLU) (None, 13, 13, 1024) 0 norm_16[0][0] \n", 435 | "____________________________________________________________________________________________________\n", 436 | "conv_17 (Conv2D) (None, 13, 13, 512) 524288 leaky_re_lu_16[0][0] \n", 437 | "____________________________________________________________________________________________________\n", 438 | "norm_17 (BatchNormalization) (None, 13, 13, 512) 2048 conv_17[0][0] \n", 439 | 
"____________________________________________________________________________________________________\n", 440 | "leaky_re_lu_17 (LeakyReLU) (None, 13, 13, 512) 0 norm_17[0][0] \n", 441 | "____________________________________________________________________________________________________\n", 442 | "conv_18 (Conv2D) (None, 13, 13, 1024) 4718592 leaky_re_lu_17[0][0] \n", 443 | "____________________________________________________________________________________________________\n", 444 | "norm_18 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_18[0][0] \n", 445 | "____________________________________________________________________________________________________\n", 446 | "leaky_re_lu_18 (LeakyReLU) (None, 13, 13, 1024) 0 norm_18[0][0] \n", 447 | "____________________________________________________________________________________________________\n", 448 | "conv_19 (Conv2D) (None, 13, 13, 1024) 9437184 leaky_re_lu_18[0][0] \n", 449 | "____________________________________________________________________________________________________\n", 450 | "norm_19 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_19[0][0] \n", 451 | "____________________________________________________________________________________________________\n", 452 | "conv_21 (Conv2D) (None, 26, 26, 64) 32768 leaky_re_lu_13[0][0] \n", 453 | "____________________________________________________________________________________________________\n", 454 | "leaky_re_lu_19 (LeakyReLU) (None, 13, 13, 1024) 0 norm_19[0][0] \n", 455 | "____________________________________________________________________________________________________\n", 456 | "norm_21 (BatchNormalization) (None, 26, 26, 64) 256 conv_21[0][0] \n", 457 | "____________________________________________________________________________________________________\n", 458 | "conv_20 (Conv2D) (None, 13, 13, 1024) 9437184 leaky_re_lu_19[0][0] \n", 459 | "____________________________________________________________________________________________________\n", 460 | "leaky_re_lu_21 (LeakyReLU) (None, 26, 26, 64) 0 norm_21[0][0] \n", 461 | "____________________________________________________________________________________________________\n", 462 | "norm_20 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_20[0][0] \n", 463 | "____________________________________________________________________________________________________\n", 464 | "lambda_1 (Lambda) (None, 13, 13, 256) 0 leaky_re_lu_21[0][0] \n", 465 | "____________________________________________________________________________________________________\n", 466 | "leaky_re_lu_20 (LeakyReLU) (None, 13, 13, 1024) 0 norm_20[0][0] \n", 467 | "____________________________________________________________________________________________________\n", 468 | "concatenate_1 (Concatenate) (None, 13, 13, 1280) 0 lambda_1[0][0] \n", 469 | " leaky_re_lu_20[0][0] \n", 470 | "____________________________________________________________________________________________________\n", 471 | "conv_22 (Conv2D) (None, 13, 13, 1024) 11796480 concatenate_1[0][0] \n", 472 | "____________________________________________________________________________________________________\n", 473 | "norm_22 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_22[0][0] \n", 474 | "____________________________________________________________________________________________________\n", 475 | "leaky_re_lu_22 (LeakyReLU) (None, 13, 13, 1024) 0 norm_22[0][0] \n", 476 | "____________________________________________________________________________________________________\n", 
477 | "conv_23 (Conv2D) (None, 13, 13, 30) 30750 leaky_re_lu_22[0][0] \n", 478 | "____________________________________________________________________________________________________\n", 479 | "reshape_1 (Reshape) (None, 13, 13, 5, 6) 0 conv_23[0][0] \n", 480 | "____________________________________________________________________________________________________\n", 481 | "input_2 (InputLayer) (None, 1, 1, 1, 50, 4 0 \n", 482 | "____________________________________________________________________________________________________\n", 483 | "lambda_2 (Lambda) (None, 13, 13, 5, 6) 0 reshape_1[0][0] \n", 484 | " input_2[0][0] \n", 485 | "====================================================================================================\n", 486 | "Total params: 50,578,686\n", 487 | "Trainable params: 50,558,014\n", 488 | "Non-trainable params: 20,672\n", 489 | "____________________________________________________________________________________________________\n" 490 | ] 491 | } 492 | ], 493 | "source": [ 494 | "model.summary()" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "# Load pretrained weights" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "**Load the weights originally provided by YOLO**" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 7, 514 | "metadata": { 515 | "ExecuteTime": { 516 | "end_time": "2018-04-04T00:18:58.168386", 517 | "start_time": "2018-04-04T00:18:58.110194" 518 | } 519 | }, 520 | "outputs": [], 521 | "source": [ 522 | "weight_reader = WeightReader(wt_path)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 8, 528 | "metadata": { 529 | "ExecuteTime": { 530 | "end_time": "2018-04-04T00:19:04.250579", 531 | "start_time": "2018-04-04T00:18:58.711706" 532 | } 533 | }, 534 | "outputs": [], 535 | "source": [ 536 | "weight_reader.reset()\n", 537 | "nb_conv = 23\n", 538 | "\n", 539 | "for i in range(1, nb_conv+1):\n", 540 | " conv_layer = model.get_layer('conv_' + str(i))\n", 541 | " \n", 542 | " if i < nb_conv:\n", 543 | " norm_layer = model.get_layer('norm_' + str(i))\n", 544 | " \n", 545 | " size = np.prod(norm_layer.get_weights()[0].shape)\n", 546 | "\n", 547 | " beta = weight_reader.read_bytes(size)\n", 548 | " gamma = weight_reader.read_bytes(size)\n", 549 | " mean = weight_reader.read_bytes(size)\n", 550 | " var = weight_reader.read_bytes(size)\n", 551 | "\n", 552 | " weights = norm_layer.set_weights([gamma, beta, mean, var]) \n", 553 | " \n", 554 | " if len(conv_layer.get_weights()) > 1:\n", 555 | " bias = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[1].shape))\n", 556 | " kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))\n", 557 | " kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))\n", 558 | " kernel = kernel.transpose([2,3,1,0])\n", 559 | " conv_layer.set_weights([kernel, bias])\n", 560 | " else:\n", 561 | " kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))\n", 562 | " kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))\n", 563 | " kernel = kernel.transpose([2,3,1,0])\n", 564 | " conv_layer.set_weights([kernel])" 565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": {}, 570 | "source": [ 571 | "**Randomize weights of the last layer**" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 9, 577 | "metadata": { 578 | "ExecuteTime": { 579 | 
"end_time": "2017-11-22T14:08:00.245248Z", 580 | "start_time": "2017-11-22T14:08:00.215495Z" 581 | } 582 | }, 583 | "outputs": [], 584 | "source": [ 585 | "layer = model.layers[-4] # the last convolutional layer\n", 586 | "weights = layer.get_weights()\n", 587 | "\n", 588 | "new_kernel = np.random.normal(size=weights[0].shape)/(GRID_H*GRID_W)\n", 589 | "new_bias = np.random.normal(size=weights[1].shape)/(GRID_H*GRID_W)\n", 590 | "\n", 591 | "layer.set_weights([new_kernel, new_bias])" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "# Perform training" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": {}, 604 | "source": [ 605 | "**Loss function**" 606 | ] 607 | }, 608 | { 609 | "cell_type": "markdown", 610 | "metadata": { 611 | "ExecuteTime": { 612 | "end_time": "2017-02-01T20:44:50.211553", 613 | "start_time": "2017-02-01T20:44:50.206006" 614 | } 615 | }, 616 | "source": [ 617 | "$$\\begin{multline}\n", 618 | "\\lambda_\\textbf{coord}\n", 619 | "\\sum_{i = 0}^{S^2}\n", 620 | " \\sum_{j = 0}^{B}\n", 621 | " L_{ij}^{\\text{obj}}\n", 622 | " \\left[\n", 623 | " \\left(\n", 624 | " x_i - \\hat{x}_i\n", 625 | " \\right)^2 +\n", 626 | " \\left(\n", 627 | " y_i - \\hat{y}_i\n", 628 | " \\right)^2\n", 629 | " \\right]\n", 630 | "\\\\\n", 631 | "+ \\lambda_\\textbf{coord} \n", 632 | "\\sum_{i = 0}^{S^2}\n", 633 | " \\sum_{j = 0}^{B}\n", 634 | " L_{ij}^{\\text{obj}}\n", 635 | " \\left[\n", 636 | " \\left(\n", 637 | " \\sqrt{w_i} - \\sqrt{\\hat{w}_i}\n", 638 | " \\right)^2 +\n", 639 | " \\left(\n", 640 | " \\sqrt{h_i} - \\sqrt{\\hat{h}_i}\n", 641 | " \\right)^2\n", 642 | " \\right]\n", 643 | "\\\\\n", 644 | "+ \\sum_{i = 0}^{S^2}\n", 645 | " \\sum_{j = 0}^{B}\n", 646 | " L_{ij}^{\\text{obj}}\n", 647 | " \\left(\n", 648 | " C_i - \\hat{C}_i\n", 649 | " \\right)^2\n", 650 | "\\\\\n", 651 | "+ \\lambda_\\textrm{noobj}\n", 652 | "\\sum_{i = 0}^{S^2}\n", 653 | " \\sum_{j = 0}^{B}\n", 654 | " L_{ij}^{\\text{noobj}}\n", 655 | " \\left(\n", 656 | " C_i - \\hat{C}_i\n", 657 | " \\right)^2\n", 658 | "\\\\\n", 659 | "+ \\sum_{i = 0}^{S^2}\n", 660 | "L_i^{\\text{obj}}\n", 661 | " \\sum_{c \\in \\textrm{classes}}\n", 662 | " \\left(\n", 663 | " p_i(c) - \\hat{p}_i(c)\n", 664 | " \\right)^2\n", 665 | "\\end{multline}$$" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": 10, 671 | "metadata": { 672 | "ExecuteTime": { 673 | "end_time": "2017-11-26T12:34:28.064549Z", 674 | "start_time": "2017-11-26T12:34:27.800510Z" 675 | }, 676 | "code_folding": [] 677 | }, 678 | "outputs": [], 679 | "source": [ 680 | "def custom_loss(y_true, y_pred):\n", 681 | " mask_shape = tf.shape(y_true)[:4]\n", 682 | " \n", 683 | " cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(GRID_W), [GRID_H]), (1, GRID_H, GRID_W, 1, 1)))\n", 684 | " cell_y = tf.transpose(cell_x, (0,2,1,3,4))\n", 685 | "\n", 686 | " cell_grid = tf.tile(tf.concat([cell_x,cell_y], -1), [BATCH_SIZE, 1, 1, 5, 1])\n", 687 | " \n", 688 | " coord_mask = tf.zeros(mask_shape)\n", 689 | " conf_mask = tf.zeros(mask_shape)\n", 690 | " class_mask = tf.zeros(mask_shape)\n", 691 | " \n", 692 | " seen = tf.Variable(0.)\n", 693 | " total_recall = tf.Variable(0.)\n", 694 | " \n", 695 | " \"\"\"\n", 696 | " Adjust prediction\n", 697 | " \"\"\"\n", 698 | " ### adjust x and y \n", 699 | " pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid\n", 700 | " \n", 701 | " ### adjust w and h\n", 702 | " pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(ANCHORS, [1,1,1,BOX,2])\n", 703 | " \n", 
704 | " ### adjust confidence\n", 705 | " pred_box_conf = tf.sigmoid(y_pred[..., 4])\n", 706 | " \n", 707 | " ### adjust class probabilities\n", 708 | " pred_box_class = y_pred[..., 5:]\n", 709 | " \n", 710 | " \"\"\"\n", 711 | " Adjust ground truth\n", 712 | " \"\"\"\n", 713 | " ### adjust x and y\n", 714 | " true_box_xy = y_true[..., 0:2] # relative position to the containing cell\n", 715 | " \n", 716 | " ### adjust w and h\n", 717 | " true_box_wh = y_true[..., 2:4] # number of cells accross, horizontally and vertically\n", 718 | " \n", 719 | " ### adjust confidence\n", 720 | " true_wh_half = true_box_wh / 2.\n", 721 | " true_mins = true_box_xy - true_wh_half\n", 722 | " true_maxes = true_box_xy + true_wh_half\n", 723 | " \n", 724 | " pred_wh_half = pred_box_wh / 2.\n", 725 | " pred_mins = pred_box_xy - pred_wh_half\n", 726 | " pred_maxes = pred_box_xy + pred_wh_half \n", 727 | " \n", 728 | " intersect_mins = tf.maximum(pred_mins, true_mins)\n", 729 | " intersect_maxes = tf.minimum(pred_maxes, true_maxes)\n", 730 | " intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)\n", 731 | " intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]\n", 732 | " \n", 733 | " true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]\n", 734 | " pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]\n", 735 | "\n", 736 | " union_areas = pred_areas + true_areas - intersect_areas\n", 737 | " iou_scores = tf.truediv(intersect_areas, union_areas)\n", 738 | " \n", 739 | " true_box_conf = iou_scores * y_true[..., 4]\n", 740 | " \n", 741 | " ### adjust class probabilities\n", 742 | " true_box_class = tf.argmax(y_true[..., 5:], -1)\n", 743 | " \n", 744 | " \"\"\"\n", 745 | " Determine the masks\n", 746 | " \"\"\"\n", 747 | " ### coordinate mask: simply the position of the ground truth boxes (the predictors)\n", 748 | " coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * COORD_SCALE\n", 749 | " \n", 750 | " ### confidence mask: penelize predictors + penalize boxes with low IOU\n", 751 | " # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6\n", 752 | " true_xy = true_boxes[..., 0:2]\n", 753 | " true_wh = true_boxes[..., 2:4]\n", 754 | " \n", 755 | " true_wh_half = true_wh / 2.\n", 756 | " true_mins = true_xy - true_wh_half\n", 757 | " true_maxes = true_xy + true_wh_half\n", 758 | " \n", 759 | " pred_xy = tf.expand_dims(pred_box_xy, 4)\n", 760 | " pred_wh = tf.expand_dims(pred_box_wh, 4)\n", 761 | " \n", 762 | " pred_wh_half = pred_wh / 2.\n", 763 | " pred_mins = pred_xy - pred_wh_half\n", 764 | " pred_maxes = pred_xy + pred_wh_half \n", 765 | " \n", 766 | " intersect_mins = tf.maximum(pred_mins, true_mins)\n", 767 | " intersect_maxes = tf.minimum(pred_maxes, true_maxes)\n", 768 | " intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)\n", 769 | " intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]\n", 770 | " \n", 771 | " true_areas = true_wh[..., 0] * true_wh[..., 1]\n", 772 | " pred_areas = pred_wh[..., 0] * pred_wh[..., 1]\n", 773 | "\n", 774 | " union_areas = pred_areas + true_areas - intersect_areas\n", 775 | " iou_scores = tf.truediv(intersect_areas, union_areas)\n", 776 | "\n", 777 | " best_ious = tf.reduce_max(iou_scores, axis=4)\n", 778 | " conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * NO_OBJECT_SCALE\n", 779 | " \n", 780 | " # penalize the confidence of the boxes, which are reponsible for corresponding ground truth box\n", 781 | " conf_mask = conf_mask + y_true[..., 4] * OBJECT_SCALE\n", 782 | " \n", 
783 | " ### class mask: simply the position of the ground truth boxes (the predictors)\n", 784 | " class_mask = y_true[..., 4] * tf.gather(CLASS_WEIGHTS, true_box_class) * CLASS_SCALE \n", 785 | " \n", 786 | " \"\"\"\n", 787 | " Warm-up training\n", 788 | " \"\"\"\n", 789 | " no_boxes_mask = tf.to_float(coord_mask < COORD_SCALE/2.)\n", 790 | " seen = tf.assign_add(seen, 1.)\n", 791 | " \n", 792 | " true_box_xy, true_box_wh, coord_mask = tf.cond(tf.less(seen, WARM_UP_BATCHES), \n", 793 | " lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask, \n", 794 | " true_box_wh + tf.ones_like(true_box_wh) * np.reshape(ANCHORS, [1,1,1,BOX,2]) * no_boxes_mask, \n", 795 | " tf.ones_like(coord_mask)],\n", 796 | " lambda: [true_box_xy, \n", 797 | " true_box_wh,\n", 798 | " coord_mask])\n", 799 | " \n", 800 | " \"\"\"\n", 801 | " Finalize the loss\n", 802 | " \"\"\"\n", 803 | " nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))\n", 804 | " nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))\n", 805 | " nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))\n", 806 | " \n", 807 | " loss_xy = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2.\n", 808 | " loss_wh = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2.\n", 809 | " loss_conf = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2.\n", 810 | " loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)\n", 811 | " loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)\n", 812 | " \n", 813 | " loss = loss_xy + loss_wh + loss_conf + loss_class\n", 814 | " \n", 815 | " nb_true_box = tf.reduce_sum(y_true[..., 4])\n", 816 | " nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > 0.3))\n", 817 | "\n", 818 | " \"\"\"\n", 819 | " Debugging code\n", 820 | " \"\"\" \n", 821 | " current_recall = nb_pred_box/(nb_true_box + 1e-6)\n", 822 | " total_recall = tf.assign_add(total_recall, current_recall) \n", 823 | "\n", 824 | " loss = tf.Print(loss, [tf.zeros((1))], message='Dummy Line \\t', summarize=1000)\n", 825 | " loss = tf.Print(loss, [loss_xy], message='Loss XY \\t', summarize=1000)\n", 826 | " loss = tf.Print(loss, [loss_wh], message='Loss WH \\t', summarize=1000)\n", 827 | " loss = tf.Print(loss, [loss_conf], message='Loss Conf \\t', summarize=1000)\n", 828 | " loss = tf.Print(loss, [loss_class], message='Loss Class \\t', summarize=1000)\n", 829 | " loss = tf.Print(loss, [loss], message='Total Loss \\t', summarize=1000)\n", 830 | " loss = tf.Print(loss, [current_recall], message='Current Recall \\t', summarize=1000)\n", 831 | " loss = tf.Print(loss, [total_recall/seen], message='Average Recall \\t', summarize=1000)\n", 832 | " \n", 833 | " return loss" 834 | ] 835 | }, 836 | { 837 | "cell_type": "markdown", 838 | "metadata": {}, 839 | "source": [ 840 | "**Parse the annotations to construct train generator and validation generator**" 841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": 11, 846 | "metadata": { 847 | "ExecuteTime": { 848 | "end_time": "2017-11-26T12:38:44.283547Z", 849 | "start_time": "2017-11-26T12:38:44.277155Z" 850 | } 851 | }, 852 | "outputs": [], 853 | "source": [ 854 | "generator_config = {\n", 855 | " 'IMAGE_H' : IMAGE_H, \n", 856 | " 'IMAGE_W' : IMAGE_W,\n", 857 | " 'GRID_H' : GRID_H, \n", 858 | " 'GRID_W' : GRID_W,\n", 859 | " 'BOX' : BOX,\n", 860 | " 'LABELS' : 
LABELS,\n", 861 | " 'CLASS' : len(LABELS),\n", 862 | " 'ANCHORS' : ANCHORS,\n", 863 | " 'BATCH_SIZE' : BATCH_SIZE,\n", 864 | " 'TRUE_BOX_BUFFER' : 50,\n", 865 | "}" 866 | ] 867 | }, 868 | { 869 | "cell_type": "code", 870 | "execution_count": 12, 871 | "metadata": {}, 872 | "outputs": [], 873 | "source": [ 874 | "def normalize(image):\n", 875 | " return image / 255." 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": 13, 881 | "metadata": { 882 | "ExecuteTime": { 883 | "end_time": "2017-11-26T12:38:51.836129Z", 884 | "start_time": "2017-11-26T12:38:51.766843Z" 885 | } 886 | }, 887 | "outputs": [], 888 | "source": [ 889 | "train_imgs, seen_train_labels = parse_annotation(train_annot_folder, train_image_folder, labels=LABELS)\n", 890 | "### write parsed annotations to pickle for fast retrieval next time\n", 891 | "#with open('train_imgs', 'wb') as fp:\n", 892 | "# pickle.dump(train_imgs, fp)\n", 893 | "\n", 894 | "### read saved pickle of parsed annotations\n", 895 | "#with open ('train_imgs', 'rb') as fp:\n", 896 | "# train_imgs = pickle.load(fp)\n", 897 | "train_batch = BatchGenerator(train_imgs, generator_config, norm=normalize)\n", 898 | "\n", 899 | "valid_imgs, seen_valid_labels = parse_annotation(valid_annot_folder, valid_image_folder, labels=LABELS)\n", 900 | "### write parsed annotations to pickle for fast retrieval next time\n", 901 | "#with open('valid_imgs', 'wb') as fp:\n", 902 | "# pickle.dump(valid_imgs, fp)\n", 903 | "\n", 904 | "### read saved pickle of parsed annotations\n", 905 | "#with open ('valid_imgs', 'rb') as fp:\n", 906 | "# valid_imgs = pickle.load(fp)\n", 907 | "valid_batch = BatchGenerator(valid_imgs, generator_config, norm=normalize, jitter=False)" 908 | ] 909 | }, 910 | { 911 | "cell_type": "markdown", 912 | "metadata": {}, 913 | "source": [ 914 | "**Setup a few callbacks and start the training**" 915 | ] 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": 14, 920 | "metadata": { 921 | "ExecuteTime": { 922 | "end_time": "2017-11-26T12:38:15.714460Z", 923 | "start_time": "2017-11-26T12:38:15.708674Z" 924 | }, 925 | "code_folding": [] 926 | }, 927 | "outputs": [], 928 | "source": [ 929 | "early_stop = EarlyStopping(monitor='val_loss', \n", 930 | " min_delta=0.001, \n", 931 | " patience=10, \n", 932 | " mode='min', \n", 933 | " verbose=1)\n", 934 | "\n", 935 | "checkpoint = ModelCheckpoint('weights_truck2.h5', \n", 936 | " monitor='val_loss', \n", 937 | " verbose=1, \n", 938 | " save_best_only=True, \n", 939 | " mode='min', \n", 940 | " period=1)" 941 | ] 942 | }, 943 | { 944 | "cell_type": "code", 945 | "execution_count": 15, 946 | "metadata": { 947 | "ExecuteTime": { 948 | "start_time": "2017-11-26T20:38:54.037Z" 949 | }, 950 | "scrolled": false 951 | }, 952 | "outputs": [ 953 | { 954 | "name": "stdout", 955 | "output_type": "stream", 956 | "text": [ 957 | "Epoch 1/100\n", 958 | "12/13 [==========================>...] - ETA: 4s - loss: 0.5833Epoch 00000: val_loss improved from inf to 0.47258, saving model to weights_truck2.h5\n", 959 | "13/13 [==============================] - 70s - loss: 0.5712 - val_loss: 0.4726\n", 960 | "Epoch 2/100\n", 961 | "12/13 [==========================>...] - ETA: 3s - loss: 0.5854Epoch 00001: val_loss improved from 0.47258 to 0.44968, saving model to weights_truck2.h5\n", 962 | "13/13 [==============================] - 68s - loss: 0.5872 - val_loss: 0.4497\n", 963 | "Epoch 3/100\n", 964 | "12/13 [==========================>...] 
- ETA: 3s - loss: 0.5750Epoch 00002: val_loss improved from 0.44968 to 0.43230, saving model to weights_truck2.h5\n", 965 | "13/13 [==============================] - 61s - loss: 0.5659 - val_loss: 0.4323\n", 966 | "Epoch 4/100\n", 967 | "12/13 [==========================>...] - ETA: 3s - loss: 0.5410Epoch 00003: val_loss did not improve\n", 968 | "13/13 [==============================] - 52s - loss: 0.5439 - val_loss: 0.4491\n", 969 | "Epoch 5/100\n", 970 | "12/13 [==========================>...] - ETA: 3s - loss: 0.5806Epoch 00004: val_loss did not improve\n", 971 | "13/13 [==============================] - 48s - loss: 0.5815 - val_loss: 0.4697\n", 972 | "Epoch 6/100\n", 973 | "12/13 [==========================>...] - ETA: 3s - loss: 0.6094Epoch 00005: val_loss did not improve\n", 974 | "13/13 [==============================] - 51s - loss: 0.6043 - val_loss: 0.4687\n", 975 | "Epoch 7/100\n", 976 | "12/13 [==========================>...] - ETA: 3s - loss: 0.5725Epoch 00006: val_loss did not improve\n", 977 | "13/13 [==============================] - 52s - loss: 0.5736 - val_loss: 0.4330\n", 978 | "Epoch 8/100\n", 979 | "12/13 [==========================>...] - ETA: 3s - loss: 0.5668Epoch 00007: val_loss did not improve\n", 980 | "13/13 [==============================] - 50s - loss: 0.5612 - val_loss: 0.4426\n", 981 | "Epoch 9/100\n", 982 | "12/13 [==========================>...] - ETA: 3s - loss: 0.5820Epoch 00008: val_loss did not improve\n", 983 | "13/13 [==============================] - 49s - loss: 0.5785 - val_loss: 0.4446\n", 984 | "Epoch 10/100\n", 985 | "11/13 [========================>.....] - ETA: 6s - loss: 0.5646" 986 | ] 987 | }, 988 | { 989 | "ename": "KeyboardInterrupt", 990 | "evalue": "", 991 | "output_type": "error", 992 | "traceback": [ 993 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 994 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 995 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mvalidation_steps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalid_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0mcallbacks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mearly_stop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcheckpoint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensorboard\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 24\u001b[0;31m max_queue_size = 3)\n\u001b[0m\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 996 | "\u001b[0;32m~/freightkeras/env/lib/python3.5/site-packages/keras/legacy/interfaces.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 85\u001b[0m warnings.warn('Update your `' + object_name +\n\u001b[1;32m 86\u001b[0m '` call to the Keras 2 API: ' + signature, stacklevel=2)\n\u001b[0;32m---> 87\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 88\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_original_function\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 997 | "\u001b[0;32m~/freightkeras/env/lib/python3.5/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36mfit_generator\u001b[0;34m(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)\u001b[0m\n\u001b[1;32m 2009\u001b[0m \u001b[0mbatch_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2010\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0msteps_done\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2011\u001b[0;31m \u001b[0mgenerator_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_generator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2012\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2013\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgenerator_output\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__len__'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 998 | "\u001b[0;32m~/freightkeras/env/lib/python3.5/site-packages/keras/utils/data_utils.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 503\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_running\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 505\u001b[0;31m \u001b[0minputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mqueue\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mblock\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 506\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 507\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 999 | "\u001b[0;32m/usr/lib/python3.5/multiprocessing/pool.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 600\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 601\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 602\u001b[0;31m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 603\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 604\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTimeoutError\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1000 | "\u001b[0;32m/usr/lib/python3.5/multiprocessing/pool.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 597\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 599\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_event\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 600\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 601\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1001 | "\u001b[0;32m/usr/lib/python3.5/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 547\u001b[0m \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_flag\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 548\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 549\u001b[0;31m \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cond\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 550\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 551\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 1002 | "\u001b[0;32m/usr/lib/python3.5/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 293\u001b[0;31m \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 294\u001b[0m \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1003 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 1004 | ] 1005 | } 1006 | ], 1007 | "source": [ 1008 | 
"tb_counter = len([log for log in os.listdir(os.path.expanduser('~/logs/')) if 'truck_' in log]) + 1\n", 1009 | "tensorboard = TensorBoard(log_dir=os.path.expanduser('~/logs/') + 'truck_' + '_' + str(tb_counter), \n", 1010 | " histogram_freq=0, \n", 1011 | " write_graph=True, \n", 1012 | " write_images=False)\n", 1013 | "\n", 1014 | "optimizer = Adam(lr=0.1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)\n", 1015 | "#optimizer = SGD(lr=1e-4, decay=0.0005, momentum=0.9)\n", 1016 | "#optimizer = RMSprop(lr=1e-4, rho=0.9, epsilon=1e-08, decay=0.0)\n", 1017 | "\n", 1018 | "model.load_weights(\"wednesday2.h5\")\n", 1019 | "\n", 1020 | "model.compile(loss=custom_loss, optimizer=optimizer)\n", 1021 | "\n", 1022 | "\n", 1023 | "\n", 1024 | "#history = model.fit_generator(generator = train_batch, \n", 1025 | " steps_per_epoch = len(train_batch), \n", 1026 | " epochs = 100, \n", 1027 | " verbose = 1,\n", 1028 | " validation_data = valid_batch,\n", 1029 | " validation_steps = len(valid_batch),\n", 1030 | " callbacks = [early_stop, checkpoint, tensorboard], \n", 1031 | " max_queue_size = 3)\n", 1032 | "\n", 1033 | "#print(history.history.keys())\n", 1034 | "# summarize history for accuracy\n", 1035 | "#plt.plot(history.history['loss'])\n", 1036 | "#plt.plot(history.history['val_loss'])\n", 1037 | "#plt.title('model loss')\n", 1038 | "#plt.ylabel('loss')\n", 1039 | "#plt.xlabel('epoch')\n", 1040 | "#plt.legend(['train', 'test'], loc='upper left')\n", 1041 | "#plt.show()" 1042 | ] 1043 | }, 1044 | { 1045 | "cell_type": "markdown", 1046 | "metadata": {}, 1047 | "source": [ 1048 | "# Perform detection on image" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "code", 1053 | "execution_count": 1, 1054 | "metadata": { 1055 | "ExecuteTime": { 1056 | "end_time": "2017-11-22T14:07:49.271978Z", 1057 | "start_time": "2017-11-22T14:07:49.268999Z" 1058 | } 1059 | }, 1060 | "outputs": [ 1061 | { 1062 | "ename": "NameError", 1063 | "evalue": "name 'model' is not defined", 1064 | "output_type": "error", 1065 | "traceback": [ 1066 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 1067 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 1068 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_weights\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"best_weights.h5\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 1069 | "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined" 1070 | ] 1071 | } 1072 | ], 1073 | "source": [ 1074 | "model.load_weights(\"best_weights.h5\")" 1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "code", 1079 | "execution_count": null, 1080 | "metadata": { 1081 | "ExecuteTime": { 1082 | "end_time": "2018-04-04T00:19:07.263359", 1083 | "start_time": "2018-04-04T00:19:05.658285" 1084 | }, 1085 | "scrolled": false 1086 | }, 1087 | "outputs": [], 1088 | "source": [ 1089 | "image = cv2.imread('train_image_folder/00001018.jpg')\n", 1090 | "dummy_array = np.zeros((1,1,1,1,TRUE_BOX_BUFFER,4))\n", 1091 | "\n", 1092 | "plt.figure(figsize=(10,10))\n", 1093 | "\n", 1094 | "input_image = cv2.resize(image, (416, 416))\n", 1095 | "input_image = input_image / 255.\n", 1096 | "input_image = input_image[:,:,::-1]\n", 1097 | "input_image = np.expand_dims(input_image, 0)\n", 1098 | "\n", 1099 | "netout = model.predict([input_image, dummy_array])\n", 1100 | "\n", 1101 | "boxes = decode_netout(netout[0], \n", 1102 | 
" obj_threshold=0.3,\n", 1103 | " nms_threshold=NMS_THRESHOLD,\n", 1104 | " anchors=ANCHORS, \n", 1105 | " nb_class=CLASS)\n", 1106 | " \n", 1107 | "image = draw_boxes(image, boxes, labels=LABELS)\n", 1108 | "\n", 1109 | "plt.imshow(image[:,:,::-1]); plt.show()" 1110 | ] 1111 | }, 1112 | { 1113 | "cell_type": "markdown", 1114 | "metadata": {}, 1115 | "source": [ 1116 | "# Perform detection on video" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": null, 1122 | "metadata": { 1123 | "ExecuteTime": { 1124 | "end_time": "2017-10-06T13:28:28.029334Z", 1125 | "start_time": "2017-10-06T13:28:28.024662Z" 1126 | } 1127 | }, 1128 | "outputs": [], 1129 | "source": [ 1130 | "model.load_weights(\"weights_coco.h5\")\n", 1131 | "\n", 1132 | "dummy_array = np.zeros((1,1,1,1,TRUE_BOX_BUFFER,4))" 1133 | ] 1134 | }, 1135 | { 1136 | "cell_type": "code", 1137 | "execution_count": null, 1138 | "metadata": { 1139 | "ExecuteTime": { 1140 | "end_time": "2017-10-06T13:39:09.640646Z", 1141 | "start_time": "2017-10-06T13:31:44.627609Z" 1142 | } 1143 | }, 1144 | "outputs": [], 1145 | "source": [ 1146 | "video_inp = '../basic-yolo-keras/images/phnom_penh.mp4'\n", 1147 | "video_out = '../basic-yolo-keras/images/phnom_penh_bbox.mp4'\n", 1148 | "\n", 1149 | "video_reader = cv2.VideoCapture(video_inp)\n", 1150 | "\n", 1151 | "nb_frames = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))\n", 1152 | "frame_h = int(video_reader.get(cv2.CAP_PROP_FRAME_HEIGHT))\n", 1153 | "frame_w = int(video_reader.get(cv2.CAP_PROP_FRAME_WIDTH))\n", 1154 | "\n", 1155 | "video_writer = cv2.VideoWriter(video_out,\n", 1156 | " cv2.VideoWriter_fourcc(*'XVID'), \n", 1157 | " 50.0, \n", 1158 | " (frame_w, frame_h))\n", 1159 | "\n", 1160 | "for i in tqdm(range(nb_frames)):\n", 1161 | " ret, image = video_reader.read()\n", 1162 | " \n", 1163 | " input_image = cv2.resize(image, (416, 416))\n", 1164 | " input_image = input_image / 255.\n", 1165 | " input_image = input_image[:,:,::-1]\n", 1166 | " input_image = np.expand_dims(input_image, 0)\n", 1167 | "\n", 1168 | " netout = model.predict([input_image, dummy_array])\n", 1169 | "\n", 1170 | " boxes = decode_netout(netout[0], \n", 1171 | " obj_threshold=0.3,\n", 1172 | " nms_threshold=NMS_THRESHOLD,\n", 1173 | " anchors=ANCHORS, \n", 1174 | " nb_class=CLASS)\n", 1175 | " image = draw_boxes(image, boxes, labels=LABELS)\n", 1176 | "\n", 1177 | " video_writer.write(np.uint8(image))\n", 1178 | " \n", 1179 | "video_reader.release()\n", 1180 | "video_writer.release() " 1181 | ] 1182 | } 1183 | ], 1184 | "metadata": { 1185 | "anaconda-cloud": {}, 1186 | "hide_input": false, 1187 | "kernelspec": { 1188 | "display_name": "Python 3", 1189 | "language": "python", 1190 | "name": "python3" 1191 | }, 1192 | "language_info": { 1193 | "codemirror_mode": { 1194 | "name": "ipython", 1195 | "version": 3 1196 | }, 1197 | "file_extension": ".py", 1198 | "mimetype": "text/x-python", 1199 | "name": "python", 1200 | "nbconvert_exporter": "python", 1201 | "pygments_lexer": "ipython3", 1202 | "version": "3.5.2" 1203 | }, 1204 | "toc": { 1205 | "nav_menu": { 1206 | "height": "122px", 1207 | "width": "252px" 1208 | }, 1209 | "number_sections": true, 1210 | "sideBar": true, 1211 | "skip_h1_title": false, 1212 | "toc_cell": false, 1213 | "toc_position": { 1214 | "height": "758px", 1215 | "left": "0px", 1216 | "right": "1096px", 1217 | "top": "73px", 1218 | "width": "253px" 1219 | }, 1220 | "toc_section_display": "block", 1221 | "toc_window_display": true 1222 | } 1223 | }, 1224 | "nbformat": 4, 1225 | 
"nbformat_minor": 1 1226 | } 1227 | --------------------------------------------------------------------------------