├── requirements.txt ├── test_images ├── testytest.jpg └── testytest_detected.jpg ├── .gitignore ├── config.json ├── LICENSE ├── predict.py ├── train.py ├── gen_anchors.py ├── README.md ├── utils.py ├── backend.py ├── preprocessing.py ├── frontend.py └── Yolo Step-by-Step.ipynb /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow-gpu==1.3 2 | keras==2.0.8 3 | imgaug 4 | opencv-python 5 | h5py 6 | -------------------------------------------------------------------------------- /test_images/testytest.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/department-for-transport/dftlab-yolo-vehiclecounting/master/test_images/testytest.jpg -------------------------------------------------------------------------------- /test_images/testytest_detected.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/department-for-transport/dftlab-yolo-vehiclecounting/master/test_images/testytest_detected.jpg -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #ignore weights 2 | *.h5 3 | 4 | #ignore training images 5 | /train_image 6 | 7 | #ignore training labels 8 | /train_labels 9 | 10 | #ignore pycache 11 | /__pycache__ 12 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "model" : { 3 | "backend": "Full Yolo", 4 | "input_size": 416, 5 | "anchors": [0.77,1.62, 1.11,1.74, 1.17,0.69, 1.73,1.60, 1.73,0.88], 6 | "max_box_per_image": 10, 7 | "labels": ["car"] 8 | }, 9 | 10 | "train": { 11 | "train_image_folder": "train_image/", 12 | "train_annot_folder": "test_labels/", 13 | 14 | "train_times": 1, 15 | "pretrained_weights": "", 16 | "batch_size": 16, 17 | "learning_rate": 1e-4, 18 | "nb_epochs": 30, 19 | "warmup_epochs": 3, 20 | 21 | "object_scale": 5.0, 22 | "no_object_scale": 1.0, 23 | "coord_scale": 1.0, 24 | "class_scale": 0.0, 25 | 26 | "saved_weights_name": "monday3.h5", 27 | "debug": true 28 | }, 29 | 30 | "valid": { 31 | "valid_image_folder": "", 32 | "valid_annot_folder": "", 33 | 34 | "valid_times": 1 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Ngoc Anh Huynh 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/predict.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | 
3 | import argparse
4 | import os
5 | import cv2
6 | import numpy as np
7 | from tqdm import tqdm
8 | from preprocessing import parse_annotation
9 | from utils import draw_boxes, count_boxes
10 | from frontend import YOLO
11 | import json
12 | 
13 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
14 | os.environ["CUDA_VISIBLE_DEVICES"]="0"
15 | 
16 | argparser = argparse.ArgumentParser(
17 |     description='Run a trained YOLO_v2 model on an image or a video')
18 | 
19 | argparser.add_argument(
20 |     '-c',
21 |     '--conf',
22 |     help='path to configuration file')
23 | 
24 | argparser.add_argument(
25 |     '-w',
26 |     '--weights',
27 |     help='path to pretrained weights')
28 | 
29 | argparser.add_argument(
30 |     '-i',
31 |     '--input',
32 |     help='path to an image or a video (mp4 format)')
33 | 
34 | def _main_(args):
35 |     config_path  = args.conf
36 |     weights_path = args.weights
37 |     image_path   = args.input
38 | 
39 |     with open(config_path) as config_buffer:
40 |         config = json.load(config_buffer)
41 | 
42 |     ###############################
43 |     # Make the model
44 |     ###############################
45 | 
46 |     yolo = YOLO(backend             = config['model']['backend'],
47 |                 input_size          = config['model']['input_size'],
48 |                 labels              = config['model']['labels'],
49 |                 max_box_per_image   = config['model']['max_box_per_image'],
50 |                 anchors             = config['model']['anchors'])
51 | 
52 |     ###############################
53 |     # Load trained weights
54 |     ###############################
55 | 
56 |     yolo.load_weights(weights_path)
57 | 
58 |     ###############################
59 |     # Predict bounding boxes
60 |     ###############################
61 | 
62 |     if image_path[-4:] == '.mp4':
63 |         video_out = image_path[:-4] + '_detected' + image_path[-4:]
64 |         video_reader = cv2.VideoCapture(image_path)
65 | 
66 |         nb_frames = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
67 |         frame_h = int(video_reader.get(cv2.CAP_PROP_FRAME_HEIGHT))
68 |         frame_w = int(video_reader.get(cv2.CAP_PROP_FRAME_WIDTH))
69 | 
70 |         video_writer = cv2.VideoWriter(video_out,
71 |                                cv2.VideoWriter_fourcc(*'MPEG'),
72 |                                50.0,
73 |                                (frame_w, frame_h))
74 |         count = 0
75 |         for i in tqdm(range(nb_frames)):
76 |             _, image = video_reader.read()
77 | 
78 |             boxes = yolo.predict(image)
79 |             if i > 0:
80 |                 image, count = count_boxes(image, boxes, config['model']['labels'], oldboxes, i, count)
81 | 
82 |             video_writer.write(np.uint8(image))
83 | 
84 |             oldboxes = boxes
85 | 
86 |         video_reader.release()
87 |         video_writer.release()
88 |     else:
89 |         image = cv2.imread(image_path)
90 |         boxes = yolo.predict(image)
91 |         image = draw_boxes(image, boxes, config['model']['labels'])
92 | 
93 |         print(len(boxes), 'boxes found')
94 | 
95 |         cv2.imwrite(image_path[:-4] + '_detected' + image_path[-4:], image)
96 | 
97 | if __name__ == '__main__':
98 |     args = argparser.parse_args()
99 |     _main_(args)
100 | 
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | #!
/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | import numpy as np 6 | from preprocessing import parse_annotation 7 | from frontend import YOLO 8 | import json 9 | 10 | os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 11 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 12 | 13 | argparser = argparse.ArgumentParser( 14 | description='Train and validate YOLO_v2 model on any dataset') 15 | 16 | argparser.add_argument( 17 | '-c', 18 | '--conf', 19 | help='path to configuration file') 20 | 21 | def _main_(args): 22 | config_path = args.conf 23 | 24 | with open(config_path) as config_buffer: 25 | config = json.loads(config_buffer.read()) 26 | 27 | ############################### 28 | # Parse the annotations 29 | ############################### 30 | 31 | # parse annotations of the training set 32 | train_imgs, train_labels = parse_annotation(config['train']['train_annot_folder'], 33 | config['train']['train_image_folder'], 34 | config['model']['labels']) 35 | 36 | # parse annotations of the validation set, if any, otherwise split the training set 37 | if os.path.exists(config['valid']['valid_annot_folder']): 38 | valid_imgs, valid_labels = parse_annotation(config['valid']['valid_annot_folder'], 39 | config['valid']['valid_image_folder'], 40 | config['model']['labels']) 41 | else: 42 | train_valid_split = int(0.8*len(train_imgs)) 43 | np.random.shuffle(train_imgs) 44 | 45 | valid_imgs = train_imgs[train_valid_split:] 46 | train_imgs = train_imgs[:train_valid_split] 47 | 48 | if len(config['model']['labels']) > 0: 49 | overlap_labels = set(config['model']['labels']).intersection(set(train_labels.keys())) 50 | 51 | print('Seen labels:\t', train_labels) 52 | print('Given labels:\t', config['model']['labels']) 53 | print('Overlap labels:\t', overlap_labels) 54 | 55 | if len(overlap_labels) < len(config['model']['labels']): 56 | print('Some labels have no annotations! Please revise the list of labels in the config.json file!') 57 | return 58 | else: 59 | print('No labels are provided. 
Train on all seen labels.') 60 | config['model']['labels'] = train_labels.keys() 61 | 62 | ############################### 63 | # Construct the model 64 | ############################### 65 | 66 | yolo = YOLO(backend = config['model']['backend'], 67 | input_size = config['model']['input_size'], 68 | labels = config['model']['labels'], 69 | max_box_per_image = config['model']['max_box_per_image'], 70 | anchors = config['model']['anchors']) 71 | 72 | ############################### 73 | # Load the pretrained weights (if any) 74 | ############################### 75 | 76 | if os.path.exists(config['train']['pretrained_weights']): 77 | print("Loading pre-trained weights in", config['train']['pretrained_weights']) 78 | yolo.load_weights(config['train']['pretrained_weights']) 79 | 80 | ############################### 81 | # Start the training process 82 | ############################### 83 | 84 | yolo.train(train_imgs = train_imgs, 85 | valid_imgs = valid_imgs, 86 | train_times = config['train']['train_times'], 87 | valid_times = config['valid']['valid_times'], 88 | nb_epochs = config['train']['nb_epochs'], 89 | learning_rate = config['train']['learning_rate'], 90 | batch_size = config['train']['batch_size'], 91 | warmup_epochs = config['train']['warmup_epochs'], 92 | object_scale = config['train']['object_scale'], 93 | no_object_scale = config['train']['no_object_scale'], 94 | coord_scale = config['train']['coord_scale'], 95 | class_scale = config['train']['class_scale'], 96 | saved_weights_name = config['train']['saved_weights_name'], 97 | debug = config['train']['debug']) 98 | 99 | if __name__ == '__main__': 100 | args = argparser.parse_args() 101 | _main_(args) 102 | -------------------------------------------------------------------------------- /gen_anchors.py: -------------------------------------------------------------------------------- 1 | import random 2 | import argparse 3 | import numpy as np 4 | 5 | from preprocessing import parse_annotation 6 | import json 7 | 8 | argparser = argparse.ArgumentParser() 9 | 10 | argparser.add_argument( 11 | '-c', 12 | '--conf', 13 | default='config.json', 14 | help='path to configuration file') 15 | 16 | argparser.add_argument( 17 | '-a', 18 | '--anchors', 19 | default=5, 20 | help='number of anchors to use') 21 | 22 | def IOU(ann, centroids): 23 | w, h = ann 24 | similarities = [] 25 | 26 | for centroid in centroids: 27 | c_w, c_h = centroid 28 | 29 | if c_w >= w and c_h >= h: 30 | similarity = w*h/(c_w*c_h) 31 | elif c_w >= w and c_h <= h: 32 | similarity = w*c_h/(w*h + (c_w-w)*c_h) 33 | elif c_w <= w and c_h >= h: 34 | similarity = c_w*h/(w*h + c_w*(c_h-h)) 35 | else: #means both w,h are bigger than c_w and c_h respectively 36 | similarity = (c_w*c_h)/(w*h) 37 | similarities.append(similarity) # will become (k,) shape 38 | 39 | return np.array(similarities) 40 | 41 | def avg_IOU(anns, centroids): 42 | n,d = anns.shape 43 | sum = 0. 
44 | 45 | for i in range(anns.shape[0]): 46 | sum+= max(IOU(anns[i], centroids)) 47 | 48 | return sum/n 49 | 50 | def print_anchors(centroids): 51 | anchors = centroids.copy() 52 | 53 | widths = anchors[:, 0] 54 | sorted_indices = np.argsort(widths) 55 | 56 | r = "anchors: [" 57 | for i in sorted_indices[:-1]: 58 | r += '%0.2f,%0.2f, ' % (anchors[i,0], anchors[i,1]) 59 | 60 | #there should not be comma after last anchor, that's why 61 | r += '%0.2f,%0.2f' % (anchors[sorted_indices[-1:],0], anchors[sorted_indices[-1:],1]) 62 | r += "]" 63 | 64 | print(r) 65 | 66 | def run_kmeans(ann_dims, anchor_num): 67 | ann_num = ann_dims.shape[0] 68 | iterations = 0 69 | prev_assignments = np.ones(ann_num)*(-1) 70 | iteration = 0 71 | old_distances = np.zeros((ann_num, anchor_num)) 72 | 73 | indices = [random.randrange(ann_dims.shape[0]) for i in range(anchor_num)] 74 | centroids = ann_dims[indices] 75 | anchor_dim = ann_dims.shape[1] 76 | 77 | while True: 78 | distances = [] 79 | iteration += 1 80 | for i in range(ann_num): 81 | d = 1 - IOU(ann_dims[i], centroids) 82 | distances.append(d) 83 | distances = np.array(distances) # distances.shape = (ann_num, anchor_num) 84 | 85 | print("iteration {}: dists = {}".format(iteration, np.sum(np.abs(old_distances-distances)))) 86 | 87 | #assign samples to centroids 88 | assignments = np.argmin(distances,axis=1) 89 | 90 | if (assignments == prev_assignments).all() : 91 | return centroids 92 | 93 | #calculate new centroids 94 | centroid_sums=np.zeros((anchor_num, anchor_dim), np.float) 95 | for i in range(ann_num): 96 | centroid_sums[assignments[i]]+=ann_dims[i] 97 | for j in range(anchor_num): 98 | centroids[j] = centroid_sums[j]/(np.sum(assignments==j) + 1e-6) 99 | 100 | prev_assignments = assignments.copy() 101 | old_distances = distances.copy() 102 | 103 | def main(argv): 104 | config_path = args.conf 105 | num_anchors = args.anchors 106 | 107 | with open(config_path) as config_buffer: 108 | config = json.loads(config_buffer.read()) 109 | 110 | train_imgs, train_labels = parse_annotation(config['train']['train_annot_folder'], 111 | config['train']['train_image_folder'], 112 | config['model']['labels']) 113 | 114 | grid_w = config['model']['input_size']/32 115 | grid_h = config['model']['input_size']/32 116 | 117 | # run k_mean to find the anchors 118 | annotation_dims = [] 119 | for image in train_imgs: 120 | cell_w = image['width']/grid_w 121 | cell_h = image['height']/grid_h 122 | 123 | for obj in image['object']: 124 | relative_w = (float(obj['xmax']) - float(obj['xmin']))/cell_w 125 | relatice_h = (float(obj["ymax"]) - float(obj['ymin']))/cell_h 126 | annotation_dims.append(tuple(map(float, (relative_w,relatice_h)))) 127 | 128 | annotation_dims = np.array(annotation_dims) 129 | centroids = run_kmeans(annotation_dims, num_anchors) 130 | 131 | # write anchors to file 132 | print('\naverage IOU for', num_anchors, 'anchors:', '%0.2f' % avg_IOU(annotation_dims, centroids)) 133 | print_anchors(centroids) 134 | 135 | if __name__ == '__main__': 136 | args = argparser.parse_args() 137 | main(args) 138 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #DfT Lab - Counting Vehicles from satellite/aerial imagery/video 2 | 3 | 4 | This repo contains the implementation of YOLOv2 in Keras with Tensorflow backend. It supports training YOLOv2 network with various backends such as MobileNet and InceptionV3. 
Thanks to Experiencor for the excellent implementation; the original repo is at https://github.com/experiencor/keras-yolo2
5 | 
6 | Links to our training set and trained weights are below.
7 | 
8 | You can see it working on video at https://list.ly/list/2B7T-a-list-of-everything-the-dft-lab-does-and-has-done
9 | 
10 | and at https://www.youtube.com/watch?v=iOcHr77708E
11 | 
12 | 
13 | ## Usage for Python code
14 | 
15 | ### 0. Requirements
16 | 
17 | Check out requirements.txt
18 | 
19 | WARNING - if you're going to train this, you need a good Nvidia GPU with CUDA and cuDNN installed (https://www.tensorflow.org/install/install_linux). Note that we're using tensorflow-gpu 1.3!
20 | 
21 | It should predict on most machines though!
22 | 
23 | ### 1. Data preparation
24 | Download the VEDAI dataset from https://github.com/nikitalpopov/vedai
25 | 
26 | Organize the dataset into 4 folders:
27 | 
28 | + train_image_folder <= the folder that contains the training images.
29 | 
30 | + train_annot_folder <= the folder that contains the training annotations in VOC format.
31 | 
32 | + valid_image_folder <= the folder that contains the validation images.
33 | 
34 | + valid_annot_folder <= the folder that contains the validation annotations in VOC format.
35 | 
36 | There is a one-to-one correspondence by file name between images and annotations (a minimal example annotation is shown in the appendix at the end of this README). If the validation set is empty, the training set will automatically be split into training and validation sets using a ratio of 0.8.
37 | 
38 | ### 2. Edit the configuration file
39 | The configuration file is a JSON file, which looks like this:
40 | 
41 | ```python
42 | {
43 |     "model" : {
44 |         "backend":              "Full Yolo",    # "Tiny Yolo" or "Full Yolo" or "MobileNet" or "SqueezeNet" or "Inception3"
45 |         "input_size":           416,
46 |         "anchors":              [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828],
47 |         "max_box_per_image":    10,
48 |         "labels":               ["vehicle"]
49 |     },
50 | 
51 |     "train": {
52 |         "train_image_folder":   "/home/andy/data/raccoon_dataset/images/",
53 |         "train_annot_folder":   "/home/andy/data/raccoon_dataset/anns/",
54 | 
55 |         "train_times":          10,     # the number of times to cycle through the training set, useful for small datasets
56 |         "pretrained_weights":   "",     # specify the path of the pretrained weights, but it's fine to start from scratch
57 |         "batch_size":           16,     # the number of images to read in each batch
58 |         "learning_rate":        1e-4,   # the base learning rate of the default Adam rate scheduler
59 |         "nb_epochs":            50,     # number of epochs
60 |         "warmup_epochs":        3,      # the number of initial epochs during which the sizes of the 5 boxes in each cell are forced to match the sizes of the 5 anchors; this trick seems to improve precision empirically
61 | 
62 |         "object_scale":         5.0,    # determines how much to penalize wrong prediction of confidence of object predictors
63 |         "no_object_scale":      1.0,    # determines how much to penalize wrong prediction of confidence of non-object predictors
64 |         "coord_scale":          1.0,    # determines how much to penalize wrong position and size predictions (x, y, w, h)
65 |         "class_scale":          1.0,    # determines how much to penalize wrong class prediction
66 | 
67 |         "debug":                true    # turn on/off the line that prints current confidence, position, size, class losses and recall
68 |     },
69 | 
70 |     "valid": {
71 |         "valid_image_folder":   "",
72 |         "valid_annot_folder":   "",
73 | 
74 |         "valid_times":          1
75 |     }
76 | }
77 | 
78 | ```
79 | 
80 | The model section defines the type of model to construct, as well as other model parameters such as the input image size and the list of anchors. The ```labels``` setting lists the labels to train on; only images containing the listed labels are fed to the network.
81 | 
82 | Download pretrained weights for the backend (Tiny Yolo, Full Yolo, SqueezeNet, MobileNet, and InceptionV3) at:
83 | 
84 | https://1drv.ms/f/s!ApLdDEW3ut5fec2OzK4S4RpT-SU
85 | 
86 | **These weights must be put in the root folder of the repository if you want to train the network. They are the pretrained weights for the backend only and will be loaded during model creation. The code does not work without these weights.**
87 | 
88 | The pretrained weights for the whole model (both frontend and backend) of the vehicle detector can be downloaded from:
89 | 
90 | https://storage.googleapis.com/cudnnfreight/trainedweights.h5
91 | 
92 | ### 3. Generate anchors for your dataset (optional)
93 | 
94 | `python gen_anchors.py -c config.json`
95 | 
96 | Copy the generated anchors printed on the terminal to the ```anchors``` setting in ```config.json```.
97 | 
98 | ### 4. Start the training process
99 | 
100 | `python train.py -c config.json`
101 | 
102 | 
103 | 
104 | By the end of this process, the code will write the weights of the best model to the file best_weights.h5 (or whatever name is specified in the "saved_weights_name" setting in config.json). The training process stops when the loss on the validation set has not improved for 3 consecutive epochs.
105 | 
106 | ### 5. Perform detection using trained weights on an image by running
107 | `python predict.py -c config.json -w /path/to/best_weights.h5 -i /path/to/image/or/video`
108 | 
109 | It carries out detection on the image and writes a copy with the detected bounding boxes to the same folder. If you're feeding it videos, it will endeavour to count the unique vehicles (it does this with a slightly crude collision check; the code is in utils.py). A short programmatic example of running detection from your own Python code is given in the appendix at the end of this README.
110 | 
111 | Note that the model resizes images to 416x416 (you could change this, but you would need to alter the network architecture too), so don't feed it big images in which each vehicle ends up as a little smudge of pixels after resizing - it won't detect these! If it's not making predictions, try tinkering with the level of zoom on each image, or with the threshold values in utils.py.
112 | 
113 | ## Usage for the Jupyter notebook
114 | 
115 | Refer to the notebook (https://github.com/experiencor/basic-yolo-keras/blob/master/Yolo%20Step-by-Step.ipynb) for a complete walk-through implementation of YOLOv2 from scratch (training, testing, and scoring).
116 | 
117 | 
118 | ## Copyright
119 | 
120 | See [LICENSE](LICENSE) for details.
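## Appendix: examples

The annotation parser (`parse_annotation` in preprocessing.py) only needs a handful of VOC fields per image: the file name, the image width and height, and one `<object>` block per vehicle. A minimal annotation might look like the sketch below (the file name, size and coordinates are made-up placeholder values):

```xml
<annotation>
    <filename>00000001.png</filename>
    <size>
        <width>1024</width>
        <height>1024</height>
        <depth>3</depth>
    </size>
    <object>
        <name>vehicle</name>
        <bndbox>
            <xmin>412</xmin>
            <ymin>300</ymin>
            <xmax>440</xmax>
            <ymax>326</ymax>
        </bndbox>
    </object>
</annotation>
```

If you would rather run detection from your own Python code than via predict.py, the sketch below mirrors what predict.py does for a single image; the weights file and image path are placeholders, so substitute your own:

```python
import json
import cv2
from frontend import YOLO
from utils import draw_boxes

with open('config.json') as config_buffer:
    config = json.load(config_buffer)

# build the model from the same settings predict.py uses
yolo = YOLO(backend           = config['model']['backend'],
            input_size        = config['model']['input_size'],
            labels            = config['model']['labels'],
            max_box_per_image = config['model']['max_box_per_image'],
            anchors           = config['model']['anchors'])

yolo.load_weights('trainedweights.h5')            # placeholder: path to your trained weights

image = cv2.imread('test_images/testytest.jpg')   # placeholder: your test image
boxes = yolo.predict(image)
print(len(boxes), 'boxes found')

# draw_boxes returns the image with the detected boxes and labels drawn on it
cv2.imwrite('testytest_detected.jpg', draw_boxes(image, boxes, config['model']['labels']))
```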
121 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import xml.etree.ElementTree as ET 4 | import tensorflow as tf 5 | import copy 6 | import cv2 7 | 8 | class BoundBox: 9 | def __init__(self, xmin, ymin, xmax, ymax, c = None, classes = None): 10 | self.xmin = xmin 11 | self.ymin = ymin 12 | self.xmax = xmax 13 | self.ymax = ymax 14 | 15 | self.c = c 16 | self.classes = classes 17 | 18 | self.label = -1 19 | self.score = -1 20 | 21 | def get_label(self): 22 | if self.label == -1: 23 | self.label = np.argmax(self.classes) 24 | 25 | return self.label 26 | 27 | def get_score(self): 28 | if self.score == -1: 29 | self.score = self.classes[self.get_label()] 30 | 31 | return self.score 32 | 33 | class WeightReader: 34 | def __init__(self, weight_file): 35 | self.offset = 4 36 | self.all_weights = np.fromfile(weight_file, dtype='float32') 37 | 38 | def read_bytes(self, size): 39 | self.offset = self.offset + size 40 | return self.all_weights[self.offset-size:self.offset] 41 | 42 | def reset(self): 43 | self.offset = 4 44 | 45 | def bbox_iou(box1, box2): 46 | intersect_w = _interval_overlap([box1.xmin, box1.xmax], [box2.xmin, box2.xmax]) 47 | intersect_h = _interval_overlap([box1.ymin, box1.ymax], [box2.ymin, box2.ymax]) 48 | 49 | intersect = intersect_w * intersect_h 50 | 51 | w1, h1 = box1.xmax-box1.xmin, box1.ymax-box1.ymin 52 | w2, h2 = box2.xmax-box2.xmin, box2.ymax-box2.ymin 53 | 54 | union = w1*h1 + w2*h2 - intersect 55 | 56 | return float(intersect) / union 57 | 58 | def count_boxes(image, boxes, labels, oldboxes, framenumber, count): 59 | image_h, image_w, _ = image.shape 60 | 61 | for box in boxes: 62 | xmin = int(box.xmin*image_w) 63 | ymin = int(box.ymin*image_h) 64 | xmax = int(box.xmax*image_w) 65 | ymax = int(box.ymax*image_h) 66 | 67 | 68 | cv2.rectangle(image, (xmin,ymin), (xmax,ymax), (0,255,0), 3) 69 | 70 | 71 | if framenumber % 5 == 0: 72 | collisions = 0 73 | for oldbox in oldboxes: 74 | 75 | xmin2 = int(oldbox.xmin*image_w) 76 | ymin2 = int(oldbox.ymin*image_h) 77 | xmax2 = int(oldbox.xmax*image_w) 78 | ymax2 = int(oldbox.ymax*image_h) 79 | 80 | if xmax > xmin2 and xmin < xmax2 and ymax > ymin2 and ymin obj_threshold 117 | 118 | for row in range(grid_h): 119 | for col in range(grid_w): 120 | for b in range(nb_box): 121 | # from 4th element onwards are confidence and class classes 122 | classes = netout[row,col,b,5:] 123 | 124 | if np.sum(classes) > 0: 125 | # first 4 elements are x, y, w, and h 126 | x, y, w, h = netout[row,col,b,:4] 127 | 128 | x = (col + _sigmoid(x)) / grid_w # center position, unit: image width 129 | y = (row + _sigmoid(y)) / grid_h # center position, unit: image height 130 | w = anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width 131 | h = anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height 132 | confidence = netout[row,col,b,4] 133 | 134 | box = BoundBox(x-w/2, y-h/2, x+w/2, y+h/2, confidence, classes) 135 | 136 | boxes.append(box) 137 | 138 | # suppress non-maximal boxes 139 | for c in range(nb_class): 140 | sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes]))) 141 | 142 | for i in range(len(sorted_indices)): 143 | index_i = sorted_indices[i] 144 | 145 | if boxes[index_i].classes[c] == 0: 146 | continue 147 | else: 148 | for j in range(i+1, len(sorted_indices)): 149 | index_j = sorted_indices[j] 150 | 151 | if bbox_iou(boxes[index_i], 
boxes[index_j]) >= nms_threshold: 152 | boxes[index_j].classes[c] = 0 153 | 154 | # remove the boxes which are less likely than a obj_threshold 155 | boxes = [box for box in boxes if box.get_score() > obj_threshold] 156 | 157 | return boxes 158 | 159 | def compute_overlap(a, b): 160 | """ 161 | Code originally from https://github.com/rbgirshick/py-faster-rcnn. 162 | Parameters 163 | ---------- 164 | a: (N, 4) ndarray of float 165 | b: (K, 4) ndarray of float 166 | Returns 167 | ------- 168 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 169 | """ 170 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 171 | 172 | iw = np.minimum(np.expand_dims(a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0]) 173 | ih = np.minimum(np.expand_dims(a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1]) 174 | 175 | iw = np.maximum(iw, 0) 176 | ih = np.maximum(ih, 0) 177 | 178 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 179 | 180 | ua = np.maximum(ua, np.finfo(float).eps) 181 | 182 | intersection = iw * ih 183 | 184 | return intersection / ua 185 | 186 | def compute_ap(recall, precision): 187 | """ Compute the average precision, given the recall and precision curves. 188 | Code originally from https://github.com/rbgirshick/py-faster-rcnn. 189 | 190 | # Arguments 191 | recall: The recall curve (list). 192 | precision: The precision curve (list). 193 | # Returns 194 | The average precision as computed in py-faster-rcnn. 195 | """ 196 | # correct AP calculation 197 | # first append sentinel values at the end 198 | mrec = np.concatenate(([0.], recall, [1.])) 199 | mpre = np.concatenate(([0.], precision, [0.])) 200 | 201 | # compute the precision envelope 202 | for i in range(mpre.size - 1, 0, -1): 203 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 204 | 205 | # to calculate area under PR curve, look for points 206 | # where X axis (recall) changes value 207 | i = np.where(mrec[1:] != mrec[:-1])[0] 208 | 209 | # and sum (\Delta recall) * prec 210 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 211 | return ap 212 | 213 | def _interval_overlap(interval_a, interval_b): 214 | x1, x2 = interval_a 215 | x3, x4 = interval_b 216 | 217 | if x3 < x1: 218 | if x4 < x1: 219 | return 0 220 | else: 221 | return min(x2,x4) - x1 222 | else: 223 | if x2 < x3: 224 | return 0 225 | else: 226 | return min(x2,x4) - x3 227 | 228 | def _sigmoid(x): 229 | return 1. / (1. 
+ np.exp(-x)) 230 | 231 | def _softmax(x, axis=-1, t=-100.): 232 | x = x - np.max(x) 233 | 234 | if np.min(x) < t: 235 | x = x/np.min(x)*t 236 | 237 | e_x = np.exp(x) 238 | 239 | return e_x / e_x.sum(axis, keepdims=True) 240 | -------------------------------------------------------------------------------- /backend.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | import tensorflow as tf 3 | from keras.layers import Reshape, Activation, Conv2D, Input, MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda 4 | from keras.layers.advanced_activations import LeakyReLU 5 | from keras.layers.merge import concatenate 6 | from keras.applications.mobilenet import MobileNet 7 | from keras.applications import InceptionV3 8 | from keras.applications.vgg16 import VGG16 9 | from keras.applications.resnet50 import ResNet50 10 | 11 | FULL_YOLO_BACKEND_PATH = "full_yolo_backend.h5" # should be hosted on a server 12 | TINY_YOLO_BACKEND_PATH = "tiny_yolo_backend.h5" # should be hosted on a server 13 | SQUEEZENET_BACKEND_PATH = "squeezenet_backend.h5" # should be hosted on a server 14 | MOBILENET_BACKEND_PATH = "mobilenet_backend.h5" # should be hosted on a server 15 | INCEPTION3_BACKEND_PATH = "inception_backend.h5" # should be hosted on a server 16 | VGG16_BACKEND_PATH = "vgg16_backend.h5" # should be hosted on a server 17 | RESNET50_BACKEND_PATH = "resnet50_backend.h5" # should be hosted on a server 18 | 19 | class BaseFeatureExtractor(object): 20 | """docstring for ClassName""" 21 | 22 | # to be defined in each subclass 23 | def __init__(self, input_size): 24 | raise NotImplementedError("error message") 25 | 26 | # to be defined in each subclass 27 | def normalize(self, image): 28 | raise NotImplementedError("error message") 29 | 30 | def get_output_shape(self): 31 | return self.feature_extractor.get_output_shape_at(-1)[1:3] 32 | 33 | def extract(self, input_image): 34 | return self.feature_extractor(input_image) 35 | 36 | class FullYoloFeature(BaseFeatureExtractor): 37 | """docstring for ClassName""" 38 | def __init__(self, input_size): 39 | input_image = Input(shape=(input_size, input_size, 3)) 40 | 41 | # the function to implement the orgnization layer (thanks to github.com/allanzelener/YAD2K) 42 | def space_to_depth_x2(x): 43 | return tf.space_to_depth(x, block_size=2) 44 | 45 | # Layer 1 46 | x = Conv2D(32, (3,3), strides=(1,1), padding='same', name='conv_1', use_bias=False)(input_image) 47 | x = BatchNormalization(name='norm_1')(x) 48 | x = LeakyReLU(alpha=0.1)(x) 49 | x = MaxPooling2D(pool_size=(2, 2))(x) 50 | 51 | # Layer 2 52 | x = Conv2D(64, (3,3), strides=(1,1), padding='same', name='conv_2', use_bias=False)(x) 53 | x = BatchNormalization(name='norm_2')(x) 54 | x = LeakyReLU(alpha=0.1)(x) 55 | x = MaxPooling2D(pool_size=(2, 2))(x) 56 | 57 | # Layer 3 58 | x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_3', use_bias=False)(x) 59 | x = BatchNormalization(name='norm_3')(x) 60 | x = LeakyReLU(alpha=0.1)(x) 61 | 62 | # Layer 4 63 | x = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_4', use_bias=False)(x) 64 | x = BatchNormalization(name='norm_4')(x) 65 | x = LeakyReLU(alpha=0.1)(x) 66 | 67 | # Layer 5 68 | x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_5', use_bias=False)(x) 69 | x = BatchNormalization(name='norm_5')(x) 70 | x = LeakyReLU(alpha=0.1)(x) 71 | x = MaxPooling2D(pool_size=(2, 2))(x) 72 | 73 | # Layer 6 74 | x = Conv2D(256, (3,3), strides=(1,1), padding='same', 
name='conv_6', use_bias=False)(x) 75 | x = BatchNormalization(name='norm_6')(x) 76 | x = LeakyReLU(alpha=0.1)(x) 77 | 78 | # Layer 7 79 | x = Conv2D(128, (1,1), strides=(1,1), padding='same', name='conv_7', use_bias=False)(x) 80 | x = BatchNormalization(name='norm_7')(x) 81 | x = LeakyReLU(alpha=0.1)(x) 82 | 83 | # Layer 8 84 | x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_8', use_bias=False)(x) 85 | x = BatchNormalization(name='norm_8')(x) 86 | x = LeakyReLU(alpha=0.1)(x) 87 | x = MaxPooling2D(pool_size=(2, 2))(x) 88 | 89 | # Layer 9 90 | x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_9', use_bias=False)(x) 91 | x = BatchNormalization(name='norm_9')(x) 92 | x = LeakyReLU(alpha=0.1)(x) 93 | 94 | # Layer 10 95 | x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_10', use_bias=False)(x) 96 | x = BatchNormalization(name='norm_10')(x) 97 | x = LeakyReLU(alpha=0.1)(x) 98 | 99 | # Layer 11 100 | x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_11', use_bias=False)(x) 101 | x = BatchNormalization(name='norm_11')(x) 102 | x = LeakyReLU(alpha=0.1)(x) 103 | 104 | # Layer 12 105 | x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_12', use_bias=False)(x) 106 | x = BatchNormalization(name='norm_12')(x) 107 | x = LeakyReLU(alpha=0.1)(x) 108 | 109 | # Layer 13 110 | x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_13', use_bias=False)(x) 111 | x = BatchNormalization(name='norm_13')(x) 112 | x = LeakyReLU(alpha=0.1)(x) 113 | 114 | skip_connection = x 115 | 116 | x = MaxPooling2D(pool_size=(2, 2))(x) 117 | 118 | # Layer 14 119 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_14', use_bias=False)(x) 120 | x = BatchNormalization(name='norm_14')(x) 121 | x = LeakyReLU(alpha=0.1)(x) 122 | 123 | # Layer 15 124 | x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_15', use_bias=False)(x) 125 | x = BatchNormalization(name='norm_15')(x) 126 | x = LeakyReLU(alpha=0.1)(x) 127 | 128 | # Layer 16 129 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_16', use_bias=False)(x) 130 | x = BatchNormalization(name='norm_16')(x) 131 | x = LeakyReLU(alpha=0.1)(x) 132 | 133 | # Layer 17 134 | x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_17', use_bias=False)(x) 135 | x = BatchNormalization(name='norm_17')(x) 136 | x = LeakyReLU(alpha=0.1)(x) 137 | 138 | # Layer 18 139 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_18', use_bias=False)(x) 140 | x = BatchNormalization(name='norm_18')(x) 141 | x = LeakyReLU(alpha=0.1)(x) 142 | 143 | # Layer 19 144 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_19', use_bias=False)(x) 145 | x = BatchNormalization(name='norm_19')(x) 146 | x = LeakyReLU(alpha=0.1)(x) 147 | 148 | # Layer 20 149 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_20', use_bias=False)(x) 150 | x = BatchNormalization(name='norm_20')(x) 151 | x = LeakyReLU(alpha=0.1)(x) 152 | 153 | # Layer 21 154 | skip_connection = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_21', use_bias=False)(skip_connection) 155 | skip_connection = BatchNormalization(name='norm_21')(skip_connection) 156 | skip_connection = LeakyReLU(alpha=0.1)(skip_connection) 157 | skip_connection = Lambda(space_to_depth_x2)(skip_connection) 158 | 159 | x = concatenate([skip_connection, x]) 160 | 161 | # Layer 22 162 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_22', use_bias=False)(x) 163 | x = 
BatchNormalization(name='norm_22')(x) 164 | x = LeakyReLU(alpha=0.1)(x) 165 | 166 | self.feature_extractor = Model(input_image, x) 167 | self.feature_extractor.load_weights(FULL_YOLO_BACKEND_PATH) 168 | 169 | def normalize(self, image): 170 | return image / 255. 171 | 172 | class TinyYoloFeature(BaseFeatureExtractor): 173 | """docstring for ClassName""" 174 | def __init__(self, input_size): 175 | input_image = Input(shape=(input_size, input_size, 3)) 176 | 177 | # Layer 1 178 | x = Conv2D(16, (3,3), strides=(1,1), padding='same', name='conv_1', use_bias=False)(input_image) 179 | x = BatchNormalization(name='norm_1')(x) 180 | x = LeakyReLU(alpha=0.1)(x) 181 | x = MaxPooling2D(pool_size=(2, 2))(x) 182 | 183 | # Layer 2 - 5 184 | for i in range(0,4): 185 | x = Conv2D(32*(2**i), (3,3), strides=(1,1), padding='same', name='conv_' + str(i+2), use_bias=False)(x) 186 | x = BatchNormalization(name='norm_' + str(i+2))(x) 187 | x = LeakyReLU(alpha=0.1)(x) 188 | x = MaxPooling2D(pool_size=(2, 2))(x) 189 | 190 | # Layer 6 191 | x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_6', use_bias=False)(x) 192 | x = BatchNormalization(name='norm_6')(x) 193 | x = LeakyReLU(alpha=0.1)(x) 194 | x = MaxPooling2D(pool_size=(2, 2), strides=(1,1), padding='same')(x) 195 | 196 | # Layer 7 - 8 197 | for i in range(0,2): 198 | x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_' + str(i+7), use_bias=False)(x) 199 | x = BatchNormalization(name='norm_' + str(i+7))(x) 200 | x = LeakyReLU(alpha=0.1)(x) 201 | 202 | self.feature_extractor = Model(input_image, x) 203 | self.feature_extractor.load_weights(TINY_YOLO_BACKEND_PATH) 204 | 205 | def normalize(self, image): 206 | return image / 255. 207 | 208 | class MobileNetFeature(BaseFeatureExtractor): 209 | """docstring for ClassName""" 210 | def __init__(self, input_size): 211 | input_image = Input(shape=(input_size, input_size, 3)) 212 | 213 | mobilenet = MobileNet(input_shape=(224,224,3), include_top=False) 214 | mobilenet.load_weights(MOBILENET_BACKEND_PATH) 215 | 216 | x = mobilenet(input_image) 217 | 218 | self.feature_extractor = Model(input_image, x) 219 | 220 | def normalize(self, image): 221 | image = image / 255. 222 | image = image - 0.5 223 | image = image * 2. 
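        # the three lines above rescale pixel values from [0, 255] to [-1, 1], the input range the pretrained MobileNet expects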
224 | 225 | return image 226 | 227 | class SqueezeNetFeature(BaseFeatureExtractor): 228 | """docstring for ClassName""" 229 | def __init__(self, input_size): 230 | 231 | # define some auxiliary variables and the fire module 232 | sq1x1 = "squeeze1x1" 233 | exp1x1 = "expand1x1" 234 | exp3x3 = "expand3x3" 235 | relu = "relu_" 236 | 237 | def fire_module(x, fire_id, squeeze=16, expand=64): 238 | s_id = 'fire' + str(fire_id) + '/' 239 | 240 | x = Conv2D(squeeze, (1, 1), padding='valid', name=s_id + sq1x1)(x) 241 | x = Activation('relu', name=s_id + relu + sq1x1)(x) 242 | 243 | left = Conv2D(expand, (1, 1), padding='valid', name=s_id + exp1x1)(x) 244 | left = Activation('relu', name=s_id + relu + exp1x1)(left) 245 | 246 | right = Conv2D(expand, (3, 3), padding='same', name=s_id + exp3x3)(x) 247 | right = Activation('relu', name=s_id + relu + exp3x3)(right) 248 | 249 | x = concatenate([left, right], axis=3, name=s_id + 'concat') 250 | 251 | return x 252 | 253 | # define the model of SqueezeNet 254 | input_image = Input(shape=(input_size, input_size, 3)) 255 | 256 | x = Conv2D(64, (3, 3), strides=(2, 2), padding='valid', name='conv1')(input_image) 257 | x = Activation('relu', name='relu_conv1')(x) 258 | x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool1')(x) 259 | 260 | x = fire_module(x, fire_id=2, squeeze=16, expand=64) 261 | x = fire_module(x, fire_id=3, squeeze=16, expand=64) 262 | x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool3')(x) 263 | 264 | x = fire_module(x, fire_id=4, squeeze=32, expand=128) 265 | x = fire_module(x, fire_id=5, squeeze=32, expand=128) 266 | x = MaxPooling2D(pool_size=(3, 3), strides=(2, 2), name='pool5')(x) 267 | 268 | x = fire_module(x, fire_id=6, squeeze=48, expand=192) 269 | x = fire_module(x, fire_id=7, squeeze=48, expand=192) 270 | x = fire_module(x, fire_id=8, squeeze=64, expand=256) 271 | x = fire_module(x, fire_id=9, squeeze=64, expand=256) 272 | 273 | self.feature_extractor = Model(input_image, x) 274 | self.feature_extractor.load_weights(SQUEEZENET_BACKEND_PATH) 275 | 276 | def normalize(self, image): 277 | image = image[..., ::-1] 278 | image = image.astype('float') 279 | 280 | image[..., 0] -= 103.939 281 | image[..., 1] -= 116.779 282 | image[..., 2] -= 123.68 283 | 284 | return image 285 | 286 | class Inception3Feature(BaseFeatureExtractor): 287 | """docstring for ClassName""" 288 | def __init__(self, input_size): 289 | input_image = Input(shape=(input_size, input_size, 3)) 290 | 291 | inception = InceptionV3(input_shape=(input_size,input_size,3), include_top=False) 292 | inception.load_weights(INCEPTION3_BACKEND_PATH) 293 | 294 | x = inception(input_image) 295 | 296 | self.feature_extractor = Model(input_image, x) 297 | 298 | def normalize(self, image): 299 | image = image / 255. 300 | image = image - 0.5 301 | image = image * 2. 
302 | 303 | return image 304 | 305 | class VGG16Feature(BaseFeatureExtractor): 306 | """docstring for ClassName""" 307 | def __init__(self, input_size): 308 | vgg16 = VGG16(input_shape=(input_size, input_size, 3), include_top=False) 309 | #vgg16.load_weights(VGG16_BACKEND_PATH) 310 | 311 | self.feature_extractor = vgg16 312 | 313 | def normalize(self, image): 314 | image = image[..., ::-1] 315 | image = image.astype('float') 316 | 317 | image[..., 0] -= 103.939 318 | image[..., 1] -= 116.779 319 | image[..., 2] -= 123.68 320 | 321 | return image 322 | 323 | class ResNet50Feature(BaseFeatureExtractor): 324 | """docstring for ClassName""" 325 | def __init__(self, input_size): 326 | resnet50 = ResNet50(input_shape=(input_size, input_size, 3), include_top=False) 327 | resnet50.layers.pop() # remove the average pooling layer 328 | #resnet50.load_weights(RESNET50_BACKEND_PATH) 329 | 330 | self.feature_extractor = Model(resnet50.layers[0].input, resnet50.layers[-1].output) 331 | 332 | def normalize(self, image): 333 | image = image[..., ::-1] 334 | image = image.astype('float') 335 | 336 | image[..., 0] -= 103.939 337 | image[..., 1] -= 116.779 338 | image[..., 2] -= 123.68 339 | 340 | return image 341 | -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import copy 4 | import numpy as np 5 | import imgaug as ia 6 | from imgaug import augmenters as iaa 7 | from keras.utils import Sequence 8 | import xml.etree.ElementTree as ET 9 | from utils import BoundBox, bbox_iou 10 | 11 | def parse_annotation(ann_dir, img_dir, labels=[]): 12 | all_imgs = [] 13 | seen_labels = {} 14 | 15 | for ann in sorted(os.listdir(ann_dir)): 16 | img = {'object':[]} 17 | 18 | tree = ET.parse(ann_dir + ann) 19 | 20 | for elem in tree.iter(): 21 | if 'filename' in elem.tag: 22 | img['filename'] = img_dir + elem.text 23 | if 'width' in elem.tag: 24 | img['width'] = int(elem.text) 25 | if 'height' in elem.tag: 26 | img['height'] = int(elem.text) 27 | if 'object' in elem.tag or 'part' in elem.tag: 28 | obj = {} 29 | 30 | for attr in list(elem): 31 | if 'name' in attr.tag: 32 | obj['name'] = attr.text 33 | 34 | if obj['name'] in seen_labels: 35 | seen_labels[obj['name']] += 1 36 | else: 37 | seen_labels[obj['name']] = 1 38 | 39 | if len(labels) > 0 and obj['name'] not in labels: 40 | break 41 | else: 42 | img['object'] += [obj] 43 | 44 | if 'bndbox' in attr.tag: 45 | for dim in list(attr): 46 | if 'xmin' in dim.tag: 47 | obj['xmin'] = int(round(float(dim.text))) 48 | if 'ymin' in dim.tag: 49 | obj['ymin'] = int(round(float(dim.text))) 50 | if 'xmax' in dim.tag: 51 | obj['xmax'] = int(round(float(dim.text))) 52 | if 'ymax' in dim.tag: 53 | obj['ymax'] = int(round(float(dim.text))) 54 | 55 | if len(img['object']) > 0: 56 | all_imgs += [img] 57 | 58 | return all_imgs, seen_labels 59 | 60 | class BatchGenerator(Sequence): 61 | def __init__(self, images, 62 | config, 63 | shuffle=True, 64 | jitter=True, 65 | norm=None): 66 | self.generator = None 67 | 68 | self.images = images 69 | self.config = config 70 | 71 | self.shuffle = shuffle 72 | self.jitter = jitter 73 | self.norm = norm 74 | 75 | self.anchors = [BoundBox(0, 0, config['ANCHORS'][2*i], config['ANCHORS'][2*i+1]) for i in range(int(len(config['ANCHORS'])//2))] 76 | 77 | ### augmentors by https://github.com/aleju/imgaug 78 | sometimes = lambda aug: iaa.Sometimes(0.5, aug) 79 | 80 | # Define our sequence of 
augmentation steps that will be applied to every image 81 | # All augmenters with per_channel=0.5 will sample one value _per image_ 82 | # in 50% of all cases. In all other cases they will sample new values 83 | # _per channel_. 84 | self.aug_pipe = iaa.Sequential( 85 | [ 86 | # apply the following augmenters to most images 87 | #iaa.Fliplr(0.5), # horizontally flip 50% of all images 88 | #iaa.Flipud(0.2), # vertically flip 20% of all images 89 | #sometimes(iaa.Crop(percent=(0, 0.1))), # crop images by 0-10% of their height/width 90 | sometimes(iaa.Affine( 91 | #scale={"x": (0.8, 1.2), "y": (0.8, 1.2)}, # scale images to 80-120% of their size, individually per axis 92 | #translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)}, # translate by -20 to +20 percent (per axis) 93 | #rotate=(-5, 5), # rotate by -45 to +45 degrees 94 | #shear=(-5, 5), # shear by -16 to +16 degrees 95 | #order=[0, 1], # use nearest neighbour or bilinear interpolation (fast) 96 | #cval=(0, 255), # if mode is constant, use a cval between 0 and 255 97 | #mode=ia.ALL # use any of scikit-image's warping modes (see 2nd image from the top for examples) 98 | )), 99 | # execute 0 to 5 of the following (less important) augmenters per image 100 | # don't execute all of them, as that would often be way too strong 101 | iaa.SomeOf((0, 5), 102 | [ 103 | #sometimes(iaa.Superpixels(p_replace=(0, 1.0), n_segments=(20, 200))), # convert images into their superpixel representation 104 | iaa.OneOf([ 105 | iaa.GaussianBlur((0, 3.0)), # blur images with a sigma between 0 and 3.0 106 | iaa.AverageBlur(k=(2, 7)), # blur image using local means with kernel sizes between 2 and 7 107 | iaa.MedianBlur(k=(3, 11)), # blur image using local medians with kernel sizes between 2 and 7 108 | ]), 109 | iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)), # sharpen images 110 | #iaa.Emboss(alpha=(0, 1.0), strength=(0, 2.0)), # emboss images 111 | # search either for all edges or for directed edges 112 | #sometimes(iaa.OneOf([ 113 | # iaa.EdgeDetect(alpha=(0, 0.7)), 114 | # iaa.DirectedEdgeDetect(alpha=(0, 0.7), direction=(0.0, 1.0)), 115 | #])), 116 | iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5), # add gaussian noise to images 117 | iaa.OneOf([ 118 | iaa.Dropout((0.01, 0.1), per_channel=0.5), # randomly remove up to 10% of the pixels 119 | #iaa.CoarseDropout((0.03, 0.15), size_percent=(0.02, 0.05), per_channel=0.2), 120 | ]), 121 | #iaa.Invert(0.05, per_channel=True), # invert color channels 122 | iaa.Add((-10, 10), per_channel=0.5), # change brightness of images (by -10 to 10 of original value) 123 | iaa.Multiply((0.5, 1.5), per_channel=0.5), # change brightness of images (50-150% of original value) 124 | iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5), # improve or worsen the contrast 125 | #iaa.Grayscale(alpha=(0.0, 1.0)), 126 | #sometimes(iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25)), # move pixels locally around (with random strengths) 127 | #sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05))) # sometimes move parts of the image around 128 | ], 129 | random_order=True 130 | ) 131 | ], 132 | random_order=True 133 | ) 134 | 135 | if shuffle: np.random.shuffle(self.images) 136 | 137 | def __len__(self): 138 | return int(np.ceil(float(len(self.images))/self.config['BATCH_SIZE'])) 139 | 140 | def num_classes(self): 141 | return len(self.config['LABELS']) 142 | 143 | def size(self): 144 | return len(self.images) 145 | 146 | def load_annotation(self, i): 147 | annots = [] 148 | 149 | for obj in 
self.images[i]['object']: 150 | annot = [obj['xmin'], obj['ymin'], obj['xmax'], obj['ymax'], self.config['LABELS'].index(obj['name'])] 151 | annots += [annot] 152 | 153 | if len(annots) == 0: annots = [[]] 154 | 155 | return np.array(annots) 156 | 157 | def load_image(self, i): 158 | return cv2.imread(self.images[i]['filename']) 159 | 160 | def __getitem__(self, idx): 161 | l_bound = idx*self.config['BATCH_SIZE'] 162 | r_bound = (idx+1)*self.config['BATCH_SIZE'] 163 | 164 | if r_bound > len(self.images): 165 | r_bound = len(self.images) 166 | l_bound = r_bound - self.config['BATCH_SIZE'] 167 | 168 | instance_count = 0 169 | 170 | x_batch = np.zeros((r_bound - l_bound, self.config['IMAGE_H'], self.config['IMAGE_W'], 3)) # input images 171 | b_batch = np.zeros((r_bound - l_bound, 1 , 1 , 1 , self.config['TRUE_BOX_BUFFER'], 4)) # list of self.config['TRUE_self.config['BOX']_BUFFER'] GT boxes 172 | y_batch = np.zeros((r_bound - l_bound, self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 4+1+len(self.config['LABELS']))) # desired network output 173 | 174 | for train_instance in self.images[l_bound:r_bound]: 175 | # augment input image and fix object's position and size 176 | img, all_objs = self.aug_image(train_instance, jitter=self.jitter) 177 | 178 | # construct output from object's x, y, w, h 179 | true_box_index = 0 180 | 181 | for obj in all_objs: 182 | if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in self.config['LABELS']: 183 | center_x = .5*(obj['xmin'] + obj['xmax']) 184 | center_x = center_x / (float(self.config['IMAGE_W']) / self.config['GRID_W']) 185 | center_y = .5*(obj['ymin'] + obj['ymax']) 186 | center_y = center_y / (float(self.config['IMAGE_H']) / self.config['GRID_H']) 187 | 188 | grid_x = int(np.floor(center_x)) 189 | grid_y = int(np.floor(center_y)) 190 | 191 | if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']: 192 | obj_indx = self.config['LABELS'].index(obj['name']) 193 | 194 | center_w = (obj['xmax'] - obj['xmin']) / (float(self.config['IMAGE_W']) / self.config['GRID_W']) # unit: grid cell 195 | center_h = (obj['ymax'] - obj['ymin']) / (float(self.config['IMAGE_H']) / self.config['GRID_H']) # unit: grid cell 196 | 197 | box = [center_x, center_y, center_w, center_h] 198 | 199 | # find the anchor that best predicts this box 200 | best_anchor = -1 201 | max_iou = -1 202 | 203 | shifted_box = BoundBox(0, 204 | 0, 205 | center_w, 206 | center_h) 207 | 208 | for i in range(len(self.anchors)): 209 | anchor = self.anchors[i] 210 | iou = bbox_iou(shifted_box, anchor) 211 | 212 | if max_iou < iou: 213 | best_anchor = i 214 | max_iou = iou 215 | 216 | # assign ground truth x, y, w, h, confidence and class probs to y_batch 217 | y_batch[instance_count, grid_y, grid_x, best_anchor, 0:4] = box 218 | y_batch[instance_count, grid_y, grid_x, best_anchor, 4 ] = 1. 
219 | y_batch[instance_count, grid_y, grid_x, best_anchor, 5+obj_indx] = 1 220 | 221 | # assign the true box to b_batch 222 | b_batch[instance_count, 0, 0, 0, true_box_index] = box 223 | 224 | true_box_index += 1 225 | true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER'] 226 | 227 | # assign input image to x_batch 228 | if self.norm != None: 229 | x_batch[instance_count] = self.norm(img) 230 | else: 231 | # plot image and bounding boxes for sanity check 232 | for obj in all_objs: 233 | if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']: 234 | cv2.rectangle(img[:,:,::-1], (obj['xmin'],obj['ymin']), (obj['xmax'],obj['ymax']), (255,0,0), 3) 235 | cv2.putText(img[:,:,::-1], obj['name'], 236 | (obj['xmin']+2, obj['ymin']+12), 237 | 0, 1.2e-3 * img.shape[0], 238 | (0,255,0), 2) 239 | 240 | x_batch[instance_count] = img 241 | 242 | # increase instance counter in current batch 243 | instance_count += 1 244 | 245 | #print(' new batch created', idx) 246 | 247 | return [x_batch, b_batch], y_batch 248 | 249 | def on_epoch_end(self): 250 | if self.shuffle: np.random.shuffle(self.images) 251 | 252 | def aug_image(self, train_instance, jitter): 253 | image_name = train_instance['filename'] 254 | image = cv2.imread(image_name) 255 | 256 | if image is None: print('Cannot find ', image_name) 257 | 258 | h, w, c = image.shape 259 | all_objs = copy.deepcopy(train_instance['object']) 260 | 261 | if jitter: 262 | ### scale the image 263 | scale = np.random.uniform() / 10. + 1. 264 | image = cv2.resize(image, (0,0), fx = scale, fy = scale) 265 | 266 | ### translate the image 267 | max_offx = (scale-1.) * w 268 | max_offy = (scale-1.) * h 269 | offx = int(np.random.uniform() * max_offx) 270 | offy = int(np.random.uniform() * max_offy) 271 | 272 | image = image[offy : (offy + h), offx : (offx + w)] 273 | 274 | ### flip the image 275 | flip = np.random.binomial(1, .5) 276 | if flip > 0.5: image = cv2.flip(image, 1) 277 | 278 | image = self.aug_pipe.augment_image(image) 279 | 280 | # resize the image to standard size 281 | image = cv2.resize(image, (self.config['IMAGE_H'], self.config['IMAGE_W'])) 282 | image = image[:,:,::-1] 283 | 284 | # fix object's position and size 285 | for obj in all_objs: 286 | for attr in ['xmin', 'xmax']: 287 | if jitter: obj[attr] = int(obj[attr] * scale - offx) 288 | 289 | obj[attr] = int(obj[attr] * float(self.config['IMAGE_W']) / w) 290 | obj[attr] = max(min(obj[attr], self.config['IMAGE_W']), 0) 291 | 292 | for attr in ['ymin', 'ymax']: 293 | if jitter: obj[attr] = int(obj[attr] * scale - offy) 294 | 295 | obj[attr] = int(obj[attr] * float(self.config['IMAGE_H']) / h) 296 | obj[attr] = max(min(obj[attr], self.config['IMAGE_H']), 0) 297 | 298 | if jitter and flip > 0.5: 299 | xmin = obj['xmin'] 300 | obj['xmin'] = self.config['IMAGE_W'] - obj['xmax'] 301 | obj['xmax'] = self.config['IMAGE_W'] - xmin 302 | 303 | return image, all_objs 304 | -------------------------------------------------------------------------------- /frontend.py: -------------------------------------------------------------------------------- 1 | from keras.models import Model 2 | from keras.layers import Reshape, Activation, Conv2D, Input, MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda 3 | from keras.layers.advanced_activations import LeakyReLU 4 | import tensorflow as tf 5 | import numpy as np 6 | import os 7 | import cv2 8 | from utils import decode_netout, compute_overlap, compute_ap 9 | from keras.applications.mobilenet import MobileNet 10 | from keras.layers.merge import 
concatenate 11 | from keras.optimizers import SGD, Adam, RMSprop 12 | from preprocessing import BatchGenerator 13 | from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard 14 | from backend import TinyYoloFeature, FullYoloFeature, MobileNetFeature, SqueezeNetFeature, Inception3Feature, VGG16Feature, ResNet50Feature 15 | 16 | class YOLO(object): 17 | def __init__(self, backend, 18 | input_size, 19 | labels, 20 | max_box_per_image, 21 | anchors): 22 | 23 | self.input_size = input_size 24 | 25 | self.labels = list(labels) 26 | self.nb_class = len(self.labels) 27 | self.nb_box = len(anchors)//2 28 | self.class_wt = np.ones(self.nb_class, dtype='float32') 29 | self.anchors = anchors 30 | 31 | self.max_box_per_image = max_box_per_image 32 | 33 | ########################## 34 | # Make the model 35 | ########################## 36 | 37 | # make the feature extractor layers 38 | input_image = Input(shape=(self.input_size, self.input_size, 3)) 39 | self.true_boxes = Input(shape=(1, 1, 1, max_box_per_image , 4)) 40 | 41 | if backend == 'Inception3': 42 | self.feature_extractor = Inception3Feature(self.input_size) 43 | elif backend == 'SqueezeNet': 44 | self.feature_extractor = SqueezeNetFeature(self.input_size) 45 | elif backend == 'MobileNet': 46 | self.feature_extractor = MobileNetFeature(self.input_size) 47 | elif backend == 'Full Yolo': 48 | self.feature_extractor = FullYoloFeature(self.input_size) 49 | elif backend == 'Tiny Yolo': 50 | self.feature_extractor = TinyYoloFeature(self.input_size) 51 | elif backend == 'VGG16': 52 | self.feature_extractor = VGG16Feature(self.input_size) 53 | elif backend == 'ResNet50': 54 | self.feature_extractor = ResNet50Feature(self.input_size) 55 | else: 56 | raise Exception('Architecture not supported! Only support Full Yolo, Tiny Yolo, MobileNet, SqueezeNet, VGG16, ResNet50, and Inception3 at the moment!') 57 | 58 | print(self.feature_extractor.get_output_shape()) 59 | self.grid_h, self.grid_w = self.feature_extractor.get_output_shape() 60 | features = self.feature_extractor.extract(input_image) 61 | 62 | # make the object detection layer 63 | output = Conv2D(self.nb_box * (4 + 1 + self.nb_class), 64 | (1,1), strides=(1,1), 65 | padding='same', 66 | name='DetectionLayer', 67 | kernel_initializer='lecun_normal')(features) 68 | output = Reshape((self.grid_h, self.grid_w, self.nb_box, 4 + 1 + self.nb_class))(output) 69 | output = Lambda(lambda args: args[0])([output, self.true_boxes]) 70 | 71 | self.model = Model([input_image, self.true_boxes], output) 72 | 73 | 74 | # initialize the weights of the detection layer 75 | layer = self.model.layers[-4] 76 | weights = layer.get_weights() 77 | 78 | new_kernel = np.random.normal(size=weights[0].shape)/(self.grid_h*self.grid_w) 79 | new_bias = np.random.normal(size=weights[1].shape)/(self.grid_h*self.grid_w) 80 | 81 | layer.set_weights([new_kernel, new_bias]) 82 | 83 | # print a summary of the whole model 84 | self.model.summary() 85 | 86 | def custom_loss(self, y_true, y_pred): 87 | mask_shape = tf.shape(y_true)[:4] 88 | 89 | cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(self.grid_w), [self.grid_h]), (1, self.grid_h, self.grid_w, 1, 1))) 90 | cell_y = tf.transpose(cell_x, (0,2,1,3,4)) 91 | 92 | cell_grid = tf.tile(tf.concat([cell_x,cell_y], -1), [self.batch_size, 1, 1, self.nb_box, 1]) 93 | 94 | coord_mask = tf.zeros(mask_shape) 95 | conf_mask = tf.zeros(mask_shape) 96 | class_mask = tf.zeros(mask_shape) 97 | 98 | seen = tf.Variable(0.) 99 | total_recall = tf.Variable(0.) 
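        # note: y_true and y_pred share the layout (batch, grid_h, grid_w, nb_box, 4 + 1 + nb_class),
        # i.e. x, y, w, h, objectness confidence and then the class scores, as built by BatchGenerator
        # and by the Reshape layer at the end of the model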
100 | 101 | """ 102 | Adjust prediction 103 | """ 104 | ### adjust x and y 105 | pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid 106 | 107 | ### adjust w and h 108 | pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(self.anchors, [1,1,1,self.nb_box,2]) 109 | 110 | ### adjust confidence 111 | pred_box_conf = tf.sigmoid(y_pred[..., 4]) 112 | 113 | ### adjust class probabilities 114 | pred_box_class = y_pred[..., 5:] 115 | 116 | """ 117 | Adjust ground truth 118 | """ 119 | ### adjust x and y 120 | true_box_xy = y_true[..., 0:2] # relative position to the containing cell 121 | 122 | ### adjust w and h 123 | true_box_wh = y_true[..., 2:4] # number of cells accross, horizontally and vertically 124 | 125 | ### adjust confidence 126 | true_wh_half = true_box_wh / 2. 127 | true_mins = true_box_xy - true_wh_half 128 | true_maxes = true_box_xy + true_wh_half 129 | 130 | pred_wh_half = pred_box_wh / 2. 131 | pred_mins = pred_box_xy - pred_wh_half 132 | pred_maxes = pred_box_xy + pred_wh_half 133 | 134 | intersect_mins = tf.maximum(pred_mins, true_mins) 135 | intersect_maxes = tf.minimum(pred_maxes, true_maxes) 136 | intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.) 137 | intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1] 138 | 139 | true_areas = true_box_wh[..., 0] * true_box_wh[..., 1] 140 | pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1] 141 | 142 | union_areas = pred_areas + true_areas - intersect_areas 143 | iou_scores = tf.truediv(intersect_areas, union_areas) 144 | 145 | true_box_conf = iou_scores * y_true[..., 4] 146 | 147 | ### adjust class probabilities 148 | true_box_class = tf.argmax(y_true[..., 5:], -1) 149 | 150 | """ 151 | Determine the masks 152 | """ 153 | ### coordinate mask: simply the position of the ground truth boxes (the predictors) 154 | coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * self.coord_scale 155 | 156 | ### confidence mask: penelize predictors + penalize boxes with low IOU 157 | # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6 158 | true_xy = self.true_boxes[..., 0:2] 159 | true_wh = self.true_boxes[..., 2:4] 160 | 161 | true_wh_half = true_wh / 2. 162 | true_mins = true_xy - true_wh_half 163 | true_maxes = true_xy + true_wh_half 164 | 165 | pred_xy = tf.expand_dims(pred_box_xy, 4) 166 | pred_wh = tf.expand_dims(pred_box_wh, 4) 167 | 168 | pred_wh_half = pred_wh / 2. 169 | pred_mins = pred_xy - pred_wh_half 170 | pred_maxes = pred_xy + pred_wh_half 171 | 172 | intersect_mins = tf.maximum(pred_mins, true_mins) 173 | intersect_maxes = tf.minimum(pred_maxes, true_maxes) 174 | intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.) 
175 | intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1] 176 | 177 | true_areas = true_wh[..., 0] * true_wh[..., 1] 178 | pred_areas = pred_wh[..., 0] * pred_wh[..., 1] 179 | 180 | union_areas = pred_areas + true_areas - intersect_areas 181 | iou_scores = tf.truediv(intersect_areas, union_areas) 182 | 183 | best_ious = tf.reduce_max(iou_scores, axis=4) 184 | conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * self.no_object_scale 185 | 186 | # penalize the confidence of the boxes, which are reponsible for corresponding ground truth box 187 | conf_mask = conf_mask + y_true[..., 4] * self.object_scale 188 | 189 | ### class mask: simply the position of the ground truth boxes (the predictors) 190 | class_mask = y_true[..., 4] * tf.gather(self.class_wt, true_box_class) * self.class_scale 191 | 192 | """ 193 | Warm-up training 194 | """ 195 | no_boxes_mask = tf.to_float(coord_mask < self.coord_scale/2.) 196 | seen = tf.assign_add(seen, 1.) 197 | 198 | true_box_xy, true_box_wh, coord_mask = tf.cond(tf.less(seen, self.warmup_batches+1), 199 | lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask, 200 | true_box_wh + tf.ones_like(true_box_wh) * \ 201 | np.reshape(self.anchors, [1,1,1,self.nb_box,2]) * \ 202 | no_boxes_mask, 203 | tf.ones_like(coord_mask)], 204 | lambda: [true_box_xy, 205 | true_box_wh, 206 | coord_mask]) 207 | 208 | """ 209 | Finalize the loss 210 | """ 211 | nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0)) 212 | nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0)) 213 | nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0)) 214 | 215 | loss_xy = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2. 216 | loss_wh = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2. 217 | loss_conf = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2. 
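        # Editorial note (added): the class term below scores the raw logits with
        # softmax cross-entropy rather than the squared-error class term written
        # in the loss formula of the Yolo Step-by-Step notebook; class_mask still
        # restricts it to cells that actually contain an object.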
218 | loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class) 219 | loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6) 220 | 221 | loss = tf.cond(tf.less(seen, self.warmup_batches+1), 222 | lambda: loss_xy + loss_wh + loss_conf + loss_class + 10, 223 | lambda: loss_xy + loss_wh + loss_conf + loss_class) 224 | 225 | if self.debug: 226 | nb_true_box = tf.reduce_sum(y_true[..., 4]) 227 | nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > 0.3)) 228 | 229 | current_recall = nb_pred_box/(nb_true_box + 1e-6) 230 | total_recall = tf.assign_add(total_recall, current_recall) 231 | 232 | loss = tf.Print(loss, [loss_xy], message='Loss XY \t', summarize=1000) 233 | loss = tf.Print(loss, [loss_wh], message='Loss WH \t', summarize=1000) 234 | loss = tf.Print(loss, [loss_conf], message='Loss Conf \t', summarize=1000) 235 | loss = tf.Print(loss, [loss_class], message='Loss Class \t', summarize=1000) 236 | loss = tf.Print(loss, [loss], message='Total Loss \t', summarize=1000) 237 | loss = tf.Print(loss, [current_recall], message='Current Recall \t', summarize=1000) 238 | loss = tf.Print(loss, [total_recall/seen], message='Average Recall \t', summarize=1000) 239 | 240 | return loss 241 | 242 | def load_weights(self, weight_path): 243 | self.model.load_weights(weight_path) 244 | 245 | def train(self, train_imgs, # the list of images to train the model 246 | valid_imgs, # the list of images used to validate the model 247 | train_times, # the number of time to repeat the training set, often used for small datasets 248 | valid_times, # the number of times to repeat the validation set, often used for small datasets 249 | nb_epochs, # number of epoches 250 | learning_rate, # the learning rate 251 | batch_size, # the size of the batch 252 | warmup_epochs, # number of initial batches to let the model familiarize with the new dataset 253 | object_scale, 254 | no_object_scale, 255 | coord_scale, 256 | class_scale, 257 | saved_weights_name='best_weights.h5', 258 | debug=False): 259 | 260 | self.batch_size = batch_size 261 | 262 | self.object_scale = object_scale 263 | self.no_object_scale = no_object_scale 264 | self.coord_scale = coord_scale 265 | self.class_scale = class_scale 266 | 267 | self.debug = debug 268 | 269 | ############################################ 270 | # Make train and validation generators 271 | ############################################ 272 | 273 | generator_config = { 274 | 'IMAGE_H' : self.input_size, 275 | 'IMAGE_W' : self.input_size, 276 | 'GRID_H' : self.grid_h, 277 | 'GRID_W' : self.grid_w, 278 | 'BOX' : self.nb_box, 279 | 'LABELS' : self.labels, 280 | 'CLASS' : len(self.labels), 281 | 'ANCHORS' : self.anchors, 282 | 'BATCH_SIZE' : self.batch_size, 283 | 'TRUE_BOX_BUFFER' : self.max_box_per_image, 284 | } 285 | 286 | train_generator = BatchGenerator(train_imgs, 287 | generator_config, 288 | norm=self.feature_extractor.normalize) 289 | valid_generator = BatchGenerator(valid_imgs, 290 | generator_config, 291 | norm=self.feature_extractor.normalize, 292 | jitter=False) 293 | 294 | self.warmup_batches = warmup_epochs * (train_times*len(train_generator) + valid_times*len(valid_generator)) 295 | 296 | ############################################ 297 | # Compile the model 298 | ############################################ 299 | 300 | optimizer = Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0) 301 | self.model.compile(loss=self.custom_loss, 
optimizer=optimizer) 302 | 303 | ############################################ 304 | # Make a few callbacks 305 | ############################################ 306 | 307 | early_stop = EarlyStopping(monitor='val_loss', 308 | min_delta=0.001, 309 | patience=3, 310 | mode='min', 311 | verbose=1) 312 | checkpoint = ModelCheckpoint(saved_weights_name, 313 | monitor='val_loss', 314 | verbose=1, 315 | save_best_only=True, 316 | mode='min', 317 | period=1) 318 | tensorboard = TensorBoard(log_dir=os.path.expanduser('~/logs/'), 319 | histogram_freq=0, 320 | #write_batch_performance=True, 321 | write_graph=True, 322 | write_images=False) 323 | 324 | ############################################ 325 | # Start the training process 326 | ############################################ 327 | 328 | self.model.fit_generator(generator = train_generator, 329 | steps_per_epoch = len(train_generator) * train_times, 330 | epochs = warmup_epochs + nb_epochs, 331 | verbose = 2 if debug else 1, 332 | validation_data = valid_generator, 333 | validation_steps = len(valid_generator) * valid_times, 334 | callbacks = [early_stop, checkpoint, tensorboard], 335 | workers = 3, 336 | max_queue_size = 8) 337 | 338 | ############################################ 339 | # Compute mAP on the validation set 340 | ############################################ 341 | average_precisions = self.evaluate(valid_generator) 342 | 343 | # print evaluation 344 | for label, average_precision in average_precisions.items(): 345 | print(self.labels[label], '{:.4f}'.format(average_precision)) 346 | print('mAP: {:.4f}'.format(sum(average_precisions.values()) / len(average_precisions))) 347 | 348 | def evaluate(self, 349 | generator, 350 | iou_threshold=0.3, 351 | score_threshold=0.3, 352 | max_detections=100, 353 | save_path=None): 354 | """ Evaluate a given dataset using a given model. 355 | code originally from https://github.com/fizyr/keras-retinanet 356 | 357 | # Arguments 358 | generator : The generator that represents the dataset to evaluate. 359 | model : The model to evaluate. 360 | iou_threshold : The threshold used to consider when a detection is positive or negative. 361 | score_threshold : The score confidence threshold to use for detections. 362 | max_detections : The maximum number of detections to use per image. 363 | save_path : The path to save images with visualized detections to. 364 | # Returns 365 | A dict mapping class names to mAP scores. 
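            Note (added): in this implementation the returned dict is keyed by
            integer label index rather than by class name; use self.labels[label]
            to recover the name, as train() does when printing the evaluation.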
366 | """ 367 | # gather all detections and annotations 368 | all_detections = [[None for i in range(generator.num_classes())] for j in range(generator.size())] 369 | all_annotations = [[None for i in range(generator.num_classes())] for j in range(generator.size())] 370 | 371 | for i in range(generator.size()): 372 | raw_image = generator.load_image(i) 373 | 374 | # make the boxes and the labels 375 | pred_boxes = self.predict(raw_image) 376 | 377 | score = np.array([box.score for box in pred_boxes]) 378 | pred_labels = np.array([box.label for box in pred_boxes]) 379 | 380 | if len(pred_boxes) > 0: 381 | pred_boxes = np.array([[box.xmin, box.ymin, box.xmax, box.ymax, box.score] for box in pred_boxes]) 382 | else: 383 | pred_boxes = np.array([[]]) 384 | 385 | # sort the boxes and the labels according to scores 386 | score_sort = np.argsort(-score) 387 | pred_labels = pred_labels[score_sort] 388 | pred_boxes = pred_boxes[score_sort] 389 | 390 | # copy detections to all_detections 391 | for label in range(generator.num_classes()): 392 | all_detections[i][label] = pred_boxes[pred_labels == label, :] 393 | 394 | annotations = generator.load_annotation(i) 395 | 396 | # copy detections to all_annotations 397 | for label in range(generator.num_classes()): 398 | all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy() 399 | 400 | # compute mAP by comparing all detections and all annotations 401 | average_precisions = {} 402 | 403 | for label in range(generator.num_classes()): 404 | false_positives = np.zeros((0,)) 405 | true_positives = np.zeros((0,)) 406 | scores = np.zeros((0,)) 407 | num_annotations = 0.0 408 | 409 | for i in range(generator.size()): 410 | detections = all_detections[i][label] 411 | annotations = all_annotations[i][label] 412 | num_annotations += annotations.shape[0] 413 | detected_annotations = [] 414 | 415 | for d in detections: 416 | scores = np.append(scores, d[4]) 417 | 418 | if annotations.shape[0] == 0: 419 | false_positives = np.append(false_positives, 1) 420 | true_positives = np.append(true_positives, 0) 421 | continue 422 | 423 | overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations) 424 | assigned_annotation = np.argmax(overlaps, axis=1) 425 | max_overlap = overlaps[0, assigned_annotation] 426 | 427 | if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations: 428 | false_positives = np.append(false_positives, 0) 429 | true_positives = np.append(true_positives, 1) 430 | detected_annotations.append(assigned_annotation) 431 | else: 432 | false_positives = np.append(false_positives, 1) 433 | true_positives = np.append(true_positives, 0) 434 | 435 | # no annotations -> AP for this class is 0 (is this correct?) 
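            # Editorial note (added): returning AP = 0 for a label with no
            # ground-truth annotations pulls the mean down; another common
            # convention is to skip such labels when averaging, so mAP values are
            # only comparable across runs that use the same convention.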
436 | if num_annotations == 0: 437 | average_precisions[label] = 0 438 | continue 439 | 440 | # sort by score 441 | indices = np.argsort(-scores) 442 | false_positives = false_positives[indices] 443 | true_positives = true_positives[indices] 444 | 445 | # compute false positives and true positives 446 | false_positives = np.cumsum(false_positives) 447 | true_positives = np.cumsum(true_positives) 448 | 449 | # compute recall and precision 450 | recall = true_positives / num_annotations 451 | precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps) 452 | 453 | # compute average precision 454 | average_precision = compute_ap(recall, precision) 455 | average_precisions[label] = average_precision 456 | 457 | return average_precisions 458 | 459 | def predict(self, image): 460 | image_h, image_w, _ = image.shape 461 | image = cv2.resize(image, (self.input_size, self.input_size)) 462 | image = self.feature_extractor.normalize(image) 463 | 464 | input_image = image[:,:,::-1] 465 | input_image = np.expand_dims(input_image, 0) 466 | dummy_array = np.zeros((1,1,1,1,self.max_box_per_image,4)) 467 | 468 | netout = self.model.predict([input_image, dummy_array])[0] 469 | boxes = decode_netout(netout, self.anchors, self.nb_class) 470 | 471 | return boxes -------------------------------------------------------------------------------- /Yolo Step-by-Step.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "**Outline of Steps**\n", 8 | " + Initialization\n", 9 | " + Download COCO detection data from http://cocodataset.org/#download\n", 10 | " + http://images.cocodataset.org/zips/train2014.zip <= train images\n", 11 | " + http://images.cocodataset.org/zips/val2014.zip <= validation images\n", 12 | " + http://images.cocodataset.org/annotations/annotations_trainval2014.zip <= train and validation annotations\n", 13 | " + Run this script to convert annotations in COCO format to VOC format\n", 14 | " + https://gist.github.com/chicham/6ed3842d0d2014987186#file-coco2pascal-py\n", 15 | " + Download pre-trained weights from https://pjreddie.com/darknet/yolo/\n", 16 | " + https://pjreddie.com/media/files/yolo.weights\n", 17 | " + Specify the directory of train annotations (train_annot_folder) and train images (train_image_folder)\n", 18 | " + Specify the directory of validation annotations (valid_annot_folder) and validation images (valid_image_folder)\n", 19 | " + Specity the path of pre-trained weights by setting variable *wt_path*\n", 20 | " + Construct equivalent network in Keras\n", 21 | " + Network arch from https://github.com/pjreddie/darknet/blob/master/cfg/yolo-voc.cfg\n", 22 | " + Load the pretrained weights\n", 23 | " + Perform training \n", 24 | " + Perform detection on an image with newly trained weights\n", 25 | " + Perform detection on an video with newly trained weights" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "# Initialization" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 1, 45 | "metadata": { 46 | "ExecuteTime": { 47 | "end_time": "2018-04-04T00:18:52.056478", 48 | "start_time": "2018-04-04T00:18:50.879887" 49 | }, 50 | "code_folding": [], 51 | "scrolled": true 52 | }, 53 | "outputs": [ 54 | { 55 | "name": "stderr", 56 | 
"output_type": "stream", 57 | "text": [ 58 | "/home/zachary_arundel/freightkeras/env/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 59 | " from ._conv import register_converters as _register_converters\n", 60 | "Using TensorFlow backend.\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "from keras.models import Sequential, Model\n", 66 | "from keras.layers import Reshape, Activation, Conv2D, Input, MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda\n", 67 | "from keras.layers.advanced_activations import LeakyReLU\n", 68 | "from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard\n", 69 | "from keras.optimizers import SGD, Adam, RMSprop\n", 70 | "from keras.layers.merge import concatenate\n", 71 | "import matplotlib.pyplot as plt\n", 72 | "import keras.backend as K\n", 73 | "import tensorflow as tf\n", 74 | "import imgaug as ia\n", 75 | "from tqdm import tqdm\n", 76 | "from imgaug import augmenters as iaa\n", 77 | "import numpy as np\n", 78 | "import pickle\n", 79 | "import os, cv2\n", 80 | "from preprocessing import parse_annotation, BatchGenerator\n", 81 | "from utils import WeightReader, decode_netout, draw_boxes\n", 82 | "\n", 83 | "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", 84 | "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n", 85 | "\n", 86 | "%matplotlib inline" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 2, 92 | "metadata": { 93 | "ExecuteTime": { 94 | "end_time": "2018-04-04T00:18:52.075535", 95 | "start_time": "2018-04-04T00:18:52.057712" 96 | }, 97 | "scrolled": true 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "LABELS = [\"car\", \"truck\", \"pickup\", \"tractor\", \"camping car\", \"boat\",\"motorcycle\", \"van\", \"other\", \"plane\"] \n", 102 | "\n", 103 | "IMAGE_H, IMAGE_W = 416, 416\n", 104 | "GRID_H, GRID_W = 13 , 13\n", 105 | "BOX = 5\n", 106 | "CLASS = len(LABELS)\n", 107 | "CLASS_WEIGHTS = np.ones(CLASS, dtype='float32')\n", 108 | "OBJ_THRESHOLD = 0.3#0.5\n", 109 | "NMS_THRESHOLD = 0.3#0.45\n", 110 | "ANCHORS = [0.88,1.69, 1.18,0.7, 1.65,1.77,1.77,0.9, 3.75, 3.57],\n", 111 | "\n", 112 | "NO_OBJECT_SCALE = 1.0\n", 113 | "OBJECT_SCALE = 5.0\n", 114 | "COORD_SCALE = 1.0\n", 115 | "CLASS_SCALE = 1.0\n", 116 | "\n", 117 | "BATCH_SIZE = 16\n", 118 | "WARM_UP_BATCHES = 0\n", 119 | "TRUE_BOX_BUFFER = 50" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 3, 125 | "metadata": { 126 | "ExecuteTime": { 127 | "end_time": "2018-04-04T00:18:52.981155", 128 | "start_time": "2018-04-04T00:18:52.978076" 129 | } 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "wt_path = 'full_yolo_backend.h5' \n", 134 | "train_image_folder = 'train_image_folder/'\n", 135 | "train_annot_folder = 'train_annot_folder/'\n", 136 | "valid_image_folder = 'valid_image_folder/'\n", 137 | "valid_annot_folder = 'valid_annot_folder/'" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "# Construct the network" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 4, 150 | "metadata": { 151 | "ExecuteTime": { 152 | "end_time": "2018-04-04T00:18:53.978220", 153 | "start_time": "2018-04-04T00:18:53.967537" 154 | } 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "# the function to implement the orgnization layer (thanks to github.com/allanzelener/YAD2K)\n", 159 | 
"def space_to_depth_x2(x):\n", 160 | " return tf.space_to_depth(x, block_size=2)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 5, 166 | "metadata": { 167 | "ExecuteTime": { 168 | "end_time": "2018-04-04T00:18:58.022959", 169 | "start_time": "2018-04-04T00:18:55.740759" 170 | }, 171 | "code_folding": [] 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "input_image = Input(shape=(IMAGE_H, IMAGE_W, 3))\n", 176 | "true_boxes = Input(shape=(1, 1, 1, TRUE_BOX_BUFFER , 4))\n", 177 | "\n", 178 | "# Layer 1\n", 179 | "x = Conv2D(32, (3,3), strides=(1,1), padding='same', name='conv_1', use_bias=False)(input_image)\n", 180 | "x = BatchNormalization(name='norm_1')(x)\n", 181 | "x = LeakyReLU(alpha=0.1)(x)\n", 182 | "x = MaxPooling2D(pool_size=(2, 2))(x)\n", 183 | "\n", 184 | "# Layer 2\n", 185 | "x = Conv2D(64, (3,3), strides=(1,1), padding='same', name='conv_2', use_bias=False)(x)\n", 186 | "x = BatchNormalization(name='norm_2')(x)\n", 187 | "x = LeakyReLU(alpha=0.1)(x)\n", 188 | "x = MaxPooling2D(pool_size=(2, 2))(x)\n", 189 | "\n", 190 | "# Layer 3\n", 191 | "x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_3', use_bias=False)(x)\n", 192 | "x = BatchNormalization(name='norm_3')(x)\n", 193 | "x = LeakyReLU(alpha=0.1)(x)\n", 194 | "\n", 195 | "# Layer 4\n", 196 | "x = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_4', use_bias=False)(x)\n", 197 | "x = BatchNormalization(name='norm_4')(x)\n", 198 | "x = LeakyReLU(alpha=0.1)(x)\n", 199 | "\n", 200 | "# Layer 5\n", 201 | "x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_5', use_bias=False)(x)\n", 202 | "x = BatchNormalization(name='norm_5')(x)\n", 203 | "x = LeakyReLU(alpha=0.1)(x)\n", 204 | "x = MaxPooling2D(pool_size=(2, 2))(x)\n", 205 | "\n", 206 | "# Layer 6\n", 207 | "x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_6', use_bias=False)(x)\n", 208 | "x = BatchNormalization(name='norm_6')(x)\n", 209 | "x = LeakyReLU(alpha=0.1)(x)\n", 210 | "\n", 211 | "# Layer 7\n", 212 | "x = Conv2D(128, (1,1), strides=(1,1), padding='same', name='conv_7', use_bias=False)(x)\n", 213 | "x = BatchNormalization(name='norm_7')(x)\n", 214 | "x = LeakyReLU(alpha=0.1)(x)\n", 215 | "\n", 216 | "# Layer 8\n", 217 | "x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_8', use_bias=False)(x)\n", 218 | "x = BatchNormalization(name='norm_8')(x)\n", 219 | "x = LeakyReLU(alpha=0.1)(x)\n", 220 | "x = MaxPooling2D(pool_size=(2, 2))(x)\n", 221 | "\n", 222 | "# Layer 9\n", 223 | "x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_9', use_bias=False)(x)\n", 224 | "x = BatchNormalization(name='norm_9')(x)\n", 225 | "x = LeakyReLU(alpha=0.1)(x)\n", 226 | "\n", 227 | "# Layer 10\n", 228 | "x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_10', use_bias=False)(x)\n", 229 | "x = BatchNormalization(name='norm_10')(x)\n", 230 | "x = LeakyReLU(alpha=0.1)(x)\n", 231 | "\n", 232 | "# Layer 11\n", 233 | "x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_11', use_bias=False)(x)\n", 234 | "x = BatchNormalization(name='norm_11')(x)\n", 235 | "x = LeakyReLU(alpha=0.1)(x)\n", 236 | "\n", 237 | "# Layer 12\n", 238 | "x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_12', use_bias=False)(x)\n", 239 | "x = BatchNormalization(name='norm_12')(x)\n", 240 | "x = LeakyReLU(alpha=0.1)(x)\n", 241 | "\n", 242 | "# Layer 13\n", 243 | "x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_13', use_bias=False)(x)\n", 244 | "x = 
BatchNormalization(name='norm_13')(x)\n", 245 | "x = LeakyReLU(alpha=0.1)(x)\n", 246 | "\n", 247 | "skip_connection = x\n", 248 | "\n", 249 | "x = MaxPooling2D(pool_size=(2, 2))(x)\n", 250 | "\n", 251 | "# Layer 14\n", 252 | "x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_14', use_bias=False)(x)\n", 253 | "x = BatchNormalization(name='norm_14')(x)\n", 254 | "x = LeakyReLU(alpha=0.1)(x)\n", 255 | "\n", 256 | "# Layer 15\n", 257 | "x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_15', use_bias=False)(x)\n", 258 | "x = BatchNormalization(name='norm_15')(x)\n", 259 | "x = LeakyReLU(alpha=0.1)(x)\n", 260 | "\n", 261 | "# Layer 16\n", 262 | "x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_16', use_bias=False)(x)\n", 263 | "x = BatchNormalization(name='norm_16')(x)\n", 264 | "x = LeakyReLU(alpha=0.1)(x)\n", 265 | "\n", 266 | "# Layer 17\n", 267 | "x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_17', use_bias=False)(x)\n", 268 | "x = BatchNormalization(name='norm_17')(x)\n", 269 | "x = LeakyReLU(alpha=0.1)(x)\n", 270 | "\n", 271 | "# Layer 18\n", 272 | "x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_18', use_bias=False)(x)\n", 273 | "x = BatchNormalization(name='norm_18')(x)\n", 274 | "x = LeakyReLU(alpha=0.1)(x)\n", 275 | "\n", 276 | "# Layer 19\n", 277 | "x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_19', use_bias=False)(x)\n", 278 | "x = BatchNormalization(name='norm_19')(x)\n", 279 | "x = LeakyReLU(alpha=0.1)(x)\n", 280 | "\n", 281 | "# Layer 20\n", 282 | "x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_20', use_bias=False)(x)\n", 283 | "x = BatchNormalization(name='norm_20')(x)\n", 284 | "x = LeakyReLU(alpha=0.1)(x)\n", 285 | "\n", 286 | "# Layer 21\n", 287 | "skip_connection = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_21', use_bias=False)(skip_connection)\n", 288 | "skip_connection = BatchNormalization(name='norm_21')(skip_connection)\n", 289 | "skip_connection = LeakyReLU(alpha=0.1)(skip_connection)\n", 290 | "skip_connection = Lambda(space_to_depth_x2)(skip_connection)\n", 291 | "\n", 292 | "x = concatenate([skip_connection, x])\n", 293 | "\n", 294 | "# Layer 22\n", 295 | "x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_22', use_bias=False)(x)\n", 296 | "x = BatchNormalization(name='norm_22')(x)\n", 297 | "x = LeakyReLU(alpha=0.1)(x)\n", 298 | "\n", 299 | "# Layer 23\n", 300 | "x = Conv2D(BOX * (4 + 1 + CLASS), (1,1), strides=(1,1), padding='same', name='conv_23')(x)\n", 301 | "output = Reshape((GRID_H, GRID_W, BOX, 4 + 1 + CLASS))(x)\n", 302 | "\n", 303 | "# small hack to allow true_boxes to be registered when Keras build the model \n", 304 | "# for more information: https://github.com/fchollet/keras/issues/2790\n", 305 | "output = Lambda(lambda args: args[0])([output, true_boxes])\n", 306 | "\n", 307 | "model = Model([input_image, true_boxes], output)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 6, 313 | "metadata": { 314 | "ExecuteTime": { 315 | "end_time": "2017-11-26T12:34:03.819802Z", 316 | "start_time": "2017-11-26T12:34:03.786125Z" 317 | }, 318 | "scrolled": false 319 | }, 320 | "outputs": [ 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "____________________________________________________________________________________________________\n", 326 | "Layer (type) Output Shape Param # Connected to \n", 327 | 
"====================================================================================================\n", 328 | "input_1 (InputLayer) (None, 416, 416, 3) 0 \n", 329 | "____________________________________________________________________________________________________\n", 330 | "conv_1 (Conv2D) (None, 416, 416, 32) 864 input_1[0][0] \n", 331 | "____________________________________________________________________________________________________\n", 332 | "norm_1 (BatchNormalization) (None, 416, 416, 32) 128 conv_1[0][0] \n", 333 | "____________________________________________________________________________________________________\n", 334 | "leaky_re_lu_1 (LeakyReLU) (None, 416, 416, 32) 0 norm_1[0][0] \n", 335 | "____________________________________________________________________________________________________\n", 336 | "max_pooling2d_1 (MaxPooling2D) (None, 208, 208, 32) 0 leaky_re_lu_1[0][0] \n", 337 | "____________________________________________________________________________________________________\n", 338 | "conv_2 (Conv2D) (None, 208, 208, 64) 18432 max_pooling2d_1[0][0] \n", 339 | "____________________________________________________________________________________________________\n", 340 | "norm_2 (BatchNormalization) (None, 208, 208, 64) 256 conv_2[0][0] \n", 341 | "____________________________________________________________________________________________________\n", 342 | "leaky_re_lu_2 (LeakyReLU) (None, 208, 208, 64) 0 norm_2[0][0] \n", 343 | "____________________________________________________________________________________________________\n", 344 | "max_pooling2d_2 (MaxPooling2D) (None, 104, 104, 64) 0 leaky_re_lu_2[0][0] \n", 345 | "____________________________________________________________________________________________________\n", 346 | "conv_3 (Conv2D) (None, 104, 104, 128) 73728 max_pooling2d_2[0][0] \n", 347 | "____________________________________________________________________________________________________\n", 348 | "norm_3 (BatchNormalization) (None, 104, 104, 128) 512 conv_3[0][0] \n", 349 | "____________________________________________________________________________________________________\n", 350 | "leaky_re_lu_3 (LeakyReLU) (None, 104, 104, 128) 0 norm_3[0][0] \n", 351 | "____________________________________________________________________________________________________\n", 352 | "conv_4 (Conv2D) (None, 104, 104, 64) 8192 leaky_re_lu_3[0][0] \n", 353 | "____________________________________________________________________________________________________\n", 354 | "norm_4 (BatchNormalization) (None, 104, 104, 64) 256 conv_4[0][0] \n", 355 | "____________________________________________________________________________________________________\n", 356 | "leaky_re_lu_4 (LeakyReLU) (None, 104, 104, 64) 0 norm_4[0][0] \n", 357 | "____________________________________________________________________________________________________\n", 358 | "conv_5 (Conv2D) (None, 104, 104, 128) 73728 leaky_re_lu_4[0][0] \n", 359 | "____________________________________________________________________________________________________\n", 360 | "norm_5 (BatchNormalization) (None, 104, 104, 128) 512 conv_5[0][0] \n", 361 | "____________________________________________________________________________________________________\n", 362 | "leaky_re_lu_5 (LeakyReLU) (None, 104, 104, 128) 0 norm_5[0][0] \n", 363 | "____________________________________________________________________________________________________\n", 364 | "max_pooling2d_3 (MaxPooling2D) (None, 52, 52, 128) 0 
leaky_re_lu_5[0][0] \n", 365 | "____________________________________________________________________________________________________\n", 366 | "conv_6 (Conv2D) (None, 52, 52, 256) 294912 max_pooling2d_3[0][0] \n", 367 | "____________________________________________________________________________________________________\n", 368 | "norm_6 (BatchNormalization) (None, 52, 52, 256) 1024 conv_6[0][0] \n", 369 | "____________________________________________________________________________________________________\n", 370 | "leaky_re_lu_6 (LeakyReLU) (None, 52, 52, 256) 0 norm_6[0][0] \n", 371 | "____________________________________________________________________________________________________\n", 372 | "conv_7 (Conv2D) (None, 52, 52, 128) 32768 leaky_re_lu_6[0][0] \n", 373 | "____________________________________________________________________________________________________\n", 374 | "norm_7 (BatchNormalization) (None, 52, 52, 128) 512 conv_7[0][0] \n", 375 | "____________________________________________________________________________________________________\n", 376 | "leaky_re_lu_7 (LeakyReLU) (None, 52, 52, 128) 0 norm_7[0][0] \n", 377 | "____________________________________________________________________________________________________\n", 378 | "conv_8 (Conv2D) (None, 52, 52, 256) 294912 leaky_re_lu_7[0][0] \n", 379 | "____________________________________________________________________________________________________\n", 380 | "norm_8 (BatchNormalization) (None, 52, 52, 256) 1024 conv_8[0][0] \n", 381 | "____________________________________________________________________________________________________\n", 382 | "leaky_re_lu_8 (LeakyReLU) (None, 52, 52, 256) 0 norm_8[0][0] \n", 383 | "____________________________________________________________________________________________________\n", 384 | "max_pooling2d_4 (MaxPooling2D) (None, 26, 26, 256) 0 leaky_re_lu_8[0][0] \n", 385 | "____________________________________________________________________________________________________\n", 386 | "conv_9 (Conv2D) (None, 26, 26, 512) 1179648 max_pooling2d_4[0][0] \n", 387 | "____________________________________________________________________________________________________\n", 388 | "norm_9 (BatchNormalization) (None, 26, 26, 512) 2048 conv_9[0][0] \n", 389 | "____________________________________________________________________________________________________\n", 390 | "leaky_re_lu_9 (LeakyReLU) (None, 26, 26, 512) 0 norm_9[0][0] \n", 391 | "____________________________________________________________________________________________________\n", 392 | "conv_10 (Conv2D) (None, 26, 26, 256) 131072 leaky_re_lu_9[0][0] \n", 393 | "____________________________________________________________________________________________________\n", 394 | "norm_10 (BatchNormalization) (None, 26, 26, 256) 1024 conv_10[0][0] \n", 395 | "____________________________________________________________________________________________________\n", 396 | "leaky_re_lu_10 (LeakyReLU) (None, 26, 26, 256) 0 norm_10[0][0] \n", 397 | "____________________________________________________________________________________________________\n", 398 | "conv_11 (Conv2D) (None, 26, 26, 512) 1179648 leaky_re_lu_10[0][0] \n", 399 | "____________________________________________________________________________________________________\n", 400 | "norm_11 (BatchNormalization) (None, 26, 26, 512) 2048 conv_11[0][0] \n", 401 | "____________________________________________________________________________________________________\n", 402 | 
"leaky_re_lu_11 (LeakyReLU) (None, 26, 26, 512) 0 norm_11[0][0] \n", 403 | "____________________________________________________________________________________________________\n", 404 | "conv_12 (Conv2D) (None, 26, 26, 256) 131072 leaky_re_lu_11[0][0] \n", 405 | "____________________________________________________________________________________________________\n", 406 | "norm_12 (BatchNormalization) (None, 26, 26, 256) 1024 conv_12[0][0] \n", 407 | "____________________________________________________________________________________________________\n", 408 | "leaky_re_lu_12 (LeakyReLU) (None, 26, 26, 256) 0 norm_12[0][0] \n", 409 | "____________________________________________________________________________________________________\n", 410 | "conv_13 (Conv2D) (None, 26, 26, 512) 1179648 leaky_re_lu_12[0][0] \n", 411 | "____________________________________________________________________________________________________\n", 412 | "norm_13 (BatchNormalization) (None, 26, 26, 512) 2048 conv_13[0][0] \n", 413 | "____________________________________________________________________________________________________\n", 414 | "leaky_re_lu_13 (LeakyReLU) (None, 26, 26, 512) 0 norm_13[0][0] \n", 415 | "____________________________________________________________________________________________________\n", 416 | "max_pooling2d_5 (MaxPooling2D) (None, 13, 13, 512) 0 leaky_re_lu_13[0][0] \n", 417 | "____________________________________________________________________________________________________\n", 418 | "conv_14 (Conv2D) (None, 13, 13, 1024) 4718592 max_pooling2d_5[0][0] \n", 419 | "____________________________________________________________________________________________________\n", 420 | "norm_14 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_14[0][0] \n", 421 | "____________________________________________________________________________________________________\n", 422 | "leaky_re_lu_14 (LeakyReLU) (None, 13, 13, 1024) 0 norm_14[0][0] \n", 423 | "____________________________________________________________________________________________________\n", 424 | "conv_15 (Conv2D) (None, 13, 13, 512) 524288 leaky_re_lu_14[0][0] \n", 425 | "____________________________________________________________________________________________________\n", 426 | "norm_15 (BatchNormalization) (None, 13, 13, 512) 2048 conv_15[0][0] \n", 427 | "____________________________________________________________________________________________________\n", 428 | "leaky_re_lu_15 (LeakyReLU) (None, 13, 13, 512) 0 norm_15[0][0] \n", 429 | "____________________________________________________________________________________________________\n", 430 | "conv_16 (Conv2D) (None, 13, 13, 1024) 4718592 leaky_re_lu_15[0][0] \n", 431 | "____________________________________________________________________________________________________\n", 432 | "norm_16 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_16[0][0] \n", 433 | "____________________________________________________________________________________________________\n", 434 | "leaky_re_lu_16 (LeakyReLU) (None, 13, 13, 1024) 0 norm_16[0][0] \n", 435 | "____________________________________________________________________________________________________\n", 436 | "conv_17 (Conv2D) (None, 13, 13, 512) 524288 leaky_re_lu_16[0][0] \n", 437 | "____________________________________________________________________________________________________\n", 438 | "norm_17 (BatchNormalization) (None, 13, 13, 512) 2048 conv_17[0][0] \n", 439 | 
"____________________________________________________________________________________________________\n", 440 | "leaky_re_lu_17 (LeakyReLU) (None, 13, 13, 512) 0 norm_17[0][0] \n", 441 | "____________________________________________________________________________________________________\n", 442 | "conv_18 (Conv2D) (None, 13, 13, 1024) 4718592 leaky_re_lu_17[0][0] \n", 443 | "____________________________________________________________________________________________________\n", 444 | "norm_18 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_18[0][0] \n", 445 | "____________________________________________________________________________________________________\n", 446 | "leaky_re_lu_18 (LeakyReLU) (None, 13, 13, 1024) 0 norm_18[0][0] \n", 447 | "____________________________________________________________________________________________________\n", 448 | "conv_19 (Conv2D) (None, 13, 13, 1024) 9437184 leaky_re_lu_18[0][0] \n", 449 | "____________________________________________________________________________________________________\n", 450 | "norm_19 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_19[0][0] \n", 451 | "____________________________________________________________________________________________________\n", 452 | "conv_21 (Conv2D) (None, 26, 26, 64) 32768 leaky_re_lu_13[0][0] \n", 453 | "____________________________________________________________________________________________________\n", 454 | "leaky_re_lu_19 (LeakyReLU) (None, 13, 13, 1024) 0 norm_19[0][0] \n", 455 | "____________________________________________________________________________________________________\n", 456 | "norm_21 (BatchNormalization) (None, 26, 26, 64) 256 conv_21[0][0] \n", 457 | "____________________________________________________________________________________________________\n", 458 | "conv_20 (Conv2D) (None, 13, 13, 1024) 9437184 leaky_re_lu_19[0][0] \n", 459 | "____________________________________________________________________________________________________\n", 460 | "leaky_re_lu_21 (LeakyReLU) (None, 26, 26, 64) 0 norm_21[0][0] \n", 461 | "____________________________________________________________________________________________________\n", 462 | "norm_20 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_20[0][0] \n", 463 | "____________________________________________________________________________________________________\n", 464 | "lambda_1 (Lambda) (None, 13, 13, 256) 0 leaky_re_lu_21[0][0] \n", 465 | "____________________________________________________________________________________________________\n", 466 | "leaky_re_lu_20 (LeakyReLU) (None, 13, 13, 1024) 0 norm_20[0][0] \n", 467 | "____________________________________________________________________________________________________\n", 468 | "concatenate_1 (Concatenate) (None, 13, 13, 1280) 0 lambda_1[0][0] \n", 469 | " leaky_re_lu_20[0][0] \n", 470 | "____________________________________________________________________________________________________\n", 471 | "conv_22 (Conv2D) (None, 13, 13, 1024) 11796480 concatenate_1[0][0] \n", 472 | "____________________________________________________________________________________________________\n", 473 | "norm_22 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_22[0][0] \n", 474 | "____________________________________________________________________________________________________\n", 475 | "leaky_re_lu_22 (LeakyReLU) (None, 13, 13, 1024) 0 norm_22[0][0] \n", 476 | "____________________________________________________________________________________________________\n", 
477 | "conv_23 (Conv2D) (None, 13, 13, 30) 30750 leaky_re_lu_22[0][0] \n", 478 | "____________________________________________________________________________________________________\n", 479 | "reshape_1 (Reshape) (None, 13, 13, 5, 6) 0 conv_23[0][0] \n", 480 | "____________________________________________________________________________________________________\n", 481 | "input_2 (InputLayer) (None, 1, 1, 1, 50, 4 0 \n", 482 | "____________________________________________________________________________________________________\n", 483 | "lambda_2 (Lambda) (None, 13, 13, 5, 6) 0 reshape_1[0][0] \n", 484 | " input_2[0][0] \n", 485 | "====================================================================================================\n", 486 | "Total params: 50,578,686\n", 487 | "Trainable params: 50,558,014\n", 488 | "Non-trainable params: 20,672\n", 489 | "____________________________________________________________________________________________________\n" 490 | ] 491 | } 492 | ], 493 | "source": [ 494 | "model.summary()" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": {}, 500 | "source": [ 501 | "# Load pretrained weights" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "**Load the weights originally provided by YOLO**" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 7, 514 | "metadata": { 515 | "ExecuteTime": { 516 | "end_time": "2018-04-04T00:18:58.168386", 517 | "start_time": "2018-04-04T00:18:58.110194" 518 | } 519 | }, 520 | "outputs": [], 521 | "source": [ 522 | "weight_reader = WeightReader(wt_path)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 8, 528 | "metadata": { 529 | "ExecuteTime": { 530 | "end_time": "2018-04-04T00:19:04.250579", 531 | "start_time": "2018-04-04T00:18:58.711706" 532 | } 533 | }, 534 | "outputs": [], 535 | "source": [ 536 | "weight_reader.reset()\n", 537 | "nb_conv = 23\n", 538 | "\n", 539 | "for i in range(1, nb_conv+1):\n", 540 | " conv_layer = model.get_layer('conv_' + str(i))\n", 541 | " \n", 542 | " if i < nb_conv:\n", 543 | " norm_layer = model.get_layer('norm_' + str(i))\n", 544 | " \n", 545 | " size = np.prod(norm_layer.get_weights()[0].shape)\n", 546 | "\n", 547 | " beta = weight_reader.read_bytes(size)\n", 548 | " gamma = weight_reader.read_bytes(size)\n", 549 | " mean = weight_reader.read_bytes(size)\n", 550 | " var = weight_reader.read_bytes(size)\n", 551 | "\n", 552 | " weights = norm_layer.set_weights([gamma, beta, mean, var]) \n", 553 | " \n", 554 | " if len(conv_layer.get_weights()) > 1:\n", 555 | " bias = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[1].shape))\n", 556 | " kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))\n", 557 | " kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))\n", 558 | " kernel = kernel.transpose([2,3,1,0])\n", 559 | " conv_layer.set_weights([kernel, bias])\n", 560 | " else:\n", 561 | " kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))\n", 562 | " kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))\n", 563 | " kernel = kernel.transpose([2,3,1,0])\n", 564 | " conv_layer.set_weights([kernel])" 565 | ] 566 | }, 567 | { 568 | "cell_type": "markdown", 569 | "metadata": {}, 570 | "source": [ 571 | "**Randomize weights of the last layer**" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 9, 577 | "metadata": { 578 | "ExecuteTime": { 579 | 
"end_time": "2017-11-22T14:08:00.245248Z", 580 | "start_time": "2017-11-22T14:08:00.215495Z" 581 | } 582 | }, 583 | "outputs": [], 584 | "source": [ 585 | "layer = model.layers[-4] # the last convolutional layer\n", 586 | "weights = layer.get_weights()\n", 587 | "\n", 588 | "new_kernel = np.random.normal(size=weights[0].shape)/(GRID_H*GRID_W)\n", 589 | "new_bias = np.random.normal(size=weights[1].shape)/(GRID_H*GRID_W)\n", 590 | "\n", 591 | "layer.set_weights([new_kernel, new_bias])" 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "metadata": {}, 597 | "source": [ 598 | "# Perform training" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": {}, 604 | "source": [ 605 | "**Loss function**" 606 | ] 607 | }, 608 | { 609 | "cell_type": "markdown", 610 | "metadata": { 611 | "ExecuteTime": { 612 | "end_time": "2017-02-01T20:44:50.211553", 613 | "start_time": "2017-02-01T20:44:50.206006" 614 | } 615 | }, 616 | "source": [ 617 | "$$\\begin{multline}\n", 618 | "\\lambda_\\textbf{coord}\n", 619 | "\\sum_{i = 0}^{S^2}\n", 620 | " \\sum_{j = 0}^{B}\n", 621 | " L_{ij}^{\\text{obj}}\n", 622 | " \\left[\n", 623 | " \\left(\n", 624 | " x_i - \\hat{x}_i\n", 625 | " \\right)^2 +\n", 626 | " \\left(\n", 627 | " y_i - \\hat{y}_i\n", 628 | " \\right)^2\n", 629 | " \\right]\n", 630 | "\\\\\n", 631 | "+ \\lambda_\\textbf{coord} \n", 632 | "\\sum_{i = 0}^{S^2}\n", 633 | " \\sum_{j = 0}^{B}\n", 634 | " L_{ij}^{\\text{obj}}\n", 635 | " \\left[\n", 636 | " \\left(\n", 637 | " \\sqrt{w_i} - \\sqrt{\\hat{w}_i}\n", 638 | " \\right)^2 +\n", 639 | " \\left(\n", 640 | " \\sqrt{h_i} - \\sqrt{\\hat{h}_i}\n", 641 | " \\right)^2\n", 642 | " \\right]\n", 643 | "\\\\\n", 644 | "+ \\sum_{i = 0}^{S^2}\n", 645 | " \\sum_{j = 0}^{B}\n", 646 | " L_{ij}^{\\text{obj}}\n", 647 | " \\left(\n", 648 | " C_i - \\hat{C}_i\n", 649 | " \\right)^2\n", 650 | "\\\\\n", 651 | "+ \\lambda_\\textrm{noobj}\n", 652 | "\\sum_{i = 0}^{S^2}\n", 653 | " \\sum_{j = 0}^{B}\n", 654 | " L_{ij}^{\\text{noobj}}\n", 655 | " \\left(\n", 656 | " C_i - \\hat{C}_i\n", 657 | " \\right)^2\n", 658 | "\\\\\n", 659 | "+ \\sum_{i = 0}^{S^2}\n", 660 | "L_i^{\\text{obj}}\n", 661 | " \\sum_{c \\in \\textrm{classes}}\n", 662 | " \\left(\n", 663 | " p_i(c) - \\hat{p}_i(c)\n", 664 | " \\right)^2\n", 665 | "\\end{multline}$$" 666 | ] 667 | }, 668 | { 669 | "cell_type": "code", 670 | "execution_count": 10, 671 | "metadata": { 672 | "ExecuteTime": { 673 | "end_time": "2017-11-26T12:34:28.064549Z", 674 | "start_time": "2017-11-26T12:34:27.800510Z" 675 | }, 676 | "code_folding": [] 677 | }, 678 | "outputs": [], 679 | "source": [ 680 | "def custom_loss(y_true, y_pred):\n", 681 | " mask_shape = tf.shape(y_true)[:4]\n", 682 | " \n", 683 | " cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(GRID_W), [GRID_H]), (1, GRID_H, GRID_W, 1, 1)))\n", 684 | " cell_y = tf.transpose(cell_x, (0,2,1,3,4))\n", 685 | "\n", 686 | " cell_grid = tf.tile(tf.concat([cell_x,cell_y], -1), [BATCH_SIZE, 1, 1, 5, 1])\n", 687 | " \n", 688 | " coord_mask = tf.zeros(mask_shape)\n", 689 | " conf_mask = tf.zeros(mask_shape)\n", 690 | " class_mask = tf.zeros(mask_shape)\n", 691 | " \n", 692 | " seen = tf.Variable(0.)\n", 693 | " total_recall = tf.Variable(0.)\n", 694 | " \n", 695 | " \"\"\"\n", 696 | " Adjust prediction\n", 697 | " \"\"\"\n", 698 | " ### adjust x and y \n", 699 | " pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid\n", 700 | " \n", 701 | " ### adjust w and h\n", 702 | " pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(ANCHORS, [1,1,1,BOX,2])\n", 703 | " \n", 
704 | " ### adjust confidence\n", 705 | " pred_box_conf = tf.sigmoid(y_pred[..., 4])\n", 706 | " \n", 707 | " ### adjust class probabilities\n", 708 | " pred_box_class = y_pred[..., 5:]\n", 709 | " \n", 710 | " \"\"\"\n", 711 | " Adjust ground truth\n", 712 | " \"\"\"\n", 713 | " ### adjust x and y\n", 714 | " true_box_xy = y_true[..., 0:2] # relative position to the containing cell\n", 715 | " \n", 716 | " ### adjust w and h\n", 717 | " true_box_wh = y_true[..., 2:4] # number of cells accross, horizontally and vertically\n", 718 | " \n", 719 | " ### adjust confidence\n", 720 | " true_wh_half = true_box_wh / 2.\n", 721 | " true_mins = true_box_xy - true_wh_half\n", 722 | " true_maxes = true_box_xy + true_wh_half\n", 723 | " \n", 724 | " pred_wh_half = pred_box_wh / 2.\n", 725 | " pred_mins = pred_box_xy - pred_wh_half\n", 726 | " pred_maxes = pred_box_xy + pred_wh_half \n", 727 | " \n", 728 | " intersect_mins = tf.maximum(pred_mins, true_mins)\n", 729 | " intersect_maxes = tf.minimum(pred_maxes, true_maxes)\n", 730 | " intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)\n", 731 | " intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]\n", 732 | " \n", 733 | " true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]\n", 734 | " pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]\n", 735 | "\n", 736 | " union_areas = pred_areas + true_areas - intersect_areas\n", 737 | " iou_scores = tf.truediv(intersect_areas, union_areas)\n", 738 | " \n", 739 | " true_box_conf = iou_scores * y_true[..., 4]\n", 740 | " \n", 741 | " ### adjust class probabilities\n", 742 | " true_box_class = tf.argmax(y_true[..., 5:], -1)\n", 743 | " \n", 744 | " \"\"\"\n", 745 | " Determine the masks\n", 746 | " \"\"\"\n", 747 | " ### coordinate mask: simply the position of the ground truth boxes (the predictors)\n", 748 | " coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * COORD_SCALE\n", 749 | " \n", 750 | " ### confidence mask: penelize predictors + penalize boxes with low IOU\n", 751 | " # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6\n", 752 | " true_xy = true_boxes[..., 0:2]\n", 753 | " true_wh = true_boxes[..., 2:4]\n", 754 | " \n", 755 | " true_wh_half = true_wh / 2.\n", 756 | " true_mins = true_xy - true_wh_half\n", 757 | " true_maxes = true_xy + true_wh_half\n", 758 | " \n", 759 | " pred_xy = tf.expand_dims(pred_box_xy, 4)\n", 760 | " pred_wh = tf.expand_dims(pred_box_wh, 4)\n", 761 | " \n", 762 | " pred_wh_half = pred_wh / 2.\n", 763 | " pred_mins = pred_xy - pred_wh_half\n", 764 | " pred_maxes = pred_xy + pred_wh_half \n", 765 | " \n", 766 | " intersect_mins = tf.maximum(pred_mins, true_mins)\n", 767 | " intersect_maxes = tf.minimum(pred_maxes, true_maxes)\n", 768 | " intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)\n", 769 | " intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]\n", 770 | " \n", 771 | " true_areas = true_wh[..., 0] * true_wh[..., 1]\n", 772 | " pred_areas = pred_wh[..., 0] * pred_wh[..., 1]\n", 773 | "\n", 774 | " union_areas = pred_areas + true_areas - intersect_areas\n", 775 | " iou_scores = tf.truediv(intersect_areas, union_areas)\n", 776 | "\n", 777 | " best_ious = tf.reduce_max(iou_scores, axis=4)\n", 778 | " conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * NO_OBJECT_SCALE\n", 779 | " \n", 780 | " # penalize the confidence of the boxes, which are reponsible for corresponding ground truth box\n", 781 | " conf_mask = conf_mask + y_true[..., 4] * OBJECT_SCALE\n", 782 | " \n", 
783 | " ### class mask: simply the position of the ground truth boxes (the predictors)\n", 784 | " class_mask = y_true[..., 4] * tf.gather(CLASS_WEIGHTS, true_box_class) * CLASS_SCALE \n", 785 | " \n", 786 | " \"\"\"\n", 787 | " Warm-up training\n", 788 | " \"\"\"\n", 789 | " no_boxes_mask = tf.to_float(coord_mask < COORD_SCALE/2.)\n", 790 | " seen = tf.assign_add(seen, 1.)\n", 791 | " \n", 792 | " true_box_xy, true_box_wh, coord_mask = tf.cond(tf.less(seen, WARM_UP_BATCHES), \n", 793 | " lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask, \n", 794 | " true_box_wh + tf.ones_like(true_box_wh) * np.reshape(ANCHORS, [1,1,1,BOX,2]) * no_boxes_mask, \n", 795 | " tf.ones_like(coord_mask)],\n", 796 | " lambda: [true_box_xy, \n", 797 | " true_box_wh,\n", 798 | " coord_mask])\n", 799 | " \n", 800 | " \"\"\"\n", 801 | " Finalize the loss\n", 802 | " \"\"\"\n", 803 | " nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))\n", 804 | " nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))\n", 805 | " nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))\n", 806 | " \n", 807 | " loss_xy = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2.\n", 808 | " loss_wh = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2.\n", 809 | " loss_conf = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2.\n", 810 | " loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)\n", 811 | " loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)\n", 812 | " \n", 813 | " loss = loss_xy + loss_wh + loss_conf + loss_class\n", 814 | " \n", 815 | " nb_true_box = tf.reduce_sum(y_true[..., 4])\n", 816 | " nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > 0.3))\n", 817 | "\n", 818 | " \"\"\"\n", 819 | " Debugging code\n", 820 | " \"\"\" \n", 821 | " current_recall = nb_pred_box/(nb_true_box + 1e-6)\n", 822 | " total_recall = tf.assign_add(total_recall, current_recall) \n", 823 | "\n", 824 | " loss = tf.Print(loss, [tf.zeros((1))], message='Dummy Line \\t', summarize=1000)\n", 825 | " loss = tf.Print(loss, [loss_xy], message='Loss XY \\t', summarize=1000)\n", 826 | " loss = tf.Print(loss, [loss_wh], message='Loss WH \\t', summarize=1000)\n", 827 | " loss = tf.Print(loss, [loss_conf], message='Loss Conf \\t', summarize=1000)\n", 828 | " loss = tf.Print(loss, [loss_class], message='Loss Class \\t', summarize=1000)\n", 829 | " loss = tf.Print(loss, [loss], message='Total Loss \\t', summarize=1000)\n", 830 | " loss = tf.Print(loss, [current_recall], message='Current Recall \\t', summarize=1000)\n", 831 | " loss = tf.Print(loss, [total_recall/seen], message='Average Recall \\t', summarize=1000)\n", 832 | " \n", 833 | " return loss" 834 | ] 835 | }, 836 | { 837 | "cell_type": "markdown", 838 | "metadata": {}, 839 | "source": [ 840 | "**Parse the annotations to construct train generator and validation generator**" 841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": 11, 846 | "metadata": { 847 | "ExecuteTime": { 848 | "end_time": "2017-11-26T12:38:44.283547Z", 849 | "start_time": "2017-11-26T12:38:44.277155Z" 850 | } 851 | }, 852 | "outputs": [], 853 | "source": [ 854 | "generator_config = {\n", 855 | " 'IMAGE_H' : IMAGE_H, \n", 856 | " 'IMAGE_W' : IMAGE_W,\n", 857 | " 'GRID_H' : GRID_H, \n", 858 | " 'GRID_W' : GRID_W,\n", 859 | " 'BOX' : BOX,\n", 860 | " 'LABELS' : 
LABELS,\n", 861 | " 'CLASS' : len(LABELS),\n", 862 | " 'ANCHORS' : ANCHORS,\n", 863 | " 'BATCH_SIZE' : BATCH_SIZE,\n", 864 | " 'TRUE_BOX_BUFFER' : 50,\n", 865 | "}" 866 | ] 867 | }, 868 | { 869 | "cell_type": "code", 870 | "execution_count": 12, 871 | "metadata": {}, 872 | "outputs": [], 873 | "source": [ 874 | "def normalize(image):\n", 875 | " return image / 255." 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": 13, 881 | "metadata": { 882 | "ExecuteTime": { 883 | "end_time": "2017-11-26T12:38:51.836129Z", 884 | "start_time": "2017-11-26T12:38:51.766843Z" 885 | } 886 | }, 887 | "outputs": [], 888 | "source": [ 889 | "train_imgs, seen_train_labels = parse_annotation(train_annot_folder, train_image_folder, labels=LABELS)\n", 890 | "### write parsed annotations to pickle for fast retrieval next time\n", 891 | "#with open('train_imgs', 'wb') as fp:\n", 892 | "# pickle.dump(train_imgs, fp)\n", 893 | "\n", 894 | "### read saved pickle of parsed annotations\n", 895 | "#with open ('train_imgs', 'rb') as fp:\n", 896 | "# train_imgs = pickle.load(fp)\n", 897 | "train_batch = BatchGenerator(train_imgs, generator_config, norm=normalize)\n", 898 | "\n", 899 | "valid_imgs, seen_valid_labels = parse_annotation(valid_annot_folder, valid_image_folder, labels=LABELS)\n", 900 | "### write parsed annotations to pickle for fast retrieval next time\n", 901 | "#with open('valid_imgs', 'wb') as fp:\n", 902 | "# pickle.dump(valid_imgs, fp)\n", 903 | "\n", 904 | "### read saved pickle of parsed annotations\n", 905 | "#with open ('valid_imgs', 'rb') as fp:\n", 906 | "# valid_imgs = pickle.load(fp)\n", 907 | "valid_batch = BatchGenerator(valid_imgs, generator_config, norm=normalize, jitter=False)" 908 | ] 909 | }, 910 | { 911 | "cell_type": "markdown", 912 | "metadata": {}, 913 | "source": [ 914 | "**Setup a few callbacks and start the training**" 915 | ] 916 | }, 917 | { 918 | "cell_type": "code", 919 | "execution_count": 14, 920 | "metadata": { 921 | "ExecuteTime": { 922 | "end_time": "2017-11-26T12:38:15.714460Z", 923 | "start_time": "2017-11-26T12:38:15.708674Z" 924 | }, 925 | "code_folding": [] 926 | }, 927 | "outputs": [], 928 | "source": [ 929 | "early_stop = EarlyStopping(monitor='val_loss', \n", 930 | " min_delta=0.001, \n", 931 | " patience=10, \n", 932 | " mode='min', \n", 933 | " verbose=1)\n", 934 | "\n", 935 | "checkpoint = ModelCheckpoint('weights_truck2.h5', \n", 936 | " monitor='val_loss', \n", 937 | " verbose=1, \n", 938 | " save_best_only=True, \n", 939 | " mode='min', \n", 940 | " period=1)" 941 | ] 942 | }, 943 | { 944 | "cell_type": "code", 945 | "execution_count": 15, 946 | "metadata": { 947 | "ExecuteTime": { 948 | "start_time": "2017-11-26T20:38:54.037Z" 949 | }, 950 | "scrolled": false 951 | }, 952 | "outputs": [ 953 | { 954 | "name": "stdout", 955 | "output_type": "stream", 956 | "text": [ 957 | "Epoch 1/100\n", 958 | "12/13 [==========================>...] - ETA: 4s - loss: 0.5833Epoch 00000: val_loss improved from inf to 0.47258, saving model to weights_truck2.h5\n", 959 | "13/13 [==============================] - 70s - loss: 0.5712 - val_loss: 0.4726\n", 960 | "Epoch 2/100\n", 961 | "12/13 [==========================>...] - ETA: 3s - loss: 0.5854Epoch 00001: val_loss improved from 0.47258 to 0.44968, saving model to weights_truck2.h5\n", 962 | "13/13 [==============================] - 68s - loss: 0.5872 - val_loss: 0.4497\n", 963 | "Epoch 3/100\n", 964 | "12/13 [==========================>...] 
- ETA: 3s - loss: 0.5750Epoch 00002: val_loss improved from 0.44968 to 0.43230, saving model to weights_truck2.h5\n", 965 | "13/13 [==============================] - 61s - loss: 0.5659 - val_loss: 0.4323\n", 966 | "Epoch 4/100\n", 967 | "12/13 [==========================>...] - ETA: 3s - loss: 0.5410Epoch 00003: val_loss did not improve\n", 968 | "13/13 [==============================] - 52s - loss: 0.5439 - val_loss: 0.4491\n", 969 | "Epoch 5/100\n", 970 | "12/13 [==========================>...] - ETA: 3s - loss: 0.5806Epoch 00004: val_loss did not improve\n", 971 | "13/13 [==============================] - 48s - loss: 0.5815 - val_loss: 0.4697\n", 972 | "Epoch 6/100\n", 973 | "12/13 [==========================>...] - ETA: 3s - loss: 0.6094Epoch 00005: val_loss did not improve\n", 974 | "13/13 [==============================] - 51s - loss: 0.6043 - val_loss: 0.4687\n", 975 | "Epoch 7/100\n", 976 | "12/13 [==========================>...] - ETA: 3s - loss: 0.5725Epoch 00006: val_loss did not improve\n", 977 | "13/13 [==============================] - 52s - loss: 0.5736 - val_loss: 0.4330\n", 978 | "Epoch 8/100\n", 979 | "12/13 [==========================>...] - ETA: 3s - loss: 0.5668Epoch 00007: val_loss did not improve\n", 980 | "13/13 [==============================] - 50s - loss: 0.5612 - val_loss: 0.4426\n", 981 | "Epoch 9/100\n", 982 | "12/13 [==========================>...] - ETA: 3s - loss: 0.5820Epoch 00008: val_loss did not improve\n", 983 | "13/13 [==============================] - 49s - loss: 0.5785 - val_loss: 0.4446\n", 984 | "Epoch 10/100\n", 985 | "11/13 [========================>.....] - ETA: 6s - loss: 0.5646" 986 | ] 987 | }, 988 | { 989 | "ename": "KeyboardInterrupt", 990 | "evalue": "", 991 | "output_type": "error", 992 | "traceback": [ 993 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 994 | "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", 995 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0mvalidation_steps\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalid_batch\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 23\u001b[0m \u001b[0mcallbacks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mearly_stop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcheckpoint\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtensorboard\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 24\u001b[0;31m max_queue_size = 3)\n\u001b[0m\u001b[1;32m 25\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhistory\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 996 | "\u001b[0;32m~/freightkeras/env/lib/python3.5/site-packages/keras/legacy/interfaces.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 85\u001b[0m warnings.warn('Update your `' + object_name +\n\u001b[1;32m 86\u001b[0m '` call to the Keras 2 API: ' + signature, stacklevel=2)\n\u001b[0;32m---> 87\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 88\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_original_function\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 89\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 997 | "\u001b[0;32m~/freightkeras/env/lib/python3.5/site-packages/keras/engine/training.py\u001b[0m in \u001b[0;36mfit_generator\u001b[0;34m(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)\u001b[0m\n\u001b[1;32m 2009\u001b[0m \u001b[0mbatch_index\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2010\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0msteps_done\u001b[0m \u001b[0;34m<\u001b[0m \u001b[0msteps_per_epoch\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2011\u001b[0;31m \u001b[0mgenerator_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_generator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2012\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2013\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mhasattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgenerator_output\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'__len__'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 998 | "\u001b[0;32m~/freightkeras/env/lib/python3.5/site-packages/keras/utils/data_utils.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 503\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 504\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_running\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 505\u001b[0;31m \u001b[0minputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mqueue\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mblock\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 506\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 507\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 999 | "\u001b[0;32m/usr/lib/python3.5/multiprocessing/pool.py\u001b[0m in \u001b[0;36mget\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 600\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 601\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 602\u001b[0;31m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 603\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mready\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 604\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTimeoutError\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1000 | "\u001b[0;32m/usr/lib/python3.5/multiprocessing/pool.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 597\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 598\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 599\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_event\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 600\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 601\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1001 | "\u001b[0;32m/usr/lib/python3.5/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 547\u001b[0m \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_flag\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 548\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 549\u001b[0;31m \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cond\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 550\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 551\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 1002 | "\u001b[0;32m/usr/lib/python3.5/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 293\u001b[0;31m \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 294\u001b[0m \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 1003 | "\u001b[0;31mKeyboardInterrupt\u001b[0m: " 1004 | ] 1005 | } 1006 | ], 1007 | "source": [ 1008 | 
"tb_counter = len([log for log in os.listdir(os.path.expanduser('~/logs/')) if 'truck_' in log]) + 1\n", 1009 | "tensorboard = TensorBoard(log_dir=os.path.expanduser('~/logs/') + 'truck_' + '_' + str(tb_counter), \n", 1010 | " histogram_freq=0, \n", 1011 | " write_graph=True, \n", 1012 | " write_images=False)\n", 1013 | "\n", 1014 | "optimizer = Adam(lr=0.1e-3, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)\n", 1015 | "#optimizer = SGD(lr=1e-4, decay=0.0005, momentum=0.9)\n", 1016 | "#optimizer = RMSprop(lr=1e-4, rho=0.9, epsilon=1e-08, decay=0.0)\n", 1017 | "\n", 1018 | "model.load_weights(\"wednesday2.h5\")\n", 1019 | "\n", 1020 | "model.compile(loss=custom_loss, optimizer=optimizer)\n", 1021 | "\n", 1022 | "\n", 1023 | "\n", 1024 | "#history = model.fit_generator(generator = train_batch, \n", 1025 | " steps_per_epoch = len(train_batch), \n", 1026 | " epochs = 100, \n", 1027 | " verbose = 1,\n", 1028 | " validation_data = valid_batch,\n", 1029 | " validation_steps = len(valid_batch),\n", 1030 | " callbacks = [early_stop, checkpoint, tensorboard], \n", 1031 | " max_queue_size = 3)\n", 1032 | "\n", 1033 | "#print(history.history.keys())\n", 1034 | "# summarize history for accuracy\n", 1035 | "#plt.plot(history.history['loss'])\n", 1036 | "#plt.plot(history.history['val_loss'])\n", 1037 | "#plt.title('model loss')\n", 1038 | "#plt.ylabel('loss')\n", 1039 | "#plt.xlabel('epoch')\n", 1040 | "#plt.legend(['train', 'test'], loc='upper left')\n", 1041 | "#plt.show()" 1042 | ] 1043 | }, 1044 | { 1045 | "cell_type": "markdown", 1046 | "metadata": {}, 1047 | "source": [ 1048 | "# Perform detection on image" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "code", 1053 | "execution_count": 1, 1054 | "metadata": { 1055 | "ExecuteTime": { 1056 | "end_time": "2017-11-22T14:07:49.271978Z", 1057 | "start_time": "2017-11-22T14:07:49.268999Z" 1058 | } 1059 | }, 1060 | "outputs": [ 1061 | { 1062 | "ename": "NameError", 1063 | "evalue": "name 'model' is not defined", 1064 | "output_type": "error", 1065 | "traceback": [ 1066 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 1067 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 1068 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_weights\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"best_weights.h5\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 1069 | "\u001b[0;31mNameError\u001b[0m: name 'model' is not defined" 1070 | ] 1071 | } 1072 | ], 1073 | "source": [ 1074 | "model.load_weights(\"best_weights.h5\")" 1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "code", 1079 | "execution_count": null, 1080 | "metadata": { 1081 | "ExecuteTime": { 1082 | "end_time": "2018-04-04T00:19:07.263359", 1083 | "start_time": "2018-04-04T00:19:05.658285" 1084 | }, 1085 | "scrolled": false 1086 | }, 1087 | "outputs": [], 1088 | "source": [ 1089 | "image = cv2.imread('train_image_folder/00001018.jpg')\n", 1090 | "dummy_array = np.zeros((1,1,1,1,TRUE_BOX_BUFFER,4))\n", 1091 | "\n", 1092 | "plt.figure(figsize=(10,10))\n", 1093 | "\n", 1094 | "input_image = cv2.resize(image, (416, 416))\n", 1095 | "input_image = input_image / 255.\n", 1096 | "input_image = input_image[:,:,::-1]\n", 1097 | "input_image = np.expand_dims(input_image, 0)\n", 1098 | "\n", 1099 | "netout = model.predict([input_image, dummy_array])\n", 1100 | "\n", 1101 | "boxes = decode_netout(netout[0], \n", 1102 | 
" obj_threshold=0.3,\n", 1103 | " nms_threshold=NMS_THRESHOLD,\n", 1104 | " anchors=ANCHORS, \n", 1105 | " nb_class=CLASS)\n", 1106 | " \n", 1107 | "image = draw_boxes(image, boxes, labels=LABELS)\n", 1108 | "\n", 1109 | "plt.imshow(image[:,:,::-1]); plt.show()" 1110 | ] 1111 | }, 1112 | { 1113 | "cell_type": "markdown", 1114 | "metadata": {}, 1115 | "source": [ 1116 | "# Perform detection on video" 1117 | ] 1118 | }, 1119 | { 1120 | "cell_type": "code", 1121 | "execution_count": null, 1122 | "metadata": { 1123 | "ExecuteTime": { 1124 | "end_time": "2017-10-06T13:28:28.029334Z", 1125 | "start_time": "2017-10-06T13:28:28.024662Z" 1126 | } 1127 | }, 1128 | "outputs": [], 1129 | "source": [ 1130 | "model.load_weights(\"weights_coco.h5\")\n", 1131 | "\n", 1132 | "dummy_array = np.zeros((1,1,1,1,TRUE_BOX_BUFFER,4))" 1133 | ] 1134 | }, 1135 | { 1136 | "cell_type": "code", 1137 | "execution_count": null, 1138 | "metadata": { 1139 | "ExecuteTime": { 1140 | "end_time": "2017-10-06T13:39:09.640646Z", 1141 | "start_time": "2017-10-06T13:31:44.627609Z" 1142 | } 1143 | }, 1144 | "outputs": [], 1145 | "source": [ 1146 | "video_inp = '../basic-yolo-keras/images/phnom_penh.mp4'\n", 1147 | "video_out = '../basic-yolo-keras/images/phnom_penh_bbox.mp4'\n", 1148 | "\n", 1149 | "video_reader = cv2.VideoCapture(video_inp)\n", 1150 | "\n", 1151 | "nb_frames = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))\n", 1152 | "frame_h = int(video_reader.get(cv2.CAP_PROP_FRAME_HEIGHT))\n", 1153 | "frame_w = int(video_reader.get(cv2.CAP_PROP_FRAME_WIDTH))\n", 1154 | "\n", 1155 | "video_writer = cv2.VideoWriter(video_out,\n", 1156 | " cv2.VideoWriter_fourcc(*'XVID'), \n", 1157 | " 50.0, \n", 1158 | " (frame_w, frame_h))\n", 1159 | "\n", 1160 | "for i in tqdm(range(nb_frames)):\n", 1161 | " ret, image = video_reader.read()\n", 1162 | " \n", 1163 | " input_image = cv2.resize(image, (416, 416))\n", 1164 | " input_image = input_image / 255.\n", 1165 | " input_image = input_image[:,:,::-1]\n", 1166 | " input_image = np.expand_dims(input_image, 0)\n", 1167 | "\n", 1168 | " netout = model.predict([input_image, dummy_array])\n", 1169 | "\n", 1170 | " boxes = decode_netout(netout[0], \n", 1171 | " obj_threshold=0.3,\n", 1172 | " nms_threshold=NMS_THRESHOLD,\n", 1173 | " anchors=ANCHORS, \n", 1174 | " nb_class=CLASS)\n", 1175 | " image = draw_boxes(image, boxes, labels=LABELS)\n", 1176 | "\n", 1177 | " video_writer.write(np.uint8(image))\n", 1178 | " \n", 1179 | "video_reader.release()\n", 1180 | "video_writer.release() " 1181 | ] 1182 | } 1183 | ], 1184 | "metadata": { 1185 | "anaconda-cloud": {}, 1186 | "hide_input": false, 1187 | "kernelspec": { 1188 | "display_name": "Python 3", 1189 | "language": "python", 1190 | "name": "python3" 1191 | }, 1192 | "language_info": { 1193 | "codemirror_mode": { 1194 | "name": "ipython", 1195 | "version": 3 1196 | }, 1197 | "file_extension": ".py", 1198 | "mimetype": "text/x-python", 1199 | "name": "python", 1200 | "nbconvert_exporter": "python", 1201 | "pygments_lexer": "ipython3", 1202 | "version": "3.5.2" 1203 | }, 1204 | "toc": { 1205 | "nav_menu": { 1206 | "height": "122px", 1207 | "width": "252px" 1208 | }, 1209 | "number_sections": true, 1210 | "sideBar": true, 1211 | "skip_h1_title": false, 1212 | "toc_cell": false, 1213 | "toc_position": { 1214 | "height": "758px", 1215 | "left": "0px", 1216 | "right": "1096px", 1217 | "top": "73px", 1218 | "width": "253px" 1219 | }, 1220 | "toc_section_display": "block", 1221 | "toc_window_display": true 1222 | } 1223 | }, 1224 | "nbformat": 4, 1225 | 
"nbformat_minor": 1 1226 | } 1227 | --------------------------------------------------------------------------------