├── vod-converter ├── vod_converter │ ├── __init__.py │ ├── main.py │ ├── udacity.py │ ├── kitti_tracking.py │ ├── kitti.py │ ├── voc.py │ └── converter.py ├── .gitignore ├── tests │ ├── context.py │ └── test_converter.py ├── LICENSE └── README.md ├── model_data ├── tiny_yolo_anchors.txt ├── yolo_anchors.txt ├── coco_classes.txt └── yolov3.cfg ├── 9_CLASS_test_classes.txt ├── README.md ├── voc_to_YOLOv3.py ├── yolo3 ├── utils.py └── model.py ├── oid_to_pascal_voc_xml.py ├── train.py ├── convert.py └── train_bottleneck.py /vod-converter/vod_converter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vod-converter/.gitignore: -------------------------------------------------------------------------------- 1 | .idea* 2 | .cache* 3 | kitti-tracking* -------------------------------------------------------------------------------- /model_data/tiny_yolo_anchors.txt: -------------------------------------------------------------------------------- 1 | 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 2 | -------------------------------------------------------------------------------- /model_data/yolo_anchors.txt: -------------------------------------------------------------------------------- 1 | 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 2 | -------------------------------------------------------------------------------- /9_CLASS_test_classes.txt: -------------------------------------------------------------------------------- 1 | car 2 | Van 3 | Truck 4 | person 5 | Person_sitting 6 | Cyclist 7 | Tram 8 | Misc 9 | DontCare 10 | -------------------------------------------------------------------------------- /vod-converter/tests/context.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../vod_converter'))) 4 | 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Object-distance-Estimation-and-Collision-warning 2 | A project that uses deep learning to detect objects and estimate their distance from a monocular camera. An alarm is raised when a detected object is within a distance range considered dangerous. 3 | 4 | In this project we used the TensorFlow implementation of the YOLOv3 architecture from [pythonlessons](https://github.com/pythonlessons/YOLOv3-object-detection-tutorial/tree/master/YOLOv3-custom-training). We trained our model on the [KITTI](http://www.cvlibs.net/datasets/kitti/) dataset so that the model can detect road objects.
This dataset has 9 classes which are 'car', 'Van', 'Truck','person', 'Person_sitting', 'Cyclist', 'Tram', 'Misc' and 'DontCare' 5 | -------------------------------------------------------------------------------- /model_data/coco_classes.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /vod-converter/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /voc_to_YOLOv3.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | from os import getcwd 3 | import os 4 | 5 | 6 | dataset_train = 'kitti_data/training/image_2' 7 | dataset_file = '9_CLASS_test.txt' 8 | # classes_file = dataset_file[:-4]+'_classes.txt' 9 | 10 | CLS = ["car", "Van", "Truck", "person", "Person_sitting", "Cyclist", "Tram", "Misc", "DontCare"] 11 | # classes =[dataset_train+CLASS for CLASS in CLS] 12 | wd = getcwd() 13 | 14 | 15 | def test(fullname): 16 | bb = "" 17 | in_file = open(fullname) 18 | tree=ET.parse(in_file) 19 | root = tree.getroot() 20 | for i, obj in enumerate(root.iter('object')): 21 | difficult = obj.find('difficult').text 22 | cls = obj.find('name').text 23 | if cls not in CLS or int(difficult)==1: 24 | continue 25 | cls_id = CLS.index(cls) 26 | xmlbox = obj.find('bndbox') 27 | b = (int(float(xmlbox.find('xmin').text)), int(float(xmlbox.find('ymin').text)), int(float(xmlbox.find('xmax').text)), int(float(xmlbox.find('ymax').text))) 28 | bb += (" " + ",".join([str(a) for a in b]) + ',' + str(cls_id)) 29 | 30 | # we need this because I don't know overlapping or something like that 31 | if cls == 'DontCare': 32 | list_file = open(dataset_file, 'a') 33 | file_string = str(fullname)[:-4]+'.png'+bb+'\n' 34 | list_file.write(file_string) 35 | list_file.close() 36 | bb = "" 37 | 38 | if bb != "": 39 | list_file = open(dataset_file, 'a') 40 | file_string = str(fullname)[:-4]+'.png'+bb+'\n' 41 | list_file.write(file_string) 42 | list_file.close() 43 | 44 | 45 | 46 | for filename in os.listdir(dataset_train): 47 | if not filename.endswith('.xml'): 48 | continue 49 | fullname = os.getcwd()+'/'+dataset_train+'/'+filename 50 | test(fullname) 51 | 52 | -------------------------------------------------------------------------------- /vod-converter/README.md: -------------------------------------------------------------------------------- 1 | # Visual Object Dataset converter 2 | 3 | Converts between object dataset formats. Requires Python 3.6. 4 | 5 | Example: convert from data in [KITTI](http://www.cvlibs.net/datasets/kitti/eval_object.php) format to 6 | [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/index.html) format: 7 | 8 | ``` 9 | $ python3.6 vod_converter/main.py --from kitti --from-path datasets/mydata-kitti --to voc --to-path datasets/mydata-voc 10 | ``` 11 | 12 | See `main.py` for documentation on how to easily plug in additional data formats; you can define a function 13 | that can read in your data into a common format, and it will be then ready to convert to any supported format. 14 | 15 | Similarly, you can implement a single function that takes the common format and outputs to the filesystem in 16 | your format and you will be ready to convert from e.g VOC to yours. 
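For illustration, here is a minimal sketch of such a plug-in ingestor. It assumes a hypothetical `myformat` dataset that stores JPEG images under `images/` and one space-delimited `<image_id>.txt` file per image under `labels/` (each line: `label xmin ymin xmax ymax`); the returned dicts follow `IMAGE_DETECTION_SCHEMA` from `converter.py`:

```
import os
from PIL import Image

from converter import Ingestor


class MyFormatIngestor(Ingestor):
    """Sketch of an ingestor for a hypothetical 'myformat' dataset."""

    def validate(self, path):
        for subdir in ['images', 'labels']:
            if not os.path.isdir(f"{path}/{subdir}"):
                return False, f"Expected subdirectory {subdir} within {path}"
        return True, None

    def ingest(self, path):
        image_detections = []
        for label_fname in os.listdir(f"{path}/labels"):
            image_id = label_fname.rsplit('.', 1)[0]
            image_path = f"{path}/images/{image_id}.jpg"
            with Image.open(image_path) as image:
                width, height = image.width, image.height
            detections = []
            with open(f"{path}/labels/{label_fname}") as f:
                for line in f:
                    # each line: "<label> <xmin> <ymin> <xmax> <ymax>"
                    label, x1, y1, x2, y2 = line.split()
                    detections.append({
                        'label': label,
                        'left': float(x1), 'top': float(y1),
                        'right': float(x2), 'bottom': float(y2)
                    })
            image_detections.append({
                'image': {
                    'id': image_id,
                    'path': image_path,
                    'segmented_path': None,
                    'width': width,
                    'height': height
                },
                'detections': detections
            })
        return image_detections
```

Registering it is then a one-line addition to the `INGESTORS` dict in `main.py` (e.g. `'myformat': MyFormatIngestor()`), after which `--from myformat` works like any built-in format.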
17 | 18 | Currently support conversion from: 19 | 20 | - [KITTI](http://www.cvlibs.net/datasets/kitti/eval_object.php) 21 | - [KITTI tracking](http://www.cvlibs.net/datasets/kitti/eval_tracking.php) 22 | - [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/index.html) 23 | - [Udacity CrowdAI and AUTTI](https://github.com/udacity/self-driving-car/tree/master/annotations) 24 | 25 | to: 26 | 27 | - [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/index.html) 28 | - [KITTI](http://www.cvlibs.net/datasets/kitti/eval_object.php) 29 | 30 | ## That 'train.txt' file for KITTI 31 | 32 | When reading in KITTI, the script expects a `train.txt` file that isn't part of the original dataset. This is simply a file with the name of each datapoint you wish to capture. [Here's an example with everything in the training set](https://github.com/umautobots/vod-converter/files/1139276/train.txt). You can also create it like so: 33 | 34 | ``` 35 | $ cd datasets/kitti && ls -1 training/image_2 | cut -d. -f1 > train.txt && cd - 36 | $ head datasets/kitti/train.txt 37 | 000000 38 | 000001 39 | 000002 40 | 000003 41 | 000004 42 | 000005 43 | 000006 44 | 000007 45 | 000008 46 | 000009 47 | ``` 48 | 49 | ## Python2 support 50 | 51 | This project is written using features requiring Python3.6+, but there is [a fork](https://github.com/nghiattran/vod-converter) that has been updated to work in Python2 if you need it. 52 | 53 | -------------------------------------------------------------------------------- /vod-converter/tests/test_converter.py: -------------------------------------------------------------------------------- 1 | import context # augment system path to make imports work 2 | from vod_converter import converter 3 | 4 | 5 | def test_convert_labels(): 6 | assert [{'detections': [ 7 | {'label': 'person'}, 8 | {'label': 'person'}, 9 | {'label': 'person'}, 10 | {'label': 'rhinoZaurus'} 11 | ]}] == \ 12 | converter.convert_labels( 13 | image_detections=[ 14 | {'detections': [ 15 | {'label': 'Pedestrian'}, 16 | {'label': 'pedestrian'}, 17 | {'label': 'Person'}, 18 | {'label': 'rhinoZaurus'} 19 | ]} 20 | ], 21 | expected_labels={'person': ['Pedestrian']}, 22 | select_only_known_labels=False, 23 | filter_images_without_labels=False 24 | ) 25 | 26 | 27 | def test_select_only_known_labels(): 28 | assert [{'detections': [ 29 | {'label': 'person'}, 30 | {'label': 'person'}, 31 | {'label': 'person'}, 32 | ]}] == \ 33 | converter.convert_labels( 34 | image_detections=[ 35 | {'detections': [ 36 | {'label': 'Pedestrian'}, 37 | {'label': 'pedestrian'}, 38 | {'label': 'Person'}, 39 | {'label': 'rhinoZaurus'} 40 | ]} 41 | ], 42 | expected_labels={'person': ['Pedestrian']}, 43 | select_only_known_labels=True, 44 | filter_images_without_labels=False 45 | ) 46 | 47 | 48 | def test_filter_images_without_labels(): 49 | assert [{'detections': [ 50 | {'label': 'person'}, 51 | {'label': 'person'}, 52 | {'label': 'person'}, 53 | ]}] == \ 54 | converter.convert_labels( 55 | image_detections=[ 56 | {'detections': [ 57 | {'label': 'Pedestrian'}, 58 | {'label': 'pedestrian'}, 59 | {'label': 'Person'}, 60 | {'label': 'rhinoZaurus'} 61 | ], 62 | }, 63 | {'detections': [ 64 | {'label': 'rhinoZaurus'} 65 | ] 66 | } 67 | ], 68 | expected_labels={'person': ['Pedestrian']}, 69 | select_only_known_labels=True, 70 | filter_images_without_labels=True 71 | ) 72 | -------------------------------------------------------------------------------- /vod-converter/vod_converter/main.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Converts between visual object detection dataset formats. See `converter.py` for more info. 3 | 4 | To add support for additional data formats, define a module with an `converter.Ingestor` and/or 5 | `converter.Egestor` implementation and add them to the `INGESTORS` and `EGESTORS` dicts below. 6 | """ 7 | 8 | import argparse 9 | import logging 10 | 11 | import converter 12 | import kitti 13 | import kitti_tracking 14 | import udacity 15 | import voc 16 | 17 | import sys 18 | 19 | logger = logging.getLogger() 20 | logger.setLevel(logging.INFO) 21 | 22 | INGESTORS = { 23 | 'kitti': kitti.KITTIIngestor(), 24 | 'kitti-tracking': kitti_tracking.KITTITrackingIngestor(), 25 | 'voc': voc.VOCIngestor(), 26 | 'udacity-crowdai': udacity.UdacityCrowdAIIngestor(), 27 | 'udacity-autti': udacity.UdacityAuttiIngestor() 28 | } 29 | 30 | EGESTORS = { 31 | 'voc': voc.VOCEgestor(), 32 | 'kitti': kitti.KITTIEgestor() 33 | } 34 | 35 | 36 | def main(*, from_path, from_key, to_path, to_key, select_only_known_labels, filter_images_without_labels): 37 | success, msg = converter.convert(from_path=from_path, ingestor=INGESTORS[from_key], 38 | to_path=to_path, egestor=EGESTORS[to_key], 39 | select_only_known_labels=select_only_known_labels, 40 | filter_images_without_labels=filter_images_without_labels) 41 | if success: 42 | print(f"Successfully converted from {from_key} to {to_key}.") 43 | else: 44 | print(f"Failed to convert from {from_key} to {to_key}: {msg}") 45 | return 1 46 | 47 | 48 | def parse_args(): 49 | parser = argparse.ArgumentParser(description='Convert visual object datasets.') 50 | parser._action_groups.pop() 51 | required = parser.add_argument_group('required arguments') 52 | optional = parser.add_argument_group('optional arguments') 53 | required.add_argument('--from', 54 | dest='from_key', 55 | required=True, 56 | help=f'Format to convert from: one of {", ".join(INGESTORS.keys())}', type=str) 57 | required.add_argument('--from-path', dest='from_path', 58 | required=True, 59 | help=f'Path to dataset you wish to convert.', type=str) 60 | required.add_argument('--to', dest='to_key', required=True, 61 | help=f'Format to convert to: one of {", ".join(EGESTORS.keys())}', 62 | type=str) 63 | required.add_argument( 64 | '--to-path', 65 | dest='to_path', required=True, 66 | help="Path to output directory for converted dataset.", type=str) 67 | optional.add_argument( 68 | '--select-only-known-labels', 69 | help="only include labels known to the destination dataset (e.g skip 'trafficlight' if VOC doesn't know about it)", 70 | required=False, 71 | action='store_true', 72 | default=False 73 | ) 74 | optional.add_argument( 75 | '--filter-images-without-labels', 76 | help="skip images that don't have any (known) labels", 77 | required=False, 78 | action='store_true', 79 | default=False 80 | ) 81 | 82 | args = parser.parse_args() 83 | logging.info(args) 84 | return args 85 | 86 | 87 | if __name__ == '__main__': 88 | args = parse_args() 89 | sys.exit(main(from_path=args.from_path, from_key=args.from_key, 90 | to_path=args.to_path, to_key=args.to_key, 91 | select_only_known_labels=args.select_only_known_labels, 92 | filter_images_without_labels=args.filter_images_without_labels)) 93 | -------------------------------------------------------------------------------- /yolo3/utils.py: -------------------------------------------------------------------------------- 1 | """Miscellaneous utility functions.""" 2 | 3 | from functools 
import reduce 4 | 5 | from PIL import Image 6 | import numpy as np 7 | from matplotlib.colors import rgb_to_hsv, hsv_to_rgb 8 | import cv2 9 | 10 | def compose(*funcs): 11 | """Compose arbitrarily many functions, evaluated left to right. 12 | 13 | Reference: https://mathieularose.com/function-composition-in-python/ 14 | """ 15 | # return lambda x: reduce(lambda v, f: f(v), funcs, x) 16 | if funcs: 17 | return reduce(lambda f, g: lambda *a, **kw: g(f(*a, **kw)), funcs) 18 | else: 19 | raise ValueError('Composition of empty sequence not supported.') 20 | 21 | def image_preporcess(image, target_size, gt_boxes=None): 22 | 23 | ih, iw = target_size 24 | h, w, _ = image.shape 25 | 26 | scale = min(iw/w, ih/h) 27 | nw = int(scale * w) 28 | nh = int(scale * h) 29 | 30 | image_resized = cv2.resize(image, (nw, nh), interpolation=cv2.INTER_CUBIC) 31 | 32 | image_paded = np.full(shape=[ih, iw, 3], fill_value=128.0) 33 | dw, dh = (iw - nw) // 2, (ih-nh) // 2 34 | image_paded[dh:nh+dh, dw:nw+dw, :] = image_resized 35 | 36 | image_paded = np.array(image_paded, dtype='float32') 37 | 38 | image_paded = image_paded / 255. 39 | 40 | image_paded = np.expand_dims(image_paded, axis=0) 41 | 42 | return image_paded 43 | 44 | def rand(a=0, b=1): 45 | return np.random.rand()*(b-a) + a 46 | 47 | def get_random_data(annotation_line, input_shape, random=True, max_boxes=20, jitter=.3, hue=.1, sat=1.5, val=1.5, proc_img=True): 48 | '''random preprocessing for real-time data augmentation''' 49 | line = annotation_line.split() 50 | image = Image.open(line[0]) 51 | iw, ih = image.size 52 | h, w = input_shape 53 | box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]]) 54 | 55 | if not random: 56 | # resize image 57 | scale = min(w/iw, h/ih) 58 | nw = int(iw*scale) 59 | nh = int(ih*scale) 60 | dx = (w-nw)//2 61 | dy = (h-nh)//2 62 | image_data=0 63 | if proc_img: 64 | image = image.resize((nw,nh), Image.BICUBIC) 65 | new_image = Image.new('RGB', (w,h), (128,128,128)) 66 | new_image.paste(image, (dx, dy)) 67 | image_data = np.array(new_image)/255. 68 | 69 | # correct boxes 70 | box_data = np.zeros((max_boxes,5)) 71 | if len(box)>0: 72 | np.random.shuffle(box) 73 | if len(box)>max_boxes: box = box[:max_boxes] 74 | box[:, [0,2]] = box[:, [0,2]]*scale + dx 75 | box[:, [1,3]] = box[:, [1,3]]*scale + dy 76 | box_data[:len(box)] = box 77 | 78 | return image_data, box_data 79 | 80 | # resize image 81 | new_ar = w/h * rand(1-jitter,1+jitter)/rand(1-jitter,1+jitter) 82 | scale = rand(.25, 2) 83 | if new_ar < 1: 84 | nh = int(scale*h) 85 | nw = int(nh*new_ar) 86 | else: 87 | nw = int(scale*w) 88 | nh = int(nw/new_ar) 89 | image = image.resize((nw,nh), Image.BICUBIC) 90 | 91 | # place image 92 | dx = int(rand(0, w-nw)) 93 | dy = int(rand(0, h-nh)) 94 | new_image = Image.new('RGB', (w,h), (128,128,128)) 95 | new_image.paste(image, (dx, dy)) 96 | image = new_image 97 | 98 | # flip image or not 99 | flip = rand()<.5 100 | if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT) 101 | 102 | # distort image 103 | hue = rand(-hue, hue) 104 | sat = rand(1, sat) if rand()<.5 else 1/rand(1, sat) 105 | val = rand(1, val) if rand()<.5 else 1/rand(1, val) 106 | x = rgb_to_hsv(np.array(image)/255.) 
107 | x[..., 0] += hue 108 | x[..., 0][x[..., 0]>1] -= 1 109 | x[..., 0][x[..., 0]<0] += 1 110 | x[..., 1] *= sat 111 | x[..., 2] *= val 112 | x[x>1] = 1 113 | x[x<0] = 0 114 | image_data = hsv_to_rgb(x) # numpy array, 0 to 1 115 | 116 | # correct boxes 117 | box_data = np.zeros((max_boxes,5)) 118 | if len(box)>0: 119 | np.random.shuffle(box) 120 | box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx 121 | box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy 122 | if flip: box[:, [0,2]] = w - box[:, [2,0]] 123 | box[:, 0:2][box[:, 0:2]<0] = 0 124 | box[:, 2][box[:, 2]>w] = w 125 | box[:, 3][box[:, 3]>h] = h 126 | box_w = box[:, 2] - box[:, 0] 127 | box_h = box[:, 3] - box[:, 1] 128 | box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box 129 | if len(box)>max_boxes: box = box[:max_boxes] 130 | box_data[:len(box)] = box 131 | 132 | return image_data, box_data 133 | -------------------------------------------------------------------------------- /vod-converter/vod_converter/udacity.py: -------------------------------------------------------------------------------- 1 | """ 2 | https://github.com/udacity/self-driving-car/tree/master/annotations 3 | 4 | 5 | """ 6 | 7 | import csv 8 | import glob 9 | import os 10 | from PIL import Image 11 | 12 | from collections import defaultdict 13 | 14 | 15 | from converter import Ingestor 16 | 17 | 18 | class UdacityCrowdAIIngestor(Ingestor): 19 | 20 | def validate(self, root): 21 | labels_path = f"{root}/labels.csv" 22 | if not os.path.isfile(labels_path): 23 | return False, f"Expected to find {labels_path}" 24 | return True, None 25 | 26 | def ingest(self, root): 27 | labels_path = f"{root}/labels.csv" 28 | image_labels = defaultdict(list) 29 | 30 | with open(labels_path) as labels_file: 31 | labels_csv = csv.reader(labels_file) 32 | next(labels_csv, None) # skip header 33 | for idx, row in enumerate(labels_csv): 34 | image_labels[row[4]].append(row) 35 | 36 | image_detections = [] 37 | for idx, image_path in enumerate(glob.glob(f"{root}/*.jpg")): 38 | f_name = image_path.split("/")[-1] 39 | f_image_labels = image_labels[f_name] 40 | fname_id = f_name.split('.')[0] 41 | 42 | image_width, image_height = _image_dimensions(image_path) 43 | 44 | def clamp_bbox(det): 45 | if det['right'] > image_width - 1: 46 | det['right'] = image_width - 1 47 | if det['bottom'] > image_height - 1: 48 | det['bottom'] = image_height - 1 49 | return det 50 | 51 | def valid_bbox(det): 52 | return det['right'] > det['left'] and det['bottom'] > det['top'] 53 | 54 | detections = [] 55 | 56 | for image_label in f_image_labels: 57 | x1, y1, x2, y2 = map(float, image_label[0:4]) 58 | label = image_label[5] 59 | detections.append({ 60 | 'label': label, 61 | 'left': x1, 62 | 'right': x2, 63 | 'top': y1, 64 | 'bottom': y2 65 | }) 66 | 67 | filtered_detections = [clamp_bbox(det) for det in detections if valid_bbox(det)] 68 | if filtered_detections: 69 | image_detections.append({ 70 | 'image': { 71 | 'id': fname_id, 72 | 'path': image_path, 73 | 'segmented_path': None, 74 | 'width': image_width, 75 | 'height': image_height 76 | }, 77 | 'detections': filtered_detections 78 | }) 79 | return image_detections 80 | 81 | 82 | class UdacityAuttiIngestor(Ingestor): 83 | def validate(self, root): 84 | labels_path = f"{root}/labels.csv" 85 | if not os.path.isfile(labels_path): 86 | return False, f"Expected to find {labels_path}" 87 | return True, None 88 | 89 | def ingest(self, root): 90 | labels_path = f"{root}/labels.csv" 91 | image_labels = defaultdict(list) 92 | 93 | with open(labels_path) as labels_file: 
94 | labels_csv = csv.reader(labels_file, delimiter=' ') 95 | next(labels_csv, None) # skip header 96 | for idx, row in enumerate(labels_csv): 97 | image_labels[row[0]].append(row) 98 | 99 | image_detections = [] 100 | for idx, image_path in enumerate(glob.glob(f"{root}/*.jpg")): 101 | f_name = image_path.split("/")[-1] 102 | f_image_labels = image_labels[f_name] 103 | fname_id = f_name.split('.')[0] 104 | 105 | image_width, image_height = _image_dimensions(image_path) 106 | 107 | def clamp_bbox(det): 108 | if det['right'] > image_width - 1: 109 | det['right'] = image_width - 1 110 | if det['bottom'] > image_height - 1: 111 | det['bottom'] = image_height - 1 112 | return det 113 | 114 | def valid_bbox(det): 115 | return det['right'] > det['left'] and det['bottom'] > det['top'] 116 | 117 | detections = [] 118 | 119 | for image_label in f_image_labels: 120 | x1, y1, x2, y2, x, label = image_label[1:7] 121 | x1, y1, x2, y2 = map(float, (x1, y1, x2, y2)) 122 | detections.append({ 123 | 'label': label, 124 | 'left': x1, 125 | 'right': x2, 126 | 'top': y1, 127 | 'bottom': y2 128 | }) 129 | 130 | filtered_detections = [clamp_bbox(det) for det in detections if valid_bbox(det)] 131 | if filtered_detections: 132 | image_detections.append({ 133 | 'image': { 134 | 'id': fname_id, 135 | 'path': image_path, 136 | 'segmented_path': None, 137 | 'width': image_width, 138 | 'height': image_height 139 | }, 140 | 'detections': filtered_detections 141 | }) 142 | return image_detections 143 | 144 | 145 | def _image_dimensions(path): 146 | with Image.open(path) as image: 147 | return image.width, image.height -------------------------------------------------------------------------------- /vod-converter/vod_converter/kitti_tracking.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ingestor for KITTI tracking formats. 3 | 4 | http://www.cvlibs.net/datasets/kitti/eval_tracking.php 5 | 6 | Note: even though this is for tracking instead of object detection, sometime it's helpful to convert 7 | data from this for object detection training. This reads in the left color labels. 8 | 9 | Per devkit docs: 10 | 11 | The data for training and testing can be found in the corresponding folders. 12 | The sub-folders are structured as follows: 13 | 14 | - image_02/%04d/ contains the left color camera sequence images (png) 15 | - image_03/%04d/ contains the right color camera sequence images (png) 16 | - label_02/ contains the left color camera label files (plain text files) 17 | - calib/ contains the calibration for all four cameras (plain text files) 18 | 19 | The label files contain the following information, which can be read and 20 | written using the matlab tools (readLabels.m) provided within this devkit. 21 | All values (numerical or strings) are separated via spaces, each row 22 | corresponds to one object. The 17 columns represent: 23 | 24 | #Values Name Description 25 | ---------------------------------------------------------------------------- 26 | 1 frame Frame within the sequence where the object appearers 27 | 1 track id Unique tracking id of this object within this sequence 28 | 1 type Describes the type of object: 'Car', 'Van', 'Truck', 29 | 'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram', 30 | 'Misc' or 'DontCare' 31 | 1 truncated Float from 0 (non-truncated) to 1 (truncated), where 32 | truncated refers to the object leaving image boundaries. 
33 | Truncation 2 indicates an ignored object (in particular 34 | in the beginning or end of a track) introduced by manual 35 | labeling. 36 | 1 occluded Integer (0,1,2,3) indicating occlusion state: 37 | 0 = fully visible, 1 = partly occluded 38 | 2 = largely occluded, 3 = unknown 39 | 1 alpha Observation angle of object, ranging [-pi..pi] 40 | 4 bbox 2D bounding box of object in the image (0-based index): 41 | contains left, top, right, bottom pixel coordinates 42 | 3 dimensions 3D object dimensions: height, width, length (in meters) 43 | 3 location 3D object location x,y,z in camera coordinates (in meters) 44 | 1 rotation_y Rotation ry around Y-axis in camera coordinates [-pi..pi] 45 | 1 score Only for results: Float, indicating confidence in 46 | detection, needed for p/r curves, higher is better. 47 | 48 | 49 | """ 50 | 51 | import csv 52 | from collections import defaultdict 53 | import os 54 | import re 55 | from PIL import Image 56 | 57 | from converter import Ingestor 58 | 59 | LABEL_F_PATTERN = re.compile('[0-9]+\.txt') 60 | 61 | 62 | class KITTITrackingIngestor(Ingestor): 63 | def validate(self, path): 64 | expected_dirs = [ 65 | 'image_02', 66 | 'label_02' 67 | ] 68 | for subdir in expected_dirs: 69 | if not os.path.isdir(f"{path}/{subdir}"): 70 | return False, f"Expected subdirectory {subdir} within {path}" 71 | return True, None 72 | 73 | def ingest(self, path): 74 | fs = os.listdir(f"{path}/label_02") 75 | label_fnames = [f for f in fs if LABEL_F_PATTERN.match(f)] 76 | image_detections = [] 77 | for label_fname in label_fnames: 78 | frame_name = label_fname.split(".")[0] 79 | labels_path = f"{path}/label_02/{label_fname}" 80 | images_dir = f"{path}/image_02/{frame_name}" 81 | image_detections.extend( 82 | self._get_track_image_detections(frame_name=frame_name, labels_path=labels_path, images_dir=images_dir)) 83 | return image_detections 84 | 85 | def _get_track_image_detections(self, *, frame_name, labels_path, images_dir): 86 | detections_by_frame = defaultdict(list) 87 | with open(labels_path) as f: 88 | f_csv = csv.reader(f, delimiter=' ') 89 | for row in f_csv: 90 | frame_id = int(row[0]) 91 | x1, y1, x2, y2 = map(float, row[6:10]) 92 | label = row[2] 93 | detections_by_frame[frame_id].append({ 94 | 'label': label, 95 | 'left': x1, 96 | 'right': x2, 97 | 'top': y1, 98 | 'bottom': y2 99 | }) 100 | 101 | image_detections = [] 102 | for frame_id in sorted(detections_by_frame.keys()): 103 | frame_dets = detections_by_frame[frame_id] 104 | image_path = f"{images_dir}/{frame_id:06d}.png" 105 | if not os.path.exists(image_path): 106 | image_path = f"{images_dir}/{frame_id:06d}.jpg" 107 | with Image.open(image_path) as image: 108 | image_width = image.width 109 | image_height = image.height 110 | 111 | def clamp_bbox(det): 112 | if det['right'] > image_width - 1: 113 | det['right'] = image_width - 1 114 | if det['bottom'] > image_height - 1: 115 | det['bottom'] = image_height - 1 116 | return det 117 | 118 | image_detections.append({ 119 | 'image': { 120 | 'id': f"{frame_name}-{frame_id:06d}", 121 | 'path': image_path, 122 | 'segmented_path': None, 123 | 'width': image.width, 124 | 'height': image.height 125 | }, 126 | 'detections': [clamp_bbox(det) for det in frame_dets] 127 | }) 128 | return image_detections 129 | -------------------------------------------------------------------------------- /vod-converter/vod_converter/kitti.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ingestor for KITTI formats. 
3 | 4 | http://www.cvlibs.net/datasets/kitti/eval_object.php 5 | 6 | Per devkit docs: 7 | 8 | All values (numerical or strings) are separated via spaces, 9 | each row corresponds to one object. The 15 columns represent: 10 | 11 | #Values Name Description 12 | ---------------------------------------------------------------------------- 13 | 1 type Describes the type of object: 'Car', 'Van', 'Truck', 14 | 'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram', 15 | 'Misc' or 'DontCare' 16 | 1 truncated Float from 0 (non-truncated) to 1 (truncated), where 17 | truncated refers to the object leaving image boundaries 18 | 1 occluded Integer (0,1,2,3) indicating occlusion state: 19 | 0 = fully visible, 1 = partly occluded 20 | 2 = largely occluded, 3 = unknown 21 | 1 alpha Observation angle of object, ranging [-pi..pi] 22 | 4 bbox 2D bounding box of object in the image (0-based index): 23 | contains left, top, right, bottom pixel coordinates 24 | 3 dimensions 3D object dimensions: height, width, length (in meters) 25 | 3 location 3D object location x,y,z in camera coordinates (in meters) 26 | 1 rotation_y Rotation ry around Y-axis in camera coordinates [-pi..pi] 27 | 1 score Only for results: Float, indicating confidence in 28 | detection, needed for p/r curves, higher is better. 29 | 30 | 31 | """ 32 | 33 | import csv 34 | import os 35 | from PIL import Image 36 | import shutil 37 | 38 | from converter import Ingestor, Egestor 39 | 40 | 41 | class KITTIIngestor(Ingestor): 42 | def validate(self, path): 43 | expected_dirs = [ 44 | 'training/image_2', 45 | 'training/label_2' 46 | ] 47 | for subdir in expected_dirs: 48 | if not os.path.isdir(f"{path}/{subdir}"): 49 | return False, f"Expected subdirectory {subdir} within {path}" 50 | if not os.path.isfile(f"{path}/train.txt"): 51 | return False, f"Expected train.txt file within {path}" 52 | return True, None 53 | 54 | def ingest(self, path): 55 | image_ids = self._get_image_ids(path) 56 | image_ext = 'png' 57 | if len(image_ids): 58 | first_image_id = image_ids[0] 59 | image_ext = self.find_image_ext(path, first_image_id) 60 | return [self._get_image_detection(path, image_name, image_ext=image_ext) for image_name in image_ids] 61 | 62 | def find_image_ext(self, root, image_id): 63 | for image_ext in ['png', 'jpg']: 64 | if os.path.exists(f"{root}/training/image_2/{image_id}.{image_ext}"): 65 | return image_ext 66 | raise Exception(f"could not find jpg or png for {image_id} at {root}/training/image_2") 67 | 68 | def _get_image_ids(self, root): 69 | path = f"{root}/train.txt" 70 | with open(path) as f: 71 | return f.read().strip().split('\n') 72 | 73 | def _get_image_detection(self, root, image_id, *, image_ext='png'): 74 | detections_fpath = f"{root}/training/label_2/{image_id}.txt" 75 | detections = self._get_detections(detections_fpath) 76 | detections = [det for det in detections if det['left'] < det['right'] and det['top'] < det['bottom']] 77 | image_path = f"{root}/training/image_2/{image_id}.{image_ext}" 78 | image_width, image_height = _image_dimensions(image_path) 79 | return { 80 | 'image': { 81 | 'id': image_id, 82 | 'path': image_path, 83 | 'segmented_path': None, 84 | 'width': image_width, 85 | 'height': image_height 86 | }, 87 | 'detections': detections 88 | } 89 | 90 | def _get_detections(self, detections_fpath): 91 | detections = [] 92 | with open(detections_fpath) as f: 93 | f_csv = csv.reader(f, delimiter=' ') 94 | for row in f_csv: 95 | x1, y1, x2, y2 = map(float, row[4:8]) 96 | label = row[0] 97 | detections.append({ 98 | 'label': 
label, 99 | 'left': x1, 100 | 'right': x2, 101 | 'top': y1, 102 | 'bottom': y2 103 | }) 104 | return detections 105 | 106 | 107 | def _image_dimensions(path): 108 | with Image.open(path) as image: 109 | return image.width, image.height 110 | 111 | DEFAULT_TRUNCATED = 0.0 # 0% truncated 112 | DEFAULT_OCCLUDED = 0 # fully visible 113 | 114 | class KITTIEgestor(Egestor): 115 | 116 | def expected_labels(self): 117 | return { 118 | 'Car': [], 119 | 'Cyclist': ['biker'], 120 | 'Misc': [], 121 | 'Pedestrian': ['person'], 122 | 'Person_sitting': [], 123 | 'Tram': [], 124 | 'Truck': [], 125 | 'Van': [], 126 | } 127 | 128 | def egest(self, *, image_detections, root): 129 | images_dir = f"{root}/training/image_2" 130 | os.makedirs(images_dir, exist_ok=True) 131 | labels_dir = f"{root}/training/label_2" 132 | os.makedirs(labels_dir, exist_ok=True) 133 | 134 | id_file = f"{root}/train.txt" 135 | 136 | for image_detection in image_detections: 137 | image = image_detection['image'] 138 | image_id = image['id'] 139 | src_extension = image['path'].split('.')[-1] 140 | shutil.copyfile(image['path'], f"{images_dir}/{image_id}.{src_extension}") 141 | 142 | with open(id_file, 'a') as out_image_index_file: 143 | out_image_index_file.write(f'{image_id}\n') 144 | 145 | out_labels_path = f"{labels_dir}/{image_id}.txt" 146 | with open(out_labels_path, 'w') as csvfile: 147 | csvwriter = csv.writer(csvfile, delimiter=' ', quoting=csv.QUOTE_MINIMAL) 148 | 149 | for detection in image_detection['detections']: 150 | kitti_row = [-1] * 15 151 | kitti_row[0] = detection['label'] 152 | kitti_row[1] = DEFAULT_TRUNCATED 153 | kitti_row[2] = DEFAULT_OCCLUDED 154 | x1 = detection['left'] 155 | x2 = detection['right'] 156 | y1 = detection['top'] 157 | y2 = detection['bottom'] 158 | kitti_row[4:8] = x1, y1, x2, y2 159 | csvwriter.writerow(kitti_row) 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /oid_to_pascal_voc_xml.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | from sys import exit 4 | import argparse 5 | import cv2 6 | from textwrap import dedent 7 | from lxml import etree 8 | 9 | XML_DIR = '' 10 | 11 | os.chdir('kitti_data') 12 | #os.chdir(os.path.join("OID", "Dataset")) 13 | DIRS = os.listdir(os.getcwd()) 14 | 15 | for DIR in DIRS: 16 | if os.path.isdir(DIR): 17 | os.chdir(DIR) 18 | 19 | print("Currently in Subdirectory:", DIR) 20 | CLASS_DIRS = os.listdir(os.getcwd()) 21 | for CLASS_DIR in CLASS_DIRS: 22 | if " " in CLASS_DIR: 23 | os.rename(CLASS_DIR, CLASS_DIR.replace(" ", "_")) 24 | 25 | CLASS_DIRS = os.listdir(os.getcwd()) 26 | for CLASS_DIR in CLASS_DIRS: 27 | #if " " in CLASS_DIR: 28 | # os.rename(CLASS_DIR, CLASS_DIR.replace(" ", "_")) 29 | if os.path.isdir(CLASS_DIR): 30 | os.chdir(CLASS_DIR) 31 | 32 | print("\n" + "Creating PASCAL VOC XML Files for Class:", CLASS_DIR) 33 | # Create Directory for annotations if it does not exist yet 34 | #if not os.path.exists(XML_DIR): 35 | # os.makedirs(XML_DIR) 36 | 37 | #Read Labels from OIDv4 ToolKit 38 | os.chdir("label") 39 | 40 | #Create PASCAL XML 41 | for filename in tqdm(os.listdir(os.getcwd())): 42 | if filename.endswith(".txt"): 43 | filename_str = str.split(filename, ".")[0] 44 | 45 | 46 | annotation = etree.Element("annotation") 47 | 48 | os.chdir("..") 49 | folder = etree.Element("folder") 50 | folder.text = os.path.basename(os.getcwd()) 51 | annotation.append(folder) 52 | 53 | filename_xml = etree.Element("filename") 54 | 
filename_xml.text = filename_str + ".png" 55 | annotation.append(filename_xml) 56 | 57 | path = etree.Element("path") 58 | path.text = os.path.join(os.path.dirname(os.path.abspath(filename)), filename_str + ".png") 59 | annotation.append(path) 60 | 61 | source = etree.Element("source") 62 | annotation.append(source) 63 | 64 | database = etree.Element("database") 65 | database.text = "Unknown" 66 | source.append(database) 67 | 68 | size = etree.Element("size") 69 | annotation.append(size) 70 | 71 | width = etree.Element("width") 72 | height = etree.Element("height") 73 | depth = etree.Element("depth") 74 | img = cv2.imread(filename_xml.text) 75 | 76 | try: 77 | width.text = str(img.shape[1]) 78 | except AttributeError: 79 | #os.chdir("..") 80 | os.chdir("label") 81 | continue 82 | height.text = str(img.shape[0]) 83 | depth.text = str(img.shape[2]) 84 | 85 | size.append(width) 86 | size.append(height) 87 | size.append(depth) 88 | 89 | segmented = etree.Element("segmented") 90 | segmented.text = "0" 91 | annotation.append(segmented) 92 | 93 | os.chdir("label") 94 | label_original = open(filename, 'r') 95 | 96 | # Labels from OIDv4 Toolkit: name_of_class X_min Y_min X_max Y_max 97 | for line in label_original: 98 | line = line.strip() 99 | l = line.split(' ') 100 | class_name = l[0] 101 | try: 102 | xmin_l = str(int(float(l[1]))) 103 | add1 = 0 104 | except ValueError: 105 | class_name = l[0]+"_"+l[1] 106 | add1 = 1 107 | 108 | xmin_l = str(int(float(l[1+add1]))) 109 | ymin_l = str(int(float(l[2+add1]))) 110 | xmax_l = str(int(float(l[3+add1]))) 111 | ymax_l = str(int(float(l[4+add1]))) 112 | 113 | obj = etree.Element("object") 114 | annotation.append(obj) 115 | 116 | name = etree.Element("name") 117 | name.text = class_name 118 | obj.append(name) 119 | 120 | pose = etree.Element("pose") 121 | pose.text = "Unspecified" 122 | obj.append(pose) 123 | 124 | truncated = etree.Element("truncated") 125 | truncated.text = "0" 126 | obj.append(truncated) 127 | 128 | difficult = etree.Element("difficult") 129 | difficult.text = "0" 130 | obj.append(difficult) 131 | 132 | bndbox = etree.Element("bndbox") 133 | obj.append(bndbox) 134 | 135 | xmin = etree.Element("xmin") 136 | xmin.text = xmin_l 137 | bndbox.append(xmin) 138 | 139 | ymin = etree.Element("ymin") 140 | ymin.text = ymin_l 141 | bndbox.append(ymin) 142 | 143 | xmax = etree.Element("xmax") 144 | xmax.text = xmax_l 145 | bndbox.append(xmax) 146 | 147 | ymax = etree.Element("ymax") 148 | ymax.text = ymax_l 149 | bndbox.append(ymax) 150 | 151 | os.chdir("..") 152 | 153 | #os.chdir(XML_DIR) 154 | 155 | # write xml to file 156 | s = etree.tostring(annotation, pretty_print=True) 157 | with open(filename_str + ".xml", 'wb') as f: 158 | f.write(s) 159 | f.close() 160 | 161 | #os.chdir("..") 162 | os.chdir("label") 163 | 164 | os.chdir("..") 165 | os.chdir("..") 166 | 167 | os.chdir("..") 168 | -------------------------------------------------------------------------------- /vod-converter/vod_converter/voc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ingestor and egestor for VOC formats. 
3 | 4 | http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/index.html 5 | """ 6 | 7 | import os 8 | import shutil 9 | 10 | from converter import Ingestor, Egestor 11 | import xml.etree.ElementTree as ET 12 | 13 | 14 | class VOCIngestor(Ingestor): 15 | def validate(self, root): 16 | path = f"{root}/VOC2012" 17 | for subdir in ["ImageSets", "JPEGImages", "Annotations"]: 18 | if not os.path.isdir(f"{path}/{subdir}"): 19 | return False, f"Expected subdirectory {subdir} within {path}" 20 | if not os.path.isfile(f"{path}/ImageSets/Main/trainval.txt"): 21 | return False, f"Expected main image set ImageSets/Main/trainval.txt to exist within {path}" 22 | return True, None 23 | 24 | def ingest(self, path): 25 | image_names = self._get_image_ids(path) 26 | return [self._get_image_detection(path, image_name) for image_name in image_names] 27 | 28 | def _get_image_ids(self, root): 29 | path = f"{root}/VOC2012" 30 | with open(f"{path}/ImageSets/Main/trainval.txt") as f: 31 | fnames = [] 32 | for line in f.read().strip().split('\n'): 33 | cols = line.split() 34 | if len(cols) > 1: 35 | score = cols[1] 36 | if score != '1': 37 | continue 38 | fnames.append(cols[0]) 39 | return fnames 40 | 41 | def _get_image_detection(self, root, image_id): 42 | path = f"{root}/VOC2012" 43 | image_path = f"{path}/JPEGImages/{image_id}.jpg" 44 | if not os.path.isfile(image_path): 45 | raise Exception(f"Expected {image_path} to exist.") 46 | annotation_path = f"{path}/Annotations/{image_id}.xml" 47 | if not os.path.isfile(annotation_path): 48 | raise Exception(f"Expected annotation file {annotation_path} to exist.") 49 | tree = ET.parse(annotation_path) 50 | xml_root = tree.getroot() 51 | size = xml_root.find('size') 52 | segmented = xml_root.find('segmented').text == '1' 53 | segmented_path = None 54 | if segmented: 55 | segmented_path = f"{path}/SegmentationObject/{image_id}.png" 56 | if not os.path.isfile(segmented_path): 57 | raise Exception(f"Expected segmentation file {segmented_path} to exist.") 58 | image_width = int(size.find('width').text) 59 | image_height = int(size.find('height').text) 60 | return { 61 | 'image': { 62 | 'id': image_id, 63 | 'path': image_path, 64 | 'segmented_path': segmented_path, 65 | 'width': image_width, 66 | 'height': image_height 67 | }, 68 | 'detections': [self._get_detection(node) for node in xml_root.findall('object')] 69 | } 70 | 71 | def _get_detection(self, node): 72 | bndbox = node.find('bndbox') 73 | return { 74 | 'label': node.find('name').text, 75 | 'top': float(bndbox.find('ymin').text) - 1, 76 | 'left': float(bndbox.find('xmin').text) - 1, 77 | 'right': float(bndbox.find('xmax').text) - 1, 78 | 'bottom': float(bndbox.find('ymax').text) - 1, 79 | } 80 | 81 | 82 | class VOCEgestor(Egestor): 83 | 84 | def expected_labels(self): 85 | return { 86 | 'aeroplane': [], 87 | 'bicycle': [], 88 | 'bird': [], 89 | 'boat': [], 90 | 'bottle': [], 91 | 'bus': [], 92 | 'car': [], 93 | 'cat': [], 94 | 'chair': [], 95 | 'cow': [], 96 | 'diningtable': [], 97 | 'dog': [], 98 | 'horse': [], 99 | 'motorbike': [], 100 | 'person': ['pedestrian'], 101 | 'pottedplant': [], 102 | 'sheep': [], 103 | 'sofa': [], 104 | 'train': [], 105 | 'tvmonitor': [] 106 | } 107 | 108 | def egest(self, *, image_detections, root): 109 | image_sets_path = f"{root}/VOC2012/ImageSets/Main" 110 | images_path = f"{root}/VOC2012/JPEGImages" 111 | annotations_path = f"{root}/VOC2012/Annotations" 112 | segmentations_path = f"{root}/VOC2012/SegmentationObject" 113 | segmentations_dir_created = False 114 | 115 | for to_create 
in [image_sets_path, images_path, annotations_path]: 116 | os.makedirs(to_create, exist_ok=True) 117 | 118 | for image_detection in image_detections: 119 | image = image_detection['image'] 120 | image_id = image['id'] 121 | src_extension = image['path'].split('.')[-1] 122 | shutil.copyfile(image['path'], f"{images_path}/{image_id}.{src_extension}") 123 | 124 | with open(f"{image_sets_path}/trainval.txt", 'a') as out_image_index_file: 125 | out_image_index_file.write(f'{image_id}\n') 126 | 127 | if image['segmented_path'] is not None: 128 | if not segmentations_dir_created: 129 | os.makedirs(segmentations_path) 130 | segmentations_dir_created = True 131 | shutil.copyfile(image['segmented_path'], f"{segmentations_path}/{image_id}.png") 132 | 133 | xml_root = ET.Element('annotation') 134 | add_text_node(xml_root, 'filename', f"{image_id}.{src_extension}") 135 | add_text_node(xml_root, 'folder', 'VOC2012') 136 | add_text_node(xml_root, 'segmented', int(segmentations_dir_created)) 137 | 138 | add_sub_node(xml_root, 'size', { 139 | 'depth': 3, 140 | 'width': image['width'], 141 | 'height': image['height'] 142 | }) 143 | add_sub_node(xml_root, 'source', { 144 | 'annotation': 'Dummy', 145 | 'database': 'Dummy', 146 | 'image': 'Dummy' 147 | }) 148 | 149 | for detection in image_detection['detections']: 150 | x_object = add_sub_node(xml_root, 'object', { 151 | 'name': detection['label'], 152 | 'difficult': 0, 153 | 'occluded': 0, 154 | 'truncated': 0, 155 | 'pose': 'Unspecified' 156 | }) 157 | add_sub_node(x_object, 'bndbox', { 158 | 'xmin': detection['left'] + 1, 159 | 'xmax': detection['right'] + 1, 160 | 'ymin': detection['top'] + 1, 161 | 'ymax': detection['bottom'] + 1 162 | }) 163 | 164 | ET.ElementTree(xml_root).write(f"{annotations_path}/{image_id}.xml") 165 | 166 | 167 | def add_sub_node(node, name, kvs): 168 | subnode = ET.SubElement(node, name) 169 | for k, v in kvs.items(): 170 | add_text_node(subnode, k, v) 171 | return subnode 172 | 173 | 174 | def add_text_node(node, name, text): 175 | subnode = ET.SubElement(node, name) 176 | subnode.text = f"{text}" 177 | return subnode 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /vod-converter/vod_converter/converter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Defines the protocol for converting too and from a common data format and executes 3 | the conversion, validating proper conversion along the way. 4 | 5 | For a given dataformat, e.g `voc.py`, if you wish to support reading in of your data format, define 6 | an `Ingestor` that can read in data from a path and return an array of data conforming to `IMAGE_DETECTION_SCHEMA`. 7 | 8 | If you wish to support data output, define an `Egestor` that, given an array of data of the same form, 9 | can output the data to the filesystem. 10 | 11 | See `main.py` for the supported types, and `voc.py` and `kitti.py` for reference. 12 | """ 13 | from jsonschema import validate as raw_validate 14 | from jsonschema.exceptions import ValidationError as SchemaError 15 | 16 | 17 | def validate_schema(data, schema): 18 | """Wraps default implementation but accepting tuples as arrays too. 
19 | 20 | https://github.com/Julian/jsonschema/issues/148 21 | """ 22 | return raw_validate(data, schema, types={"array": (list, tuple)}) 23 | 24 | 25 | IMAGE_SCHEMA = { 26 | 'type': 'object', 27 | 'properties': { 28 | 'id': {'type': 'string'}, 29 | 'path': {'type': 'string'}, 30 | 'segmented_path': { 31 | 'anyOf': [ 32 | {'type': 'null'}, 33 | {'type': 'string'} 34 | ] 35 | }, 36 | 'width': {'type': 'integer', 'minimum': 10}, 37 | 'height': {'type': 'integer', 'minimum': 10}, 38 | }, 39 | 'required': ['id', 'path', 'segmented_path', 'width', 'height'] 40 | } 41 | 42 | 43 | DETECTION_SCHEMA = { 44 | 'type': 'object', 45 | 'properties': { 46 | 'label': {'type': 'string'}, 47 | 'top': {'type': 'number', 'minimum': 0}, 48 | 'left': {'type': 'number', 'minimum': 0}, 49 | 'right': {'type': 'number', 'minimum': 0}, 50 | 'bottom': {'type': 'number', 'minimum': 0} 51 | }, 52 | 'required': ['top', 'left', 'right', 'bottom'] 53 | } 54 | 55 | IMAGE_DETECTION_SCHEMA = { 56 | 'type': 'object', 57 | 'properties': { 58 | 'image': IMAGE_SCHEMA, 59 | 'detections': { 60 | 'type': 'array', 61 | 'items': DETECTION_SCHEMA 62 | } 63 | } 64 | } 65 | 66 | 67 | class Ingestor: 68 | def validate(self, path): 69 | """ 70 | Validate that a path contains files / directories expected for a given data format. 71 | 72 | This is where you can provide feedback to the end user if they are attempting to convert from 73 | your format but have passed you path to a directory that is missing the expected files or directory 74 | structure. 75 | 76 | :param path: Where the data is stored 77 | :return: (sucess, error message), e.g (False, "error message") if anything is awry, (True, None) otherwise. 78 | """ 79 | return True, None 80 | 81 | def ingest(self, path): 82 | """ 83 | Read in data from the filesytem. 84 | :param path: '/path/to/data/' 85 | :return: an array of dicts conforming to `IMAGE_DETECTION_SCHEMA` 86 | """ 87 | pass 88 | 89 | 90 | class Egestor: 91 | 92 | def expected_labels(self): 93 | """ 94 | Return a dict with a key for each label generally expected by this dataset format and 95 | any aliases that should be converted. 96 | 97 | In the example below the expected labels are 'car' and 'pedestrian' and, for example, both 98 | 'Car' and 'auto' should be converted to 'car'. 99 | 100 | :return: {'car': ['Car', 'auto'], 'pedestrian': ['Person']} 101 | """ 102 | raise NotImplementedError() 103 | 104 | def egest(self, *, image_detections, root): 105 | """ 106 | Output data to the filesystem. 107 | 108 | Note: image_detections will already have any conversions specified via `expected_labels` applied 109 | by the time they are passed to this method. 110 | 111 | :param image_detections: an array of dicts conforming to `IMAGE_DETECTION_SCHEMA` 112 | :param root: '/path/to/output/data/' 113 | """ 114 | raise NotImplementedError() 115 | 116 | 117 | def convert(*, from_path, ingestor, to_path, egestor, select_only_known_labels, filter_images_without_labels): 118 | """ 119 | Converts between data formats, validating that the converted data matches 120 | `IMAGE_DETECTION_SCHEMA` along the way. 
121 | 122 | :param from_path: '/path/to/read/from' 123 | :param ingestor: `Ingestor` to read in data 124 | :param to_path: '/path/to/write/to' 125 | :param egestor: `Egestor` to write out data 126 | :return: (success, message) 127 | """ 128 | from_valid, from_msg = ingestor.validate(from_path) 129 | 130 | if not from_valid: 131 | return from_valid, from_msg 132 | 133 | image_detections = ingestor.ingest(from_path) 134 | validate_image_detections(image_detections) 135 | image_detections = convert_labels( 136 | image_detections=image_detections, expected_labels=egestor.expected_labels(), 137 | select_only_known_labels=select_only_known_labels, 138 | filter_images_without_labels=filter_images_without_labels) 139 | 140 | egestor.egest(image_detections=image_detections, root=to_path) 141 | return True, '' 142 | 143 | 144 | def validate_image_detections(image_detections): 145 | for i, image_detection in enumerate(image_detections): 146 | try: 147 | validate_schema(image_detection, IMAGE_DETECTION_SCHEMA) 148 | except SchemaError as se: 149 | raise Exception(f"at index {i}") from se 150 | image = image_detection['image'] 151 | for detection in image_detection['detections']: 152 | if detection['right'] >= image['width'] or detection['bottom'] >= image['height']: 153 | raise ValueError(f"Image {image} has out of bounds bounding box {detection}") 154 | if detection['right'] <= detection['left'] or detection['bottom'] <= detection['top']: 155 | raise ValueError(f"Image {image} has zero dimension bbox {detection}") 156 | 157 | 158 | def convert_labels(*, image_detections, expected_labels, 159 | select_only_known_labels, filter_images_without_labels): 160 | convert_dict = {} 161 | for label, aliases in expected_labels.items(): 162 | convert_dict[label.lower()] = label 163 | for alias in aliases: 164 | convert_dict[alias.lower()] = label 165 | 166 | final_image_detections = [] 167 | for image_detection in image_detections: 168 | detections = [] 169 | for detection in image_detection['detections']: 170 | label = detection['label'] 171 | fallback_label = label if not select_only_known_labels else None 172 | final_label = convert_dict.get(label.lower(), fallback_label) 173 | if final_label: 174 | detection['label'] = final_label 175 | detections.append(detection) 176 | image_detection['detections'] = detections 177 | if detections: 178 | final_image_detections.append(image_detection) 179 | elif not filter_images_without_labels: 180 | final_image_detections.append(image_detection) 181 | 182 | return final_image_detections 183 | 184 | 185 | 186 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Retrain the YOLO model for your own dataset. 
3 | """ 4 | import os 5 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 6 | 7 | import numpy as np 8 | import keras.backend as K 9 | from keras.layers import Input, Lambda 10 | from keras.models import Model 11 | from keras.optimizers import Adam 12 | from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping 13 | 14 | from yolo3.model import preprocess_true_boxes, yolo_body, tiny_yolo_body, yolo_loss 15 | from yolo3.utils import get_random_data 16 | 17 | 18 | def _main(): 19 | annotation_path = '9_CLASS_test.txt' 20 | log_dir = 'logs/000/' 21 | classes_path = '9_CLASS_test_classes.txt' 22 | anchors_path = 'model_data/yolo_anchors.txt' 23 | class_names = get_classes(classes_path) 24 | num_classes = len(class_names) 25 | anchors = get_anchors(anchors_path) 26 | 27 | input_shape = (416,416) # multiple of 32, hw 28 | 29 | is_tiny_version = len(anchors)==6 # default setting 30 | if is_tiny_version: 31 | model = create_tiny_model(input_shape, anchors, num_classes, 32 | freeze_body=2, weights_path='model_data/yolo_weights.h5') 33 | else: 34 | model = create_model(input_shape, anchors, num_classes, freeze_body=2, weights_path='model_data/yolo_weights.h5') # make sure you know what you freeze 35 | 36 | 37 | logging = TensorBoard(log_dir=log_dir) 38 | checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5', 39 | monitor='val_loss', save_weights_only=True, save_best_only=True, period=3) 40 | reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1) 41 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1) 42 | 43 | val_split = 0.1 44 | with open(annotation_path) as f: 45 | lines = f.readlines() 46 | np.random.shuffle(lines) 47 | num_val = int(len(lines)*val_split) 48 | num_train = len(lines) - num_val 49 | 50 | # Train with frozen layers first, to get a stable loss. 51 | # Adjust num epochs to your dataset. This step is enough to obtain a not bad model. 52 | if True: 53 | model.compile(optimizer=Adam(lr=1e-3), loss={ 54 | # use custom yolo_loss Lambda layer. 55 | 'yolo_loss': lambda y_true, y_pred: y_pred}) 56 | 57 | batch_size = 32 58 | print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size)) 59 | model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes), 60 | steps_per_epoch=max(1, num_train//batch_size), 61 | validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes), 62 | validation_steps=max(1, num_val//batch_size), 63 | epochs=50, 64 | initial_epoch=0, 65 | callbacks=[logging, checkpoint]) 66 | model.save_weights(log_dir + 'trained_weights_stage_1.h5') 67 | 68 | # Unfreeze and continue training, to fine-tune. 69 | # Train longer if the result is not good. 
70 | if True: 71 | for i in range(len(model.layers)): 72 | model.layers[i].trainable = True 73 | model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred}) # recompile to apply the change 74 | print('Unfreeze all of the layers.') 75 | 76 | batch_size = 8 # note that more GPU memory is required after unfreezing the body 77 | print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size)) 78 | model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes), 79 | steps_per_epoch=max(1, num_train//batch_size), 80 | validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes), 81 | validation_steps=max(1, num_val//batch_size), 82 | epochs=100, 83 | initial_epoch=50, 84 | callbacks=[logging, checkpoint, reduce_lr, early_stopping]) 85 | model.save_weights(log_dir + 'trained_weights_final.h5') 86 | 87 | # Further training if needed. 88 | 89 | 90 | def get_classes(classes_path): 91 | '''loads the classes''' 92 | with open(classes_path) as f: 93 | class_names = f.readlines() 94 | class_names = [c.strip() for c in class_names] 95 | return class_names 96 | 97 | def get_anchors(anchors_path): 98 | '''loads the anchors from a file''' 99 | with open(anchors_path) as f: 100 | anchors = f.readline() 101 | anchors = [float(x) for x in anchors.split(',')] 102 | return np.array(anchors).reshape(-1, 2) 103 | 104 | 105 | def create_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2, 106 | weights_path='model_data/yolo_weights.h5'): 107 | '''create the training model''' 108 | K.clear_session() # get a new session 109 | image_input = Input(shape=(None, None, 3)) 110 | h, w = input_shape 111 | num_anchors = len(anchors) 112 | 113 | y_true = [Input(shape=(h//{0:32, 1:16, 2:8}[l], w//{0:32, 1:16, 2:8}[l], \ 114 | num_anchors//3, num_classes+5)) for l in range(3)] 115 | 116 | model_body = yolo_body(image_input, num_anchors//3, num_classes) 117 | print('Create YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes)) 118 | 119 | if load_pretrained: 120 | model_body.load_weights(weights_path, by_name=True, skip_mismatch=True) 121 | print('Load weights {}.'.format(weights_path)) 122 | if freeze_body in [1, 2]: 123 | # Freeze darknet53 body or freeze all but 3 output layers. 
124 | num = (185, len(model_body.layers)-3)[freeze_body-1] 125 | for i in range(num): model_body.layers[i].trainable = False 126 | print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers))) 127 | 128 | model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss', 129 | arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.5})( 130 | [*model_body.output, *y_true]) 131 | model = Model([model_body.input, *y_true], model_loss) 132 | 133 | return model 134 | 135 | def create_tiny_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2, 136 | weights_path='model_data/tiny_yolo_weights.h5'): 137 | '''create the training model, for Tiny YOLOv3''' 138 | K.clear_session() # get a new session 139 | image_input = Input(shape=(None, None, 3)) 140 | h, w = input_shape 141 | num_anchors = len(anchors) 142 | 143 | y_true = [Input(shape=(h//{0:32, 1:16}[l], w//{0:32, 1:16}[l], \ 144 | num_anchors//2, num_classes+5)) for l in range(2)] 145 | 146 | model_body = tiny_yolo_body(image_input, num_anchors//2, num_classes) 147 | print('Create Tiny YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes)) 148 | 149 | if load_pretrained: 150 | model_body.load_weights(weights_path, by_name=True, skip_mismatch=True) 151 | print('Load weights {}.'.format(weights_path)) 152 | if freeze_body in [1, 2]: 153 | # Freeze the darknet body or freeze all but 2 output layers. 154 | num = (20, len(model_body.layers)-2)[freeze_body-1] 155 | for i in range(num): model_body.layers[i].trainable = False 156 | print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers))) 157 | 158 | model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss', 159 | arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.7})( 160 | [*model_body.output, *y_true]) 161 | model = Model([model_body.input, *y_true], model_loss) 162 | 163 | return model 164 | 165 | def data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes): 166 | '''data generator for fit_generator''' 167 | n = len(annotation_lines) 168 | i = 0 169 | while True: 170 | image_data = [] 171 | box_data = [] 172 | for b in range(batch_size): 173 | if i==0: 174 | np.random.shuffle(annotation_lines) 175 | image, box = get_random_data(annotation_lines[i], input_shape, random=True) 176 | image_data.append(image) 177 | box_data.append(box) 178 | i = (i+1) % n 179 | image_data = np.array(image_data) 180 | box_data = np.array(box_data) 181 | y_true = preprocess_true_boxes(box_data, input_shape, anchors, num_classes) 182 | yield [image_data, *y_true], np.zeros(batch_size) 183 | 184 | def data_generator_wrapper(annotation_lines, batch_size, input_shape, anchors, num_classes): 185 | n = len(annotation_lines) 186 | if n==0 or batch_size<=0: return None 187 | return data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes) 188 | 189 | if __name__ == '__main__': 190 | _main() 191 | -------------------------------------------------------------------------------- /convert.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Reads Darknet config and weights and creates Keras model with TF backend. 
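Typical invocation (the Darknet weights file name is illustrative):
    python convert.py model_data/yolov3.cfg yolov3.weights model_data/yolo_weights.h5
Use -w to save only the weights, or -p to also save a plot of the model.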
4 | 5 | """ 6 | 7 | import argparse 8 | import configparser 9 | import io 10 | import os 11 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 12 | from collections import defaultdict 13 | 14 | import numpy as np 15 | from keras import backend as K 16 | from keras.layers import (Conv2D, Input, ZeroPadding2D, Add, 17 | UpSampling2D, MaxPooling2D, Concatenate) 18 | from keras.layers.advanced_activations import LeakyReLU 19 | from keras.layers.normalization import BatchNormalization 20 | from keras.models import Model 21 | from keras.regularizers import l2 22 | from keras.utils.vis_utils import plot_model as plot 23 | 24 | 25 | parser = argparse.ArgumentParser(description='Darknet To Keras Converter.') 26 | parser.add_argument('config_path', help='Path to Darknet cfg file.') 27 | parser.add_argument('weights_path', help='Path to Darknet weights file.') 28 | parser.add_argument('output_path', help='Path to output Keras model file.') 29 | parser.add_argument( 30 | '-p', 31 | '--plot_model', 32 | help='Plot generated Keras model and save as image.', 33 | action='store_true') 34 | parser.add_argument( 35 | '-w', 36 | '--weights_only', 37 | help='Save as Keras weights file instead of model file.', 38 | action='store_true') 39 | 40 | def unique_config_sections(config_file): 41 | """Convert all config sections to have unique names. 42 | 43 | Adds unique suffixes to config sections for compability with configparser. 44 | """ 45 | section_counters = defaultdict(int) 46 | output_stream = io.StringIO() 47 | with open(config_file) as fin: 48 | for line in fin: 49 | if line.startswith('['): 50 | section = line.strip().strip('[]') 51 | _section = section + '_' + str(section_counters[section]) 52 | section_counters[section] += 1 53 | line = line.replace(section, _section) 54 | output_stream.write(line) 55 | output_stream.seek(0) 56 | return output_stream 57 | 58 | # %% 59 | def _main(args): 60 | config_path = os.path.expanduser(args.config_path) 61 | weights_path = os.path.expanduser(args.weights_path) 62 | assert config_path.endswith('.cfg'), '{} is not a .cfg file'.format( 63 | config_path) 64 | assert weights_path.endswith( 65 | '.weights'), '{} is not a .weights file'.format(weights_path) 66 | 67 | output_path = os.path.expanduser(args.output_path) 68 | assert output_path.endswith( 69 | '.h5'), 'output path {} is not a .h5 file'.format(output_path) 70 | output_root = os.path.splitext(output_path)[0] 71 | 72 | # Load weights and config. 
73 | print('Loading weights.') 74 | weights_file = open(weights_path, 'rb') 75 | major, minor, revision = np.ndarray( 76 | shape=(3, ), dtype='int32', buffer=weights_file.read(12)) 77 | if (major*10+minor)>=2 and major<1000 and minor<1000: 78 | seen = np.ndarray(shape=(1,), dtype='int64', buffer=weights_file.read(8)) 79 | else: 80 | seen = np.ndarray(shape=(1,), dtype='int32', buffer=weights_file.read(4)) 81 | print('Weights Header: ', major, minor, revision, seen) 82 | 83 | print('Parsing Darknet config.') 84 | unique_config_file = unique_config_sections(config_path) 85 | cfg_parser = configparser.ConfigParser() 86 | cfg_parser.read_file(unique_config_file) 87 | 88 | print('Creating Keras model.') 89 | input_layer = Input(shape=(None, None, 3)) 90 | prev_layer = input_layer 91 | all_layers = [] 92 | 93 | weight_decay = float(cfg_parser['net_0']['decay'] 94 | ) if 'net_0' in cfg_parser.sections() else 5e-4 95 | count = 0 96 | out_index = [] 97 | for section in cfg_parser.sections(): 98 | print('Parsing section {}'.format(section)) 99 | if section.startswith('convolutional'): 100 | filters = int(cfg_parser[section]['filters']) 101 | size = int(cfg_parser[section]['size']) 102 | stride = int(cfg_parser[section]['stride']) 103 | pad = int(cfg_parser[section]['pad']) 104 | activation = cfg_parser[section]['activation'] 105 | batch_normalize = 'batch_normalize' in cfg_parser[section] 106 | 107 | padding = 'same' if pad == 1 and stride == 1 else 'valid' 108 | 109 | # Setting weights. 110 | # Darknet serializes convolutional weights as: 111 | # [bias/beta, [gamma, mean, variance], conv_weights] 112 | prev_layer_shape = K.int_shape(prev_layer) 113 | 114 | weights_shape = (size, size, prev_layer_shape[-1], filters) 115 | darknet_w_shape = (filters, weights_shape[2], size, size) 116 | weights_size = np.product(weights_shape) 117 | 118 | print('conv2d', 'bn' 119 | if batch_normalize else ' ', activation, weights_shape) 120 | 121 | conv_bias = np.ndarray( 122 | shape=(filters, ), 123 | dtype='float32', 124 | buffer=weights_file.read(filters * 4)) 125 | count += filters 126 | 127 | if batch_normalize: 128 | bn_weights = np.ndarray( 129 | shape=(3, filters), 130 | dtype='float32', 131 | buffer=weights_file.read(filters * 12)) 132 | count += 3 * filters 133 | 134 | bn_weight_list = [ 135 | bn_weights[0], # scale gamma 136 | conv_bias, # shift beta 137 | bn_weights[1], # running mean 138 | bn_weights[2] # running var 139 | ] 140 | 141 | conv_weights = np.ndarray( 142 | shape=darknet_w_shape, 143 | dtype='float32', 144 | buffer=weights_file.read(weights_size * 4)) 145 | count += weights_size 146 | 147 | # DarkNet conv_weights are serialized Caffe-style: 148 | # (out_dim, in_dim, height, width) 149 | # We would like to set these to Tensorflow order: 150 | # (height, width, in_dim, out_dim) 151 | conv_weights = np.transpose(conv_weights, [2, 3, 1, 0]) 152 | conv_weights = [conv_weights] if batch_normalize else [ 153 | conv_weights, conv_bias 154 | ] 155 | 156 | # Handle activation. 157 | act_fn = None 158 | if activation == 'leaky': 159 | pass # Add advanced activation later. 
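            # 'leaky' is applied later: a LeakyReLU(alpha=0.1) layer is appended
            # after the Conv2D (and optional BatchNormalization) created below,
            # which is why this branch only passes here.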
160 | elif activation != 'linear': 161 | raise ValueError( 162 | 'Unknown activation function `{}` in section {}'.format( 163 | activation, section)) 164 | 165 | # Create Conv2D layer 166 | if stride>1: 167 | # Darknet uses left and top padding instead of 'same' mode 168 | prev_layer = ZeroPadding2D(((1,0),(1,0)))(prev_layer) 169 | conv_layer = (Conv2D( 170 | filters, (size, size), 171 | strides=(stride, stride), 172 | kernel_regularizer=l2(weight_decay), 173 | use_bias=not batch_normalize, 174 | weights=conv_weights, 175 | activation=act_fn, 176 | padding=padding))(prev_layer) 177 | 178 | if batch_normalize: 179 | conv_layer = (BatchNormalization( 180 | weights=bn_weight_list))(conv_layer) 181 | prev_layer = conv_layer 182 | 183 | if activation == 'linear': 184 | all_layers.append(prev_layer) 185 | elif activation == 'leaky': 186 | act_layer = LeakyReLU(alpha=0.1)(prev_layer) 187 | prev_layer = act_layer 188 | all_layers.append(act_layer) 189 | 190 | elif section.startswith('route'): 191 | ids = [int(i) for i in cfg_parser[section]['layers'].split(',')] 192 | layers = [all_layers[i] for i in ids] 193 | if len(layers) > 1: 194 | print('Concatenating route layers:', layers) 195 | concatenate_layer = Concatenate()(layers) 196 | all_layers.append(concatenate_layer) 197 | prev_layer = concatenate_layer 198 | else: 199 | skip_layer = layers[0] # only one layer to route 200 | all_layers.append(skip_layer) 201 | prev_layer = skip_layer 202 | 203 | elif section.startswith('maxpool'): 204 | size = int(cfg_parser[section]['size']) 205 | stride = int(cfg_parser[section]['stride']) 206 | all_layers.append( 207 | MaxPooling2D( 208 | pool_size=(size, size), 209 | strides=(stride, stride), 210 | padding='same')(prev_layer)) 211 | prev_layer = all_layers[-1] 212 | 213 | elif section.startswith('shortcut'): 214 | index = int(cfg_parser[section]['from']) 215 | activation = cfg_parser[section]['activation'] 216 | assert activation == 'linear', 'Only linear activation supported.' 217 | all_layers.append(Add()([all_layers[index], prev_layer])) 218 | prev_layer = all_layers[-1] 219 | 220 | elif section.startswith('upsample'): 221 | stride = int(cfg_parser[section]['stride']) 222 | assert stride == 2, 'Only stride=2 supported.' 223 | all_layers.append(UpSampling2D(stride)(prev_layer)) 224 | prev_layer = all_layers[-1] 225 | 226 | elif section.startswith('yolo'): 227 | out_index.append(len(all_layers)-1) 228 | all_layers.append(None) 229 | prev_layer = all_layers[-1] 230 | 231 | elif section.startswith('net'): 232 | pass 233 | 234 | else: 235 | raise ValueError( 236 | 'Unsupported section header type: {}'.format(section)) 237 | 238 | # Create and save model. 239 | if len(out_index)==0: out_index.append(len(all_layers)-1) 240 | model = Model(inputs=input_layer, outputs=[all_layers[i] for i in out_index]) 241 | print(model.summary()) 242 | if args.weights_only: 243 | model.save_weights('{}'.format(output_path)) 244 | print('Saved Keras weights to {}'.format(output_path)) 245 | else: 246 | model.save('{}'.format(output_path)) 247 | print('Saved Keras model to {}'.format(output_path)) 248 | 249 | # Check to see if all weights have been read. 
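    # Whatever is still left in the file is unread float32 parameters, so the
    # remaining byte count divided by 4 gives the number of unused weights.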
250 | remaining_weights = len(weights_file.read()) / 4 251 | weights_file.close() 252 | print('Read {} of {} from Darknet weights.'.format(count, count + 253 | remaining_weights)) 254 | if remaining_weights > 0: 255 | print('Warning: {} unused weights'.format(remaining_weights)) 256 | 257 | if args.plot_model: 258 | plot(model, to_file='{}.png'.format(output_root), show_shapes=True) 259 | print('Saved model plot to {}.png'.format(output_root)) 260 | 261 | 262 | if __name__ == '__main__': 263 | _main(parser.parse_args()) 264 | -------------------------------------------------------------------------------- /train_bottleneck.py: -------------------------------------------------------------------------------- 1 | """ 2 | Retrain the YOLO model for your own dataset. 3 | """ 4 | import os 5 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 6 | 7 | import numpy as np 8 | import keras.backend as K 9 | from keras.layers import Input, Lambda 10 | from keras.models import Model 11 | from keras.optimizers import Adam 12 | from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping 13 | 14 | from yolo3.model import preprocess_true_boxes, yolo_body, tiny_yolo_body, yolo_loss 15 | from yolo3.utils import get_random_data 16 | 17 | 18 | def _main(): 19 | annotation_path = '4_CLASS_test.txt' 20 | log_dir = 'logs/' 21 | classes_path = '4_CLASS_test_classes.txt' 22 | anchors_path = 'model_data/yolo_anchors.txt' 23 | class_names = get_classes(classes_path) 24 | num_classes = len(class_names) 25 | anchors = get_anchors(anchors_path) 26 | 27 | input_shape = (416,416) # multiple of 32, hw 28 | 29 | model, bottleneck_model, last_layer_model = create_model(input_shape, anchors, num_classes, 30 | freeze_body=2, weights_path='model_data/yolo_weights.h5') # make sure you know what you freeze 31 | 32 | logging = TensorBoard(log_dir=log_dir) 33 | checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5', 34 | monitor='val_loss', save_weights_only=True, save_best_only=True, period=10) 35 | reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1) 36 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1) 37 | 38 | val_split = 0.1 39 | with open(annotation_path) as f: 40 | lines = f.readlines() 41 | #np.random.seed(10101) 42 | np.random.shuffle(lines) 43 | #np.random.seed(None) 44 | num_val = int(len(lines)*val_split) 45 | num_train = len(lines) - num_val 46 | 47 | # Train with frozen layers first, to get a stable loss. 48 | # Adjust num epochs to your dataset. This step is enough to obtain a not bad model. 
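    # Bottleneck training: while the body is frozen its outputs per image never
    # change, so they are predicted once with bottleneck_model, cached in
    # bottlenecks.npz, and the last layers are first trained on those cached
    # features before the usual image-based stage-1/stage-2 training below.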
49 | if True: 50 | # perform bottleneck training 51 | if not os.path.isfile("bottlenecks.npz"): 52 | print("calculating bottlenecks") 53 | batch_size=32 54 | bottlenecks=bottleneck_model.predict_generator(data_generator_wrapper(lines, batch_size, input_shape, anchors, num_classes, random=False, verbose=True), 55 | steps=(len(lines)//batch_size)+1, max_queue_size=1) 56 | np.savez("bottlenecks.npz", bot0=bottlenecks[0], bot1=bottlenecks[1], bot2=bottlenecks[2]) 57 | 58 | # load bottleneck features from file 59 | dict_bot=np.load("bottlenecks.npz") 60 | bottlenecks_train=[dict_bot["bot0"][:num_train], dict_bot["bot1"][:num_train], dict_bot["bot2"][:num_train]] 61 | bottlenecks_val=[dict_bot["bot0"][num_train:], dict_bot["bot1"][num_train:], dict_bot["bot2"][num_train:]] 62 | 63 | # train last layers with fixed bottleneck features 64 | batch_size=32 65 | print("Training last layers with bottleneck features") 66 | print('with {} samples, val on {} samples and batch size {}.'.format(num_train, num_val, batch_size)) 67 | last_layer_model.compile(optimizer='adam', loss={'yolo_loss': lambda y_true, y_pred: y_pred}) 68 | last_layer_model.fit_generator(bottleneck_generator(lines[:num_train], batch_size, input_shape, anchors, num_classes, bottlenecks_train), 69 | steps_per_epoch=max(1, num_train//batch_size), 70 | validation_data=bottleneck_generator(lines[num_train:], batch_size, input_shape, anchors, num_classes, bottlenecks_val), 71 | validation_steps=max(1, num_val//batch_size), 72 | epochs=90, 73 | initial_epoch=0, max_queue_size=1) 74 | model.save_weights(log_dir + 'trained_weights_stage_0.h5') 75 | 76 | # train last layers with random augmented data 77 | model.compile(optimizer=Adam(lr=1e-3), loss={ 78 | # use custom yolo_loss Lambda layer. 79 | 'yolo_loss': lambda y_true, y_pred: y_pred}) 80 | batch_size = 32 81 | print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size)) 82 | model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes), 83 | steps_per_epoch=max(1, num_train//batch_size), 84 | validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes), 85 | validation_steps=max(1, num_val//batch_size), 86 | epochs=150, 87 | initial_epoch=0, 88 | callbacks=[logging, checkpoint]) 89 | model.save_weights(log_dir + 'trained_weights_stage_1.h5') 90 | 91 | # Unfreeze and continue training, to fine-tune. 92 | # Train longer if the result is not good. 93 | if True: 94 | for i in range(len(model.layers)): 95 | model.layers[i].trainable = True 96 | model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred}) # recompile to apply the change 97 | print('Unfreeze all of the layers.') 98 | 99 | batch_size = 4 # note that more GPU memory is required after unfreezing the body 100 | print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size)) 101 | model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes), 102 | steps_per_epoch=max(1, num_train//batch_size), 103 | validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes), 104 | validation_steps=max(1, num_val//batch_size), 105 | epochs=300, 106 | initial_epoch=150, 107 | callbacks=[logging, checkpoint, reduce_lr, early_stopping]) 108 | model.save_weights(log_dir + 'trained_weights_final.h5') 109 | 110 | # Further training if needed. 
111 | 112 | 113 | def get_classes(classes_path): 114 | '''loads the classes''' 115 | with open(classes_path) as f: 116 | class_names = f.readlines() 117 | class_names = [c.strip() for c in class_names] 118 | return class_names 119 | 120 | def get_anchors(anchors_path): 121 | '''loads the anchors from a file''' 122 | with open(anchors_path) as f: 123 | anchors = f.readline() 124 | anchors = [float(x) for x in anchors.split(',')] 125 | return np.array(anchors).reshape(-1, 2) 126 | 127 | 128 | def create_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2, 129 | weights_path='model_data/yolo_weights.h5'): 130 | '''create the training model''' 131 | K.clear_session() # get a new session 132 | image_input = Input(shape=(None, None, 3)) 133 | h, w = input_shape 134 | num_anchors = len(anchors) 135 | 136 | y_true = [Input(shape=(h//{0:32, 1:16, 2:8}[l], w//{0:32, 1:16, 2:8}[l], \ 137 | num_anchors//3, num_classes+5)) for l in range(3)] 138 | 139 | model_body = yolo_body(image_input, num_anchors//3, num_classes) 140 | print('Create YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes)) 141 | 142 | if load_pretrained: 143 | model_body.load_weights(weights_path, by_name=True, skip_mismatch=True) 144 | print('Load weights {}.'.format(weights_path)) 145 | if freeze_body in [1, 2]: 146 | # Freeze darknet53 body or freeze all but 3 output layers. 147 | num = (185, len(model_body.layers)-3)[freeze_body-1] 148 | for i in range(num): model_body.layers[i].trainable = False 149 | print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers))) 150 | 151 | # get output of second last layers and create bottleneck model of it 152 | out1=model_body.layers[246].output 153 | out2=model_body.layers[247].output 154 | out3=model_body.layers[248].output 155 | bottleneck_model = Model([model_body.input, *y_true], [out1, out2, out3]) 156 | 157 | # create last layer model of last layers from yolo model 158 | in0 = Input(shape=bottleneck_model.output[0].shape[1:].as_list()) 159 | in1 = Input(shape=bottleneck_model.output[1].shape[1:].as_list()) 160 | in2 = Input(shape=bottleneck_model.output[2].shape[1:].as_list()) 161 | last_out0=model_body.layers[249](in0) 162 | last_out1=model_body.layers[250](in1) 163 | last_out2=model_body.layers[251](in2) 164 | model_last=Model(inputs=[in0, in1, in2], outputs=[last_out0, last_out1, last_out2]) 165 | model_loss_last =Lambda(yolo_loss, output_shape=(1,), name='yolo_loss', 166 | arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.5})( 167 | [*model_last.output, *y_true]) 168 | last_layer_model = Model([in0,in1,in2, *y_true], model_loss_last) 169 | 170 | 171 | model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss', 172 | arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.5})( 173 | [*model_body.output, *y_true]) 174 | model = Model([model_body.input, *y_true], model_loss) 175 | 176 | return model, bottleneck_model, last_layer_model 177 | 178 | def data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes, random=True, verbose=False): 179 | '''data generator for fit_generator''' 180 | n = len(annotation_lines) 181 | i = 0 182 | while True: 183 | image_data = [] 184 | box_data = [] 185 | for b in range(batch_size): 186 | if i==0 and random: 187 | np.random.shuffle(annotation_lines) 188 | image, box = get_random_data(annotation_lines[i], input_shape, random=random) 189 | image_data.append(image) 190 | box_data.append(box) 191 | i 
= (i+1) % n 192 | image_data = np.array(image_data) 193 | if verbose: 194 | print("Progress: ",i,"/",n) 195 | box_data = np.array(box_data) 196 | y_true = preprocess_true_boxes(box_data, input_shape, anchors, num_classes) 197 | yield [image_data, *y_true], np.zeros(batch_size) 198 | 199 | def data_generator_wrapper(annotation_lines, batch_size, input_shape, anchors, num_classes, random=True, verbose=False): 200 | n = len(annotation_lines) 201 | if n==0 or batch_size<=0: return None 202 | return data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes, random, verbose) 203 | 204 | def bottleneck_generator(annotation_lines, batch_size, input_shape, anchors, num_classes, bottlenecks): 205 | n = len(annotation_lines) 206 | i = 0 207 | while True: 208 | box_data = [] 209 | b0=np.zeros((batch_size,bottlenecks[0].shape[1],bottlenecks[0].shape[2],bottlenecks[0].shape[3])) 210 | b1=np.zeros((batch_size,bottlenecks[1].shape[1],bottlenecks[1].shape[2],bottlenecks[1].shape[3])) 211 | b2=np.zeros((batch_size,bottlenecks[2].shape[1],bottlenecks[2].shape[2],bottlenecks[2].shape[3])) 212 | for b in range(batch_size): 213 | _, box = get_random_data(annotation_lines[i], input_shape, random=False, proc_img=False) 214 | box_data.append(box) 215 | b0[b]=bottlenecks[0][i] 216 | b1[b]=bottlenecks[1][i] 217 | b2[b]=bottlenecks[2][i] 218 | i = (i+1) % n 219 | box_data = np.array(box_data) 220 | y_true = preprocess_true_boxes(box_data, input_shape, anchors, num_classes) 221 | yield [b0, b1, b2, *y_true], np.zeros(batch_size) 222 | 223 | if __name__ == '__main__': 224 | _main() 225 | -------------------------------------------------------------------------------- /model_data/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 
114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | 
activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | 
batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .5 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .5 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | 
pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .5 787 | truth_thresh = 1 788 | random=1 789 | 790 | -------------------------------------------------------------------------------- /yolo3/model.py: -------------------------------------------------------------------------------- 1 | """YOLO_v3 Model Defined in Keras.""" 2 | 3 | from functools import wraps 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from keras import backend as K 8 | from keras.layers import Conv2D, Add, ZeroPadding2D, UpSampling2D, Concatenate, MaxPooling2D 9 | from keras.layers.advanced_activations import LeakyReLU 10 | from keras.layers.normalization import BatchNormalization 11 | from keras.models import Model 12 | from keras.regularizers import l2 13 | 14 | from yolo3.utils import compose 15 | 16 | 17 | @wraps(Conv2D) 18 | def DarknetConv2D(*args, **kwargs): 19 | """Wrapper to set Darknet parameters for Convolution2D.""" 20 | darknet_conv_kwargs = {'kernel_regularizer': l2(5e-4)} 21 | darknet_conv_kwargs['padding'] = 'valid' if kwargs.get('strides')==(2,2) else 'same' 22 | darknet_conv_kwargs.update(kwargs) 23 | return Conv2D(*args, **darknet_conv_kwargs) 24 | 25 | def DarknetConv2D_BN_Leaky(*args, **kwargs): 26 | """Darknet Convolution2D followed by BatchNormalization and LeakyReLU.""" 27 | no_bias_kwargs = {'use_bias': False} 28 | no_bias_kwargs.update(kwargs) 29 | return compose( 30 | DarknetConv2D(*args, **no_bias_kwargs), 31 | BatchNormalization(), 32 | LeakyReLU(alpha=0.1)) 33 | 34 | def resblock_body(x, num_filters, num_blocks): 35 | '''A series of resblocks starting with a downsampling Convolution2D''' 36 | # Darknet uses left and top padding instead of 'same' mode 37 | x = ZeroPadding2D(((1,0),(1,0)))(x) 38 | x = DarknetConv2D_BN_Leaky(num_filters, (3,3), strides=(2,2))(x) 39 | for i in range(num_blocks): 40 | y = compose( 41 | DarknetConv2D_BN_Leaky(num_filters//2, (1,1)), 42 | DarknetConv2D_BN_Leaky(num_filters, (3,3)))(x) 43 | x = Add()([x,y]) 44 | return x 45 | 46 | def darknet_body(x): 47 | '''Darknent body having 52 Convolution2D layers''' 48 | x = DarknetConv2D_BN_Leaky(32, (3,3))(x) 49 | x = resblock_body(x, 64, 1) 50 | x = resblock_body(x, 128, 2) 51 | x = resblock_body(x, 256, 8) 52 | x = resblock_body(x, 512, 8) 53 | x = resblock_body(x, 1024, 4) 54 | return x 55 | 56 | def make_last_layers(x, num_filters, out_filters): 57 | '''6 Conv2D_BN_Leaky layers followed by a Conv2D_linear layer''' 58 | x = compose( 59 | DarknetConv2D_BN_Leaky(num_filters, (1,1)), 60 | DarknetConv2D_BN_Leaky(num_filters*2, (3,3)), 61 | DarknetConv2D_BN_Leaky(num_filters, (1,1)), 62 | DarknetConv2D_BN_Leaky(num_filters*2, (3,3)), 63 | DarknetConv2D_BN_Leaky(num_filters, (1,1)))(x) 64 | y = compose( 65 | DarknetConv2D_BN_Leaky(num_filters*2, (3,3)), 66 | DarknetConv2D(out_filters, (1,1)))(x) 67 | return x, y 68 | 69 | 70 | def yolo_body(inputs, num_anchors, num_classes): 71 | """Create YOLO_V3 model CNN body in Keras.""" 72 | darknet = Model(inputs, darknet_body(inputs)) 73 | x, y1 = make_last_layers(darknet.output, 512, num_anchors*(num_classes+5)) 74 | 75 | x = compose( 76 | DarknetConv2D_BN_Leaky(256, (1,1)), 77 | UpSampling2D(2))(x) 78 | x = Concatenate()([x,darknet.layers[152].output]) 79 | x, y2 = 
make_last_layers(x, 256, num_anchors*(num_classes+5)) 80 | 81 | x = compose( 82 | DarknetConv2D_BN_Leaky(128, (1,1)), 83 | UpSampling2D(2))(x) 84 | x = Concatenate()([x,darknet.layers[92].output]) 85 | x, y3 = make_last_layers(x, 128, num_anchors*(num_classes+5)) 86 | 87 | return Model(inputs, [y1,y2,y3]) 88 | 89 | def tiny_yolo_body(inputs, num_anchors, num_classes): 90 | '''Create Tiny YOLO_v3 model CNN body in keras.''' 91 | x1 = compose( 92 | DarknetConv2D_BN_Leaky(16, (3,3)), 93 | MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'), 94 | DarknetConv2D_BN_Leaky(32, (3,3)), 95 | MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'), 96 | DarknetConv2D_BN_Leaky(64, (3,3)), 97 | MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'), 98 | DarknetConv2D_BN_Leaky(128, (3,3)), 99 | MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'), 100 | DarknetConv2D_BN_Leaky(256, (3,3)))(inputs) 101 | x2 = compose( 102 | MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'), 103 | DarknetConv2D_BN_Leaky(512, (3,3)), 104 | MaxPooling2D(pool_size=(2,2), strides=(1,1), padding='same'), 105 | DarknetConv2D_BN_Leaky(1024, (3,3)), 106 | DarknetConv2D_BN_Leaky(256, (1,1)))(x1) 107 | y1 = compose( 108 | DarknetConv2D_BN_Leaky(512, (3,3)), 109 | DarknetConv2D(num_anchors*(num_classes+5), (1,1)))(x2) 110 | 111 | x2 = compose( 112 | DarknetConv2D_BN_Leaky(128, (1,1)), 113 | UpSampling2D(2))(x2) 114 | y2 = compose( 115 | Concatenate(), 116 | DarknetConv2D_BN_Leaky(256, (3,3)), 117 | DarknetConv2D(num_anchors*(num_classes+5), (1,1)))([x2,x1]) 118 | 119 | return Model(inputs, [y1,y2]) 120 | 121 | 122 | def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False): 123 | """Convert final layer features to bounding box parameters.""" 124 | num_anchors = len(anchors) 125 | # Reshape to batch, height, width, num_anchors, box_params. 126 | anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2]) 127 | 128 | grid_shape = K.shape(feats)[1:3] # height, width 129 | grid_y = K.tile(K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]), 130 | [1, grid_shape[1], 1, 1]) 131 | grid_x = K.tile(K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]), 132 | [grid_shape[0], 1, 1, 1]) 133 | grid = K.concatenate([grid_x, grid_y]) 134 | grid = K.cast(grid, K.dtype(feats)) 135 | 136 | feats = K.reshape( 137 | feats, [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5]) 138 | 139 | # Adjust preditions to each spatial grid point and anchor size. 140 | box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(grid_shape[::-1], K.dtype(feats)) 141 | box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(input_shape[::-1], K.dtype(feats)) 142 | box_confidence = K.sigmoid(feats[..., 4:5]) 143 | box_class_probs = K.sigmoid(feats[..., 5:]) 144 | 145 | if calc_loss == True: 146 | return grid, feats, box_xy, box_wh 147 | return box_xy, box_wh, box_confidence, box_class_probs 148 | 149 | 150 | def yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape): 151 | '''Get corrected boxes''' 152 | box_yx = box_xy[..., ::-1] 153 | box_hw = box_wh[..., ::-1] 154 | input_shape = K.cast(input_shape, K.dtype(box_yx)) 155 | image_shape = K.cast(image_shape, K.dtype(box_yx)) 156 | new_shape = K.round(image_shape * K.min(input_shape/image_shape)) 157 | offset = (input_shape-new_shape)/2./input_shape 158 | scale = input_shape/new_shape 159 | box_yx = (box_yx - offset) * scale 160 | box_hw *= scale 161 | 162 | box_mins = box_yx - (box_hw / 2.) 
163 | box_maxes = box_yx + (box_hw / 2.) 164 | boxes = K.concatenate([ 165 | box_mins[..., 0:1], # y_min 166 | box_mins[..., 1:2], # x_min 167 | box_maxes[..., 0:1], # y_max 168 | box_maxes[..., 1:2] # x_max 169 | ]) 170 | 171 | # Scale boxes back to original image shape. 172 | boxes *= K.concatenate([image_shape, image_shape]) 173 | return boxes 174 | 175 | 176 | def yolo_boxes_and_scores(feats, anchors, num_classes, input_shape, image_shape): 177 | '''Process Conv layer output''' 178 | box_xy, box_wh, box_confidence, box_class_probs = yolo_head(feats, 179 | anchors, num_classes, input_shape) 180 | boxes = yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape) 181 | boxes = K.reshape(boxes, [-1, 4]) 182 | box_scores = box_confidence * box_class_probs 183 | box_scores = K.reshape(box_scores, [-1, num_classes]) 184 | return boxes, box_scores 185 | 186 | 187 | def yolo_eval(yolo_outputs, 188 | anchors, 189 | num_classes, 190 | image_shape, 191 | max_boxes=20, 192 | score_threshold=.6, 193 | iou_threshold=.5): 194 | """Evaluate YOLO model on given input and return filtered boxes.""" 195 | num_layers = len(yolo_outputs) 196 | anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]] # default setting 197 | input_shape = K.shape(yolo_outputs[0])[1:3] * 32 198 | boxes = [] 199 | box_scores = [] 200 | for l in range(num_layers): 201 | _boxes, _box_scores = yolo_boxes_and_scores(yolo_outputs[l], 202 | anchors[anchor_mask[l]], num_classes, input_shape, image_shape) 203 | boxes.append(_boxes) 204 | box_scores.append(_box_scores) 205 | boxes = K.concatenate(boxes, axis=0) 206 | box_scores = K.concatenate(box_scores, axis=0) 207 | 208 | mask = box_scores >= score_threshold 209 | max_boxes_tensor = K.constant(max_boxes, dtype='int32') 210 | boxes_ = [] 211 | scores_ = [] 212 | classes_ = [] 213 | for c in range(num_classes): 214 | # TODO: use keras backend instead of tf. 215 | class_boxes = tf.boolean_mask(boxes, mask[:, c]) 216 | class_box_scores = tf.boolean_mask(box_scores[:, c], mask[:, c]) 217 | nms_index = tf.image.non_max_suppression( 218 | class_boxes, class_box_scores, max_boxes_tensor, iou_threshold=iou_threshold) 219 | class_boxes = K.gather(class_boxes, nms_index) 220 | class_box_scores = K.gather(class_box_scores, nms_index) 221 | classes = K.ones_like(class_box_scores, 'int32') * c 222 | boxes_.append(class_boxes) 223 | scores_.append(class_box_scores) 224 | classes_.append(classes) 225 | boxes_ = K.concatenate(boxes_, axis=0) 226 | scores_ = K.concatenate(scores_, axis=0) 227 | classes_ = K.concatenate(classes_, axis=0) 228 | 229 | return boxes_, scores_, classes_ 230 | 231 | 232 | def preprocess_true_boxes(true_boxes, input_shape, anchors, num_classes): 233 | '''Preprocess true boxes to training input format 234 | 235 | Parameters 236 | ---------- 237 | true_boxes: array, shape=(m, T, 5) 238 | Absolute x_min, y_min, x_max, y_max, class_id relative to input_shape. 239 | input_shape: array-like, hw, multiples of 32 240 | anchors: array, shape=(N, 2), wh 241 | num_classes: integer 242 | 243 | Returns 244 | ------- 245 | y_true: list of array, shape like yolo_outputs, xywh are relative value 246 | 247 | ''' 248 | assert (true_boxes[..., 4]<num_classes).all(), 'class id must be less than num_classes' 249 | num_layers = len(anchors)//3 # default setting 250 | anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]] 251 | 252 | true_boxes = np.array(true_boxes, dtype='float32') 253 | input_shape = np.array(input_shape, dtype='int32') 254 | boxes_xy = (true_boxes[..., 0:2] + true_boxes[..., 2:4]) // 2 255 | boxes_wh = true_boxes[..., 2:4] - true_boxes[..., 0:2] 256 | true_boxes[..., 0:2] = boxes_xy/input_shape[::-1] 257 | true_boxes[..., 2:4] = boxes_wh/input_shape[::-1] 258 | 259 | m = true_boxes.shape[0] 260 | grid_shapes = [input_shape//{0:32, 1:16, 2:8}[l] for l in range(num_layers)] 261 | y_true = [np.zeros((m,grid_shapes[l][0],grid_shapes[l][1],len(anchor_mask[l]),5+num_classes), 262 | dtype='float32') for l in range(num_layers)] 263 | 264 | # Expand dim to apply broadcasting. 265 | anchors = np.expand_dims(anchors, 0) 266 | anchor_maxes = anchors / 2. 267 | anchor_mins = -anchor_maxes 268 | valid_mask = boxes_wh[..., 0]>0 269 | 270 | for b in range(m): 271 | # Discard zero rows. 272 | wh = boxes_wh[b, valid_mask[b]] 273 | if len(wh)==0: continue 274 | # Expand dim to apply broadcasting. 275 | wh = np.expand_dims(wh, -2) 276 | box_maxes = wh / 2.
277 | box_mins = -box_maxes 278 | 279 | intersect_mins = np.maximum(box_mins, anchor_mins) 280 | intersect_maxes = np.minimum(box_maxes, anchor_maxes) 281 | intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.) 282 | intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] 283 | box_area = wh[..., 0] * wh[..., 1] 284 | anchor_area = anchors[..., 0] * anchors[..., 1] 285 | iou = intersect_area / (box_area + anchor_area - intersect_area) 286 | 287 | # Find best anchor for each true box 288 | best_anchor = np.argmax(iou, axis=-1) 289 | 290 | for t, n in enumerate(best_anchor): 291 | for l in range(num_layers): 292 | if n in anchor_mask[l]: 293 | i = np.floor(true_boxes[b,t,0]*grid_shapes[l][1]).astype('int32') 294 | j = np.floor(true_boxes[b,t,1]*grid_shapes[l][0]).astype('int32') 295 | k = anchor_mask[l].index(n) 296 | c = true_boxes[b,t, 4].astype('int32') 297 | y_true[l][b, j, i, k, 0:4] = true_boxes[b,t, 0:4] 298 | y_true[l][b, j, i, k, 4] = 1 299 | y_true[l][b, j, i, k, 5+c] = 1 300 | 301 | return y_true 302 | 303 | 304 | def box_iou(b1, b2): 305 | '''Return iou tensor 306 | 307 | Parameters 308 | ---------- 309 | b1: tensor, shape=(i1,...,iN, 4), xywh 310 | b2: tensor, shape=(j, 4), xywh 311 | 312 | Returns 313 | ------- 314 | iou: tensor, shape=(i1,...,iN, j) 315 | 316 | ''' 317 | 318 | # Expand dim to apply broadcasting. 319 | b1 = K.expand_dims(b1, -2) 320 | b1_xy = b1[..., :2] 321 | b1_wh = b1[..., 2:4] 322 | b1_wh_half = b1_wh/2. 323 | b1_mins = b1_xy - b1_wh_half 324 | b1_maxes = b1_xy + b1_wh_half 325 | 326 | # Expand dim to apply broadcasting. 327 | b2 = K.expand_dims(b2, 0) 328 | b2_xy = b2[..., :2] 329 | b2_wh = b2[..., 2:4] 330 | b2_wh_half = b2_wh/2. 331 | b2_mins = b2_xy - b2_wh_half 332 | b2_maxes = b2_xy + b2_wh_half 333 | 334 | intersect_mins = K.maximum(b1_mins, b2_mins) 335 | intersect_maxes = K.minimum(b1_maxes, b2_maxes) 336 | intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.) 
337 | intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] 338 | b1_area = b1_wh[..., 0] * b1_wh[..., 1] 339 | b2_area = b2_wh[..., 0] * b2_wh[..., 1] 340 | iou = intersect_area / (b1_area + b2_area - intersect_area) 341 | 342 | return iou 343 | 344 | 345 | def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False): 346 | '''Return yolo_loss tensor 347 | 348 | Parameters 349 | ---------- 350 | yolo_outputs: list of tensor, the output of yolo_body or tiny_yolo_body 351 | y_true: list of array, the output of preprocess_true_boxes 352 | anchors: array, shape=(N, 2), wh 353 | num_classes: integer 354 | ignore_thresh: float, the iou threshold whether to ignore object confidence loss 355 | 356 | Returns 357 | ------- 358 | loss: tensor, shape=(1,) 359 | 360 | ''' 361 | num_layers = len(anchors)//3 # default setting 362 | yolo_outputs = args[:num_layers] 363 | y_true = args[num_layers:] 364 | anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]] 365 | input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0])) 366 | grid_shapes = [K.cast(K.shape(yolo_outputs[l])[1:3], K.dtype(y_true[0])) for l in range(num_layers)] 367 | loss = 0 368 | m = K.shape(yolo_outputs[0])[0] # batch size, tensor 369 | mf = K.cast(m, K.dtype(yolo_outputs[0])) 370 | 371 | for l in range(num_layers): 372 | object_mask = y_true[l][..., 4:5] 373 | true_class_probs = y_true[l][..., 5:] 374 | 375 | grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[l], 376 | anchors[anchor_mask[l]], num_classes, input_shape, calc_loss=True) 377 | pred_box = K.concatenate([pred_xy, pred_wh]) 378 | 379 | # Darknet raw box to calculate loss. 380 | raw_true_xy = y_true[l][..., :2]*grid_shapes[l][::-1] - grid 381 | raw_true_wh = K.log(y_true[l][..., 2:4] / anchors[anchor_mask[l]] * input_shape[::-1]) 382 | raw_true_wh = K.switch(object_mask, raw_true_wh, K.zeros_like(raw_true_wh)) # avoid log(0)=-inf 383 | box_loss_scale = 2 - y_true[l][...,2:3]*y_true[l][...,3:4] 384 | 385 | # Find ignore mask, iterate over each of batch. 386 | ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True) 387 | object_mask_bool = K.cast(object_mask, 'bool') 388 | def loop_body(b, ignore_mask): 389 | true_box = tf.boolean_mask(y_true[l][b,...,0:4], object_mask_bool[b,...,0]) 390 | iou = box_iou(pred_box[b], true_box) 391 | best_iou = K.max(iou, axis=-1) 392 | ignore_mask = ignore_mask.write(b, K.cast(best_iou