├── vod-converter ├── vod_converter │ ├── __init__.py │ ├── main.py │ ├── udacity.py │ ├── kitti_tracking.py │ ├── kitti.py │ ├── voc.py │ └── converter.py ├── .gitignore ├── tests │ ├── context.py │ └── test_converter.py ├── LICENSE └── README.md ├── model_data ├── tiny_yolo_anchors.txt ├── yolo_anchors.txt ├── coco_classes.txt └── yolov3.cfg ├── 9_CLASS_test_classes.txt ├── README.md ├── voc_to_YOLOv3.py ├── yolo3 ├── utils.py └── model.py ├── oid_to_pascal_voc_xml.py ├── train.py ├── convert.py └── train_bottleneck.py /vod-converter/vod_converter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vod-converter/.gitignore: -------------------------------------------------------------------------------- 1 | .idea* 2 | .cache* 3 | kitti-tracking* -------------------------------------------------------------------------------- /model_data/tiny_yolo_anchors.txt: -------------------------------------------------------------------------------- 1 | 10,14, 23,27, 37,58, 81,82, 135,169, 344,319 2 | -------------------------------------------------------------------------------- /model_data/yolo_anchors.txt: -------------------------------------------------------------------------------- 1 | 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 2 | -------------------------------------------------------------------------------- /9_CLASS_test_classes.txt: -------------------------------------------------------------------------------- 1 | car 2 | Van 3 | Truck 4 | person 5 | Person_sitting 6 | Cyclist 7 | Tram 8 | Misc 9 | DontCare 10 | -------------------------------------------------------------------------------- /vod-converter/tests/context.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../vod_converter'))) 4 | 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Object-distance-Estimation-and-Collision-warning 2 | A project that uses deep learning to detect objects and estimate their distance from a monocular camera. An alarm is raised when a detected object is within a distance range considered dangerous. 3 | 4 | In this project we used the TensorFlow implementation of the YOLOv3 architecture from [pythonlessons](https://github.com/pythonlessons/YOLOv3-object-detection-tutorial/tree/master/YOLOv3-custom-training). We trained our model on the [KITTI](http://www.cvlibs.net/datasets/kitti/) dataset so that the model can detect road objects.
This dataset has 9 classes which are 'car', 'Van', 'Truck','person', 'Person_sitting', 'Cyclist', 'Tram', 'Misc' and 'DontCare' 5 | -------------------------------------------------------------------------------- /model_data/coco_classes.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /vod-converter/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /voc_to_YOLOv3.py: -------------------------------------------------------------------------------- 1 | import xml.etree.ElementTree as ET 2 | from os import getcwd 3 | import os 4 | 5 | 6 | dataset_train = 'kitti_data/training/image_2' 7 | dataset_file = '9_CLASS_test.txt' 8 | # classes_file = dataset_file[:-4]+'_classes.txt' 9 | 10 | CLS = ["car", "Van", "Truck", "person", "Person_sitting", "Cyclist", "Tram", "Misc", "DontCare"] 11 | # classes =[dataset_train+CLASS for CLASS in CLS] 12 | wd = getcwd() 13 | 14 | 15 | def test(fullname): 16 | bb = "" 17 | in_file = open(fullname) 18 | tree=ET.parse(in_file) 19 | root = tree.getroot() 20 | for i, obj in enumerate(root.iter('object')): 21 | difficult = obj.find('difficult').text 22 | cls = obj.find('name').text 23 | if cls not in CLS or int(difficult)==1: 24 | continue 25 | cls_id = CLS.index(cls) 26 | xmlbox = obj.find('bndbox') 27 | b = (int(float(xmlbox.find('xmin').text)), int(float(xmlbox.find('ymin').text)), int(float(xmlbox.find('xmax').text)), int(float(xmlbox.find('ymax').text))) 28 | bb += (" " + ",".join([str(a) for a in b]) + ',' + str(cls_id)) 29 | 30 | # we need this because I don't know overlapping or something like that 31 | if cls == 'DontCare': 32 | list_file = open(dataset_file, 'a') 33 | file_string = str(fullname)[:-4]+'.png'+bb+'\n' 34 | list_file.write(file_string) 35 | list_file.close() 36 | bb = "" 37 | 38 | if bb != "": 39 | list_file = open(dataset_file, 'a') 40 | file_string = str(fullname)[:-4]+'.png'+bb+'\n' 41 | list_file.write(file_string) 42 | list_file.close() 43 | 44 | 45 | 46 | for filename in os.listdir(dataset_train): 47 | if not filename.endswith('.xml'): 48 | continue 49 | fullname = os.getcwd()+'/'+dataset_train+'/'+filename 50 | test(fullname) 51 | 52 | -------------------------------------------------------------------------------- /vod-converter/README.md: -------------------------------------------------------------------------------- 1 | # Visual Object Dataset converter 2 | 3 | Converts between object dataset formats. Requires Python 3.6. 4 | 5 | Example: convert from data in [KITTI](http://www.cvlibs.net/datasets/kitti/eval_object.php) format to 6 | [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/index.html) format: 7 | 8 | ``` 9 | $ python3.6 vod_converter/main.py --from kitti --from-path datasets/mydata-kitti --to voc --to-path datasets/mydata-voc 10 | ``` 11 | 12 | See `main.py` for documentation on how to easily plug in additional data formats; you can define a function 13 | that can read in your data into a common format, and it will be then ready to convert to any supported format. 14 | 15 | Similarly, you can implement a single function that takes the common format and outputs to the filesystem in 16 | your format and you will be ready to convert from e.g VOC to yours. 
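For illustration, here is a minimal sketch of such a plug-in ingestor. It assumes a hypothetical `myformat` dataset that stores JPEG images under `images/` and one space-delimited `<image_id>.txt` file per image under `labels/` (each line: `label xmin ymin xmax ymax`); the returned dicts follow `IMAGE_DETECTION_SCHEMA` from `converter.py`:

```
import os
from PIL import Image

from converter import Ingestor


class MyFormatIngestor(Ingestor):
    """Sketch of an ingestor for a hypothetical 'myformat' dataset."""

    def validate(self, path):
        for subdir in ['images', 'labels']:
            if not os.path.isdir(f"{path}/{subdir}"):
                return False, f"Expected subdirectory {subdir} within {path}"
        return True, None

    def ingest(self, path):
        image_detections = []
        for label_fname in os.listdir(f"{path}/labels"):
            image_id = label_fname.rsplit('.', 1)[0]
            image_path = f"{path}/images/{image_id}.jpg"
            with Image.open(image_path) as image:
                width, height = image.width, image.height
            detections = []
            with open(f"{path}/labels/{label_fname}") as f:
                for line in f:
                    # each line: "<label> <xmin> <ymin> <xmax> <ymax>"
                    label, x1, y1, x2, y2 = line.split()
                    detections.append({
                        'label': label,
                        'left': float(x1), 'top': float(y1),
                        'right': float(x2), 'bottom': float(y2)
                    })
            image_detections.append({
                'image': {
                    'id': image_id,
                    'path': image_path,
                    'segmented_path': None,
                    'width': width,
                    'height': height
                },
                'detections': detections
            })
        return image_detections
```

Registering it is then a one-line addition to the `INGESTORS` dict in `main.py` (e.g. `'myformat': MyFormatIngestor()`), after which `--from myformat` works like any built-in format.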
17 | 18 | Currently support conversion from: 19 | 20 | - [KITTI](http://www.cvlibs.net/datasets/kitti/eval_object.php) 21 | - [KITTI tracking](http://www.cvlibs.net/datasets/kitti/eval_tracking.php) 22 | - [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/index.html) 23 | - [Udacity CrowdAI and AUTTI](https://github.com/udacity/self-driving-car/tree/master/annotations) 24 | 25 | to: 26 | 27 | - [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/index.html) 28 | - [KITTI](http://www.cvlibs.net/datasets/kitti/eval_object.php) 29 | 30 | ## That 'train.txt' file for KITTI 31 | 32 | When reading in KITTI, the script expects a `train.txt` file that isn't part of the original dataset. This is simply a file with the name of each datapoint you wish to capture. [Here's an example with everything in the training set](https://github.com/umautobots/vod-converter/files/1139276/train.txt). You can also create it like so: 33 | 34 | ``` 35 | $ cd datasets/kitti && ls -1 training/image_2 | cut -d. -f1 > train.txt && cd - 36 | $ head datasets/kitti/train.txt 37 | 000000 38 | 000001 39 | 000002 40 | 000003 41 | 000004 42 | 000005 43 | 000006 44 | 000007 45 | 000008 46 | 000009 47 | ``` 48 | 49 | ## Python2 support 50 | 51 | This project is written using features requiring Python3.6+, but there is [a fork](https://github.com/nghiattran/vod-converter) that has been updated to work in Python2 if you need it. 52 | 53 | -------------------------------------------------------------------------------- /vod-converter/tests/test_converter.py: -------------------------------------------------------------------------------- 1 | import context # augment system path to make imports work 2 | from vod_converter import converter 3 | 4 | 5 | def test_convert_labels(): 6 | assert [{'detections': [ 7 | {'label': 'person'}, 8 | {'label': 'person'}, 9 | {'label': 'person'}, 10 | {'label': 'rhinoZaurus'} 11 | ]}] == \ 12 | converter.convert_labels( 13 | image_detections=[ 14 | {'detections': [ 15 | {'label': 'Pedestrian'}, 16 | {'label': 'pedestrian'}, 17 | {'label': 'Person'}, 18 | {'label': 'rhinoZaurus'} 19 | ]} 20 | ], 21 | expected_labels={'person': ['Pedestrian']}, 22 | select_only_known_labels=False, 23 | filter_images_without_labels=False 24 | ) 25 | 26 | 27 | def test_select_only_known_labels(): 28 | assert [{'detections': [ 29 | {'label': 'person'}, 30 | {'label': 'person'}, 31 | {'label': 'person'}, 32 | ]}] == \ 33 | converter.convert_labels( 34 | image_detections=[ 35 | {'detections': [ 36 | {'label': 'Pedestrian'}, 37 | {'label': 'pedestrian'}, 38 | {'label': 'Person'}, 39 | {'label': 'rhinoZaurus'} 40 | ]} 41 | ], 42 | expected_labels={'person': ['Pedestrian']}, 43 | select_only_known_labels=True, 44 | filter_images_without_labels=False 45 | ) 46 | 47 | 48 | def test_filter_images_without_labels(): 49 | assert [{'detections': [ 50 | {'label': 'person'}, 51 | {'label': 'person'}, 52 | {'label': 'person'}, 53 | ]}] == \ 54 | converter.convert_labels( 55 | image_detections=[ 56 | {'detections': [ 57 | {'label': 'Pedestrian'}, 58 | {'label': 'pedestrian'}, 59 | {'label': 'Person'}, 60 | {'label': 'rhinoZaurus'} 61 | ], 62 | }, 63 | {'detections': [ 64 | {'label': 'rhinoZaurus'} 65 | ] 66 | } 67 | ], 68 | expected_labels={'person': ['Pedestrian']}, 69 | select_only_known_labels=True, 70 | filter_images_without_labels=True 71 | ) 72 | -------------------------------------------------------------------------------- /vod-converter/vod_converter/main.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | Converts between visual object detection dataset formats. See `converter.py` for more info. 3 | 4 | To add support for additional data formats, define a module with an `converter.Ingestor` and/or 5 | `converter.Egestor` implementation and add them to the `INGESTORS` and `EGESTORS` dicts below. 6 | """ 7 | 8 | import argparse 9 | import logging 10 | 11 | import converter 12 | import kitti 13 | import kitti_tracking 14 | import udacity 15 | import voc 16 | 17 | import sys 18 | 19 | logger = logging.getLogger() 20 | logger.setLevel(logging.INFO) 21 | 22 | INGESTORS = { 23 | 'kitti': kitti.KITTIIngestor(), 24 | 'kitti-tracking': kitti_tracking.KITTITrackingIngestor(), 25 | 'voc': voc.VOCIngestor(), 26 | 'udacity-crowdai': udacity.UdacityCrowdAIIngestor(), 27 | 'udacity-autti': udacity.UdacityAuttiIngestor() 28 | } 29 | 30 | EGESTORS = { 31 | 'voc': voc.VOCEgestor(), 32 | 'kitti': kitti.KITTIEgestor() 33 | } 34 | 35 | 36 | def main(*, from_path, from_key, to_path, to_key, select_only_known_labels, filter_images_without_labels): 37 | success, msg = converter.convert(from_path=from_path, ingestor=INGESTORS[from_key], 38 | to_path=to_path, egestor=EGESTORS[to_key], 39 | select_only_known_labels=select_only_known_labels, 40 | filter_images_without_labels=filter_images_without_labels) 41 | if success: 42 | print(f"Successfully converted from {from_key} to {to_key}.") 43 | else: 44 | print(f"Failed to convert from {from_key} to {to_key}: {msg}") 45 | return 1 46 | 47 | 48 | def parse_args(): 49 | parser = argparse.ArgumentParser(description='Convert visual object datasets.') 50 | parser._action_groups.pop() 51 | required = parser.add_argument_group('required arguments') 52 | optional = parser.add_argument_group('optional arguments') 53 | required.add_argument('--from', 54 | dest='from_key', 55 | required=True, 56 | help=f'Format to convert from: one of {", ".join(INGESTORS.keys())}', type=str) 57 | required.add_argument('--from-path', dest='from_path', 58 | required=True, 59 | help=f'Path to dataset you wish to convert.', type=str) 60 | required.add_argument('--to', dest='to_key', required=True, 61 | help=f'Format to convert to: one of {", ".join(EGESTORS.keys())}', 62 | type=str) 63 | required.add_argument( 64 | '--to-path', 65 | dest='to_path', required=True, 66 | help="Path to output directory for converted dataset.", type=str) 67 | optional.add_argument( 68 | '--select-only-known-labels', 69 | help="only include labels known to the destination dataset (e.g skip 'trafficlight' if VOC doesn't know about it)", 70 | required=False, 71 | action='store_true', 72 | default=False 73 | ) 74 | optional.add_argument( 75 | '--filter-images-without-labels', 76 | help="skip images that don't have any (known) labels", 77 | required=False, 78 | action='store_true', 79 | default=False 80 | ) 81 | 82 | args = parser.parse_args() 83 | logging.info(args) 84 | return args 85 | 86 | 87 | if __name__ == '__main__': 88 | args = parse_args() 89 | sys.exit(main(from_path=args.from_path, from_key=args.from_key, 90 | to_path=args.to_path, to_key=args.to_key, 91 | select_only_known_labels=args.select_only_known_labels, 92 | filter_images_without_labels=args.filter_images_without_labels)) 93 | -------------------------------------------------------------------------------- /yolo3/utils.py: -------------------------------------------------------------------------------- 1 | """Miscellaneous utility functions.""" 2 | 3 | from functools 
import reduce 4 | 5 | from PIL import Image 6 | import numpy as np 7 | from matplotlib.colors import rgb_to_hsv, hsv_to_rgb 8 | import cv2 9 | 10 | def compose(*funcs): 11 | """Compose arbitrarily many functions, evaluated left to right. 12 | 13 | Reference: https://mathieularose.com/function-composition-in-python/ 14 | """ 15 | # return lambda x: reduce(lambda v, f: f(v), funcs, x) 16 | if funcs: 17 | return reduce(lambda f, g: lambda *a, **kw: g(f(*a, **kw)), funcs) 18 | else: 19 | raise ValueError('Composition of empty sequence not supported.') 20 | 21 | def image_preporcess(image, target_size, gt_boxes=None): 22 | 23 | ih, iw = target_size 24 | h, w, _ = image.shape 25 | 26 | scale = min(iw/w, ih/h) 27 | nw = int(scale * w) 28 | nh = int(scale * h) 29 | 30 | image_resized = cv2.resize(image, (nw, nh), interpolation=cv2.INTER_CUBIC) 31 | 32 | image_paded = np.full(shape=[ih, iw, 3], fill_value=128.0) 33 | dw, dh = (iw - nw) // 2, (ih-nh) // 2 34 | image_paded[dh:nh+dh, dw:nw+dw, :] = image_resized 35 | 36 | image_paded = np.array(image_paded, dtype='float32') 37 | 38 | image_paded = image_paded / 255. 39 | 40 | image_paded = np.expand_dims(image_paded, axis=0) 41 | 42 | return image_paded 43 | 44 | def rand(a=0, b=1): 45 | return np.random.rand()*(b-a) + a 46 | 47 | def get_random_data(annotation_line, input_shape, random=True, max_boxes=20, jitter=.3, hue=.1, sat=1.5, val=1.5, proc_img=True): 48 | '''random preprocessing for real-time data augmentation''' 49 | line = annotation_line.split() 50 | image = Image.open(line[0]) 51 | iw, ih = image.size 52 | h, w = input_shape 53 | box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]]) 54 | 55 | if not random: 56 | # resize image 57 | scale = min(w/iw, h/ih) 58 | nw = int(iw*scale) 59 | nh = int(ih*scale) 60 | dx = (w-nw)//2 61 | dy = (h-nh)//2 62 | image_data=0 63 | if proc_img: 64 | image = image.resize((nw,nh), Image.BICUBIC) 65 | new_image = Image.new('RGB', (w,h), (128,128,128)) 66 | new_image.paste(image, (dx, dy)) 67 | image_data = np.array(new_image)/255. 68 | 69 | # correct boxes 70 | box_data = np.zeros((max_boxes,5)) 71 | if len(box)>0: 72 | np.random.shuffle(box) 73 | if len(box)>max_boxes: box = box[:max_boxes] 74 | box[:, [0,2]] = box[:, [0,2]]*scale + dx 75 | box[:, [1,3]] = box[:, [1,3]]*scale + dy 76 | box_data[:len(box)] = box 77 | 78 | return image_data, box_data 79 | 80 | # resize image 81 | new_ar = w/h * rand(1-jitter,1+jitter)/rand(1-jitter,1+jitter) 82 | scale = rand(.25, 2) 83 | if new_ar < 1: 84 | nh = int(scale*h) 85 | nw = int(nh*new_ar) 86 | else: 87 | nw = int(scale*w) 88 | nh = int(nw/new_ar) 89 | image = image.resize((nw,nh), Image.BICUBIC) 90 | 91 | # place image 92 | dx = int(rand(0, w-nw)) 93 | dy = int(rand(0, h-nh)) 94 | new_image = Image.new('RGB', (w,h), (128,128,128)) 95 | new_image.paste(image, (dx, dy)) 96 | image = new_image 97 | 98 | # flip image or not 99 | flip = rand()<.5 100 | if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT) 101 | 102 | # distort image 103 | hue = rand(-hue, hue) 104 | sat = rand(1, sat) if rand()<.5 else 1/rand(1, sat) 105 | val = rand(1, val) if rand()<.5 else 1/rand(1, val) 106 | x = rgb_to_hsv(np.array(image)/255.) 
107 | x[..., 0] += hue 108 | x[..., 0][x[..., 0]>1] -= 1 109 | x[..., 0][x[..., 0]<0] += 1 110 | x[..., 1] *= sat 111 | x[..., 2] *= val 112 | x[x>1] = 1 113 | x[x<0] = 0 114 | image_data = hsv_to_rgb(x) # numpy array, 0 to 1 115 | 116 | # correct boxes 117 | box_data = np.zeros((max_boxes,5)) 118 | if len(box)>0: 119 | np.random.shuffle(box) 120 | box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx 121 | box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy 122 | if flip: box[:, [0,2]] = w - box[:, [2,0]] 123 | box[:, 0:2][box[:, 0:2]<0] = 0 124 | box[:, 2][box[:, 2]>w] = w 125 | box[:, 3][box[:, 3]>h] = h 126 | box_w = box[:, 2] - box[:, 0] 127 | box_h = box[:, 3] - box[:, 1] 128 | box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box 129 | if len(box)>max_boxes: box = box[:max_boxes] 130 | box_data[:len(box)] = box 131 | 132 | return image_data, box_data 133 | -------------------------------------------------------------------------------- /vod-converter/vod_converter/udacity.py: -------------------------------------------------------------------------------- 1 | """ 2 | https://github.com/udacity/self-driving-car/tree/master/annotations 3 | 4 | 5 | """ 6 | 7 | import csv 8 | import glob 9 | import os 10 | from PIL import Image 11 | 12 | from collections import defaultdict 13 | 14 | 15 | from converter import Ingestor 16 | 17 | 18 | class UdacityCrowdAIIngestor(Ingestor): 19 | 20 | def validate(self, root): 21 | labels_path = f"{root}/labels.csv" 22 | if not os.path.isfile(labels_path): 23 | return False, f"Expected to find {labels_path}" 24 | return True, None 25 | 26 | def ingest(self, root): 27 | labels_path = f"{root}/labels.csv" 28 | image_labels = defaultdict(list) 29 | 30 | with open(labels_path) as labels_file: 31 | labels_csv = csv.reader(labels_file) 32 | next(labels_csv, None) # skip header 33 | for idx, row in enumerate(labels_csv): 34 | image_labels[row[4]].append(row) 35 | 36 | image_detections = [] 37 | for idx, image_path in enumerate(glob.glob(f"{root}/*.jpg")): 38 | f_name = image_path.split("/")[-1] 39 | f_image_labels = image_labels[f_name] 40 | fname_id = f_name.split('.')[0] 41 | 42 | image_width, image_height = _image_dimensions(image_path) 43 | 44 | def clamp_bbox(det): 45 | if det['right'] > image_width - 1: 46 | det['right'] = image_width - 1 47 | if det['bottom'] > image_height - 1: 48 | det['bottom'] = image_height - 1 49 | return det 50 | 51 | def valid_bbox(det): 52 | return det['right'] > det['left'] and det['bottom'] > det['top'] 53 | 54 | detections = [] 55 | 56 | for image_label in f_image_labels: 57 | x1, y1, x2, y2 = map(float, image_label[0:4]) 58 | label = image_label[5] 59 | detections.append({ 60 | 'label': label, 61 | 'left': x1, 62 | 'right': x2, 63 | 'top': y1, 64 | 'bottom': y2 65 | }) 66 | 67 | filtered_detections = [clamp_bbox(det) for det in detections if valid_bbox(det)] 68 | if filtered_detections: 69 | image_detections.append({ 70 | 'image': { 71 | 'id': fname_id, 72 | 'path': image_path, 73 | 'segmented_path': None, 74 | 'width': image_width, 75 | 'height': image_height 76 | }, 77 | 'detections': filtered_detections 78 | }) 79 | return image_detections 80 | 81 | 82 | class UdacityAuttiIngestor(Ingestor): 83 | def validate(self, root): 84 | labels_path = f"{root}/labels.csv" 85 | if not os.path.isfile(labels_path): 86 | return False, f"Expected to find {labels_path}" 87 | return True, None 88 | 89 | def ingest(self, root): 90 | labels_path = f"{root}/labels.csv" 91 | image_labels = defaultdict(list) 92 | 93 | with open(labels_path) as labels_file: 
94 | labels_csv = csv.reader(labels_file, delimiter=' ') 95 | next(labels_csv, None) # skip header 96 | for idx, row in enumerate(labels_csv): 97 | image_labels[row[0]].append(row) 98 | 99 | image_detections = [] 100 | for idx, image_path in enumerate(glob.glob(f"{root}/*.jpg")): 101 | f_name = image_path.split("/")[-1] 102 | f_image_labels = image_labels[f_name] 103 | fname_id = f_name.split('.')[0] 104 | 105 | image_width, image_height = _image_dimensions(image_path) 106 | 107 | def clamp_bbox(det): 108 | if det['right'] > image_width - 1: 109 | det['right'] = image_width - 1 110 | if det['bottom'] > image_height - 1: 111 | det['bottom'] = image_height - 1 112 | return det 113 | 114 | def valid_bbox(det): 115 | return det['right'] > det['left'] and det['bottom'] > det['top'] 116 | 117 | detections = [] 118 | 119 | for image_label in f_image_labels: 120 | x1, y1, x2, y2, x, label = image_label[1:7] 121 | x1, y1, x2, y2 = map(float, (x1, y1, x2, y2)) 122 | detections.append({ 123 | 'label': label, 124 | 'left': x1, 125 | 'right': x2, 126 | 'top': y1, 127 | 'bottom': y2 128 | }) 129 | 130 | filtered_detections = [clamp_bbox(det) for det in detections if valid_bbox(det)] 131 | if filtered_detections: 132 | image_detections.append({ 133 | 'image': { 134 | 'id': fname_id, 135 | 'path': image_path, 136 | 'segmented_path': None, 137 | 'width': image_width, 138 | 'height': image_height 139 | }, 140 | 'detections': filtered_detections 141 | }) 142 | return image_detections 143 | 144 | 145 | def _image_dimensions(path): 146 | with Image.open(path) as image: 147 | return image.width, image.height -------------------------------------------------------------------------------- /vod-converter/vod_converter/kitti_tracking.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ingestor for KITTI tracking formats. 3 | 4 | http://www.cvlibs.net/datasets/kitti/eval_tracking.php 5 | 6 | Note: even though this is for tracking instead of object detection, sometime it's helpful to convert 7 | data from this for object detection training. This reads in the left color labels. 8 | 9 | Per devkit docs: 10 | 11 | The data for training and testing can be found in the corresponding folders. 12 | The sub-folders are structured as follows: 13 | 14 | - image_02/%04d/ contains the left color camera sequence images (png) 15 | - image_03/%04d/ contains the right color camera sequence images (png) 16 | - label_02/ contains the left color camera label files (plain text files) 17 | - calib/ contains the calibration for all four cameras (plain text files) 18 | 19 | The label files contain the following information, which can be read and 20 | written using the matlab tools (readLabels.m) provided within this devkit. 21 | All values (numerical or strings) are separated via spaces, each row 22 | corresponds to one object. The 17 columns represent: 23 | 24 | #Values Name Description 25 | ---------------------------------------------------------------------------- 26 | 1 frame Frame within the sequence where the object appearers 27 | 1 track id Unique tracking id of this object within this sequence 28 | 1 type Describes the type of object: 'Car', 'Van', 'Truck', 29 | 'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram', 30 | 'Misc' or 'DontCare' 31 | 1 truncated Float from 0 (non-truncated) to 1 (truncated), where 32 | truncated refers to the object leaving image boundaries. 
33 | Truncation 2 indicates an ignored object (in particular 34 | in the beginning or end of a track) introduced by manual 35 | labeling. 36 | 1 occluded Integer (0,1,2,3) indicating occlusion state: 37 | 0 = fully visible, 1 = partly occluded 38 | 2 = largely occluded, 3 = unknown 39 | 1 alpha Observation angle of object, ranging [-pi..pi] 40 | 4 bbox 2D bounding box of object in the image (0-based index): 41 | contains left, top, right, bottom pixel coordinates 42 | 3 dimensions 3D object dimensions: height, width, length (in meters) 43 | 3 location 3D object location x,y,z in camera coordinates (in meters) 44 | 1 rotation_y Rotation ry around Y-axis in camera coordinates [-pi..pi] 45 | 1 score Only for results: Float, indicating confidence in 46 | detection, needed for p/r curves, higher is better. 47 | 48 | 49 | """ 50 | 51 | import csv 52 | from collections import defaultdict 53 | import os 54 | import re 55 | from PIL import Image 56 | 57 | from converter import Ingestor 58 | 59 | LABEL_F_PATTERN = re.compile('[0-9]+\.txt') 60 | 61 | 62 | class KITTITrackingIngestor(Ingestor): 63 | def validate(self, path): 64 | expected_dirs = [ 65 | 'image_02', 66 | 'label_02' 67 | ] 68 | for subdir in expected_dirs: 69 | if not os.path.isdir(f"{path}/{subdir}"): 70 | return False, f"Expected subdirectory {subdir} within {path}" 71 | return True, None 72 | 73 | def ingest(self, path): 74 | fs = os.listdir(f"{path}/label_02") 75 | label_fnames = [f for f in fs if LABEL_F_PATTERN.match(f)] 76 | image_detections = [] 77 | for label_fname in label_fnames: 78 | frame_name = label_fname.split(".")[0] 79 | labels_path = f"{path}/label_02/{label_fname}" 80 | images_dir = f"{path}/image_02/{frame_name}" 81 | image_detections.extend( 82 | self._get_track_image_detections(frame_name=frame_name, labels_path=labels_path, images_dir=images_dir)) 83 | return image_detections 84 | 85 | def _get_track_image_detections(self, *, frame_name, labels_path, images_dir): 86 | detections_by_frame = defaultdict(list) 87 | with open(labels_path) as f: 88 | f_csv = csv.reader(f, delimiter=' ') 89 | for row in f_csv: 90 | frame_id = int(row[0]) 91 | x1, y1, x2, y2 = map(float, row[6:10]) 92 | label = row[2] 93 | detections_by_frame[frame_id].append({ 94 | 'label': label, 95 | 'left': x1, 96 | 'right': x2, 97 | 'top': y1, 98 | 'bottom': y2 99 | }) 100 | 101 | image_detections = [] 102 | for frame_id in sorted(detections_by_frame.keys()): 103 | frame_dets = detections_by_frame[frame_id] 104 | image_path = f"{images_dir}/{frame_id:06d}.png" 105 | if not os.path.exists(image_path): 106 | image_path = f"{images_dir}/{frame_id:06d}.jpg" 107 | with Image.open(image_path) as image: 108 | image_width = image.width 109 | image_height = image.height 110 | 111 | def clamp_bbox(det): 112 | if det['right'] > image_width - 1: 113 | det['right'] = image_width - 1 114 | if det['bottom'] > image_height - 1: 115 | det['bottom'] = image_height - 1 116 | return det 117 | 118 | image_detections.append({ 119 | 'image': { 120 | 'id': f"{frame_name}-{frame_id:06d}", 121 | 'path': image_path, 122 | 'segmented_path': None, 123 | 'width': image.width, 124 | 'height': image.height 125 | }, 126 | 'detections': [clamp_bbox(det) for det in frame_dets] 127 | }) 128 | return image_detections 129 | -------------------------------------------------------------------------------- /vod-converter/vod_converter/kitti.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ingestor for KITTI formats. 
3 | 4 | http://www.cvlibs.net/datasets/kitti/eval_object.php 5 | 6 | Per devkit docs: 7 | 8 | All values (numerical or strings) are separated via spaces, 9 | each row corresponds to one object. The 15 columns represent: 10 | 11 | #Values Name Description 12 | ---------------------------------------------------------------------------- 13 | 1 type Describes the type of object: 'Car', 'Van', 'Truck', 14 | 'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram', 15 | 'Misc' or 'DontCare' 16 | 1 truncated Float from 0 (non-truncated) to 1 (truncated), where 17 | truncated refers to the object leaving image boundaries 18 | 1 occluded Integer (0,1,2,3) indicating occlusion state: 19 | 0 = fully visible, 1 = partly occluded 20 | 2 = largely occluded, 3 = unknown 21 | 1 alpha Observation angle of object, ranging [-pi..pi] 22 | 4 bbox 2D bounding box of object in the image (0-based index): 23 | contains left, top, right, bottom pixel coordinates 24 | 3 dimensions 3D object dimensions: height, width, length (in meters) 25 | 3 location 3D object location x,y,z in camera coordinates (in meters) 26 | 1 rotation_y Rotation ry around Y-axis in camera coordinates [-pi..pi] 27 | 1 score Only for results: Float, indicating confidence in 28 | detection, needed for p/r curves, higher is better. 29 | 30 | 31 | """ 32 | 33 | import csv 34 | import os 35 | from PIL import Image 36 | import shutil 37 | 38 | from converter import Ingestor, Egestor 39 | 40 | 41 | class KITTIIngestor(Ingestor): 42 | def validate(self, path): 43 | expected_dirs = [ 44 | 'training/image_2', 45 | 'training/label_2' 46 | ] 47 | for subdir in expected_dirs: 48 | if not os.path.isdir(f"{path}/{subdir}"): 49 | return False, f"Expected subdirectory {subdir} within {path}" 50 | if not os.path.isfile(f"{path}/train.txt"): 51 | return False, f"Expected train.txt file within {path}" 52 | return True, None 53 | 54 | def ingest(self, path): 55 | image_ids = self._get_image_ids(path) 56 | image_ext = 'png' 57 | if len(image_ids): 58 | first_image_id = image_ids[0] 59 | image_ext = self.find_image_ext(path, first_image_id) 60 | return [self._get_image_detection(path, image_name, image_ext=image_ext) for image_name in image_ids] 61 | 62 | def find_image_ext(self, root, image_id): 63 | for image_ext in ['png', 'jpg']: 64 | if os.path.exists(f"{root}/training/image_2/{image_id}.{image_ext}"): 65 | return image_ext 66 | raise Exception(f"could not find jpg or png for {image_id} at {root}/training/image_2") 67 | 68 | def _get_image_ids(self, root): 69 | path = f"{root}/train.txt" 70 | with open(path) as f: 71 | return f.read().strip().split('\n') 72 | 73 | def _get_image_detection(self, root, image_id, *, image_ext='png'): 74 | detections_fpath = f"{root}/training/label_2/{image_id}.txt" 75 | detections = self._get_detections(detections_fpath) 76 | detections = [det for det in detections if det['left'] < det['right'] and det['top'] < det['bottom']] 77 | image_path = f"{root}/training/image_2/{image_id}.{image_ext}" 78 | image_width, image_height = _image_dimensions(image_path) 79 | return { 80 | 'image': { 81 | 'id': image_id, 82 | 'path': image_path, 83 | 'segmented_path': None, 84 | 'width': image_width, 85 | 'height': image_height 86 | }, 87 | 'detections': detections 88 | } 89 | 90 | def _get_detections(self, detections_fpath): 91 | detections = [] 92 | with open(detections_fpath) as f: 93 | f_csv = csv.reader(f, delimiter=' ') 94 | for row in f_csv: 95 | x1, y1, x2, y2 = map(float, row[4:8]) 96 | label = row[0] 97 | detections.append({ 98 | 'label': 
label, 99 | 'left': x1, 100 | 'right': x2, 101 | 'top': y1, 102 | 'bottom': y2 103 | }) 104 | return detections 105 | 106 | 107 | def _image_dimensions(path): 108 | with Image.open(path) as image: 109 | return image.width, image.height 110 | 111 | DEFAULT_TRUNCATED = 0.0 # 0% truncated 112 | DEFAULT_OCCLUDED = 0 # fully visible 113 | 114 | class KITTIEgestor(Egestor): 115 | 116 | def expected_labels(self): 117 | return { 118 | 'Car': [], 119 | 'Cyclist': ['biker'], 120 | 'Misc': [], 121 | 'Pedestrian': ['person'], 122 | 'Person_sitting': [], 123 | 'Tram': [], 124 | 'Truck': [], 125 | 'Van': [], 126 | } 127 | 128 | def egest(self, *, image_detections, root): 129 | images_dir = f"{root}/training/image_2" 130 | os.makedirs(images_dir, exist_ok=True) 131 | labels_dir = f"{root}/training/label_2" 132 | os.makedirs(labels_dir, exist_ok=True) 133 | 134 | id_file = f"{root}/train.txt" 135 | 136 | for image_detection in image_detections: 137 | image = image_detection['image'] 138 | image_id = image['id'] 139 | src_extension = image['path'].split('.')[-1] 140 | shutil.copyfile(image['path'], f"{images_dir}/{image_id}.{src_extension}") 141 | 142 | with open(id_file, 'a') as out_image_index_file: 143 | out_image_index_file.write(f'{image_id}\n') 144 | 145 | out_labels_path = f"{labels_dir}/{image_id}.txt" 146 | with open(out_labels_path, 'w') as csvfile: 147 | csvwriter = csv.writer(csvfile, delimiter=' ', quoting=csv.QUOTE_MINIMAL) 148 | 149 | for detection in image_detection['detections']: 150 | kitti_row = [-1] * 15 151 | kitti_row[0] = detection['label'] 152 | kitti_row[1] = DEFAULT_TRUNCATED 153 | kitti_row[2] = DEFAULT_OCCLUDED 154 | x1 = detection['left'] 155 | x2 = detection['right'] 156 | y1 = detection['top'] 157 | y2 = detection['bottom'] 158 | kitti_row[4:8] = x1, y1, x2, y2 159 | csvwriter.writerow(kitti_row) 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /oid_to_pascal_voc_xml.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tqdm import tqdm 3 | from sys import exit 4 | import argparse 5 | import cv2 6 | from textwrap import dedent 7 | from lxml import etree 8 | 9 | XML_DIR = '' 10 | 11 | os.chdir('kitti_data') 12 | #os.chdir(os.path.join("OID", "Dataset")) 13 | DIRS = os.listdir(os.getcwd()) 14 | 15 | for DIR in DIRS: 16 | if os.path.isdir(DIR): 17 | os.chdir(DIR) 18 | 19 | print("Currently in Subdirectory:", DIR) 20 | CLASS_DIRS = os.listdir(os.getcwd()) 21 | for CLASS_DIR in CLASS_DIRS: 22 | if " " in CLASS_DIR: 23 | os.rename(CLASS_DIR, CLASS_DIR.replace(" ", "_")) 24 | 25 | CLASS_DIRS = os.listdir(os.getcwd()) 26 | for CLASS_DIR in CLASS_DIRS: 27 | #if " " in CLASS_DIR: 28 | # os.rename(CLASS_DIR, CLASS_DIR.replace(" ", "_")) 29 | if os.path.isdir(CLASS_DIR): 30 | os.chdir(CLASS_DIR) 31 | 32 | print("\n" + "Creating PASCAL VOC XML Files for Class:", CLASS_DIR) 33 | # Create Directory for annotations if it does not exist yet 34 | #if not os.path.exists(XML_DIR): 35 | # os.makedirs(XML_DIR) 36 | 37 | #Read Labels from OIDv4 ToolKit 38 | os.chdir("label") 39 | 40 | #Create PASCAL XML 41 | for filename in tqdm(os.listdir(os.getcwd())): 42 | if filename.endswith(".txt"): 43 | filename_str = str.split(filename, ".")[0] 44 | 45 | 46 | annotation = etree.Element("annotation") 47 | 48 | os.chdir("..") 49 | folder = etree.Element("folder") 50 | folder.text = os.path.basename(os.getcwd()) 51 | annotation.append(folder) 52 | 53 | filename_xml = etree.Element("filename") 54 | 
filename_xml.text = filename_str + ".png" 55 | annotation.append(filename_xml) 56 | 57 | path = etree.Element("path") 58 | path.text = os.path.join(os.path.dirname(os.path.abspath(filename)), filename_str + ".png") 59 | annotation.append(path) 60 | 61 | source = etree.Element("source") 62 | annotation.append(source) 63 | 64 | database = etree.Element("database") 65 | database.text = "Unknown" 66 | source.append(database) 67 | 68 | size = etree.Element("size") 69 | annotation.append(size) 70 | 71 | width = etree.Element("width") 72 | height = etree.Element("height") 73 | depth = etree.Element("depth") 74 | img = cv2.imread(filename_xml.text) 75 | 76 | try: 77 | width.text = str(img.shape[1]) 78 | except AttributeError: 79 | #os.chdir("..") 80 | os.chdir("label") 81 | continue 82 | height.text = str(img.shape[0]) 83 | depth.text = str(img.shape[2]) 84 | 85 | size.append(width) 86 | size.append(height) 87 | size.append(depth) 88 | 89 | segmented = etree.Element("segmented") 90 | segmented.text = "0" 91 | annotation.append(segmented) 92 | 93 | os.chdir("label") 94 | label_original = open(filename, 'r') 95 | 96 | # Labels from OIDv4 Toolkit: name_of_class X_min Y_min X_max Y_max 97 | for line in label_original: 98 | line = line.strip() 99 | l = line.split(' ') 100 | class_name = l[0] 101 | try: 102 | xmin_l = str(int(float(l[1]))) 103 | add1 = 0 104 | except ValueError: 105 | class_name = l[0]+"_"+l[1] 106 | add1 = 1 107 | 108 | xmin_l = str(int(float(l[1+add1]))) 109 | ymin_l = str(int(float(l[2+add1]))) 110 | xmax_l = str(int(float(l[3+add1]))) 111 | ymax_l = str(int(float(l[4+add1]))) 112 | 113 | obj = etree.Element("object") 114 | annotation.append(obj) 115 | 116 | name = etree.Element("name") 117 | name.text = class_name 118 | obj.append(name) 119 | 120 | pose = etree.Element("pose") 121 | pose.text = "Unspecified" 122 | obj.append(pose) 123 | 124 | truncated = etree.Element("truncated") 125 | truncated.text = "0" 126 | obj.append(truncated) 127 | 128 | difficult = etree.Element("difficult") 129 | difficult.text = "0" 130 | obj.append(difficult) 131 | 132 | bndbox = etree.Element("bndbox") 133 | obj.append(bndbox) 134 | 135 | xmin = etree.Element("xmin") 136 | xmin.text = xmin_l 137 | bndbox.append(xmin) 138 | 139 | ymin = etree.Element("ymin") 140 | ymin.text = ymin_l 141 | bndbox.append(ymin) 142 | 143 | xmax = etree.Element("xmax") 144 | xmax.text = xmax_l 145 | bndbox.append(xmax) 146 | 147 | ymax = etree.Element("ymax") 148 | ymax.text = ymax_l 149 | bndbox.append(ymax) 150 | 151 | os.chdir("..") 152 | 153 | #os.chdir(XML_DIR) 154 | 155 | # write xml to file 156 | s = etree.tostring(annotation, pretty_print=True) 157 | with open(filename_str + ".xml", 'wb') as f: 158 | f.write(s) 159 | f.close() 160 | 161 | #os.chdir("..") 162 | os.chdir("label") 163 | 164 | os.chdir("..") 165 | os.chdir("..") 166 | 167 | os.chdir("..") 168 | -------------------------------------------------------------------------------- /vod-converter/vod_converter/voc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Ingestor and egestor for VOC formats. 
3 | 4 | http://host.robots.ox.ac.uk/pascal/VOC/voc2012/htmldoc/index.html 5 | """ 6 | 7 | import os 8 | import shutil 9 | 10 | from converter import Ingestor, Egestor 11 | import xml.etree.ElementTree as ET 12 | 13 | 14 | class VOCIngestor(Ingestor): 15 | def validate(self, root): 16 | path = f"{root}/VOC2012" 17 | for subdir in ["ImageSets", "JPEGImages", "Annotations"]: 18 | if not os.path.isdir(f"{path}/{subdir}"): 19 | return False, f"Expected subdirectory {subdir} within {path}" 20 | if not os.path.isfile(f"{path}/ImageSets/Main/trainval.txt"): 21 | return False, f"Expected main image set ImageSets/Main/trainval.txt to exist within {path}" 22 | return True, None 23 | 24 | def ingest(self, path): 25 | image_names = self._get_image_ids(path) 26 | return [self._get_image_detection(path, image_name) for image_name in image_names] 27 | 28 | def _get_image_ids(self, root): 29 | path = f"{root}/VOC2012" 30 | with open(f"{path}/ImageSets/Main/trainval.txt") as f: 31 | fnames = [] 32 | for line in f.read().strip().split('\n'): 33 | cols = line.split() 34 | if len(cols) > 1: 35 | score = cols[1] 36 | if score != '1': 37 | continue 38 | fnames.append(cols[0]) 39 | return fnames 40 | 41 | def _get_image_detection(self, root, image_id): 42 | path = f"{root}/VOC2012" 43 | image_path = f"{path}/JPEGImages/{image_id}.jpg" 44 | if not os.path.isfile(image_path): 45 | raise Exception(f"Expected {image_path} to exist.") 46 | annotation_path = f"{path}/Annotations/{image_id}.xml" 47 | if not os.path.isfile(annotation_path): 48 | raise Exception(f"Expected annotation file {annotation_path} to exist.") 49 | tree = ET.parse(annotation_path) 50 | xml_root = tree.getroot() 51 | size = xml_root.find('size') 52 | segmented = xml_root.find('segmented').text == '1' 53 | segmented_path = None 54 | if segmented: 55 | segmented_path = f"{path}/SegmentationObject/{image_id}.png" 56 | if not os.path.isfile(segmented_path): 57 | raise Exception(f"Expected segmentation file {segmented_path} to exist.") 58 | image_width = int(size.find('width').text) 59 | image_height = int(size.find('height').text) 60 | return { 61 | 'image': { 62 | 'id': image_id, 63 | 'path': image_path, 64 | 'segmented_path': segmented_path, 65 | 'width': image_width, 66 | 'height': image_height 67 | }, 68 | 'detections': [self._get_detection(node) for node in xml_root.findall('object')] 69 | } 70 | 71 | def _get_detection(self, node): 72 | bndbox = node.find('bndbox') 73 | return { 74 | 'label': node.find('name').text, 75 | 'top': float(bndbox.find('ymin').text) - 1, 76 | 'left': float(bndbox.find('xmin').text) - 1, 77 | 'right': float(bndbox.find('xmax').text) - 1, 78 | 'bottom': float(bndbox.find('ymax').text) - 1, 79 | } 80 | 81 | 82 | class VOCEgestor(Egestor): 83 | 84 | def expected_labels(self): 85 | return { 86 | 'aeroplane': [], 87 | 'bicycle': [], 88 | 'bird': [], 89 | 'boat': [], 90 | 'bottle': [], 91 | 'bus': [], 92 | 'car': [], 93 | 'cat': [], 94 | 'chair': [], 95 | 'cow': [], 96 | 'diningtable': [], 97 | 'dog': [], 98 | 'horse': [], 99 | 'motorbike': [], 100 | 'person': ['pedestrian'], 101 | 'pottedplant': [], 102 | 'sheep': [], 103 | 'sofa': [], 104 | 'train': [], 105 | 'tvmonitor': [] 106 | } 107 | 108 | def egest(self, *, image_detections, root): 109 | image_sets_path = f"{root}/VOC2012/ImageSets/Main" 110 | images_path = f"{root}/VOC2012/JPEGImages" 111 | annotations_path = f"{root}/VOC2012/Annotations" 112 | segmentations_path = f"{root}/VOC2012/SegmentationObject" 113 | segmentations_dir_created = False 114 | 115 | for to_create 
in [image_sets_path, images_path, annotations_path]: 116 | os.makedirs(to_create, exist_ok=True) 117 | 118 | for image_detection in image_detections: 119 | image = image_detection['image'] 120 | image_id = image['id'] 121 | src_extension = image['path'].split('.')[-1] 122 | shutil.copyfile(image['path'], f"{images_path}/{image_id}.{src_extension}") 123 | 124 | with open(f"{image_sets_path}/trainval.txt", 'a') as out_image_index_file: 125 | out_image_index_file.write(f'{image_id}\n') 126 | 127 | if image['segmented_path'] is not None: 128 | if not segmentations_dir_created: 129 | os.makedirs(segmentations_path) 130 | segmentations_dir_created = True 131 | shutil.copyfile(image['segmented_path'], f"{segmentations_path}/{image_id}.png") 132 | 133 | xml_root = ET.Element('annotation') 134 | add_text_node(xml_root, 'filename', f"{image_id}.{src_extension}") 135 | add_text_node(xml_root, 'folder', 'VOC2012') 136 | add_text_node(xml_root, 'segmented', int(segmentations_dir_created)) 137 | 138 | add_sub_node(xml_root, 'size', { 139 | 'depth': 3, 140 | 'width': image['width'], 141 | 'height': image['height'] 142 | }) 143 | add_sub_node(xml_root, 'source', { 144 | 'annotation': 'Dummy', 145 | 'database': 'Dummy', 146 | 'image': 'Dummy' 147 | }) 148 | 149 | for detection in image_detection['detections']: 150 | x_object = add_sub_node(xml_root, 'object', { 151 | 'name': detection['label'], 152 | 'difficult': 0, 153 | 'occluded': 0, 154 | 'truncated': 0, 155 | 'pose': 'Unspecified' 156 | }) 157 | add_sub_node(x_object, 'bndbox', { 158 | 'xmin': detection['left'] + 1, 159 | 'xmax': detection['right'] + 1, 160 | 'ymin': detection['top'] + 1, 161 | 'ymax': detection['bottom'] + 1 162 | }) 163 | 164 | ET.ElementTree(xml_root).write(f"{annotations_path}/{image_id}.xml") 165 | 166 | 167 | def add_sub_node(node, name, kvs): 168 | subnode = ET.SubElement(node, name) 169 | for k, v in kvs.items(): 170 | add_text_node(subnode, k, v) 171 | return subnode 172 | 173 | 174 | def add_text_node(node, name, text): 175 | subnode = ET.SubElement(node, name) 176 | subnode.text = f"{text}" 177 | return subnode 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | -------------------------------------------------------------------------------- /vod-converter/vod_converter/converter.py: -------------------------------------------------------------------------------- 1 | """ 2 | Defines the protocol for converting too and from a common data format and executes 3 | the conversion, validating proper conversion along the way. 4 | 5 | For a given dataformat, e.g `voc.py`, if you wish to support reading in of your data format, define 6 | an `Ingestor` that can read in data from a path and return an array of data conforming to `IMAGE_DETECTION_SCHEMA`. 7 | 8 | If you wish to support data output, define an `Egestor` that, given an array of data of the same form, 9 | can output the data to the filesystem. 10 | 11 | See `main.py` for the supported types, and `voc.py` and `kitti.py` for reference. 12 | """ 13 | from jsonschema import validate as raw_validate 14 | from jsonschema.exceptions import ValidationError as SchemaError 15 | 16 | 17 | def validate_schema(data, schema): 18 | """Wraps default implementation but accepting tuples as arrays too. 
19 | 20 | https://github.com/Julian/jsonschema/issues/148 21 | """ 22 | return raw_validate(data, schema, types={"array": (list, tuple)}) 23 | 24 | 25 | IMAGE_SCHEMA = { 26 | 'type': 'object', 27 | 'properties': { 28 | 'id': {'type': 'string'}, 29 | 'path': {'type': 'string'}, 30 | 'segmented_path': { 31 | 'anyOf': [ 32 | {'type': 'null'}, 33 | {'type': 'string'} 34 | ] 35 | }, 36 | 'width': {'type': 'integer', 'minimum': 10}, 37 | 'height': {'type': 'integer', 'minimum': 10}, 38 | }, 39 | 'required': ['id', 'path', 'segmented_path', 'width', 'height'] 40 | } 41 | 42 | 43 | DETECTION_SCHEMA = { 44 | 'type': 'object', 45 | 'properties': { 46 | 'label': {'type': 'string'}, 47 | 'top': {'type': 'number', 'minimum': 0}, 48 | 'left': {'type': 'number', 'minimum': 0}, 49 | 'right': {'type': 'number', 'minimum': 0}, 50 | 'bottom': {'type': 'number', 'minimum': 0} 51 | }, 52 | 'required': ['top', 'left', 'right', 'bottom'] 53 | } 54 | 55 | IMAGE_DETECTION_SCHEMA = { 56 | 'type': 'object', 57 | 'properties': { 58 | 'image': IMAGE_SCHEMA, 59 | 'detections': { 60 | 'type': 'array', 61 | 'items': DETECTION_SCHEMA 62 | } 63 | } 64 | } 65 | 66 | 67 | class Ingestor: 68 | def validate(self, path): 69 | """ 70 | Validate that a path contains files / directories expected for a given data format. 71 | 72 | This is where you can provide feedback to the end user if they are attempting to convert from 73 | your format but have passed you path to a directory that is missing the expected files or directory 74 | structure. 75 | 76 | :param path: Where the data is stored 77 | :return: (sucess, error message), e.g (False, "error message") if anything is awry, (True, None) otherwise. 78 | """ 79 | return True, None 80 | 81 | def ingest(self, path): 82 | """ 83 | Read in data from the filesytem. 84 | :param path: '/path/to/data/' 85 | :return: an array of dicts conforming to `IMAGE_DETECTION_SCHEMA` 86 | """ 87 | pass 88 | 89 | 90 | class Egestor: 91 | 92 | def expected_labels(self): 93 | """ 94 | Return a dict with a key for each label generally expected by this dataset format and 95 | any aliases that should be converted. 96 | 97 | In the example below the expected labels are 'car' and 'pedestrian' and, for example, both 98 | 'Car' and 'auto' should be converted to 'car'. 99 | 100 | :return: {'car': ['Car', 'auto'], 'pedestrian': ['Person']} 101 | """ 102 | raise NotImplementedError() 103 | 104 | def egest(self, *, image_detections, root): 105 | """ 106 | Output data to the filesystem. 107 | 108 | Note: image_detections will already have any conversions specified via `expected_labels` applied 109 | by the time they are passed to this method. 110 | 111 | :param image_detections: an array of dicts conforming to `IMAGE_DETECTION_SCHEMA` 112 | :param root: '/path/to/output/data/' 113 | """ 114 | raise NotImplementedError() 115 | 116 | 117 | def convert(*, from_path, ingestor, to_path, egestor, select_only_known_labels, filter_images_without_labels): 118 | """ 119 | Converts between data formats, validating that the converted data matches 120 | `IMAGE_DETECTION_SCHEMA` along the way. 
121 | 122 | :param from_path: '/path/to/read/from' 123 | :param ingestor: `Ingestor` to read in data 124 | :param to_path: '/path/to/write/to' 125 | :param egestor: `Egestor` to write out data 126 | :return: (success, message) 127 | """ 128 | from_valid, from_msg = ingestor.validate(from_path) 129 | 130 | if not from_valid: 131 | return from_valid, from_msg 132 | 133 | image_detections = ingestor.ingest(from_path) 134 | validate_image_detections(image_detections) 135 | image_detections = convert_labels( 136 | image_detections=image_detections, expected_labels=egestor.expected_labels(), 137 | select_only_known_labels=select_only_known_labels, 138 | filter_images_without_labels=filter_images_without_labels) 139 | 140 | egestor.egest(image_detections=image_detections, root=to_path) 141 | return True, '' 142 | 143 | 144 | def validate_image_detections(image_detections): 145 | for i, image_detection in enumerate(image_detections): 146 | try: 147 | validate_schema(image_detection, IMAGE_DETECTION_SCHEMA) 148 | except SchemaError as se: 149 | raise Exception(f"at index {i}") from se 150 | image = image_detection['image'] 151 | for detection in image_detection['detections']: 152 | if detection['right'] >= image['width'] or detection['bottom'] >= image['height']: 153 | raise ValueError(f"Image {image} has out of bounds bounding box {detection}") 154 | if detection['right'] <= detection['left'] or detection['bottom'] <= detection['top']: 155 | raise ValueError(f"Image {image} has zero dimension bbox {detection}") 156 | 157 | 158 | def convert_labels(*, image_detections, expected_labels, 159 | select_only_known_labels, filter_images_without_labels): 160 | convert_dict = {} 161 | for label, aliases in expected_labels.items(): 162 | convert_dict[label.lower()] = label 163 | for alias in aliases: 164 | convert_dict[alias.lower()] = label 165 | 166 | final_image_detections = [] 167 | for image_detection in image_detections: 168 | detections = [] 169 | for detection in image_detection['detections']: 170 | label = detection['label'] 171 | fallback_label = label if not select_only_known_labels else None 172 | final_label = convert_dict.get(label.lower(), fallback_label) 173 | if final_label: 174 | detection['label'] = final_label 175 | detections.append(detection) 176 | image_detection['detections'] = detections 177 | if detections: 178 | final_image_detections.append(image_detection) 179 | elif not filter_images_without_labels: 180 | final_image_detections.append(image_detection) 181 | 182 | return final_image_detections 183 | 184 | 185 | 186 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | """ 2 | Retrain the YOLO model for your own dataset. 
3 | """ 4 | import os 5 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 6 | 7 | import numpy as np 8 | import keras.backend as K 9 | from keras.layers import Input, Lambda 10 | from keras.models import Model 11 | from keras.optimizers import Adam 12 | from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping 13 | 14 | from yolo3.model import preprocess_true_boxes, yolo_body, tiny_yolo_body, yolo_loss 15 | from yolo3.utils import get_random_data 16 | 17 | 18 | def _main(): 19 | annotation_path = '9_CLASS_test.txt' 20 | log_dir = 'logs/000/' 21 | classes_path = '9_CLASS_test_classes.txt' 22 | anchors_path = 'model_data/yolo_anchors.txt' 23 | class_names = get_classes(classes_path) 24 | num_classes = len(class_names) 25 | anchors = get_anchors(anchors_path) 26 | 27 | input_shape = (416,416) # multiple of 32, hw 28 | 29 | is_tiny_version = len(anchors)==6 # default setting 30 | if is_tiny_version: 31 | model = create_tiny_model(input_shape, anchors, num_classes, 32 | freeze_body=2, weights_path='model_data/yolo_weights.h5') 33 | else: 34 | model = create_model(input_shape, anchors, num_classes, freeze_body=2, weights_path='model_data/yolo_weights.h5') # make sure you know what you freeze 35 | 36 | 37 | logging = TensorBoard(log_dir=log_dir) 38 | checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5', 39 | monitor='val_loss', save_weights_only=True, save_best_only=True, period=3) 40 | reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1) 41 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1) 42 | 43 | val_split = 0.1 44 | with open(annotation_path) as f: 45 | lines = f.readlines() 46 | np.random.shuffle(lines) 47 | num_val = int(len(lines)*val_split) 48 | num_train = len(lines) - num_val 49 | 50 | # Train with frozen layers first, to get a stable loss. 51 | # Adjust num epochs to your dataset. This step is enough to obtain a not bad model. 52 | if True: 53 | model.compile(optimizer=Adam(lr=1e-3), loss={ 54 | # use custom yolo_loss Lambda layer. 55 | 'yolo_loss': lambda y_true, y_pred: y_pred}) 56 | 57 | batch_size = 32 58 | print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size)) 59 | model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes), 60 | steps_per_epoch=max(1, num_train//batch_size), 61 | validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes), 62 | validation_steps=max(1, num_val//batch_size), 63 | epochs=50, 64 | initial_epoch=0, 65 | callbacks=[logging, checkpoint]) 66 | model.save_weights(log_dir + 'trained_weights_stage_1.h5') 67 | 68 | # Unfreeze and continue training, to fine-tune. 69 | # Train longer if the result is not good. 
70 | if True: 71 | for i in range(len(model.layers)): 72 | model.layers[i].trainable = True 73 | model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred}) # recompile to apply the change 74 | print('Unfreeze all of the layers.') 75 | 76 | batch_size = 8 # note that more GPU memory is required after unfreezing the body 77 | print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size)) 78 | model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes), 79 | steps_per_epoch=max(1, num_train//batch_size), 80 | validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes), 81 | validation_steps=max(1, num_val//batch_size), 82 | epochs=100, 83 | initial_epoch=50, 84 | callbacks=[logging, checkpoint, reduce_lr, early_stopping]) 85 | model.save_weights(log_dir + 'trained_weights_final.h5') 86 | 87 | # Further training if needed. 88 | 89 | 90 | def get_classes(classes_path): 91 | '''loads the classes''' 92 | with open(classes_path) as f: 93 | class_names = f.readlines() 94 | class_names = [c.strip() for c in class_names] 95 | return class_names 96 | 97 | def get_anchors(anchors_path): 98 | '''loads the anchors from a file''' 99 | with open(anchors_path) as f: 100 | anchors = f.readline() 101 | anchors = [float(x) for x in anchors.split(',')] 102 | return np.array(anchors).reshape(-1, 2) 103 | 104 | 105 | def create_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2, 106 | weights_path='model_data/yolo_weights.h5'): 107 | '''create the training model''' 108 | K.clear_session() # get a new session 109 | image_input = Input(shape=(None, None, 3)) 110 | h, w = input_shape 111 | num_anchors = len(anchors) 112 | 113 | y_true = [Input(shape=(h//{0:32, 1:16, 2:8}[l], w//{0:32, 1:16, 2:8}[l], \ 114 | num_anchors//3, num_classes+5)) for l in range(3)] 115 | 116 | model_body = yolo_body(image_input, num_anchors//3, num_classes) 117 | print('Create YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes)) 118 | 119 | if load_pretrained: 120 | model_body.load_weights(weights_path, by_name=True, skip_mismatch=True) 121 | print('Load weights {}.'.format(weights_path)) 122 | if freeze_body in [1, 2]: 123 | # Freeze darknet53 body or freeze all but 3 output layers. 
124 | num = (185, len(model_body.layers)-3)[freeze_body-1] 125 | for i in range(num): model_body.layers[i].trainable = False 126 | print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers))) 127 | 128 | model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss', 129 | arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.5})( 130 | [*model_body.output, *y_true]) 131 | model = Model([model_body.input, *y_true], model_loss) 132 | 133 | return model 134 | 135 | def create_tiny_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2, 136 | weights_path='model_data/tiny_yolo_weights.h5'): 137 | '''create the training model, for Tiny YOLOv3''' 138 | K.clear_session() # get a new session 139 | image_input = Input(shape=(None, None, 3)) 140 | h, w = input_shape 141 | num_anchors = len(anchors) 142 | 143 | y_true = [Input(shape=(h//{0:32, 1:16}[l], w//{0:32, 1:16}[l], \ 144 | num_anchors//2, num_classes+5)) for l in range(2)] 145 | 146 | model_body = tiny_yolo_body(image_input, num_anchors//2, num_classes) 147 | print('Create Tiny YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes)) 148 | 149 | if load_pretrained: 150 | model_body.load_weights(weights_path, by_name=True, skip_mismatch=True) 151 | print('Load weights {}.'.format(weights_path)) 152 | if freeze_body in [1, 2]: 153 | # Freeze the darknet body or freeze all but 2 output layers. 154 | num = (20, len(model_body.layers)-2)[freeze_body-1] 155 | for i in range(num): model_body.layers[i].trainable = False 156 | print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers))) 157 | 158 | model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss', 159 | arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.7})( 160 | [*model_body.output, *y_true]) 161 | model = Model([model_body.input, *y_true], model_loss) 162 | 163 | return model 164 | 165 | def data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes): 166 | '''data generator for fit_generator''' 167 | n = len(annotation_lines) 168 | i = 0 169 | while True: 170 | image_data = [] 171 | box_data = [] 172 | for b in range(batch_size): 173 | if i==0: 174 | np.random.shuffle(annotation_lines) 175 | image, box = get_random_data(annotation_lines[i], input_shape, random=True) 176 | image_data.append(image) 177 | box_data.append(box) 178 | i = (i+1) % n 179 | image_data = np.array(image_data) 180 | box_data = np.array(box_data) 181 | y_true = preprocess_true_boxes(box_data, input_shape, anchors, num_classes) 182 | yield [image_data, *y_true], np.zeros(batch_size) 183 | 184 | def data_generator_wrapper(annotation_lines, batch_size, input_shape, anchors, num_classes): 185 | n = len(annotation_lines) 186 | if n==0 or batch_size<=0: return None 187 | return data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes) 188 | 189 | if __name__ == '__main__': 190 | _main() 191 | -------------------------------------------------------------------------------- /convert.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Reads Darknet config and weights and creates Keras model with TF backend. 
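Typical invocation (the Darknet weights file name is illustrative):
    python convert.py model_data/yolov3.cfg yolov3.weights model_data/yolo_weights.h5
Use -w to save only the weights, or -p to also save a plot of the model.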
4 | 5 | """ 6 | 7 | import argparse 8 | import configparser 9 | import io 10 | import os 11 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 12 | from collections import defaultdict 13 | 14 | import numpy as np 15 | from keras import backend as K 16 | from keras.layers import (Conv2D, Input, ZeroPadding2D, Add, 17 | UpSampling2D, MaxPooling2D, Concatenate) 18 | from keras.layers.advanced_activations import LeakyReLU 19 | from keras.layers.normalization import BatchNormalization 20 | from keras.models import Model 21 | from keras.regularizers import l2 22 | from keras.utils.vis_utils import plot_model as plot 23 | 24 | 25 | parser = argparse.ArgumentParser(description='Darknet To Keras Converter.') 26 | parser.add_argument('config_path', help='Path to Darknet cfg file.') 27 | parser.add_argument('weights_path', help='Path to Darknet weights file.') 28 | parser.add_argument('output_path', help='Path to output Keras model file.') 29 | parser.add_argument( 30 | '-p', 31 | '--plot_model', 32 | help='Plot generated Keras model and save as image.', 33 | action='store_true') 34 | parser.add_argument( 35 | '-w', 36 | '--weights_only', 37 | help='Save as Keras weights file instead of model file.', 38 | action='store_true') 39 | 40 | def unique_config_sections(config_file): 41 | """Convert all config sections to have unique names. 42 | 43 | Adds unique suffixes to config sections for compability with configparser. 44 | """ 45 | section_counters = defaultdict(int) 46 | output_stream = io.StringIO() 47 | with open(config_file) as fin: 48 | for line in fin: 49 | if line.startswith('['): 50 | section = line.strip().strip('[]') 51 | _section = section + '_' + str(section_counters[section]) 52 | section_counters[section] += 1 53 | line = line.replace(section, _section) 54 | output_stream.write(line) 55 | output_stream.seek(0) 56 | return output_stream 57 | 58 | # %% 59 | def _main(args): 60 | config_path = os.path.expanduser(args.config_path) 61 | weights_path = os.path.expanduser(args.weights_path) 62 | assert config_path.endswith('.cfg'), '{} is not a .cfg file'.format( 63 | config_path) 64 | assert weights_path.endswith( 65 | '.weights'), '{} is not a .weights file'.format(weights_path) 66 | 67 | output_path = os.path.expanduser(args.output_path) 68 | assert output_path.endswith( 69 | '.h5'), 'output path {} is not a .h5 file'.format(output_path) 70 | output_root = os.path.splitext(output_path)[0] 71 | 72 | # Load weights and config. 
73 | print('Loading weights.') 74 | weights_file = open(weights_path, 'rb') 75 | major, minor, revision = np.ndarray( 76 | shape=(3, ), dtype='int32', buffer=weights_file.read(12)) 77 | if (major*10+minor)>=2 and major<1000 and minor<1000: 78 | seen = np.ndarray(shape=(1,), dtype='int64', buffer=weights_file.read(8)) 79 | else: 80 | seen = np.ndarray(shape=(1,), dtype='int32', buffer=weights_file.read(4)) 81 | print('Weights Header: ', major, minor, revision, seen) 82 | 83 | print('Parsing Darknet config.') 84 | unique_config_file = unique_config_sections(config_path) 85 | cfg_parser = configparser.ConfigParser() 86 | cfg_parser.read_file(unique_config_file) 87 | 88 | print('Creating Keras model.') 89 | input_layer = Input(shape=(None, None, 3)) 90 | prev_layer = input_layer 91 | all_layers = [] 92 | 93 | weight_decay = float(cfg_parser['net_0']['decay'] 94 | ) if 'net_0' in cfg_parser.sections() else 5e-4 95 | count = 0 96 | out_index = [] 97 | for section in cfg_parser.sections(): 98 | print('Parsing section {}'.format(section)) 99 | if section.startswith('convolutional'): 100 | filters = int(cfg_parser[section]['filters']) 101 | size = int(cfg_parser[section]['size']) 102 | stride = int(cfg_parser[section]['stride']) 103 | pad = int(cfg_parser[section]['pad']) 104 | activation = cfg_parser[section]['activation'] 105 | batch_normalize = 'batch_normalize' in cfg_parser[section] 106 | 107 | padding = 'same' if pad == 1 and stride == 1 else 'valid' 108 | 109 | # Setting weights. 110 | # Darknet serializes convolutional weights as: 111 | # [bias/beta, [gamma, mean, variance], conv_weights] 112 | prev_layer_shape = K.int_shape(prev_layer) 113 | 114 | weights_shape = (size, size, prev_layer_shape[-1], filters) 115 | darknet_w_shape = (filters, weights_shape[2], size, size) 116 | weights_size = np.product(weights_shape) 117 | 118 | print('conv2d', 'bn' 119 | if batch_normalize else ' ', activation, weights_shape) 120 | 121 | conv_bias = np.ndarray( 122 | shape=(filters, ), 123 | dtype='float32', 124 | buffer=weights_file.read(filters * 4)) 125 | count += filters 126 | 127 | if batch_normalize: 128 | bn_weights = np.ndarray( 129 | shape=(3, filters), 130 | dtype='float32', 131 | buffer=weights_file.read(filters * 12)) 132 | count += 3 * filters 133 | 134 | bn_weight_list = [ 135 | bn_weights[0], # scale gamma 136 | conv_bias, # shift beta 137 | bn_weights[1], # running mean 138 | bn_weights[2] # running var 139 | ] 140 | 141 | conv_weights = np.ndarray( 142 | shape=darknet_w_shape, 143 | dtype='float32', 144 | buffer=weights_file.read(weights_size * 4)) 145 | count += weights_size 146 | 147 | # DarkNet conv_weights are serialized Caffe-style: 148 | # (out_dim, in_dim, height, width) 149 | # We would like to set these to Tensorflow order: 150 | # (height, width, in_dim, out_dim) 151 | conv_weights = np.transpose(conv_weights, [2, 3, 1, 0]) 152 | conv_weights = [conv_weights] if batch_normalize else [ 153 | conv_weights, conv_bias 154 | ] 155 | 156 | # Handle activation. 157 | act_fn = None 158 | if activation == 'leaky': 159 | pass # Add advanced activation later. 
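            # 'leaky' is applied later: a LeakyReLU(alpha=0.1) layer is appended
            # after the Conv2D (and optional BatchNormalization) created below,
            # which is why this branch only passes here.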
160 | elif activation != 'linear': 161 | raise ValueError( 162 | 'Unknown activation function `{}` in section {}'.format( 163 | activation, section)) 164 | 165 | # Create Conv2D layer 166 | if stride>1: 167 | # Darknet uses left and top padding instead of 'same' mode 168 | prev_layer = ZeroPadding2D(((1,0),(1,0)))(prev_layer) 169 | conv_layer = (Conv2D( 170 | filters, (size, size), 171 | strides=(stride, stride), 172 | kernel_regularizer=l2(weight_decay), 173 | use_bias=not batch_normalize, 174 | weights=conv_weights, 175 | activation=act_fn, 176 | padding=padding))(prev_layer) 177 | 178 | if batch_normalize: 179 | conv_layer = (BatchNormalization( 180 | weights=bn_weight_list))(conv_layer) 181 | prev_layer = conv_layer 182 | 183 | if activation == 'linear': 184 | all_layers.append(prev_layer) 185 | elif activation == 'leaky': 186 | act_layer = LeakyReLU(alpha=0.1)(prev_layer) 187 | prev_layer = act_layer 188 | all_layers.append(act_layer) 189 | 190 | elif section.startswith('route'): 191 | ids = [int(i) for i in cfg_parser[section]['layers'].split(',')] 192 | layers = [all_layers[i] for i in ids] 193 | if len(layers) > 1: 194 | print('Concatenating route layers:', layers) 195 | concatenate_layer = Concatenate()(layers) 196 | all_layers.append(concatenate_layer) 197 | prev_layer = concatenate_layer 198 | else: 199 | skip_layer = layers[0] # only one layer to route 200 | all_layers.append(skip_layer) 201 | prev_layer = skip_layer 202 | 203 | elif section.startswith('maxpool'): 204 | size = int(cfg_parser[section]['size']) 205 | stride = int(cfg_parser[section]['stride']) 206 | all_layers.append( 207 | MaxPooling2D( 208 | pool_size=(size, size), 209 | strides=(stride, stride), 210 | padding='same')(prev_layer)) 211 | prev_layer = all_layers[-1] 212 | 213 | elif section.startswith('shortcut'): 214 | index = int(cfg_parser[section]['from']) 215 | activation = cfg_parser[section]['activation'] 216 | assert activation == 'linear', 'Only linear activation supported.' 217 | all_layers.append(Add()([all_layers[index], prev_layer])) 218 | prev_layer = all_layers[-1] 219 | 220 | elif section.startswith('upsample'): 221 | stride = int(cfg_parser[section]['stride']) 222 | assert stride == 2, 'Only stride=2 supported.' 223 | all_layers.append(UpSampling2D(stride)(prev_layer)) 224 | prev_layer = all_layers[-1] 225 | 226 | elif section.startswith('yolo'): 227 | out_index.append(len(all_layers)-1) 228 | all_layers.append(None) 229 | prev_layer = all_layers[-1] 230 | 231 | elif section.startswith('net'): 232 | pass 233 | 234 | else: 235 | raise ValueError( 236 | 'Unsupported section header type: {}'.format(section)) 237 | 238 | # Create and save model. 239 | if len(out_index)==0: out_index.append(len(all_layers)-1) 240 | model = Model(inputs=input_layer, outputs=[all_layers[i] for i in out_index]) 241 | print(model.summary()) 242 | if args.weights_only: 243 | model.save_weights('{}'.format(output_path)) 244 | print('Saved Keras weights to {}'.format(output_path)) 245 | else: 246 | model.save('{}'.format(output_path)) 247 | print('Saved Keras model to {}'.format(output_path)) 248 | 249 | # Check to see if all weights have been read. 
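    # Whatever is still left in the file is unread float32 parameters, so the
    # remaining byte count divided by 4 gives the number of unused weights.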
250 | remaining_weights = len(weights_file.read()) / 4 251 | weights_file.close() 252 | print('Read {} of {} from Darknet weights.'.format(count, count + 253 | remaining_weights)) 254 | if remaining_weights > 0: 255 | print('Warning: {} unused weights'.format(remaining_weights)) 256 | 257 | if args.plot_model: 258 | plot(model, to_file='{}.png'.format(output_root), show_shapes=True) 259 | print('Saved model plot to {}.png'.format(output_root)) 260 | 261 | 262 | if __name__ == '__main__': 263 | _main(parser.parse_args()) 264 | -------------------------------------------------------------------------------- /train_bottleneck.py: -------------------------------------------------------------------------------- 1 | """ 2 | Retrain the YOLO model for your own dataset. 3 | """ 4 | import os 5 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 6 | 7 | import numpy as np 8 | import keras.backend as K 9 | from keras.layers import Input, Lambda 10 | from keras.models import Model 11 | from keras.optimizers import Adam 12 | from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping 13 | 14 | from yolo3.model import preprocess_true_boxes, yolo_body, tiny_yolo_body, yolo_loss 15 | from yolo3.utils import get_random_data 16 | 17 | 18 | def _main(): 19 | annotation_path = '4_CLASS_test.txt' 20 | log_dir = 'logs/' 21 | classes_path = '4_CLASS_test_classes.txt' 22 | anchors_path = 'model_data/yolo_anchors.txt' 23 | class_names = get_classes(classes_path) 24 | num_classes = len(class_names) 25 | anchors = get_anchors(anchors_path) 26 | 27 | input_shape = (416,416) # multiple of 32, hw 28 | 29 | model, bottleneck_model, last_layer_model = create_model(input_shape, anchors, num_classes, 30 | freeze_body=2, weights_path='model_data/yolo_weights.h5') # make sure you know what you freeze 31 | 32 | logging = TensorBoard(log_dir=log_dir) 33 | checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5', 34 | monitor='val_loss', save_weights_only=True, save_best_only=True, period=10) 35 | reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1) 36 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1) 37 | 38 | val_split = 0.1 39 | with open(annotation_path) as f: 40 | lines = f.readlines() 41 | #np.random.seed(10101) 42 | np.random.shuffle(lines) 43 | #np.random.seed(None) 44 | num_val = int(len(lines)*val_split) 45 | num_train = len(lines) - num_val 46 | 47 | # Train with frozen layers first, to get a stable loss. 48 | # Adjust num epochs to your dataset. This step is enough to obtain a not bad model. 
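    # Bottleneck training: while the body is frozen its outputs per image never
    # change, so they are predicted once with bottleneck_model, cached in
    # bottlenecks.npz, and the last layers are first trained on those cached
    # features before the usual image-based stage-1/stage-2 training below.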
49 | if True: 50 | # perform bottleneck training 51 | if not os.path.isfile("bottlenecks.npz"): 52 | print("calculating bottlenecks") 53 | batch_size=32 54 | bottlenecks=bottleneck_model.predict_generator(data_generator_wrapper(lines, batch_size, input_shape, anchors, num_classes, random=False, verbose=True), 55 | steps=(len(lines)//batch_size)+1, max_queue_size=1) 56 | np.savez("bottlenecks.npz", bot0=bottlenecks[0], bot1=bottlenecks[1], bot2=bottlenecks[2]) 57 | 58 | # load bottleneck features from file 59 | dict_bot=np.load("bottlenecks.npz") 60 | bottlenecks_train=[dict_bot["bot0"][:num_train], dict_bot["bot1"][:num_train], dict_bot["bot2"][:num_train]] 61 | bottlenecks_val=[dict_bot["bot0"][num_train:], dict_bot["bot1"][num_train:], dict_bot["bot2"][num_train:]] 62 | 63 | # train last layers with fixed bottleneck features 64 | batch_size=32 65 | print("Training last layers with bottleneck features") 66 | print('with {} samples, val on {} samples and batch size {}.'.format(num_train, num_val, batch_size)) 67 | last_layer_model.compile(optimizer='adam', loss={'yolo_loss': lambda y_true, y_pred: y_pred}) 68 | last_layer_model.fit_generator(bottleneck_generator(lines[:num_train], batch_size, input_shape, anchors, num_classes, bottlenecks_train), 69 | steps_per_epoch=max(1, num_train//batch_size), 70 | validation_data=bottleneck_generator(lines[num_train:], batch_size, input_shape, anchors, num_classes, bottlenecks_val), 71 | validation_steps=max(1, num_val//batch_size), 72 | epochs=90, 73 | initial_epoch=0, max_queue_size=1) 74 | model.save_weights(log_dir + 'trained_weights_stage_0.h5') 75 | 76 | # train last layers with random augmented data 77 | model.compile(optimizer=Adam(lr=1e-3), loss={ 78 | # use custom yolo_loss Lambda layer. 79 | 'yolo_loss': lambda y_true, y_pred: y_pred}) 80 | batch_size = 32 81 | print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size)) 82 | model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes), 83 | steps_per_epoch=max(1, num_train//batch_size), 84 | validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes), 85 | validation_steps=max(1, num_val//batch_size), 86 | epochs=150, 87 | initial_epoch=0, 88 | callbacks=[logging, checkpoint]) 89 | model.save_weights(log_dir + 'trained_weights_stage_1.h5') 90 | 91 | # Unfreeze and continue training, to fine-tune. 92 | # Train longer if the result is not good. 93 | if True: 94 | for i in range(len(model.layers)): 95 | model.layers[i].trainable = True 96 | model.compile(optimizer=Adam(lr=1e-4), loss={'yolo_loss': lambda y_true, y_pred: y_pred}) # recompile to apply the change 97 | print('Unfreeze all of the layers.') 98 | 99 | batch_size = 4 # note that more GPU memory is required after unfreezing the body 100 | print('Train on {} samples, val on {} samples, with batch size {}.'.format(num_train, num_val, batch_size)) 101 | model.fit_generator(data_generator_wrapper(lines[:num_train], batch_size, input_shape, anchors, num_classes), 102 | steps_per_epoch=max(1, num_train//batch_size), 103 | validation_data=data_generator_wrapper(lines[num_train:], batch_size, input_shape, anchors, num_classes), 104 | validation_steps=max(1, num_val//batch_size), 105 | epochs=300, 106 | initial_epoch=150, 107 | callbacks=[logging, checkpoint, reduce_lr, early_stopping]) 108 | model.save_weights(log_dir + 'trained_weights_final.h5') 109 | 110 | # Further training if needed. 
111 | 112 | 113 | def get_classes(classes_path): 114 | '''loads the classes''' 115 | with open(classes_path) as f: 116 | class_names = f.readlines() 117 | class_names = [c.strip() for c in class_names] 118 | return class_names 119 | 120 | def get_anchors(anchors_path): 121 | '''loads the anchors from a file''' 122 | with open(anchors_path) as f: 123 | anchors = f.readline() 124 | anchors = [float(x) for x in anchors.split(',')] 125 | return np.array(anchors).reshape(-1, 2) 126 | 127 | 128 | def create_model(input_shape, anchors, num_classes, load_pretrained=True, freeze_body=2, 129 | weights_path='model_data/yolo_weights.h5'): 130 | '''create the training model''' 131 | K.clear_session() # get a new session 132 | image_input = Input(shape=(None, None, 3)) 133 | h, w = input_shape 134 | num_anchors = len(anchors) 135 | 136 | y_true = [Input(shape=(h//{0:32, 1:16, 2:8}[l], w//{0:32, 1:16, 2:8}[l], \ 137 | num_anchors//3, num_classes+5)) for l in range(3)] 138 | 139 | model_body = yolo_body(image_input, num_anchors//3, num_classes) 140 | print('Create YOLOv3 model with {} anchors and {} classes.'.format(num_anchors, num_classes)) 141 | 142 | if load_pretrained: 143 | model_body.load_weights(weights_path, by_name=True, skip_mismatch=True) 144 | print('Load weights {}.'.format(weights_path)) 145 | if freeze_body in [1, 2]: 146 | # Freeze darknet53 body or freeze all but 3 output layers. 147 | num = (185, len(model_body.layers)-3)[freeze_body-1] 148 | for i in range(num): model_body.layers[i].trainable = False 149 | print('Freeze the first {} layers of total {} layers.'.format(num, len(model_body.layers))) 150 | 151 | # get output of second last layers and create bottleneck model of it 152 | out1=model_body.layers[246].output 153 | out2=model_body.layers[247].output 154 | out3=model_body.layers[248].output 155 | bottleneck_model = Model([model_body.input, *y_true], [out1, out2, out3]) 156 | 157 | # create last layer model of last layers from yolo model 158 | in0 = Input(shape=bottleneck_model.output[0].shape[1:].as_list()) 159 | in1 = Input(shape=bottleneck_model.output[1].shape[1:].as_list()) 160 | in2 = Input(shape=bottleneck_model.output[2].shape[1:].as_list()) 161 | last_out0=model_body.layers[249](in0) 162 | last_out1=model_body.layers[250](in1) 163 | last_out2=model_body.layers[251](in2) 164 | model_last=Model(inputs=[in0, in1, in2], outputs=[last_out0, last_out1, last_out2]) 165 | model_loss_last =Lambda(yolo_loss, output_shape=(1,), name='yolo_loss', 166 | arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.5})( 167 | [*model_last.output, *y_true]) 168 | last_layer_model = Model([in0,in1,in2, *y_true], model_loss_last) 169 | 170 | 171 | model_loss = Lambda(yolo_loss, output_shape=(1,), name='yolo_loss', 172 | arguments={'anchors': anchors, 'num_classes': num_classes, 'ignore_thresh': 0.5})( 173 | [*model_body.output, *y_true]) 174 | model = Model([model_body.input, *y_true], model_loss) 175 | 176 | return model, bottleneck_model, last_layer_model 177 | 178 | def data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes, random=True, verbose=False): 179 | '''data generator for fit_generator''' 180 | n = len(annotation_lines) 181 | i = 0 182 | while True: 183 | image_data = [] 184 | box_data = [] 185 | for b in range(batch_size): 186 | if i==0 and random: 187 | np.random.shuffle(annotation_lines) 188 | image, box = get_random_data(annotation_lines[i], input_shape, random=random) 189 | image_data.append(image) 190 | box_data.append(box) 191 | i 
= (i+1) % n 192 | image_data = np.array(image_data) 193 | if verbose: 194 | print("Progress: ",i,"/",n) 195 | box_data = np.array(box_data) 196 | y_true = preprocess_true_boxes(box_data, input_shape, anchors, num_classes) 197 | yield [image_data, *y_true], np.zeros(batch_size) 198 | 199 | def data_generator_wrapper(annotation_lines, batch_size, input_shape, anchors, num_classes, random=True, verbose=False): 200 | n = len(annotation_lines) 201 | if n==0 or batch_size<=0: return None 202 | return data_generator(annotation_lines, batch_size, input_shape, anchors, num_classes, random, verbose) 203 | 204 | def bottleneck_generator(annotation_lines, batch_size, input_shape, anchors, num_classes, bottlenecks): 205 | n = len(annotation_lines) 206 | i = 0 207 | while True: 208 | box_data = [] 209 | b0=np.zeros((batch_size,bottlenecks[0].shape[1],bottlenecks[0].shape[2],bottlenecks[0].shape[3])) 210 | b1=np.zeros((batch_size,bottlenecks[1].shape[1],bottlenecks[1].shape[2],bottlenecks[1].shape[3])) 211 | b2=np.zeros((batch_size,bottlenecks[2].shape[1],bottlenecks[2].shape[2],bottlenecks[2].shape[3])) 212 | for b in range(batch_size): 213 | _, box = get_random_data(annotation_lines[i], input_shape, random=False, proc_img=False) 214 | box_data.append(box) 215 | b0[b]=bottlenecks[0][i] 216 | b1[b]=bottlenecks[1][i] 217 | b2[b]=bottlenecks[2][i] 218 | i = (i+1) % n 219 | box_data = np.array(box_data) 220 | y_true = preprocess_true_boxes(box_data, input_shape, anchors, num_classes) 221 | yield [b0, b1, b2, *y_true], np.zeros(batch_size) 222 | 223 | if __name__ == '__main__': 224 | _main() 225 | -------------------------------------------------------------------------------- /model_data/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | batch=1 4 | subdivisions=1 5 | # Training 6 | # batch=64 7 | # subdivisions=16 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 
114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | 
activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | 
batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [convolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .5 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [convolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .5 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [convolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | 
pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .5 787 | truth_thresh = 1 788 | random=1 789 | 790 | -------------------------------------------------------------------------------- /yolo3/model.py: -------------------------------------------------------------------------------- 1 | """YOLO_v3 Model Defined in Keras.""" 2 | 3 | from functools import wraps 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from keras import backend as K 8 | from keras.layers import Conv2D, Add, ZeroPadding2D, UpSampling2D, Concatenate, MaxPooling2D 9 | from keras.layers.advanced_activations import LeakyReLU 10 | from keras.layers.normalization import BatchNormalization 11 | from keras.models import Model 12 | from keras.regularizers import l2 13 | 14 | from yolo3.utils import compose 15 | 16 | 17 | @wraps(Conv2D) 18 | def DarknetConv2D(*args, **kwargs): 19 | """Wrapper to set Darknet parameters for Convolution2D.""" 20 | darknet_conv_kwargs = {'kernel_regularizer': l2(5e-4)} 21 | darknet_conv_kwargs['padding'] = 'valid' if kwargs.get('strides')==(2,2) else 'same' 22 | darknet_conv_kwargs.update(kwargs) 23 | return Conv2D(*args, **darknet_conv_kwargs) 24 | 25 | def DarknetConv2D_BN_Leaky(*args, **kwargs): 26 | """Darknet Convolution2D followed by BatchNormalization and LeakyReLU.""" 27 | no_bias_kwargs = {'use_bias': False} 28 | no_bias_kwargs.update(kwargs) 29 | return compose( 30 | DarknetConv2D(*args, **no_bias_kwargs), 31 | BatchNormalization(), 32 | LeakyReLU(alpha=0.1)) 33 | 34 | def resblock_body(x, num_filters, num_blocks): 35 | '''A series of resblocks starting with a downsampling Convolution2D''' 36 | # Darknet uses left and top padding instead of 'same' mode 37 | x = ZeroPadding2D(((1,0),(1,0)))(x) 38 | x = DarknetConv2D_BN_Leaky(num_filters, (3,3), strides=(2,2))(x) 39 | for i in range(num_blocks): 40 | y = compose( 41 | DarknetConv2D_BN_Leaky(num_filters//2, (1,1)), 42 | DarknetConv2D_BN_Leaky(num_filters, (3,3)))(x) 43 | x = Add()([x,y]) 44 | return x 45 | 46 | def darknet_body(x): 47 | '''Darknent body having 52 Convolution2D layers''' 48 | x = DarknetConv2D_BN_Leaky(32, (3,3))(x) 49 | x = resblock_body(x, 64, 1) 50 | x = resblock_body(x, 128, 2) 51 | x = resblock_body(x, 256, 8) 52 | x = resblock_body(x, 512, 8) 53 | x = resblock_body(x, 1024, 4) 54 | return x 55 | 56 | def make_last_layers(x, num_filters, out_filters): 57 | '''6 Conv2D_BN_Leaky layers followed by a Conv2D_linear layer''' 58 | x = compose( 59 | DarknetConv2D_BN_Leaky(num_filters, (1,1)), 60 | DarknetConv2D_BN_Leaky(num_filters*2, (3,3)), 61 | DarknetConv2D_BN_Leaky(num_filters, (1,1)), 62 | DarknetConv2D_BN_Leaky(num_filters*2, (3,3)), 63 | DarknetConv2D_BN_Leaky(num_filters, (1,1)))(x) 64 | y = compose( 65 | DarknetConv2D_BN_Leaky(num_filters*2, (3,3)), 66 | DarknetConv2D(out_filters, (1,1)))(x) 67 | return x, y 68 | 69 | 70 | def yolo_body(inputs, num_anchors, num_classes): 71 | """Create YOLO_V3 model CNN body in Keras.""" 72 | darknet = Model(inputs, darknet_body(inputs)) 73 | x, y1 = make_last_layers(darknet.output, 512, num_anchors*(num_classes+5)) 74 | 75 | x = compose( 76 | DarknetConv2D_BN_Leaky(256, (1,1)), 77 | UpSampling2D(2))(x) 78 | x = Concatenate()([x,darknet.layers[152].output]) 79 | x, y2 = 
make_last_layers(x, 256, num_anchors*(num_classes+5)) 80 | 81 | x = compose( 82 | DarknetConv2D_BN_Leaky(128, (1,1)), 83 | UpSampling2D(2))(x) 84 | x = Concatenate()([x,darknet.layers[92].output]) 85 | x, y3 = make_last_layers(x, 128, num_anchors*(num_classes+5)) 86 | 87 | return Model(inputs, [y1,y2,y3]) 88 | 89 | def tiny_yolo_body(inputs, num_anchors, num_classes): 90 | '''Create Tiny YOLO_v3 model CNN body in keras.''' 91 | x1 = compose( 92 | DarknetConv2D_BN_Leaky(16, (3,3)), 93 | MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'), 94 | DarknetConv2D_BN_Leaky(32, (3,3)), 95 | MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'), 96 | DarknetConv2D_BN_Leaky(64, (3,3)), 97 | MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'), 98 | DarknetConv2D_BN_Leaky(128, (3,3)), 99 | MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'), 100 | DarknetConv2D_BN_Leaky(256, (3,3)))(inputs) 101 | x2 = compose( 102 | MaxPooling2D(pool_size=(2,2), strides=(2,2), padding='same'), 103 | DarknetConv2D_BN_Leaky(512, (3,3)), 104 | MaxPooling2D(pool_size=(2,2), strides=(1,1), padding='same'), 105 | DarknetConv2D_BN_Leaky(1024, (3,3)), 106 | DarknetConv2D_BN_Leaky(256, (1,1)))(x1) 107 | y1 = compose( 108 | DarknetConv2D_BN_Leaky(512, (3,3)), 109 | DarknetConv2D(num_anchors*(num_classes+5), (1,1)))(x2) 110 | 111 | x2 = compose( 112 | DarknetConv2D_BN_Leaky(128, (1,1)), 113 | UpSampling2D(2))(x2) 114 | y2 = compose( 115 | Concatenate(), 116 | DarknetConv2D_BN_Leaky(256, (3,3)), 117 | DarknetConv2D(num_anchors*(num_classes+5), (1,1)))([x2,x1]) 118 | 119 | return Model(inputs, [y1,y2]) 120 | 121 | 122 | def yolo_head(feats, anchors, num_classes, input_shape, calc_loss=False): 123 | """Convert final layer features to bounding box parameters.""" 124 | num_anchors = len(anchors) 125 | # Reshape to batch, height, width, num_anchors, box_params. 126 | anchors_tensor = K.reshape(K.constant(anchors), [1, 1, 1, num_anchors, 2]) 127 | 128 | grid_shape = K.shape(feats)[1:3] # height, width 129 | grid_y = K.tile(K.reshape(K.arange(0, stop=grid_shape[0]), [-1, 1, 1, 1]), 130 | [1, grid_shape[1], 1, 1]) 131 | grid_x = K.tile(K.reshape(K.arange(0, stop=grid_shape[1]), [1, -1, 1, 1]), 132 | [grid_shape[0], 1, 1, 1]) 133 | grid = K.concatenate([grid_x, grid_y]) 134 | grid = K.cast(grid, K.dtype(feats)) 135 | 136 | feats = K.reshape( 137 | feats, [-1, grid_shape[0], grid_shape[1], num_anchors, num_classes + 5]) 138 | 139 | # Adjust preditions to each spatial grid point and anchor size. 140 | box_xy = (K.sigmoid(feats[..., :2]) + grid) / K.cast(grid_shape[::-1], K.dtype(feats)) 141 | box_wh = K.exp(feats[..., 2:4]) * anchors_tensor / K.cast(input_shape[::-1], K.dtype(feats)) 142 | box_confidence = K.sigmoid(feats[..., 4:5]) 143 | box_class_probs = K.sigmoid(feats[..., 5:]) 144 | 145 | if calc_loss == True: 146 | return grid, feats, box_xy, box_wh 147 | return box_xy, box_wh, box_confidence, box_class_probs 148 | 149 | 150 | def yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape): 151 | '''Get corrected boxes''' 152 | box_yx = box_xy[..., ::-1] 153 | box_hw = box_wh[..., ::-1] 154 | input_shape = K.cast(input_shape, K.dtype(box_yx)) 155 | image_shape = K.cast(image_shape, K.dtype(box_yx)) 156 | new_shape = K.round(image_shape * K.min(input_shape/image_shape)) 157 | offset = (input_shape-new_shape)/2./input_shape 158 | scale = input_shape/new_shape 159 | box_yx = (box_yx - offset) * scale 160 | box_hw *= scale 161 | 162 | box_mins = box_yx - (box_hw / 2.) 
163 | box_maxes = box_yx + (box_hw / 2.) 164 | boxes = K.concatenate([ 165 | box_mins[..., 0:1], # y_min 166 | box_mins[..., 1:2], # x_min 167 | box_maxes[..., 0:1], # y_max 168 | box_maxes[..., 1:2] # x_max 169 | ]) 170 | 171 | # Scale boxes back to original image shape. 172 | boxes *= K.concatenate([image_shape, image_shape]) 173 | return boxes 174 | 175 | 176 | def yolo_boxes_and_scores(feats, anchors, num_classes, input_shape, image_shape): 177 | '''Process Conv layer output''' 178 | box_xy, box_wh, box_confidence, box_class_probs = yolo_head(feats, 179 | anchors, num_classes, input_shape) 180 | boxes = yolo_correct_boxes(box_xy, box_wh, input_shape, image_shape) 181 | boxes = K.reshape(boxes, [-1, 4]) 182 | box_scores = box_confidence * box_class_probs 183 | box_scores = K.reshape(box_scores, [-1, num_classes]) 184 | return boxes, box_scores 185 | 186 | 187 | def yolo_eval(yolo_outputs, 188 | anchors, 189 | num_classes, 190 | image_shape, 191 | max_boxes=20, 192 | score_threshold=.6, 193 | iou_threshold=.5): 194 | """Evaluate YOLO model on given input and return filtered boxes.""" 195 | num_layers = len(yolo_outputs) 196 | anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]] # default setting 197 | input_shape = K.shape(yolo_outputs[0])[1:3] * 32 198 | boxes = [] 199 | box_scores = [] 200 | for l in range(num_layers): 201 | _boxes, _box_scores = yolo_boxes_and_scores(yolo_outputs[l], 202 | anchors[anchor_mask[l]], num_classes, input_shape, image_shape) 203 | boxes.append(_boxes) 204 | box_scores.append(_box_scores) 205 | boxes = K.concatenate(boxes, axis=0) 206 | box_scores = K.concatenate(box_scores, axis=0) 207 | 208 | mask = box_scores >= score_threshold 209 | max_boxes_tensor = K.constant(max_boxes, dtype='int32') 210 | boxes_ = [] 211 | scores_ = [] 212 | classes_ = [] 213 | for c in range(num_classes): 214 | # TODO: use keras backend instead of tf. 215 | class_boxes = tf.boolean_mask(boxes, mask[:, c]) 216 | class_box_scores = tf.boolean_mask(box_scores[:, c], mask[:, c]) 217 | nms_index = tf.image.non_max_suppression( 218 | class_boxes, class_box_scores, max_boxes_tensor, iou_threshold=iou_threshold) 219 | class_boxes = K.gather(class_boxes, nms_index) 220 | class_box_scores = K.gather(class_box_scores, nms_index) 221 | classes = K.ones_like(class_box_scores, 'int32') * c 222 | boxes_.append(class_boxes) 223 | scores_.append(class_box_scores) 224 | classes_.append(classes) 225 | boxes_ = K.concatenate(boxes_, axis=0) 226 | scores_ = K.concatenate(scores_, axis=0) 227 | classes_ = K.concatenate(classes_, axis=0) 228 | 229 | return boxes_, scores_, classes_ 230 | 231 | 232 | def preprocess_true_boxes(true_boxes, input_shape, anchors, num_classes): 233 | '''Preprocess true boxes to training input format 234 | 235 | Parameters 236 | ---------- 237 | true_boxes: array, shape=(m, T, 5) 238 | Absolute x_min, y_min, x_max, y_max, class_id relative to input_shape. 239 | input_shape: array-like, hw, multiples of 32 240 | anchors: array, shape=(N, 2), wh 241 | num_classes: integer 242 | 243 | Returns 244 | ------- 245 | y_true: list of array, shape like yolo_outputs, xywh are relative value 246 | 247 | ''' 248 | assert (true_boxes[..., 4]<num_classes).all(), 'class id must be less than num_classes' 249 | num_layers = len(anchors)//3 # default setting 250 | anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]] 251 | 252 | true_boxes = np.array(true_boxes, dtype='float32') 253 | input_shape = np.array(input_shape, dtype='int32') 254 | boxes_xy = (true_boxes[..., 0:2] + true_boxes[..., 2:4]) // 2 255 | boxes_wh = true_boxes[..., 2:4] - true_boxes[..., 0:2] 256 | true_boxes[..., 0:2] = boxes_xy/input_shape[::-1] 257 | true_boxes[..., 2:4] = boxes_wh/input_shape[::-1] 258 | 259 | m = true_boxes.shape[0] 260 | grid_shapes = [input_shape//{0:32, 1:16, 2:8}[l] for l in range(num_layers)] 261 | y_true = [np.zeros((m,grid_shapes[l][0],grid_shapes[l][1],len(anchor_mask[l]),5+num_classes), 262 | dtype='float32') for l in range(num_layers)] 263 | 264 | # Expand dim to apply broadcasting. 265 | anchors = np.expand_dims(anchors, 0) 266 | anchor_maxes = anchors / 2. 267 | anchor_mins = -anchor_maxes 268 | valid_mask = boxes_wh[..., 0]>0 269 | 270 | for b in range(m): 271 | # Discard zero rows. 272 | wh = boxes_wh[b, valid_mask[b]] 273 | if len(wh)==0: continue 274 | # Expand dim to apply broadcasting. 275 | wh = np.expand_dims(wh, -2) 276 | box_maxes = wh / 2.
277 | box_mins = -box_maxes 278 | 279 | intersect_mins = np.maximum(box_mins, anchor_mins) 280 | intersect_maxes = np.minimum(box_maxes, anchor_maxes) 281 | intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.) 282 | intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] 283 | box_area = wh[..., 0] * wh[..., 1] 284 | anchor_area = anchors[..., 0] * anchors[..., 1] 285 | iou = intersect_area / (box_area + anchor_area - intersect_area) 286 | 287 | # Find best anchor for each true box 288 | best_anchor = np.argmax(iou, axis=-1) 289 | 290 | for t, n in enumerate(best_anchor): 291 | for l in range(num_layers): 292 | if n in anchor_mask[l]: 293 | i = np.floor(true_boxes[b,t,0]*grid_shapes[l][1]).astype('int32') 294 | j = np.floor(true_boxes[b,t,1]*grid_shapes[l][0]).astype('int32') 295 | k = anchor_mask[l].index(n) 296 | c = true_boxes[b,t, 4].astype('int32') 297 | y_true[l][b, j, i, k, 0:4] = true_boxes[b,t, 0:4] 298 | y_true[l][b, j, i, k, 4] = 1 299 | y_true[l][b, j, i, k, 5+c] = 1 300 | 301 | return y_true 302 | 303 | 304 | def box_iou(b1, b2): 305 | '''Return iou tensor 306 | 307 | Parameters 308 | ---------- 309 | b1: tensor, shape=(i1,...,iN, 4), xywh 310 | b2: tensor, shape=(j, 4), xywh 311 | 312 | Returns 313 | ------- 314 | iou: tensor, shape=(i1,...,iN, j) 315 | 316 | ''' 317 | 318 | # Expand dim to apply broadcasting. 319 | b1 = K.expand_dims(b1, -2) 320 | b1_xy = b1[..., :2] 321 | b1_wh = b1[..., 2:4] 322 | b1_wh_half = b1_wh/2. 323 | b1_mins = b1_xy - b1_wh_half 324 | b1_maxes = b1_xy + b1_wh_half 325 | 326 | # Expand dim to apply broadcasting. 327 | b2 = K.expand_dims(b2, 0) 328 | b2_xy = b2[..., :2] 329 | b2_wh = b2[..., 2:4] 330 | b2_wh_half = b2_wh/2. 331 | b2_mins = b2_xy - b2_wh_half 332 | b2_maxes = b2_xy + b2_wh_half 333 | 334 | intersect_mins = K.maximum(b1_mins, b2_mins) 335 | intersect_maxes = K.minimum(b1_maxes, b2_maxes) 336 | intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.) 
337 | intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1] 338 | b1_area = b1_wh[..., 0] * b1_wh[..., 1] 339 | b2_area = b2_wh[..., 0] * b2_wh[..., 1] 340 | iou = intersect_area / (b1_area + b2_area - intersect_area) 341 | 342 | return iou 343 | 344 | 345 | def yolo_loss(args, anchors, num_classes, ignore_thresh=.5, print_loss=False): 346 | '''Return yolo_loss tensor 347 | 348 | Parameters 349 | ---------- 350 | yolo_outputs: list of tensor, the output of yolo_body or tiny_yolo_body 351 | y_true: list of array, the output of preprocess_true_boxes 352 | anchors: array, shape=(N, 2), wh 353 | num_classes: integer 354 | ignore_thresh: float, the iou threshold whether to ignore object confidence loss 355 | 356 | Returns 357 | ------- 358 | loss: tensor, shape=(1,) 359 | 360 | ''' 361 | num_layers = len(anchors)//3 # default setting 362 | yolo_outputs = args[:num_layers] 363 | y_true = args[num_layers:] 364 | anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]] 365 | input_shape = K.cast(K.shape(yolo_outputs[0])[1:3] * 32, K.dtype(y_true[0])) 366 | grid_shapes = [K.cast(K.shape(yolo_outputs[l])[1:3], K.dtype(y_true[0])) for l in range(num_layers)] 367 | loss = 0 368 | m = K.shape(yolo_outputs[0])[0] # batch size, tensor 369 | mf = K.cast(m, K.dtype(yolo_outputs[0])) 370 | 371 | for l in range(num_layers): 372 | object_mask = y_true[l][..., 4:5] 373 | true_class_probs = y_true[l][..., 5:] 374 | 375 | grid, raw_pred, pred_xy, pred_wh = yolo_head(yolo_outputs[l], 376 | anchors[anchor_mask[l]], num_classes, input_shape, calc_loss=True) 377 | pred_box = K.concatenate([pred_xy, pred_wh]) 378 | 379 | # Darknet raw box to calculate loss. 380 | raw_true_xy = y_true[l][..., :2]*grid_shapes[l][::-1] - grid 381 | raw_true_wh = K.log(y_true[l][..., 2:4] / anchors[anchor_mask[l]] * input_shape[::-1]) 382 | raw_true_wh = K.switch(object_mask, raw_true_wh, K.zeros_like(raw_true_wh)) # avoid log(0)=-inf 383 | box_loss_scale = 2 - y_true[l][...,2:3]*y_true[l][...,3:4] 384 | 385 | # Find ignore mask, iterate over each of batch. 386 | ignore_mask = tf.TensorArray(K.dtype(y_true[0]), size=1, dynamic_size=True) 387 | object_mask_bool = K.cast(object_mask, 'bool') 388 | def loop_body(b, ignore_mask): 389 | true_box = tf.boolean_mask(y_true[l][b,...,0:4], object_mask_bool[b,...,0]) 390 | iou = box_iou(pred_box[b], true_box) 391 | best_iou = K.max(iou, axis=-1) 392 | ignore_mask = ignore_mask.write(b, K.cast(best_iou