├── .gitignore ├── LICENSE ├── README.md ├── environment.yml ├── etc ├── dog_small.jpg └── yolo.png ├── font ├── FiraMono-Medium.otf └── SIL Open Font License.txt ├── images ├── dog.jpg ├── eagle.jpg ├── giraffe.jpg ├── horses.jpg ├── person.jpg └── scream.jpg ├── model_data ├── coco_classes.txt └── pascal_classes.txt ├── retrain_yolo.py ├── test_yolo.py ├── train_overfit.py ├── voc_conversion_scripts ├── voc_to_hdf5.py └── voc_to_tfrecords.py ├── yad2k.py └── yad2k ├── __init__.py ├── models ├── __init__.py ├── keras_darknet19.py └── keras_yolo.py └── utils ├── __init__.py ├── draw_boxes.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore generated models, test images, and Darknet files. 2 | *.h5 3 | *.weights 4 | *.png 5 | *.jpg 6 | *.cfg 7 | images/* 8 | images/out/* 9 | model_data/*_anchors.txt 10 | 11 | # Tensorboard training logs 12 | logs/ 13 | 14 | # Byte-compiled / optimized / DLL files 15 | __pycache__/ 16 | *.py[cod] 17 | *$py.class 18 | 19 | # C extensions 20 | *.so 21 | 22 | # Distribution / packaging 23 | .Python 24 | env/ 25 | build/ 26 | develop-eggs/ 27 | dist/ 28 | downloads/ 29 | eggs/ 30 | .eggs/ 31 | lib/ 32 | lib64/ 33 | parts/ 34 | sdist/ 35 | var/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *,cover 59 | .hypothesis/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # IPython Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # dotenv 92 | .env 93 | 94 | # virtualenv 95 | venv/ 96 | ENV/ 97 | 98 | # Spyder project settings 99 | .spyderproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | COPYRIGHT 2 | 3 | All contributions by Allan Zelener: 4 | Copyright (c) 2017, Allan Zelener. 5 | All rights reserved. 6 | 7 | All other contributions: 8 | Copyright (c) 2017, the respective contributors. 9 | All rights reserved. 10 | 11 | Each contributor holds copyright over their respective contributions. 12 | The project versioning (Git) records all such contribution source information. 
13 | 14 | LICENSE 15 | 16 | The MIT License (MIT) 17 | 18 | Permission is hereby granted, free of charge, to any person obtaining a copy 19 | of this software and associated documentation files (the "Software"), to deal 20 | in the Software without restriction, including without limitation the rights 21 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 22 | copies of the Software, and to permit persons to whom the Software is 23 | furnished to do so, subject to the following conditions: 24 | 25 | The above copyright notice and this permission notice shall be included in all 26 | copies or substantial portions of the Software. 27 | 28 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 29 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 30 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 31 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 32 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 33 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 34 | SOFTWARE. 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # YAD2K: Yet Another Darknet 2 Keras 2 | 3 | [![license](https://img.shields.io/github/license/mashape/apistatus.svg)](LICENSE) 4 | 5 | ## Welcome to YAD2K 6 | 7 | You only look once, but you reimplement neural nets over and over again. 8 | 9 | YAD2K is a 90% Keras/10% Tensorflow implementation of YOLO_v2. 10 | 11 | Original paper: [YOLO9000: Better, Faster, Stronger](https://arxiv.org/abs/1612.08242) by Joseph Redmon and Ali Farhadi. 12 | 13 | ![YOLO_v2 COCO model with test_yolo defaults](etc/dog_small.jpg) 14 | 15 | -------------------------------------------------------------------------------- 16 | 17 | ## Requirements 18 | 19 | - [Keras](https://github.com/fchollet/keras) 20 | - [Tensorflow](https://www.tensorflow.org/) 21 | - [Numpy](http://www.numpy.org/) 22 | - [h5py](http://www.h5py.org/) (For Keras model serialization.) 23 | - [Pillow](https://pillow.readthedocs.io/) (For rendering test results.) 24 | - [Python 3](https://www.python.org/) 25 | - [pydot-ng](https://github.com/pydot/pydot-ng) (Optional for plotting model.) 26 | 27 | ### Installation 28 | ```bash 29 | git clone https://github.com/allanzelener/yad2k.git 30 | cd yad2k 31 | 32 | # [Option 1] To replicate the conda environment: 33 | conda env create -f environment.yml 34 | source activate yad2k 35 | # [Option 2] Install everything globally. 36 | pip install numpy h5py pillow 37 | pip install tensorflow-gpu # CPU-only: conda install -c conda-forge tensorflow 38 | pip install keras # Possibly older release: conda install keras 39 | ``` 40 | 41 | ## Quick Start 42 | 43 | - Download Darknet model cfg and weights from the [official YOLO website](http://pjreddie.com/darknet/yolo/). 44 | - Convert the Darknet YOLO_v2 model to a Keras model. 45 | - Test the converted model on the small test set in `images/`. 46 | 47 | ```bash 48 | wget http://pjreddie.com/media/files/yolo.weights 49 | wget https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolo.cfg 50 | ./yad2k.py yolo.cfg yolo.weights model_data/yolo.h5 51 | ./test_yolo.py model_data/yolo.h5 # output in images/out/ 52 | ``` 53 | 54 | See `./yad2k.py --help` and `./test_yolo.py --help` for more options.
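
To call the converted model from Python rather than through `test_yolo.py`, the sketch below mirrors what that script does. It is a minimal sketch, not part of the repository: it assumes the converter has written `model_data/yolo.h5` and `model_data/yolo_anchors.txt` as in the Quick Start above, and it uses `model_data/coco_classes.txt` and `images/dog.jpg` from this repo.

```python
import numpy as np
from keras import backend as K
from keras.models import load_model
from PIL import Image

from yad2k.models.keras_yolo import yolo_eval, yolo_head

# Load the converted model plus the anchors and class names it was built for.
model = load_model('model_data/yolo.h5')
with open('model_data/coco_classes.txt') as f:
    class_names = [c.strip() for c in f.readlines()]
with open('model_data/yolo_anchors.txt') as f:
    anchors = np.array([float(x) for x in f.readline().split(',')]).reshape(-1, 2)

# Build the evaluation graph: decode raw network output, then filter boxes and apply NMS.
yolo_outputs = yolo_head(model.output, anchors, len(class_names))
input_image_shape = K.placeholder(shape=(2, ))
boxes, scores, classes = yolo_eval(yolo_outputs, input_image_shape,
                                   score_threshold=.3, iou_threshold=.5)

# Preprocess one image to the model's fixed input size and run the session.
model_image_size = model.layers[0].input_shape[1:3]  # e.g. (416, 416)
image = Image.open('images/dog.jpg')
resized = image.resize(tuple(reversed(model_image_size)), Image.BICUBIC)
image_data = np.expand_dims(np.array(resized, dtype='float32') / 255., 0)

sess = K.get_session()
out_boxes, out_scores, out_classes = sess.run(
    [boxes, scores, classes],
    feed_dict={model.input: image_data,
               input_image_shape: [image.size[1], image.size[0]],
               K.learning_phase(): 0})
print([(class_names[c], float(s)) for c, s in zip(out_classes, out_scores)])
```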
55 | 56 | -------------------------------------------------------------------------------- 57 | 58 | ## More Details 59 | 60 | The YAD2K converter currently only supports YOLO_v2 style models; these include the following configurations: `darknet19_448`, `tiny-yolo-voc`, `yolo-voc`, and `yolo`. 61 | 62 | `yad2k.py -p` will produce a plot of the generated Keras model. For example see [yolo.png](etc/yolo.png). 63 | 64 | YAD2K assumes the Keras backend is Tensorflow. In particular for YOLO_v2 models with a passthrough layer, YAD2K uses `tf.space_to_depth` to implement the passthrough layer (see the sketch at the end of this README). The evaluation script also directly uses Tensorflow tensors and uses `tf.image.non_max_suppression` for the final output. 65 | 66 | `voc_conversion_scripts` contains two scripts for converting the Pascal VOC image dataset with XML annotations to either HDF5 or TFRecords format for easier training with Keras or Tensorflow. 67 | 68 | `yad2k/models` contains reference implementations of Darknet-19 and YOLO_v2. 69 | 70 | `train_overfit.py` is a sample training script that overfits a YOLO_v2 model to a single image from the Pascal VOC dataset. 71 | 72 | ## Known Issues and TODOs 73 | 74 | - Expand sample training script to train YOLO_v2 reference model on full dataset. 75 | - Support for additional Darknet layer types. 76 | - Tuck away the Tensorflow dependencies with Keras wrappers where possible. 77 | - YOLO_v2 model does not support fully convolutional mode. Current implementation assumes 1:1 aspect ratio images. 78 | 79 | ## Darknets of Yore 80 | 81 | YAD2K stands on the shoulders of giants. 82 | 83 | - :fire: [Darknet](https://github.com/pjreddie/darknet) :fire: 84 | - [Darknet.Keras](https://github.com/sunshineatnoon/Darknet.keras) - The original D2K for YOLO_v1. 85 | - [Darkflow](https://github.com/thtrieu/darkflow) - Darknet directly to Tensorflow. 86 | - [caffe-yolo](https://github.com/xingwangsfu/caffe-yolo) - YOLO_v1 to Caffe. 87 | - [yolo2-pytorch](https://github.com/longcw/yolo2-pytorch) - YOLO_v2 in PyTorch.
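
## Passthrough Layer Sketch

As referenced in More Details, the passthrough (Darknet "reorg") layer can be expressed as a Keras `Lambda` wrapping `tf.space_to_depth`. This is a minimal sketch of the concept with illustrative feature-map shapes, not the exact code in `yad2k/models/keras_yolo.py`:

```python
import tensorflow as tf
from keras.layers import Input, Lambda, concatenate

def space_to_depth_x2(x):
    # Rearrange each 2x2 spatial block into channels (Darknet's reorg/passthrough).
    return tf.space_to_depth(x, block_size=2)

fine = Input(shape=(26, 26, 512))      # higher-resolution feature map
coarse = Input(shape=(13, 13, 1024))   # final 13x13 feature map
passthrough = Lambda(space_to_depth_x2)(fine)  # -> (13, 13, 2048)
merged = concatenate([passthrough, coarse])    # channel-wise concat, as in YOLO_v2
```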
88 | 89 | -------------------------------------------------------------------------------- 90 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: yad2k 2 | channels: 3 | - defaults 4 | dependencies: 5 | - cycler=0.10.0=py36_0 6 | - dbus=1.10.10=0 7 | - expat=2.1.0=0 8 | - fontconfig=2.12.1=3 9 | - freetype=2.5.5=2 10 | - glib=2.50.2=1 11 | - gst-plugins-base=1.8.0=0 12 | - gstreamer=1.8.0=0 13 | - h5py=2.7.0=np112py36_0 14 | - hdf5=1.8.17=1 15 | - icu=54.1=0 16 | - jbig=2.1=0 17 | - jpeg=9b=0 18 | - libffi=3.2.1=1 19 | - libgcc=5.2.0=0 20 | - libgfortran=3.0.0=1 21 | - libiconv=1.14=0 22 | - libpng=1.6.27=0 23 | - libtiff=4.0.6=3 24 | - libxcb=1.12=1 25 | - libxml2=2.9.4=0 26 | - matplotlib=2.0.0=np112py36_0 27 | - mkl=2017.0.1=0 28 | - numpy=1.12.1=py36_0 29 | - olefile=0.44=py36_0 30 | - openssl=1.0.2k=1 31 | - pcre=8.39=1 32 | - pillow=4.1.0=py36_0 33 | - pip=9.0.1=py36_1 34 | - pyparsing=2.1.4=py36_0 35 | - pyqt=5.6.0=py36_2 36 | - python=3.6.1=0 37 | - python-dateutil=2.6.0=py36_0 38 | - pytz=2017.2=py36_0 39 | - pyyaml=3.12=py36_0 40 | - qt=5.6.2=3 41 | - readline=6.2=2 42 | - scipy=0.19.0=np112py36_0 43 | - setuptools=27.2.0=py36_0 44 | - sip=4.18=py36_0 45 | - six=1.10.0=py36_0 46 | - sqlite=3.13.0=0 47 | - tk=8.5.18=0 48 | - wheel=0.29.0=py36_0 49 | - xz=5.2.2=1 50 | - yaml=0.1.6=0 51 | - zlib=1.2.8=3 52 | - pip: 53 | - keras==2.0.3 54 | - protobuf==3.2.0 55 | - pydot-ng==1.0.0 56 | - tensorflow-gpu==1.0.1 57 | - theano==0.9.0 58 | prefix: /home/allan/anaconda3/envs/yad2k 59 | -------------------------------------------------------------------------------- /etc/dog_small.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allanzelener/YAD2K/a42c760ef868bc115e596b56863dc25624d2e756/etc/dog_small.jpg -------------------------------------------------------------------------------- /etc/yolo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allanzelener/YAD2K/a42c760ef868bc115e596b56863dc25624d2e756/etc/yolo.png -------------------------------------------------------------------------------- /font/FiraMono-Medium.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allanzelener/YAD2K/a42c760ef868bc115e596b56863dc25624d2e756/font/FiraMono-Medium.otf -------------------------------------------------------------------------------- /font/SIL Open Font License.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Mozilla Foundation https://mozilla.org/ with Reserved Font Name Fira Mono. 2 | 3 | Copyright (c) 2014, Telefonica S.A. 4 | 5 | This Font Software is licensed under the SIL Open Font License, Version 1.1. 
6 | This license is copied below, and is also available with a FAQ at: http://scripts.sil.org/OFL 7 | 8 | ----------------------------------------------------------- 9 | SIL OPEN FONT LICENSE Version 1.1 - 26 February 2007 10 | ----------------------------------------------------------- 11 | 12 | PREAMBLE 13 | The goals of the Open Font License (OFL) are to stimulate worldwide development of collaborative font projects, to support the font creation efforts of academic and linguistic communities, and to provide a free and open framework in which fonts may be shared and improved in partnership with others. 14 | 15 | The OFL allows the licensed fonts to be used, studied, modified and redistributed freely as long as they are not sold by themselves. The fonts, including any derivative works, can be bundled, embedded, redistributed and/or sold with any software provided that any reserved names are not used by derivative works. The fonts and derivatives, however, cannot be released under any other type of license. The requirement for fonts to remain under this license does not apply to any document created using the fonts or their derivatives. 16 | 17 | DEFINITIONS 18 | "Font Software" refers to the set of files released by the Copyright Holder(s) under this license and clearly marked as such. This may include source files, build scripts and documentation. 19 | 20 | "Reserved Font Name" refers to any names specified as such after the copyright statement(s). 21 | 22 | "Original Version" refers to the collection of Font Software components as distributed by the Copyright Holder(s). 23 | 24 | "Modified Version" refers to any derivative made by adding to, deleting, or substituting -- in part or in whole -- any of the components of the Original Version, by changing formats or by porting the Font Software to a new environment. 25 | 26 | "Author" refers to any designer, engineer, programmer, technical writer or other person who contributed to the Font Software. 27 | 28 | PERMISSION & CONDITIONS 29 | Permission is hereby granted, free of charge, to any person obtaining a copy of the Font Software, to use, study, copy, merge, embed, modify, redistribute, and sell modified and unmodified copies of the Font Software, subject to the following conditions: 30 | 31 | 1) Neither the Font Software nor any of its individual components, in Original or Modified Versions, may be sold by itself. 32 | 33 | 2) Original or Modified Versions of the Font Software may be bundled, redistributed and/or sold with any software, provided that each copy contains the above copyright notice and this license. These can be included either as stand-alone text files, human-readable headers or in the appropriate machine-readable metadata fields within text or binary files as long as those fields can be easily viewed by the user. 34 | 35 | 3) No Modified Version of the Font Software may use the Reserved Font Name(s) unless explicit written permission is granted by the corresponding Copyright Holder. This restriction only applies to the primary font name as presented to the users. 36 | 37 | 4) The name(s) of the Copyright Holder(s) or the Author(s) of the Font Software shall not be used to promote, endorse or advertise any Modified Version, except to acknowledge the contribution(s) of the Copyright Holder(s) and the Author(s) or with their explicit written permission. 
38 | 39 | 5) The Font Software, modified or unmodified, in part or in whole, must be distributed entirely under this license, and must not be distributed under any other license. The requirement for fonts to remain under this license does not apply to any document created using the Font Software. 40 | 41 | TERMINATION 42 | This license becomes null and void if any of the above conditions are not met. 43 | 44 | DISCLAIMER 45 | THE FONT SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO ANY WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF COPYRIGHT, PATENT, TRADEMARK, OR OTHER RIGHT. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, INCLUDING ANY GENERAL, SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF THE USE OR INABILITY TO USE THE FONT SOFTWARE OR FROM OTHER DEALINGS IN THE FONT SOFTWARE. -------------------------------------------------------------------------------- /images/dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allanzelener/YAD2K/a42c760ef868bc115e596b56863dc25624d2e756/images/dog.jpg -------------------------------------------------------------------------------- /images/eagle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allanzelener/YAD2K/a42c760ef868bc115e596b56863dc25624d2e756/images/eagle.jpg -------------------------------------------------------------------------------- /images/giraffe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allanzelener/YAD2K/a42c760ef868bc115e596b56863dc25624d2e756/images/giraffe.jpg -------------------------------------------------------------------------------- /images/horses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allanzelener/YAD2K/a42c760ef868bc115e596b56863dc25624d2e756/images/horses.jpg -------------------------------------------------------------------------------- /images/person.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allanzelener/YAD2K/a42c760ef868bc115e596b56863dc25624d2e756/images/person.jpg -------------------------------------------------------------------------------- /images/scream.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allanzelener/YAD2K/a42c760ef868bc115e596b56863dc25624d2e756/images/scream.jpg -------------------------------------------------------------------------------- /model_data/coco_classes.txt: -------------------------------------------------------------------------------- 1 | person 2 | bicycle 3 | car 4 | motorbike 5 | aeroplane 6 | bus 7 | train 8 | truck 9 | boat 10 | traffic light 11 | fire hydrant 12 | stop sign 13 | parking meter 14 | bench 15 | bird 16 | cat 17 | dog 18 | horse 19 | sheep 20 | cow 21 | elephant 22 | bear 23 | zebra 24 | giraffe 25 | backpack 26 | umbrella 27 | handbag 28 | tie 29 | suitcase 30 | frisbee 31 | skis 32 | snowboard 33 | sports ball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife 45 | spoon 46 | bowl 
47 | banana 48 | apple 49 | sandwich 50 | orange 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut 56 | cake 57 | chair 58 | sofa 59 | pottedplant 60 | bed 61 | diningtable 62 | toilet 63 | tvmonitor 64 | laptop 65 | mouse 66 | remote 67 | keyboard 68 | cell phone 69 | microwave 70 | oven 71 | toaster 72 | sink 73 | refrigerator 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear 79 | hair drier 80 | toothbrush 81 | -------------------------------------------------------------------------------- /model_data/pascal_classes.txt: -------------------------------------------------------------------------------- 1 | aeroplane 2 | bicycle 3 | bird 4 | boat 5 | bottle 6 | bus 7 | car 8 | cat 9 | chair 10 | cow 11 | diningtable 12 | dog 13 | horse 14 | motorbike 15 | person 16 | pottedplant 17 | sheep 18 | sofa 19 | train 20 | tvmonitor 21 | -------------------------------------------------------------------------------- /retrain_yolo.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a script that can be used to retrain the YOLOv2 model for your own dataset. 3 | """ 4 | import argparse 5 | 6 | import os 7 | 8 | import matplotlib.pyplot as plt 9 | import numpy as np 10 | import PIL 11 | import tensorflow as tf 12 | from keras import backend as K 13 | from keras.layers import Input, Lambda, Conv2D 14 | from keras.models import load_model, Model 15 | from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping 16 | 17 | from yad2k.models.keras_yolo import (preprocess_true_boxes, yolo_body, 18 | yolo_eval, yolo_head, yolo_loss) 19 | from yad2k.utils.draw_boxes import draw_boxes 20 | 21 | # Args 22 | argparser = argparse.ArgumentParser( 23 | description="Retrain or 'fine-tune' a pretrained YOLOv2 model for your own data.") 24 | 25 | argparser.add_argument( 26 | '-d', 27 | '--data_path', 28 | help="path to numpy data file (.npz) containing np.object array 'boxes' and np.uint8 array 'images'", 29 | default=os.path.join('..', 'DATA', 'underwater_data.npz')) 30 | 31 | argparser.add_argument( 32 | '-a', 33 | '--anchors_path', 34 | help='path to anchors file, defaults to yolo_anchors.txt', 35 | default=os.path.join('model_data', 'yolo_anchors.txt')) 36 | 37 | argparser.add_argument( 38 | '-c', 39 | '--classes_path', 40 | help='path to classes file, defaults to pascal_classes.txt', 41 | default=os.path.join('..', 'DATA', 'underwater_classes.txt')) 42 | 43 | # Default anchor boxes 44 | YOLO_ANCHORS = np.array( 45 | ((0.57273, 0.677385), (1.87446, 2.06253), (3.33843, 5.47434), 46 | (7.88282, 3.52778), (9.77052, 9.16828))) 47 | 48 | def _main(args): 49 | data_path = os.path.expanduser(args.data_path) 50 | classes_path = os.path.expanduser(args.classes_path) 51 | anchors_path = os.path.expanduser(args.anchors_path) 52 | 53 | class_names = get_classes(classes_path) 54 | anchors = get_anchors(anchors_path) 55 | 56 | data = np.load(data_path) # custom data saved as a numpy file. 
57 | # has 2 arrays: an object array 'boxes' (variable length of boxes in each image) 58 | # and an array of images 'images' 59 | 60 | image_data, boxes = process_data(data['images'], data['boxes']) 61 | 62 | anchors = YOLO_ANCHORS 63 | 64 | detectors_mask, matching_true_boxes = get_detector_mask(boxes, anchors) 65 | 66 | model_body, model = create_model(anchors, class_names) 67 | 68 | train( 69 | model, 70 | class_names, 71 | anchors, 72 | image_data, 73 | boxes, 74 | detectors_mask, 75 | matching_true_boxes 76 | ) 77 | 78 | draw(model_body, 79 | class_names, 80 | anchors, 81 | image_data, 82 | image_set='val', # assumes training/validation split is 0.9 83 | weights_name='trained_stage_3_best.h5', 84 | save_all=False) 85 | 86 | 87 | def get_classes(classes_path): 88 | '''loads the classes''' 89 | with open(classes_path) as f: 90 | class_names = f.readlines() 91 | class_names = [c.strip() for c in class_names] 92 | return class_names 93 | 94 | def get_anchors(anchors_path): 95 | '''loads the anchors from a file''' 96 | if os.path.isfile(anchors_path): 97 | with open(anchors_path) as f: 98 | anchors = f.readline() 99 | anchors = [float(x) for x in anchors.split(',')] 100 | return np.array(anchors).reshape(-1, 2) 101 | else: 102 | Warning("Could not open anchors file, using default.") 103 | return YOLO_ANCHORS 104 | 105 | def process_data(images, boxes=None): 106 | '''processes the data''' 107 | images = [PIL.Image.fromarray(i) for i in images] 108 | orig_size = np.array([images[0].width, images[0].height]) 109 | orig_size = np.expand_dims(orig_size, axis=0) 110 | 111 | # Image preprocessing. 112 | processed_images = [i.resize((416, 416), PIL.Image.BICUBIC) for i in images] 113 | processed_images = [np.array(image, dtype=np.float) for image in processed_images] 114 | processed_images = [image/255. for image in processed_images] 115 | 116 | if boxes is not None: 117 | # Box preprocessing. 118 | # Original boxes stored as 1D list of class, x_min, y_min, x_max, y_max. 119 | boxes = [box.reshape((-1, 5)) for box in boxes] 120 | # Get extents as y_min, x_min, y_max, x_max, class for comparision with 121 | # model output. 122 | boxes_extents = [box[:, [2, 1, 4, 3, 0]] for box in boxes] 123 | 124 | # Get box parameters as x_center, y_center, box_width, box_height, class. 125 | boxes_xy = [0.5 * (box[:, 3:5] + box[:, 1:3]) for box in boxes] 126 | boxes_wh = [box[:, 3:5] - box[:, 1:3] for box in boxes] 127 | boxes_xy = [boxxy / orig_size for boxxy in boxes_xy] 128 | boxes_wh = [boxwh / orig_size for boxwh in boxes_wh] 129 | boxes = [np.concatenate((boxes_xy[i], boxes_wh[i], box[:, 0:1]), axis=1) for i, box in enumerate(boxes)] 130 | 131 | # find the max number of boxes 132 | max_boxes = 0 133 | for boxz in boxes: 134 | if boxz.shape[0] > max_boxes: 135 | max_boxes = boxz.shape[0] 136 | 137 | # add zero pad for training 138 | for i, boxz in enumerate(boxes): 139 | if boxz.shape[0] < max_boxes: 140 | zero_padding = np.zeros( (max_boxes-boxz.shape[0], 5), dtype=np.float32) 141 | boxes[i] = np.vstack((boxz, zero_padding)) 142 | 143 | return np.array(processed_images), np.array(boxes) 144 | else: 145 | return np.array(processed_images) 146 | 147 | def get_detector_mask(boxes, anchors): 148 | ''' 149 | Precompute detectors_mask and matching_true_boxes for training. 150 | Detectors mask is 1 for each spatial position in the final conv layer and 151 | anchor that should be active for the given boxes and 0 otherwise. 
152 | Matching true boxes gives the regression targets for the ground truth box 153 | that caused a detector to be active or 0 otherwise. 154 | ''' 155 | detectors_mask = [0 for i in range(len(boxes))] 156 | matching_true_boxes = [0 for i in range(len(boxes))] 157 | for i, box in enumerate(boxes): 158 | detectors_mask[i], matching_true_boxes[i] = preprocess_true_boxes(box, anchors, [416, 416]) 159 | 160 | return np.array(detectors_mask), np.array(matching_true_boxes) 161 | 162 | def create_model(anchors, class_names, load_pretrained=True, freeze_body=True): 163 | ''' 164 | returns the body of the model and the model 165 | 166 | # Params: 167 | 168 | load_pretrained: whether or not to load the pretrained model or initialize all weights 169 | 170 | freeze_body: whether or not to freeze all weights except for the last layer's 171 | 172 | # Returns: 173 | 174 | model_body: YOLOv2 with new output layer 175 | 176 | model: YOLOv2 with custom loss Lambda layer 177 | 178 | ''' 179 | 180 | detectors_mask_shape = (13, 13, 5, 1) 181 | matching_boxes_shape = (13, 13, 5, 5) 182 | 183 | # Create model input layers. 184 | image_input = Input(shape=(416, 416, 3)) 185 | boxes_input = Input(shape=(None, 5)) 186 | detectors_mask_input = Input(shape=detectors_mask_shape) 187 | matching_boxes_input = Input(shape=matching_boxes_shape) 188 | 189 | # Create model body. 190 | yolo_model = yolo_body(image_input, len(anchors), len(class_names)) 191 | topless_yolo = Model(yolo_model.input, yolo_model.layers[-2].output) 192 | 193 | if load_pretrained: 194 | # Save topless yolo: 195 | topless_yolo_path = os.path.join('model_data', 'yolo_topless.h5') 196 | if not os.path.exists(topless_yolo_path): 197 | print("CREATING TOPLESS WEIGHTS FILE") 198 | yolo_path = os.path.join('model_data', 'yolo.h5') 199 | model_body = load_model(yolo_path) 200 | model_body = Model(model_body.inputs, model_body.layers[-2].output) 201 | model_body.save_weights(topless_yolo_path) 202 | topless_yolo.load_weights(topless_yolo_path) 203 | 204 | if freeze_body: 205 | for layer in topless_yolo.layers: 206 | layer.trainable = False 207 | final_layer = Conv2D(len(anchors)*(5+len(class_names)), (1, 1), activation='linear')(topless_yolo.output) 208 | 209 | model_body = Model(image_input, final_layer) 210 | 211 | # Place model loss on CPU to reduce GPU memory usage. 212 | with tf.device('/cpu:0'): 213 | # TODO: Replace Lambda with custom Keras layer for loss. 214 | model_loss = Lambda( 215 | yolo_loss, 216 | output_shape=(1, ), 217 | name='yolo_loss', 218 | arguments={'anchors': anchors, 219 | 'num_classes': len(class_names)})([ 220 | model_body.output, boxes_input, 221 | detectors_mask_input, matching_boxes_input 222 | ]) 223 | 224 | model = Model( 225 | [model_body.input, boxes_input, detectors_mask_input, 226 | matching_boxes_input], model_loss) 227 | 228 | return model_body, model 229 | 230 | def train(model, class_names, anchors, image_data, boxes, detectors_mask, matching_true_boxes, validation_split=0.1): 231 | ''' 232 | retrain/fine-tune the model 233 | 234 | logs training with tensorboard 235 | 236 | saves training weights in current directory 237 | 238 | best weights according to val_loss is saved as trained_stage_3_best.h5 239 | ''' 240 | model.compile( 241 | optimizer='adam', loss={ 242 | 'yolo_loss': lambda y_true, y_pred: y_pred 243 | }) # This is a hack to use the custom loss function in the last layer. 
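    # Note: the 'yolo_loss' Lambda layer defined in create_model() already computes
    # the full YOLO loss as the model's single output, so the Keras-level loss above
    # simply passes that output through; the np.zeros() targets given to fit() below
    # are dummies that Keras requires but the loss never uses.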
244 | 245 | 246 | logging = TensorBoard() 247 | checkpoint = ModelCheckpoint("trained_stage_3_best.h5", monitor='val_loss', 248 | save_weights_only=True, save_best_only=True) 249 | early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=15, verbose=1, mode='auto') 250 | 251 | model.fit([image_data, boxes, detectors_mask, matching_true_boxes], 252 | np.zeros(len(image_data)), 253 | validation_split=validation_split, 254 | batch_size=32, 255 | epochs=5, 256 | callbacks=[logging]) 257 | model.save_weights('trained_stage_1.h5') 258 | 259 | model_body, model = create_model(anchors, class_names, load_pretrained=False, freeze_body=False) 260 | 261 | model.load_weights('trained_stage_1.h5') 262 | 263 | model.compile( 264 | optimizer='adam', loss={ 265 | 'yolo_loss': lambda y_true, y_pred: y_pred 266 | }) # This is a hack to use the custom loss function in the last layer. 267 | 268 | 269 | model.fit([image_data, boxes, detectors_mask, matching_true_boxes], 270 | np.zeros(len(image_data)), 271 | validation_split=0.1, 272 | batch_size=8, 273 | epochs=30, 274 | callbacks=[logging]) 275 | 276 | model.save_weights('trained_stage_2.h5') 277 | 278 | model.fit([image_data, boxes, detectors_mask, matching_true_boxes], 279 | np.zeros(len(image_data)), 280 | validation_split=0.1, 281 | batch_size=8, 282 | epochs=30, 283 | callbacks=[logging, checkpoint, early_stopping]) 284 | 285 | model.save_weights('trained_stage_3.h5') 286 | 287 | def draw(model_body, class_names, anchors, image_data, image_set='val', 288 | weights_name='trained_stage_3_best.h5', out_path="output_images", save_all=True): 289 | ''' 290 | Draw bounding boxes on image data 291 | ''' 292 | if image_set == 'train': 293 | image_data = np.array([np.expand_dims(image, axis=0) 294 | for image in image_data[:int(len(image_data)*.9)]]) 295 | elif image_set == 'val': 296 | image_data = np.array([np.expand_dims(image, axis=0) 297 | for image in image_data[int(len(image_data)*.9):]]) 298 | elif image_set == 'all': 299 | image_data = np.array([np.expand_dims(image, axis=0) 300 | for image in image_data]) 301 | else: 302 | ValueError("draw argument image_set must be 'train', 'val', or 'all'") 303 | # model.load_weights(weights_name) 304 | print(image_data.shape) 305 | model_body.load_weights(weights_name) 306 | 307 | # Create output variables for prediction. 308 | yolo_outputs = yolo_head(model_body.output, anchors, len(class_names)) 309 | input_image_shape = K.placeholder(shape=(2, )) 310 | boxes, scores, classes = yolo_eval( 311 | yolo_outputs, input_image_shape, score_threshold=0.07, iou_threshold=0) 312 | 313 | # Run prediction on overfit image. 314 | sess = K.get_session() # TODO: Remove dependence on Tensorflow session. 315 | 316 | if not os.path.exists(out_path): 317 | os.makedirs(out_path) 318 | for i in range(len(image_data)): 319 | out_boxes, out_scores, out_classes = sess.run( 320 | [boxes, scores, classes], 321 | feed_dict={ 322 | model_body.input: image_data[i], 323 | input_image_shape: [image_data.shape[2], image_data.shape[3]], 324 | K.learning_phase(): 0 325 | }) 326 | print('Found {} boxes for image.'.format(len(out_boxes))) 327 | print(out_boxes) 328 | 329 | # Plot image with predicted boxes. 
330 | image_with_boxes = draw_boxes(image_data[i][0], out_boxes, out_classes, 331 | class_names, out_scores) 332 | # Save the image: 333 | if save_all or (len(out_boxes) > 0): 334 | image = PIL.Image.fromarray(image_with_boxes) 335 | image.save(os.path.join(out_path,str(i)+'.png')) 336 | 337 | # To display (pauses the program): 338 | # plt.imshow(image_with_boxes, interpolation='nearest') 339 | # plt.show() 340 | 341 | 342 | 343 | if __name__ == '__main__': 344 | args = argparser.parse_args() 345 | _main(args) 346 | -------------------------------------------------------------------------------- /test_yolo.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """Run a YOLO_v2 style detection model on test images.""" 3 | import argparse 4 | import colorsys 5 | import imghdr 6 | import os 7 | import random 8 | 9 | import numpy as np 10 | from keras import backend as K 11 | from keras.models import load_model 12 | from PIL import Image, ImageDraw, ImageFont 13 | 14 | from yad2k.models.keras_yolo import yolo_eval, yolo_head 15 | 16 | parser = argparse.ArgumentParser( 17 | description='Run a YOLO_v2 style detection model on test images..') 18 | parser.add_argument( 19 | 'model_path', 20 | help='path to h5 model file containing body' 21 | 'of a YOLO_v2 model') 22 | parser.add_argument( 23 | '-a', 24 | '--anchors_path', 25 | help='path to anchors file, defaults to yolo_anchors.txt', 26 | default='model_data/yolo_anchors.txt') 27 | parser.add_argument( 28 | '-c', 29 | '--classes_path', 30 | help='path to classes file, defaults to coco_classes.txt', 31 | default='model_data/coco_classes.txt') 32 | parser.add_argument( 33 | '-t', 34 | '--test_path', 35 | help='path to directory of test images, defaults to images/', 36 | default='images') 37 | parser.add_argument( 38 | '-o', 39 | '--output_path', 40 | help='path to output test images, defaults to images/out', 41 | default='images/out') 42 | parser.add_argument( 43 | '-s', 44 | '--score_threshold', 45 | type=float, 46 | help='threshold for bounding box scores, default .3', 47 | default=.3) 48 | parser.add_argument( 49 | '-iou', 50 | '--iou_threshold', 51 | type=float, 52 | help='threshold for non max suppression IOU, default .5', 53 | default=.5) 54 | 55 | 56 | def _main(args): 57 | model_path = os.path.expanduser(args.model_path) 58 | assert model_path.endswith('.h5'), 'Keras model must be a .h5 file.' 59 | anchors_path = os.path.expanduser(args.anchors_path) 60 | classes_path = os.path.expanduser(args.classes_path) 61 | test_path = os.path.expanduser(args.test_path) 62 | output_path = os.path.expanduser(args.output_path) 63 | 64 | if not os.path.exists(output_path): 65 | print('Creating output path {}'.format(output_path)) 66 | os.mkdir(output_path) 67 | 68 | sess = K.get_session() # TODO: Remove dependence on Tensorflow session. 
69 | 70 | with open(classes_path) as f: 71 | class_names = f.readlines() 72 | class_names = [c.strip() for c in class_names] 73 | 74 | with open(anchors_path) as f: 75 | anchors = f.readline() 76 | anchors = [float(x) for x in anchors.split(',')] 77 | anchors = np.array(anchors).reshape(-1, 2) 78 | 79 | yolo_model = load_model(model_path) 80 | 81 | # Verify model, anchors, and classes are compatible 82 | num_classes = len(class_names) 83 | num_anchors = len(anchors) 84 | # TODO: Assumes dim ordering is channel last 85 | model_output_channels = yolo_model.layers[-1].output_shape[-1] 86 | assert model_output_channels == num_anchors * (num_classes + 5), \ 87 | 'Mismatch between model and given anchor and class sizes. ' \ 88 | 'Specify matching anchors and classes with --anchors_path and ' \ 89 | '--classes_path flags.' 90 | print('{} model, anchors, and classes loaded.'.format(model_path)) 91 | 92 | # Check if model is fully convolutional, assuming channel last order. 93 | model_image_size = yolo_model.layers[0].input_shape[1:3] 94 | is_fixed_size = model_image_size != (None, None) 95 | 96 | # Generate colors for drawing bounding boxes. 97 | hsv_tuples = [(x / len(class_names), 1., 1.) 98 | for x in range(len(class_names))] 99 | colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples)) 100 | colors = list( 101 | map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), 102 | colors)) 103 | random.seed(10101) # Fixed seed for consistent colors across runs. 104 | random.shuffle(colors) # Shuffle colors to decorrelate adjacent classes. 105 | random.seed(None) # Reset seed to default. 106 | 107 | # Generate output tensor targets for filtered bounding boxes. 108 | # TODO: Wrap these backend operations with Keras layers. 109 | yolo_outputs = yolo_head(yolo_model.output, anchors, len(class_names)) 110 | input_image_shape = K.placeholder(shape=(2, )) 111 | boxes, scores, classes = yolo_eval( 112 | yolo_outputs, 113 | input_image_shape, 114 | score_threshold=args.score_threshold, 115 | iou_threshold=args.iou_threshold) 116 | 117 | for image_file in os.listdir(test_path): 118 | try: 119 | image_type = imghdr.what(os.path.join(test_path, image_file)) 120 | if not image_type: 121 | continue 122 | except IsADirectoryError: 123 | continue 124 | 125 | image = Image.open(os.path.join(test_path, image_file)) 126 | if is_fixed_size: # TODO: When resizing we can use minibatch input. 127 | resized_image = image.resize( 128 | tuple(reversed(model_image_size)), Image.BICUBIC) 129 | image_data = np.array(resized_image, dtype='float32') 130 | else: 131 | # Due to skip connection + max pooling in YOLO_v2, inputs must have 132 | # width and height as multiples of 32. 133 | new_image_size = (image.width - (image.width % 32), 134 | image.height - (image.height % 32)) 135 | resized_image = image.resize(new_image_size, Image.BICUBIC) 136 | image_data = np.array(resized_image, dtype='float32') 137 | print(image_data.shape) 138 | 139 | image_data /= 255. 140 | image_data = np.expand_dims(image_data, 0) # Add batch dimension. 
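        # The input_image_shape placeholder is fed as (height, width) of the original
        # image (PIL's image.size is (width, height), hence the reversed order below)
        # so that yolo_eval can scale predicted boxes back to original image
        # coordinates; K.learning_phase(): 0 selects inference behavior.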
141 | 142 | out_boxes, out_scores, out_classes = sess.run( 143 | [boxes, scores, classes], 144 | feed_dict={ 145 | yolo_model.input: image_data, 146 | input_image_shape: [image.size[1], image.size[0]], 147 | K.learning_phase(): 0 148 | }) 149 | print('Found {} boxes for {}'.format(len(out_boxes), image_file)) 150 | 151 | font = ImageFont.truetype( 152 | font='font/FiraMono-Medium.otf', 153 | size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32')) 154 | thickness = (image.size[0] + image.size[1]) // 300 155 | 156 | for i, c in reversed(list(enumerate(out_classes))): 157 | predicted_class = class_names[c] 158 | box = out_boxes[i] 159 | score = out_scores[i] 160 | 161 | label = '{} {:.2f}'.format(predicted_class, score) 162 | 163 | draw = ImageDraw.Draw(image) 164 | label_size = draw.textsize(label, font) 165 | 166 | top, left, bottom, right = box 167 | top = max(0, np.floor(top + 0.5).astype('int32')) 168 | left = max(0, np.floor(left + 0.5).astype('int32')) 169 | bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32')) 170 | right = min(image.size[0], np.floor(right + 0.5).astype('int32')) 171 | print(label, (left, top), (right, bottom)) 172 | 173 | if top - label_size[1] >= 0: 174 | text_origin = np.array([left, top - label_size[1]]) 175 | else: 176 | text_origin = np.array([left, top + 1]) 177 | 178 | # My kingdom for a good redistributable image drawing library. 179 | for i in range(thickness): 180 | draw.rectangle( 181 | [left + i, top + i, right - i, bottom - i], 182 | outline=colors[c]) 183 | draw.rectangle( 184 | [tuple(text_origin), tuple(text_origin + label_size)], 185 | fill=colors[c]) 186 | draw.text(text_origin, label, fill=(0, 0, 0), font=font) 187 | del draw 188 | 189 | image.save(os.path.join(output_path, image_file), quality=90) 190 | sess.close() 191 | 192 | 193 | if __name__ == '__main__': 194 | _main(parser.parse_args()) 195 | -------------------------------------------------------------------------------- /train_overfit.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """Overfit a YOLO_v2 model to a single image from the Pascal VOC dataset. 3 | 4 | This is a sample training script used to test the implementation of the 5 | YOLO localization loss function. 
6 | """ 7 | import argparse 8 | import io 9 | import os 10 | 11 | import h5py 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | import PIL 15 | import tensorflow as tf 16 | from keras import backend as K 17 | from keras.layers import Input, Lambda 18 | from keras.models import Model 19 | 20 | from yad2k.models.keras_yolo import (preprocess_true_boxes, yolo_body, 21 | yolo_eval, yolo_head, yolo_loss) 22 | from yad2k.utils.draw_boxes import draw_boxes 23 | 24 | YOLO_ANCHORS = np.array( 25 | ((0.57273, 0.677385), (1.87446, 2.06253), (3.33843, 5.47434), 26 | (7.88282, 3.52778), (9.77052, 9.16828))) 27 | 28 | argparser = argparse.ArgumentParser( 29 | description='Train YOLO_v2 model to overfit on a single image.') 30 | 31 | argparser.add_argument( 32 | '-d', 33 | '--data_path', 34 | help='path to HDF5 file containing pascal voc dataset', 35 | default='~/datasets/VOCdevkit/pascal_voc_07_12.hdf5') 36 | 37 | argparser.add_argument( 38 | '-a', 39 | '--anchors_path', 40 | help='path to anchors file, defaults to yolo_anchors.txt', 41 | default='model_data/yolo_anchors.txt') 42 | 43 | argparser.add_argument( 44 | '-c', 45 | '--classes_path', 46 | help='path to classes file, defaults to pascal_classes.txt', 47 | default='model_data/pascal_classes.txt') 48 | 49 | 50 | def _main(args): 51 | voc_path = os.path.expanduser(args.data_path) 52 | classes_path = os.path.expanduser(args.classes_path) 53 | anchors_path = os.path.expanduser(args.anchors_path) 54 | 55 | with open(classes_path) as f: 56 | class_names = f.readlines() 57 | class_names = [c.strip() for c in class_names] 58 | 59 | if os.path.isfile(anchors_path): 60 | with open(anchors_path) as f: 61 | anchors = f.readline() 62 | anchors = [float(x) for x in anchors.split(',')] 63 | anchors = np.array(anchors).reshape(-1, 2) 64 | else: 65 | anchors = YOLO_ANCHORS 66 | 67 | voc = h5py.File(voc_path, 'r') 68 | image = PIL.Image.open(io.BytesIO(voc['train/images'][28])) 69 | orig_size = np.array([image.width, image.height]) 70 | orig_size = np.expand_dims(orig_size, axis=0) 71 | 72 | # Image preprocessing. 73 | image = image.resize((416, 416), PIL.Image.BICUBIC) 74 | image_data = np.array(image, dtype=np.float) 75 | image_data /= 255. 76 | 77 | # Box preprocessing. 78 | # Original boxes stored as 1D list of class, x_min, y_min, x_max, y_max. 79 | boxes = voc['train/boxes'][28] 80 | boxes = boxes.reshape((-1, 5)) 81 | # Get extents as y_min, x_min, y_max, x_max, class for comparision with 82 | # model output. 83 | boxes_extents = boxes[:, [2, 1, 4, 3, 0]] 84 | 85 | # Get box parameters as x_center, y_center, box_width, box_height, class. 86 | boxes_xy = 0.5 * (boxes[:, 3:5] + boxes[:, 1:3]) 87 | boxes_wh = boxes[:, 3:5] - boxes[:, 1:3] 88 | boxes_xy = boxes_xy / orig_size 89 | boxes_wh = boxes_wh / orig_size 90 | boxes = np.concatenate((boxes_xy, boxes_wh, boxes[:, 0:1]), axis=1) 91 | 92 | # Precompute detectors_mask and matching_true_boxes for training. 93 | # Detectors mask is 1 for each spatial position in the final conv layer and 94 | # anchor that should be active for the given boxes and 0 otherwise. 95 | # Matching true boxes gives the regression targets for the ground truth box 96 | # that caused a detector to be active or 0 otherwise. 97 | detectors_mask_shape = (13, 13, 5, 1) 98 | matching_boxes_shape = (13, 13, 5, 5) 99 | detectors_mask, matching_true_boxes = preprocess_true_boxes(boxes, anchors, 100 | [416, 416]) 101 | 102 | # Create model input layers. 
103 | image_input = Input(shape=(416, 416, 3)) 104 | boxes_input = Input(shape=(None, 5)) 105 | detectors_mask_input = Input(shape=detectors_mask_shape) 106 | matching_boxes_input = Input(shape=matching_boxes_shape) 107 | 108 | print('Boxes:') 109 | print(boxes) 110 | print('Box corners:') 111 | print(boxes_extents) 112 | print('Active detectors:') 113 | print(np.where(detectors_mask == 1)[:-1]) 114 | print('Matching boxes for active detectors:') 115 | print(matching_true_boxes[np.where(detectors_mask == 1)[:-1]]) 116 | 117 | # Create model body. 118 | model_body = yolo_body(image_input, len(anchors), len(class_names)) 119 | model_body = Model(image_input, model_body.output) 120 | # Place model loss on CPU to reduce GPU memory usage. 121 | with tf.device('/cpu:0'): 122 | # TODO: Replace Lambda with custom Keras layer for loss. 123 | model_loss = Lambda( 124 | yolo_loss, 125 | output_shape=(1, ), 126 | name='yolo_loss', 127 | arguments={'anchors': anchors, 128 | 'num_classes': len(class_names)})([ 129 | model_body.output, boxes_input, 130 | detectors_mask_input, matching_boxes_input 131 | ]) 132 | model = Model( 133 | [image_input, boxes_input, detectors_mask_input, 134 | matching_boxes_input], model_loss) 135 | model.compile( 136 | optimizer='adam', loss={ 137 | 'yolo_loss': lambda y_true, y_pred: y_pred 138 | }) # This is a hack to use the custom loss function in the last layer. 139 | 140 | # Add batch dimension for training. 141 | image_data = np.expand_dims(image_data, axis=0) 142 | boxes = np.expand_dims(boxes, axis=0) 143 | detectors_mask = np.expand_dims(detectors_mask, axis=0) 144 | matching_true_boxes = np.expand_dims(matching_true_boxes, axis=0) 145 | 146 | num_steps = 1000 147 | # TODO: For full training, put preprocessing inside training loop. 148 | # for i in range(num_steps): 149 | # loss = model.train_on_batch( 150 | # [image_data, boxes, detectors_mask, matching_true_boxes], 151 | # np.zeros(len(image_data))) 152 | model.fit([image_data, boxes, detectors_mask, matching_true_boxes], 153 | np.zeros(len(image_data)), 154 | batch_size=1, 155 | epochs=num_steps) 156 | model.save_weights('overfit_weights.h5') 157 | 158 | # Create output variables for prediction. 159 | yolo_outputs = yolo_head(model_body.output, anchors, len(class_names)) 160 | input_image_shape = K.placeholder(shape=(2, )) 161 | boxes, scores, classes = yolo_eval( 162 | yolo_outputs, input_image_shape, score_threshold=.3, iou_threshold=.9) 163 | 164 | # Run prediction on overfit image. 165 | sess = K.get_session() # TODO: Remove dependence on Tensorflow session. 166 | out_boxes, out_scores, out_classes = sess.run( 167 | [boxes, scores, classes], 168 | feed_dict={ 169 | model_body.input: image_data, 170 | input_image_shape: [image.size[1], image.size[0]], 171 | K.learning_phase(): 0 172 | }) 173 | print('Found {} boxes for image.'.format(len(out_boxes))) 174 | print(out_boxes) 175 | 176 | # Plot image with predicted boxes. 177 | image_with_boxes = draw_boxes(image_data[0], out_boxes, out_classes, 178 | class_names, out_scores) 179 | plt.imshow(image_with_boxes, interpolation='nearest') 180 | plt.show() 181 | 182 | 183 | if __name__ == '__main__': 184 | args = argparser.parse_args() 185 | _main(args) 186 | -------------------------------------------------------------------------------- /voc_conversion_scripts/voc_to_hdf5.py: -------------------------------------------------------------------------------- 1 | """ 2 | Convert Pascal VOC 2007+2012 detection dataset to HDF5. 
3 | 4 | Does not preserve full XML annotations. 5 | Combines all VOC subsets (train, val test) with VOC2012 train for full 6 | training set as done in Faster R-CNN paper. 7 | 8 | Code based on: 9 | https://github.com/pjreddie/darknet/blob/master/scripts/voc_label.py 10 | """ 11 | 12 | import argparse 13 | import os 14 | import xml.etree.ElementTree as ElementTree 15 | 16 | import h5py 17 | import numpy as np 18 | 19 | sets_from_2007 = [('2007', 'train'), ('2007', 'val')] 20 | train_set = [('2012', 'train')] 21 | val_set = [('2012', 'val')] 22 | test_set = [('2007', 'test')] 23 | 24 | classes = [ 25 | "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", 26 | "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", 27 | "pottedplant", "sheep", "sofa", "train", "tvmonitor" 28 | ] 29 | 30 | parser = argparse.ArgumentParser( 31 | description='Convert Pascal VOC 2007+2012 detection dataset to HDF5.') 32 | parser.add_argument( 33 | '-p', 34 | '--path_to_voc', 35 | help='path to VOCdevkit directory', 36 | default='~/data/PascalVOC/VOCdevkit') 37 | 38 | 39 | def get_boxes_for_id(voc_path, year, image_id): 40 | """Get object bounding boxes annotations for given image. 41 | 42 | Parameters 43 | ---------- 44 | voc_path : str 45 | Path to VOCdevkit directory. 46 | year : str 47 | Year of dataset containing image. Either '2007' or '2012'. 48 | image_id : str 49 | Pascal VOC identifier for given image. 50 | 51 | Returns 52 | ------- 53 | boxes : array of int 54 | bounding box annotations of class label, xmin, ymin, xmax, ymax as a 55 | 5xN array. 56 | """ 57 | fname = os.path.join(voc_path, 'VOC{}/Annotations/{}.xml'.format(year, 58 | image_id)) 59 | with open(fname) as in_file: 60 | xml_tree = ElementTree.parse(in_file) 61 | root = xml_tree.getroot() 62 | boxes = [] 63 | for obj in root.iter('object'): 64 | difficult = obj.find('difficult').text 65 | label = obj.find('name').text 66 | if label not in classes or int( 67 | difficult) == 1: # exclude difficult or unlisted classes 68 | continue 69 | xml_box = obj.find('bndbox') 70 | bbox = (classes.index(label), int(xml_box.find('xmin').text), 71 | int(xml_box.find('ymin').text), int(xml_box.find('xmax').text), 72 | int(xml_box.find('ymax').text)) 73 | boxes.extend(bbox) 74 | return np.array( 75 | boxes) # .T # return transpose so last dimension is variable length 76 | 77 | 78 | def get_image_for_id(voc_path, year, image_id): 79 | """Get image data as uint8 array for given image. 80 | 81 | Parameters 82 | ---------- 83 | voc_path : str 84 | Path to VOCdevkit directory. 85 | year : str 86 | Year of dataset containing image. Either '2007' or '2012'. 87 | image_id : str 88 | Pascal VOC identifier for given image. 89 | 90 | Returns 91 | ------- 92 | image_data : array of uint8 93 | Compressed JPEG byte string represented as array of uint8. 94 | """ 95 | fname = os.path.join(voc_path, 'VOC{}/JPEGImages/{}.jpg'.format(year, 96 | image_id)) 97 | with open(fname, 'rb') as in_file: 98 | data = in_file.read() 99 | # Use of encoding based on: https://github.com/h5py/h5py/issues/745 100 | return np.fromstring(data, dtype='uint8') 101 | 102 | 103 | def get_ids(voc_path, datasets): 104 | """Get image identifiers for corresponding list of dataset identifies. 105 | 106 | Parameters 107 | ---------- 108 | voc_path : str 109 | Path to VOCdevkit directory. 110 | datasets : list of str tuples 111 | List of dataset identifiers in the form of (year, dataset) pairs. 
112 | 113 | Returns 114 | ------- 115 | ids : list of str 116 | List of all image identifiers for given datasets. 117 | """ 118 | ids = [] 119 | for year, image_set in datasets: 120 | id_file = os.path.join(voc_path, 'VOC{}/ImageSets/Main/{}.txt'.format( 121 | year, image_set)) 122 | with open(id_file, 'r') as image_ids: 123 | ids.extend(map(str.strip, image_ids.readlines())) 124 | return ids 125 | 126 | 127 | def add_to_dataset(voc_path, year, ids, images, boxes, start=0): 128 | """Process all given ids and adds them to given datasets.""" 129 | for i, voc_id in enumerate(ids): 130 | image_data = get_image_for_id(voc_path, year, voc_id) 131 | image_boxes = get_boxes_for_id(voc_path, year, voc_id) 132 | images[start + i] = image_data 133 | boxes[start + i] = image_boxes 134 | return i 135 | 136 | 137 | def _main(args): 138 | voc_path = os.path.expanduser(args.path_to_voc) 139 | train_ids = get_ids(voc_path, train_set) 140 | val_ids = get_ids(voc_path, val_set) 141 | test_ids = get_ids(voc_path, test_set) 142 | train_ids_2007 = get_ids(voc_path, sets_from_2007) 143 | total_train_ids = len(train_ids) + len(train_ids_2007) 144 | 145 | # Create HDF5 dataset structure 146 | print('Creating HDF5 dataset structure.') 147 | fname = os.path.join(voc_path, 'pascal_voc_07_12.hdf5') 148 | voc_h5file = h5py.File(fname, 'w') 149 | uint8_dt = h5py.special_dtype( 150 | vlen=np.dtype('uint8')) # variable length uint8 151 | vlen_int_dt = h5py.special_dtype( 152 | vlen=np.dtype(int)) # variable length default int 153 | train_group = voc_h5file.create_group('train') 154 | val_group = voc_h5file.create_group('val') 155 | test_group = voc_h5file.create_group('test') 156 | 157 | # store class list for reference class ids as csv fixed-length numpy string 158 | voc_h5file.attrs['classes'] = np.string_(str.join(',', classes)) 159 | 160 | # store images as variable length uint8 arrays 161 | train_images = train_group.create_dataset( 162 | 'images', shape=(total_train_ids, ), dtype=uint8_dt) 163 | val_images = val_group.create_dataset( 164 | 'images', shape=(len(val_ids), ), dtype=uint8_dt) 165 | test_images = test_group.create_dataset( 166 | 'images', shape=(len(test_ids), ), dtype=uint8_dt) 167 | 168 | # store boxes as class_id, xmin, ymin, xmax, ymax 169 | train_boxes = train_group.create_dataset( 170 | 'boxes', shape=(total_train_ids, ), dtype=vlen_int_dt) 171 | val_boxes = val_group.create_dataset( 172 | 'boxes', shape=(len(val_ids), ), dtype=vlen_int_dt) 173 | test_boxes = test_group.create_dataset( 174 | 'boxes', shape=(len(test_ids), ), dtype=vlen_int_dt) 175 | 176 | # process all ids and add to datasets 177 | print('Processing Pascal VOC 2007 datasets for training set.') 178 | last_2007 = add_to_dataset(voc_path, '2007', train_ids_2007, train_images, 179 | train_boxes) 180 | print('Processing Pascal VOC 2012 training set.') 181 | add_to_dataset( 182 | voc_path, 183 | '2012', 184 | train_ids, 185 | train_images, 186 | train_boxes, 187 | start=last_2007 + 1) 188 | print('Processing Pascal VOC 2012 val set.') 189 | add_to_dataset(voc_path, '2012', val_ids, val_images, val_boxes) 190 | print('Processing Pascal VOC 2007 test set.') 191 | add_to_dataset(voc_path, '2007', test_ids, test_images, test_boxes) 192 | 193 | print('Closing HDF5 file.') 194 | voc_h5file.close() 195 | print('Done.') 196 | 197 | 198 | if __name__ == '__main__': 199 | _main(parser.parse_args()) 200 | -------------------------------------------------------------------------------- /voc_conversion_scripts/voc_to_tfrecords.py: 
-------------------------------------------------------------------------------- 1 | """Convert Pascal VOC 2007+2012 detection dataset to TFRecords. 2 | Does not preserve full XML annotations. 3 | Combines all VOC 2007 subsets (train, val) with VOC2012 for training. 4 | Uses VOC2012 val for val and VOC2007 test for test. 5 | 6 | Code based on: 7 | https://github.com/pjreddie/darknet/blob/master/scripts/voc_label.py 8 | https://github.com/tensorflow/models/blob/master/inception/inception/data/build_image_data.py 9 | """ 10 | 11 | import argparse 12 | import os 13 | import xml.etree.ElementTree as ElementTree 14 | from datetime import datetime 15 | 16 | import numpy as np 17 | import tensorflow as tf 18 | 19 | from voc_to_hdf5 import get_ids 20 | 21 | sets_from_2007 = [('2007', 'train'), ('2007', 'val')] 22 | train_set = [('2012', 'train'), ('2012', 'val')] 23 | test_set = [('2007', 'test')] 24 | 25 | classes = [ 26 | "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", 27 | "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", 28 | "pottedplant", "sheep", "sofa", "train", "tvmonitor" 29 | ] 30 | 31 | parser = argparse.ArgumentParser( 32 | description='Convert Pascal VOC 2007+2012 detection dataset to TFRecords.') 33 | parser.add_argument( 34 | '-p', 35 | '--path_to_voc', 36 | help='path to Pascal VOC dataset', 37 | default='~/data/PascalVOC/VOCdevkit') 38 | 39 | # Small graph for image decoding 40 | decoder_sess = tf.Session() 41 | image_placeholder = tf.placeholder(dtype=tf.string) 42 | decoded_jpeg = tf.image.decode_jpeg(image_placeholder, channels=3) 43 | 44 | 45 | def process_image(image_path): 46 | """Decode image at given path.""" 47 | with open(image_path, 'rb') as f: 48 | image_data = f.read() 49 | image = decoder_sess.run(decoded_jpeg, 50 | feed_dict={image_placeholder: image_data}) 51 | assert len(image.shape) == 3 52 | height = image.shape[0] 53 | width = image.shape[2] 54 | assert image.shape[2] == 3 55 | return image_data, height, width 56 | 57 | 58 | def process_anno(anno_path): 59 | """Process Pascal VOC annotations.""" 60 | with open(anno_path) as f: 61 | xml_tree = ElementTree.parse(f) 62 | root = xml_tree.getroot() 63 | size = root.find('size') 64 | height = float(size.find('height').text) 65 | width = float(size.find('width').text) 66 | boxes = [] 67 | for obj in root.iter('object'): 68 | difficult = obj.find('difficult').text 69 | label = obj.find('name').text 70 | if label not in classes or int( 71 | difficult) == 1: # exclude difficult or unlisted classes 72 | continue 73 | xml_box = obj.find('bndbox') 74 | bbox = { 75 | 'class': classes.index(label), 76 | 'y_min': float(xml_box.find('ymin').text) / height, 77 | 'x_min': float(xml_box.find('xmin').text) / width, 78 | 'y_max': float(xml_box.find('ymax').text) / height, 79 | 'x_max': float(xml_box.find('xmax').text) / width 80 | } 81 | boxes.append(bbox) 82 | return boxes 83 | 84 | 85 | def convert_to_example(image_data, boxes, filename, height, width): 86 | """Convert Pascal VOC ground truth to TFExample protobuf. 87 | 88 | Parameters 89 | ---------- 90 | image_data : bytes 91 | Encoded image bytes. 92 | boxes : dict 93 | Bounding box corners and class labels 94 | filename : string 95 | Path to image file. 96 | height : int 97 | Image height. 98 | width : int 99 | Image width. 100 | 101 | Returns 102 | ------- 103 | example : protobuf 104 | Tensorflow Example protobuf containing image and bounding boxes. 
105 | """ 106 | box_classes = [b['class'] for b in boxes] 107 | box_ymin = [b['y_min'] for b in boxes] 108 | box_xmin = [b['x_min'] for b in boxes] 109 | box_ymax = [b['y_max'] for b in boxes] 110 | box_xmax = [b['x_max'] for b in boxes] 111 | encoded_image = [tf.compat.as_bytes(image_data)] 112 | base_name = [tf.compat.as_bytes(os.path.basename(filename))] 113 | 114 | example = tf.train.Example(features=tf.train.Features(feature={ 115 | 'filename': 116 | tf.train.Feature(bytes_list=tf.train.BytesList(value=base_name)), 117 | 'height': 118 | tf.train.Feature(int64_list=tf.train.Int64List(value=[height])), 119 | 'width': 120 | tf.train.Feature(int64_list=tf.train.Int64List(value=[width])), 121 | 'classes': 122 | tf.train.Feature(int64_list=tf.train.Int64List(value=box_classes)), 123 | 'y_mins': 124 | tf.train.Feature(float_list=tf.train.FloatList(value=box_ymin)), 125 | 'x_mins': 126 | tf.train.Feature(float_list=tf.train.FloatList(value=box_xmin)), 127 | 'y_maxes': 128 | tf.train.Feature(float_list=tf.train.FloatList(value=box_ymax)), 129 | 'x_maxes': 130 | tf.train.Feature(float_list=tf.train.FloatList(value=box_xmax)), 131 | 'encoded': 132 | tf.train.Feature(bytes_list=tf.train.BytesList(value=encoded_image)) 133 | })) 134 | return example 135 | 136 | 137 | def get_image_path(voc_path, year, image_id): 138 | """Get path to image for given year and image id.""" 139 | return os.path.join(voc_path, 'VOC{}/JPEGImages/{}.jpg'.format(year, 140 | image_id)) 141 | 142 | 143 | def get_anno_path(voc_path, year, image_id): 144 | """Get path to image annotation for given year and image id.""" 145 | return os.path.join(voc_path, 'VOC{}/Annotations/{}.xml'.format(year, 146 | image_id)) 147 | 148 | 149 | def process_dataset(name, image_paths, anno_paths, result_path, num_shards): 150 | """Process selected Pascal VOC dataset to generate TFRecords files. 151 | 152 | Parameters 153 | ---------- 154 | name : string 155 | Name of resulting dataset 'train' or 'test'. 156 | image_paths : list 157 | List of paths to images to include in dataset. 158 | anno_paths : list 159 | List of paths to corresponding image annotations. 160 | result_path : string 161 | Path to put resulting TFRecord files. 162 | num_shards : int 163 | Number of shards to split TFRecord files into. 
164 | """ 165 | shard_ranges = np.linspace(0, len(image_paths), num_shards + 1).astype(int) 166 | counter = 0 167 | for shard in range(num_shards): 168 | # Generate shard file name 169 | output_filename = '{}-{:05d}-of-{:05d}'.format(name, shard, num_shards) 170 | output_file = os.path.join(result_path, output_filename) 171 | writer = tf.python_io.TFRecordWriter(output_file) 172 | 173 | shard_counter = 0 174 | files_in_shard = range(shard_ranges[shard], shard_ranges[shard + 1]) 175 | for i in files_in_shard: 176 | image_file = image_paths[i] 177 | anno_file = anno_paths[i] 178 | 179 | # processes image + anno 180 | image_data, height, width = process_image(image_file) 181 | boxes = process_anno(anno_file) 182 | 183 | # convert to example 184 | example = convert_to_example(image_data, boxes, image_file, height, 185 | width) 186 | 187 | # write to writer 188 | writer.write(example.SerializeToString()) 189 | 190 | shard_counter += 1 191 | counter += 1 192 | 193 | if not counter % 1000: 194 | print('{} : Processed {:d} of {:d} images.'.format( 195 | datetime.now(), counter, len(image_paths))) 196 | writer.close() 197 | print('{} : Wrote {} images to {}'.format( 198 | datetime.now(), shard_counter, output_filename)) 199 | 200 | print('{} : Wrote {} images to {} shards'.format(datetime.now(), counter, 201 | num_shards)) 202 | 203 | 204 | def _main(args): 205 | """Locate files for train and test sets and then generate TFRecords.""" 206 | voc_path = args.path_to_voc 207 | voc_path = os.path.expanduser(voc_path) 208 | result_path = os.path.join(voc_path, 'TFRecords') 209 | print('Saving results to {}'.format(result_path)) 210 | 211 | train_path = os.path.join(result_path, 'train') 212 | test_path = os.path.join(result_path, 'test') 213 | 214 | train_ids = get_ids(voc_path, train_set) # 2012 trainval 215 | test_ids = get_ids(voc_path, test_set) # 2007 test 216 | train_ids_2007 = get_ids(voc_path, sets_from_2007) # 2007 trainval 217 | total_train_ids = len(train_ids) + len(train_ids_2007) 218 | print('{} train examples and {} test examples'.format(total_train_ids, 219 | len(test_ids))) 220 | 221 | train_image_paths = [ 222 | get_image_path(voc_path, '2012', i) for i in train_ids 223 | ] 224 | train_image_paths.extend( 225 | [get_image_path(voc_path, '2007', i) for i in train_ids_2007]) 226 | test_image_paths = [get_image_path(voc_path, '2007', i) for i in test_ids] 227 | 228 | train_anno_paths = [get_anno_path(voc_path, '2012', i) for i in train_ids] 229 | train_anno_paths.extend( 230 | [get_anno_path(voc_path, '2007', i) for i in train_ids_2007]) 231 | test_anno_paths = [get_anno_path(voc_path, '2007', i) for i in test_ids] 232 | 233 | process_dataset( 234 | 'train', 235 | train_image_paths, 236 | train_anno_paths, 237 | train_path, 238 | num_shards=60) 239 | process_dataset( 240 | 'test', test_image_paths, test_anno_paths, test_path, num_shards=20) 241 | 242 | 243 | if __name__ == '__main__': 244 | _main(parser.parse_args(args)) 245 | -------------------------------------------------------------------------------- /yad2k.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """ 3 | Reads Darknet19 config and weights and creates Keras model with TF backend. 4 | 5 | Currently only supports layers in Darknet19 config. 
6 | """ 7 | 8 | import argparse 9 | import configparser 10 | import io 11 | import os 12 | from collections import defaultdict 13 | 14 | import numpy as np 15 | from keras import backend as K 16 | from keras.layers import (Conv2D, GlobalAveragePooling2D, Input, Lambda, 17 | MaxPooling2D) 18 | from keras.layers.advanced_activations import LeakyReLU 19 | from keras.layers.merge import concatenate 20 | from keras.layers.normalization import BatchNormalization 21 | from keras.models import Model 22 | from keras.regularizers import l2 23 | from keras.utils.vis_utils import plot_model as plot 24 | 25 | from yad2k.models.keras_yolo import (space_to_depth_x2, 26 | space_to_depth_x2_output_shape) 27 | 28 | parser = argparse.ArgumentParser( 29 | description='Yet Another Darknet To Keras Converter.') 30 | parser.add_argument('config_path', help='Path to Darknet cfg file.') 31 | parser.add_argument('weights_path', help='Path to Darknet weights file.') 32 | parser.add_argument('output_path', help='Path to output Keras model file.') 33 | parser.add_argument( 34 | '-p', 35 | '--plot_model', 36 | help='Plot generated Keras model and save as image.', 37 | action='store_true') 38 | parser.add_argument( 39 | '-flcl', 40 | '--fully_convolutional', 41 | help='Model is fully convolutional so set input shape to (None, None, 3). ' 42 | 'WARNING: This experimental option does not work properly for YOLO_v2.', 43 | action='store_true') 44 | 45 | 46 | def unique_config_sections(config_file): 47 | """Convert all config sections to have unique names. 48 | 49 | Adds unique suffixes to config sections for compability with configparser. 50 | """ 51 | section_counters = defaultdict(int) 52 | output_stream = io.StringIO() 53 | with open(config_file) as fin: 54 | for line in fin: 55 | if line.startswith('['): 56 | section = line.strip().strip('[]') 57 | _section = section + '_' + str(section_counters[section]) 58 | section_counters[section] += 1 59 | line = line.replace(section, _section) 60 | output_stream.write(line) 61 | output_stream.seek(0) 62 | return output_stream 63 | 64 | 65 | # %% 66 | def _main(args): 67 | config_path = os.path.expanduser(args.config_path) 68 | weights_path = os.path.expanduser(args.weights_path) 69 | assert config_path.endswith('.cfg'), '{} is not a .cfg file'.format( 70 | config_path) 71 | assert weights_path.endswith( 72 | '.weights'), '{} is not a .weights file'.format(weights_path) 73 | 74 | output_path = os.path.expanduser(args.output_path) 75 | assert output_path.endswith( 76 | '.h5'), 'output path {} is not a .h5 file'.format(output_path) 77 | output_root = os.path.splitext(output_path)[0] 78 | 79 | # Load weights and config. 80 | print('Loading weights.') 81 | weights_file = open(weights_path, 'rb') 82 | weights_header = np.ndarray( 83 | shape=(4, ), dtype='int32', buffer=weights_file.read(16)) 84 | print('Weights Header: ', weights_header) 85 | # TODO: Check transpose flag when implementing fully connected layers. 
86 | # transpose = (weight_header[0] > 1000) or (weight_header[1] > 1000) 87 | 88 | print('Parsing Darknet config.') 89 | unique_config_file = unique_config_sections(config_path) 90 | cfg_parser = configparser.ConfigParser() 91 | cfg_parser.read_file(unique_config_file) 92 | 93 | print('Creating Keras model.') 94 | if args.fully_convolutional: 95 | image_height, image_width = None, None 96 | else: 97 | image_height = int(cfg_parser['net_0']['height']) 98 | image_width = int(cfg_parser['net_0']['width']) 99 | prev_layer = Input(shape=(image_height, image_width, 3)) 100 | all_layers = [prev_layer] 101 | 102 | weight_decay = float(cfg_parser['net_0']['decay'] 103 | ) if 'net_0' in cfg_parser.sections() else 5e-4 104 | count = 0 105 | for section in cfg_parser.sections(): 106 | print('Parsing section {}'.format(section)) 107 | if section.startswith('convolutional'): 108 | filters = int(cfg_parser[section]['filters']) 109 | size = int(cfg_parser[section]['size']) 110 | stride = int(cfg_parser[section]['stride']) 111 | pad = int(cfg_parser[section]['pad']) 112 | activation = cfg_parser[section]['activation'] 113 | batch_normalize = 'batch_normalize' in cfg_parser[section] 114 | 115 | # padding='same' is equivalent to Darknet pad=1 116 | padding = 'same' if pad == 1 else 'valid' 117 | 118 | # Setting weights. 119 | # Darknet serializes convolutional weights as: 120 | # [bias/beta, [gamma, mean, variance], conv_weights] 121 | prev_layer_shape = K.int_shape(prev_layer) 122 | 123 | # TODO: This assumes channel last dim_ordering. 124 | weights_shape = (size, size, prev_layer_shape[-1], filters) 125 | darknet_w_shape = (filters, weights_shape[2], size, size) 126 | weights_size = np.product(weights_shape) 127 | 128 | print('conv2d', 'bn' 129 | if batch_normalize else ' ', activation, weights_shape) 130 | 131 | conv_bias = np.ndarray( 132 | shape=(filters, ), 133 | dtype='float32', 134 | buffer=weights_file.read(filters * 4)) 135 | count += filters 136 | 137 | if batch_normalize: 138 | bn_weights = np.ndarray( 139 | shape=(3, filters), 140 | dtype='float32', 141 | buffer=weights_file.read(filters * 12)) 142 | count += 3 * filters 143 | 144 | # TODO: Keras BatchNormalization mistakenly refers to var 145 | # as std. 146 | bn_weight_list = [ 147 | bn_weights[0], # scale gamma 148 | conv_bias, # shift beta 149 | bn_weights[1], # running mean 150 | bn_weights[2] # running var 151 | ] 152 | 153 | conv_weights = np.ndarray( 154 | shape=darknet_w_shape, 155 | dtype='float32', 156 | buffer=weights_file.read(weights_size * 4)) 157 | count += weights_size 158 | 159 | # DarkNet conv_weights are serialized Caffe-style: 160 | # (out_dim, in_dim, height, width) 161 | # We would like to set these to Tensorflow order: 162 | # (height, width, in_dim, out_dim) 163 | # TODO: Add check for Theano dim ordering. 164 | conv_weights = np.transpose(conv_weights, [2, 3, 1, 0]) 165 | conv_weights = [conv_weights] if batch_normalize else [ 166 | conv_weights, conv_bias 167 | ] 168 | 169 | # Handle activation. 170 | act_fn = None 171 | if activation == 'leaky': 172 | pass # Add advanced activation later. 
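            # Note: the leaky activation is not fused into Conv2D here because
            # Darknet applies it after batch normalization; it is added as a
            # separate LeakyReLU layer further down in this section.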
173 | elif activation != 'linear': 174 | raise ValueError( 175 | 'Unknown activation function `{}` in section {}'.format( 176 | activation, section)) 177 | 178 | # Create Conv2D layer 179 | conv_layer = (Conv2D( 180 | filters, (size, size), 181 | strides=(stride, stride), 182 | kernel_regularizer=l2(weight_decay), 183 | use_bias=not batch_normalize, 184 | weights=conv_weights, 185 | activation=act_fn, 186 | padding=padding))(prev_layer) 187 | 188 | if batch_normalize: 189 | conv_layer = (BatchNormalization( 190 | weights=bn_weight_list))(conv_layer) 191 | prev_layer = conv_layer 192 | 193 | if activation == 'linear': 194 | all_layers.append(prev_layer) 195 | elif activation == 'leaky': 196 | act_layer = LeakyReLU(alpha=0.1)(prev_layer) 197 | prev_layer = act_layer 198 | all_layers.append(act_layer) 199 | 200 | elif section.startswith('maxpool'): 201 | size = int(cfg_parser[section]['size']) 202 | stride = int(cfg_parser[section]['stride']) 203 | all_layers.append( 204 | MaxPooling2D( 205 | padding='same', 206 | pool_size=(size, size), 207 | strides=(stride, stride))(prev_layer)) 208 | prev_layer = all_layers[-1] 209 | 210 | elif section.startswith('avgpool'): 211 | if cfg_parser.items(section) != []: 212 | raise ValueError('{} with params unsupported.'.format(section)) 213 | all_layers.append(GlobalAveragePooling2D()(prev_layer)) 214 | prev_layer = all_layers[-1] 215 | 216 | elif section.startswith('route'): 217 | ids = [int(i) for i in cfg_parser[section]['layers'].split(',')] 218 | layers = [all_layers[i] for i in ids] 219 | if len(layers) > 1: 220 | print('Concatenating route layers:', layers) 221 | concatenate_layer = concatenate(layers) 222 | all_layers.append(concatenate_layer) 223 | prev_layer = concatenate_layer 224 | else: 225 | skip_layer = layers[0] # only one layer to route 226 | all_layers.append(skip_layer) 227 | prev_layer = skip_layer 228 | 229 | elif section.startswith('reorg'): 230 | block_size = int(cfg_parser[section]['stride']) 231 | assert block_size == 2, 'Only reorg with stride 2 supported.' 232 | all_layers.append( 233 | Lambda( 234 | space_to_depth_x2, 235 | output_shape=space_to_depth_x2_output_shape, 236 | name='space_to_depth_x2')(prev_layer)) 237 | prev_layer = all_layers[-1] 238 | 239 | elif section.startswith('region'): 240 | with open('{}_anchors.txt'.format(output_root), 'w') as f: 241 | print(cfg_parser[section]['anchors'], file=f) 242 | 243 | elif (section.startswith('net') or section.startswith('cost') or 244 | section.startswith('softmax')): 245 | pass # Configs not currently handled during model definition. 246 | 247 | else: 248 | raise ValueError( 249 | 'Unsupported section header type: {}'.format(section)) 250 | 251 | # Create and save model. 252 | model = Model(inputs=all_layers[0], outputs=all_layers[-1]) 253 | print(model.summary()) 254 | model.save('{}'.format(output_path)) 255 | print('Saved Keras model to {}'.format(output_path)) 256 | # Check to see if all weights have been read. 
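    # Any bytes still left in the file are unread float32 parameters (4 bytes
    # each); a nonzero remainder usually means the .cfg and .weights files do
    # not describe the same network.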
257 | remaining_weights = len(weights_file.read()) / 4 258 | weights_file.close() 259 | print('Read {} of {} from Darknet weights.'.format(count, count + 260 | remaining_weights)) 261 | if remaining_weights > 0: 262 | print('Warning: {} unused weights'.format(remaining_weights)) 263 | 264 | if args.plot_model: 265 | plot(model, to_file='{}.png'.format(output_root), show_shapes=True) 266 | print('Saved model plot to {}.png'.format(output_root)) 267 | 268 | 269 | if __name__ == '__main__': 270 | _main(parser.parse_args()) 271 | -------------------------------------------------------------------------------- /yad2k/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allanzelener/YAD2K/a42c760ef868bc115e596b56863dc25624d2e756/yad2k/__init__.py -------------------------------------------------------------------------------- /yad2k/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/allanzelener/YAD2K/a42c760ef868bc115e596b56863dc25624d2e756/yad2k/models/__init__.py -------------------------------------------------------------------------------- /yad2k/models/keras_darknet19.py: -------------------------------------------------------------------------------- 1 | """Darknet19 Model Defined in Keras.""" 2 | import functools 3 | from functools import partial 4 | 5 | from keras.layers import Conv2D, MaxPooling2D 6 | from keras.layers.advanced_activations import LeakyReLU 7 | from keras.layers.normalization import BatchNormalization 8 | from keras.models import Model 9 | from keras.regularizers import l2 10 | 11 | from ..utils import compose 12 | 13 | # Partial wrapper for Convolution2D with static default argument. 
14 | _DarknetConv2D = partial(Conv2D, padding='same') 15 | 16 | 17 | @functools.wraps(Conv2D) 18 | def DarknetConv2D(*args, **kwargs): 19 | """Wrapper to set Darknet weight regularizer for Convolution2D.""" 20 | darknet_conv_kwargs = {'kernel_regularizer': l2(5e-4)} 21 | darknet_conv_kwargs.update(kwargs) 22 | return _DarknetConv2D(*args, **darknet_conv_kwargs) 23 | 24 | 25 | def DarknetConv2D_BN_Leaky(*args, **kwargs): 26 | """Darknet Convolution2D followed by BatchNormalization and LeakyReLU.""" 27 | no_bias_kwargs = {'use_bias': False} 28 | no_bias_kwargs.update(kwargs) 29 | return compose( 30 | DarknetConv2D(*args, **no_bias_kwargs), 31 | BatchNormalization(), 32 | LeakyReLU(alpha=0.1)) 33 | 34 | 35 | def bottleneck_block(outer_filters, bottleneck_filters): 36 | """Bottleneck block of 3x3, 1x1, 3x3 convolutions.""" 37 | return compose( 38 | DarknetConv2D_BN_Leaky(outer_filters, (3, 3)), 39 | DarknetConv2D_BN_Leaky(bottleneck_filters, (1, 1)), 40 | DarknetConv2D_BN_Leaky(outer_filters, (3, 3))) 41 | 42 | 43 | def bottleneck_x2_block(outer_filters, bottleneck_filters): 44 | """Bottleneck block of 3x3, 1x1, 3x3, 1x1, 3x3 convolutions.""" 45 | return compose( 46 | bottleneck_block(outer_filters, bottleneck_filters), 47 | DarknetConv2D_BN_Leaky(bottleneck_filters, (1, 1)), 48 | DarknetConv2D_BN_Leaky(outer_filters, (3, 3))) 49 | 50 | 51 | def darknet_body(): 52 | """Generate first 18 conv layers of Darknet-19.""" 53 | return compose( 54 | DarknetConv2D_BN_Leaky(32, (3, 3)), 55 | MaxPooling2D(), 56 | DarknetConv2D_BN_Leaky(64, (3, 3)), 57 | MaxPooling2D(), 58 | bottleneck_block(128, 64), 59 | MaxPooling2D(), 60 | bottleneck_block(256, 128), 61 | MaxPooling2D(), 62 | bottleneck_x2_block(512, 256), 63 | MaxPooling2D(), 64 | bottleneck_x2_block(1024, 512)) 65 | 66 | 67 | def darknet19(inputs): 68 | """Generate Darknet-19 model for Imagenet classification.""" 69 | body = darknet_body()(inputs) 70 | logits = DarknetConv2D(1000, (1, 1), activation='softmax')(body) 71 | return Model(inputs, logits) 72 | -------------------------------------------------------------------------------- /yad2k/models/keras_yolo.py: -------------------------------------------------------------------------------- 1 | """YOLO_v2 Model Defined in Keras.""" 2 | import sys 3 | 4 | import numpy as np 5 | import tensorflow as tf 6 | from keras import backend as K 7 | from keras.layers import Lambda 8 | from keras.layers.merge import concatenate 9 | from keras.models import Model 10 | 11 | from ..utils import compose 12 | from .keras_darknet19 import (DarknetConv2D, DarknetConv2D_BN_Leaky, 13 | darknet_body) 14 | 15 | sys.path.append('..') 16 | 17 | voc_anchors = np.array( 18 | [[1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52]]) 19 | 20 | voc_classes = [ 21 | "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", 22 | "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", 23 | "pottedplant", "sheep", "sofa", "train", "tvmonitor" 24 | ] 25 | 26 | 27 | def space_to_depth_x2(x): 28 | """Thin wrapper for Tensorflow space_to_depth with block_size=2.""" 29 | # Import currently required to make Lambda work. 30 | # See: https://github.com/fchollet/keras/issues/5088#issuecomment-273851273 31 | import tensorflow as tf 32 | return tf.space_to_depth(x, block_size=2) 33 | 34 | 35 | def space_to_depth_x2_output_shape(input_shape): 36 | """Determine space_to_depth output shape for block_size=2. 37 | 38 | Note: For Lambda with TensorFlow backend, output shape may not be needed. 
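    For example (shapes for a 416x416 input): the passthrough tensor of shape
    (batch, 26, 26, 64) becomes (batch, 13, 13, 256), since each 2x2 spatial
    block is stacked into the channel dimension.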
39 | """ 40 | return (input_shape[0], input_shape[1] // 2, input_shape[2] // 2, 4 * 41 | input_shape[3]) if input_shape[1] else (input_shape[0], None, None, 42 | 4 * input_shape[3]) 43 | 44 | 45 | def yolo_body(inputs, num_anchors, num_classes): 46 | """Create YOLO_V2 model CNN body in Keras.""" 47 | darknet = Model(inputs, darknet_body()(inputs)) 48 | conv20 = compose( 49 | DarknetConv2D_BN_Leaky(1024, (3, 3)), 50 | DarknetConv2D_BN_Leaky(1024, (3, 3)))(darknet.output) 51 | 52 | conv13 = darknet.layers[43].output 53 | conv21 = DarknetConv2D_BN_Leaky(64, (1, 1))(conv13) 54 | # TODO: Allow Keras Lambda to use func arguments for output_shape? 55 | conv21_reshaped = Lambda( 56 | space_to_depth_x2, 57 | output_shape=space_to_depth_x2_output_shape, 58 | name='space_to_depth')(conv21) 59 | 60 | x = concatenate([conv21_reshaped, conv20]) 61 | x = DarknetConv2D_BN_Leaky(1024, (3, 3))(x) 62 | x = DarknetConv2D(num_anchors * (num_classes + 5), (1, 1))(x) 63 | return Model(inputs, x) 64 | 65 | 66 | def yolo_head(feats, anchors, num_classes): 67 | """Convert final layer features to bounding box parameters. 68 | 69 | Parameters 70 | ---------- 71 | feats : tensor 72 | Final convolutional layer features. 73 | anchors : array-like 74 | Anchor box widths and heights. 75 | num_classes : int 76 | Number of target classes. 77 | 78 | Returns 79 | ------- 80 | box_xy : tensor 81 | x, y box predictions adjusted by spatial location in conv layer. 82 | box_wh : tensor 83 | w, h box predictions adjusted by anchors and conv spatial resolution. 84 | box_conf : tensor 85 | Probability estimate for whether each box contains any object. 86 | box_class_pred : tensor 87 | Probability distribution estimate for each box over class labels. 88 | """ 89 | num_anchors = len(anchors) 90 | # Reshape to batch, height, width, num_anchors, box_params. 91 | anchors_tensor = K.reshape(K.variable(anchors), [1, 1, 1, num_anchors, 2]) 92 | 93 | # Static implementation for fixed models. 94 | # TODO: Remove or add option for static implementation. 95 | # _, conv_height, conv_width, _ = K.int_shape(feats) 96 | # conv_dims = K.variable([conv_width, conv_height]) 97 | 98 | # Dynamic implementation of conv dims for fully convolutional model. 99 | conv_dims = K.shape(feats)[1:3] # assuming channels last 100 | # In YOLO the height index is the inner most iteration. 101 | conv_height_index = K.arange(0, stop=conv_dims[0]) 102 | conv_width_index = K.arange(0, stop=conv_dims[1]) 103 | conv_height_index = K.tile(conv_height_index, [conv_dims[1]]) 104 | 105 | # TODO: Repeat_elements and tf.split doesn't support dynamic splits. 106 | # conv_width_index = K.repeat_elements(conv_width_index, conv_dims[1], axis=0) 107 | conv_width_index = K.tile( 108 | K.expand_dims(conv_width_index, 0), [conv_dims[0], 1]) 109 | conv_width_index = K.flatten(K.transpose(conv_width_index)) 110 | conv_index = K.transpose(K.stack([conv_height_index, conv_width_index])) 111 | conv_index = K.reshape(conv_index, [1, conv_dims[0], conv_dims[1], 1, 2]) 112 | conv_index = K.cast(conv_index, K.dtype(feats)) 113 | 114 | feats = K.reshape( 115 | feats, [-1, conv_dims[0], conv_dims[1], num_anchors, num_classes + 5]) 116 | conv_dims = K.cast(K.reshape(conv_dims, [1, 1, 1, 1, 2]), K.dtype(feats)) 117 | 118 | # Static generation of conv_index: 119 | # conv_index = np.array([_ for _ in np.ndindex(conv_width, conv_height)]) 120 | # conv_index = conv_index[:, [1, 0]] # swap columns for YOLO ordering. 
121 | # conv_index = K.variable( 122 | # conv_index.reshape(1, conv_height, conv_width, 1, 2)) 123 | # feats = Reshape( 124 | # (conv_dims[0], conv_dims[1], num_anchors, num_classes + 5))(feats) 125 | 126 | box_xy = K.sigmoid(feats[..., :2]) 127 | box_wh = K.exp(feats[..., 2:4]) 128 | box_confidence = K.sigmoid(feats[..., 4:5]) 129 | box_class_probs = K.softmax(feats[..., 5:]) 130 | 131 | # Adjust preditions to each spatial grid point and anchor size. 132 | # Note: YOLO iterates over height index before width index. 133 | box_xy = (box_xy + conv_index) / conv_dims 134 | box_wh = box_wh * anchors_tensor / conv_dims 135 | 136 | return box_xy, box_wh, box_confidence, box_class_probs 137 | 138 | 139 | def yolo_boxes_to_corners(box_xy, box_wh): 140 | """Convert YOLO box predictions to bounding box corners.""" 141 | box_mins = box_xy - (box_wh / 2.) 142 | box_maxes = box_xy + (box_wh / 2.) 143 | 144 | return K.concatenate([ 145 | box_mins[..., 1:2], # y_min 146 | box_mins[..., 0:1], # x_min 147 | box_maxes[..., 1:2], # y_max 148 | box_maxes[..., 0:1] # x_max 149 | ]) 150 | 151 | 152 | def yolo_loss(args, 153 | anchors, 154 | num_classes, 155 | rescore_confidence=False, 156 | print_loss=False): 157 | """YOLO localization loss function. 158 | 159 | Parameters 160 | ---------- 161 | yolo_output : tensor 162 | Final convolutional layer features. 163 | 164 | true_boxes : tensor 165 | Ground truth boxes tensor with shape [batch, num_true_boxes, 5] 166 | containing box x_center, y_center, width, height, and class. 167 | 168 | detectors_mask : array 169 | 0/1 mask for detector positions where there is a matching ground truth. 170 | 171 | matching_true_boxes : array 172 | Corresponding ground truth boxes for positive detector positions. 173 | Already adjusted for conv height and width. 174 | 175 | anchors : tensor 176 | Anchor boxes for model. 177 | 178 | num_classes : int 179 | Number of object classes. 180 | 181 | rescore_confidence : bool, default=False 182 | If true then set confidence target to IOU of best predicted box with 183 | the closest matching ground truth box. 184 | 185 | print_loss : bool, default=False 186 | If True then use a tf.Print() to print the loss components. 187 | 188 | Returns 189 | ------- 190 | mean_loss : float 191 | mean localization loss across minibatch 192 | """ 193 | (yolo_output, true_boxes, detectors_mask, matching_true_boxes) = args 194 | num_anchors = len(anchors) 195 | object_scale = 5 196 | no_object_scale = 1 197 | class_scale = 1 198 | coordinates_scale = 1 199 | pred_xy, pred_wh, pred_confidence, pred_class_prob = yolo_head( 200 | yolo_output, anchors, num_classes) 201 | 202 | # Unadjusted box predictions for loss. 203 | # TODO: Remove extra computation shared with yolo_head. 204 | yolo_output_shape = K.shape(yolo_output) 205 | feats = K.reshape(yolo_output, [ 206 | -1, yolo_output_shape[1], yolo_output_shape[2], num_anchors, 207 | num_classes + 5 208 | ]) 209 | pred_boxes = K.concatenate( 210 | (K.sigmoid(feats[..., 0:2]), feats[..., 2:4]), axis=-1) 211 | 212 | # TODO: Adjust predictions by image width/height for non-square images? 213 | # IOUs may be off due to different aspect ratio. 214 | 215 | # Expand pred x,y,w,h to allow comparison with ground truth. 216 | # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params 217 | pred_xy = K.expand_dims(pred_xy, 4) 218 | pred_wh = K.expand_dims(pred_wh, 4) 219 | 220 | pred_wh_half = pred_wh / 2. 
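    # Shape note: the expanded pred_* tensors are
    # (batch, conv_height, conv_width, num_anchors, 1, 2) while the reshaped
    # true_* tensors below are (batch, 1, 1, 1, num_true_boxes, 2), so the
    # min/max arithmetic broadcasts to compare every predicted box with every
    # ground truth box.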
221 | pred_mins = pred_xy - pred_wh_half 222 | pred_maxes = pred_xy + pred_wh_half 223 | 224 | true_boxes_shape = K.shape(true_boxes) 225 | 226 | # batch, conv_height, conv_width, num_anchors, num_true_boxes, box_params 227 | true_boxes = K.reshape(true_boxes, [ 228 | true_boxes_shape[0], 1, 1, 1, true_boxes_shape[1], true_boxes_shape[2] 229 | ]) 230 | true_xy = true_boxes[..., 0:2] 231 | true_wh = true_boxes[..., 2:4] 232 | 233 | # Find IOU of each predicted box with each ground truth box. 234 | true_wh_half = true_wh / 2. 235 | true_mins = true_xy - true_wh_half 236 | true_maxes = true_xy + true_wh_half 237 | 238 | intersect_mins = K.maximum(pred_mins, true_mins) 239 | intersect_maxes = K.minimum(pred_maxes, true_maxes) 240 | intersect_wh = K.maximum(intersect_maxes - intersect_mins, 0.) 241 | intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1] 242 | 243 | pred_areas = pred_wh[..., 0] * pred_wh[..., 1] 244 | true_areas = true_wh[..., 0] * true_wh[..., 1] 245 | 246 | union_areas = pred_areas + true_areas - intersect_areas 247 | iou_scores = intersect_areas / union_areas 248 | 249 | # Best IOUs for each location. 250 | best_ious = K.max(iou_scores, axis=4) # Best IOU scores. 251 | best_ious = K.expand_dims(best_ious) 252 | 253 | # A detector has found an object if IOU > thresh for some true box. 254 | object_detections = K.cast(best_ious > 0.6, K.dtype(best_ious)) 255 | 256 | # TODO: Darknet region training includes extra coordinate loss for early 257 | # training steps to encourage predictions to match anchor priors. 258 | 259 | # Determine confidence weights from object and no_object weights. 260 | # NOTE: YOLO does not use binary cross-entropy here. 261 | no_object_weights = (no_object_scale * (1 - object_detections) * 262 | (1 - detectors_mask)) 263 | no_objects_loss = no_object_weights * K.square(-pred_confidence) 264 | 265 | if rescore_confidence: 266 | objects_loss = (object_scale * detectors_mask * 267 | K.square(best_ious - pred_confidence)) 268 | else: 269 | objects_loss = (object_scale * detectors_mask * 270 | K.square(1 - pred_confidence)) 271 | confidence_loss = objects_loss + no_objects_loss 272 | 273 | # Classification loss for matching detections. 274 | # NOTE: YOLO does not use categorical cross-entropy loss here. 275 | matching_classes = K.cast(matching_true_boxes[..., 4], 'int32') 276 | matching_classes = K.one_hot(matching_classes, num_classes) 277 | classification_loss = (class_scale * detectors_mask * 278 | K.square(matching_classes - pred_class_prob)) 279 | 280 | # Coordinate loss for matching detection boxes. 
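    # matching_true_boxes holds (x_offset, y_offset, log(w / anchor_w),
    # log(h / anchor_h), class) as built by preprocess_true_boxes, and
    # pred_boxes above holds (sigmoid(t_x), sigmoid(t_y), t_w, t_h), so this
    # is a squared error in the same offset / log-scale parameterization.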
281 | matching_boxes = matching_true_boxes[..., 0:4] 282 | coordinates_loss = (coordinates_scale * detectors_mask * 283 | K.square(matching_boxes - pred_boxes)) 284 | 285 | confidence_loss_sum = K.sum(confidence_loss) 286 | classification_loss_sum = K.sum(classification_loss) 287 | coordinates_loss_sum = K.sum(coordinates_loss) 288 | total_loss = 0.5 * ( 289 | confidence_loss_sum + classification_loss_sum + coordinates_loss_sum) 290 | if print_loss: 291 | total_loss = tf.Print( 292 | total_loss, [ 293 | total_loss, confidence_loss_sum, classification_loss_sum, 294 | coordinates_loss_sum 295 | ], 296 | message='yolo_loss, conf_loss, class_loss, box_coord_loss:') 297 | 298 | return total_loss 299 | 300 | 301 | def yolo(inputs, anchors, num_classes): 302 | """Generate a complete YOLO_v2 localization model.""" 303 | num_anchors = len(anchors) 304 | body = yolo_body(inputs, num_anchors, num_classes) 305 | outputs = yolo_head(body.output, anchors, num_classes) 306 | return outputs 307 | 308 | 309 | def yolo_filter_boxes(boxes, box_confidence, box_class_probs, threshold=.6): 310 | """Filter YOLO boxes based on object and class confidence.""" 311 | box_scores = box_confidence * box_class_probs 312 | box_classes = K.argmax(box_scores, axis=-1) 313 | box_class_scores = K.max(box_scores, axis=-1) 314 | prediction_mask = box_class_scores >= threshold 315 | 316 | # TODO: Expose tf.boolean_mask to Keras backend? 317 | boxes = tf.boolean_mask(boxes, prediction_mask) 318 | scores = tf.boolean_mask(box_class_scores, prediction_mask) 319 | classes = tf.boolean_mask(box_classes, prediction_mask) 320 | return boxes, scores, classes 321 | 322 | 323 | def yolo_eval(yolo_outputs, 324 | image_shape, 325 | max_boxes=10, 326 | score_threshold=.6, 327 | iou_threshold=.5): 328 | """Evaluate YOLO model on given input batch and return filtered boxes.""" 329 | box_xy, box_wh, box_confidence, box_class_probs = yolo_outputs 330 | boxes = yolo_boxes_to_corners(box_xy, box_wh) 331 | boxes, scores, classes = yolo_filter_boxes( 332 | boxes, box_confidence, box_class_probs, threshold=score_threshold) 333 | 334 | # Scale boxes back to original image shape. 335 | height = image_shape[0] 336 | width = image_shape[1] 337 | image_dims = K.stack([height, width, height, width]) 338 | image_dims = K.reshape(image_dims, [1, 4]) 339 | boxes = boxes * image_dims 340 | 341 | # TODO: Something must be done about this ugly hack! 342 | max_boxes_tensor = K.variable(max_boxes, dtype='int32') 343 | K.get_session().run(tf.variables_initializer([max_boxes_tensor])) 344 | nms_index = tf.image.non_max_suppression( 345 | boxes, scores, max_boxes_tensor, iou_threshold=iou_threshold) 346 | boxes = K.gather(boxes, nms_index) 347 | scores = K.gather(scores, nms_index) 348 | classes = K.gather(classes, nms_index) 349 | return boxes, scores, classes 350 | 351 | 352 | def preprocess_true_boxes(true_boxes, anchors, image_size): 353 | """Find detector in YOLO where ground truth box should appear. 354 | 355 | Parameters 356 | ---------- 357 | true_boxes : array 358 | List of ground truth boxes in form of relative x, y, w, h, class. 359 | Relative coordinates are in the range [0, 1] indicating a percentage 360 | of the original image dimensions. 361 | anchors : array 362 | List of anchors in form of w, h. 363 | Anchors are assumed to be in the range [0, conv_size] where conv_size 364 | is the spatial dimension of the final convolutional features. 365 | image_size : array-like 366 | List of image dimensions in form of h, w in pixels. 
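        For example, a 416x416 input is downsampled by a factor of 32 to a
        13x13 grid, so anchors such as voc_anchors above are given in units
        of these 13x13 grid cells.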
367 | 368 | Returns 369 | ------- 370 | detectors_mask : array 371 | 0/1 mask for detectors in [conv_height, conv_width, num_anchors, 1] 372 | that should be compared with a matching ground truth box. 373 | matching_true_boxes: array 374 | Same shape as detectors_mask with the corresponding ground truth box 375 | adjusted for comparison with predicted parameters at training time. 376 | """ 377 | height, width = image_size 378 | num_anchors = len(anchors) 379 | # Downsampling factor of 5x 2-stride max_pools == 32. 380 | # TODO: Remove hardcoding of downscaling calculations. 381 | assert height % 32 == 0, 'Image sizes in YOLO_v2 must be multiples of 32.' 382 | assert width % 32 == 0, 'Image sizes in YOLO_v2 must be multiples of 32.' 383 | conv_height = height // 32 384 | conv_width = width // 32 385 | num_box_params = true_boxes.shape[1] 386 | detectors_mask = np.zeros( 387 | (conv_height, conv_width, num_anchors, 1), dtype=np.float32) 388 | matching_true_boxes = np.zeros( 389 | (conv_height, conv_width, num_anchors, num_box_params), 390 | dtype=np.float32) 391 | 392 | for box in true_boxes: 393 | # scale box to convolutional feature spatial dimensions 394 | box_class = box[4:5] 395 | box = box[0:4] * np.array( 396 | [conv_width, conv_height, conv_width, conv_height]) 397 | i = np.floor(box[1]).astype('int') 398 | j = np.floor(box[0]).astype('int') 399 | best_iou = 0 400 | best_anchor = 0 401 | for k, anchor in enumerate(anchors): 402 | # Find IOU between box shifted to origin and anchor box. 403 | box_maxes = box[2:4] / 2. 404 | box_mins = -box_maxes 405 | anchor_maxes = (anchor / 2.) 406 | anchor_mins = -anchor_maxes 407 | 408 | intersect_mins = np.maximum(box_mins, anchor_mins) 409 | intersect_maxes = np.minimum(box_maxes, anchor_maxes) 410 | intersect_wh = np.maximum(intersect_maxes - intersect_mins, 0.) 411 | intersect_area = intersect_wh[0] * intersect_wh[1] 412 | box_area = box[2] * box[3] 413 | anchor_area = anchor[0] * anchor[1] 414 | iou = intersect_area / (box_area + anchor_area - intersect_area) 415 | if iou > best_iou: 416 | best_iou = iou 417 | best_anchor = k 418 | 419 | if best_iou > 0: 420 | detectors_mask[i, j, best_anchor] = 1 421 | adjusted_box = np.array( 422 | [ 423 | box[0] - j, box[1] - i, 424 | np.log(box[2] / anchors[best_anchor][0]), 425 | np.log(box[3] / anchors[best_anchor][1]), box_class 426 | ], 427 | dtype=np.float32) 428 | matching_true_boxes[i, j, best_anchor] = adjusted_box 429 | return detectors_mask, matching_true_boxes 430 | -------------------------------------------------------------------------------- /yad2k/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | -------------------------------------------------------------------------------- /yad2k/utils/draw_boxes.py: -------------------------------------------------------------------------------- 1 | """Draw predicted or ground truth boxes on input image.""" 2 | 3 | import colorsys 4 | import random 5 | 6 | import numpy as np 7 | from PIL import Image, ImageDraw, ImageFont 8 | 9 | 10 | def get_colors_for_classes(num_classes): 11 | """Return list of random colors for number of classes given.""" 12 | # Use previously generated colors if num_classes is the same. 13 | if (hasattr(get_colors_for_classes, "colors") and 14 | len(get_colors_for_classes.colors) == num_classes): 15 | return get_colors_for_classes.colors 16 | 17 | hsv_tuples = [(x / num_classes, 1., 1.) 
for x in range(num_classes)] 18 | colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples)) 19 | colors = list( 20 | map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)), 21 | colors)) 22 | random.seed(10101) # Fixed seed for consistent colors across runs. 23 | random.shuffle(colors) # Shuffle colors to decorrelate adjacent classes. 24 | random.seed(None) # Reset seed to default. 25 | get_colors_for_classes.colors = colors # Save colors for future calls. 26 | return colors 27 | 28 | 29 | def draw_boxes(image, boxes, box_classes, class_names, scores=None): 30 | """Draw bounding boxes on image. 31 | 32 | Draw bounding boxes with class name and optional box score on image. 33 | 34 | Args: 35 | image: An `array` of shape (width, height, 3) with values in [0, 1]. 36 | boxes: An `array` of shape (num_boxes, 4) containing box corners as 37 | (y_min, x_min, y_max, x_max). 38 | box_classes: A `list` of indicies into `class_names`. 39 | class_names: A `list` of `string` class names. 40 | `scores`: A `list` of scores for each box. 41 | 42 | Returns: 43 | A copy of `image` modified with given bounding boxes. 44 | """ 45 | image = Image.fromarray(np.floor(image * 255 + 0.5).astype('uint8')) 46 | 47 | font = ImageFont.truetype( 48 | font='font/FiraMono-Medium.otf', 49 | size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32')) 50 | thickness = (image.size[0] + image.size[1]) // 300 51 | 52 | colors = get_colors_for_classes(len(class_names)) 53 | 54 | for i, c in list(enumerate(box_classes)): 55 | box_class = class_names[c] 56 | box = boxes[i] 57 | if isinstance(scores, np.ndarray): 58 | score = scores[i] 59 | label = '{} {:.2f}'.format(box_class, score) 60 | else: 61 | label = '{}'.format(box_class) 62 | 63 | draw = ImageDraw.Draw(image) 64 | label_size = draw.textsize(label, font) 65 | 66 | top, left, bottom, right = box 67 | top = max(0, np.floor(top + 0.5).astype('int32')) 68 | left = max(0, np.floor(left + 0.5).astype('int32')) 69 | bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32')) 70 | right = min(image.size[0], np.floor(right + 0.5).astype('int32')) 71 | print(label, (left, top), (right, bottom)) 72 | 73 | if top - label_size[1] >= 0: 74 | text_origin = np.array([left, top - label_size[1]]) 75 | else: 76 | text_origin = np.array([left, top + 1]) 77 | 78 | # My kingdom for a good redistributable image drawing library. 79 | for i in range(thickness): 80 | draw.rectangle( 81 | [left + i, top + i, right - i, bottom - i], outline=colors[c]) 82 | draw.rectangle( 83 | [tuple(text_origin), tuple(text_origin + label_size)], 84 | fill=colors[c]) 85 | draw.text(text_origin, label, fill=(0, 0, 0), font=font) 86 | del draw 87 | 88 | return np.array(image) 89 | -------------------------------------------------------------------------------- /yad2k/utils/utils.py: -------------------------------------------------------------------------------- 1 | """Miscellaneous utility functions.""" 2 | 3 | from functools import reduce 4 | 5 | 6 | def compose(*funcs): 7 | """Compose arbitrarily many functions, evaluated left to right. 8 | 9 | Reference: https://mathieularose.com/function-composition-in-python/ 10 | """ 11 | # return lambda x: reduce(lambda v, f: f(v), funcs, x) 12 | if funcs: 13 | return reduce(lambda f, g: lambda *a, **kw: g(f(*a, **kw)), funcs) 14 | else: 15 | raise ValueError('Composition of empty sequence not supported.') 16 | --------------------------------------------------------------------------------
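For quick reference, a minimal sketch (not part of the repository) showing how the `compose` helper above behaves; it assumes the repository root is on the Python path so that `yad2k.utils` is importable:

```python
from yad2k.utils import compose


def double(x):
    return x * 2


def increment(x):
    return x + 1


# compose evaluates its arguments left to right: compose(f, g)(x) == g(f(x)).
# The model code uses this to chain Keras layers in reading order,
# e.g. DarknetConv2D -> BatchNormalization -> LeakyReLU.
assert compose(double, increment)(10) == 21  # (10 * 2) + 1
```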