├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── barebone-yolo.ipynb
├── coco2pascal.py
├── images
│   ├── custom-loss.png
│   ├── custom-loss2.png
│   └── model.png
├── model.png
├── preprocessing.py
└── utils.py
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Anderson Banihirwe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Keras YOLO Series 2 | Keras implementation of YOLO (You Only Look Once): Unified, Real-Time Object Detection 3 | 4 | This is a [Keras](https://keras.io/) 5 | implementation of YOLO and YOLOv2. 6 | This project is mainly based on [darkflow](https://github.com/thtrieu/darkflow) 7 | and [darknet](https://github.com/pjreddie/darknet). 8 | 9 | For details about YOLO and YOLOv2, please refer to their [project page](https://pjreddie.com/darknet/yolo/) 10 | and the [paper](https://arxiv.org/abs/1612.08242): 11 | YOLO9000: Better, Faster, Stronger by Joseph Redmon and Ali Farhadi. 12 | 13 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/__init__.py -------------------------------------------------------------------------------- /barebone-yolo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": "true" 7 | }, 8 | "source": [ 9 | " # Table of Contents\n", 10 | "
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# YOLO" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Import packages" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 35, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "from keras import models\n", 34 | "from keras import layers\n", 35 | "from keras import callbacks\n", 36 | "from keras import optimizers\n", 37 | "from keras.utils.vis_utils import plot_model\n", 38 | "import keras.backend as K\n", 39 | "import tensorflow as tf\n", 40 | "%matplotlib inline\n", 41 | "import matplotlib.pyplot as plt\n", 42 | "import matplotlib\n", 43 | "matplotlib.style.use('seaborn')\n", 44 | "import numpy as np\n", 45 | "import os\n", 46 | "import cv2\n", 47 | "import imgaug as ia\n", 48 | "from imgaug import augmenters as iaa\n", 49 | "from preprocessing import parse_annotation, BatchGenerator\n", 50 | "from utils import WeightReader, decode_netout, draw_boxes" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Define and initialize global variables" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "LABELS = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']\n", 69 | "\n", 70 | "IMAGE_H, IMAGE_W = 416, 416\n", 71 | "GRID_H, GRID_W = 13 , 13\n", 72 | "BOX = 5\n", 73 | "CLASS = len(LABELS)\n", 74 | "CLASS_WEIGHTS = np.ones(CLASS, dtype='float32')\n", 75 | "OBJ_THRESHOLD = 0.3#0.5\n", 76 | "NMS_THRESHOLD = 0.3#0.45\n", 77 | "ANCHORS = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828]\n", 78 | "\n", 79 | "NO_OBJECT_SCALE = 1.0\n", 80 | "OBJECT_SCALE = 5.0\n", 81 | "COORD_SCALE = 1.0\n", 82 | "CLASS_SCALE = 1.0\n", 83 | "\n", 84 | "BATCH_SIZE = 16\n", 85 | "WARM_UP_BATCHES = 0\n", 86 | "TRUE_BOX_BUFFER = 50\n", 87 | "\n", 88 | "\n", 89 | "ALPHA = 0.1" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 3, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "pre_trained_weights='weights/yolo.weights'\n", 101 | "train_image_folder = '/home/abanihi/Documents/deep-data/coco/images/train2014/'\n", 102 | "train_annot_folder = '/home/abanihi/Documents/deep-data/coco/train2014ann/'\n", 103 | "val_image_folder = '/home/abanihi/Documents/deep-data/coco/images/val2014/'\n", 104 | "val_annot_folder = '/home/abanihi/Documents/deep-data/coco/val2014ann/'" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Construct the 
Network" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 4, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "# the function to implement the orgnization layer (thanks to github.com/allanzelener/YAD2K)\n", 123 | "def space_to_depth_x2(x):\n", 124 | " return tf.space_to_depth(x, block_size=2)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 5, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "input_image = layers.Input(shape=(IMAGE_H, IMAGE_W, 3))\n", 136 | "true_boxes = layers.Input(shape=(1, 1, 1, TRUE_BOX_BUFFER , 4))" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 6, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "def yolo():\n", 148 | " \n", 149 | " \n", 150 | " # Layer 1\n", 151 | " x = layers.Conv2D(32, (3, 3), strides=(1, 1), \n", 152 | " padding='same', name='conv_1', use_bias=False)(input_image)\n", 153 | " x = layers.BatchNormalization(name='norm_1')(x)\n", 154 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 155 | " x = layers.MaxPool2D(pool_size=(2,2))(x)\n", 156 | " \n", 157 | " # Layer 2\n", 158 | " x = layers.Conv2D(64, (3, 3), strides=(1, 1), padding='same', name='conv_2', use_bias=False)(x)\n", 159 | " x = layers.BatchNormalization(name='norm_2')(x)\n", 160 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 161 | " x = layers.MaxPooling2D(pool_size=(2, 2))(x)\n", 162 | " \n", 163 | " \n", 164 | " # Layer 3\n", 165 | " x = layers.Conv2D(128, (3, 3), strides=(1, 1), padding='same', name='conv_3', use_bias=False)(x)\n", 166 | " x = layers.BatchNormalization(name='norm_3')(x)\n", 167 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 168 | " \n", 169 | " # Layer 4 \n", 170 | " x = layers.Conv2D(64, (1, 1), strides=(1, 1), padding='same', name='conv_4', use_bias=False)(x)\n", 171 | " x = layers.BatchNormalization(name='norm_4')(x)\n", 172 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 173 | " \n", 174 | " # Layer 5\n", 175 | " x = layers.Conv2D(128, (3, 3), strides=(1, 1), padding='same', name='conv_5', use_bias=False)(x)\n", 176 | " x = layers.BatchNormalization(name='norm_5')(x)\n", 177 | " x= layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 178 | " x = layers.MaxPooling2D(pool_size=(2, 2))(x)\n", 179 | " \n", 180 | " # Layer 6\n", 181 | " x = layers.Conv2D(256, (3, 3), strides=(1, 1), padding='same', name='conv_6', use_bias=False)(x)\n", 182 | " x = layers.BatchNormalization(name='norm_6')(x)\n", 183 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 184 | " \n", 185 | " \n", 186 | " # Layer 7\n", 187 | " x = layers.Conv2D(128, (1, 1), strides=(1, 1), padding='same', name='conv_7', use_bias=False)(x)\n", 188 | " x= layers.BatchNormalization(name='norm_7')(x)\n", 189 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 190 | " \n", 191 | " # Layer 8\n", 192 | " x = layers.Conv2D(256, (3, 3), strides=(1, 1), padding='same', name='conv_8', use_bias=False)(x)\n", 193 | " x = layers.BatchNormalization(name='norm_8')(x)\n", 194 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 195 | " x = layers.MaxPooling2D(pool_size=(2, 2))(x)\n", 196 | " \n", 197 | " # Layer 9\n", 198 | " x = layers.Conv2D(512, (3, 3), strides=(1, 1), padding='same', name='conv_9', use_bias=False)(x)\n", 199 | " x = 
layers.BatchNormalization(name='norm_9')(x)\n", 200 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 201 | " \n", 202 | " # Layer 10\n", 203 | " x = layers.Conv2D(256, (1, 1), strides=(1, 1), padding='same', name='conv_10', use_bias=False)(x)\n", 204 | " x = layers.BatchNormalization(name='norm_10')(x)\n", 205 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 206 | " \n", 207 | " # Layer 11\n", 208 | " x = layers.Conv2D(512, (3, 3), strides=(1, 1), padding='same', name='conv_11', use_bias=False)(x)\n", 209 | " x = layers.BatchNormalization(name='norm_11')(x)\n", 210 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 211 | " \n", 212 | " \n", 213 | " # Layer 12\n", 214 | " x = layers.Conv2D(256, (1, 1), strides=(1, 1), padding='same', name='conv_12', use_bias=False)(x)\n", 215 | " x = layers.BatchNormalization(name='norm_12')(x)\n", 216 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 217 | " \n", 218 | " # Layer 13\n", 219 | " x = layers.Conv2D(512, (3, 3), strides=(1, 1), padding='same', name='conv_13', use_bias=False)(x)\n", 220 | " x = layers.BatchNormalization(name='norm_13')(x)\n", 221 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 222 | " \n", 223 | " \n", 224 | " skip_connection = x\n", 225 | " \n", 226 | " x = layers.MaxPool2D(pool_size=(2, 2))(x)\n", 227 | " \n", 228 | " # Layer 14\n", 229 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_14', use_bias=False)(x)\n", 230 | " x = layers.BatchNormalization(name='norm_14')(x)\n", 231 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 232 | " \n", 233 | " # Layer 15\n", 234 | " x = layers.Conv2D(512, (1, 1), strides=(1, 1), padding='same', name='conv_15', use_bias=False)(x)\n", 235 | " x = layers.BatchNormalization(name='norm_15')(x)\n", 236 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 237 | " \n", 238 | " # Layer 16\n", 239 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_16', use_bias=False)(x)\n", 240 | " x = layers.BatchNormalization(name='norm_16')(x)\n", 241 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 242 | " \n", 243 | " # Layer 17\n", 244 | " x = layers.Conv2D(512, (1, 1), strides=(1, 1), padding='same', name='conv_17', use_bias=False)(x)\n", 245 | " x = layers.BatchNormalization(name='norm_17')(x)\n", 246 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 247 | " \n", 248 | " # Layer 18\n", 249 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_18', use_bias=False)(x)\n", 250 | " x = layers.BatchNormalization(name='norm_18')(x)\n", 251 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 252 | " \n", 253 | " # Layer 19\n", 254 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_19', use_bias=False)(x)\n", 255 | " x = layers.BatchNormalization(name='norm_19')(x)\n", 256 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 257 | " \n", 258 | " # Layer 20\n", 259 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_20', use_bias=False)(x)\n", 260 | " x = layers.BatchNormalization(name='norm_20')(x)\n", 261 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 262 | " \n", 263 | " \n", 264 | " # Layer 21\n", 265 | " skip_connection = layers.Conv2D(64, (1, 1), strides=(1, 1), \n", 266 | " padding='same', name='conv_21', use_bias=False)(skip_connection)\n", 267 | " skip_connection = 
layers.BatchNormalization(name='norm_21')(skip_connection)\n", 268 | " skip_connection = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(skip_connection)\n", 269 | " skip_connection = layers.Lambda(space_to_depth_x2)(skip_connection)\n", 270 | " \n", 271 | " x = layers.concatenate([skip_connection, x])\n", 272 | " \n", 273 | " # Layer 22\n", 274 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_22',\n", 275 | " use_bias=False)(x)\n", 276 | " x = layers.BatchNormalization(name='norm_22')(x)\n", 277 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 278 | " \n", 279 | " # Layer 23\n", 280 | " x = layers.Conv2D((4 + 1 + CLASS) * 5, (1,1), strides=(1,1), padding='same', name='conv_23')(x)\n", 281 | " output = layers.Reshape((GRID_H, GRID_W, BOX, 4 + 1 + CLASS))(x)\n", 282 | " \n", 283 | " # small hack to allow true_boxes to be registered when Keras build the model \n", 284 | " # for more information: https://github.com/fchollet/keras/issues/2790\n", 285 | " output = layers.Lambda(lambda args: args[0])([output, true_boxes])\n", 286 | " \n", 287 | " model = models.Model([input_image, true_boxes], output)\n", 288 | " \n", 289 | " \n", 290 | " return model\n" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 7, 296 | "metadata": {}, 297 | "outputs": [ 298 | { 299 | "name": "stdout", 300 | "output_type": "stream", 301 | "text": [ 302 | "____________________________________________________________________________________________________\n", 303 | "Layer (type) Output Shape Param # Connected to \n", 304 | "====================================================================================================\n", 305 | "input_1 (InputLayer) (None, 416, 416, 3) 0 \n", 306 | "____________________________________________________________________________________________________\n", 307 | "conv_1 (Conv2D) (None, 416, 416, 32) 864 input_1[0][0] \n", 308 | "____________________________________________________________________________________________________\n", 309 | "norm_1 (BatchNormalization) (None, 416, 416, 32) 128 conv_1[0][0] \n", 310 | "____________________________________________________________________________________________________\n", 311 | "leaky_re_lu_1 (LeakyReLU) (None, 416, 416, 32) 0 norm_1[0][0] \n", 312 | "____________________________________________________________________________________________________\n", 313 | "max_pooling2d_1 (MaxPooling2D) (None, 208, 208, 32) 0 leaky_re_lu_1[0][0] \n", 314 | "____________________________________________________________________________________________________\n", 315 | "conv_2 (Conv2D) (None, 208, 208, 64) 18432 max_pooling2d_1[0][0] \n", 316 | "____________________________________________________________________________________________________\n", 317 | "norm_2 (BatchNormalization) (None, 208, 208, 64) 256 conv_2[0][0] \n", 318 | "____________________________________________________________________________________________________\n", 319 | "leaky_re_lu_2 (LeakyReLU) (None, 208, 208, 64) 0 norm_2[0][0] \n", 320 | "____________________________________________________________________________________________________\n", 321 | "max_pooling2d_2 (MaxPooling2D) (None, 104, 104, 64) 0 leaky_re_lu_2[0][0] \n", 322 | "____________________________________________________________________________________________________\n", 323 | "conv_3 (Conv2D) (None, 104, 104, 128) 73728 max_pooling2d_2[0][0] \n", 324 | 
"____________________________________________________________________________________________________\n", 325 | "norm_3 (BatchNormalization) (None, 104, 104, 128) 512 conv_3[0][0] \n", 326 | "____________________________________________________________________________________________________\n", 327 | "leaky_re_lu_3 (LeakyReLU) (None, 104, 104, 128) 0 norm_3[0][0] \n", 328 | "____________________________________________________________________________________________________\n", 329 | "conv_4 (Conv2D) (None, 104, 104, 64) 8192 leaky_re_lu_3[0][0] \n", 330 | "____________________________________________________________________________________________________\n", 331 | "norm_4 (BatchNormalization) (None, 104, 104, 64) 256 conv_4[0][0] \n", 332 | "____________________________________________________________________________________________________\n", 333 | "leaky_re_lu_4 (LeakyReLU) (None, 104, 104, 64) 0 norm_4[0][0] \n", 334 | "____________________________________________________________________________________________________\n", 335 | "conv_5 (Conv2D) (None, 104, 104, 128) 73728 leaky_re_lu_4[0][0] \n", 336 | "____________________________________________________________________________________________________\n", 337 | "norm_5 (BatchNormalization) (None, 104, 104, 128) 512 conv_5[0][0] \n", 338 | "____________________________________________________________________________________________________\n", 339 | "leaky_re_lu_5 (LeakyReLU) (None, 104, 104, 128) 0 norm_5[0][0] \n", 340 | "____________________________________________________________________________________________________\n", 341 | "max_pooling2d_3 (MaxPooling2D) (None, 52, 52, 128) 0 leaky_re_lu_5[0][0] \n", 342 | "____________________________________________________________________________________________________\n", 343 | "conv_6 (Conv2D) (None, 52, 52, 256) 294912 max_pooling2d_3[0][0] \n", 344 | "____________________________________________________________________________________________________\n", 345 | "norm_6 (BatchNormalization) (None, 52, 52, 256) 1024 conv_6[0][0] \n", 346 | "____________________________________________________________________________________________________\n", 347 | "leaky_re_lu_6 (LeakyReLU) (None, 52, 52, 256) 0 norm_6[0][0] \n", 348 | "____________________________________________________________________________________________________\n", 349 | "conv_7 (Conv2D) (None, 52, 52, 128) 32768 leaky_re_lu_6[0][0] \n", 350 | "____________________________________________________________________________________________________\n", 351 | "norm_7 (BatchNormalization) (None, 52, 52, 128) 512 conv_7[0][0] \n", 352 | "____________________________________________________________________________________________________\n", 353 | "leaky_re_lu_7 (LeakyReLU) (None, 52, 52, 128) 0 norm_7[0][0] \n", 354 | "____________________________________________________________________________________________________\n", 355 | "conv_8 (Conv2D) (None, 52, 52, 256) 294912 leaky_re_lu_7[0][0] \n", 356 | "____________________________________________________________________________________________________\n", 357 | "norm_8 (BatchNormalization) (None, 52, 52, 256) 1024 conv_8[0][0] \n", 358 | "____________________________________________________________________________________________________\n", 359 | "leaky_re_lu_8 (LeakyReLU) (None, 52, 52, 256) 0 norm_8[0][0] \n", 360 | "____________________________________________________________________________________________________\n", 361 | "max_pooling2d_4 (MaxPooling2D) (None, 26, 26, 256) 0 
leaky_re_lu_8[0][0] \n", 362 | "____________________________________________________________________________________________________\n", 363 | "conv_9 (Conv2D) (None, 26, 26, 512) 1179648 max_pooling2d_4[0][0] \n", 364 | "____________________________________________________________________________________________________\n", 365 | "norm_9 (BatchNormalization) (None, 26, 26, 512) 2048 conv_9[0][0] \n", 366 | "____________________________________________________________________________________________________\n", 367 | "leaky_re_lu_9 (LeakyReLU) (None, 26, 26, 512) 0 norm_9[0][0] \n", 368 | "____________________________________________________________________________________________________\n", 369 | "conv_10 (Conv2D) (None, 26, 26, 256) 131072 leaky_re_lu_9[0][0] \n", 370 | "____________________________________________________________________________________________________\n", 371 | "norm_10 (BatchNormalization) (None, 26, 26, 256) 1024 conv_10[0][0] \n", 372 | "____________________________________________________________________________________________________\n", 373 | "leaky_re_lu_10 (LeakyReLU) (None, 26, 26, 256) 0 norm_10[0][0] \n", 374 | "____________________________________________________________________________________________________\n", 375 | "conv_11 (Conv2D) (None, 26, 26, 512) 1179648 leaky_re_lu_10[0][0] \n", 376 | "____________________________________________________________________________________________________\n", 377 | "norm_11 (BatchNormalization) (None, 26, 26, 512) 2048 conv_11[0][0] \n", 378 | "____________________________________________________________________________________________________\n", 379 | "leaky_re_lu_11 (LeakyReLU) (None, 26, 26, 512) 0 norm_11[0][0] \n", 380 | "____________________________________________________________________________________________________\n", 381 | "conv_12 (Conv2D) (None, 26, 26, 256) 131072 leaky_re_lu_11[0][0] \n", 382 | "____________________________________________________________________________________________________\n", 383 | "norm_12 (BatchNormalization) (None, 26, 26, 256) 1024 conv_12[0][0] \n", 384 | "____________________________________________________________________________________________________\n", 385 | "leaky_re_lu_12 (LeakyReLU) (None, 26, 26, 256) 0 norm_12[0][0] \n", 386 | "____________________________________________________________________________________________________\n", 387 | "conv_13 (Conv2D) (None, 26, 26, 512) 1179648 leaky_re_lu_12[0][0] \n", 388 | "____________________________________________________________________________________________________\n", 389 | "norm_13 (BatchNormalization) (None, 26, 26, 512) 2048 conv_13[0][0] \n", 390 | "____________________________________________________________________________________________________\n", 391 | "leaky_re_lu_13 (LeakyReLU) (None, 26, 26, 512) 0 norm_13[0][0] \n", 392 | "____________________________________________________________________________________________________\n", 393 | "max_pooling2d_5 (MaxPooling2D) (None, 13, 13, 512) 0 leaky_re_lu_13[0][0] \n", 394 | "____________________________________________________________________________________________________\n", 395 | "conv_14 (Conv2D) (None, 13, 13, 1024) 4718592 max_pooling2d_5[0][0] \n", 396 | "____________________________________________________________________________________________________\n", 397 | "norm_14 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_14[0][0] \n", 398 | 
"____________________________________________________________________________________________________\n", 399 | "leaky_re_lu_14 (LeakyReLU) (None, 13, 13, 1024) 0 norm_14[0][0] \n", 400 | "____________________________________________________________________________________________________\n", 401 | "conv_15 (Conv2D) (None, 13, 13, 512) 524288 leaky_re_lu_14[0][0] \n", 402 | "____________________________________________________________________________________________________\n", 403 | "norm_15 (BatchNormalization) (None, 13, 13, 512) 2048 conv_15[0][0] \n", 404 | "____________________________________________________________________________________________________\n", 405 | "leaky_re_lu_15 (LeakyReLU) (None, 13, 13, 512) 0 norm_15[0][0] \n", 406 | "____________________________________________________________________________________________________\n", 407 | "conv_16 (Conv2D) (None, 13, 13, 1024) 4718592 leaky_re_lu_15[0][0] \n", 408 | "____________________________________________________________________________________________________\n", 409 | "norm_16 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_16[0][0] \n", 410 | "____________________________________________________________________________________________________\n", 411 | "leaky_re_lu_16 (LeakyReLU) (None, 13, 13, 1024) 0 norm_16[0][0] \n", 412 | "____________________________________________________________________________________________________\n", 413 | "conv_17 (Conv2D) (None, 13, 13, 512) 524288 leaky_re_lu_16[0][0] \n", 414 | "____________________________________________________________________________________________________\n", 415 | "norm_17 (BatchNormalization) (None, 13, 13, 512) 2048 conv_17[0][0] \n", 416 | "____________________________________________________________________________________________________\n", 417 | "leaky_re_lu_17 (LeakyReLU) (None, 13, 13, 512) 0 norm_17[0][0] \n", 418 | "____________________________________________________________________________________________________\n", 419 | "conv_18 (Conv2D) (None, 13, 13, 1024) 4718592 leaky_re_lu_17[0][0] \n", 420 | "____________________________________________________________________________________________________\n", 421 | "norm_18 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_18[0][0] \n", 422 | "____________________________________________________________________________________________________\n", 423 | "leaky_re_lu_18 (LeakyReLU) (None, 13, 13, 1024) 0 norm_18[0][0] \n", 424 | "____________________________________________________________________________________________________\n", 425 | "conv_19 (Conv2D) (None, 13, 13, 1024) 9437184 leaky_re_lu_18[0][0] \n", 426 | "____________________________________________________________________________________________________\n", 427 | "norm_19 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_19[0][0] \n", 428 | "____________________________________________________________________________________________________\n", 429 | "conv_21 (Conv2D) (None, 26, 26, 64) 32768 leaky_re_lu_13[0][0] \n", 430 | "____________________________________________________________________________________________________\n", 431 | "leaky_re_lu_19 (LeakyReLU) (None, 13, 13, 1024) 0 norm_19[0][0] \n", 432 | "____________________________________________________________________________________________________\n", 433 | "norm_21 (BatchNormalization) (None, 26, 26, 64) 256 conv_21[0][0] \n", 434 | "____________________________________________________________________________________________________\n", 435 | "conv_20 (Conv2D) (None, 
13, 13, 1024) 9437184 leaky_re_lu_19[0][0] \n", 436 | "____________________________________________________________________________________________________\n", 437 | "leaky_re_lu_21 (LeakyReLU) (None, 26, 26, 64) 0 norm_21[0][0] \n", 438 | "____________________________________________________________________________________________________\n", 439 | "norm_20 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_20[0][0] \n", 440 | "____________________________________________________________________________________________________\n", 441 | "lambda_1 (Lambda) (None, 13, 13, 256) 0 leaky_re_lu_21[0][0] \n", 442 | "____________________________________________________________________________________________________\n", 443 | "leaky_re_lu_20 (LeakyReLU) (None, 13, 13, 1024) 0 norm_20[0][0] \n", 444 | "____________________________________________________________________________________________________\n", 445 | "concatenate_1 (Concatenate) (None, 13, 13, 1280) 0 lambda_1[0][0] \n", 446 | " leaky_re_lu_20[0][0] \n", 447 | "____________________________________________________________________________________________________\n", 448 | "conv_22 (Conv2D) (None, 13, 13, 1024) 11796480 concatenate_1[0][0] \n", 449 | "____________________________________________________________________________________________________\n", 450 | "norm_22 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_22[0][0] \n", 451 | "____________________________________________________________________________________________________\n", 452 | "leaky_re_lu_22 (LeakyReLU) (None, 13, 13, 1024) 0 norm_22[0][0] \n", 453 | "____________________________________________________________________________________________________\n", 454 | "conv_23 (Conv2D) (None, 13, 13, 425) 435625 leaky_re_lu_22[0][0] \n", 455 | "____________________________________________________________________________________________________\n", 456 | "reshape_1 (Reshape) (None, 13, 13, 5, 85) 0 conv_23[0][0] \n", 457 | "____________________________________________________________________________________________________\n", 458 | "input_2 (InputLayer) (None, 1, 1, 1, 50, 4 0 \n", 459 | "____________________________________________________________________________________________________\n", 460 | "lambda_2 (Lambda) (None, 13, 13, 5, 85) 0 reshape_1[0][0] \n", 461 | " input_2[0][0] \n", 462 | "====================================================================================================\n", 463 | "Total params: 50,983,561\n", 464 | "Trainable params: 50,962,889\n", 465 | "Non-trainable params: 20,672\n", 466 | "____________________________________________________________________________________________________\n" 467 | ] 468 | } 469 | ], 470 | "source": [ 471 | "model = yolo()\n", 472 | "model.summary()" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 8, 478 | "metadata": { 479 | "collapsed": true 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "plot_model(model, to_file='model.png')" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "Total params: 50,983,561\n", 491 | "Trainable params: 50,962,889\n", 492 | "Non-trainable params: 20,672" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "## Load Pretrained weights\n", 500 | "\n", 501 | "Load the weights originally provided by YOLO" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 9, 507 | "metadata": { 508 | "collapsed": true 509 | }, 510 | "outputs": 
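The summary above confirms the detection head: conv_23 emits a 13×13×425 map that reshape_1 turns into (13, 13, 5, 85), i.e. 5 anchor boxes per grid cell, each carrying 4 box coordinates, 1 objectness score, and 80 class logits. A minimal numpy sketch of how one cell's vector splits into those fields (the slicing mirrors what `custom_loss` later does on `y_pred`; `netout` here is a stand-in array, not a real network output):

```python
import numpy as np

GRID_H, GRID_W, BOX, CLASS = 13, 13, 5, 80

# stand-in for the reshaped head output of a single image: (13, 13, 5, 85)
netout = np.random.randn(GRID_H, GRID_W, BOX, 4 + 1 + CLASS)

cell = netout[6, 6, 0]   # first anchor box of the center cell
t_xy  = cell[0:2]        # raw x, y   -> sigmoid(t_xy) + cell index
t_wh  = cell[2:4]        # raw w, h   -> exp(t_wh) * anchor prior
t_obj = cell[4]          # raw objectness -> sigmoid(t_obj)
t_cls = cell[5:]         # 80 raw class logits -> softmax

print(t_xy.shape, t_wh.shape, t_cls.shape)   # (2,) (2,) (80,)
```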
[], 511 | "source": [ 512 | "weight_reader = WeightReader(pre_trained_weights)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 10, 518 | "metadata": { 519 | "collapsed": true 520 | }, 521 | "outputs": [], 522 | "source": [ 523 | "weight_reader.reset()\n", 524 | "nb_conv = 23" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 11, 530 | "metadata": { 531 | "collapsed": true 532 | }, 533 | "outputs": [], 534 | "source": [ 535 | "for i in range(1, nb_conv+1):\n", 536 | " conv_layer = model.get_layer('conv_' + str(i))\n", 537 | " \n", 538 | " if i < nb_conv:\n", 539 | " norm_layer = model.get_layer('norm_' + str(i))\n", 540 | " \n", 541 | " size = np.prod(norm_layer.get_weights()[0].shape)\n", 542 | " \n", 543 | " beta = weight_reader.read_bytes(size)\n", 544 | " gamma = weight_reader.read_bytes(size)\n", 545 | " mean = weight_reader.read_bytes(size)\n", 546 | " var = weight_reader.read_bytes(size)\n", 547 | " \n", 548 | " weights = norm_layer.set_weights([gamma, beta, mean, var])\n", 549 | " \n", 550 | " if len(conv_layer.get_weights()) > 1:\n", 551 | " bias = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[1].shape))\n", 552 | " kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))\n", 553 | " kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))\n", 554 | " kernel = kernel.transpose([2,3,1,0])\n", 555 | " conv_layer.set_weights([kernel, bias])\n", 556 | " \n", 557 | " else:\n", 558 | " kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))\n", 559 | " kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))\n", 560 | " kernel = kernel.transpose([2,3,1,0])\n", 561 | " conv_layer.set_weights([kernel])" 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": { 567 | "collapsed": true 568 | }, 569 | "source": [ 570 | "## Randomize weights of the last layer" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 12, 576 | "metadata": { 577 | "collapsed": true 578 | }, 579 | "outputs": [], 580 | "source": [ 581 | "# Get last convolutional layer\n", 582 | "layer = model.layers[-4] \n", 583 | "weights = layer.get_weights()\n", 584 | "\n", 585 | "new_kernel = np.random.normal(size=weights[0].shape) / (GRID_H*GRID_W)\n", 586 | "new_bias = np.random.normal(size=weights[1].shape) / (GRID_H*GRID_W)\n", 587 | "\n", 588 | "layer.set_weights([new_kernel, new_bias])" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": {}, 594 | "source": [ 595 | "## Training" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "metadata": {}, 601 | "source": [ 602 | "### Loss Function\n", 603 | "\n", 604 | "![](images/custom-loss.png)\n", 605 | "\n", 606 | "![](images/custom-loss2.png)\n" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 54, 612 | "metadata": { 613 | "collapsed": true 614 | }, 615 | "outputs": [], 616 | "source": [ 617 | "\n", 618 | "\n", 619 | "def custom_loss(y_true, y_pred):\n", 620 | " mask_shape = tf.shape(y_true)[:4]\n", 621 | " \n", 622 | " cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(GRID_W), [GRID_H]), (1, GRID_H, GRID_W, 1, 1)))\n", 623 | " cell_y = tf.transpose(cell_x, (0,2,1,3,4))\n", 624 | "\n", 625 | " cell_grid = tf.tile(tf.concat([cell_x,cell_y], -1), [BATCH_SIZE, 1, 1, 5, 1])\n", 626 | " \n", 627 | " coord_mask = tf.zeros(mask_shape)\n", 628 | " conf_mask = tf.zeros(mask_shape)\n", 629 | " class_mask = tf.zeros(mask_shape)\n", 630 | " \n", 
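The `cell_grid` tensor built at the top of `custom_loss` is what converts the per-cell sigmoid offsets into absolute grid coordinates: it holds the (column, row) index of every cell, tiled over the batch and the 5 anchors. A numpy sketch of the same construction, which is easier to inspect than the TensorFlow version (shapes assume the notebook's 13×13 grid and batch size 16):

```python
import numpy as np

GRID_H, GRID_W, BOX, BATCH_SIZE = 13, 13, 5, 16

# column index of every cell: shape (1, GRID_H, GRID_W, 1, 1)
cell_x = np.reshape(np.tile(np.arange(GRID_W), GRID_H),
                    (1, GRID_H, GRID_W, 1, 1)).astype(np.float32)
# row index: swap the two spatial axes (valid because GRID_H == GRID_W)
cell_y = np.transpose(cell_x, (0, 2, 1, 3, 4))

# (BATCH_SIZE, GRID_H, GRID_W, BOX, 2); last axis is (col, row) of the cell
cell_grid = np.tile(np.concatenate([cell_x, cell_y], axis=-1),
                    [BATCH_SIZE, 1, 1, BOX, 1])

print(cell_grid.shape)        # (16, 13, 13, 5, 2)
print(cell_grid[0, 2, 7, 0])  # cell at row 2, col 7 -> [7. 2.]
```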
631 | " seen = tf.Variable(0.)\n", 632 | " \n", 633 | " total_AP = tf.Variable(0.)\n", 634 | " \n", 635 | " \"\"\"\n", 636 | " Adjust prediction\n", 637 | " \"\"\"\n", 638 | " ### adjust x and y \n", 639 | " pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid\n", 640 | " \n", 641 | " ### adjust w and h\n", 642 | " pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(ANCHORS, [1,1,1,BOX,2])\n", 643 | " \n", 644 | " ### adjust confidence\n", 645 | " pred_box_conf = tf.sigmoid(y_pred[..., 4])\n", 646 | " \n", 647 | " ### adjust class probabilities\n", 648 | " pred_box_class = y_pred[..., 5:]\n", 649 | " \n", 650 | " \"\"\"\n", 651 | " Adjust ground truth\n", 652 | " \"\"\"\n", 653 | " ### adjust x and y\n", 654 | " true_box_xy = y_true[..., 0:2] # relative position to the containing cell\n", 655 | " \n", 656 | " ### adjust w and h\n", 657 | " true_box_wh = y_true[..., 2:4] # number of cells accross, horizontally and vertically\n", 658 | " \n", 659 | " ### adjust confidence\n", 660 | " true_wh_half = true_box_wh / 2.\n", 661 | " true_mins = true_box_xy - true_wh_half\n", 662 | " true_maxes = true_box_xy + true_wh_half\n", 663 | " \n", 664 | " pred_wh_half = pred_box_wh / 2.\n", 665 | " pred_mins = pred_box_xy - pred_wh_half\n", 666 | " pred_maxes = pred_box_xy + pred_wh_half \n", 667 | " \n", 668 | " intersect_mins = tf.maximum(pred_mins, true_mins)\n", 669 | " intersect_maxes = tf.minimum(pred_maxes, true_maxes)\n", 670 | " intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)\n", 671 | " intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]\n", 672 | " \n", 673 | " true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]\n", 674 | " pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]\n", 675 | "\n", 676 | " union_areas = pred_areas + true_areas - intersect_areas\n", 677 | " iou_scores = tf.truediv(intersect_areas, union_areas)\n", 678 | " \n", 679 | " true_box_conf = iou_scores * y_true[..., 4]\n", 680 | " \n", 681 | " ### adjust class probabilities\n", 682 | " true_box_class = tf.to_int32(y_true[..., 5])\n", 683 | " \n", 684 | " \"\"\"\n", 685 | " Determine the masks\n", 686 | " \"\"\"\n", 687 | " ### coordinate mask: simply the position of the ground truth boxes (the predictors)\n", 688 | " coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * COORD_SCALE\n", 689 | " \n", 690 | " ### confidence mask: penelize predictors + penalize boxes with low IOU\n", 691 | " # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6\n", 692 | " true_xy = true_boxes[..., 0:2]\n", 693 | " true_wh = true_boxes[..., 2:4]\n", 694 | " \n", 695 | " true_wh_half = true_wh / 2.\n", 696 | " true_mins = true_xy - true_wh_half\n", 697 | " true_maxes = true_xy + true_wh_half\n", 698 | " \n", 699 | " pred_xy = tf.expand_dims(pred_box_xy, 4)\n", 700 | " pred_wh = tf.expand_dims(pred_box_wh, 4)\n", 701 | " \n", 702 | " pred_wh_half = pred_wh / 2.\n", 703 | " pred_mins = pred_xy - pred_wh_half\n", 704 | " pred_maxes = pred_xy + pred_wh_half \n", 705 | " \n", 706 | " intersect_mins = tf.maximum(pred_mins, true_mins)\n", 707 | " intersect_maxes = tf.minimum(pred_maxes, true_maxes)\n", 708 | " intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)\n", 709 | " intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]\n", 710 | " \n", 711 | " true_areas = true_wh[..., 0] * true_wh[..., 1]\n", 712 | " pred_areas = pred_wh[..., 0] * pred_wh[..., 1]\n", 713 | "\n", 714 | " union_areas = pred_areas + true_areas - intersect_areas\n", 715 | " iou_scores = 
tf.truediv(intersect_areas, union_areas)\n", 716 | "\n", 717 | " best_ious = tf.reduce_max(iou_scores, axis=4)\n", 718 | " conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * NO_OBJECT_SCALE\n", 719 | " \n", 720 | " # penalize the confidence of the boxes, which are reponsible for corresponding ground truth box\n", 721 | " conf_mask = conf_mask + y_true[..., 4] * OBJECT_SCALE\n", 722 | " \n", 723 | " ### class mask: simply the position of the ground truth boxes (the predictors)\n", 724 | " class_mask = y_true[..., 4] * tf.gather(CLASS_WEIGHTS, true_box_class) * CLASS_SCALE \n", 725 | " \n", 726 | " \"\"\"\n", 727 | " Warm-up training\n", 728 | " \"\"\"\n", 729 | " no_boxes_mask = tf.to_float(coord_mask < COORD_SCALE/2.)\n", 730 | " seen = tf.assign_add(seen, 1.)\n", 731 | " \n", 732 | " true_box_xy, true_box_wh, coord_mask = tf.cond(tf.less(seen, WARM_UP_BATCHES), \n", 733 | " lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask, \n", 734 | " true_box_wh + tf.ones_like(true_box_wh) * np.reshape(ANCHORS, [1,1,1,BOX,2]) * no_boxes_mask, \n", 735 | " tf.ones_like(coord_mask)],\n", 736 | " lambda: [true_box_xy, \n", 737 | " true_box_wh,\n", 738 | " coord_mask])\n", 739 | " \n", 740 | " \"\"\"\n", 741 | " Finalize the loss\n", 742 | " \"\"\"\n", 743 | " nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))\n", 744 | " nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))\n", 745 | " nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))\n", 746 | " \n", 747 | " loss_xy = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2.\n", 748 | " loss_wh = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2.\n", 749 | " loss_conf = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2.\n", 750 | " loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)\n", 751 | " loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)\n", 752 | " \n", 753 | " loss = loss_xy + loss_wh + loss_conf + loss_class\n", 754 | " \n", 755 | " nb_true_box = tf.reduce_sum(y_true[..., 4])\n", 756 | " nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > OBJ_THRESHOLD))\n", 757 | " \n", 758 | " total_AP = tf.assign_add(total_AP, nb_pred_box/nb_true_box) \n", 759 | " \n", 760 | " loss = tf.Print(loss, [loss_xy, loss_wh, loss_conf, loss_class, loss, total_AP/seen], message='DEBUG', summarize=1000)\n", 761 | " \n", 762 | " return loss\n", 763 | "\n" 764 | ] 765 | }, 766 | { 767 | "cell_type": "markdown", 768 | "metadata": {}, 769 | "source": [ 770 | "### Parse the annotations to construct train generator and validation generator" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": 14, 776 | "metadata": { 777 | "collapsed": true 778 | }, 779 | "outputs": [], 780 | "source": [ 781 | "generator_config = {\n", 782 | " 'IMAGE_H' : IMAGE_H, \n", 783 | " 'IMAGE_W' : IMAGE_W,\n", 784 | " 'GRID_H' : GRID_H, \n", 785 | " 'GRID_W' : GRID_W,\n", 786 | " 'BOX' : BOX,\n", 787 | " 'LABELS' : LABELS,\n", 788 | " 'CLASS' : len(LABELS),\n", 789 | " 'ANCHORS' : ANCHORS,\n", 790 | " 'BATCH_SIZE' : BATCH_SIZE,\n", 791 | " 'TRUE_BOX_BUFFER' : 50,\n", 792 | "}\n", 793 | "\n" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": 16, 799 | "metadata": {}, 800 | "outputs": [ 801 | { 802 | "name": "stdout", 803 | "output_type": "stream", 804 | "text": [ 
805 | "CPU times: user 26.6 s, sys: 5.42 s, total: 32 s\n", 806 | "Wall time: 11min 35s\n" 807 | ] 808 | } 809 | ], 810 | "source": [ 811 | "%%time\n", 812 | "train_imgs, seen_train_labels = parse_annotation(train_annot_folder, train_image_folder, labels=LABELS)" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": 39, 818 | "metadata": { 819 | "collapsed": true 820 | }, 821 | "outputs": [], 822 | "source": [ 823 | "import os\n", 824 | "import cv2\n", 825 | "import copy\n", 826 | "import numpy as np\n", 827 | "import imgaug as ia\n", 828 | "from imgaug import augmenters as iaa\n", 829 | "import xml.etree.ElementTree as ET\n", 830 | "from utils import BoundBox, normalize, bbox_iou" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 43, 836 | "metadata": { 837 | "collapsed": true 838 | }, 839 | "outputs": [], 840 | "source": [ 841 | "class BatchGenerator:\n", 842 | " def __init__(self, images, \n", 843 | " config, \n", 844 | " shuffle=True, \n", 845 | " jitter=True, \n", 846 | " norm=True):\n", 847 | "\n", 848 | " self.images = images\n", 849 | " self.config = config\n", 850 | "\n", 851 | " self.shuffle = shuffle\n", 852 | " self.jitter = jitter\n", 853 | " self.norm = norm\n", 854 | " \n", 855 | "\n", 856 | " self.anchors = [BoundBox(0, 0, config['ANCHORS'][2*i], config['ANCHORS'][2*i+1]) for i in range(int(len(config['ANCHORS'])/2))]\n", 857 | "\n", 858 | " ### augmentors by https://github.com/aleju/imgaug\n", 859 | " sometimes = lambda aug: iaa.Sometimes(0.5, aug)\n", 860 | "\n", 861 | " # Define our sequence of augmentation steps that will be applied to every image\n", 862 | " # All augmenters with per_channel=0.5 will sample one value _per image_\n", 863 | " # in 50% of all cases. In all other cases they will sample new values\n", 864 | " # _per channel_.\n", 865 | " self.aug_pipe = iaa.Sequential(\n", 866 | " [\n", 867 | " # apply the following augmenters to most images\n", 868 | " #iaa.Fliplr(0.5), # horizontally flip 50% of all images\n", 869 | " #iaa.Flipud(0.2), # vertically flip 20% of all images\n", 870 | " #sometimes(iaa.Crop(percent=(0, 0.1))), # crop images by 0-10% of their height/width\n", 871 | " sometimes(iaa.Affine(\n", 872 | " #scale={\"x\": (0.8, 1.2), \"y\": (0.8, 1.2)}, # scale images to 80-120% of their size, individually per axis\n", 873 | " #translate_percent={\"x\": (-0.2, 0.2), \"y\": (-0.2, 0.2)}, # translate by -20 to +20 percent (per axis)\n", 874 | " #rotate=(-5, 5), # rotate by -45 to +45 degrees\n", 875 | " #shear=(-5, 5), # shear by -16 to +16 degrees\n", 876 | " #order=[0, 1], # use nearest neighbour or bilinear interpolation (fast)\n", 877 | " #cval=(0, 255), # if mode is constant, use a cval between 0 and 255\n", 878 | " #mode=ia.ALL # use any of scikit-image's warping modes (see 2nd image from the top for examples)\n", 879 | " )),\n", 880 | " # execute 0 to 5 of the following (less important) augmenters per image\n", 881 | " # don't execute all of them, as that would often be way too strong\n", 882 | " iaa.SomeOf((0, 5),\n", 883 | " [\n", 884 | " #sometimes(iaa.Superpixels(p_replace=(0, 1.0), n_segments=(20, 200))), # convert images into their superpixel representation\n", 885 | " iaa.OneOf([\n", 886 | " iaa.GaussianBlur((0, 3.0)), # blur images with a sigma between 0 and 3.0\n", 887 | " iaa.AverageBlur(k=(2, 7)), # blur image using local means with kernel sizes between 2 and 7\n", 888 | " iaa.MedianBlur(k=(3, 11)), # blur image using local medians with kernel sizes between 2 and 7\n", 889 | " ]),\n", 
890 | " iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)), # sharpen images\n", 891 | " #iaa.Emboss(alpha=(0, 1.0), strength=(0, 2.0)), # emboss images\n", 892 | " # search either for all edges or for directed edges\n", 893 | " #sometimes(iaa.OneOf([\n", 894 | " # iaa.EdgeDetect(alpha=(0, 0.7)),\n", 895 | " # iaa.DirectedEdgeDetect(alpha=(0, 0.7), direction=(0.0, 1.0)),\n", 896 | " #])),\n", 897 | " iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5), # add gaussian noise to images\n", 898 | " iaa.OneOf([\n", 899 | " iaa.Dropout((0.01, 0.1), per_channel=0.5), # randomly remove up to 10% of the pixels\n", 900 | " #iaa.CoarseDropout((0.03, 0.15), size_percent=(0.02, 0.05), per_channel=0.2),\n", 901 | " ]),\n", 902 | " #iaa.Invert(0.05, per_channel=True), # invert color channels\n", 903 | " iaa.Add((-10, 10), per_channel=0.5), # change brightness of images (by -10 to 10 of original value)\n", 904 | " iaa.Multiply((0.5, 1.5), per_channel=0.5), # change brightness of images (50-150% of original value)\n", 905 | " iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5), # improve or worsen the contrast\n", 906 | " #iaa.Grayscale(alpha=(0.0, 1.0)),\n", 907 | " #sometimes(iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25)), # move pixels locally around (with random strengths)\n", 908 | " #sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05))) # sometimes move parts of the image around\n", 909 | " ],\n", 910 | " random_order=True\n", 911 | " )\n", 912 | " ],\n", 913 | " random_order=True\n", 914 | " )\n", 915 | "\n", 916 | " if shuffle: np.random.shuffle(self.images)\n", 917 | "\n", 918 | " def get_generator(self):\n", 919 | " num_img = len(self.images)\n", 920 | " \n", 921 | " total_count = 0\n", 922 | " batch_count = 0\n", 923 | " \n", 924 | " x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'], self.config['IMAGE_W'], 3)) # input images\n", 925 | " b_batch = np.zeros((self.config['BATCH_SIZE'], 1 , 1 , 1 , self.config['TRUE_BOX_BUFFER'], 4)) # list of self.config['TRUE_self.config['BOX']_BUFFER'] GT boxes\n", 926 | " y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 4+1+1)) # desired network output\n", 927 | " \n", 928 | " while True:\n", 929 | " if total_count < num_img:\n", 930 | " train_instance = self.images[total_count]\n", 931 | "\n", 932 | " # augment input image and fix object's position and size\n", 933 | " img, all_objs = self.aug_image(train_instance, jitter=self.jitter)\n", 934 | " \n", 935 | " # construct output from object's x, y, w, h\n", 936 | " true_box_index = 0\n", 937 | " \n", 938 | " for obj in all_objs:\n", 939 | " if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in self.config['LABELS']:\n", 940 | " center_x = .5*(obj['xmin'] + obj['xmax'])\n", 941 | " center_x = center_x / (float(self.config['IMAGE_W']) / self.config['GRID_W'])\n", 942 | " center_y = .5*(obj['ymin'] + obj['ymax'])\n", 943 | " center_y = center_y / (float(self.config['IMAGE_H']) / self.config['GRID_H'])\n", 944 | "\n", 945 | " grid_x = int(np.floor(center_x))\n", 946 | " grid_y = int(np.floor(center_y))\n", 947 | "\n", 948 | " if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:\n", 949 | " obj_indx = self.config['LABELS'].index(obj['name'])\n", 950 | " \n", 951 | " center_w = (obj['xmax'] - obj['xmin']) / (float(self.config['IMAGE_W']) / self.config['GRID_W']) # unit: grid cell\n", 952 | " center_h = (obj['ymax'] - obj['ymin']) / (float(self.config['IMAGE_W']) / 
self.config['GRID_W']) # unit: grid cell\n", 953 | " \n", 954 | " box = [center_x, center_y, center_w, center_h]\n", 955 | "\n", 956 | " # find the anchor that best predicts this box\n", 957 | " best_anchor = -1\n", 958 | " max_iou = -1\n", 959 | " \n", 960 | " shifted_box = BoundBox(0, \n", 961 | " 0, \n", 962 | " center_w, \n", 963 | " center_h)\n", 964 | " \n", 965 | " for i in range(len(self.anchors)):\n", 966 | " anchor = self.anchors[i]\n", 967 | " iou = bbox_iou(shifted_box, anchor)\n", 968 | " \n", 969 | " if max_iou < iou:\n", 970 | " best_anchor = i\n", 971 | " max_iou = iou\n", 972 | " \n", 973 | " # assign ground truth x, y, w, h, confidence and class probs to y_batch\n", 974 | " y_batch[batch_count, grid_y, grid_x, best_anchor, 0:4] = box\n", 975 | " y_batch[batch_count, grid_y, grid_x, best_anchor, 4 ] = 1.\n", 976 | " y_batch[batch_count, grid_y, grid_x, best_anchor, 5 ] = obj_indx\n", 977 | " \n", 978 | " # assign the true box to b_batch\n", 979 | " b_batch[batch_count, 0, 0, 0, true_box_index] = box\n", 980 | " \n", 981 | " true_box_index += 1\n", 982 | " true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']\n", 983 | " \n", 984 | " # assign input image to x_batch\n", 985 | " if self.norm: \n", 986 | " x_batch[batch_count] = normalize(img)\n", 987 | " else:\n", 988 | " # plot image and bounding boxes for sanity check\n", 989 | " for obj in all_objs:\n", 990 | " if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:\n", 991 | " cv2.rectangle(img[:,:,::-1], (obj['xmin'],obj['ymin']), (obj['xmax'],obj['ymax']), (255,0,0), 3)\n", 992 | " cv2.putText(img[:,:,::-1], obj['name'], \n", 993 | " (obj['xmin']+2, obj['ymin']+12), \n", 994 | " 0, 1.2e-3 * img.shape[0], \n", 995 | " (0,255,0), 2)\n", 996 | " \n", 997 | " x_batch[batch_count] = img\n", 998 | "\n", 999 | " # increase instance counter in current batch\n", 1000 | " batch_count += 1 \n", 1001 | " \n", 1002 | " total_count += 1\n", 1003 | " if total_count >= num_img:\n", 1004 | " total_count = 0\n", 1005 | " if self.shuffle: np.random.shuffle(self.images) \n", 1006 | "\n", 1007 | " if batch_count >= self.config['BATCH_SIZE']:\n", 1008 | " yield [x_batch, b_batch], y_batch\n", 1009 | " \n", 1010 | " x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'], self.config['IMAGE_W'], 3))\n", 1011 | " y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 5+self.config['CLASS'])) \n", 1012 | " \n", 1013 | " batch_count = 0\n", 1014 | "\n", 1015 | " def aug_image(self, train_instance, jitter):\n", 1016 | " image_name = train_instance['filename']\n", 1017 | " image = cv2.imread(image_name)\n", 1018 | " h, w, c = image.shape\n", 1019 | " \n", 1020 | " all_objs = copy.deepcopy(train_instance['object'])\n", 1021 | "\n", 1022 | " if jitter:\n", 1023 | " ### scale the image\n", 1024 | " scale = np.random.uniform() / 10. + 1.\n", 1025 | " image = cv2.resize(image, (0,0), fx = scale, fy = scale)\n", 1026 | "\n", 1027 | " ### translate the image\n", 1028 | " max_offx = (scale-1.) * w\n", 1029 | " max_offy = (scale-1.) 
* h\n", 1030 | " offx = int(np.random.uniform() * max_offx)\n", 1031 | " offy = int(np.random.uniform() * max_offy)\n", 1032 | " \n", 1033 | " image = image[offy : (offy + h), offx : (offx + w)]\n", 1034 | "\n", 1035 | " ### flip the image\n", 1036 | " flip = np.random.binomial(1, .5)\n", 1037 | " if flip > 0.5: image = cv2.flip(image, 1)\n", 1038 | " \n", 1039 | " image = self.aug_pipe.augment_image(image) \n", 1040 | " \n", 1041 | " # resize the image to standard size\n", 1042 | " image = cv2.resize(image, (self.config['IMAGE_H'], self.config['IMAGE_W']))\n", 1043 | " image = image[:,:,::-1]\n", 1044 | "\n", 1045 | " # fix object's position and size\n", 1046 | " for obj in all_objs:\n", 1047 | " for attr in ['xmin', 'xmax']:\n", 1048 | " if jitter: obj[attr] = int(obj[attr] * scale - offx)\n", 1049 | " \n", 1050 | " obj[attr] = int(obj[attr] * float(self.config['IMAGE_W']) / w)\n", 1051 | " obj[attr] = max(min(obj[attr], self.config['IMAGE_W']), 0)\n", 1052 | " \n", 1053 | " for attr in ['ymin', 'ymax']:\n", 1054 | " if jitter: obj[attr] = int(obj[attr] * scale - offy)\n", 1055 | " \n", 1056 | " obj[attr] = int(obj[attr] * float(self.config['IMAGE_H']) / h)\n", 1057 | " obj[attr] = max(min(obj[attr], self.config['IMAGE_H']), 0)\n", 1058 | "\n", 1059 | " if jitter and flip > 0.5:\n", 1060 | " xmin = obj['xmin']\n", 1061 | " obj['xmin'] = self.config['IMAGE_W'] - obj['xmax']\n", 1062 | " obj['xmax'] = self.config['IMAGE_W'] - xmin\n", 1063 | " \n", 1064 | " return image, all_objs\n", 1065 | "\n", 1066 | " def get_dateset_size(self):\n", 1067 | " return int(np.ceil(float(len(self.images))/self.config['BATCH_SIZE']))" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "code", 1072 | "execution_count": 44, 1073 | "metadata": {}, 1074 | "outputs": [ 1075 | { 1076 | "name": "stdout", 1077 | "output_type": "stream", 1078 | "text": [ 1079 | "CPU times: user 8 ms, sys: 0 ns, total: 8 ms\n", 1080 | "Wall time: 11.5 ms\n" 1081 | ] 1082 | } 1083 | ], 1084 | "source": [ 1085 | "%%time\n", 1086 | "train_batch = BatchGenerator(train_imgs, generator_config)" 1087 | ] 1088 | }, 1089 | { 1090 | "cell_type": "code", 1091 | "execution_count": 22, 1092 | "metadata": {}, 1093 | "outputs": [ 1094 | { 1095 | "name": "stdout", 1096 | "output_type": "stream", 1097 | "text": [ 1098 | "CPU times: user 12.4 s, sys: 2.45 s, total: 14.8 s\n", 1099 | "Wall time: 4min 50s\n" 1100 | ] 1101 | } 1102 | ], 1103 | "source": [ 1104 | "%%time\n", 1105 | "val_imgs, seen_val_labels = parse_annotation(val_annot_folder, val_image_folder, labels=LABELS)" 1106 | ] 1107 | }, 1108 | { 1109 | "cell_type": "code", 1110 | "execution_count": 45, 1111 | "metadata": {}, 1112 | "outputs": [ 1113 | { 1114 | "name": "stdout", 1115 | "output_type": "stream", 1116 | "text": [ 1117 | "CPU times: user 8 ms, sys: 0 ns, total: 8 ms\n", 1118 | "Wall time: 5.87 ms\n" 1119 | ] 1120 | } 1121 | ], 1122 | "source": [ 1123 | "%%time\n", 1124 | "valid_batch = BatchGenerator(val_imgs, generator_config, jitter=False)" 1125 | ] 1126 | }, 1127 | { 1128 | "cell_type": "markdown", 1129 | "metadata": {}, 1130 | "source": [ 1131 | "## Setup a few callbacks and start the training" 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "code", 1136 | "execution_count": 46, 1137 | "metadata": { 1138 | "collapsed": true 1139 | }, 1140 | "outputs": [], 1141 | "source": [ 1142 | "early_stop = callbacks.EarlyStopping(monitor='val_loss', \n", 1143 | " min_delta=0.001, \n", 1144 | " patience=3, \n", 1145 | " mode='min', \n", 1146 | " verbose=1)\n", 1147 | "\n", 1148 | "checkpoint = 
callbacks.ModelCheckpoint('weights_coco.h5', \n", 1149 | " monitor='val_loss', \n", 1150 | " verbose=1, \n", 1151 | " save_best_only=True, \n", 1152 | " mode='min', \n", 1153 | " period=1)" 1154 | ] 1155 | }, 1156 | { 1157 | "cell_type": "code", 1158 | "execution_count": 47, 1159 | "metadata": {}, 1160 | "outputs": [ 1161 | { 1162 | "ename": "OSError", 1163 | "evalue": "Unable to open file (unable to open file: name = 'weights_coco.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)", 1164 | "output_type": "error", 1165 | "traceback": [ 1166 | "---------------------------------------------------------------------------", 1167 | "OSError Traceback (most recent call last)", 1168 | "<ipython-input> in <module>()\n----> 1 model.load_weights('weights_coco.h5')\n", 1169 | "~/anaconda3/envs/dl/lib/python3.6/site-packages/keras/engine/topology.py in load_weights(self, filepath, by_name)\n 2564 if h5py is None:\n 2565 raise ImportError('`load_weights` requires h5py.')\n-> 2566 f = h5py.File(filepath, mode='r')\n 2567 if 'layer_names' not in f.attrs and 'model_weights' in f:\n 2568 f = f['model_weights']\n", 1170 | "~/anaconda3/envs/dl/lib/python3.6/site-packages/h5py/_hl/files.py in __init__(self, name, mode, driver, libver, userblock_size, swmr, **kwds)\n 267 with phil:\n 268 fapl = make_fapl(driver, libver, **kwds)\n--> 269 fid = make_fid(name, mode, userblock_size, fapl, swmr=swmr)\n 270 \n 271 if swmr_support:\n", 1171 | "~/anaconda3/envs/dl/lib/python3.6/site-packages/h5py/_hl/files.py in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)\n 97 if swmr and swmr_support:\n 98 flags |= h5f.ACC_SWMR_READ\n---> 99 fid = h5f.open(name, flags, fapl=fapl)\n 100 elif mode == 'r+':\n 101 fid = h5f.open(name, h5f.ACC_RDWR, fapl=fapl)\n", 1172 | "h5py/_objects.pyx in h5py._objects.with_phil.wrapper()\n", 1173 | "h5py/_objects.pyx in h5py._objects.with_phil.wrapper()\n", 1174 | "h5py/h5f.pyx in h5py.h5f.open()\n", 1175 | "OSError: Unable to open file (unable to open file: name = 'weights_coco.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)" 1176 | ] 1177 | } 1178 | ], 1179 | "source": [ 1180 | "model.load_weights('weights_coco.h5')" 1181 | ] 1182 | }, 1183 | { 1184 | "cell_type": "code", 1185 | "execution_count": null, 1186 | "metadata": {}, 1187 | "outputs": [ 1188 | { 1189 | "name": "stdout", 1190 | "output_type": "stream", 1191 | "text": [ 1192 | "Epoch 1/100\n", 1193 | " 91/5120 [..............................] 
- ETA: 296822s - loss: 4.8495" 1194 | ] 1195 | } 1196 | ], 1197 | "source": [ 1198 | "tb_counter = len([log for log in os.listdir(os.path.expanduser('~/logs/')) if 'coco_' in log]) + 1\n", 1199 | "tensorboard = callbacks.TensorBoard(log_dir=os.path.expanduser('~/logs/') + 'coco_' + str(tb_counter), \n", 1200 | " histogram_freq=0, \n", 1201 | " write_graph=True, \n", 1202 | " write_images=False)\n", 1203 | "\n", 1204 | "optimizer = optimizers.Adam(lr=0.5e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)\n", 1205 | "#optimizer = SGD(lr=1e-4, decay=0.0005, momentum=0.9)\n", 1206 | "#optimizer = RMSprop(lr=1e-4, rho=0.9, epsilon=1e-08, decay=0.0)\n", 1207 | "\n", 1208 | "model.compile(loss=custom_loss, optimizer=optimizer)\n", 1209 | "\n", 1210 | "model.fit_generator(generator = train_batch.get_generator(), \n", 1211 | " steps_per_epoch = train_batch.get_dataset_size(), \n", 1212 | " epochs = 100, \n", 1213 | " verbose = 1,\n", 1214 | " validation_data = valid_batch.get_generator(),\n", 1215 | " validation_steps = valid_batch.get_dataset_size(),\n", 1216 | " callbacks = [early_stop, checkpoint, tensorboard], \n", 1217 | " max_queue_size = 3)" 1218 | ] 1219 | }, 1220 | { 1221 | "cell_type": "code", 1222 | "execution_count": null, 1223 | "metadata": { 1224 | "collapsed": true 1225 | }, 1226 | "outputs": [], 1227 | "source": [ 1228 | "%load_ext version_information\n", 1229 | "%version_information keras" 1230 | ] 1231 | }, 1232 | { 1233 | "cell_type": "code", 1234 | "execution_count": null, 1235 | "metadata": { 1236 | "collapsed": true 1237 | }, 1238 | "outputs": [], 1239 | "source": [] 1240 | } 1241 | ], 1242 | "metadata": { 1243 | "kernelspec": { 1244 | "display_name": "Python 3", 1245 | "language": "python", 1246 | "name": "python3" 1247 | }, 1248 | "language_info": { 1249 | "codemirror_mode": { 1250 | "name": "ipython", 1251 | "version": 3 1252 | }, 1253 | "file_extension": ".py", 1254 | "mimetype": "text/x-python", 1255 | "name": "python", 1256 | "nbconvert_exporter": "python", 1257 | "pygments_lexer": "ipython3", 1258 | "version": "3.6.2" 1259 | }, 1260 | "toc": { 1261 | "nav_menu": {}, 1262 | "number_sections": true, 1263 | "sideBar": true, 1264 | "skip_h1_title": false, 1265 | "toc_cell": true, 1266 | "toc_position": {}, 1267 | "toc_section_display": "block", 1268 | "toc_window_display": false 1269 | } 1270 | }, 1271 | "nbformat": 4, 1272 | "nbformat_minor": 2 1273 | } 1274 | -------------------------------------------------------------------------------- /coco2pascal.py: -------------------------------------------------------------------------------- 1 | import baker 2 | import json 3 | from path import Path as path 4 | from cytoolz import merge, join, groupby 5 | from cytoolz.compatibility import iteritems 6 | from cytoolz.curried import update_in 7 | from itertools import starmap 8 | from collections import deque 9 | from lxml import etree, objectify 10 | from scipy.io import savemat 11 | from scipy.ndimage import imread 12 | 13 | 14 | def keyjoin(leftkey, leftseq, rightkey, rightseq): 15 | return starmap(merge, join(leftkey, leftseq, rightkey, rightseq)) 16 | 17 | 18 | def root(folder, filename, width, height): 19 | E = objectify.ElementMaker(annotate=False) 20 | return E.annotation( 21 | E.folder(folder), 22 | E.filename(filename), 23 | E.source( 24 | E.database('MS COCO 2014'), 25 | E.annotation('MS COCO 2014'), 26 | E.image('Flickr'), 27 | ), 28 | E.size( 29 | E.width(width), 30 | E.height(height), 31 | E.depth(3), 32 | ), 33 | E.segmented(0) 34 | ) 35 | 36 | 37 | def 
instance_to_xml(anno): 38 | E = objectify.ElementMaker(annotate=False) 39 | xmin, ymin, width, height = anno['bbox'] 40 | return E.object( 41 | E.name(anno['category_id']), 42 | E.bndbox( 43 | E.xmin(xmin), 44 | E.ymin(ymin), 45 | E.xmax(xmin+width), 46 | E.ymax(ymin+height), 47 | ), 48 | ) 49 | 50 | 51 | @baker.command 52 | def write_categories(coco_annotation, dst): 53 | content = json.loads(path(coco_annotation).expand().text()) 54 | categories = tuple( d['name'] for d in content['categories']) 55 | savemat(path(dst).expand(), {'categories': categories}) 56 | 57 | 58 | def get_instances(coco_annotation): 59 | coco_annotation = path(coco_annotation).expand() 60 | content = json.loads(coco_annotation.text()) 61 | categories = {d['id']: d['name'] for d in content['categories']} 62 | return categories, tuple(keyjoin('id', content['images'], 'image_id', content['annotations'])) 63 | 64 | def rename(name, year=2014): 65 | out_name = path(name).stripext() 66 | # out_name = out_name.split('_')[-1] 67 | # out_name = '{}_{}'.format(year, out_name) 68 | return out_name 69 | 70 | 71 | @baker.command 72 | def create_imageset(annotations, dst): 73 | annotations = path(annotations).expand() 74 | dst = path(dst).expand() 75 | val_txt = dst / 'val.txt' 76 | train_txt = dst / 'train.txt' 77 | 78 | for val in annotations.listdir('*val*'): 79 | val_txt.write_text('{}\n'.format(val.basename().stripext()), append=True) 80 | 81 | for train in annotations.listdir('*train*'): 82 | train_txt.write_text('{}\n'.format(train.basename().stripext()), append=True) 83 | 84 | @baker.command 85 | def create_annotations(dbpath, subset, dst): 86 | annotations_path = path(dbpath).expand() / 'annotations/instances_{}2014.json'.format(subset) 87 | images_path = path(dbpath).expand() / 'images/{}2014'.format(subset) 88 | categories, instances = get_instances(annotations_path) 89 | dst = path(dst).expand() 90 | 91 | for i, instance in enumerate(instances): 92 | instances[i]['category_id'] = categories[instance['category_id']] 93 | 94 | for name, group in iteritems(groupby('file_name', instances)): 95 | img = imread(images_path / name) 96 | if img.ndim == 3: 97 | out_name = rename(name) 98 | annotation = root('VOC2014', '{}.jpg'.format(out_name), 99 | group[0]['width'], group[0]['height']) 100 | for instance in group: 101 | annotation.append(instance_to_xml(instance)) 102 | etree.ElementTree(annotation).write(dst / '{}.xml'.format(out_name)) 103 | print(out_name) 104 | else: 105 | print(name) 106 | 107 | 108 | 109 | 110 | 111 | if __name__ == '__main__': 112 | baker.run() 113 | -------------------------------------------------------------------------------- /images/custom-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/images/custom-loss.png -------------------------------------------------------------------------------- /images/custom-loss2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/images/custom-loss2.png -------------------------------------------------------------------------------- /images/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/images/model.png
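
Usage sketch for coco2pascal.py: the @baker.command functions are exposed both as plain Python functions and as command-line subcommands (baker dispatches 'python coco2pascal.py <command> <args>'). The snippet below calls them directly; the ~/data/coco paths are hypothetical placeholders, and it assumes the MS COCO 2014 layout that create_annotations expects (annotations/instances_train2014.json and images/train2014/).

from coco2pascal import create_annotations, create_imageset, write_categories

# Convert the COCO train2014 instance annotations into one Pascal VOC XML file per image.
create_annotations('~/data/coco', 'train', '~/data/coco/pascal/annotations')

# Build train.txt / val.txt image lists from the generated XML files.
create_imageset('~/data/coco/pascal/annotations', '~/data/coco/pascal')

# Dump the category names to a .mat file for later use.
write_categories('~/data/coco/annotations/instances_train2014.json', '~/data/coco/categories.mat')
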
-------------------------------------------------------------------------------- /model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/model.png -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import copy 4 | import numpy as np 5 | import imgaug as ia 6 | from imgaug import augmenters as iaa 7 | import xml.etree.ElementTree as ET 8 | from utils import BoundBox, normalize, bbox_iou 9 | 10 | def parse_annotation(ann_dir, img_dir, labels=[]): 11 | all_imgs = [] 12 | seen_labels = set() 13 | 14 | for ann in sorted(os.listdir(ann_dir)): 15 | img = {'object':[]} 16 | 17 | tree = ET.parse(ann_dir + ann) 18 | 19 | for elem in tree.iter(): 20 | if 'filename' in elem.tag: 21 | all_imgs += [img] 22 | img['filename'] = img_dir + elem.text 23 | if 'width' in elem.tag: 24 | img['width'] = int(elem.text) 25 | if 'height' in elem.tag: 26 | img['height'] = int(elem.text) 27 | if 'object' in elem.tag or 'part' in elem.tag: 28 | obj = {} 29 | 30 | for attr in list(elem): 31 | if 'name' in attr.tag: 32 | obj['name'] = attr.text 33 | seen_labels.add(obj['name']) 34 | 35 | if len(labels) > 0 and obj['name'] not in labels: 36 | break 37 | else: 38 | img['object'] += [obj] 39 | 40 | if 'bndbox' in attr.tag: 41 | for dim in list(attr): 42 | if 'xmin' in dim.tag: 43 | obj['xmin'] = int(round(float(dim.text))) 44 | if 'ymin' in dim.tag: 45 | obj['ymin'] = int(round(float(dim.text))) 46 | if 'xmax' in dim.tag: 47 | obj['xmax'] = int(round(float(dim.text))) 48 | if 'ymax' in dim.tag: 49 | obj['ymax'] = int(round(float(dim.text))) 50 | 51 | return all_imgs, seen_labels 52 | 53 | class BatchGenerator: 54 | def __init__(self, images, 55 | config, 56 | shuffle=True, 57 | jitter=True, 58 | norm=True): 59 | 60 | self.images = images 61 | self.config = config 62 | 63 | self.shuffle = shuffle 64 | self.jitter = jitter 65 | self.norm = norm 66 | 67 | self.anchors = [BoundBox(0, 0, config['ANCHORS'][2*i], config['ANCHORS'][2*i+1]) for i in range(int(len(config['ANCHORS'])/2))] 68 | 69 | ### augmentors by https://github.com/aleju/imgaug 70 | sometimes = lambda aug: iaa.Sometimes(0.5, aug) 71 | 72 | # Define our sequence of augmentation steps that will be applied to every image 73 | # All augmenters with per_channel=0.5 will sample one value _per image_ 74 | # in 50% of all cases. In all other cases they will sample new values 75 | # _per channel_. 
76 | self.aug_pipe = iaa.Sequential( 77 | [ 78 | # apply the following augmenters to most images 79 | #iaa.Fliplr(0.5), # horizontally flip 50% of all images 80 | #iaa.Flipud(0.2), # vertically flip 20% of all images 81 | #sometimes(iaa.Crop(percent=(0, 0.1))), # crop images by 0-10% of their height/width 82 | sometimes(iaa.Affine( 83 | #scale={"x": (0.8, 1.2), "y": (0.8, 1.2)}, # scale images to 80-120% of their size, individually per axis 84 | #translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)}, # translate by -20 to +20 percent (per axis) 85 | #rotate=(-5, 5), # rotate by -5 to +5 degrees 86 | #shear=(-5, 5), # shear by -5 to +5 degrees 87 | #order=[0, 1], # use nearest neighbour or bilinear interpolation (fast) 88 | #cval=(0, 255), # if mode is constant, use a cval between 0 and 255 89 | #mode=ia.ALL # use any of scikit-image's warping modes (see 2nd image from the top for examples) 90 | )), 91 | # execute 0 to 5 of the following (less important) augmenters per image 92 | # don't execute all of them, as that would often be way too strong 93 | iaa.SomeOf((0, 5), 94 | [ 95 | #sometimes(iaa.Superpixels(p_replace=(0, 1.0), n_segments=(20, 200))), # convert images into their superpixel representation 96 | iaa.OneOf([ 97 | iaa.GaussianBlur((0, 3.0)), # blur images with a sigma between 0 and 3.0 98 | iaa.AverageBlur(k=(2, 7)), # blur image using local means with kernel sizes between 2 and 7 99 | iaa.MedianBlur(k=(3, 11)), # blur image using local medians with kernel sizes between 3 and 11 100 | ]), 101 | iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)), # sharpen images 102 | #iaa.Emboss(alpha=(0, 1.0), strength=(0, 2.0)), # emboss images 103 | # search either for all edges or for directed edges 104 | #sometimes(iaa.OneOf([ 105 | # iaa.EdgeDetect(alpha=(0, 0.7)), 106 | # iaa.DirectedEdgeDetect(alpha=(0, 0.7), direction=(0.0, 1.0)), 107 | #])), 108 | iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5), # add gaussian noise to images 109 | iaa.OneOf([ 110 | iaa.Dropout((0.01, 0.1), per_channel=0.5), # randomly remove up to 10% of the pixels 111 | #iaa.CoarseDropout((0.03, 0.15), size_percent=(0.02, 0.05), per_channel=0.2), 112 | ]), 113 | #iaa.Invert(0.05, per_channel=True), # invert color channels 114 | iaa.Add((-10, 10), per_channel=0.5), # change brightness of images (by -10 to 10 of original value) 115 | iaa.Multiply((0.5, 1.5), per_channel=0.5), # change brightness of images (50-150% of original value) 116 | iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5), # improve or worsen the contrast 117 | #iaa.Grayscale(alpha=(0.0, 1.0)), 118 | #sometimes(iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25)), # move pixels locally around (with random strengths) 119 | #sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05))) # sometimes move parts of the image around 120 | ], 121 | random_order=True 122 | ) 123 | ], 124 | random_order=True 125 | ) 126 | 127 | if shuffle: np.random.shuffle(self.images) 128 | 129 | def get_generator(self): 130 | num_img = len(self.images) 131 | 132 | total_count = 0 133 | batch_count = 0 134 | 135 | x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'], self.config['IMAGE_W'], 3)) # input images 136 | b_batch = np.zeros((self.config['BATCH_SIZE'], 1 , 1 , 1 , self.config['TRUE_BOX_BUFFER'], 4)) # list of self.config['TRUE_BOX_BUFFER'] GT boxes 137 | y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 4+1+1)) # desired network output 138 | 139 | 
while True: 140 | if total_count < num_img: 141 | train_instance = self.images[total_count] 142 | 143 | # augment input image and fix object's position and size 144 | img, all_objs = self.aug_image(train_instance, jitter=self.jitter) 145 | 146 | # construct output from object's x, y, w, h 147 | true_box_index = 0 148 | 149 | for obj in all_objs: 150 | if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in self.config['LABELS']: 151 | center_x = .5*(obj['xmin'] + obj['xmax']) 152 | center_x = center_x / (float(self.config['IMAGE_W']) / self.config['GRID_W']) 153 | center_y = .5*(obj['ymin'] + obj['ymax']) 154 | center_y = center_y / (float(self.config['IMAGE_H']) / self.config['GRID_H']) 155 | 156 | grid_x = int(np.floor(center_x)) 157 | grid_y = int(np.floor(center_y)) 158 | 159 | if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']: 160 | obj_indx = self.config['LABELS'].index(obj['name']) 161 | 162 | center_w = (obj['xmax'] - obj['xmin']) / (float(self.config['IMAGE_W']) / self.config['GRID_W']) # unit: grid cell 163 | center_h = (obj['ymax'] - obj['ymin']) / (float(self.config['IMAGE_H']) / self.config['GRID_H']) # unit: grid cell 164 | 165 | box = [center_x, center_y, center_w, center_h] 166 | 167 | # find the anchor that best predicts this box 168 | best_anchor = -1 169 | max_iou = -1 170 | 171 | shifted_box = BoundBox(0, 172 | 0, 173 | center_w, 174 | center_h) 175 | 176 | for i in range(len(self.anchors)): 177 | anchor = self.anchors[i] 178 | iou = bbox_iou(shifted_box, anchor) 179 | 180 | if max_iou < iou: 181 | best_anchor = i 182 | max_iou = iou 183 | 184 | # assign ground truth x, y, w, h, confidence and class probs to y_batch 185 | y_batch[batch_count, grid_y, grid_x, best_anchor, 0:4] = box 186 | y_batch[batch_count, grid_y, grid_x, best_anchor, 4 ] = 1. 
187 | y_batch[batch_count, grid_y, grid_x, best_anchor, 5 ] = obj_indx 188 | 189 | # assign the true box to b_batch 190 | b_batch[batch_count, 0, 0, 0, true_box_index] = box 191 | 192 | true_box_index += 1 193 | true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER'] 194 | 195 | # assign input image to x_batch 196 | if self.norm: 197 | x_batch[batch_count] = normalize(img) 198 | else: 199 | # plot image and bounding boxes for sanity check (draw on img itself; drawing on a reversed view would be lost) 200 | for obj in all_objs: 201 | if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']: 202 | cv2.rectangle(img, (obj['xmin'],obj['ymin']), (obj['xmax'],obj['ymax']), (255,0,0), 3) 203 | cv2.putText(img, obj['name'], 204 | (obj['xmin']+2, obj['ymin']+12), 205 | 0, 1.2e-3 * img.shape[0], 206 | (0,255,0), 2) 207 | 208 | x_batch[batch_count] = img 209 | 210 | # increase instance counter in current batch 211 | batch_count += 1 212 | 213 | total_count += 1 214 | if total_count >= num_img: 215 | total_count = 0 216 | if self.shuffle: np.random.shuffle(self.images) 217 | 218 | if batch_count >= self.config['BATCH_SIZE']: 219 | yield [x_batch, b_batch], y_batch 220 | 221 | x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'], self.config['IMAGE_W'], 3)) 222 | y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 4+1+1)) # must match the allocation above 223 | 224 | batch_count = 0 225 | 226 | def aug_image(self, train_instance, jitter): 227 | image_name = train_instance['filename'] 228 | image = cv2.imread(image_name) 229 | h, w, c = image.shape 230 | 231 | all_objs = copy.deepcopy(train_instance['object']) 232 | 233 | if jitter: 234 | ### scale the image 235 | scale = np.random.uniform() / 10. + 1. 236 | image = cv2.resize(image, (0,0), fx = scale, fy = scale) 237 | 238 | ### translate the image 239 | max_offx = (scale-1.) * w 240 | max_offy = (scale-1.) 
* h 241 | offx = int(np.random.uniform() * max_offx) 242 | offy = int(np.random.uniform() * max_offy) 243 | 244 | image = image[offy : (offy + h), offx : (offx + w)] 245 | 246 | ### flip the image 247 | flip = np.random.binomial(1, .5) 248 | if flip > 0.5: image = cv2.flip(image, 1) 249 | 250 | image = self.aug_pipe.augment_image(image) 251 | 252 | # resize the image to standard size (cv2.resize takes (width, height)) 253 | image = cv2.resize(image, (self.config['IMAGE_W'], self.config['IMAGE_H'])) 254 | image = image[:,:,::-1] 255 | 256 | # fix object's position and size 257 | for obj in all_objs: 258 | for attr in ['xmin', 'xmax']: 259 | if jitter: obj[attr] = int(obj[attr] * scale - offx) 260 | 261 | obj[attr] = int(obj[attr] * float(self.config['IMAGE_W']) / w) 262 | obj[attr] = max(min(obj[attr], self.config['IMAGE_W']), 0) 263 | 264 | for attr in ['ymin', 'ymax']: 265 | if jitter: obj[attr] = int(obj[attr] * scale - offy) 266 | 267 | obj[attr] = int(obj[attr] * float(self.config['IMAGE_H']) / h) 268 | obj[attr] = max(min(obj[attr], self.config['IMAGE_H']), 0) 269 | 270 | if jitter and flip > 0.5: 271 | xmin = obj['xmin'] 272 | obj['xmin'] = self.config['IMAGE_W'] - obj['xmax'] 273 | obj['xmax'] = self.config['IMAGE_W'] - xmin 274 | 275 | return image, all_objs 276 | 277 | def get_dataset_size(self): 278 | return int(np.ceil(float(len(self.images))/self.config['BATCH_SIZE'])) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import xml.etree.ElementTree as ET 4 | import tensorflow as tf 5 | import copy 6 | import cv2 7 | 8 | class BoundBox: 9 | def __init__(self, x, y, w, h, c = None, classes = None): 10 | self.x = x 11 | self.y = y 12 | self.w = w 13 | self.h = h 14 | 15 | self.c = c 16 | self.classes = classes 17 | 18 | self.label = -1 19 | self.score = -1 20 | 21 | def get_label(self): 22 | if self.label == -1: 23 | self.label = np.argmax(self.classes) 24 | 25 | return self.label 26 | 27 | def get_score(self): 28 | if self.score == -1: 29 | self.score = self.classes[self.get_label()] 30 | 31 | return self.score 32 | 33 | class WeightReader: 34 | def __init__(self, weight_file): 35 | self.offset = 4 36 | self.all_weights = np.fromfile(weight_file, dtype='float32') 37 | 38 | def read_bytes(self, size): 39 | self.offset = self.offset + size 40 | return self.all_weights[self.offset-size:self.offset] 41 | 42 | def reset(self): 43 | self.offset = 4 44 | 45 | def normalize(image): 46 | image = image / 255. 
47 | 48 | return image 49 | 50 | def bbox_iou(box1, box2): 51 | x1_min = box1.x - box1.w/2 52 | x1_max = box1.x + box1.w/2 53 | y1_min = box1.y - box1.h/2 54 | y1_max = box1.y + box1.h/2 55 | 56 | x2_min = box2.x - box2.w/2 57 | x2_max = box2.x + box2.w/2 58 | y2_min = box2.y - box2.h/2 59 | y2_max = box2.y + box2.h/2 60 | 61 | intersect_w = interval_overlap([x1_min, x1_max], [x2_min, x2_max]) 62 | intersect_h = interval_overlap([y1_min, y1_max], [y2_min, y2_max]) 63 | 64 | intersect = intersect_w * intersect_h 65 | 66 | union = box1.w * box1.h + box2.w * box2.h - intersect 67 | 68 | return float(intersect) / union 69 | 70 | def interval_overlap(interval_a, interval_b): 71 | x1, x2 = interval_a 72 | x3, x4 = interval_b 73 | 74 | if x3 < x1: 75 | if x4 < x1: 76 | return 0 77 | else: 78 | return min(x2,x4) - x1 79 | else: 80 | if x2 < x3: 81 | return 0 82 | else: 83 | return min(x2,x4) - x3 84 | 85 | def draw_boxes(image, boxes, labels): 86 | 87 | for box in boxes: 88 | xmin = int((box.x - box.w/2) * image.shape[1]) 89 | xmax = int((box.x + box.w/2) * image.shape[1]) 90 | ymin = int((box.y - box.h/2) * image.shape[0]) 91 | ymax = int((box.y + box.h/2) * image.shape[0]) 92 | 93 | cv2.rectangle(image, (xmin,ymin), (xmax,ymax), (0,255,0), 3) 94 | cv2.putText(image, 95 | labels[box.get_label()] + ' ' + str(box.get_score()), 96 | (xmin, ymin - 13), 97 | cv2.FONT_HERSHEY_SIMPLEX, 98 | 1e-3 * image.shape[0], 99 | (0,255,0), 2) 100 | 101 | return image 102 | 103 | def decode_netout(netout, obj_threshold, nms_threshold, anchors, nb_class): 104 | grid_h, grid_w, nb_box = netout.shape[:3] 105 | 106 | boxes = [] 107 | 108 | # decode the output by the network 109 | netout[..., 4] = sigmoid(netout[..., 4]) 110 | netout[..., 5:] = netout[..., 4][..., np.newaxis] * softmax(netout[..., 5:]) 111 | netout[..., 5:] *= netout[..., 5:] > obj_threshold 112 | 113 | for row in range(grid_h): 114 | for col in range(grid_w): 115 | for b in range(nb_box): 116 | # element 4 is the confidence; elements 5 onwards are the class probabilities 117 | classes = netout[row,col,b,5:] 118 | 119 | if np.sum(classes) > 0: 120 | # first 4 elements are x, y, w, and h 121 | x, y, w, h = netout[row,col,b,:4] 122 | 123 | x = (col + sigmoid(x)) / grid_w # center position, unit: image width 124 | y = (row + sigmoid(y)) / grid_h # center position, unit: image height 125 | w = anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width 126 | h = anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height 127 | confidence = netout[row,col,b,4] 128 | 129 | box = BoundBox(x, y, w, h, confidence, classes) 130 | 131 | boxes.append(box) 132 | 133 | # suppress non-maximal boxes 134 | for c in range(nb_class): 135 | sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes]))) 136 | 137 | for i in range(len(sorted_indices)): 138 | index_i = sorted_indices[i] 139 | 140 | if boxes[index_i].classes[c] == 0: 141 | continue 142 | else: 143 | for j in range(i+1, len(sorted_indices)): 144 | index_j = sorted_indices[j] 145 | 146 | if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_threshold: 147 | boxes[index_j].classes[c] = 0 148 | 149 | # remove the boxes which are less likely than the obj_threshold 150 | boxes = [box for box in boxes if box.get_score() > obj_threshold] 151 | 152 | return boxes 153 | 154 | def sigmoid(x): 155 | return 1. / (1. 
+ np.exp(-x)) 156 | 157 | def softmax(x, axis=-1, t=-100.): 158 | x = x - np.max(x) 159 | 160 | if np.min(x) < t: 161 | x = x/np.min(x)*t 162 | 163 | e_x = np.exp(x) 164 | 165 | return e_x / e_x.sum(axis, keepdims=True) --------------------------------------------------------------------------------
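
A minimal sanity-check sketch for the helpers in utils.py. The IoU value follows directly from bbox_iou's center/width/height convention; the anchor values are the usual YOLOv2 COCO defaults, and the 13x13 grid with 5 anchors and 80 classes is an assumed configuration for illustration, not something utils.py itself fixes.

import numpy as np
from utils import BoundBox, bbox_iou, decode_netout

# Two unit squares whose centers are half a width apart:
# intersection = 0.5, union = 1 + 1 - 0.5 = 1.5, so IoU = 1/3.
a = BoundBox(0.0, 0.0, 1.0, 1.0)
b = BoundBox(0.5, 0.0, 1.0, 1.0)
print(bbox_iou(a, b))  # ~0.3333

# decode_netout expects the raw network output reshaped to
# (GRID_H, GRID_W, BOX, 4 + 1 + CLASS); random values stand in for real predictions here.
anchors = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843,
           5.47434, 7.88282, 3.52778, 9.77052, 9.16828]
netout = np.random.randn(13, 13, 5, 4 + 1 + 80)
boxes = decode_netout(netout, obj_threshold=0.3, nms_threshold=0.3,
                      anchors=anchors, nb_class=80)
print(len(boxes))  # surviving boxes, with x, y, w, h relative to the image, ready for draw_boxes
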