├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── barebone-yolo.ipynb
├── coco2pascal.py
├── images
│   ├── custom-loss.png
│   ├── custom-loss2.png
│   └── model.png
├── model.png
├── preprocessing.py
└── utils.py
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Anderson Banihirwe 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Keras YOLO Series 2 | Keras implementation of YOLO (You Only Look Once): Unified, Real-Time Object Detection 3 | 4 | This is a [Keras](https://keras.io/) 5 | implementation of YOLO and YOLOv2. 6 | This project is mainly based on [darkflow](https://github.com/thtrieu/darkflow) 7 | and [darknet](https://github.com/pjreddie/darknet). 8 | 9 | For details about YOLO and YOLOv2, please refer to their [project page](https://pjreddie.com/darknet/yolo/) 10 | and the [paper](https://arxiv.org/abs/1612.08242): 11 | YOLO9000: Better, Faster, Stronger by Joseph Redmon and Ali Farhadi. 12 | 13 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/__init__.py -------------------------------------------------------------------------------- /barebone-yolo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "toc": "true" 7 | }, 8 | "source": [ 9 | " # Table of Contents\n", 10 | "
" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# YOLO" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Import packages" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 35, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "from keras import models\n", 34 | "from keras import layers\n", 35 | "from keras import callbacks\n", 36 | "from keras import optimizers\n", 37 | "from keras.utils.vis_utils import plot_model\n", 38 | "import keras.backend as K\n", 39 | "import tensorflow as tf\n", 40 | "%matplotlib inline\n", 41 | "import matplotlib.pyplot as plt\n", 42 | "import matplotlib\n", 43 | "matplotlib.style.use('seaborn')\n", 44 | "import numpy as np\n", 45 | "import os\n", 46 | "import cv2\n", 47 | "import imgaug as ia\n", 48 | "from imgaug import augmenters as iaa\n", 49 | "from preprocessing import parse_annotation, BatchGenerator\n", 50 | "from utils import WeightReader, decode_netout, draw_boxes" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## Define and initialize global variables" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "metadata": { 64 | "collapsed": true 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "LABELS = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']\n", 69 | "\n", 70 | "IMAGE_H, IMAGE_W = 416, 416\n", 71 | "GRID_H, GRID_W = 13 , 13\n", 72 | "BOX = 5\n", 73 | "CLASS = len(LABELS)\n", 74 | "CLASS_WEIGHTS = np.ones(CLASS, dtype='float32')\n", 75 | "OBJ_THRESHOLD = 0.3#0.5\n", 76 | "NMS_THRESHOLD = 0.3#0.45\n", 77 | "ANCHORS = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828]\n", 78 | "\n", 79 | "NO_OBJECT_SCALE = 1.0\n", 80 | "OBJECT_SCALE = 5.0\n", 81 | "COORD_SCALE = 1.0\n", 82 | "CLASS_SCALE = 1.0\n", 83 | "\n", 84 | "BATCH_SIZE = 16\n", 85 | "WARM_UP_BATCHES = 0\n", 86 | "TRUE_BOX_BUFFER = 50\n", 87 | "\n", 88 | "\n", 89 | "ALPHA = 0.1" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 3, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "pre_trained_weights='weights/yolo.weights'\n", 101 | "train_image_folder = '/home/abanihi/Documents/deep-data/coco/images/train2014/'\n", 102 | "train_annot_folder = '/home/abanihi/Documents/deep-data/coco/train2014ann/'\n", 103 | "val_image_folder = '/home/abanihi/Documents/deep-data/coco/images/val2014/'\n", 104 | "val_annot_folder = '/home/abanihi/Documents/deep-data/coco/val2014ann/'" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Construct the 
Network" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 4, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "# the function to implement the orgnization layer (thanks to github.com/allanzelener/YAD2K)\n", 123 | "def space_to_depth_x2(x):\n", 124 | " return tf.space_to_depth(x, block_size=2)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 5, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "input_image = layers.Input(shape=(IMAGE_H, IMAGE_W, 3))\n", 136 | "true_boxes = layers.Input(shape=(1, 1, 1, TRUE_BOX_BUFFER , 4))" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 6, 142 | "metadata": { 143 | "collapsed": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "def yolo():\n", 148 | " \n", 149 | " \n", 150 | " # Layer 1\n", 151 | " x = layers.Conv2D(32, (3, 3), strides=(1, 1), \n", 152 | " padding='same', name='conv_1', use_bias=False)(input_image)\n", 153 | " x = layers.BatchNormalization(name='norm_1')(x)\n", 154 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 155 | " x = layers.MaxPool2D(pool_size=(2,2))(x)\n", 156 | " \n", 157 | " # Layer 2\n", 158 | " x = layers.Conv2D(64, (3, 3), strides=(1, 1), padding='same', name='conv_2', use_bias=False)(x)\n", 159 | " x = layers.BatchNormalization(name='norm_2')(x)\n", 160 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 161 | " x = layers.MaxPooling2D(pool_size=(2, 2))(x)\n", 162 | " \n", 163 | " \n", 164 | " # Layer 3\n", 165 | " x = layers.Conv2D(128, (3, 3), strides=(1, 1), padding='same', name='conv_3', use_bias=False)(x)\n", 166 | " x = layers.BatchNormalization(name='norm_3')(x)\n", 167 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 168 | " \n", 169 | " # Layer 4 \n", 170 | " x = layers.Conv2D(64, (1, 1), strides=(1, 1), padding='same', name='conv_4', use_bias=False)(x)\n", 171 | " x = layers.BatchNormalization(name='norm_4')(x)\n", 172 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 173 | " \n", 174 | " # Layer 5\n", 175 | " x = layers.Conv2D(128, (3, 3), strides=(1, 1), padding='same', name='conv_5', use_bias=False)(x)\n", 176 | " x = layers.BatchNormalization(name='norm_5')(x)\n", 177 | " x= layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 178 | " x = layers.MaxPooling2D(pool_size=(2, 2))(x)\n", 179 | " \n", 180 | " # Layer 6\n", 181 | " x = layers.Conv2D(256, (3, 3), strides=(1, 1), padding='same', name='conv_6', use_bias=False)(x)\n", 182 | " x = layers.BatchNormalization(name='norm_6')(x)\n", 183 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 184 | " \n", 185 | " \n", 186 | " # Layer 7\n", 187 | " x = layers.Conv2D(128, (1, 1), strides=(1, 1), padding='same', name='conv_7', use_bias=False)(x)\n", 188 | " x= layers.BatchNormalization(name='norm_7')(x)\n", 189 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 190 | " \n", 191 | " # Layer 8\n", 192 | " x = layers.Conv2D(256, (3, 3), strides=(1, 1), padding='same', name='conv_8', use_bias=False)(x)\n", 193 | " x = layers.BatchNormalization(name='norm_8')(x)\n", 194 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 195 | " x = layers.MaxPooling2D(pool_size=(2, 2))(x)\n", 196 | " \n", 197 | " # Layer 9\n", 198 | " x = layers.Conv2D(512, (3, 3), strides=(1, 1), padding='same', name='conv_9', use_bias=False)(x)\n", 199 | " x = 
layers.BatchNormalization(name='norm_9')(x)\n", 200 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 201 | " \n", 202 | " # Layer 10\n", 203 | " x = layers.Conv2D(256, (1, 1), strides=(1, 1), padding='same', name='conv_10', use_bias=False)(x)\n", 204 | " x = layers.BatchNormalization(name='norm_10')(x)\n", 205 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 206 | " \n", 207 | " # Layer 11\n", 208 | " x = layers.Conv2D(512, (3, 3), strides=(1, 1), padding='same', name='conv_11', use_bias=False)(x)\n", 209 | " x = layers.BatchNormalization(name='norm_11')(x)\n", 210 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 211 | " \n", 212 | " \n", 213 | " # Layer 12\n", 214 | " x = layers.Conv2D(256, (1, 1), strides=(1, 1), padding='same', name='conv_12', use_bias=False)(x)\n", 215 | " x = layers.BatchNormalization(name='norm_12')(x)\n", 216 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 217 | " \n", 218 | " # Layer 13\n", 219 | " x = layers.Conv2D(512, (3, 3), strides=(1, 1), padding='same', name='conv_13', use_bias=False)(x)\n", 220 | " x = layers.BatchNormalization(name='norm_13')(x)\n", 221 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 222 | " \n", 223 | " \n", 224 | " skip_connection = x\n", 225 | " \n", 226 | " x = layers.MaxPool2D(pool_size=(2, 2))(x)\n", 227 | " \n", 228 | " # Layer 14\n", 229 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_14', use_bias=False)(x)\n", 230 | " x = layers.BatchNormalization(name='norm_14')(x)\n", 231 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 232 | " \n", 233 | " # Layer 15\n", 234 | " x = layers.Conv2D(512, (1, 1), strides=(1, 1), padding='same', name='conv_15', use_bias=False)(x)\n", 235 | " x = layers.BatchNormalization(name='norm_15')(x)\n", 236 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 237 | " \n", 238 | " # Layer 16\n", 239 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_16', use_bias=False)(x)\n", 240 | " x = layers.BatchNormalization(name='norm_16')(x)\n", 241 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 242 | " \n", 243 | " # Layer 17\n", 244 | " x = layers.Conv2D(512, (1, 1), strides=(1, 1), padding='same', name='conv_17', use_bias=False)(x)\n", 245 | " x = layers.BatchNormalization(name='norm_17')(x)\n", 246 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 247 | " \n", 248 | " # Layer 18\n", 249 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_18', use_bias=False)(x)\n", 250 | " x = layers.BatchNormalization(name='norm_18')(x)\n", 251 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 252 | " \n", 253 | " # Layer 19\n", 254 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_19', use_bias=False)(x)\n", 255 | " x = layers.BatchNormalization(name='norm_19')(x)\n", 256 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 257 | " \n", 258 | " # Layer 20\n", 259 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_20', use_bias=False)(x)\n", 260 | " x = layers.BatchNormalization(name='norm_20')(x)\n", 261 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 262 | " \n", 263 | " \n", 264 | " # Layer 21\n", 265 | " skip_connection = layers.Conv2D(64, (1, 1), strides=(1, 1), \n", 266 | " padding='same', name='conv_21', use_bias=False)(skip_connection)\n", 267 | " skip_connection = 
layers.BatchNormalization(name='norm_21')(skip_connection)\n", 268 | " skip_connection = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(skip_connection)\n", 269 | " skip_connection = layers.Lambda(space_to_depth_x2)(skip_connection)\n", 270 | " \n", 271 | " x = layers.concatenate([skip_connection, x])\n", 272 | " \n", 273 | " # Layer 22\n", 274 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_22',\n", 275 | " use_bias=False)(x)\n", 276 | " x = layers.BatchNormalization(name='norm_22')(x)\n", 277 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n", 278 | " \n", 279 | " # Layer 23\n", 280 | " x = layers.Conv2D((4 + 1 + CLASS) * 5, (1,1), strides=(1,1), padding='same', name='conv_23')(x)\n", 281 | " output = layers.Reshape((GRID_H, GRID_W, BOX, 4 + 1 + CLASS))(x)\n", 282 | " \n", 283 | " # small hack to allow true_boxes to be registered when Keras build the model \n", 284 | " # for more information: https://github.com/fchollet/keras/issues/2790\n", 285 | " output = layers.Lambda(lambda args: args[0])([output, true_boxes])\n", 286 | " \n", 287 | " model = models.Model([input_image, true_boxes], output)\n", 288 | " \n", 289 | " \n", 290 | " return model\n" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 7, 296 | "metadata": {}, 297 | "outputs": [ 298 | { 299 | "name": "stdout", 300 | "output_type": "stream", 301 | "text": [ 302 | "____________________________________________________________________________________________________\n", 303 | "Layer (type) Output Shape Param # Connected to \n", 304 | "====================================================================================================\n", 305 | "input_1 (InputLayer) (None, 416, 416, 3) 0 \n", 306 | "____________________________________________________________________________________________________\n", 307 | "conv_1 (Conv2D) (None, 416, 416, 32) 864 input_1[0][0] \n", 308 | "____________________________________________________________________________________________________\n", 309 | "norm_1 (BatchNormalization) (None, 416, 416, 32) 128 conv_1[0][0] \n", 310 | "____________________________________________________________________________________________________\n", 311 | "leaky_re_lu_1 (LeakyReLU) (None, 416, 416, 32) 0 norm_1[0][0] \n", 312 | "____________________________________________________________________________________________________\n", 313 | "max_pooling2d_1 (MaxPooling2D) (None, 208, 208, 32) 0 leaky_re_lu_1[0][0] \n", 314 | "____________________________________________________________________________________________________\n", 315 | "conv_2 (Conv2D) (None, 208, 208, 64) 18432 max_pooling2d_1[0][0] \n", 316 | "____________________________________________________________________________________________________\n", 317 | "norm_2 (BatchNormalization) (None, 208, 208, 64) 256 conv_2[0][0] \n", 318 | "____________________________________________________________________________________________________\n", 319 | "leaky_re_lu_2 (LeakyReLU) (None, 208, 208, 64) 0 norm_2[0][0] \n", 320 | "____________________________________________________________________________________________________\n", 321 | "max_pooling2d_2 (MaxPooling2D) (None, 104, 104, 64) 0 leaky_re_lu_2[0][0] \n", 322 | "____________________________________________________________________________________________________\n", 323 | "conv_3 (Conv2D) (None, 104, 104, 128) 73728 max_pooling2d_2[0][0] \n", 324 | 
"____________________________________________________________________________________________________\n", 325 | "norm_3 (BatchNormalization) (None, 104, 104, 128) 512 conv_3[0][0] \n", 326 | "____________________________________________________________________________________________________\n", 327 | "leaky_re_lu_3 (LeakyReLU) (None, 104, 104, 128) 0 norm_3[0][0] \n", 328 | "____________________________________________________________________________________________________\n", 329 | "conv_4 (Conv2D) (None, 104, 104, 64) 8192 leaky_re_lu_3[0][0] \n", 330 | "____________________________________________________________________________________________________\n", 331 | "norm_4 (BatchNormalization) (None, 104, 104, 64) 256 conv_4[0][0] \n", 332 | "____________________________________________________________________________________________________\n", 333 | "leaky_re_lu_4 (LeakyReLU) (None, 104, 104, 64) 0 norm_4[0][0] \n", 334 | "____________________________________________________________________________________________________\n", 335 | "conv_5 (Conv2D) (None, 104, 104, 128) 73728 leaky_re_lu_4[0][0] \n", 336 | "____________________________________________________________________________________________________\n", 337 | "norm_5 (BatchNormalization) (None, 104, 104, 128) 512 conv_5[0][0] \n", 338 | "____________________________________________________________________________________________________\n", 339 | "leaky_re_lu_5 (LeakyReLU) (None, 104, 104, 128) 0 norm_5[0][0] \n", 340 | "____________________________________________________________________________________________________\n", 341 | "max_pooling2d_3 (MaxPooling2D) (None, 52, 52, 128) 0 leaky_re_lu_5[0][0] \n", 342 | "____________________________________________________________________________________________________\n", 343 | "conv_6 (Conv2D) (None, 52, 52, 256) 294912 max_pooling2d_3[0][0] \n", 344 | "____________________________________________________________________________________________________\n", 345 | "norm_6 (BatchNormalization) (None, 52, 52, 256) 1024 conv_6[0][0] \n", 346 | "____________________________________________________________________________________________________\n", 347 | "leaky_re_lu_6 (LeakyReLU) (None, 52, 52, 256) 0 norm_6[0][0] \n", 348 | "____________________________________________________________________________________________________\n", 349 | "conv_7 (Conv2D) (None, 52, 52, 128) 32768 leaky_re_lu_6[0][0] \n", 350 | "____________________________________________________________________________________________________\n", 351 | "norm_7 (BatchNormalization) (None, 52, 52, 128) 512 conv_7[0][0] \n", 352 | "____________________________________________________________________________________________________\n", 353 | "leaky_re_lu_7 (LeakyReLU) (None, 52, 52, 128) 0 norm_7[0][0] \n", 354 | "____________________________________________________________________________________________________\n", 355 | "conv_8 (Conv2D) (None, 52, 52, 256) 294912 leaky_re_lu_7[0][0] \n", 356 | "____________________________________________________________________________________________________\n", 357 | "norm_8 (BatchNormalization) (None, 52, 52, 256) 1024 conv_8[0][0] \n", 358 | "____________________________________________________________________________________________________\n", 359 | "leaky_re_lu_8 (LeakyReLU) (None, 52, 52, 256) 0 norm_8[0][0] \n", 360 | "____________________________________________________________________________________________________\n", 361 | "max_pooling2d_4 (MaxPooling2D) (None, 26, 26, 256) 0 
leaky_re_lu_8[0][0] \n", 362 | "____________________________________________________________________________________________________\n", 363 | "conv_9 (Conv2D) (None, 26, 26, 512) 1179648 max_pooling2d_4[0][0] \n", 364 | "____________________________________________________________________________________________________\n", 365 | "norm_9 (BatchNormalization) (None, 26, 26, 512) 2048 conv_9[0][0] \n", 366 | "____________________________________________________________________________________________________\n", 367 | "leaky_re_lu_9 (LeakyReLU) (None, 26, 26, 512) 0 norm_9[0][0] \n", 368 | "____________________________________________________________________________________________________\n", 369 | "conv_10 (Conv2D) (None, 26, 26, 256) 131072 leaky_re_lu_9[0][0] \n", 370 | "____________________________________________________________________________________________________\n", 371 | "norm_10 (BatchNormalization) (None, 26, 26, 256) 1024 conv_10[0][0] \n", 372 | "____________________________________________________________________________________________________\n", 373 | "leaky_re_lu_10 (LeakyReLU) (None, 26, 26, 256) 0 norm_10[0][0] \n", 374 | "____________________________________________________________________________________________________\n", 375 | "conv_11 (Conv2D) (None, 26, 26, 512) 1179648 leaky_re_lu_10[0][0] \n", 376 | "____________________________________________________________________________________________________\n", 377 | "norm_11 (BatchNormalization) (None, 26, 26, 512) 2048 conv_11[0][0] \n", 378 | "____________________________________________________________________________________________________\n", 379 | "leaky_re_lu_11 (LeakyReLU) (None, 26, 26, 512) 0 norm_11[0][0] \n", 380 | "____________________________________________________________________________________________________\n", 381 | "conv_12 (Conv2D) (None, 26, 26, 256) 131072 leaky_re_lu_11[0][0] \n", 382 | "____________________________________________________________________________________________________\n", 383 | "norm_12 (BatchNormalization) (None, 26, 26, 256) 1024 conv_12[0][0] \n", 384 | "____________________________________________________________________________________________________\n", 385 | "leaky_re_lu_12 (LeakyReLU) (None, 26, 26, 256) 0 norm_12[0][0] \n", 386 | "____________________________________________________________________________________________________\n", 387 | "conv_13 (Conv2D) (None, 26, 26, 512) 1179648 leaky_re_lu_12[0][0] \n", 388 | "____________________________________________________________________________________________________\n", 389 | "norm_13 (BatchNormalization) (None, 26, 26, 512) 2048 conv_13[0][0] \n", 390 | "____________________________________________________________________________________________________\n", 391 | "leaky_re_lu_13 (LeakyReLU) (None, 26, 26, 512) 0 norm_13[0][0] \n", 392 | "____________________________________________________________________________________________________\n", 393 | "max_pooling2d_5 (MaxPooling2D) (None, 13, 13, 512) 0 leaky_re_lu_13[0][0] \n", 394 | "____________________________________________________________________________________________________\n", 395 | "conv_14 (Conv2D) (None, 13, 13, 1024) 4718592 max_pooling2d_5[0][0] \n", 396 | "____________________________________________________________________________________________________\n", 397 | "norm_14 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_14[0][0] \n", 398 | 
"____________________________________________________________________________________________________\n", 399 | "leaky_re_lu_14 (LeakyReLU) (None, 13, 13, 1024) 0 norm_14[0][0] \n", 400 | "____________________________________________________________________________________________________\n", 401 | "conv_15 (Conv2D) (None, 13, 13, 512) 524288 leaky_re_lu_14[0][0] \n", 402 | "____________________________________________________________________________________________________\n", 403 | "norm_15 (BatchNormalization) (None, 13, 13, 512) 2048 conv_15[0][0] \n", 404 | "____________________________________________________________________________________________________\n", 405 | "leaky_re_lu_15 (LeakyReLU) (None, 13, 13, 512) 0 norm_15[0][0] \n", 406 | "____________________________________________________________________________________________________\n", 407 | "conv_16 (Conv2D) (None, 13, 13, 1024) 4718592 leaky_re_lu_15[0][0] \n", 408 | "____________________________________________________________________________________________________\n", 409 | "norm_16 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_16[0][0] \n", 410 | "____________________________________________________________________________________________________\n", 411 | "leaky_re_lu_16 (LeakyReLU) (None, 13, 13, 1024) 0 norm_16[0][0] \n", 412 | "____________________________________________________________________________________________________\n", 413 | "conv_17 (Conv2D) (None, 13, 13, 512) 524288 leaky_re_lu_16[0][0] \n", 414 | "____________________________________________________________________________________________________\n", 415 | "norm_17 (BatchNormalization) (None, 13, 13, 512) 2048 conv_17[0][0] \n", 416 | "____________________________________________________________________________________________________\n", 417 | "leaky_re_lu_17 (LeakyReLU) (None, 13, 13, 512) 0 norm_17[0][0] \n", 418 | "____________________________________________________________________________________________________\n", 419 | "conv_18 (Conv2D) (None, 13, 13, 1024) 4718592 leaky_re_lu_17[0][0] \n", 420 | "____________________________________________________________________________________________________\n", 421 | "norm_18 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_18[0][0] \n", 422 | "____________________________________________________________________________________________________\n", 423 | "leaky_re_lu_18 (LeakyReLU) (None, 13, 13, 1024) 0 norm_18[0][0] \n", 424 | "____________________________________________________________________________________________________\n", 425 | "conv_19 (Conv2D) (None, 13, 13, 1024) 9437184 leaky_re_lu_18[0][0] \n", 426 | "____________________________________________________________________________________________________\n", 427 | "norm_19 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_19[0][0] \n", 428 | "____________________________________________________________________________________________________\n", 429 | "conv_21 (Conv2D) (None, 26, 26, 64) 32768 leaky_re_lu_13[0][0] \n", 430 | "____________________________________________________________________________________________________\n", 431 | "leaky_re_lu_19 (LeakyReLU) (None, 13, 13, 1024) 0 norm_19[0][0] \n", 432 | "____________________________________________________________________________________________________\n", 433 | "norm_21 (BatchNormalization) (None, 26, 26, 64) 256 conv_21[0][0] \n", 434 | "____________________________________________________________________________________________________\n", 435 | "conv_20 (Conv2D) (None, 
13, 13, 1024) 9437184 leaky_re_lu_19[0][0] \n", 436 | "____________________________________________________________________________________________________\n", 437 | "leaky_re_lu_21 (LeakyReLU) (None, 26, 26, 64) 0 norm_21[0][0] \n", 438 | "____________________________________________________________________________________________________\n", 439 | "norm_20 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_20[0][0] \n", 440 | "____________________________________________________________________________________________________\n", 441 | "lambda_1 (Lambda) (None, 13, 13, 256) 0 leaky_re_lu_21[0][0] \n", 442 | "____________________________________________________________________________________________________\n", 443 | "leaky_re_lu_20 (LeakyReLU) (None, 13, 13, 1024) 0 norm_20[0][0] \n", 444 | "____________________________________________________________________________________________________\n", 445 | "concatenate_1 (Concatenate) (None, 13, 13, 1280) 0 lambda_1[0][0] \n", 446 | " leaky_re_lu_20[0][0] \n", 447 | "____________________________________________________________________________________________________\n", 448 | "conv_22 (Conv2D) (None, 13, 13, 1024) 11796480 concatenate_1[0][0] \n", 449 | "____________________________________________________________________________________________________\n", 450 | "norm_22 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_22[0][0] \n", 451 | "____________________________________________________________________________________________________\n", 452 | "leaky_re_lu_22 (LeakyReLU) (None, 13, 13, 1024) 0 norm_22[0][0] \n", 453 | "____________________________________________________________________________________________________\n", 454 | "conv_23 (Conv2D) (None, 13, 13, 425) 435625 leaky_re_lu_22[0][0] \n", 455 | "____________________________________________________________________________________________________\n", 456 | "reshape_1 (Reshape) (None, 13, 13, 5, 85) 0 conv_23[0][0] \n", 457 | "____________________________________________________________________________________________________\n", 458 | "input_2 (InputLayer) (None, 1, 1, 1, 50, 4 0 \n", 459 | "____________________________________________________________________________________________________\n", 460 | "lambda_2 (Lambda) (None, 13, 13, 5, 85) 0 reshape_1[0][0] \n", 461 | " input_2[0][0] \n", 462 | "====================================================================================================\n", 463 | "Total params: 50,983,561\n", 464 | "Trainable params: 50,962,889\n", 465 | "Non-trainable params: 20,672\n", 466 | "____________________________________________________________________________________________________\n" 467 | ] 468 | } 469 | ], 470 | "source": [ 471 | "model = yolo()\n", 472 | "model.summary()" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 8, 478 | "metadata": { 479 | "collapsed": true 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "plot_model(model, to_file='model.png')" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "Total params: 50,983,561\n", 491 | "Trainable params: 50,962,889\n", 492 | "Non-trainable params: 20,672" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "## Load Pretrained weights\n", 500 | "\n", 501 | "Load the weights originally provided by YOLO" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 9, 507 | "metadata": { 508 | "collapsed": true 509 | }, 510 | "outputs": 
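The summary above confirms the detection head: conv_23 emits a 13×13×425 map that reshape_1 turns into (13, 13, 5, 85), i.e. 5 anchor boxes per grid cell, each carrying 4 box coordinates, 1 objectness score, and 80 class logits. A minimal numpy sketch of how one cell's vector splits into those fields (the slicing mirrors what `custom_loss` later does on `y_pred`; `netout` here is a stand-in array, not a real network output):

```python
import numpy as np

GRID_H, GRID_W, BOX, CLASS = 13, 13, 5, 80

# stand-in for the reshaped head output of a single image: (13, 13, 5, 85)
netout = np.random.randn(GRID_H, GRID_W, BOX, 4 + 1 + CLASS)

cell = netout[6, 6, 0]   # first anchor box of the center cell
t_xy  = cell[0:2]        # raw x, y   -> sigmoid(t_xy) + cell index
t_wh  = cell[2:4]        # raw w, h   -> exp(t_wh) * anchor prior
t_obj = cell[4]          # raw objectness -> sigmoid(t_obj)
t_cls = cell[5:]         # 80 raw class logits -> softmax

print(t_xy.shape, t_wh.shape, t_cls.shape)   # (2,) (2,) (80,)
```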
[], 511 | "source": [ 512 | "weight_reader = WeightReader(pre_trained_weights)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 10, 518 | "metadata": { 519 | "collapsed": true 520 | }, 521 | "outputs": [], 522 | "source": [ 523 | "weight_reader.reset()\n", 524 | "nb_conv = 23" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 11, 530 | "metadata": { 531 | "collapsed": true 532 | }, 533 | "outputs": [], 534 | "source": [ 535 | "for i in range(1, nb_conv+1):\n", 536 | " conv_layer = model.get_layer('conv_' + str(i))\n", 537 | " \n", 538 | " if i < nb_conv:\n", 539 | " norm_layer = model.get_layer('norm_' + str(i))\n", 540 | " \n", 541 | " size = np.prod(norm_layer.get_weights()[0].shape)\n", 542 | " \n", 543 | " beta = weight_reader.read_bytes(size)\n", 544 | " gamma = weight_reader.read_bytes(size)\n", 545 | " mean = weight_reader.read_bytes(size)\n", 546 | " var = weight_reader.read_bytes(size)\n", 547 | " \n", 548 | " weights = norm_layer.set_weights([gamma, beta, mean, var])\n", 549 | " \n", 550 | " if len(conv_layer.get_weights()) > 1:\n", 551 | " bias = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[1].shape))\n", 552 | " kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))\n", 553 | " kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))\n", 554 | " kernel = kernel.transpose([2,3,1,0])\n", 555 | " conv_layer.set_weights([kernel, bias])\n", 556 | " \n", 557 | " else:\n", 558 | " kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))\n", 559 | " kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))\n", 560 | " kernel = kernel.transpose([2,3,1,0])\n", 561 | " conv_layer.set_weights([kernel])" 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": { 567 | "collapsed": true 568 | }, 569 | "source": [ 570 | "## Randomize weights of the last layer" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 12, 576 | "metadata": { 577 | "collapsed": true 578 | }, 579 | "outputs": [], 580 | "source": [ 581 | "# Get last convolutional layer\n", 582 | "layer = model.layers[-4] \n", 583 | "weights = layer.get_weights()\n", 584 | "\n", 585 | "new_kernel = np.random.normal(size=weights[0].shape) / (GRID_H*GRID_W)\n", 586 | "new_bias = np.random.normal(size=weights[1].shape) / (GRID_H*GRID_W)\n", 587 | "\n", 588 | "layer.set_weights([new_kernel, new_bias])" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": {}, 594 | "source": [ 595 | "## Training" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "metadata": {}, 601 | "source": [ 602 | "### Loss Function\n", 603 | "\n", 604 | "![](images/custom-loss.png)\n", 605 | "\n", 606 | "![](images/custom-loss2.png)\n" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 54, 612 | "metadata": { 613 | "collapsed": true 614 | }, 615 | "outputs": [], 616 | "source": [ 617 | "\n", 618 | "\n", 619 | "def custom_loss(y_true, y_pred):\n", 620 | " mask_shape = tf.shape(y_true)[:4]\n", 621 | " \n", 622 | " cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(GRID_W), [GRID_H]), (1, GRID_H, GRID_W, 1, 1)))\n", 623 | " cell_y = tf.transpose(cell_x, (0,2,1,3,4))\n", 624 | "\n", 625 | " cell_grid = tf.tile(tf.concat([cell_x,cell_y], -1), [BATCH_SIZE, 1, 1, 5, 1])\n", 626 | " \n", 627 | " coord_mask = tf.zeros(mask_shape)\n", 628 | " conf_mask = tf.zeros(mask_shape)\n", 629 | " class_mask = tf.zeros(mask_shape)\n", 630 | " \n", 
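The `cell_grid` tensor built at the top of `custom_loss` is what converts the per-cell sigmoid offsets into absolute grid coordinates: it holds the (column, row) index of every cell, tiled over the batch and the 5 anchors. A numpy sketch of the same construction, which is easier to inspect than the TensorFlow version (shapes assume the notebook's 13×13 grid and batch size 16):

```python
import numpy as np

GRID_H, GRID_W, BOX, BATCH_SIZE = 13, 13, 5, 16

# column index of every cell: shape (1, GRID_H, GRID_W, 1, 1)
cell_x = np.reshape(np.tile(np.arange(GRID_W), GRID_H),
                    (1, GRID_H, GRID_W, 1, 1)).astype(np.float32)
# row index: swap the two spatial axes (valid because GRID_H == GRID_W)
cell_y = np.transpose(cell_x, (0, 2, 1, 3, 4))

# (BATCH_SIZE, GRID_H, GRID_W, BOX, 2); last axis is (col, row) of the cell
cell_grid = np.tile(np.concatenate([cell_x, cell_y], axis=-1),
                    [BATCH_SIZE, 1, 1, BOX, 1])

print(cell_grid.shape)        # (16, 13, 13, 5, 2)
print(cell_grid[0, 2, 7, 0])  # cell at row 2, col 7 -> [7. 2.]
```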
631 | " seen = tf.Variable(0.)\n", 632 | " \n", 633 | " total_AP = tf.Variable(0.)\n", 634 | " \n", 635 | " \"\"\"\n", 636 | " Adjust prediction\n", 637 | " \"\"\"\n", 638 | " ### adjust x and y \n", 639 | " pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid\n", 640 | " \n", 641 | " ### adjust w and h\n", 642 | " pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(ANCHORS, [1,1,1,BOX,2])\n", 643 | " \n", 644 | " ### adjust confidence\n", 645 | " pred_box_conf = tf.sigmoid(y_pred[..., 4])\n", 646 | " \n", 647 | " ### adjust class probabilities\n", 648 | " pred_box_class = y_pred[..., 5:]\n", 649 | " \n", 650 | " \"\"\"\n", 651 | " Adjust ground truth\n", 652 | " \"\"\"\n", 653 | " ### adjust x and y\n", 654 | " true_box_xy = y_true[..., 0:2] # relative position to the containing cell\n", 655 | " \n", 656 | " ### adjust w and h\n", 657 | " true_box_wh = y_true[..., 2:4] # number of cells accross, horizontally and vertically\n", 658 | " \n", 659 | " ### adjust confidence\n", 660 | " true_wh_half = true_box_wh / 2.\n", 661 | " true_mins = true_box_xy - true_wh_half\n", 662 | " true_maxes = true_box_xy + true_wh_half\n", 663 | " \n", 664 | " pred_wh_half = pred_box_wh / 2.\n", 665 | " pred_mins = pred_box_xy - pred_wh_half\n", 666 | " pred_maxes = pred_box_xy + pred_wh_half \n", 667 | " \n", 668 | " intersect_mins = tf.maximum(pred_mins, true_mins)\n", 669 | " intersect_maxes = tf.minimum(pred_maxes, true_maxes)\n", 670 | " intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)\n", 671 | " intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]\n", 672 | " \n", 673 | " true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]\n", 674 | " pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]\n", 675 | "\n", 676 | " union_areas = pred_areas + true_areas - intersect_areas\n", 677 | " iou_scores = tf.truediv(intersect_areas, union_areas)\n", 678 | " \n", 679 | " true_box_conf = iou_scores * y_true[..., 4]\n", 680 | " \n", 681 | " ### adjust class probabilities\n", 682 | " true_box_class = tf.to_int32(y_true[..., 5])\n", 683 | " \n", 684 | " \"\"\"\n", 685 | " Determine the masks\n", 686 | " \"\"\"\n", 687 | " ### coordinate mask: simply the position of the ground truth boxes (the predictors)\n", 688 | " coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * COORD_SCALE\n", 689 | " \n", 690 | " ### confidence mask: penelize predictors + penalize boxes with low IOU\n", 691 | " # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6\n", 692 | " true_xy = true_boxes[..., 0:2]\n", 693 | " true_wh = true_boxes[..., 2:4]\n", 694 | " \n", 695 | " true_wh_half = true_wh / 2.\n", 696 | " true_mins = true_xy - true_wh_half\n", 697 | " true_maxes = true_xy + true_wh_half\n", 698 | " \n", 699 | " pred_xy = tf.expand_dims(pred_box_xy, 4)\n", 700 | " pred_wh = tf.expand_dims(pred_box_wh, 4)\n", 701 | " \n", 702 | " pred_wh_half = pred_wh / 2.\n", 703 | " pred_mins = pred_xy - pred_wh_half\n", 704 | " pred_maxes = pred_xy + pred_wh_half \n", 705 | " \n", 706 | " intersect_mins = tf.maximum(pred_mins, true_mins)\n", 707 | " intersect_maxes = tf.minimum(pred_maxes, true_maxes)\n", 708 | " intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)\n", 709 | " intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]\n", 710 | " \n", 711 | " true_areas = true_wh[..., 0] * true_wh[..., 1]\n", 712 | " pred_areas = pred_wh[..., 0] * pred_wh[..., 1]\n", 713 | "\n", 714 | " union_areas = pred_areas + true_areas - intersect_areas\n", 715 | " iou_scores = 
tf.truediv(intersect_areas, union_areas)\n", 716 | "\n", 717 | " best_ious = tf.reduce_max(iou_scores, axis=4)\n", 718 | " conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * NO_OBJECT_SCALE\n", 719 | " \n", 720 | " # penalize the confidence of the boxes, which are reponsible for corresponding ground truth box\n", 721 | " conf_mask = conf_mask + y_true[..., 4] * OBJECT_SCALE\n", 722 | " \n", 723 | " ### class mask: simply the position of the ground truth boxes (the predictors)\n", 724 | " class_mask = y_true[..., 4] * tf.gather(CLASS_WEIGHTS, true_box_class) * CLASS_SCALE \n", 725 | " \n", 726 | " \"\"\"\n", 727 | " Warm-up training\n", 728 | " \"\"\"\n", 729 | " no_boxes_mask = tf.to_float(coord_mask < COORD_SCALE/2.)\n", 730 | " seen = tf.assign_add(seen, 1.)\n", 731 | " \n", 732 | " true_box_xy, true_box_wh, coord_mask = tf.cond(tf.less(seen, WARM_UP_BATCHES), \n", 733 | " lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask, \n", 734 | " true_box_wh + tf.ones_like(true_box_wh) * np.reshape(ANCHORS, [1,1,1,BOX,2]) * no_boxes_mask, \n", 735 | " tf.ones_like(coord_mask)],\n", 736 | " lambda: [true_box_xy, \n", 737 | " true_box_wh,\n", 738 | " coord_mask])\n", 739 | " \n", 740 | " \"\"\"\n", 741 | " Finalize the loss\n", 742 | " \"\"\"\n", 743 | " nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))\n", 744 | " nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))\n", 745 | " nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))\n", 746 | " \n", 747 | " loss_xy = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2.\n", 748 | " loss_wh = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2.\n", 749 | " loss_conf = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2.\n", 750 | " loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)\n", 751 | " loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)\n", 752 | " \n", 753 | " loss = loss_xy + loss_wh + loss_conf + loss_class\n", 754 | " \n", 755 | " nb_true_box = tf.reduce_sum(y_true[..., 4])\n", 756 | " nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > OBJ_THRESHOLD))\n", 757 | " \n", 758 | " total_AP = tf.assign_add(total_AP, nb_pred_box/nb_true_box) \n", 759 | " \n", 760 | " loss = tf.Print(loss, [loss_xy, loss_wh, loss_conf, loss_class, loss, total_AP/seen], message='DEBUG', summarize=1000)\n", 761 | " \n", 762 | " return loss\n", 763 | "\n" 764 | ] 765 | }, 766 | { 767 | "cell_type": "markdown", 768 | "metadata": {}, 769 | "source": [ 770 | "### Parse the annotations to construct train generator and validation generator" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": 14, 776 | "metadata": { 777 | "collapsed": true 778 | }, 779 | "outputs": [], 780 | "source": [ 781 | "generator_config = {\n", 782 | " 'IMAGE_H' : IMAGE_H, \n", 783 | " 'IMAGE_W' : IMAGE_W,\n", 784 | " 'GRID_H' : GRID_H, \n", 785 | " 'GRID_W' : GRID_W,\n", 786 | " 'BOX' : BOX,\n", 787 | " 'LABELS' : LABELS,\n", 788 | " 'CLASS' : len(LABELS),\n", 789 | " 'ANCHORS' : ANCHORS,\n", 790 | " 'BATCH_SIZE' : BATCH_SIZE,\n", 791 | " 'TRUE_BOX_BUFFER' : 50,\n", 792 | "}\n", 793 | "\n" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": 16, 799 | "metadata": {}, 800 | "outputs": [ 801 | { 802 | "name": "stdout", 803 | "output_type": "stream", 804 | "text": [ 
805 | "CPU times: user 26.6 s, sys: 5.42 s, total: 32 s\n", 806 | "Wall time: 11min 35s\n" 807 | ] 808 | } 809 | ], 810 | "source": [ 811 | "%%time\n", 812 | "train_imgs, seen_train_labels = parse_annotation(train_annot_folder, train_image_folder, labels=LABELS)" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": 39, 818 | "metadata": { 819 | "collapsed": true 820 | }, 821 | "outputs": [], 822 | "source": [ 823 | "import os\n", 824 | "import cv2\n", 825 | "import copy\n", 826 | "import numpy as np\n", 827 | "import imgaug as ia\n", 828 | "from imgaug import augmenters as iaa\n", 829 | "import xml.etree.ElementTree as ET\n", 830 | "from utils import BoundBox, normalize, bbox_iou" 831 | ] 832 | }, 833 | { 834 | "cell_type": "code", 835 | "execution_count": 43, 836 | "metadata": { 837 | "collapsed": true 838 | }, 839 | "outputs": [], 840 | "source": [ 841 | "class BatchGenerator:\n", 842 | " def __init__(self, images, \n", 843 | " config, \n", 844 | " shuffle=True, \n", 845 | " jitter=True, \n", 846 | " norm=True):\n", 847 | "\n", 848 | " self.images = images\n", 849 | " self.config = config\n", 850 | "\n", 851 | " self.shuffle = shuffle\n", 852 | " self.jitter = jitter\n", 853 | " self.norm = norm\n", 854 | " \n", 855 | "\n", 856 | " self.anchors = [BoundBox(0, 0, config['ANCHORS'][2*i], config['ANCHORS'][2*i+1]) for i in range(int(len(config['ANCHORS'])/2))]\n", 857 | "\n", 858 | " ### augmentors by https://github.com/aleju/imgaug\n", 859 | " sometimes = lambda aug: iaa.Sometimes(0.5, aug)\n", 860 | "\n", 861 | " # Define our sequence of augmentation steps that will be applied to every image\n", 862 | " # All augmenters with per_channel=0.5 will sample one value _per image_\n", 863 | " # in 50% of all cases. In all other cases they will sample new values\n", 864 | " # _per channel_.\n", 865 | " self.aug_pipe = iaa.Sequential(\n", 866 | " [\n", 867 | " # apply the following augmenters to most images\n", 868 | " #iaa.Fliplr(0.5), # horizontally flip 50% of all images\n", 869 | " #iaa.Flipud(0.2), # vertically flip 20% of all images\n", 870 | " #sometimes(iaa.Crop(percent=(0, 0.1))), # crop images by 0-10% of their height/width\n", 871 | " sometimes(iaa.Affine(\n", 872 | " #scale={\"x\": (0.8, 1.2), \"y\": (0.8, 1.2)}, # scale images to 80-120% of their size, individually per axis\n", 873 | " #translate_percent={\"x\": (-0.2, 0.2), \"y\": (-0.2, 0.2)}, # translate by -20 to +20 percent (per axis)\n", 874 | " #rotate=(-5, 5), # rotate by -45 to +45 degrees\n", 875 | " #shear=(-5, 5), # shear by -16 to +16 degrees\n", 876 | " #order=[0, 1], # use nearest neighbour or bilinear interpolation (fast)\n", 877 | " #cval=(0, 255), # if mode is constant, use a cval between 0 and 255\n", 878 | " #mode=ia.ALL # use any of scikit-image's warping modes (see 2nd image from the top for examples)\n", 879 | " )),\n", 880 | " # execute 0 to 5 of the following (less important) augmenters per image\n", 881 | " # don't execute all of them, as that would often be way too strong\n", 882 | " iaa.SomeOf((0, 5),\n", 883 | " [\n", 884 | " #sometimes(iaa.Superpixels(p_replace=(0, 1.0), n_segments=(20, 200))), # convert images into their superpixel representation\n", 885 | " iaa.OneOf([\n", 886 | " iaa.GaussianBlur((0, 3.0)), # blur images with a sigma between 0 and 3.0\n", 887 | " iaa.AverageBlur(k=(2, 7)), # blur image using local means with kernel sizes between 2 and 7\n", 888 | " iaa.MedianBlur(k=(3, 11)), # blur image using local medians with kernel sizes between 2 and 7\n", 889 | " ]),\n", 
890 | " iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)), # sharpen images\n", 891 | " #iaa.Emboss(alpha=(0, 1.0), strength=(0, 2.0)), # emboss images\n", 892 | " # search either for all edges or for directed edges\n", 893 | " #sometimes(iaa.OneOf([\n", 894 | " # iaa.EdgeDetect(alpha=(0, 0.7)),\n", 895 | " # iaa.DirectedEdgeDetect(alpha=(0, 0.7), direction=(0.0, 1.0)),\n", 896 | " #])),\n", 897 | " iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5), # add gaussian noise to images\n", 898 | " iaa.OneOf([\n", 899 | " iaa.Dropout((0.01, 0.1), per_channel=0.5), # randomly remove up to 10% of the pixels\n", 900 | " #iaa.CoarseDropout((0.03, 0.15), size_percent=(0.02, 0.05), per_channel=0.2),\n", 901 | " ]),\n", 902 | " #iaa.Invert(0.05, per_channel=True), # invert color channels\n", 903 | " iaa.Add((-10, 10), per_channel=0.5), # change brightness of images (by -10 to 10 of original value)\n", 904 | " iaa.Multiply((0.5, 1.5), per_channel=0.5), # change brightness of images (50-150% of original value)\n", 905 | " iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5), # improve or worsen the contrast\n", 906 | " #iaa.Grayscale(alpha=(0.0, 1.0)),\n", 907 | " #sometimes(iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25)), # move pixels locally around (with random strengths)\n", 908 | " #sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05))) # sometimes move parts of the image around\n", 909 | " ],\n", 910 | " random_order=True\n", 911 | " )\n", 912 | " ],\n", 913 | " random_order=True\n", 914 | " )\n", 915 | "\n", 916 | " if shuffle: np.random.shuffle(self.images)\n", 917 | "\n", 918 | " def get_generator(self):\n", 919 | " num_img = len(self.images)\n", 920 | " \n", 921 | " total_count = 0\n", 922 | " batch_count = 0\n", 923 | " \n", 924 | " x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'], self.config['IMAGE_W'], 3)) # input images\n", 925 | " b_batch = np.zeros((self.config['BATCH_SIZE'], 1 , 1 , 1 , self.config['TRUE_BOX_BUFFER'], 4)) # list of self.config['TRUE_self.config['BOX']_BUFFER'] GT boxes\n", 926 | " y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 4+1+1)) # desired network output\n", 927 | " \n", 928 | " while True:\n", 929 | " if total_count < num_img:\n", 930 | " train_instance = self.images[total_count]\n", 931 | "\n", 932 | " # augment input image and fix object's position and size\n", 933 | " img, all_objs = self.aug_image(train_instance, jitter=self.jitter)\n", 934 | " \n", 935 | " # construct output from object's x, y, w, h\n", 936 | " true_box_index = 0\n", 937 | " \n", 938 | " for obj in all_objs:\n", 939 | " if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in self.config['LABELS']:\n", 940 | " center_x = .5*(obj['xmin'] + obj['xmax'])\n", 941 | " center_x = center_x / (float(self.config['IMAGE_W']) / self.config['GRID_W'])\n", 942 | " center_y = .5*(obj['ymin'] + obj['ymax'])\n", 943 | " center_y = center_y / (float(self.config['IMAGE_H']) / self.config['GRID_H'])\n", 944 | "\n", 945 | " grid_x = int(np.floor(center_x))\n", 946 | " grid_y = int(np.floor(center_y))\n", 947 | "\n", 948 | " if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:\n", 949 | " obj_indx = self.config['LABELS'].index(obj['name'])\n", 950 | " \n", 951 | " center_w = (obj['xmax'] - obj['xmin']) / (float(self.config['IMAGE_W']) / self.config['GRID_W']) # unit: grid cell\n", 952 | " center_h = (obj['ymax'] - obj['ymin']) / (float(self.config['IMAGE_W']) / 
self.config['GRID_W']) # unit: grid cell\n", 953 | " \n", 954 | " box = [center_x, center_y, center_w, center_h]\n", 955 | "\n", 956 | " # find the anchor that best predicts this box\n", 957 | " best_anchor = -1\n", 958 | " max_iou = -1\n", 959 | " \n", 960 | " shifted_box = BoundBox(0, \n", 961 | " 0, \n", 962 | " center_w, \n", 963 | " center_h)\n", 964 | " \n", 965 | " for i in range(len(self.anchors)):\n", 966 | " anchor = self.anchors[i]\n", 967 | " iou = bbox_iou(shifted_box, anchor)\n", 968 | " \n", 969 | " if max_iou < iou:\n", 970 | " best_anchor = i\n", 971 | " max_iou = iou\n", 972 | " \n", 973 | " # assign ground truth x, y, w, h, confidence and class probs to y_batch\n", 974 | " y_batch[batch_count, grid_y, grid_x, best_anchor, 0:4] = box\n", 975 | " y_batch[batch_count, grid_y, grid_x, best_anchor, 4 ] = 1.\n", 976 | " y_batch[batch_count, grid_y, grid_x, best_anchor, 5 ] = obj_indx\n", 977 | " \n", 978 | " # assign the true box to b_batch\n", 979 | " b_batch[batch_count, 0, 0, 0, true_box_index] = box\n", 980 | " \n", 981 | " true_box_index += 1\n", 982 | " true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']\n", 983 | " \n", 984 | " # assign input image to x_batch\n", 985 | " if self.norm: \n", 986 | " x_batch[batch_count] = normalize(img)\n", 987 | " else:\n", 988 | " # plot image and bounding boxes for sanity check\n", 989 | " for obj in all_objs:\n", 990 | " if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:\n", 991 | " cv2.rectangle(img[:,:,::-1], (obj['xmin'],obj['ymin']), (obj['xmax'],obj['ymax']), (255,0,0), 3)\n", 992 | " cv2.putText(img[:,:,::-1], obj['name'], \n", 993 | " (obj['xmin']+2, obj['ymin']+12), \n", 994 | " 0, 1.2e-3 * img.shape[0], \n", 995 | " (0,255,0), 2)\n", 996 | " \n", 997 | " x_batch[batch_count] = img\n", 998 | "\n", 999 | " # increase instance counter in current batch\n", 1000 | " batch_count += 1 \n", 1001 | " \n", 1002 | " total_count += 1\n", 1003 | " if total_count >= num_img:\n", 1004 | " total_count = 0\n", 1005 | " if self.shuffle: np.random.shuffle(self.images) \n", 1006 | "\n", 1007 | " if batch_count >= self.config['BATCH_SIZE']:\n", 1008 | " yield [x_batch, b_batch], y_batch\n", 1009 | " \n", 1010 | " x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'], self.config['IMAGE_W'], 3))\n", 1011 | " y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 5+self.config['CLASS'])) \n", 1012 | " \n", 1013 | " batch_count = 0\n", 1014 | "\n", 1015 | " def aug_image(self, train_instance, jitter):\n", 1016 | " image_name = train_instance['filename']\n", 1017 | " image = cv2.imread(image_name)\n", 1018 | " h, w, c = image.shape\n", 1019 | " \n", 1020 | " all_objs = copy.deepcopy(train_instance['object'])\n", 1021 | "\n", 1022 | " if jitter:\n", 1023 | " ### scale the image\n", 1024 | " scale = np.random.uniform() / 10. + 1.\n", 1025 | " image = cv2.resize(image, (0,0), fx = scale, fy = scale)\n", 1026 | "\n", 1027 | " ### translate the image\n", 1028 | " max_offx = (scale-1.) * w\n", 1029 | " max_offy = (scale-1.) 
* h\n", 1030 | " offx = int(np.random.uniform() * max_offx)\n", 1031 | " offy = int(np.random.uniform() * max_offy)\n", 1032 | " \n", 1033 | " image = image[offy : (offy + h), offx : (offx + w)]\n", 1034 | "\n", 1035 | " ### flip the image\n", 1036 | " flip = np.random.binomial(1, .5)\n", 1037 | " if flip > 0.5: image = cv2.flip(image, 1)\n", 1038 | " \n", 1039 | " image = self.aug_pipe.augment_image(image) \n", 1040 | " \n", 1041 | " # resize the image to standard size\n", 1042 | " image = cv2.resize(image, (self.config['IMAGE_H'], self.config['IMAGE_W']))\n", 1043 | " image = image[:,:,::-1]\n", 1044 | "\n", 1045 | " # fix object's position and size\n", 1046 | " for obj in all_objs:\n", 1047 | " for attr in ['xmin', 'xmax']:\n", 1048 | " if jitter: obj[attr] = int(obj[attr] * scale - offx)\n", 1049 | " \n", 1050 | " obj[attr] = int(obj[attr] * float(self.config['IMAGE_W']) / w)\n", 1051 | " obj[attr] = max(min(obj[attr], self.config['IMAGE_W']), 0)\n", 1052 | " \n", 1053 | " for attr in ['ymin', 'ymax']:\n", 1054 | " if jitter: obj[attr] = int(obj[attr] * scale - offy)\n", 1055 | " \n", 1056 | " obj[attr] = int(obj[attr] * float(self.config['IMAGE_H']) / h)\n", 1057 | " obj[attr] = max(min(obj[attr], self.config['IMAGE_H']), 0)\n", 1058 | "\n", 1059 | " if jitter and flip > 0.5:\n", 1060 | " xmin = obj['xmin']\n", 1061 | " obj['xmin'] = self.config['IMAGE_W'] - obj['xmax']\n", 1062 | " obj['xmax'] = self.config['IMAGE_W'] - xmin\n", 1063 | " \n", 1064 | " return image, all_objs\n", 1065 | "\n", 1066 | " def get_dateset_size(self):\n", 1067 | " return int(np.ceil(float(len(self.images))/self.config['BATCH_SIZE']))" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "code", 1072 | "execution_count": 44, 1073 | "metadata": {}, 1074 | "outputs": [ 1075 | { 1076 | "name": "stdout", 1077 | "output_type": "stream", 1078 | "text": [ 1079 | "CPU times: user 8 ms, sys: 0 ns, total: 8 ms\n", 1080 | "Wall time: 11.5 ms\n" 1081 | ] 1082 | } 1083 | ], 1084 | "source": [ 1085 | "%%time\n", 1086 | "train_batch = BatchGenerator(train_imgs, generator_config)" 1087 | ] 1088 | }, 1089 | { 1090 | "cell_type": "code", 1091 | "execution_count": 22, 1092 | "metadata": {}, 1093 | "outputs": [ 1094 | { 1095 | "name": "stdout", 1096 | "output_type": "stream", 1097 | "text": [ 1098 | "CPU times: user 12.4 s, sys: 2.45 s, total: 14.8 s\n", 1099 | "Wall time: 4min 50s\n" 1100 | ] 1101 | } 1102 | ], 1103 | "source": [ 1104 | "%%time\n", 1105 | "val_imgs, seen_val_labels = parse_annotation(val_annot_folder, val_image_folder, labels=LABELS)" 1106 | ] 1107 | }, 1108 | { 1109 | "cell_type": "code", 1110 | "execution_count": 45, 1111 | "metadata": {}, 1112 | "outputs": [ 1113 | { 1114 | "name": "stdout", 1115 | "output_type": "stream", 1116 | "text": [ 1117 | "CPU times: user 8 ms, sys: 0 ns, total: 8 ms\n", 1118 | "Wall time: 5.87 ms\n" 1119 | ] 1120 | } 1121 | ], 1122 | "source": [ 1123 | "%%time\n", 1124 | "valid_batch = BatchGenerator(val_imgs, generator_config, jitter=False)" 1125 | ] 1126 | }, 1127 | { 1128 | "cell_type": "markdown", 1129 | "metadata": {}, 1130 | "source": [ 1131 | "## Setup a few callbacks and start the training" 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "code", 1136 | "execution_count": 46, 1137 | "metadata": { 1138 | "collapsed": true 1139 | }, 1140 | "outputs": [], 1141 | "source": [ 1142 | "early_stop = callbacks.EarlyStopping(monitor='val_loss', \n", 1143 | " min_delta=0.001, \n", 1144 | " patience=3, \n", 1145 | " mode='min', \n", 1146 | " verbose=1)\n", 1147 | "\n", 1148 | "checkpoint = 
callbacks.ModelCheckpoint('weights_coco.h5', \n", 1149 | " monitor='val_loss', \n", 1150 | " verbose=1, \n", 1151 | " save_best_only=True, \n", 1152 | " mode='min', \n", 1153 | " period=1)" 1154 | ] 1155 | }, 1156 | { 1157 | "cell_type": "code", 1158 | "execution_count": 47, 1159 | "metadata": {}, 1160 | "outputs": [ 1161 | { 1162 | "ename": "OSError", 1163 | "evalue": "Unable to open file (unable to open file: name = 'weights_coco.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)", 1164 | "output_type": "error", 1165 | "traceback": [ 1166 | "---------------------------------------------------------------------------", 1167 | "OSError Traceback (most recent call last)", 1168 | "<ipython-input> in <module>()\n----> 1 model.load_weights('weights_coco.h5')\n", 1169 | "~/anaconda3/envs/dl/lib/python3.6/site-packages/keras/engine/topology.py in load_weights(self, filepath, by_name)\n 2564 if h5py is None:\n 2565 raise ImportError('`load_weights` requires h5py.')\n-> 2566 f = h5py.File(filepath, mode='r')\n 2567 if 'layer_names' not in f.attrs and 'model_weights' in f:\n 2568 f = f['model_weights']\n", 1170 | "~/anaconda3/envs/dl/lib/python3.6/site-packages/h5py/_hl/files.py in __init__(self, name, mode, driver, libver, userblock_size, swmr, **kwds)\n 267 with phil:\n 268 fapl = make_fapl(driver, libver, **kwds)\n--> 269 fid = make_fid(name, mode, userblock_size, fapl, swmr=swmr)\n 270 \n 271 if swmr_support:\n", 1171 | "~/anaconda3/envs/dl/lib/python3.6/site-packages/h5py/_hl/files.py in make_fid(name, mode, userblock_size, fapl, fcpl, swmr)\n 97 if swmr and swmr_support:\n 98 flags |= h5f.ACC_SWMR_READ\n---> 99 fid = h5f.open(name, flags, fapl=fapl)\n 100 elif mode == 'r+':\n 101 fid = h5f.open(name, h5f.ACC_RDWR, fapl=fapl)\n", 1172 | "h5py/_objects.pyx in h5py._objects.with_phil.wrapper()\n", 1173 | "h5py/_objects.pyx in h5py._objects.with_phil.wrapper()\n", 1174 | "h5py/h5f.pyx in h5py.h5f.open()\n", 1175 | "OSError: Unable to open file (unable to open file: name = 'weights_coco.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)" 1176 | ] 1177 | } 1178 | ], 1179 | "source": [ 1180 | "model.load_weights('weights_coco.h5')" 1181 | ] 1182 | }, 1183 | { 1184 | "cell_type": "code", 1185 | "execution_count": null, 1186 | "metadata": {}, 1187 | "outputs": [ 1188 | { 1189 | "name": "stdout", 1190 | "output_type": "stream", 1191 | "text": [ 1192 | "Epoch 1/100\n", 1193 | " 91/5120 [..............................] 
- ETA: 296822s - loss: 4.8495" 1194 | ] 1195 | } 1196 | ], 1197 | "source": [ 1198 | "tb_counter = len([log for log in os.listdir(os.path.expanduser('~/logs/')) if 'coco_' in log]) + 1\n", 1199 | "tensorboard = callbacks.TensorBoard(log_dir=os.path.expanduser('~/logs/') + 'coco_' + str(tb_counter), \n", 1200 | " histogram_freq=0, \n", 1201 | " write_graph=True, \n", 1202 | " write_images=False)\n", 1203 | "\n", 1204 | "optimizer = optimizers.Adam(lr=0.5e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)\n", 1205 | "#optimizer = SGD(lr=1e-4, decay=0.0005, momentum=0.9)\n", 1206 | "#optimizer = RMSprop(lr=1e-4, rho=0.9, epsilon=1e-08, decay=0.0)\n", 1207 | "\n", 1208 | "model.compile(loss=custom_loss, optimizer=optimizer)\n", 1209 | "\n", 1210 | "model.fit_generator(generator = train_batch.get_generator(), \n", 1211 | " steps_per_epoch = train_batch.get_dataset_size(), \n", 1212 | " epochs = 100, \n", 1213 | " verbose = 1,\n", 1214 | " validation_data = valid_batch.get_generator(),\n", 1215 | " validation_steps = valid_batch.get_dataset_size(),\n", 1216 | " callbacks = [early_stop, checkpoint, tensorboard], \n", 1217 | " max_queue_size = 3)" 1218 | ] 1219 | }, 1220 | { 1221 | "cell_type": "code", 1222 | "execution_count": null, 1223 | "metadata": { 1224 | "collapsed": true 1225 | }, 1226 | "outputs": [], 1227 | "source": [ 1228 | "%load_ext version_information\n", 1229 | "%version_information keras" 1230 | ] 1231 | }, 1232 | { 1233 | "cell_type": "code", 1234 | "execution_count": null, 1235 | "metadata": { 1236 | "collapsed": true 1237 | }, 1238 | "outputs": [], 1239 | "source": [] 1240 | } 1241 | ], 1242 | "metadata": { 1243 | "kernelspec": { 1244 | "display_name": "Python 3", 1245 | "language": "python", 1246 | "name": "python3" 1247 | }, 1248 | "language_info": { 1249 | "codemirror_mode": { 1250 | "name": "ipython", 1251 | "version": 3 1252 | }, 1253 | "file_extension": ".py", 1254 | "mimetype": "text/x-python", 1255 | "name": "python", 1256 | "nbconvert_exporter": "python", 1257 | "pygments_lexer": "ipython3", 1258 | "version": "3.6.2" 1259 | }, 1260 | "toc": { 1261 | "nav_menu": {}, 1262 | "number_sections": true, 1263 | "sideBar": true, 1264 | "skip_h1_title": false, 1265 | "toc_cell": true, 1266 | "toc_position": {}, 1267 | "toc_section_display": "block", 1268 | "toc_window_display": false 1269 | } 1270 | }, 1271 | "nbformat": 4, 1272 | "nbformat_minor": 2 1273 | } 1274 | -------------------------------------------------------------------------------- /coco2pascal.py: -------------------------------------------------------------------------------- 1 | import baker 2 | import json 3 | from path import Path as path 4 | from cytoolz import merge, join, groupby 5 | from cytoolz.compatibility import iteritems 6 | from cytoolz.curried import update_in 7 | from itertools import starmap 8 | from collections import deque 9 | from lxml import etree, objectify 10 | from scipy.io import savemat 11 | from scipy.ndimage import imread 12 | 13 | 14 | def keyjoin(leftkey, leftseq, rightkey, rightseq): 15 | return starmap(merge, join(leftkey, leftseq, rightkey, rightseq)) 16 | 17 | 18 | def root(folder, filename, width, height): 19 | E = objectify.ElementMaker(annotate=False) 20 | return E.annotation( 21 | E.folder(folder), 22 | E.filename(filename), 23 | E.source( 24 | E.database('MS COCO 2014'), 25 | E.annotation('MS COCO 2014'), 26 | E.image('Flickr'), 27 | ), 28 | E.size( 29 | E.width(width), 30 | E.height(height), 31 | E.depth(3), 32 | ), 33 | E.segmented(0) 34 | ) 35 | 36 | 37 | def 
instance_to_xml(anno): 38 | E = objectify.ElementMaker(annotate=False) 39 | xmin, ymin, width, height = anno['bbox'] 40 | return E.object( 41 | E.name(anno['category_id']), 42 | E.bndbox( 43 | E.xmin(xmin), 44 | E.ymin(ymin), 45 | E.xmax(xmin+width), 46 | E.ymax(ymin+height), 47 | ), 48 | ) 49 | 50 | 51 | @baker.command 52 | def write_categories(coco_annotation, dst): 53 | content = json.loads(path(coco_annotation).expand().text()) 54 | categories = tuple( d['name'] for d in content['categories']) 55 | savemat(path(dst).expand(), {'categories': categories}) 56 | 57 | 58 | def get_instances(coco_annotation): 59 | coco_annotation = path(coco_annotation).expand() 60 | content = json.loads(coco_annotation.text()) 61 | categories = {d['id']: d['name'] for d in content['categories']} 62 | return categories, tuple(keyjoin('id', content['images'], 'image_id', content['annotations'])) 63 | 64 | def rename(name, year=2014): 65 | out_name = path(name).stripext() 66 | # out_name = out_name.split('_')[-1] 67 | # out_name = '{}_{}'.format(year, out_name) 68 | return out_name 69 | 70 | 71 | @baker.command 72 | def create_imageset(annotations, dst): 73 | annotations = path(annotations).expand() 74 | dst = path(dst).expand() 75 | val_txt = dst / 'val.txt' 76 | train_txt = dst / 'train.txt' 77 | 78 | for val in annotations.listdir('*val*'): 79 | val_txt.write_text('{}\n'.format(val.basename().stripext()), append=True) 80 | 81 | for train in annotations.listdir('*train*'): 82 | train_txt.write_text('{}\n'.format(train.basename().stripext()), append=True) 83 | 84 | @baker.command 85 | def create_annotations(dbpath, subset, dst): 86 | annotations_path = path(dbpath).expand() / 'annotations/instances_{}2014.json'.format(subset) 87 | images_path = path(dbpath).expand() / 'images/{}2014'.format(subset) 88 | categories, instances = get_instances(annotations_path) 89 | dst = path(dst).expand() 90 | 91 | for i, instance in enumerate(instances): 92 | instances[i]['category_id'] = categories[instance['category_id']] 93 | 94 | for name, group in iteritems(groupby('file_name', instances)): 95 | img = imread(images_path / name) 96 | if img.ndim == 3: 97 | out_name = rename(name) 98 | annotation = root('VOC2014', '{}.jpg'.format(out_name), 99 | group[0]['width'], group[0]['height']) 100 | for instance in group: 101 | annotation.append(instance_to_xml(instance)) 102 | etree.ElementTree(annotation).write(dst / '{}.xml'.format(out_name)) 103 | print(out_name) 104 | else: 105 | print(name) 106 | 107 | 108 | 109 | 110 | 111 | if __name__ == '__main__': 112 | baker.run() 113 | -------------------------------------------------------------------------------- /images/custom-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/images/custom-loss.png -------------------------------------------------------------------------------- /images/custom-loss2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/images/custom-loss2.png -------------------------------------------------------------------------------- /images/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/images/model.png
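
Usage sketch for coco2pascal.py: the @baker.command functions are exposed both as plain Python functions and as command-line subcommands (baker dispatches 'python coco2pascal.py <command> <args>'). The snippet below calls them directly; the ~/data/coco paths are hypothetical placeholders, and it assumes the MS COCO 2014 layout that create_annotations expects (annotations/instances_train2014.json and images/train2014/).

from coco2pascal import create_annotations, create_imageset, write_categories

# Convert the COCO train2014 instance annotations into one Pascal VOC XML file per image.
create_annotations('~/data/coco', 'train', '~/data/coco/pascal/annotations')

# Build train.txt / val.txt image lists from the generated XML files.
create_imageset('~/data/coco/pascal/annotations', '~/data/coco/pascal')

# Dump the category names to a .mat file for later use.
write_categories('~/data/coco/annotations/instances_train2014.json', '~/data/coco/categories.mat')
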
-------------------------------------------------------------------------------- /model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/model.png -------------------------------------------------------------------------------- /preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import copy 4 | import numpy as np 5 | import imgaug as ia 6 | from imgaug import augmenters as iaa 7 | import xml.etree.ElementTree as ET 8 | from utils import BoundBox, normalize, bbox_iou 9 | 10 | def parse_annotation(ann_dir, img_dir, labels=[]): 11 | all_imgs = [] 12 | seen_labels = set() 13 | 14 | for ann in sorted(os.listdir(ann_dir)): 15 | img = {'object':[]} 16 | 17 | tree = ET.parse(ann_dir + ann) 18 | 19 | for elem in tree.iter(): 20 | if 'filename' in elem.tag: 21 | all_imgs += [img] 22 | img['filename'] = img_dir + elem.text 23 | if 'width' in elem.tag: 24 | img['width'] = int(elem.text) 25 | if 'height' in elem.tag: 26 | img['height'] = int(elem.text) 27 | if 'object' in elem.tag or 'part' in elem.tag: 28 | obj = {} 29 | 30 | for attr in list(elem): 31 | if 'name' in attr.tag: 32 | obj['name'] = attr.text 33 | seen_labels.add(obj['name']) 34 | 35 | if len(labels) > 0 and obj['name'] not in labels: 36 | break 37 | else: 38 | img['object'] += [obj] 39 | 40 | if 'bndbox' in attr.tag: 41 | for dim in list(attr): 42 | if 'xmin' in dim.tag: 43 | obj['xmin'] = int(round(float(dim.text))) 44 | if 'ymin' in dim.tag: 45 | obj['ymin'] = int(round(float(dim.text))) 46 | if 'xmax' in dim.tag: 47 | obj['xmax'] = int(round(float(dim.text))) 48 | if 'ymax' in dim.tag: 49 | obj['ymax'] = int(round(float(dim.text))) 50 | 51 | return all_imgs, seen_labels 52 | 53 | class BatchGenerator: 54 | def __init__(self, images, 55 | config, 56 | shuffle=True, 57 | jitter=True, 58 | norm=True): 59 | 60 | self.images = images 61 | self.config = config 62 | 63 | self.shuffle = shuffle 64 | self.jitter = jitter 65 | self.norm = norm 66 | 67 | self.anchors = [BoundBox(0, 0, config['ANCHORS'][2*i], config['ANCHORS'][2*i+1]) for i in range(int(len(config['ANCHORS'])/2))] 68 | 69 | ### augmentors by https://github.com/aleju/imgaug 70 | sometimes = lambda aug: iaa.Sometimes(0.5, aug) 71 | 72 | # Define our sequence of augmentation steps that will be applied to every image 73 | # All augmenters with per_channel=0.5 will sample one value _per image_ 74 | # in 50% of all cases. In all other cases they will sample new values 75 | # _per channel_. 
76 | self.aug_pipe = iaa.Sequential( 77 | [ 78 | # apply the following augmenters to most images 79 | #iaa.Fliplr(0.5), # horizontally flip 50% of all images 80 | #iaa.Flipud(0.2), # vertically flip 20% of all images 81 | #sometimes(iaa.Crop(percent=(0, 0.1))), # crop images by 0-10% of their height/width 82 | sometimes(iaa.Affine( 83 | #scale={"x": (0.8, 1.2), "y": (0.8, 1.2)}, # scale images to 80-120% of their size, individually per axis 84 | #translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)}, # translate by -20 to +20 percent (per axis) 85 | #rotate=(-5, 5), # rotate by -5 to +5 degrees 86 | #shear=(-5, 5), # shear by -5 to +5 degrees 87 | #order=[0, 1], # use nearest neighbour or bilinear interpolation (fast) 88 | #cval=(0, 255), # if mode is constant, use a cval between 0 and 255 89 | #mode=ia.ALL # use any of scikit-image's warping modes (see 2nd image from the top for examples) 90 | )), 91 | # execute 0 to 5 of the following (less important) augmenters per image 92 | # don't execute all of them, as that would often be way too strong 93 | iaa.SomeOf((0, 5), 94 | [ 95 | #sometimes(iaa.Superpixels(p_replace=(0, 1.0), n_segments=(20, 200))), # convert images into their superpixel representation 96 | iaa.OneOf([ 97 | iaa.GaussianBlur((0, 3.0)), # blur images with a sigma between 0 and 3.0 98 | iaa.AverageBlur(k=(2, 7)), # blur image using local means with kernel sizes between 2 and 7 99 | iaa.MedianBlur(k=(3, 11)), # blur image using local medians with kernel sizes between 3 and 11 100 | ]), 101 | iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)), # sharpen images 102 | #iaa.Emboss(alpha=(0, 1.0), strength=(0, 2.0)), # emboss images 103 | # search either for all edges or for directed edges 104 | #sometimes(iaa.OneOf([ 105 | # iaa.EdgeDetect(alpha=(0, 0.7)), 106 | # iaa.DirectedEdgeDetect(alpha=(0, 0.7), direction=(0.0, 1.0)), 107 | #])), 108 | iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5), # add gaussian noise to images 109 | iaa.OneOf([ 110 | iaa.Dropout((0.01, 0.1), per_channel=0.5), # randomly remove up to 10% of the pixels 111 | #iaa.CoarseDropout((0.03, 0.15), size_percent=(0.02, 0.05), per_channel=0.2), 112 | ]), 113 | #iaa.Invert(0.05, per_channel=True), # invert color channels 114 | iaa.Add((-10, 10), per_channel=0.5), # change brightness of images (by -10 to 10 of original value) 115 | iaa.Multiply((0.5, 1.5), per_channel=0.5), # change brightness of images (50-150% of original value) 116 | iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5), # improve or worsen the contrast 117 | #iaa.Grayscale(alpha=(0.0, 1.0)), 118 | #sometimes(iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25)), # move pixels locally around (with random strengths) 119 | #sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05))) # sometimes move parts of the image around 120 | ], 121 | random_order=True 122 | ) 123 | ], 124 | random_order=True 125 | ) 126 | 127 | if shuffle: np.random.shuffle(self.images) 128 | 129 | def get_generator(self): 130 | num_img = len(self.images) 131 | 132 | total_count = 0 133 | batch_count = 0 134 | 135 | x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'], self.config['IMAGE_W'], 3)) # input images 136 | b_batch = np.zeros((self.config['BATCH_SIZE'], 1 , 1 , 1 , self.config['TRUE_BOX_BUFFER'], 4)) # list of self.config['TRUE_BOX_BUFFER'] GT boxes 137 | y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 4+1+1)) # desired network output 138 | 139 | 
while True: 140 | if total_count < num_img: 141 | train_instance = self.images[total_count] 142 | 143 | # augment input image and fix object's position and size 144 | img, all_objs = self.aug_image(train_instance, jitter=self.jitter) 145 | 146 | # construct output from object's x, y, w, h 147 | true_box_index = 0 148 | 149 | for obj in all_objs: 150 | if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in self.config['LABELS']: 151 | center_x = .5*(obj['xmin'] + obj['xmax']) 152 | center_x = center_x / (float(self.config['IMAGE_W']) / self.config['GRID_W']) 153 | center_y = .5*(obj['ymin'] + obj['ymax']) 154 | center_y = center_y / (float(self.config['IMAGE_H']) / self.config['GRID_H']) 155 | 156 | grid_x = int(np.floor(center_x)) 157 | grid_y = int(np.floor(center_y)) 158 | 159 | if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']: 160 | obj_indx = self.config['LABELS'].index(obj['name']) 161 | 162 | center_w = (obj['xmax'] - obj['xmin']) / (float(self.config['IMAGE_W']) / self.config['GRID_W']) # unit: grid cell 163 | center_h = (obj['ymax'] - obj['ymin']) / (float(self.config['IMAGE_H']) / self.config['GRID_H']) # unit: grid cell 164 | 165 | box = [center_x, center_y, center_w, center_h] 166 | 167 | # find the anchor that best predicts this box 168 | best_anchor = -1 169 | max_iou = -1 170 | 171 | shifted_box = BoundBox(0, 172 | 0, 173 | center_w, 174 | center_h) 175 | 176 | for i in range(len(self.anchors)): 177 | anchor = self.anchors[i] 178 | iou = bbox_iou(shifted_box, anchor) 179 | 180 | if max_iou < iou: 181 | best_anchor = i 182 | max_iou = iou 183 | 184 | # assign ground truth x, y, w, h, confidence and class probs to y_batch 185 | y_batch[batch_count, grid_y, grid_x, best_anchor, 0:4] = box 186 | y_batch[batch_count, grid_y, grid_x, best_anchor, 4 ] = 1. 
187 | y_batch[batch_count, grid_y, grid_x, best_anchor, 5 ] = obj_indx 188 | 189 | # assign the true box to b_batch 190 | b_batch[batch_count, 0, 0, 0, true_box_index] = box 191 | 192 | true_box_index += 1 193 | true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER'] 194 | 195 | # assign input image to x_batch 196 | if self.norm: 197 | x_batch[batch_count] = normalize(img) 198 | else: 199 | # plot image and bounding boxes for sanity check (draw on img itself; drawing on a reversed view would be lost) 200 | for obj in all_objs: 201 | if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']: 202 | cv2.rectangle(img, (obj['xmin'],obj['ymin']), (obj['xmax'],obj['ymax']), (255,0,0), 3) 203 | cv2.putText(img, obj['name'], 204 | (obj['xmin']+2, obj['ymin']+12), 205 | 0, 1.2e-3 * img.shape[0], 206 | (0,255,0), 2) 207 | 208 | x_batch[batch_count] = img 209 | 210 | # increase instance counter in current batch 211 | batch_count += 1 212 | 213 | total_count += 1 214 | if total_count >= num_img: 215 | total_count = 0 216 | if self.shuffle: np.random.shuffle(self.images) 217 | 218 | if batch_count >= self.config['BATCH_SIZE']: 219 | yield [x_batch, b_batch], y_batch 220 | 221 | x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'], self.config['IMAGE_W'], 3)) 222 | y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 4+1+1)) # must match the allocation above 223 | 224 | batch_count = 0 225 | 226 | def aug_image(self, train_instance, jitter): 227 | image_name = train_instance['filename'] 228 | image = cv2.imread(image_name) 229 | h, w, c = image.shape 230 | 231 | all_objs = copy.deepcopy(train_instance['object']) 232 | 233 | if jitter: 234 | ### scale the image 235 | scale = np.random.uniform() / 10. + 1. 236 | image = cv2.resize(image, (0,0), fx = scale, fy = scale) 237 | 238 | ### translate the image 239 | max_offx = (scale-1.) * w 240 | max_offy = (scale-1.) 
* h 241 | offx = int(np.random.uniform() * max_offx) 242 | offy = int(np.random.uniform() * max_offy) 243 | 244 | image = image[offy : (offy + h), offx : (offx + w)] 245 | 246 | ### flip the image 247 | flip = np.random.binomial(1, .5) 248 | if flip > 0.5: image = cv2.flip(image, 1) 249 | 250 | image = self.aug_pipe.augment_image(image) 251 | 252 | # resize the image to standard size (cv2.resize takes (width, height)) 253 | image = cv2.resize(image, (self.config['IMAGE_W'], self.config['IMAGE_H'])) 254 | image = image[:,:,::-1] 255 | 256 | # fix object's position and size 257 | for obj in all_objs: 258 | for attr in ['xmin', 'xmax']: 259 | if jitter: obj[attr] = int(obj[attr] * scale - offx) 260 | 261 | obj[attr] = int(obj[attr] * float(self.config['IMAGE_W']) / w) 262 | obj[attr] = max(min(obj[attr], self.config['IMAGE_W']), 0) 263 | 264 | for attr in ['ymin', 'ymax']: 265 | if jitter: obj[attr] = int(obj[attr] * scale - offy) 266 | 267 | obj[attr] = int(obj[attr] * float(self.config['IMAGE_H']) / h) 268 | obj[attr] = max(min(obj[attr], self.config['IMAGE_H']), 0) 269 | 270 | if jitter and flip > 0.5: 271 | xmin = obj['xmin'] 272 | obj['xmin'] = self.config['IMAGE_W'] - obj['xmax'] 273 | obj['xmax'] = self.config['IMAGE_W'] - xmin 274 | 275 | return image, all_objs 276 | 277 | def get_dataset_size(self): 278 | return int(np.ceil(float(len(self.images))/self.config['BATCH_SIZE'])) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import xml.etree.ElementTree as ET 4 | import tensorflow as tf 5 | import copy 6 | import cv2 7 | 8 | class BoundBox: 9 | def __init__(self, x, y, w, h, c = None, classes = None): 10 | self.x = x 11 | self.y = y 12 | self.w = w 13 | self.h = h 14 | 15 | self.c = c 16 | self.classes = classes 17 | 18 | self.label = -1 19 | self.score = -1 20 | 21 | def get_label(self): 22 | if self.label == -1: 23 | self.label = np.argmax(self.classes) 24 | 25 | return self.label 26 | 27 | def get_score(self): 28 | if self.score == -1: 29 | self.score = self.classes[self.get_label()] 30 | 31 | return self.score 32 | 33 | class WeightReader: 34 | def __init__(self, weight_file): 35 | self.offset = 4 36 | self.all_weights = np.fromfile(weight_file, dtype='float32') 37 | 38 | def read_bytes(self, size): 39 | self.offset = self.offset + size 40 | return self.all_weights[self.offset-size:self.offset] 41 | 42 | def reset(self): 43 | self.offset = 4 44 | 45 | def normalize(image): 46 | image = image / 255. 
47 | 48 | return image 49 | 50 | def bbox_iou(box1, box2): 51 | x1_min = box1.x - box1.w/2 52 | x1_max = box1.x + box1.w/2 53 | y1_min = box1.y - box1.h/2 54 | y1_max = box1.y + box1.h/2 55 | 56 | x2_min = box2.x - box2.w/2 57 | x2_max = box2.x + box2.w/2 58 | y2_min = box2.y - box2.h/2 59 | y2_max = box2.y + box2.h/2 60 | 61 | intersect_w = interval_overlap([x1_min, x1_max], [x2_min, x2_max]) 62 | intersect_h = interval_overlap([y1_min, y1_max], [y2_min, y2_max]) 63 | 64 | intersect = intersect_w * intersect_h 65 | 66 | union = box1.w * box1.h + box2.w * box2.h - intersect 67 | 68 | return float(intersect) / union 69 | 70 | def interval_overlap(interval_a, interval_b): 71 | x1, x2 = interval_a 72 | x3, x4 = interval_b 73 | 74 | if x3 < x1: 75 | if x4 < x1: 76 | return 0 77 | else: 78 | return min(x2,x4) - x1 79 | else: 80 | if x2 < x3: 81 | return 0 82 | else: 83 | return min(x2,x4) - x3 84 | 85 | def draw_boxes(image, boxes, labels): 86 | 87 | for box in boxes: 88 | xmin = int((box.x - box.w/2) * image.shape[1]) 89 | xmax = int((box.x + box.w/2) * image.shape[1]) 90 | ymin = int((box.y - box.h/2) * image.shape[0]) 91 | ymax = int((box.y + box.h/2) * image.shape[0]) 92 | 93 | cv2.rectangle(image, (xmin,ymin), (xmax,ymax), (0,255,0), 3) 94 | cv2.putText(image, 95 | labels[box.get_label()] + ' ' + str(box.get_score()), 96 | (xmin, ymin - 13), 97 | cv2.FONT_HERSHEY_SIMPLEX, 98 | 1e-3 * image.shape[0], 99 | (0,255,0), 2) 100 | 101 | return image 102 | 103 | def decode_netout(netout, obj_threshold, nms_threshold, anchors, nb_class): 104 | grid_h, grid_w, nb_box = netout.shape[:3] 105 | 106 | boxes = [] 107 | 108 | # decode the output by the network 109 | netout[..., 4] = sigmoid(netout[..., 4]) 110 | netout[..., 5:] = netout[..., 4][..., np.newaxis] * softmax(netout[..., 5:]) 111 | netout[..., 5:] *= netout[..., 5:] > obj_threshold 112 | 113 | for row in range(grid_h): 114 | for col in range(grid_w): 115 | for b in range(nb_box): 116 | # element 4 is the confidence; elements 5 onwards are the class probabilities 117 | classes = netout[row,col,b,5:] 118 | 119 | if np.sum(classes) > 0: 120 | # first 4 elements are x, y, w, and h 121 | x, y, w, h = netout[row,col,b,:4] 122 | 123 | x = (col + sigmoid(x)) / grid_w # center position, unit: image width 124 | y = (row + sigmoid(y)) / grid_h # center position, unit: image height 125 | w = anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width 126 | h = anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height 127 | confidence = netout[row,col,b,4] 128 | 129 | box = BoundBox(x, y, w, h, confidence, classes) 130 | 131 | boxes.append(box) 132 | 133 | # suppress non-maximal boxes 134 | for c in range(nb_class): 135 | sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes]))) 136 | 137 | for i in range(len(sorted_indices)): 138 | index_i = sorted_indices[i] 139 | 140 | if boxes[index_i].classes[c] == 0: 141 | continue 142 | else: 143 | for j in range(i+1, len(sorted_indices)): 144 | index_j = sorted_indices[j] 145 | 146 | if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_threshold: 147 | boxes[index_j].classes[c] = 0 148 | 149 | # remove the boxes which are less likely than the obj_threshold 150 | boxes = [box for box in boxes if box.get_score() > obj_threshold] 151 | 152 | return boxes 153 | 154 | def sigmoid(x): 155 | return 1. / (1. 
+ np.exp(-x)) 156 | 157 | def softmax(x, axis=-1, t=-100.): 158 | x = x - np.max(x) 159 | 160 | if np.min(x) < t: 161 | x = x/np.min(x)*t 162 | 163 | e_x = np.exp(x) 164 | 165 | return e_x / e_x.sum(axis, keepdims=True) --------------------------------------------------------------------------------
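
A minimal sanity-check sketch for the helpers in utils.py. The IoU value follows directly from bbox_iou's center/width/height convention; the anchor values are the usual YOLOv2 COCO defaults, and the 13x13 grid with 5 anchors and 80 classes is an assumed configuration for illustration, not something utils.py itself fixes.

import numpy as np
from utils import BoundBox, bbox_iou, decode_netout

# Two unit squares whose centers are half a width apart:
# intersection = 0.5, union = 1 + 1 - 0.5 = 1.5, so IoU = 1/3.
a = BoundBox(0.0, 0.0, 1.0, 1.0)
b = BoundBox(0.5, 0.0, 1.0, 1.0)
print(bbox_iou(a, b))  # ~0.3333

# decode_netout expects the raw network output reshaped to
# (GRID_H, GRID_W, BOX, 4 + 1 + CLASS); random values stand in for real predictions here.
anchors = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843,
           5.47434, 7.88282, 3.52778, 9.77052, 9.16828]
netout = np.random.randn(13, 13, 5, 4 + 1 + 80)
boxes = decode_netout(netout, obj_threshold=0.3, nms_threshold=0.3,
                      anchors=anchors, nb_class=80)
print(len(boxes))  # surviving boxes, with x, y, w, h relative to the image, ready for draw_boxes
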