├── .gitignore
├── LICENSE
├── README.md
├── __init__.py
├── barebone-yolo.ipynb
├── coco2pascal.py
├── images
│   ├── custom-loss.png
│   ├── custom-loss2.png
│   └── model.png
├── model.png
├── preprocessing.py
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Anderson Banihirwe
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Keras YOLO Series
2 | Keras implementation of YOLO (You Only Look Once): Unified, Real-Time Object Detection
3 |
4 | This is a [Keras](https://keras.io/)
5 | implementation of YOLO and YOLOv2.
6 | This project is mainly based on [darkflow](https://github.com/thtrieu/darkflow)
7 | and [darknet](https://github.com/pjreddie/darknet).
8 |
9 | For details about YOLO and YOLOv2, please refer to the [project page](https://pjreddie.com/darknet/yolo/)
10 | and the [paper](https://arxiv.org/abs/1612.08242),
11 | *YOLO9000: Better, Faster, Stronger* by Joseph Redmon and Ali Farhadi.
12 |
13 |
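14 | ## Quick start (sketch)
15 |
16 | A minimal sketch of how the pieces in `barebone-yolo.ipynb` fit together, assuming the
17 | pretrained darknet weights sit at `weights/yolo.weights`. The dataset paths, the optimizer,
18 | and the epoch count below are placeholders, and `LABELS`, `generator_config`, `yolo()`,
19 | `custom_loss`, `early_stop` and `checkpoint` refer to the definitions in the notebook:
20 |
21 | ```python
22 | from preprocessing import parse_annotation, BatchGenerator
23 | from utils import WeightReader
24 |
25 | # parse Pascal VOC-style annotations (coco2pascal.py converts COCO annotations to this format)
26 | train_imgs, _ = parse_annotation('path/to/train/annotations/', 'path/to/train/images/', labels=LABELS)
27 | val_imgs, _ = parse_annotation('path/to/val/annotations/', 'path/to/val/images/', labels=LABELS)
28 |
29 | train_batch = BatchGenerator(train_imgs, generator_config)
30 | valid_batch = BatchGenerator(val_imgs, generator_config, jitter=False)
31 |
32 | # build the 23-layer YOLOv2 graph and load the pretrained darknet weights
33 | model = yolo()
34 | weight_reader = WeightReader('weights/yolo.weights')
35 | # ... then copy the weights into the model layer by layer as shown in the notebook ...
36 |
37 | # compile with the custom YOLO loss and train from the generators
38 | model.compile(loss=custom_loss, optimizer='adam')
39 | model.fit_generator(train_batch.get_generator(),
40 |                     steps_per_epoch=train_batch.get_dataset_size(),
41 |                     epochs=10,
42 |                     validation_data=valid_batch.get_generator(),
43 |                     validation_steps=valid_batch.get_dataset_size(),
44 |                     callbacks=[early_stop, checkpoint])
45 | ```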
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/__init__.py
--------------------------------------------------------------------------------
/barebone-yolo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "toc": "true"
7 | },
8 | "source": [
9 | " # Table of Contents\n",
10 | "
"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "# YOLO"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "## Import packages"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 35,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "from keras import models\n",
34 | "from keras import layers\n",
35 | "from keras import callbacks\n",
36 | "from keras import optimizers\n",
37 | "from keras.utils.vis_utils import plot_model\n",
38 | "import keras.backend as K\n",
39 | "import tensorflow as tf\n",
40 | "%matplotlib inline\n",
41 | "import matplotlib.pyplot as plt\n",
42 | "import matplotlib\n",
43 | "matplotlib.style.use('seaborn')\n",
44 | "import numpy as np\n",
45 | "import os\n",
46 | "import cv2\n",
47 | "import imgaug as ia\n",
48 | "from imgaug import augmenters as iaa\n",
49 | "from preprocessing import parse_annotation, BatchGenerator\n",
50 | "from utils import WeightReader, decode_netout, draw_boxes"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "## Define and initialize global variables"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 2,
63 | "metadata": {
64 | "collapsed": true
65 | },
66 | "outputs": [],
67 | "source": [
68 | "LABELS = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']\n",
69 | "\n",
70 | "IMAGE_H, IMAGE_W = 416, 416\n",
71 | "GRID_H, GRID_W = 13 , 13\n",
72 | "BOX = 5\n",
73 | "CLASS = len(LABELS)\n",
74 | "CLASS_WEIGHTS = np.ones(CLASS, dtype='float32')\n",
75 | "OBJ_THRESHOLD = 0.3#0.5\n",
76 | "NMS_THRESHOLD = 0.3#0.45\n",
77 | "ANCHORS = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828]\n",
78 | "\n",
79 | "NO_OBJECT_SCALE = 1.0\n",
80 | "OBJECT_SCALE = 5.0\n",
81 | "COORD_SCALE = 1.0\n",
82 | "CLASS_SCALE = 1.0\n",
83 | "\n",
84 | "BATCH_SIZE = 16\n",
85 | "WARM_UP_BATCHES = 0\n",
86 | "TRUE_BOX_BUFFER = 50\n",
87 | "\n",
88 | "\n",
89 | "ALPHA = 0.1"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 3,
95 | "metadata": {
96 | "collapsed": true
97 | },
98 | "outputs": [],
99 | "source": [
100 | "pre_trained_weights='weights/yolo.weights'\n",
101 | "train_image_folder = '/home/abanihi/Documents/deep-data/coco/images/train2014/'\n",
102 | "train_annot_folder = '/home/abanihi/Documents/deep-data/coco/train2014ann/'\n",
103 | "val_image_folder = '/home/abanihi/Documents/deep-data/coco/images/val2014/'\n",
104 | "val_annot_folder = '/home/abanihi/Documents/deep-data/coco/val2014ann/'"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "## Construct the Network"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 4,
117 | "metadata": {
118 | "collapsed": true
119 | },
120 | "outputs": [],
121 | "source": [
122 | "# the function to implement the orgnization layer (thanks to github.com/allanzelener/YAD2K)\n",
123 | "def space_to_depth_x2(x):\n",
124 | " return tf.space_to_depth(x, block_size=2)"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 5,
130 | "metadata": {
131 | "collapsed": true
132 | },
133 | "outputs": [],
134 | "source": [
135 | "input_image = layers.Input(shape=(IMAGE_H, IMAGE_W, 3))\n",
136 | "true_boxes = layers.Input(shape=(1, 1, 1, TRUE_BOX_BUFFER , 4))"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 6,
142 | "metadata": {
143 | "collapsed": true
144 | },
145 | "outputs": [],
146 | "source": [
147 | "def yolo():\n",
148 | " \n",
149 | " \n",
150 | " # Layer 1\n",
151 | " x = layers.Conv2D(32, (3, 3), strides=(1, 1), \n",
152 | " padding='same', name='conv_1', use_bias=False)(input_image)\n",
153 | " x = layers.BatchNormalization(name='norm_1')(x)\n",
154 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
155 | " x = layers.MaxPool2D(pool_size=(2,2))(x)\n",
156 | " \n",
157 | " # Layer 2\n",
158 | " x = layers.Conv2D(64, (3, 3), strides=(1, 1), padding='same', name='conv_2', use_bias=False)(x)\n",
159 | " x = layers.BatchNormalization(name='norm_2')(x)\n",
160 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
161 | " x = layers.MaxPooling2D(pool_size=(2, 2))(x)\n",
162 | " \n",
163 | " \n",
164 | " # Layer 3\n",
165 | " x = layers.Conv2D(128, (3, 3), strides=(1, 1), padding='same', name='conv_3', use_bias=False)(x)\n",
166 | " x = layers.BatchNormalization(name='norm_3')(x)\n",
167 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
168 | " \n",
169 | " # Layer 4 \n",
170 | " x = layers.Conv2D(64, (1, 1), strides=(1, 1), padding='same', name='conv_4', use_bias=False)(x)\n",
171 | " x = layers.BatchNormalization(name='norm_4')(x)\n",
172 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
173 | " \n",
174 | " # Layer 5\n",
175 | " x = layers.Conv2D(128, (3, 3), strides=(1, 1), padding='same', name='conv_5', use_bias=False)(x)\n",
176 | " x = layers.BatchNormalization(name='norm_5')(x)\n",
177 | " x= layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
178 | " x = layers.MaxPooling2D(pool_size=(2, 2))(x)\n",
179 | " \n",
180 | " # Layer 6\n",
181 | " x = layers.Conv2D(256, (3, 3), strides=(1, 1), padding='same', name='conv_6', use_bias=False)(x)\n",
182 | " x = layers.BatchNormalization(name='norm_6')(x)\n",
183 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
184 | " \n",
185 | " \n",
186 | " # Layer 7\n",
187 | " x = layers.Conv2D(128, (1, 1), strides=(1, 1), padding='same', name='conv_7', use_bias=False)(x)\n",
188 | " x= layers.BatchNormalization(name='norm_7')(x)\n",
189 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
190 | " \n",
191 | " # Layer 8\n",
192 | " x = layers.Conv2D(256, (3, 3), strides=(1, 1), padding='same', name='conv_8', use_bias=False)(x)\n",
193 | " x = layers.BatchNormalization(name='norm_8')(x)\n",
194 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
195 | " x = layers.MaxPooling2D(pool_size=(2, 2))(x)\n",
196 | " \n",
197 | " # Layer 9\n",
198 | " x = layers.Conv2D(512, (3, 3), strides=(1, 1), padding='same', name='conv_9', use_bias=False)(x)\n",
199 | " x = layers.BatchNormalization(name='norm_9')(x)\n",
200 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
201 | " \n",
202 | " # Layer 10\n",
203 | " x = layers.Conv2D(256, (1, 1), strides=(1, 1), padding='same', name='conv_10', use_bias=False)(x)\n",
204 | " x = layers.BatchNormalization(name='norm_10')(x)\n",
205 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
206 | " \n",
207 | " # Layer 11\n",
208 | " x = layers.Conv2D(512, (3, 3), strides=(1, 1), padding='same', name='conv_11', use_bias=False)(x)\n",
209 | " x = layers.BatchNormalization(name='norm_11')(x)\n",
210 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
211 | " \n",
212 | " \n",
213 | " # Layer 12\n",
214 | " x = layers.Conv2D(256, (1, 1), strides=(1, 1), padding='same', name='conv_12', use_bias=False)(x)\n",
215 | " x = layers.BatchNormalization(name='norm_12')(x)\n",
216 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
217 | " \n",
218 | " # Layer 13\n",
219 | " x = layers.Conv2D(512, (3, 3), strides=(1, 1), padding='same', name='conv_13', use_bias=False)(x)\n",
220 | " x = layers.BatchNormalization(name='norm_13')(x)\n",
221 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
222 | " \n",
223 | " \n",
224 | " skip_connection = x\n",
225 | " \n",
226 | " x = layers.MaxPool2D(pool_size=(2, 2))(x)\n",
227 | " \n",
228 | " # Layer 14\n",
229 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_14', use_bias=False)(x)\n",
230 | " x = layers.BatchNormalization(name='norm_14')(x)\n",
231 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
232 | " \n",
233 | " # Layer 15\n",
234 | " x = layers.Conv2D(512, (1, 1), strides=(1, 1), padding='same', name='conv_15', use_bias=False)(x)\n",
235 | " x = layers.BatchNormalization(name='norm_15')(x)\n",
236 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
237 | " \n",
238 | " # Layer 16\n",
239 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_16', use_bias=False)(x)\n",
240 | " x = layers.BatchNormalization(name='norm_16')(x)\n",
241 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
242 | " \n",
243 | " # Layer 17\n",
244 | " x = layers.Conv2D(512, (1, 1), strides=(1, 1), padding='same', name='conv_17', use_bias=False)(x)\n",
245 | " x = layers.BatchNormalization(name='norm_17')(x)\n",
246 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
247 | " \n",
248 | " # Layer 18\n",
249 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_18', use_bias=False)(x)\n",
250 | " x = layers.BatchNormalization(name='norm_18')(x)\n",
251 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
252 | " \n",
253 | " # Layer 19\n",
254 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_19', use_bias=False)(x)\n",
255 | " x = layers.BatchNormalization(name='norm_19')(x)\n",
256 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
257 | " \n",
258 | " # Layer 20\n",
259 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_20', use_bias=False)(x)\n",
260 | " x = layers.BatchNormalization(name='norm_20')(x)\n",
261 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
262 | " \n",
263 | " \n",
264 | " # Layer 21\n",
265 | " skip_connection = layers.Conv2D(64, (1, 1), strides=(1, 1), \n",
266 | " padding='same', name='conv_21', use_bias=False)(skip_connection)\n",
267 | " skip_connection = layers.BatchNormalization(name='norm_21')(skip_connection)\n",
268 | " skip_connection = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(skip_connection)\n",
269 | " skip_connection = layers.Lambda(space_to_depth_x2)(skip_connection)\n",
270 | " \n",
271 | " x = layers.concatenate([skip_connection, x])\n",
272 | " \n",
273 | " # Layer 22\n",
274 | " x = layers.Conv2D(1024, (3, 3), strides=(1, 1), padding='same', name='conv_22',\n",
275 | " use_bias=False)(x)\n",
276 | " x = layers.BatchNormalization(name='norm_22')(x)\n",
277 | " x = layers.advanced_activations.LeakyReLU(alpha=ALPHA)(x)\n",
278 | " \n",
279 | " # Layer 23\n",
280 | " x = layers.Conv2D((4 + 1 + CLASS) * 5, (1,1), strides=(1,1), padding='same', name='conv_23')(x)\n",
281 | " output = layers.Reshape((GRID_H, GRID_W, BOX, 4 + 1 + CLASS))(x)\n",
282 | " \n",
283 | " # small hack to allow true_boxes to be registered when Keras build the model \n",
284 | " # for more information: https://github.com/fchollet/keras/issues/2790\n",
285 | " output = layers.Lambda(lambda args: args[0])([output, true_boxes])\n",
286 | " \n",
287 | " model = models.Model([input_image, true_boxes], output)\n",
288 | " \n",
289 | " \n",
290 | " return model\n"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 7,
296 | "metadata": {},
297 | "outputs": [
298 | {
299 | "name": "stdout",
300 | "output_type": "stream",
301 | "text": [
302 | "____________________________________________________________________________________________________\n",
303 | "Layer (type) Output Shape Param # Connected to \n",
304 | "====================================================================================================\n",
305 | "input_1 (InputLayer) (None, 416, 416, 3) 0 \n",
306 | "____________________________________________________________________________________________________\n",
307 | "conv_1 (Conv2D) (None, 416, 416, 32) 864 input_1[0][0] \n",
308 | "____________________________________________________________________________________________________\n",
309 | "norm_1 (BatchNormalization) (None, 416, 416, 32) 128 conv_1[0][0] \n",
310 | "____________________________________________________________________________________________________\n",
311 | "leaky_re_lu_1 (LeakyReLU) (None, 416, 416, 32) 0 norm_1[0][0] \n",
312 | "____________________________________________________________________________________________________\n",
313 | "max_pooling2d_1 (MaxPooling2D) (None, 208, 208, 32) 0 leaky_re_lu_1[0][0] \n",
314 | "____________________________________________________________________________________________________\n",
315 | "conv_2 (Conv2D) (None, 208, 208, 64) 18432 max_pooling2d_1[0][0] \n",
316 | "____________________________________________________________________________________________________\n",
317 | "norm_2 (BatchNormalization) (None, 208, 208, 64) 256 conv_2[0][0] \n",
318 | "____________________________________________________________________________________________________\n",
319 | "leaky_re_lu_2 (LeakyReLU) (None, 208, 208, 64) 0 norm_2[0][0] \n",
320 | "____________________________________________________________________________________________________\n",
321 | "max_pooling2d_2 (MaxPooling2D) (None, 104, 104, 64) 0 leaky_re_lu_2[0][0] \n",
322 | "____________________________________________________________________________________________________\n",
323 | "conv_3 (Conv2D) (None, 104, 104, 128) 73728 max_pooling2d_2[0][0] \n",
324 | "____________________________________________________________________________________________________\n",
325 | "norm_3 (BatchNormalization) (None, 104, 104, 128) 512 conv_3[0][0] \n",
326 | "____________________________________________________________________________________________________\n",
327 | "leaky_re_lu_3 (LeakyReLU) (None, 104, 104, 128) 0 norm_3[0][0] \n",
328 | "____________________________________________________________________________________________________\n",
329 | "conv_4 (Conv2D) (None, 104, 104, 64) 8192 leaky_re_lu_3[0][0] \n",
330 | "____________________________________________________________________________________________________\n",
331 | "norm_4 (BatchNormalization) (None, 104, 104, 64) 256 conv_4[0][0] \n",
332 | "____________________________________________________________________________________________________\n",
333 | "leaky_re_lu_4 (LeakyReLU) (None, 104, 104, 64) 0 norm_4[0][0] \n",
334 | "____________________________________________________________________________________________________\n",
335 | "conv_5 (Conv2D) (None, 104, 104, 128) 73728 leaky_re_lu_4[0][0] \n",
336 | "____________________________________________________________________________________________________\n",
337 | "norm_5 (BatchNormalization) (None, 104, 104, 128) 512 conv_5[0][0] \n",
338 | "____________________________________________________________________________________________________\n",
339 | "leaky_re_lu_5 (LeakyReLU) (None, 104, 104, 128) 0 norm_5[0][0] \n",
340 | "____________________________________________________________________________________________________\n",
341 | "max_pooling2d_3 (MaxPooling2D) (None, 52, 52, 128) 0 leaky_re_lu_5[0][0] \n",
342 | "____________________________________________________________________________________________________\n",
343 | "conv_6 (Conv2D) (None, 52, 52, 256) 294912 max_pooling2d_3[0][0] \n",
344 | "____________________________________________________________________________________________________\n",
345 | "norm_6 (BatchNormalization) (None, 52, 52, 256) 1024 conv_6[0][0] \n",
346 | "____________________________________________________________________________________________________\n",
347 | "leaky_re_lu_6 (LeakyReLU) (None, 52, 52, 256) 0 norm_6[0][0] \n",
348 | "____________________________________________________________________________________________________\n",
349 | "conv_7 (Conv2D) (None, 52, 52, 128) 32768 leaky_re_lu_6[0][0] \n",
350 | "____________________________________________________________________________________________________\n",
351 | "norm_7 (BatchNormalization) (None, 52, 52, 128) 512 conv_7[0][0] \n",
352 | "____________________________________________________________________________________________________\n",
353 | "leaky_re_lu_7 (LeakyReLU) (None, 52, 52, 128) 0 norm_7[0][0] \n",
354 | "____________________________________________________________________________________________________\n",
355 | "conv_8 (Conv2D) (None, 52, 52, 256) 294912 leaky_re_lu_7[0][0] \n",
356 | "____________________________________________________________________________________________________\n",
357 | "norm_8 (BatchNormalization) (None, 52, 52, 256) 1024 conv_8[0][0] \n",
358 | "____________________________________________________________________________________________________\n",
359 | "leaky_re_lu_8 (LeakyReLU) (None, 52, 52, 256) 0 norm_8[0][0] \n",
360 | "____________________________________________________________________________________________________\n",
361 | "max_pooling2d_4 (MaxPooling2D) (None, 26, 26, 256) 0 leaky_re_lu_8[0][0] \n",
362 | "____________________________________________________________________________________________________\n",
363 | "conv_9 (Conv2D) (None, 26, 26, 512) 1179648 max_pooling2d_4[0][0] \n",
364 | "____________________________________________________________________________________________________\n",
365 | "norm_9 (BatchNormalization) (None, 26, 26, 512) 2048 conv_9[0][0] \n",
366 | "____________________________________________________________________________________________________\n",
367 | "leaky_re_lu_9 (LeakyReLU) (None, 26, 26, 512) 0 norm_9[0][0] \n",
368 | "____________________________________________________________________________________________________\n",
369 | "conv_10 (Conv2D) (None, 26, 26, 256) 131072 leaky_re_lu_9[0][0] \n",
370 | "____________________________________________________________________________________________________\n",
371 | "norm_10 (BatchNormalization) (None, 26, 26, 256) 1024 conv_10[0][0] \n",
372 | "____________________________________________________________________________________________________\n",
373 | "leaky_re_lu_10 (LeakyReLU) (None, 26, 26, 256) 0 norm_10[0][0] \n",
374 | "____________________________________________________________________________________________________\n",
375 | "conv_11 (Conv2D) (None, 26, 26, 512) 1179648 leaky_re_lu_10[0][0] \n",
376 | "____________________________________________________________________________________________________\n",
377 | "norm_11 (BatchNormalization) (None, 26, 26, 512) 2048 conv_11[0][0] \n",
378 | "____________________________________________________________________________________________________\n",
379 | "leaky_re_lu_11 (LeakyReLU) (None, 26, 26, 512) 0 norm_11[0][0] \n",
380 | "____________________________________________________________________________________________________\n",
381 | "conv_12 (Conv2D) (None, 26, 26, 256) 131072 leaky_re_lu_11[0][0] \n",
382 | "____________________________________________________________________________________________________\n",
383 | "norm_12 (BatchNormalization) (None, 26, 26, 256) 1024 conv_12[0][0] \n",
384 | "____________________________________________________________________________________________________\n",
385 | "leaky_re_lu_12 (LeakyReLU) (None, 26, 26, 256) 0 norm_12[0][0] \n",
386 | "____________________________________________________________________________________________________\n",
387 | "conv_13 (Conv2D) (None, 26, 26, 512) 1179648 leaky_re_lu_12[0][0] \n",
388 | "____________________________________________________________________________________________________\n",
389 | "norm_13 (BatchNormalization) (None, 26, 26, 512) 2048 conv_13[0][0] \n",
390 | "____________________________________________________________________________________________________\n",
391 | "leaky_re_lu_13 (LeakyReLU) (None, 26, 26, 512) 0 norm_13[0][0] \n",
392 | "____________________________________________________________________________________________________\n",
393 | "max_pooling2d_5 (MaxPooling2D) (None, 13, 13, 512) 0 leaky_re_lu_13[0][0] \n",
394 | "____________________________________________________________________________________________________\n",
395 | "conv_14 (Conv2D) (None, 13, 13, 1024) 4718592 max_pooling2d_5[0][0] \n",
396 | "____________________________________________________________________________________________________\n",
397 | "norm_14 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_14[0][0] \n",
398 | "____________________________________________________________________________________________________\n",
399 | "leaky_re_lu_14 (LeakyReLU) (None, 13, 13, 1024) 0 norm_14[0][0] \n",
400 | "____________________________________________________________________________________________________\n",
401 | "conv_15 (Conv2D) (None, 13, 13, 512) 524288 leaky_re_lu_14[0][0] \n",
402 | "____________________________________________________________________________________________________\n",
403 | "norm_15 (BatchNormalization) (None, 13, 13, 512) 2048 conv_15[0][0] \n",
404 | "____________________________________________________________________________________________________\n",
405 | "leaky_re_lu_15 (LeakyReLU) (None, 13, 13, 512) 0 norm_15[0][0] \n",
406 | "____________________________________________________________________________________________________\n",
407 | "conv_16 (Conv2D) (None, 13, 13, 1024) 4718592 leaky_re_lu_15[0][0] \n",
408 | "____________________________________________________________________________________________________\n",
409 | "norm_16 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_16[0][0] \n",
410 | "____________________________________________________________________________________________________\n",
411 | "leaky_re_lu_16 (LeakyReLU) (None, 13, 13, 1024) 0 norm_16[0][0] \n",
412 | "____________________________________________________________________________________________________\n",
413 | "conv_17 (Conv2D) (None, 13, 13, 512) 524288 leaky_re_lu_16[0][0] \n",
414 | "____________________________________________________________________________________________________\n",
415 | "norm_17 (BatchNormalization) (None, 13, 13, 512) 2048 conv_17[0][0] \n",
416 | "____________________________________________________________________________________________________\n",
417 | "leaky_re_lu_17 (LeakyReLU) (None, 13, 13, 512) 0 norm_17[0][0] \n",
418 | "____________________________________________________________________________________________________\n",
419 | "conv_18 (Conv2D) (None, 13, 13, 1024) 4718592 leaky_re_lu_17[0][0] \n",
420 | "____________________________________________________________________________________________________\n",
421 | "norm_18 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_18[0][0] \n",
422 | "____________________________________________________________________________________________________\n",
423 | "leaky_re_lu_18 (LeakyReLU) (None, 13, 13, 1024) 0 norm_18[0][0] \n",
424 | "____________________________________________________________________________________________________\n",
425 | "conv_19 (Conv2D) (None, 13, 13, 1024) 9437184 leaky_re_lu_18[0][0] \n",
426 | "____________________________________________________________________________________________________\n",
427 | "norm_19 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_19[0][0] \n",
428 | "____________________________________________________________________________________________________\n",
429 | "conv_21 (Conv2D) (None, 26, 26, 64) 32768 leaky_re_lu_13[0][0] \n",
430 | "____________________________________________________________________________________________________\n",
431 | "leaky_re_lu_19 (LeakyReLU) (None, 13, 13, 1024) 0 norm_19[0][0] \n",
432 | "____________________________________________________________________________________________________\n",
433 | "norm_21 (BatchNormalization) (None, 26, 26, 64) 256 conv_21[0][0] \n",
434 | "____________________________________________________________________________________________________\n",
435 | "conv_20 (Conv2D) (None, 13, 13, 1024) 9437184 leaky_re_lu_19[0][0] \n",
436 | "____________________________________________________________________________________________________\n",
437 | "leaky_re_lu_21 (LeakyReLU) (None, 26, 26, 64) 0 norm_21[0][0] \n",
438 | "____________________________________________________________________________________________________\n",
439 | "norm_20 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_20[0][0] \n",
440 | "____________________________________________________________________________________________________\n",
441 | "lambda_1 (Lambda) (None, 13, 13, 256) 0 leaky_re_lu_21[0][0] \n",
442 | "____________________________________________________________________________________________________\n",
443 | "leaky_re_lu_20 (LeakyReLU) (None, 13, 13, 1024) 0 norm_20[0][0] \n",
444 | "____________________________________________________________________________________________________\n",
445 | "concatenate_1 (Concatenate) (None, 13, 13, 1280) 0 lambda_1[0][0] \n",
446 | " leaky_re_lu_20[0][0] \n",
447 | "____________________________________________________________________________________________________\n",
448 | "conv_22 (Conv2D) (None, 13, 13, 1024) 11796480 concatenate_1[0][0] \n",
449 | "____________________________________________________________________________________________________\n",
450 | "norm_22 (BatchNormalization) (None, 13, 13, 1024) 4096 conv_22[0][0] \n",
451 | "____________________________________________________________________________________________________\n",
452 | "leaky_re_lu_22 (LeakyReLU) (None, 13, 13, 1024) 0 norm_22[0][0] \n",
453 | "____________________________________________________________________________________________________\n",
454 | "conv_23 (Conv2D) (None, 13, 13, 425) 435625 leaky_re_lu_22[0][0] \n",
455 | "____________________________________________________________________________________________________\n",
456 | "reshape_1 (Reshape) (None, 13, 13, 5, 85) 0 conv_23[0][0] \n",
457 | "____________________________________________________________________________________________________\n",
458 | "input_2 (InputLayer) (None, 1, 1, 1, 50, 4 0 \n",
459 | "____________________________________________________________________________________________________\n",
460 | "lambda_2 (Lambda) (None, 13, 13, 5, 85) 0 reshape_1[0][0] \n",
461 | " input_2[0][0] \n",
462 | "====================================================================================================\n",
463 | "Total params: 50,983,561\n",
464 | "Trainable params: 50,962,889\n",
465 | "Non-trainable params: 20,672\n",
466 | "____________________________________________________________________________________________________\n"
467 | ]
468 | }
469 | ],
470 | "source": [
471 | "model = yolo()\n",
472 | "model.summary()"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": 8,
478 | "metadata": {
479 | "collapsed": true
480 | },
481 | "outputs": [],
482 | "source": [
483 | "plot_model(model, to_file='model.png')"
484 | ]
485 | },
486 | {
487 | "cell_type": "markdown",
488 | "metadata": {},
489 | "source": [
490 | "Total params: 50,983,561\n",
491 | "Trainable params: 50,962,889\n",
492 | "Non-trainable params: 20,672"
493 | ]
494 | },
495 | {
496 | "cell_type": "markdown",
497 | "metadata": {},
498 | "source": [
499 | "## Load Pretrained weights\n",
500 | "\n",
501 | "Load the weights originally provided by YOLO"
502 | ]
503 | },
504 | {
505 | "cell_type": "code",
506 | "execution_count": 9,
507 | "metadata": {
508 | "collapsed": true
509 | },
510 | "outputs": [],
511 | "source": [
512 | "weight_reader = WeightReader(pre_trained_weights)"
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": 10,
518 | "metadata": {
519 | "collapsed": true
520 | },
521 | "outputs": [],
522 | "source": [
523 | "weight_reader.reset()\n",
524 | "nb_conv = 23"
525 | ]
526 | },
527 | {
528 | "cell_type": "code",
529 | "execution_count": 11,
530 | "metadata": {
531 | "collapsed": true
532 | },
533 | "outputs": [],
534 | "source": [
535 | "for i in range(1, nb_conv+1):\n",
536 | " conv_layer = model.get_layer('conv_' + str(i))\n",
537 | " \n",
538 | " if i < nb_conv:\n",
539 | " norm_layer = model.get_layer('norm_' + str(i))\n",
540 | " \n",
541 | " size = np.prod(norm_layer.get_weights()[0].shape)\n",
542 | " \n",
543 | " beta = weight_reader.read_bytes(size)\n",
544 | " gamma = weight_reader.read_bytes(size)\n",
545 | " mean = weight_reader.read_bytes(size)\n",
546 | " var = weight_reader.read_bytes(size)\n",
547 | " \n",
548 | " weights = norm_layer.set_weights([gamma, beta, mean, var])\n",
549 | " \n",
550 | " if len(conv_layer.get_weights()) > 1:\n",
551 | " bias = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[1].shape))\n",
552 | " kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))\n",
553 | " kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))\n",
554 | " kernel = kernel.transpose([2,3,1,0])\n",
555 | " conv_layer.set_weights([kernel, bias])\n",
556 | " \n",
557 | " else:\n",
558 | " kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))\n",
559 | " kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))\n",
560 | " kernel = kernel.transpose([2,3,1,0])\n",
561 | " conv_layer.set_weights([kernel])"
562 | ]
563 | },
564 | {
565 | "cell_type": "markdown",
566 | "metadata": {
567 | "collapsed": true
568 | },
569 | "source": [
570 | "## Randomize weights of the last layer"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": 12,
576 | "metadata": {
577 | "collapsed": true
578 | },
579 | "outputs": [],
580 | "source": [
581 | "# Get last convolutional layer\n",
582 | "layer = model.layers[-4] \n",
583 | "weights = layer.get_weights()\n",
584 | "\n",
585 | "new_kernel = np.random.normal(size=weights[0].shape) / (GRID_H*GRID_W)\n",
586 | "new_bias = np.random.normal(size=weights[1].shape) / (GRID_H*GRID_W)\n",
587 | "\n",
588 | "layer.set_weights([new_kernel, new_bias])"
589 | ]
590 | },
591 | {
592 | "cell_type": "markdown",
593 | "metadata": {},
594 | "source": [
595 | "## Training"
596 | ]
597 | },
598 | {
599 | "cell_type": "markdown",
600 | "metadata": {},
601 | "source": [
602 | "### Loss Function\n",
603 | "\n",
604 | "\n",
605 | "\n",
606 | "\n"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": 54,
612 | "metadata": {
613 | "collapsed": true
614 | },
615 | "outputs": [],
616 | "source": [
617 | "\n",
618 | "\n",
619 | "def custom_loss(y_true, y_pred):\n",
620 | " mask_shape = tf.shape(y_true)[:4]\n",
621 | " \n",
622 | " cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(GRID_W), [GRID_H]), (1, GRID_H, GRID_W, 1, 1)))\n",
623 | " cell_y = tf.transpose(cell_x, (0,2,1,3,4))\n",
624 | "\n",
625 | " cell_grid = tf.tile(tf.concat([cell_x,cell_y], -1), [BATCH_SIZE, 1, 1, 5, 1])\n",
626 | " \n",
627 | " coord_mask = tf.zeros(mask_shape)\n",
628 | " conf_mask = tf.zeros(mask_shape)\n",
629 | " class_mask = tf.zeros(mask_shape)\n",
630 | " \n",
631 | " seen = tf.Variable(0.)\n",
632 | " \n",
633 | " total_AP = tf.Variable(0.)\n",
634 | " \n",
635 | " \"\"\"\n",
636 | " Adjust prediction\n",
637 | " \"\"\"\n",
638 | " ### adjust x and y \n",
639 | " pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid\n",
640 | " \n",
641 | " ### adjust w and h\n",
642 | " pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(ANCHORS, [1,1,1,BOX,2])\n",
643 | " \n",
644 | " ### adjust confidence\n",
645 | " pred_box_conf = tf.sigmoid(y_pred[..., 4])\n",
646 | " \n",
647 | " ### adjust class probabilities\n",
648 | " pred_box_class = y_pred[..., 5:]\n",
649 | " \n",
650 | " \"\"\"\n",
651 | " Adjust ground truth\n",
652 | " \"\"\"\n",
653 | " ### adjust x and y\n",
654 | " true_box_xy = y_true[..., 0:2] # relative position to the containing cell\n",
655 | " \n",
656 | " ### adjust w and h\n",
657 | " true_box_wh = y_true[..., 2:4] # number of cells accross, horizontally and vertically\n",
658 | " \n",
659 | " ### adjust confidence\n",
660 | " true_wh_half = true_box_wh / 2.\n",
661 | " true_mins = true_box_xy - true_wh_half\n",
662 | " true_maxes = true_box_xy + true_wh_half\n",
663 | " \n",
664 | " pred_wh_half = pred_box_wh / 2.\n",
665 | " pred_mins = pred_box_xy - pred_wh_half\n",
666 | " pred_maxes = pred_box_xy + pred_wh_half \n",
667 | " \n",
668 | " intersect_mins = tf.maximum(pred_mins, true_mins)\n",
669 | " intersect_maxes = tf.minimum(pred_maxes, true_maxes)\n",
670 | " intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)\n",
671 | " intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]\n",
672 | " \n",
673 | " true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]\n",
674 | " pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]\n",
675 | "\n",
676 | " union_areas = pred_areas + true_areas - intersect_areas\n",
677 | " iou_scores = tf.truediv(intersect_areas, union_areas)\n",
678 | " \n",
679 | " true_box_conf = iou_scores * y_true[..., 4]\n",
680 | " \n",
681 | " ### adjust class probabilities\n",
682 | " true_box_class = tf.to_int32(y_true[..., 5])\n",
683 | " \n",
684 | " \"\"\"\n",
685 | " Determine the masks\n",
686 | " \"\"\"\n",
687 | " ### coordinate mask: simply the position of the ground truth boxes (the predictors)\n",
688 | " coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * COORD_SCALE\n",
689 | " \n",
690 | " ### confidence mask: penelize predictors + penalize boxes with low IOU\n",
691 | " # penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6\n",
692 | " true_xy = true_boxes[..., 0:2]\n",
693 | " true_wh = true_boxes[..., 2:4]\n",
694 | " \n",
695 | " true_wh_half = true_wh / 2.\n",
696 | " true_mins = true_xy - true_wh_half\n",
697 | " true_maxes = true_xy + true_wh_half\n",
698 | " \n",
699 | " pred_xy = tf.expand_dims(pred_box_xy, 4)\n",
700 | " pred_wh = tf.expand_dims(pred_box_wh, 4)\n",
701 | " \n",
702 | " pred_wh_half = pred_wh / 2.\n",
703 | " pred_mins = pred_xy - pred_wh_half\n",
704 | " pred_maxes = pred_xy + pred_wh_half \n",
705 | " \n",
706 | " intersect_mins = tf.maximum(pred_mins, true_mins)\n",
707 | " intersect_maxes = tf.minimum(pred_maxes, true_maxes)\n",
708 | " intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)\n",
709 | " intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]\n",
710 | " \n",
711 | " true_areas = true_wh[..., 0] * true_wh[..., 1]\n",
712 | " pred_areas = pred_wh[..., 0] * pred_wh[..., 1]\n",
713 | "\n",
714 | " union_areas = pred_areas + true_areas - intersect_areas\n",
715 | " iou_scores = tf.truediv(intersect_areas, union_areas)\n",
716 | "\n",
717 | " best_ious = tf.reduce_max(iou_scores, axis=4)\n",
718 | " conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * NO_OBJECT_SCALE\n",
719 | " \n",
720 | " # penalize the confidence of the boxes, which are reponsible for corresponding ground truth box\n",
721 | " conf_mask = conf_mask + y_true[..., 4] * OBJECT_SCALE\n",
722 | " \n",
723 | " ### class mask: simply the position of the ground truth boxes (the predictors)\n",
724 | " class_mask = y_true[..., 4] * tf.gather(CLASS_WEIGHTS, true_box_class) * CLASS_SCALE \n",
725 | " \n",
726 | " \"\"\"\n",
727 | " Warm-up training\n",
728 | " \"\"\"\n",
729 | " no_boxes_mask = tf.to_float(coord_mask < COORD_SCALE/2.)\n",
730 | " seen = tf.assign_add(seen, 1.)\n",
731 | " \n",
732 | " true_box_xy, true_box_wh, coord_mask = tf.cond(tf.less(seen, WARM_UP_BATCHES), \n",
733 | " lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask, \n",
734 | " true_box_wh + tf.ones_like(true_box_wh) * np.reshape(ANCHORS, [1,1,1,BOX,2]) * no_boxes_mask, \n",
735 | " tf.ones_like(coord_mask)],\n",
736 | " lambda: [true_box_xy, \n",
737 | " true_box_wh,\n",
738 | " coord_mask])\n",
739 | " \n",
740 | " \"\"\"\n",
741 | " Finalize the loss\n",
742 | " \"\"\"\n",
743 | " nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))\n",
744 | " nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))\n",
745 | " nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))\n",
746 | " \n",
747 | " loss_xy = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2.\n",
748 | " loss_wh = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2.\n",
749 | " loss_conf = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2.\n",
750 | " loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)\n",
751 | " loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)\n",
752 | " \n",
753 | " loss = loss_xy + loss_wh + loss_conf + loss_class\n",
754 | " \n",
755 | " nb_true_box = tf.reduce_sum(y_true[..., 4])\n",
756 | " nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > OBJ_THRESHOLD))\n",
757 | " \n",
758 | " total_AP = tf.assign_add(total_AP, nb_pred_box/nb_true_box) \n",
759 | " \n",
760 | " loss = tf.Print(loss, [loss_xy, loss_wh, loss_conf, loss_class, loss, total_AP/seen], message='DEBUG', summarize=1000)\n",
761 | " \n",
762 | " return loss\n",
763 | "\n"
764 | ]
765 | },
766 | {
767 | "cell_type": "markdown",
768 | "metadata": {},
769 | "source": [
770 | "### Parse the annotations to construct train generator and validation generator"
771 | ]
772 | },
773 | {
774 | "cell_type": "code",
775 | "execution_count": 14,
776 | "metadata": {
777 | "collapsed": true
778 | },
779 | "outputs": [],
780 | "source": [
781 | "generator_config = {\n",
782 | " 'IMAGE_H' : IMAGE_H, \n",
783 | " 'IMAGE_W' : IMAGE_W,\n",
784 | " 'GRID_H' : GRID_H, \n",
785 | " 'GRID_W' : GRID_W,\n",
786 | " 'BOX' : BOX,\n",
787 | " 'LABELS' : LABELS,\n",
788 | " 'CLASS' : len(LABELS),\n",
789 | " 'ANCHORS' : ANCHORS,\n",
790 | " 'BATCH_SIZE' : BATCH_SIZE,\n",
791 | " 'TRUE_BOX_BUFFER' : 50,\n",
792 | "}\n",
793 | "\n"
794 | ]
795 | },
796 | {
797 | "cell_type": "code",
798 | "execution_count": 16,
799 | "metadata": {},
800 | "outputs": [
801 | {
802 | "name": "stdout",
803 | "output_type": "stream",
804 | "text": [
805 | "CPU times: user 26.6 s, sys: 5.42 s, total: 32 s\n",
806 | "Wall time: 11min 35s\n"
807 | ]
808 | }
809 | ],
810 | "source": [
811 | "%%time\n",
812 | "train_imgs, seen_train_labels = parse_annotation(train_annot_folder, train_image_folder, labels=LABELS)"
813 | ]
814 | },
815 | {
816 | "cell_type": "code",
817 | "execution_count": 39,
818 | "metadata": {
819 | "collapsed": true
820 | },
821 | "outputs": [],
822 | "source": [
823 | "import os\n",
824 | "import cv2\n",
825 | "import copy\n",
826 | "import numpy as np\n",
827 | "import imgaug as ia\n",
828 | "from imgaug import augmenters as iaa\n",
829 | "import xml.etree.ElementTree as ET\n",
830 | "from utils import BoundBox, normalize, bbox_iou"
831 | ]
832 | },
833 | {
834 | "cell_type": "code",
835 | "execution_count": 43,
836 | "metadata": {
837 | "collapsed": true
838 | },
839 | "outputs": [],
840 | "source": [
841 | "class BatchGenerator:\n",
842 | " def __init__(self, images, \n",
843 | " config, \n",
844 | " shuffle=True, \n",
845 | " jitter=True, \n",
846 | " norm=True):\n",
847 | "\n",
848 | " self.images = images\n",
849 | " self.config = config\n",
850 | "\n",
851 | " self.shuffle = shuffle\n",
852 | " self.jitter = jitter\n",
853 | " self.norm = norm\n",
854 | " \n",
855 | "\n",
856 | " self.anchors = [BoundBox(0, 0, config['ANCHORS'][2*i], config['ANCHORS'][2*i+1]) for i in range(int(len(config['ANCHORS'])/2))]\n",
857 | "\n",
858 | " ### augmentors by https://github.com/aleju/imgaug\n",
859 | " sometimes = lambda aug: iaa.Sometimes(0.5, aug)\n",
860 | "\n",
861 | " # Define our sequence of augmentation steps that will be applied to every image\n",
862 | " # All augmenters with per_channel=0.5 will sample one value _per image_\n",
863 | " # in 50% of all cases. In all other cases they will sample new values\n",
864 | " # _per channel_.\n",
865 | " self.aug_pipe = iaa.Sequential(\n",
866 | " [\n",
867 | " # apply the following augmenters to most images\n",
868 | " #iaa.Fliplr(0.5), # horizontally flip 50% of all images\n",
869 | " #iaa.Flipud(0.2), # vertically flip 20% of all images\n",
870 | " #sometimes(iaa.Crop(percent=(0, 0.1))), # crop images by 0-10% of their height/width\n",
871 | " sometimes(iaa.Affine(\n",
872 | " #scale={\"x\": (0.8, 1.2), \"y\": (0.8, 1.2)}, # scale images to 80-120% of their size, individually per axis\n",
873 | " #translate_percent={\"x\": (-0.2, 0.2), \"y\": (-0.2, 0.2)}, # translate by -20 to +20 percent (per axis)\n",
874 | " #rotate=(-5, 5), # rotate by -45 to +45 degrees\n",
875 | " #shear=(-5, 5), # shear by -16 to +16 degrees\n",
876 | " #order=[0, 1], # use nearest neighbour or bilinear interpolation (fast)\n",
877 | " #cval=(0, 255), # if mode is constant, use a cval between 0 and 255\n",
878 | " #mode=ia.ALL # use any of scikit-image's warping modes (see 2nd image from the top for examples)\n",
879 | " )),\n",
880 | " # execute 0 to 5 of the following (less important) augmenters per image\n",
881 | " # don't execute all of them, as that would often be way too strong\n",
882 | " iaa.SomeOf((0, 5),\n",
883 | " [\n",
884 | " #sometimes(iaa.Superpixels(p_replace=(0, 1.0), n_segments=(20, 200))), # convert images into their superpixel representation\n",
885 | " iaa.OneOf([\n",
886 | " iaa.GaussianBlur((0, 3.0)), # blur images with a sigma between 0 and 3.0\n",
887 | " iaa.AverageBlur(k=(2, 7)), # blur image using local means with kernel sizes between 2 and 7\n",
888 | " iaa.MedianBlur(k=(3, 11)), # blur image using local medians with kernel sizes between 2 and 7\n",
889 | " ]),\n",
890 | " iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)), # sharpen images\n",
891 | " #iaa.Emboss(alpha=(0, 1.0), strength=(0, 2.0)), # emboss images\n",
892 | " # search either for all edges or for directed edges\n",
893 | " #sometimes(iaa.OneOf([\n",
894 | " # iaa.EdgeDetect(alpha=(0, 0.7)),\n",
895 | " # iaa.DirectedEdgeDetect(alpha=(0, 0.7), direction=(0.0, 1.0)),\n",
896 | " #])),\n",
897 | " iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5), # add gaussian noise to images\n",
898 | " iaa.OneOf([\n",
899 | " iaa.Dropout((0.01, 0.1), per_channel=0.5), # randomly remove up to 10% of the pixels\n",
900 | " #iaa.CoarseDropout((0.03, 0.15), size_percent=(0.02, 0.05), per_channel=0.2),\n",
901 | " ]),\n",
902 | " #iaa.Invert(0.05, per_channel=True), # invert color channels\n",
903 | " iaa.Add((-10, 10), per_channel=0.5), # change brightness of images (by -10 to 10 of original value)\n",
904 | " iaa.Multiply((0.5, 1.5), per_channel=0.5), # change brightness of images (50-150% of original value)\n",
905 | " iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5), # improve or worsen the contrast\n",
906 | " #iaa.Grayscale(alpha=(0.0, 1.0)),\n",
907 | " #sometimes(iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25)), # move pixels locally around (with random strengths)\n",
908 | " #sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05))) # sometimes move parts of the image around\n",
909 | " ],\n",
910 | " random_order=True\n",
911 | " )\n",
912 | " ],\n",
913 | " random_order=True\n",
914 | " )\n",
915 | "\n",
916 | " if shuffle: np.random.shuffle(self.images)\n",
917 | "\n",
918 | " def get_generator(self):\n",
919 | " num_img = len(self.images)\n",
920 | " \n",
921 | " total_count = 0\n",
922 | " batch_count = 0\n",
923 | " \n",
924 | " x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'], self.config['IMAGE_W'], 3)) # input images\n",
925 | " b_batch = np.zeros((self.config['BATCH_SIZE'], 1 , 1 , 1 , self.config['TRUE_BOX_BUFFER'], 4)) # list of self.config['TRUE_self.config['BOX']_BUFFER'] GT boxes\n",
926 | " y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 4+1+1)) # desired network output\n",
927 | " \n",
928 | " while True:\n",
929 | " if total_count < num_img:\n",
930 | " train_instance = self.images[total_count]\n",
931 | "\n",
932 | " # augment input image and fix object's position and size\n",
933 | " img, all_objs = self.aug_image(train_instance, jitter=self.jitter)\n",
934 | " \n",
935 | " # construct output from object's x, y, w, h\n",
936 | " true_box_index = 0\n",
937 | " \n",
938 | " for obj in all_objs:\n",
939 | " if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in self.config['LABELS']:\n",
940 | " center_x = .5*(obj['xmin'] + obj['xmax'])\n",
941 | " center_x = center_x / (float(self.config['IMAGE_W']) / self.config['GRID_W'])\n",
942 | " center_y = .5*(obj['ymin'] + obj['ymax'])\n",
943 | " center_y = center_y / (float(self.config['IMAGE_H']) / self.config['GRID_H'])\n",
944 | "\n",
945 | " grid_x = int(np.floor(center_x))\n",
946 | " grid_y = int(np.floor(center_y))\n",
947 | "\n",
948 | " if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:\n",
949 | " obj_indx = self.config['LABELS'].index(obj['name'])\n",
950 | " \n",
951 | " center_w = (obj['xmax'] - obj['xmin']) / (float(self.config['IMAGE_W']) / self.config['GRID_W']) # unit: grid cell\n",
952 | " center_h = (obj['ymax'] - obj['ymin']) / (float(self.config['IMAGE_W']) / self.config['GRID_W']) # unit: grid cell\n",
953 | " \n",
954 | " box = [center_x, center_y, center_w, center_h]\n",
955 | "\n",
956 | " # find the anchor that best predicts this box\n",
957 | " best_anchor = -1\n",
958 | " max_iou = -1\n",
959 | " \n",
960 | " shifted_box = BoundBox(0, \n",
961 | " 0, \n",
962 | " center_w, \n",
963 | " center_h)\n",
964 | " \n",
965 | " for i in range(len(self.anchors)):\n",
966 | " anchor = self.anchors[i]\n",
967 | " iou = bbox_iou(shifted_box, anchor)\n",
968 | " \n",
969 | " if max_iou < iou:\n",
970 | " best_anchor = i\n",
971 | " max_iou = iou\n",
972 | " \n",
973 | " # assign ground truth x, y, w, h, confidence and class probs to y_batch\n",
974 | " y_batch[batch_count, grid_y, grid_x, best_anchor, 0:4] = box\n",
975 | " y_batch[batch_count, grid_y, grid_x, best_anchor, 4 ] = 1.\n",
976 | " y_batch[batch_count, grid_y, grid_x, best_anchor, 5 ] = obj_indx\n",
977 | " \n",
978 | " # assign the true box to b_batch\n",
979 | " b_batch[batch_count, 0, 0, 0, true_box_index] = box\n",
980 | " \n",
981 | " true_box_index += 1\n",
982 | " true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']\n",
983 | " \n",
984 | " # assign input image to x_batch\n",
985 | " if self.norm: \n",
986 | " x_batch[batch_count] = normalize(img)\n",
987 | " else:\n",
988 | " # plot image and bounding boxes for sanity check\n",
989 | " for obj in all_objs:\n",
990 | " if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:\n",
991 | " cv2.rectangle(img[:,:,::-1], (obj['xmin'],obj['ymin']), (obj['xmax'],obj['ymax']), (255,0,0), 3)\n",
992 | " cv2.putText(img[:,:,::-1], obj['name'], \n",
993 | " (obj['xmin']+2, obj['ymin']+12), \n",
994 | " 0, 1.2e-3 * img.shape[0], \n",
995 | " (0,255,0), 2)\n",
996 | " \n",
997 | " x_batch[batch_count] = img\n",
998 | "\n",
999 | " # increase instance counter in current batch\n",
1000 | " batch_count += 1 \n",
1001 | " \n",
1002 | " total_count += 1\n",
1003 | " if total_count >= num_img:\n",
1004 | " total_count = 0\n",
1005 | " if self.shuffle: np.random.shuffle(self.images) \n",
1006 | "\n",
1007 | " if batch_count >= self.config['BATCH_SIZE']:\n",
1008 | " yield [x_batch, b_batch], y_batch\n",
1009 | " \n",
1010 | " x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'], self.config['IMAGE_W'], 3))\n",
1011 | " y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 5+self.config['CLASS'])) \n",
1012 | " \n",
1013 | " batch_count = 0\n",
1014 | "\n",
1015 | " def aug_image(self, train_instance, jitter):\n",
1016 | " image_name = train_instance['filename']\n",
1017 | " image = cv2.imread(image_name)\n",
1018 | " h, w, c = image.shape\n",
1019 | " \n",
1020 | " all_objs = copy.deepcopy(train_instance['object'])\n",
1021 | "\n",
1022 | " if jitter:\n",
1023 | " ### scale the image\n",
1024 | " scale = np.random.uniform() / 10. + 1.\n",
1025 | " image = cv2.resize(image, (0,0), fx = scale, fy = scale)\n",
1026 | "\n",
1027 | " ### translate the image\n",
1028 | " max_offx = (scale-1.) * w\n",
1029 | " max_offy = (scale-1.) * h\n",
1030 | " offx = int(np.random.uniform() * max_offx)\n",
1031 | " offy = int(np.random.uniform() * max_offy)\n",
1032 | " \n",
1033 | " image = image[offy : (offy + h), offx : (offx + w)]\n",
1034 | "\n",
1035 | " ### flip the image\n",
1036 | " flip = np.random.binomial(1, .5)\n",
1037 | " if flip > 0.5: image = cv2.flip(image, 1)\n",
1038 | " \n",
1039 | " image = self.aug_pipe.augment_image(image) \n",
1040 | " \n",
1041 | " # resize the image to standard size\n",
1042 | " image = cv2.resize(image, (self.config['IMAGE_H'], self.config['IMAGE_W']))\n",
1043 | " image = image[:,:,::-1]\n",
1044 | "\n",
1045 | " # fix object's position and size\n",
1046 | " for obj in all_objs:\n",
1047 | " for attr in ['xmin', 'xmax']:\n",
1048 | " if jitter: obj[attr] = int(obj[attr] * scale - offx)\n",
1049 | " \n",
1050 | " obj[attr] = int(obj[attr] * float(self.config['IMAGE_W']) / w)\n",
1051 | " obj[attr] = max(min(obj[attr], self.config['IMAGE_W']), 0)\n",
1052 | " \n",
1053 | " for attr in ['ymin', 'ymax']:\n",
1054 | " if jitter: obj[attr] = int(obj[attr] * scale - offy)\n",
1055 | " \n",
1056 | " obj[attr] = int(obj[attr] * float(self.config['IMAGE_H']) / h)\n",
1057 | " obj[attr] = max(min(obj[attr], self.config['IMAGE_H']), 0)\n",
1058 | "\n",
1059 | " if jitter and flip > 0.5:\n",
1060 | " xmin = obj['xmin']\n",
1061 | " obj['xmin'] = self.config['IMAGE_W'] - obj['xmax']\n",
1062 | " obj['xmax'] = self.config['IMAGE_W'] - xmin\n",
1063 | " \n",
1064 | " return image, all_objs\n",
1065 | "\n",
1066 | " def get_dateset_size(self):\n",
1067 | " return int(np.ceil(float(len(self.images))/self.config['BATCH_SIZE']))"
1068 | ]
1069 | },
1070 | {
1071 | "cell_type": "code",
1072 | "execution_count": 44,
1073 | "metadata": {},
1074 | "outputs": [
1075 | {
1076 | "name": "stdout",
1077 | "output_type": "stream",
1078 | "text": [
1079 | "CPU times: user 8 ms, sys: 0 ns, total: 8 ms\n",
1080 | "Wall time: 11.5 ms\n"
1081 | ]
1082 | }
1083 | ],
1084 | "source": [
1085 | "%%time\n",
1086 | "train_batch = BatchGenerator(train_imgs, generator_config)"
1087 | ]
1088 | },
1089 | {
1090 | "cell_type": "code",
1091 | "execution_count": 22,
1092 | "metadata": {},
1093 | "outputs": [
1094 | {
1095 | "name": "stdout",
1096 | "output_type": "stream",
1097 | "text": [
1098 | "CPU times: user 12.4 s, sys: 2.45 s, total: 14.8 s\n",
1099 | "Wall time: 4min 50s\n"
1100 | ]
1101 | }
1102 | ],
1103 | "source": [
1104 | "%%time\n",
1105 | "val_imgs, seen_val_labels = parse_annotation(val_annot_folder, val_image_folder, labels=LABELS)"
1106 | ]
1107 | },
1108 | {
1109 | "cell_type": "code",
1110 | "execution_count": 45,
1111 | "metadata": {},
1112 | "outputs": [
1113 | {
1114 | "name": "stdout",
1115 | "output_type": "stream",
1116 | "text": [
1117 | "CPU times: user 8 ms, sys: 0 ns, total: 8 ms\n",
1118 | "Wall time: 5.87 ms\n"
1119 | ]
1120 | }
1121 | ],
1122 | "source": [
1123 | "%%time\n",
1124 | "valid_batch = BatchGenerator(val_imgs, generator_config, jitter=False)"
1125 | ]
1126 | },
1127 | {
1128 | "cell_type": "markdown",
1129 | "metadata": {},
1130 | "source": [
1131 | "## Setup a few callbacks and start the training"
1132 | ]
1133 | },
1134 | {
1135 | "cell_type": "code",
1136 | "execution_count": 46,
1137 | "metadata": {
1138 | "collapsed": true
1139 | },
1140 | "outputs": [],
1141 | "source": [
1142 | "early_stop = callbacks.EarlyStopping(monitor='val_loss', \n",
1143 | " min_delta=0.001, \n",
1144 | " patience=3, \n",
1145 | " mode='min', \n",
1146 | " verbose=1)\n",
1147 | "\n",
1148 | "checkpoint = callbacks.ModelCheckpoint('weights_coco.h5', \n",
1149 | " monitor='val_loss', \n",
1150 | " verbose=1, \n",
1151 | " save_best_only=True, \n",
1152 | " mode='min', \n",
1153 | " period=1)"
1154 | ]
1155 | },
1156 | {
1157 | "cell_type": "code",
1158 | "execution_count": 47,
1159 | "metadata": {},
1160 | "outputs": [
1161 | {
1162 | "ename": "OSError",
1163 | "evalue": "Unable to open file (unable to open file: name = 'weights_coco.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)",
1164 | "output_type": "error",
1165 | "traceback": [
1166 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
1167 | "\u001b[0;31mOSError\u001b[0m Traceback (most recent call last)",
1168 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload_weights\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'weights_coco.h5'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
1169 | "\u001b[0;32m~/anaconda3/envs/dl/lib/python3.6/site-packages/keras/engine/topology.py\u001b[0m in \u001b[0;36mload_weights\u001b[0;34m(self, filepath, by_name)\u001b[0m\n\u001b[1;32m 2564\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mh5py\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2565\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mImportError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'`load_weights` requires h5py.'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2566\u001b[0;31m \u001b[0mf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh5py\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mFile\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilepath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'r'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2567\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;34m'layer_names'\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mattrs\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;34m'model_weights'\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2568\u001b[0m \u001b[0mf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'model_weights'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1170 | "\u001b[0;32m~/anaconda3/envs/dl/lib/python3.6/site-packages/h5py/_hl/files.py\u001b[0m in \u001b[0;36m__init__\u001b[0;34m(self, name, mode, driver, libver, userblock_size, swmr, **kwds)\u001b[0m\n\u001b[1;32m 267\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mphil\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 268\u001b[0m \u001b[0mfapl\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_fapl\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdriver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlibver\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 269\u001b[0;31m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmake_fid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muserblock_size\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mswmr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mswmr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 270\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 271\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mswmr_support\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1171 | "\u001b[0;32m~/anaconda3/envs/dl/lib/python3.6/site-packages/h5py/_hl/files.py\u001b[0m in \u001b[0;36mmake_fid\u001b[0;34m(name, mode, userblock_size, fapl, fcpl, swmr)\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mswmr\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mswmr_support\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0mflags\u001b[0m \u001b[0;34m|=\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mACC_SWMR_READ\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 99\u001b[0;31m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfapl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 100\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mmode\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m'r+'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[0mfid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mh5f\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mACC_RDWR\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfapl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfapl\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
1172 | "\u001b[0;32mh5py/_objects.pyx\u001b[0m in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n",
1173 | "\u001b[0;32mh5py/_objects.pyx\u001b[0m in \u001b[0;36mh5py._objects.with_phil.wrapper\u001b[0;34m()\u001b[0m\n",
1174 | "\u001b[0;32mh5py/h5f.pyx\u001b[0m in \u001b[0;36mh5py.h5f.open\u001b[0;34m()\u001b[0m\n",
1175 | "\u001b[0;31mOSError\u001b[0m: Unable to open file (unable to open file: name = 'weights_coco.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)"
1176 | ]
1177 | }
1178 | ],
1179 | "source": [
1180 | "model.load_weights('weights_coco.h5')"
1181 | ]
1182 | },
1183 | {
1184 | "cell_type": "code",
1185 | "execution_count": null,
1186 | "metadata": {},
1187 | "outputs": [
1188 | {
1189 | "name": "stdout",
1190 | "output_type": "stream",
1191 | "text": [
1192 | "Epoch 1/100\n",
1193 | " 91/5120 [..............................] - ETA: 296822s - loss: 4.8495"
1194 | ]
1195 | }
1196 | ],
1197 | "source": [
1198 | "tb_counter = len([log for log in os.listdir(os.path.expanduser('~/logs/')) if 'coco_' in log]) + 1\n",
1199 | "tensorboard = callbacks.TensorBoard(log_dir=os.path.expanduser('~/logs/') + 'coco_' + '_' + str(tb_counter), \n",
1200 | " histogram_freq=0, \n",
1201 | " write_graph=True, \n",
1202 | " write_images=False)\n",
1203 | "\n",
1204 | "optimizer = optimizers.Adam(lr=0.5e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)\n",
1205 | "#optimizer = SGD(lr=1e-4, decay=0.0005, momentum=0.9)\n",
1206 | "#optimizer = RMSprop(lr=1e-4, rho=0.9, epsilon=1e-08, decay=0.0)\n",
1207 | "\n",
1208 | "model.compile(loss=custom_loss, optimizer=optimizer)\n",
1209 | "\n",
1210 | "model.fit_generator(generator = train_batch.get_generator(), \n",
1211 | " steps_per_epoch = train_batch.get_dateset_size(), \n",
1212 | " epochs = 100, \n",
1213 | " verbose = 1,\n",
1214 | " validation_data = valid_batch.get_generator(),\n",
1215 | " validation_steps = valid_batch.get_dateset_size(),\n",
1216 | " callbacks = [early_stop, checkpoint, tensorboard], \n",
1217 | " max_queue_size = 3)"
1218 | ]
1219 | },
1220 | {
1221 | "cell_type": "code",
1222 | "execution_count": null,
1223 | "metadata": {
1224 | "collapsed": true
1225 | },
1226 | "outputs": [],
1227 | "source": [
1228 | "%load_ext version_information\n",
1229 | "%version_information keras"
1230 | ]
1231 | },
1232 | {
1233 | "cell_type": "code",
1234 | "execution_count": null,
1235 | "metadata": {
1236 | "collapsed": true
1237 | },
1238 | "outputs": [],
1239 | "source": []
1240 | }
1241 | ],
1242 | "metadata": {
1243 | "kernelspec": {
1244 | "display_name": "Python 3",
1245 | "language": "python",
1246 | "name": "python3"
1247 | },
1248 | "language_info": {
1249 | "codemirror_mode": {
1250 | "name": "ipython",
1251 | "version": 3
1252 | },
1253 | "file_extension": ".py",
1254 | "mimetype": "text/x-python",
1255 | "name": "python",
1256 | "nbconvert_exporter": "python",
1257 | "pygments_lexer": "ipython3",
1258 | "version": "3.6.2"
1259 | },
1260 | "toc": {
1261 | "nav_menu": {},
1262 | "number_sections": true,
1263 | "sideBar": true,
1264 | "skip_h1_title": false,
1265 | "toc_cell": true,
1266 | "toc_position": {},
1267 | "toc_section_display": "block",
1268 | "toc_window_display": false
1269 | }
1270 | },
1271 | "nbformat": 4,
1272 | "nbformat_minor": 2
1273 | }
1274 |
--------------------------------------------------------------------------------
/coco2pascal.py:
--------------------------------------------------------------------------------
1 | import baker
2 | import json
3 | from path import Path as path
4 | from cytoolz import merge, join, groupby
5 | from cytoolz.compatibility import iteritems
6 | from cytoolz.curried import update_in
7 | from itertools import starmap
8 | from collections import deque
9 | from lxml import etree, objectify
10 | from scipy.io import savemat
11 | from scipy.ndimage import imread
12 |
13 |
14 | def keyjoin(leftkey, leftseq, rightkey, rightseq):
15 | return starmap(merge, join(leftkey, leftseq, rightkey, rightseq))
16 |
17 |
18 | def root(folder, filename, width, height):
19 | E = objectify.ElementMaker(annotate=False)
20 | return E.annotation(
21 | E.folder(folder),
22 | E.filename(filename),
23 | E.source(
24 | E.database('MS COCO 2014'),
25 | E.annotation('MS COCO 2014'),
26 | E.image('Flickr'),
27 | ),
28 | E.size(
29 | E.width(width),
30 | E.height(height),
31 | E.depth(3),
32 | ),
33 | E.segmented(0)
34 | )
35 |
36 |
37 | def instance_to_xml(anno):
38 | E = objectify.ElementMaker(annotate=False)
39 | xmin, ymin, width, height = anno['bbox']
40 | return E.object(
41 | E.name(anno['category_id']),
42 | E.bndbox(
43 | E.xmin(xmin),
44 | E.ymin(ymin),
45 | E.xmax(xmin+width),
46 | E.ymax(ymin+height),
47 | ),
48 | )
49 |
50 |
51 | @baker.command
52 | def write_categories(coco_annotation, dst):
53 | content = json.loads(path(coco_annotation).expand().text())
54 | categories = tuple( d['name'] for d in content['categories'])
55 | savemat(path(dst).expand(), {'categories': categories})
56 |
57 |
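# get_instances: keyjoin merges each COCO image record ('id', 'file_name',
# 'width', 'height') into every annotation that references it via 'image_id',
# so each returned instance dict carries both its bbox and its image metadata.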
58 | def get_instances(coco_annotation):
59 | coco_annotation = path(coco_annotation).expand()
60 | content = json.loads(coco_annotation.text())
61 | categories = {d['id']: d['name'] for d in content['categories']}
62 | return categories, tuple(keyjoin('id', content['images'], 'image_id', content['annotations']))
63 |
64 | def rename(name, year=2014):
65 | out_name = path(name).stripext()
66 | # out_name = out_name.split('_')[-1]
67 | # out_name = '{}_{}'.format(year, out_name)
68 | return out_name
69 |
70 |
71 | @baker.command
72 | def create_imageset(annotations, dst):
73 | annotations = path(annotations).expand()
74 | dst = path(dst).expand()
75 | val_txt = dst / 'val.txt'
76 | train_txt = dst / 'train.txt'
77 |
78 | for val in annotations.listdir('*val*'):
79 | val_txt.write_text('{}\n'.format(val.basename().stripext()), append=True)
80 |
81 | for train in annotations.listdir('*train*'):
82 | train_txt.write_text('{}\n'.format(train.basename().stripext()), append=True)
83 |
84 | @baker.command
85 | def create_annotations(dbpath, subset, dst):
86 | annotations_path = path(dbpath).expand() / 'annotations/instances_{}2014.json'.format(subset)
87 | images_path = path(dbpath).expand() / 'images/{}2014'.format(subset)
88 |     categories, instances = get_instances(annotations_path)
89 | dst = path(dst).expand()
90 |
91 | for i, instance in enumerate(instances):
92 | instances[i]['category_id'] = categories[instance['category_id']]
93 |
94 | for name, group in iteritems(groupby('file_name', instances)):
95 | img = imread(images_path / name)
96 | if img.ndim == 3:
97 | out_name = rename(name)
98 |             annotation = root('VOC2014', '{}.jpg'.format(out_name),
99 |                               group[0]['width'], group[0]['height'])  # root() expects (width, height)
100 | for instance in group:
101 | annotation.append(instance_to_xml(instance))
102 | etree.ElementTree(annotation).write(dst / '{}.xml'.format(out_name))
103 | print(out_name)
104 | else:
105 |         print(name)  # not an RGB image; skip it and report the file name
106 |
107 |
108 |
109 |
110 |
111 | if __name__ == '__main__':
112 | baker.run()
113 |
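114 | # Example CLI usage via baker (a sketch; the paths below are placeholders):
115 | #   python coco2pascal.py create_annotations ~/coco train ./annotations_voc
116 | #   python coco2pascal.py create_imageset ./annotations_voc ./ImageSets
117 | #   python coco2pascal.py write_categories ~/coco/annotations/instances_train2014.json categories.mat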
--------------------------------------------------------------------------------
/images/custom-loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/images/custom-loss.png
--------------------------------------------------------------------------------
/images/custom-loss2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/images/custom-loss2.png
--------------------------------------------------------------------------------
/images/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/images/model.png
--------------------------------------------------------------------------------
/model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/keras-yolo/3df717791cbfe1fa027c2347c498b4ac96b0b160/model.png
--------------------------------------------------------------------------------
/preprocessing.py:
--------------------------------------------------------------------------------
1 | import os
2 | import cv2
3 | import copy
4 | import numpy as np
5 | import imgaug as ia
6 | from imgaug import augmenters as iaa
7 | import xml.etree.ElementTree as ET
8 | from utils import BoundBox, normalize, bbox_iou
9 |
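# parse_annotation walks a folder of Pascal-VOC XML files and returns
# (all_imgs, seen_labels); each entry of all_imgs has the shape
#   {'filename': ..., 'width': ..., 'height': ...,
#    'object': [{'name': ..., 'xmin': ..., 'ymin': ..., 'xmax': ..., 'ymax': ...}, ...]}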
10 | def parse_annotation(ann_dir, img_dir, labels=[]):
11 | all_imgs = []
12 | seen_labels = set()
13 |
14 | for ann in sorted(os.listdir(ann_dir)):
15 | img = {'object':[]}
16 |
17 |         tree = ET.parse(os.path.join(ann_dir, ann))
18 |
19 | for elem in tree.iter():
20 | if 'filename' in elem.tag:
21 | all_imgs += [img]
22 |                 img['filename'] = os.path.join(img_dir, elem.text)
23 | if 'width' in elem.tag:
24 | img['width'] = int(elem.text)
25 | if 'height' in elem.tag:
26 | img['height'] = int(elem.text)
27 | if 'object' in elem.tag or 'part' in elem.tag:
28 | obj = {}
29 |
30 | for attr in list(elem):
31 | if 'name' in attr.tag:
32 | obj['name'] = attr.text
33 | seen_labels.add(obj['name'])
34 |
35 | if len(labels) > 0 and obj['name'] not in labels:
36 | break
37 | else:
38 | img['object'] += [obj]
39 |
40 | if 'bndbox' in attr.tag:
41 | for dim in list(attr):
42 | if 'xmin' in dim.tag:
43 | obj['xmin'] = int(round(float(dim.text)))
44 | if 'ymin' in dim.tag:
45 | obj['ymin'] = int(round(float(dim.text)))
46 | if 'xmax' in dim.tag:
47 | obj['xmax'] = int(round(float(dim.text)))
48 | if 'ymax' in dim.tag:
49 | obj['ymax'] = int(round(float(dim.text)))
50 |
51 | return all_imgs, seen_labels
52 |
53 | class BatchGenerator:
54 | def __init__(self, images,
55 | config,
56 | shuffle=True,
57 | jitter=True,
58 | norm=True):
59 |
60 | self.images = images
61 | self.config = config
62 |
63 | self.shuffle = shuffle
64 | self.jitter = jitter
65 | self.norm = norm
66 |
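        # config['ANCHORS'] is a flat list [w0, h0, w1, h1, ...] in grid-cell
        # units; each (w, h) pair becomes a BoundBox centred at the origin so
        # anchor matching can compare shapes with bbox_iou, ignoring position.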
67 | self.anchors = [BoundBox(0, 0, config['ANCHORS'][2*i], config['ANCHORS'][2*i+1]) for i in range(int(len(config['ANCHORS'])/2))]
68 |
69 | ### augmentors by https://github.com/aleju/imgaug
70 | sometimes = lambda aug: iaa.Sometimes(0.5, aug)
71 |
72 | # Define our sequence of augmentation steps that will be applied to every image
73 | # All augmenters with per_channel=0.5 will sample one value _per image_
74 | # in 50% of all cases. In all other cases they will sample new values
75 | # _per channel_.
76 | self.aug_pipe = iaa.Sequential(
77 | [
78 | # apply the following augmenters to most images
79 | #iaa.Fliplr(0.5), # horizontally flip 50% of all images
80 | #iaa.Flipud(0.2), # vertically flip 20% of all images
81 | #sometimes(iaa.Crop(percent=(0, 0.1))), # crop images by 0-10% of their height/width
82 | sometimes(iaa.Affine(
83 | #scale={"x": (0.8, 1.2), "y": (0.8, 1.2)}, # scale images to 80-120% of their size, individually per axis
84 | #translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)}, # translate by -20 to +20 percent (per axis)
85 |                     #rotate=(-5, 5), # rotate by -5 to +5 degrees
86 |                     #shear=(-5, 5), # shear by -5 to +5 degrees
87 | #order=[0, 1], # use nearest neighbour or bilinear interpolation (fast)
88 | #cval=(0, 255), # if mode is constant, use a cval between 0 and 255
89 | #mode=ia.ALL # use any of scikit-image's warping modes (see 2nd image from the top for examples)
90 | )),
91 | # execute 0 to 5 of the following (less important) augmenters per image
92 | # don't execute all of them, as that would often be way too strong
93 | iaa.SomeOf((0, 5),
94 | [
95 | #sometimes(iaa.Superpixels(p_replace=(0, 1.0), n_segments=(20, 200))), # convert images into their superpixel representation
96 | iaa.OneOf([
97 | iaa.GaussianBlur((0, 3.0)), # blur images with a sigma between 0 and 3.0
98 | iaa.AverageBlur(k=(2, 7)), # blur image using local means with kernel sizes between 2 and 7
99 |                         iaa.MedianBlur(k=(3, 11)), # blur image using local medians with kernel sizes between 3 and 11
100 | ]),
101 | iaa.Sharpen(alpha=(0, 1.0), lightness=(0.75, 1.5)), # sharpen images
102 | #iaa.Emboss(alpha=(0, 1.0), strength=(0, 2.0)), # emboss images
103 | # search either for all edges or for directed edges
104 | #sometimes(iaa.OneOf([
105 | # iaa.EdgeDetect(alpha=(0, 0.7)),
106 | # iaa.DirectedEdgeDetect(alpha=(0, 0.7), direction=(0.0, 1.0)),
107 | #])),
108 | iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5), # add gaussian noise to images
109 | iaa.OneOf([
110 | iaa.Dropout((0.01, 0.1), per_channel=0.5), # randomly remove up to 10% of the pixels
111 | #iaa.CoarseDropout((0.03, 0.15), size_percent=(0.02, 0.05), per_channel=0.2),
112 | ]),
113 | #iaa.Invert(0.05, per_channel=True), # invert color channels
114 | iaa.Add((-10, 10), per_channel=0.5), # change brightness of images (by -10 to 10 of original value)
115 | iaa.Multiply((0.5, 1.5), per_channel=0.5), # change brightness of images (50-150% of original value)
116 | iaa.ContrastNormalization((0.5, 2.0), per_channel=0.5), # improve or worsen the contrast
117 | #iaa.Grayscale(alpha=(0.0, 1.0)),
118 | #sometimes(iaa.ElasticTransformation(alpha=(0.5, 3.5), sigma=0.25)), # move pixels locally around (with random strengths)
119 | #sometimes(iaa.PiecewiseAffine(scale=(0.01, 0.05))) # sometimes move parts of the image around
120 | ],
121 | random_order=True
122 | )
123 | ],
124 | random_order=True
125 | )
126 |
127 | if shuffle: np.random.shuffle(self.images)
128 |
129 | def get_generator(self):
130 | num_img = len(self.images)
131 |
132 | total_count = 0
133 | batch_count = 0
134 |
135 | x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'], self.config['IMAGE_W'], 3)) # input images
136 |         b_batch = np.zeros((self.config['BATCH_SIZE'], 1, 1, 1, self.config['TRUE_BOX_BUFFER'], 4))   # list of self.config['TRUE_BOX_BUFFER'] ground-truth boxes
137 |         y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 4+1+1))                # desired network output: [x, y, w, h, objectness, class index] per anchor
138 |
139 | while True:
140 | if total_count < num_img:
141 | train_instance = self.images[total_count]
142 |
143 | # augment input image and fix object's position and size
144 | img, all_objs = self.aug_image(train_instance, jitter=self.jitter)
145 |
146 | # construct output from object's x, y, w, h
147 | true_box_index = 0
148 |
149 | for obj in all_objs:
150 | if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin'] and obj['name'] in self.config['LABELS']:
151 | center_x = .5*(obj['xmin'] + obj['xmax'])
152 | center_x = center_x / (float(self.config['IMAGE_W']) / self.config['GRID_W'])
153 | center_y = .5*(obj['ymin'] + obj['ymax'])
154 | center_y = center_y / (float(self.config['IMAGE_H']) / self.config['GRID_H'])
155 |
156 | grid_x = int(np.floor(center_x))
157 | grid_y = int(np.floor(center_y))
158 |
159 | if grid_x < self.config['GRID_W'] and grid_y < self.config['GRID_H']:
160 | obj_indx = self.config['LABELS'].index(obj['name'])
161 |
162 | center_w = (obj['xmax'] - obj['xmin']) / (float(self.config['IMAGE_W']) / self.config['GRID_W']) # unit: grid cell
163 |                         center_h = (obj['ymax'] - obj['ymin']) / (float(self.config['IMAGE_H']) / self.config['GRID_H']) # unit: grid cell
164 |
165 | box = [center_x, center_y, center_w, center_h]
166 |
167 | # find the anchor that best predicts this box
168 | best_anchor = -1
169 | max_iou = -1
170 |
171 | shifted_box = BoundBox(0,
172 | 0,
173 | center_w,
174 | center_h)
175 |
176 | for i in range(len(self.anchors)):
177 | anchor = self.anchors[i]
178 | iou = bbox_iou(shifted_box, anchor)
179 |
180 | if max_iou < iou:
181 | best_anchor = i
182 | max_iou = iou
183 |
184 | # assign ground truth x, y, w, h, confidence and class probs to y_batch
185 | y_batch[batch_count, grid_y, grid_x, best_anchor, 0:4] = box
186 | y_batch[batch_count, grid_y, grid_x, best_anchor, 4 ] = 1.
187 | y_batch[batch_count, grid_y, grid_x, best_anchor, 5 ] = obj_indx
188 |
189 | # assign the true box to b_batch
190 | b_batch[batch_count, 0, 0, 0, true_box_index] = box
191 |
192 | true_box_index += 1
193 |                             true_box_index = true_box_index % self.config['TRUE_BOX_BUFFER']  # ring buffer: keep at most TRUE_BOX_BUFFER boxes per image
194 |
195 | # assign input image to x_batch
196 | if self.norm:
197 | x_batch[batch_count] = normalize(img)
198 | else:
199 | # plot image and bounding boxes for sanity check
200 | for obj in all_objs:
201 | if obj['xmax'] > obj['xmin'] and obj['ymax'] > obj['ymin']:
202 | cv2.rectangle(img[:,:,::-1], (obj['xmin'],obj['ymin']), (obj['xmax'],obj['ymax']), (255,0,0), 3)
203 | cv2.putText(img[:,:,::-1], obj['name'],
204 | (obj['xmin']+2, obj['ymin']+12),
205 | 0, 1.2e-3 * img.shape[0],
206 | (0,255,0), 2)
207 |
208 | x_batch[batch_count] = img
209 |
210 | # increase instance counter in current batch
211 | batch_count += 1
212 |
213 | total_count += 1
214 | if total_count >= num_img:
215 | total_count = 0
216 | if self.shuffle: np.random.shuffle(self.images)
217 |
218 | if batch_count >= self.config['BATCH_SIZE']:
219 | yield [x_batch, b_batch], y_batch
220 |
221 |                     x_batch = np.zeros((self.config['BATCH_SIZE'], self.config['IMAGE_H'], self.config['IMAGE_W'], 3))
222 |                     b_batch = np.zeros((self.config['BATCH_SIZE'], 1, 1, 1, self.config['TRUE_BOX_BUFFER'], 4))  # reset the true-box buffer too
223 |                     y_batch = np.zeros((self.config['BATCH_SIZE'], self.config['GRID_H'], self.config['GRID_W'], self.config['BOX'], 4+1+1))  # same 4+1+1 shape as allocated above
224 |                     batch_count = 0
225 |
226 | def aug_image(self, train_instance, jitter):
227 | image_name = train_instance['filename']
228 | image = cv2.imread(image_name)
229 | h, w, c = image.shape
230 |
231 | all_objs = copy.deepcopy(train_instance['object'])
232 |
233 | if jitter:
234 | ### scale the image
235 |             scale = np.random.uniform() / 10. + 1.   # random scale in [1.0, 1.1)
236 | image = cv2.resize(image, (0,0), fx = scale, fy = scale)
237 |
238 | ### translate the image
239 | max_offx = (scale-1.) * w
240 | max_offy = (scale-1.) * h
241 | offx = int(np.random.uniform() * max_offx)
242 | offy = int(np.random.uniform() * max_offy)
243 |
244 | image = image[offy : (offy + h), offx : (offx + w)]
245 |
246 | ### flip the image
247 | flip = np.random.binomial(1, .5)
248 | if flip > 0.5: image = cv2.flip(image, 1)
249 |
250 | image = self.aug_pipe.augment_image(image)
251 |
252 | # resize the image to standard size
253 |         image = cv2.resize(image, (self.config['IMAGE_W'], self.config['IMAGE_H']))   # cv2.resize takes (width, height)
254 | image = image[:,:,::-1]
255 |
256 | # fix object's position and size
257 | for obj in all_objs:
258 | for attr in ['xmin', 'xmax']:
259 | if jitter: obj[attr] = int(obj[attr] * scale - offx)
260 |
261 | obj[attr] = int(obj[attr] * float(self.config['IMAGE_W']) / w)
262 | obj[attr] = max(min(obj[attr], self.config['IMAGE_W']), 0)
263 |
264 | for attr in ['ymin', 'ymax']:
265 | if jitter: obj[attr] = int(obj[attr] * scale - offy)
266 |
267 | obj[attr] = int(obj[attr] * float(self.config['IMAGE_H']) / h)
268 | obj[attr] = max(min(obj[attr], self.config['IMAGE_H']), 0)
269 |
270 | if jitter and flip > 0.5:
271 | xmin = obj['xmin']
272 | obj['xmin'] = self.config['IMAGE_W'] - obj['xmax']
273 | obj['xmax'] = self.config['IMAGE_W'] - xmin
274 |
275 | return image, all_objs
276 |
277 |     def get_dataset_size(self):
278 | return int(np.ceil(float(len(self.images))/self.config['BATCH_SIZE']))
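279 |
280 | # Minimal usage sketch (assumes a config dict like the notebook's
281 | # generator_config; the values below are illustrative placeholders):
282 | #
283 | #   config = {'IMAGE_H': 416, 'IMAGE_W': 416, 'GRID_H': 13, 'GRID_W': 13,
284 | #             'BOX': 2, 'LABELS': ['person'], 'CLASS': 1,
285 | #             'ANCHORS': [1., 1., 3., 3.], 'TRUE_BOX_BUFFER': 50, 'BATCH_SIZE': 16}
286 | #   imgs, _ = parse_annotation('anns/', 'imgs/', labels=config['LABELS'])
287 | #   batches = BatchGenerator(imgs, config, jitter=False)
288 | #   [x_batch, b_batch], y_batch = next(batches.get_generator())
289 | #   # x_batch: (16, 416, 416, 3), b_batch: (16, 1, 1, 1, 50, 4), y_batch: (16, 13, 13, 2, 6)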
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import xml.etree.ElementTree as ET
4 | import tensorflow as tf
5 | import copy
6 | import cv2
7 |
8 | class BoundBox:
9 | def __init__(self, x, y, w, h, c = None, classes = None):
10 | self.x = x
11 | self.y = y
12 | self.w = w
13 | self.h = h
14 |
15 | self.c = c
16 | self.classes = classes
17 |
18 | self.label = -1
19 | self.score = -1
20 |
21 | def get_label(self):
22 | if self.label == -1:
23 | self.label = np.argmax(self.classes)
24 |
25 | return self.label
26 |
27 | def get_score(self):
28 | if self.score == -1:
29 | self.score = self.classes[self.get_label()]
30 |
31 | return self.score
32 |
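# WeightReader streams raw float32 values out of a darknet .weights file; the
# offset starts at 4 to skip the 4-value header that precedes the weights.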
33 | class WeightReader:
34 | def __init__(self, weight_file):
35 | self.offset = 4
36 | self.all_weights = np.fromfile(weight_file, dtype='float32')
37 |
38 | def read_bytes(self, size):
39 | self.offset = self.offset + size
40 | return self.all_weights[self.offset-size:self.offset]
41 |
42 | def reset(self):
43 | self.offset = 4
44 |
45 | def normalize(image):
46 | image = image / 255.
47 |
48 | return image
49 |
50 | def bbox_iou(box1, box2):
51 | x1_min = box1.x - box1.w/2
52 | x1_max = box1.x + box1.w/2
53 | y1_min = box1.y - box1.h/2
54 | y1_max = box1.y + box1.h/2
55 |
56 | x2_min = box2.x - box2.w/2
57 | x2_max = box2.x + box2.w/2
58 | y2_min = box2.y - box2.h/2
59 | y2_max = box2.y + box2.h/2
60 |
61 | intersect_w = interval_overlap([x1_min, x1_max], [x2_min, x2_max])
62 | intersect_h = interval_overlap([y1_min, y1_max], [y2_min, y2_max])
63 |
64 | intersect = intersect_w * intersect_h
65 |
66 | union = box1.w * box1.h + box2.w * box2.h - intersect
67 |
68 | return float(intersect) / union
69 |
70 | def interval_overlap(interval_a, interval_b):
71 | x1, x2 = interval_a
72 | x3, x4 = interval_b
73 |
74 | if x3 < x1:
75 | if x4 < x1:
76 | return 0
77 | else:
78 | return min(x2,x4) - x1
79 | else:
80 | if x2 < x3:
81 | return 0
82 | else:
83 | return min(x2,x4) - x3
84 |
85 | def draw_boxes(image, boxes, labels):
86 |
87 | for box in boxes:
88 | xmin = int((box.x - box.w/2) * image.shape[1])
89 | xmax = int((box.x + box.w/2) * image.shape[1])
90 | ymin = int((box.y - box.h/2) * image.shape[0])
91 | ymax = int((box.y + box.h/2) * image.shape[0])
92 |
93 | cv2.rectangle(image, (xmin,ymin), (xmax,ymax), (0,255,0), 3)
94 | cv2.putText(image,
95 | labels[box.get_label()] + ' ' + str(box.get_score()),
96 | (xmin, ymin - 13),
97 | cv2.FONT_HERSHEY_SIMPLEX,
98 | 1e-3 * image.shape[0],
99 | (0,255,0), 2)
100 |
101 | return image
102 |
103 | def decode_netout(netout, obj_threshold, nms_threshold, anchors, nb_class):
104 | grid_h, grid_w, nb_box = netout.shape[:3]
105 |
106 | boxes = []
107 |
108 | # decode the output by the network
109 | netout[..., 4] = sigmoid(netout[..., 4])
110 | netout[..., 5:] = netout[..., 4][..., np.newaxis] * softmax(netout[..., 5:])
111 | netout[..., 5:] *= netout[..., 5:] > obj_threshold
112 |
113 | for row in range(grid_h):
114 | for col in range(grid_w):
115 | for b in range(nb_box):
116 |                 # elements from index 5 onwards are the class probabilities
117 | classes = netout[row,col,b,5:]
118 |
119 | if np.sum(classes) > 0:
120 | # first 4 elements are x, y, w, and h
121 | x, y, w, h = netout[row,col,b,:4]
122 |
123 | x = (col + sigmoid(x)) / grid_w # center position, unit: image width
124 | y = (row + sigmoid(y)) / grid_h # center position, unit: image height
125 | w = anchors[2 * b + 0] * np.exp(w) / grid_w # unit: image width
126 | h = anchors[2 * b + 1] * np.exp(h) / grid_h # unit: image height
127 | confidence = netout[row,col,b,4]
128 |
129 | box = BoundBox(x, y, w, h, confidence, classes)
130 |
131 | boxes.append(box)
132 |
133 | # suppress non-maximal boxes
134 | for c in range(nb_class):
135 | sorted_indices = list(reversed(np.argsort([box.classes[c] for box in boxes])))
136 |
137 |         for i in range(len(sorted_indices)):
138 | index_i = sorted_indices[i]
139 |
140 | if boxes[index_i].classes[c] == 0:
141 | continue
142 | else:
143 |                 for j in range(i+1, len(sorted_indices)):
144 | index_j = sorted_indices[j]
145 |
146 | if bbox_iou(boxes[index_i], boxes[index_j]) >= nms_threshold:
147 | boxes[index_j].classes[c] = 0
148 |
149 |     # remove the boxes whose best class score falls below obj_threshold
150 | boxes = [box for box in boxes if box.get_score() > obj_threshold]
151 |
152 | return boxes
153 |
154 | def sigmoid(x):
155 | return 1. / (1. + np.exp(-x))
156 |
157 | def softmax(x, axis=-1, t=-100.):
158 | x = x - np.max(x)
159 |
160 |     if np.min(x) < t:   # rescale so all values lie in [t, 0], keeping exp() from underflowing
161 | x = x/np.min(x)*t
162 |
163 | e_x = np.exp(x)
164 |
165 | return e_x / e_x.sum(axis, keepdims=True)
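166 |
167 | # Worked example for bbox_iou (boxes are centre-format x, y, w, h):
168 | #   a = BoundBox(0.5, 0.5, 1., 1.)   # covers [0, 1] x [0, 1]
169 | #   b = BoundBox(1.0, 0.5, 1., 1.)   # covers [0.5, 1.5] x [0, 1]
170 | #   intersection = 0.5 * 1.0 = 0.5, union = 1. + 1. - 0.5 = 1.5
171 | #   bbox_iou(a, b) == 0.5 / 1.5 ≈ 0.333
172 | #
173 | # decode_netout returns BoundBox objects whose x, y, w, h are normalized to
174 | # the image size, which is what draw_boxes expects when it scales by image.shape.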
--------------------------------------------------------------------------------