├── .gitignore
├── Notes
│   ├── AnchorsBoxes.ipynb
│   └── data_processing.ipynb
├── README.md
├── Train.py
├── datasets
│   ├── __init__.py
│   ├── data2record.py
│   ├── dataset_utils.py
│   ├── sythtextprovider.py
│   └── testproviderfailed.py
├── deployment
│   ├── __init__.py
│   └── model_deploy.py
├── nets
│   ├── __init__.py
│   ├── custom_layers.py
│   ├── textbox_common.py
│   └── txtbox_300.py
├── processing
│   ├── __init__.py
│   ├── image_processing.py
│   ├── image_processing2.py
│   ├── ssd_vgg_preprocessing.py
│   ├── test_processing.py
│   └── tf_image.py
├── tf_extended
│   ├── __init__.py
│   ├── bboxes.py
│   ├── image.py
│   ├── math.py
│   ├── metrics.py
│   └── tensors.py
└── tf_utils.py

/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .DS_Store
3 | .DS_Store?
4 | .Spotlight-V100
5 | .Trashes
6 | ehthumbs.db
7 | rawdata
8 | Thumbs.db
9 | *.csv
10 | data
11 | # Directories.
12 | datasets/__pycache__/
13 | deployment/__pycache__/
14 | nets/__pycache__/
15 | preprocessing/__pycache__/
16 | Notes/.ipynb_checkpoints/
17 | notebooks/.ipynb_checkpoints/
18 | ssd-tensorflow.sublime-workspace
19 | ssd-tensorflow.sublime-project
20 |
21 | checkpoints/ssd_300_vgg.ckpt.data-00000-of-00001
22 | checkpoints/ssd_300_vgg.ckpt.index
23 |
24 | logs/
25 | .ipynb_checkpoints/
26 | __pycache__/
27 |
28 | *.log
29 |
30 | checkpoints/VGG_VOC0712_SSD_*
--------------------------------------------------------------------------------
/Notes/AnchorsBoxes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": null,
6 |    "metadata": {
7 |     "collapsed": true
8 |    },
9 |    "outputs": [],
10 |    "source": [
11 |     "## test \n",
12 |     "## 1. anchor_boxes\n",
13 |     "## 2. groundtruth encode\n",
14 |     "## 3. bboxes decode (not yet finished)"
15 |    ]
16 |   },
17 |   {
18 |    "cell_type": "code",
19 |    "execution_count": 10,
20 |    "metadata": {
21 |     "collapsed": false
22 |    },
23 |    "outputs": [],
24 |    "source": [
25 |     "import numpy as np\n",
26 |     "import math\n",
27 |     "import tensorflow as tf\n",
28 |     "import sys\n",
29 |     "sys.path.insert(0,'../processing/')\n",
30 |     "sys.path.insert(0,'../')\n",
31 |     "from image_processing2 import *\n",
32 |     "import tf_extended as tfe"
33 |    ]
34 |   },
35 |   {
36 |    "cell_type": "code",
37 |    "execution_count": 3,
38 |    "metadata": {
39 |     "collapsed": false
40 |    },
41 |    "outputs": [
42 |     {
43 |      "name": "stdout",
44 |      "output_type": "stream",
45 |      "text": [
46 |       "[0.2, 0.31666666666666665, 0.43333333333333335, 0.55, 0.6666666666666666, 0.7833333333333332, 0.8999999999999999]\n"
47 |      ]
48 |     }
49 |    ],
50 |    "source": [
51 |     "img_shape=(300, 300)\n",
52 |     "num_classes=2\n",
53 |     "feat_layers=['conv_4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'pool6']\n",
54 |     "feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]\n",
55 |     "scale_range=[0.20, 0.90]\n",
56 |     "anchor_ratios=[1,2,3,5,7,10]\n",
57 |     "normalizations=[20, -1, -1, -1, -1, -1]\n",
58 |     "prior_scaling=[0.1, 0.1, 0.2, 0.2]\n",
59 |     "\n",
60 |     "step = (scale_range[1] - scale_range[0]) / len(feat_shapes)\n",
61 |     "scales = [scale_range[0] + i * step for i in range(len(feat_shapes)+1)]\n",
62 |     "print scales"
63 |    ]
64 |   },
65 |   {
66 |    "cell_type": "code",
67 |    "execution_count": 4,
68 |    "metadata": {
69 |     "collapsed": false
70 |    },
71 |    "outputs": [
72 |     {
73 |      "name": "stdout",
74 |      "output_type": "stream",
75 |      "text": [
76 |       "(38, 38, 2, 1)\n",
77 |       "[ 0.02368421  0.01674727  0.01367409  0.0105919   0.00895179  0.0074896 ]\n",
78 |       "(38, 38, 2, 6)\n"
79 |      ]
80 |     }
81 |    ],
82 |    "source": [
83 |     "def textbox_anchor_one_layer(img_shape,\n",
84 |     "                             feat_size,\n",
85 |     "                             ratios,\n",
86 |     "                             scale,\n",
87 |     "                             offset = 0.5,\n",
88 |     "                             dtype=np.float32):\n",
89 |     "    # Follow the paper's scheme:\n",
90 |     "    # 12 anchor boxes, without sk' = sqrt(sk * sk+1)\n",
91 |     "    y, x = np.mgrid[0:feat_size[0], 0:feat_size[1]] + 0.5\n",
92 |     "    y = y.astype(dtype) / feat_size[0]\n",
93 |     "    x = x.astype(dtype) / feat_size[1]\n",
94 |     "\n",
95 |     "    x_offset = x\n",
96 |     "    y_offset = y + offset\n",
97 |     "    x_out = np.stack((x, x_offset), -1)\n",
98 |     "    y_out = np.stack((y, y_offset), -1)\n",
99 |     "    y_out = np.expand_dims(y_out, axis=-1)\n",
100 |     "    x_out = np.expand_dims(x_out, axis=-1)\n",
101 |     "\n",
102 |     "    # \n",
103 |     "    num_anchors = 6\n",
104 |     "    h = np.zeros((num_anchors, ), dtype=dtype)\n",
105 |     "    w = np.zeros((num_anchors, ), dtype=dtype)\n",
106 |     "    for i, r in enumerate(ratios):\n",
107 |     "        h[i] = scale / math.sqrt(r) / feat_size[0]\n",
108 |     "        w[i] = scale * math.sqrt(r) / feat_size[1]\n",
109 |     "    return y_out, x_out, h, w\n",
110 |     "\n",
111 |     "y,x,h,w = textbox_anchor_one_layer((300,300), (38,38),(1,2,3,5,7,10),0.9)\n",
112 |     "print y.shape\n",
113 |     "print h\n",
114 |     "print (y -h).shape"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 5,
120 |    "metadata": {
121 |     "collapsed": false
122 |    },
123 |    "outputs": [
124 |     {
125 |      "name": "stdout",
126 |      "output_type": "stream",
127 |      "text": [
128 |       "6\n",
129 |       "4\n",
130 |       "(38, 38, 2, 1)\n"
131 |      ]
132 |     }
133 |    ],
134 |    "source": [
135 |     "def textbox_anchor_all_layers(img_shape,\n",
136 |     "                              layers_shape,\n",
137 |     "                              anchor_ratios,\n",
138 |     "                              scales,\n",
139 |     "                              offset=0.5,\n",
140 |     "                              dtype=np.float32):\n",
141 |     "    \"\"\"\n",
142 |     "    Compute anchor boxes for all feature layers.\n",
143 |     "    \"\"\"\n",
144 |     "    layers_anchors = []\n",
145 |     "    for i, s in enumerate(layers_shape):\n",
146 |     "        anchor_bboxes = textbox_anchor_one_layer(img_shape, s,\n",
147 |     "                                                 anchor_ratios,\n",
148 |     "                                                 scales[i],\n",
149 |     "                                                 offset=offset, dtype=dtype)\n",
150 |     "        layers_anchors.append(anchor_bboxes)\n",
151 |     "    return layers_anchors\n",
152 |     "\n",
153 |     "layers_anchors = textbox_anchor_all_layers((300,300), feat_shapes,anchor_ratios,scales)\n",
154 |     "print len(layers_anchors)\n",
155 |     "print len(layers_anchors[0])\n",
156 |     "print layers_anchors[0][0].shape\n"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": 12,
162 |    "metadata": {
163 |     "collapsed": false
164 |    },
165 |    "outputs": [],
166 |    "source": [
167 |     "# =========================================================================== #\n",
168 |     "# TensorFlow implementation of Text Boxes encoding / decoding.\n",
169 |     "# =========================================================================== #\n",
170 |     "\n",
171 |     "def tf_text_bboxes_encode_layer(bboxes,\n",
172 |     "                                anchors_layer,\n",
173 |     "                                matching_threshold=0.5,\n",
174 |     "                                prior_scaling=[0.1, 0.1, 0.2, 0.2],\n",
175 |     "                                dtype=tf.float32):\n",
176 |     "    \n",
177 |     "    \"\"\"\n",
178 |     "    Encode groundtruth labels and bounding boxes using Textbox anchors from\n",
179 |     "    one layer.\n",
180 |     "\n",
181 |     "    Arguments:\n",
182 |     "      bboxes: Nx4 Tensor(float) with bboxes relative coordinates;\n",
183 |     "      anchors_layer: Numpy array with layer anchors;\n",
184 |     "      matching_threshold: Threshold for positive match with groundtruth bboxes;\n",
185 |     "      prior_scaling: Scaling of encoded coordinates.\n",
186 |     "\n",
187 |     "    Return:\n",
188 |     "      (target_localizations, target_scores): Target Tensors.\n",
189 |     "    # this is a binary problem, so target_scores and target_labels are the same.\n",
190 |     "    \"\"\"\n",
191 |     "    # Anchors coordinates and volume.\n",
192 |     "\n",
193 |     "    yref, xref, href, wref = anchors_layer\n",
194 |     "    print yref.shape\n",
195 |     "    print href.shape\n",
196 |     "    print bboxes.shape\n",
197 |     "    ymin = yref - href / 2.\n",
198 |     "    xmin = xref - wref / 2.\n",
199 |     "    ymax = yref + href / 2.\n",
200 |     "    xmax = xref + wref / 2. 
\n", 201 | " vol_anchors = (xmax - xmin) * (ymax - ymin)\n", 202 | " \n", 203 | " # Initialize tensors...\n", 204 | " shape = (yref.shape[0], yref.shape[1], yref.shape[2], href.size)\n", 205 | " # all follow the shape(feat.size, feat.size, 2, 6)\n", 206 | " #feat_labels = tf.zeros(shape, dtype=tf.int64)\n", 207 | " feat_scores = tf.zeros(shape, dtype=dtype)\n", 208 | "\n", 209 | " feat_ymin = tf.zeros(shape, dtype=dtype)\n", 210 | " feat_xmin = tf.zeros(shape, dtype=dtype)\n", 211 | " feat_ymax = tf.ones(shape, dtype=dtype)\n", 212 | " feat_xmax = tf.ones(shape, dtype=dtype)\n", 213 | "\n", 214 | " def jaccard_with_anchors(bbox):\n", 215 | " \"\"\"\n", 216 | " Compute jaccard score between a box and the anchors.\n", 217 | " \"\"\"\n", 218 | " int_ymin = tf.maximum(ymin, bbox[0])\n", 219 | " int_xmin = tf.maximum(xmin, bbox[1])\n", 220 | " int_ymax = tf.minimum(ymax, bbox[2])\n", 221 | " int_xmax = tf.minimum(xmax, bbox[3])\n", 222 | " h = tf.maximum(int_ymax - int_ymin, 0.)\n", 223 | " w = tf.maximum(int_xmax - int_xmin, 0.)\n", 224 | " # Volumes.\n", 225 | " inter_vol = h * w\n", 226 | " union_vol = vol_anchors - inter_vol \\\n", 227 | " + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])\n", 228 | " jaccard = tf.div(inter_vol, union_vol)\n", 229 | " return jaccard\n", 230 | " \n", 231 | " \"\"\"\n", 232 | " # never use in Textbox\n", 233 | " def intersection_with_anchors(bbox):\n", 234 | " '''\n", 235 | " Compute intersection between score a box and the anchors.\n", 236 | " '''\n", 237 | " int_ymin = tf.maximum(ymin, bbox[0])\n", 238 | " int_xmin = tf.maximum(xmin, bbox[1])\n", 239 | " int_ymax = tf.minimum(ymax, bbox[2])\n", 240 | " int_xmax = tf.minimum(xmax, bbox[3])\n", 241 | " h = tf.maximum(int_ymax - int_ymin, 0.)\n", 242 | " w = tf.maximum(int_xmax - int_xmin, 0.)\n", 243 | " inter_vol = h * w\n", 244 | " scores = tf.div(inter_vol, vol_anchors)\n", 245 | " return scores\n", 246 | " \"\"\"\n", 247 | " \n", 248 | " def condition(i, feat_scores,\n", 249 | " feat_ymin, feat_xmin, feat_ymax, feat_xmax):\n", 250 | " \"\"\"Condition: check label index.\n", 251 | " \"\"\"\n", 252 | " r = tf.less(i, 3)\n", 253 | " return r\n", 254 | "\n", 255 | " def body(i, feat_scores,feat_ymin, feat_xmin, feat_ymax, feat_xmax,bbox):\n", 256 | " \"\"\"Body: update feature labels, scores and bboxes.\n", 257 | " Follow the original SSD paper for that purpose:\n", 258 | " - assign values when jaccard > 0.5;\n", 259 | " - only update if beat the score of other bboxes.\n", 260 | " \"\"\"\n", 261 | " # Jaccard score.\n", 262 | " #bbox = bboxes[i]\n", 263 | " jaccard = jaccard_with_anchors(bbox)\n", 264 | " # Mask: check threshold + scores + no annotations + num_classes.\n", 265 | " mask = tf.greater(jaccard, feat_scores)\n", 266 | " mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold))\n", 267 | " #mask = tf.logical_and(mask, feat_scores > -0.5)\n", 268 | " #mask = tf.logical_and(mask, label < num_classes)\n", 269 | " imask = tf.cast(mask, tf.int64)\n", 270 | " fmask = tf.cast(mask, dtype)\n", 271 | " # Update values using mask.\n", 272 | " #feat_labels = imask * label + (1 - imask) * feat_labels\n", 273 | " feat_scores = tf.where(mask, jaccard, feat_scores)\n", 274 | "\n", 275 | " feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin\n", 276 | " feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin\n", 277 | " feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax\n", 278 | " feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax\n", 279 | "\n", 280 | " # Check no annotation label: ignore these 
anchors...\n", 281 | " #interscts = intersection_with_anchors(bbox)\n", 282 | " #mask = tf.logical_and(interscts > ignore_threshold,\n", 283 | " # label == no_annotation_label)\n", 284 | " # Replace scores by -1.\n", 285 | " #feat_scores = tf.where(mask, -tf.cast(mask, dtype), feat_scores)\n", 286 | "\n", 287 | " return [i+1, feat_scores,\n", 288 | " feat_ymin, feat_xmin, feat_ymax, feat_xmax]\n", 289 | " # Main loop definition.\n", 290 | " '''\n", 291 | " i = 0\n", 292 | " [i,feat_scores,\n", 293 | " feat_ymin, feat_xmin,\n", 294 | " feat_ymax, feat_xmax] = tf.while_loop(condition, body,\n", 295 | " [i, feat_scores,\n", 296 | " feat_ymin, feat_xmin,\n", 297 | " feat_ymax, feat_xmax])\n", 298 | " '''\n", 299 | " for i, bbox in enumerate(tf.unstack(bboxes, axis=0)):\n", 300 | " [i,feat_scores,feat_ymin, \n", 301 | " feat_xmin, feat_ymax, feat_xmax] = body(i, feat_scores,\n", 302 | " feat_ymin, feat_xmin, \n", 303 | " feat_ymax, feat_xmax,bbox)\n", 304 | " # Transform to center / size.\n", 305 | " feat_cy = (feat_ymax + feat_ymin) / 2.\n", 306 | " feat_cx = (feat_xmax + feat_xmin) / 2.\n", 307 | " feat_h = feat_ymax - feat_ymin\n", 308 | " feat_w = feat_xmax - feat_xmin\n", 309 | " # Encode features.\n", 310 | " feat_cy = (feat_cy - yref) / href / prior_scaling[0]\n", 311 | " feat_cx = (feat_cx - xref) / wref / prior_scaling[1]\n", 312 | " feat_h = tf.log(feat_h / href) / prior_scaling[2]\n", 313 | " feat_w = tf.log(feat_w / wref) / prior_scaling[3]\n", 314 | " # Use SSD ordering: x / y / w / h instead of ours.\n", 315 | " feat_localizations = tf.stack([feat_cx, feat_cy, feat_w, feat_h], axis=-1)\n", 316 | " return feat_localizations, feat_scores\n" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 13, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "def tf_text_bboxes_encode(bboxes,\n", 328 | " anchors,\n", 329 | " matching_threshold=0.5,\n", 330 | " prior_scaling=[0.1, 0.1, 0.2, 0.2],\n", 331 | " dtype=tf.float32,\n", 332 | " scope='ssd_bboxes_encode'):\n", 333 | " \"\"\"Encode groundtruth labels and bounding boxes using SSD net anchors.\n", 334 | " Encoding boxes for all feature layers.\n", 335 | "\n", 336 | " Arguments:\n", 337 | " bboxes: Nx4 Tensor(float) with bboxes relative coordinates;\n", 338 | " anchors: List of Numpy array with layer anchors;\n", 339 | " matching_threshold: Threshold for positive match with groundtruth bboxes;\n", 340 | " prior_scaling: Scaling of encoded coordinates.\n", 341 | "\n", 342 | " Return:\n", 343 | " (target_labels, target_localizations, target_scores):\n", 344 | " Each element is a list of target Tensors.\n", 345 | " \"\"\"\n", 346 | " with tf.name_scope(scope):\n", 347 | " target_labels = []\n", 348 | " target_localizations = []\n", 349 | " target_scores = []\n", 350 | " for i, anchors_layer in enumerate(anchors):\n", 351 | " with tf.name_scope('bboxes_encode_block_%i' % i):\n", 352 | " t_loc, t_scores = \\\n", 353 | " tf_text_bboxes_encode_layer(bboxes, anchors_layer,\n", 354 | " matching_threshold,\n", 355 | " prior_scaling, dtype)\n", 356 | " target_localizations.append(t_loc)\n", 357 | " target_scores.append(t_scores)\n", 358 | " return target_localizations, target_scores" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 14, 364 | "metadata": { 365 | "collapsed": false 366 | }, 367 | "outputs": [ 368 | { 369 | "name": "stdout", 370 | "output_type": "stream", 371 | "text": [ 372 | "name 
SparseTensor(indices=Tensor(\"ParseSingleExample_1/Slice_Indices_image/name:0\", shape=(?, 1), dtype=int64), values=Tensor(\"ParseSingleExample_1/ParseExample/ParseExample:6\", shape=(?,), dtype=string), dense_shape=Tensor(\"ParseSingleExample_1/Squeeze_Shape_image/name:0\", shape=(1,), dtype=int64))\n", 373 | "image after decode Tensor(\"decode_jpeg_1/convert_image:0\", shape=(?, ?, 3), dtype=float32)\n", 374 | "labels: Tensor(\"ExpandDims_11:0\", shape=(1, ?), dtype=int64) \n", 375 | "(38, 38, 2, 1)\n", 376 | "(6,)\n", 377 | "(?, 4)\n" 378 | ] 379 | }, 380 | { 381 | "ename": "ValueError", 382 | "evalue": "Cannot infer num from shape (?, 4)", 383 | "output_type": "error", 384 | "traceback": [ 385 | "\u001b[0;31m---------------------------------------------------------------\u001b[0m", 386 | "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", 387 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 7\u001b[0m image,label,bboxes = image_processing(image_buffer, bboxes,label,\n\u001b[1;32m 8\u001b[0m train= True, thread_id = 0)\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mflocalization\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfscores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf_text_bboxes_encode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbboxes\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mlayers_anchors\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mmatching_threshold\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 10\u001b[0m \u001b[0;31m#print flocalization.shape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 11\u001b[0m \u001b[0;31m#print fscores.shape\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 388 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtf_text_bboxes_encode\u001b[0;34m(bboxes, anchors, matching_threshold, prior_scaling, dtype, scope)\u001b[0m\n\u001b[1;32m 26\u001b[0m t_loc, t_scores = tf_text_bboxes_encode_layer(bboxes, anchors_layer,\n\u001b[1;32m 27\u001b[0m \u001b[0mmatching_threshold\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m prior_scaling, dtype)\n\u001b[0m\u001b[1;32m 29\u001b[0m \u001b[0mtarget_localizations\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt_loc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mtarget_scores\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mt_scores\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 389 | "\u001b[0;32m\u001b[0m in \u001b[0;36mtf_text_bboxes_encode_layer\u001b[0;34m(bboxes, anchors_layer, matching_threshold, prior_scaling, dtype)\u001b[0m\n\u001b[1;32m 130\u001b[0m feat_ymax, feat_xmax])\n\u001b[1;32m 131\u001b[0m '''\n\u001b[0;32m--> 132\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbbox\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munstack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbboxes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 133\u001b[0m [i,feat_scores,feat_ymin, \n\u001b[1;32m 134\u001b[0m \u001b[0mfeat_xmin\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeat_ymax\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mfeat_xmax\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbody\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeat_scores\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 390 | "\u001b[0;32m/Applications/python/anaconda/envs/tensorflow2.7/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.pyc\u001b[0m in \u001b[0;36munstack\u001b[0;34m(value, num, axis, name)\u001b[0m\n\u001b[1;32m 958\u001b[0m \u001b[0mnum\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue_shape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 959\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnum\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 960\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Cannot infer num from shape %s\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mvalue_shape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 961\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mgen_array_ops\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_unpack\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnum\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mnum\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 962\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 391 | "\u001b[0;31mValueError\u001b[0m: Cannot infer num from shape (?, 4)" 392 | ] 393 | } 394 | ], 395 | "source": [ 396 | "data_dir = '/Users/xiaodiu/Documents/github/projecttextbox/TextBoxes-TensorFlow/data/sythtext/'\n", 397 | "file_name = data_dir + '1.tfrecord'\n", 398 | "## test if file_name exists \n", 399 | "\n", 400 | "example = tf.python_io.tf_record_iterator(file_name).next()\n", 401 | "image_buffer, label, bboxes, name= parse_example(example)\n", 402 | "image,label,bboxes = image_processing(image_buffer, bboxes,label,\n", 403 | " train= True, thread_id = 0)\n", 404 | "flocalization, fscores = tf_text_bboxes_encode(bboxes,layers_anchors,matching_threshold=0.1)\n", 405 | "#print flocalization.shape\n", 406 | "#print fscores.shape\n", 407 | "\n", 408 | "\n", 409 | "with tf.Session() as sess:\n", 410 | " sess.run(tf.global_variables_initializer())\n", 411 | " Image, label, bboxes = sess.run([image, label, bboxes])\n", 412 | " flocalization, fscores = sess.run([flocalization,fscores])\n", 413 | " print label.shape\n", 414 | " print bboxes\n", 415 | " #print name\n", 416 | " #print width\n", 417 | " #print height\n", 418 | " print Image.shape\n", 419 | " print flocalization[0].shape\n", 420 | " for i in range(6):\n", 421 | " print np.where(fscores[i] > 0)\n", 422 | " \"\"\"\n", 423 | " visualize_bbox(Image, bboxes)\n", 424 | " skio.imshow(Image)\n", 425 | " skio.show()\n", 426 | " \"\"\"" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": { 433 | "collapsed": true 434 | }, 435 | "outputs": [], 436 | "source": [] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": { 442 | "collapsed": true 443 | }, 444 | "outputs": [], 445 | "source": [] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | 
"metadata": { 451 | "collapsed": true 452 | }, 453 | "outputs": [], 454 | "source": [] 455 | } 456 | ], 457 | "metadata": { 458 | "kernelspec": { 459 | "display_name": "keras_tf_2.7", 460 | "language": "python", 461 | "name": "tensorflow2.7" 462 | }, 463 | "language_info": { 464 | "codemirror_mode": { 465 | "name": "ipython", 466 | "version": 2 467 | }, 468 | "file_extension": ".py", 469 | "mimetype": "text/x-python", 470 | "name": "python", 471 | "nbconvert_exporter": "python", 472 | "pygments_lexer": "ipython2", 473 | "version": "2.7.13" 474 | } 475 | }, 476 | "nbformat": 4, 477 | "nbformat_minor": 0 478 | } 479 | -------------------------------------------------------------------------------- /Notes/data_processing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "## 1. Transform data to record format\n", 12 | "## First dataset from http://www.robots.ox.ac.uk/~vgg/data/scenetext/\n", 13 | "## This method failed, because " 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 1, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "'1.0.1'" 27 | ] 28 | }, 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "%matplotlib inline\n", 36 | "import math\n", 37 | "import numpy as np\n", 38 | "import scipy.io as sio\n", 39 | "import gzip\n", 40 | "from zipfile import ZipFile\n", 41 | "import matplotlib.pyplot as plt\n", 42 | "import cv2\n", 43 | "import sys\n", 44 | "sys.path.insert(0,'../')\n", 45 | "import tensorflow as tf\n", 46 | "import skimage.io as skio\n", 47 | "tf.InteractiveSession()\n", 48 | "from PIL import Image\n", 49 | "import re\n", 50 | "import os\n", 51 | "slim = tf.contrib.slim\n", 52 | "tf.__version__" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "def int64_feature(value):\n", 64 | " \"\"\"Wrapper for inserting int64 features into Example proto.\n", 65 | " \"\"\"\n", 66 | " if not isinstance(value, list):\n", 67 | " value = [value]\n", 68 | " return tf.train.Feature(int64_list=tf.train.Int64List(value=value))\n", 69 | "\n", 70 | "\n", 71 | "def float_feature(value):\n", 72 | " \"\"\"Wrapper for inserting float features into Example proto.\n", 73 | " \"\"\"\n", 74 | " if not isinstance(value, list):\n", 75 | " value = [value]\n", 76 | " return tf.train.Feature(float_list=tf.train.FloatList(value=value))\n", 77 | "\n", 78 | "\n", 79 | "def bytes_feature(value):\n", 80 | " \"\"\"Wrapper for inserting bytes features into Example proto.\n", 81 | " \"\"\"\n", 82 | " if not isinstance(value, list):\n", 83 | " value = [value]\n", 84 | " return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "def visualize_bbox(image, bboxes):\n", 96 | " \"\"\"\n", 97 | " Input: image (height, width, channels)\n", 98 | " bboxes (numof bboxes, 4) in order(ymin, xmin, ymax, xmax)\n", 99 | " range(0,1) \n", 100 | " \"\"\"\n", 101 | " numofbox = bboxes.shape[0]\n", 102 | " width = image.shape[1]\n", 103 | " height = image.shape[0]\n", 104 | " def 
norm(x):\n",
105 |     "        if x < 0:\n",
106 |     "            x = 0\n",
107 |     "        else:\n",
108 |     "            if x > 1:\n",
109 |     "                x = 1\n",
110 |     "        return x\n",
111 |     "    xmin = [int(norm(i) * width) for i in bboxes[:,1]]\n",
112 |     "    ymin = [int(norm(i) * height) for i in bboxes[:,0]]\n",
113 |     "    ymax = [int(norm(i) * height) for i in bboxes[:,2]]\n",
114 |     "    xmax = [int(norm(i) * width) for i in bboxes[:,3]]\n",
115 |     "\n",
116 |     "    for i in range(numofbox):\n",
117 |     "        image = cv2.rectangle(image,(xmin[i],ymin[i]),\n",
118 |     "                              (xmax[i],ymax[i]),(0,0,0))\n",
119 |     "    skio.imshow(image)\n",
120 |     "    skio.show()"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {
127 |     "collapsed": true
128 |    },
129 |    "outputs": [],
130 |    "source": [
131 |     "if __name__ == \"__main__\":\n",
132 |     "    data_dir = '/Users/xiaodiu/Documents/github/projecttextbox/TextBoxes-TensorFlow/data/sythtext/'\n",
133 |     "    file_name = data_dir + '1.tfrecord'\n",
134 |     "    ## test if file_name exists \n",
135 |     "\n",
136 |     "    example = tf.python_io.tf_record_iterator(file_name).next()\n",
137 |     "    image_buffer, label, bboxes, name= parse_example(example)\n",
138 |     "    image,label,bboxes = image_processing(image_buffer, bboxes,label,\n",
139 |     "                                          train= True, thread_id = 0)\n",
140 |     "\n",
141 |     "    with tf.Session() as sess:\n",
142 |     "        sess.run(tf.global_variables_initializer())\n",
143 |     "        Image, label, bboxes = sess.run([image, label, bboxes])\n",
144 |     "        print label.shape\n",
145 |     "        print bboxes\n",
146 |     "        #print name\n",
147 |     "        #print width\n",
148 |     "        #print height\n",
149 |     "        print Image.shape\n",
150 |     "        visualize_bbox(Image, bboxes)\n",
151 |     "        skio.imshow(Image)\n",
152 |     "        skio.show()"
153 |    ]
154 |   }
155 |  ],
156 |  "metadata": {
157 |   "kernelspec": {
158 |    "display_name": "keras_tf_2.7",
159 |    "language": "python",
160 |    "name": "tensorflow2.7"
161 |   },
162 |   "language_info": {
163 |    "codemirror_mode": {
164 |     "name": "ipython",
165 |     "version": 2
166 |    },
167 |    "file_extension": ".py",
168 |    "mimetype": "text/x-python",
169 |    "name": "python",
170 |    "nbconvert_exporter": "python",
171 |    "pygments_lexer": "ipython2",
172 |    "version": "2.7.13"
173 |   }
174 |  },
175 |  "nbformat": 4,
176 |  "nbformat_minor": 0
177 | }
178 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TextBoxes-TensorFlow
2 | TextBoxes re-implementation using TensorFlow.
3 | This project is greatly inspired by the [slim project](https://github.com/tensorflow/models/tree/master/slim),
4 | and many functions are adapted from the [SSD-tensorflow project](https://github.com/balancap/SSD-Tensorflow).
5 | Later, we will rewrite this project to make it more
6 | flexible and modularized.
7 |
8 | Author:
9 | Daitao Xing : dx383@nyu.edu
10 | Jin Huang : jh5442@nyu.edu
11 |
12 | # Progress
13 | 2017/03/14
14 |
15 | Data processing phase finished.
16 | Test:
17 |
18 | 1. Download the dataset and put the 1/ folder and gt.mat under the data/sythtext/ folder (a download script will be written later); the expected layout is sketched after this list
19 | 2. python datasets/data2record.py
20 | 3. python image_processing.py
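
The layout under `data/sythtext/` assumed by `datasets/data2record.py` (paths are taken from that script; only folder `1/` is converted by default, since `NUMoffolder = 1`):

    data/sythtext/
    ├── gt.mat
    └── 1/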
21 |
22 | output: batch_size * 300 * 300 * 3 image
23 |
24 | 2017/03/17
25 |
26 | Finished the design of training (training can now be started):
27 |
28 |     python train.py \
29 |         --train_dir=${TRAIN_DIR} \
30 |         --dataset_dir=${DATASET_DIR} \
31 |         --save_summaries_secs=60 \
32 |         --save_interval_secs=600 \
33 |         --weight_decay=0.0005 \
34 |         --optimizer=adam \
35 |         --learning_rate=0.001 \
36 |         --batch_size=32
37 |
38 | # Problems to be solved:
39 | 1. Need to redesign visualization
40 | 2. image_processing can be improved
41 |
42 | # Next steps:
43 |
44 | 1. Training on other datasets
45 | 2. Fine-tuning
46 | 3. Testing
47 | 4. Automatic downloading of datasets, and so on
48 |
49 |
--------------------------------------------------------------------------------
/Train.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Paul Balanca. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Generic training script that trains an SSD model using a given dataset."""
16 |
17 | import tensorflow as tf
18 | from tensorflow.python.ops import control_flow_ops
19 |
20 | from datasets import sythtextprovider
21 | from deployment import model_deploy
22 | from nets import txtbox_300
23 | from processing import image_processing
24 | from processing import ssd_vgg_preprocessing
25 | import tf_utils
26 |
27 | slim = tf.contrib.slim
28 |
29 | # =========================================================================== #
30 | # Text Network flags.
31 | # =========================================================================== #
32 | tf.app.flags.DEFINE_float(
33 |     'loss_alpha', 1., 'Alpha parameter in the loss function.')
34 | tf.app.flags.DEFINE_float(
35 |     'negative_ratio', 3., 'Negative ratio in the loss function.')
36 | tf.app.flags.DEFINE_float(
37 |     'match_threshold', 0.1, 'Matching threshold in the loss function.')
38 |
39 | # =========================================================================== #
40 | # General Flags.
41 | # =========================================================================== #
42 | tf.app.flags.DEFINE_string(
43 |     'train_dir', '/tmp/tfmodel/',
44 |     'Directory where checkpoints and event logs are written to.')
45 | tf.app.flags.DEFINE_integer('num_clones', 1,
46 |                             'Number of model clones to deploy.')
47 | tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
48 |                             'Use CPUs to deploy clones.')
49 | tf.app.flags.DEFINE_integer(
50 |     'num_readers', 4,
51 |     'The number of parallel readers that read data from the dataset.')
52 | tf.app.flags.DEFINE_integer(
53 |     'num_preprocessing_threads', 4,
54 |     'The number of threads used to create the batches.')
55 |
56 | tf.app.flags.DEFINE_integer(
57 |     'log_every_n_steps', 10,
58 |     'The frequency with which logs are printed.')
59 | tf.app.flags.DEFINE_integer(
60 |     'save_summaries_secs', 600,
61 |     'The frequency with which summaries are saved, in seconds.')
62 | tf.app.flags.DEFINE_integer(
63 |     'save_interval_secs', 600,
64 |     'The frequency with which the model is saved, in seconds.')
65 | tf.app.flags.DEFINE_float(
66 |     'gpu_memory_fraction', 0.75, 'GPU memory fraction to use.')
67 |
68 | # =========================================================================== #
69 | # Optimization Flags.
70 | # =========================================================================== #
71 | tf.app.flags.DEFINE_float(
72 |     'weight_decay', 0.0005, 'The weight decay on the model weights.')
73 | tf.app.flags.DEFINE_string(
74 |     'optimizer', 'rmsprop',
75 |     'The name of the optimizer, one of "adadelta", "adagrad", "adam",'
76 |     '"ftrl", "momentum", "sgd" or "rmsprop".')
77 | tf.app.flags.DEFINE_float(
78 |     'adadelta_rho', 0.95,
79 |     'The decay rate for adadelta.')
80 | tf.app.flags.DEFINE_float(
81 |     'adagrad_initial_accumulator_value', 0.1,
82 |     'Starting value for the AdaGrad accumulators.')
83 | tf.app.flags.DEFINE_float(
84 |     'adam_beta1', 0.9,
85 |     'The exponential decay rate for the 1st moment estimates.')
86 | tf.app.flags.DEFINE_float(
87 |     'adam_beta2', 0.999,
88 |     'The exponential decay rate for the 2nd moment estimates.')
89 | tf.app.flags.DEFINE_float('opt_epsilon', 1.0, 'Epsilon term for the optimizer.')
90 | tf.app.flags.DEFINE_float('ftrl_learning_rate_power', -0.5,
91 |                           'The learning rate power.')
92 | tf.app.flags.DEFINE_float(
93 |     'ftrl_initial_accumulator_value', 0.1,
94 |     'Starting value for the FTRL accumulators.')
95 | tf.app.flags.DEFINE_float(
96 |     'ftrl_l1', 0.0, 'The FTRL l1 regularization strength.')
97 | tf.app.flags.DEFINE_float(
98 |     'ftrl_l2', 0.0, 'The FTRL l2 regularization strength.')
99 | tf.app.flags.DEFINE_float(
100 |     'momentum', 0.9,
101 |     'The momentum for the MomentumOptimizer and RMSPropOptimizer.')
102 | tf.app.flags.DEFINE_float('rmsprop_momentum', 0.9, 'Momentum.')
103 | tf.app.flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')
104 |
105 | # =========================================================================== #
106 | # Learning Rate Flags.
107 | # =========================================================================== #
108 | tf.app.flags.DEFINE_string(
109 |     'learning_rate_decay_type',
110 |     'fixed',
111 |     'Specifies how the learning rate is decayed. 
One of "fixed", "exponential",' 112 | ' or "polynomial"') 113 | tf.app.flags.DEFINE_float('learning_rate', 0.001, 'Initial learning rate.') 114 | tf.app.flags.DEFINE_float( 115 | 'end_learning_rate', 0.0001, 116 | 'The minimal end learning rate used by a polynomial decay learning rate.') 117 | tf.app.flags.DEFINE_float( 118 | 'label_smoothing', 0.0, 'The amount of label smoothing.') 119 | tf.app.flags.DEFINE_float( 120 | 'learning_rate_decay_factor', 0.1, 'Learning rate decay factor.') 121 | tf.app.flags.DEFINE_float( 122 | 'num_epochs_per_decay', 40000, 123 | 'Number of epochs after which learning rate decays.') 124 | tf.app.flags.DEFINE_float( 125 | 'moving_average_decay', None, 126 | 'The decay to use for the moving average.' 127 | 'If left as None, then moving averages are not used.') 128 | 129 | # =========================================================================== # 130 | # Dataset Flags. 131 | # =========================================================================== # 132 | tf.app.flags.DEFINE_string( 133 | 'dataset_name', 'sythtext', 'The name of the dataset to load.') 134 | tf.app.flags.DEFINE_integer( 135 | 'num_classes', 2, 'Number of classes to use in the dataset.') 136 | tf.app.flags.DEFINE_string( 137 | 'dataset_split_name', 'train', 'The name of the train/test split.') 138 | tf.app.flags.DEFINE_string( 139 | 'dataset_dir', None, 'The directory where the dataset files are stored.') 140 | tf.app.flags.DEFINE_integer( 141 | 'labels_offset', 0, 142 | 'An offset for the labels in the dataset. This flag is primarily used to ' 143 | 'evaluate the VGG and ResNet architectures which do not use a background ' 144 | 'class for the ImageNet dataset.') 145 | tf.app.flags.DEFINE_string( 146 | 'model_name', 'txtbox_300', 'The name of the architecture to train.') 147 | tf.app.flags.DEFINE_string( 148 | 'preprocessing_name', None, 'The name of the preprocessing to use. If left ' 149 | 'as `None`, then the model_name flag is used.') 150 | tf.app.flags.DEFINE_integer( 151 | 'batch_size', 32, 'The number of samples in each batch.') 152 | tf.app.flags.DEFINE_integer( 153 | 'train_image_size', None, 'Train image size') 154 | tf.app.flags.DEFINE_integer('max_number_of_steps', None, 155 | 'The maximum number of training steps.') 156 | # =========================================================================== # 157 | # Fine-Tuning Flags. 158 | # =========================================================================== # 159 | tf.app.flags.DEFINE_string( 160 | 'checkpoint_path', None, 161 | 'The path to a checkpoint from which to fine-tune.') 162 | tf.app.flags.DEFINE_string( 163 | 'checkpoint_model_scope', None, 164 | 'Model scope in the checkpoint. None if the same as the trained model.') 165 | tf.app.flags.DEFINE_string( 166 | 'checkpoint_exclude_scopes', None, 167 | 'Comma-separated list of scopes of variables to exclude when restoring ' 168 | 'from a checkpoint.') 169 | tf.app.flags.DEFINE_string( 170 | 'trainable_scopes', None, 171 | 'Comma-separated list of scopes to filter the set of variables to train.' 172 | 'By default, None would train all the variables.') 173 | tf.app.flags.DEFINE_boolean( 174 | 'ignore_missing_vars', False, 175 | 'When restoring a checkpoint would ignore missing variables.') 176 | 177 | FLAGS = tf.app.flags.FLAGS 178 | 179 | 180 | # =========================================================================== # 181 | # Main training routine. 
182 | # =========================================================================== #
183 | def main(_):
184 |     if not FLAGS.dataset_dir:
185 |         raise ValueError('You must supply the dataset directory with --dataset_dir')
186 |
187 |     tf.logging.set_verbosity(tf.logging.DEBUG)
188 |     with tf.Graph().as_default():
189 |         # Config model_deploy. Keep TF Slim Models structure.
190 |         # Useful if we want to use multiple GPUs and/or servers in the future.
191 |         deploy_config = model_deploy.DeploymentConfig(
192 |             num_clones=FLAGS.num_clones,
193 |             clone_on_cpu=FLAGS.clone_on_cpu,
194 |             replica_id=0,
195 |             num_replicas=1,
196 |             num_ps_tasks=0)
197 |         # Create global_step.
198 |         with tf.device(deploy_config.variables_device()):
199 |             global_step = slim.create_global_step()
200 |
201 |         # Select the dataset.
202 |
203 |         #dataset = dataset_factory.get_dataset(
204 |         #    FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
205 |         dataset = sythtextprovider.get_datasets(FLAGS.dataset_dir)
206 |         # Get the SSD network and its anchors.
207 |
208 |         #ssd_class = nets_factory.get_network(FLAGS.model_name)
209 |         #ssd_params = ssd_class.default_params._replace(num_classes=FLAGS.num_classes)
210 |         text_net = txtbox_300.TextboxNet()
211 |         text_shape = text_net.params.img_shape
212 |         print 'text_shape ' + str(text_shape)
213 |         text_anchors = text_net.anchors(text_shape)
214 |         print len(text_anchors)
215 |         # Select the preprocessing function.
216 |         '''
217 |         preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
218 |         image_preprocessing_fn = preprocessing_factory.get_preprocessing(
219 |             preprocessing_name, is_training=True)
220 |         '''
221 |         #tf_utils.print_configuration(FLAGS.__flags, ssd_params,
222 |         #                             dataset.data_sources, FLAGS.train_dir)
223 |         # =================================================================== #
224 |         # Create a dataset provider and batches.
225 |         # =================================================================== #
226 |         with tf.device(deploy_config.inputs_device()):
227 |             with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
228 |                 provider = slim.dataset_data_provider.DatasetDataProvider(
229 |                     dataset,
230 |                     num_readers=FLAGS.num_readers,
231 |                     common_queue_capacity=20 * FLAGS.batch_size,
232 |                     common_queue_min=10 * FLAGS.batch_size,
233 |                     shuffle=True)
234 |             # Get for SSD network: image, labels, bboxes.
235 |             [image, shape, glabels, gbboxes] = provider.get(['image', 'shape',
236 |                                                              'object/label',
237 |                                                              'object/bbox'])
238 |
239 |             init_op = tf.global_variables_initializer()
240 |
241 |             # Pre-processing image, labels and bboxes.
242 |
243 |             image, glabels, gbboxes = \
244 |                 ssd_vgg_preprocessing.preprocess_image(image, glabels, gbboxes,
245 |                                                        text_shape, is_training=True,
246 |                                                        data_format='NHWC')
247 |
248 |             # Encode groundtruth labels and bboxes.
249 |             print 'bboxes num' + str(gbboxes.get_shape())
250 |             print 'glabels' + str(tf.shape(glabels))
251 |             glocalisations, gscores = \
252 |                 text_net.bboxes_encode(gbboxes, text_anchors)
253 |             batch_shape = [1] + [len(text_anchors)] * 2
254 |
255 |             # Training batches and queue.
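            # (What follows relies on tf_utils.reshape_list: the first call
            # flattens the nested list [image, glocalisations (one per feature
            # layer), gscores (one per feature layer)] so tf.train.batch can
            # consume it, and the second call uses batch_shape = [1, N, N] --
            # with N = len(text_anchors) feature layers -- to regroup the
            # batched tensors into image / localisations / scores again.)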
256 | 257 | r = tf.train.batch( 258 | tf_utils.reshape_list([image, glocalisations, gscores]), 259 | batch_size=FLAGS.batch_size, 260 | num_threads=FLAGS.num_preprocessing_threads, 261 | capacity=5 * FLAGS.batch_size) 262 | print 'r shape' + str(r[0]) + str(r[1]) + str(r[10]) 263 | b_image, b_glocalisations, b_gscores= \ 264 | tf_utils.reshape_list(r, batch_shape) 265 | 266 | 267 | # Intermediate queueing: unique batch computation pipeline for all 268 | # GPUs running the training. 269 | batch_queue = slim.prefetch_queue.prefetch_queue( 270 | tf_utils.reshape_list([b_image, b_glocalisations,b_gscores]), 271 | capacity=2 * deploy_config.num_clones) 272 | 273 | 274 | # =================================================================== # 275 | # Define the model running on every GPU. 276 | # =================================================================== # 277 | def clone_fn(batch_queue): 278 | 279 | #Allows data parallelism by creating multiple 280 | #clones of network_fn. 281 | 282 | # Dequeue batch. 283 | b_image, b_glocalisations, b_gscores = \ 284 | tf_utils.reshape_list(batch_queue.dequeue(), batch_shape) 285 | 286 | # Construct SSD network. 287 | arg_scope = text_net.arg_scope(weight_decay=FLAGS.weight_decay) 288 | with slim.arg_scope(arg_scope): 289 | localisations, logits, end_points = \ 290 | text_net.net(b_image, is_training=True) 291 | # Add loss function. 292 | text_net.losses(logits, localisations, 293 | b_glocalisations, b_gscores, 294 | match_threshold=FLAGS.match_threshold, 295 | negative_ratio=FLAGS.negative_ratio, 296 | alpha=FLAGS.loss_alpha, 297 | label_smoothing=FLAGS.label_smoothing) 298 | return end_points 299 | 300 | # Gather initial summaries. 301 | summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES)) 302 | 303 | # =================================================================== # 304 | # Add summaries from first clone. 305 | # =================================================================== # 306 | clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue]) 307 | first_clone_scope = deploy_config.clone_scope(0) 308 | # Gather update_ops from the first clone. These contain, for example, 309 | # the updates for the batch_norm variables created by network_fn. 310 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope) 311 | 312 | # Add summaries for end_points. 313 | end_points = clones[0].outputs 314 | for end_point in end_points: 315 | x = end_points[end_point] 316 | summaries.add(tf.summary.histogram('activations/' + end_point, x)) 317 | summaries.add(tf.summary.scalar('sparsity/' + end_point, 318 | tf.nn.zero_fraction(x))) 319 | # Add summaries for losses and extra losses. 320 | for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope): 321 | summaries.add(tf.summary.scalar(loss.op.name, loss)) 322 | for loss in tf.get_collection('EXTRA_LOSSES', first_clone_scope): 323 | summaries.add(tf.summary.scalar(loss.op.name, loss)) 324 | 325 | # Add summaries for variables. 326 | for variable in slim.get_model_variables(): 327 | summaries.add(tf.summary.histogram(variable.op.name, variable)) 328 | 329 | # =================================================================== # 330 | # Configure the moving averages. 
331 | # =================================================================== # 332 | if FLAGS.moving_average_decay: 333 | moving_average_variables = slim.get_model_variables() 334 | variable_averages = tf.train.ExponentialMovingAverage( 335 | FLAGS.moving_average_decay, global_step) 336 | else: 337 | moving_average_variables, variable_averages = None, None 338 | 339 | # =================================================================== # 340 | # Configure the optimization procedure. 341 | # =================================================================== # 342 | with tf.device(deploy_config.optimizer_device()): 343 | learning_rate = tf_utils.configure_learning_rate(FLAGS, 344 | dataset.num_samples, 345 | global_step) 346 | optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate) 347 | summaries.add(tf.summary.scalar('learning_rate', learning_rate)) 348 | 349 | if FLAGS.moving_average_decay: 350 | # Update ops executed locally by trainer. 351 | update_ops.append(variable_averages.apply(moving_average_variables)) 352 | 353 | # Variables to train. 354 | variables_to_train = tf_utils.get_variables_to_train(FLAGS) 355 | 356 | # and returns a train_tensor and summary_op 357 | total_loss, clones_gradients = model_deploy.optimize_clones( 358 | clones, 359 | optimizer, 360 | var_list=variables_to_train) 361 | # Add total_loss to summary. 362 | summaries.add(tf.summary.scalar('total_loss', total_loss)) 363 | 364 | # Create gradient updates. 365 | grad_updates = optimizer.apply_gradients(clones_gradients, 366 | global_step=global_step) 367 | update_ops.append(grad_updates) 368 | update_op = tf.group(*update_ops) 369 | train_tensor = control_flow_ops.with_dependencies([update_op], total_loss, 370 | name='train_op') 371 | 372 | # Add the summaries from the first clone. These contain the summaries 373 | summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES, 374 | first_clone_scope)) 375 | # Merge all summaries together. 376 | summary_op = tf.summary.merge(list(summaries), name='summary_op') 377 | 378 | # =================================================================== # 379 | # Kicks off the training. 
380 | # =========================================================================== #
381 |     gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
382 |     config = tf.ConfigProto(log_device_placement=False,
383 |                             gpu_options=gpu_options)
384 |     saver = tf.train.Saver(max_to_keep=5,
385 |                            keep_checkpoint_every_n_hours=1.0,
386 |                            write_version=2,
387 |                            pad_step_number=False)
388 |     slim.learning.train(
389 |         train_tensor,
390 |         logdir=FLAGS.train_dir,
391 |         master='',
392 |         is_chief=True,
393 |         init_fn=tf_utils.get_init_fn(FLAGS),
394 |         summary_op=summary_op,
395 |         number_of_steps=FLAGS.max_number_of_steps,
396 |         log_every_n_steps=FLAGS.log_every_n_steps,
397 |         save_summaries_secs=FLAGS.save_summaries_secs,
398 |         saver=saver,
399 |         save_interval_secs=FLAGS.save_interval_secs,
400 |         session_config=config,
401 |         sync_optimizer=None)
402 |
403 |
404 | if __name__ == '__main__':
405 |     tf.app.run()
406 |
--------------------------------------------------------------------------------
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/datasets/data2record.py:
--------------------------------------------------------------------------------
1 | ## Script that downloads the dataset and transforms it into tf-records.
2 | ## Assume the dataset has been downloaded into the following folders:
3 | ## SynthText dataset (41G)
4 | ## data/sythtext/*
5 |
6 | import numpy as np
7 | import scipy.io as sio
8 | import os
9 | import tensorflow as tf
10 | import re
11 | from datasets.dataset_utils import int64_feature, float_feature, bytes_feature, ImageCoder, norm
12 | import cv2
13 | from PIL import Image
14 |
15 | data_path = 'data/sythtext/'
16 | os.chdir(data_path)
17 | cellname = 'gt'
18 | textname = 'txt'
19 | imcell = 'imnames'
20 | wordname = 'wordBB'
21 | charname = 'charBB'
22 | NUMoffolder = 1
23 |
24 | ## The SynthText dataset is too big to store in one record,
25 | ## so we write one tfrecord per directory name.
26 |
27 |
28 | def _convert_to_example(image_data, shape, bbox, label, imname):
29 |     nbbox = np.array(bbox)
30 |     ymin = list(nbbox[:, 0])
31 |     xmin = list(nbbox[:, 1])
32 |     ymax = list(nbbox[:, 2])
33 |     xmax = list(nbbox[:, 3])
34 |
35 |     print 'shape: {}, height:{}, width:{}'.format(shape, shape[0], shape[1])
36 |     example = tf.train.Example(features=tf.train.Features(feature={
37 |             'image/height': int64_feature(shape[0]),
38 |             'image/width': int64_feature(shape[1]),
39 |             'image/channels': int64_feature(shape[2]),
40 |             'image/shape': int64_feature(shape),
41 |             'image/object/bbox/xmin': float_feature(xmin),
42 |             'image/object/bbox/xmax': float_feature(xmax),
43 |             'image/object/bbox/ymin': float_feature(ymin),
44 |             'image/object/bbox/ymax': float_feature(ymax),
45 |             'image/object/bbox/label': int64_feature(label),
46 |             'image/format': bytes_feature('jpeg'),
47 |             'image/encoded': bytes_feature(image_data),
48 |             'image/name': bytes_feature(imname.tostring()),
49 |             }))
50 |     return example
51 |
52 |
53 | def _processing_image(wordbb, imname, coder):
54 |     image_data = tf.gfile.GFile(imname, 'r').read()
55 |     image = coder.decode_jpeg(image_data)
56 |     #image_data = np.array(Image.open(imname))
57 |     shape = image.shape
58 |     if(len(wordbb.shape) < 3):
59 |         numofbox = 1
60 |     else:
61 |         numofbox = wordbb.shape[2]
62 |     bbox = []
63 |     [xmin, ymin] = np.min(wordbb, 1)
64 |     [xmax, ymax] = np.max(wordbb, 1)
65 |     xmin = np.maximum(xmin, 0)
66 |     ymin = np.maximum(ymin, 0)
67 |     xmax = np.minimum(xmax, shape[1])  # clamp to image width (pixels) before normalizing
68 |     ymax = np.minimum(ymax, shape[0])  # clamp to image height (pixels) before normalizing
69 |     if numofbox > 1:
70 |         bbox = [[ymin[i]/shape[0], xmin[i]/shape[1], ymax[i]/shape[0], xmax[i]/shape[1]] for i in range(numofbox)]
71 |     if numofbox == 1:
72 |         bbox = [[ymin/shape[0], xmin/shape[1], ymax/shape[0], xmax/shape[1]]]
73 |
74 |
75 |     label = [1 for i in range(numofbox)]
76 |     shape = list(shape)
77 |     return image_data, shape, bbox, label, imname
78 |
79 |
80 | def run():
81 |     labels = sio.loadmat('gt.mat')
82 |     print labels.keys()
83 |     texts = labels[textname]
84 |     imnames = labels[imcell]
85 |     wordBB = labels[wordname]
86 |     charBB = labels[charname]
87 |     coder = ImageCoder()
88 |     for i in range(NUMoffolder):
89 |         tf_filename = str(i+1) + '.tfrecord'
90 |         tfrecord_writer = tf.python_io.TFRecordWriter(tf_filename)
91 |         dir = i+1
92 |         pattern = re.compile(r'^{}\/'.format(dir))
93 |         i = 0
94 |         res = [i for i in range(imnames.shape[1]) if pattern.match(imnames[0,i][0]) != None]
95 |         print len(res)
96 |         # shuffle
97 |         res = np.random.permutation(res)
98 |         for j in res:
99 |             wordbb = wordBB[0,j]
100 |             imname = imnames[0,j][0]
101 |             image_data, shape, bbox, label, imname = _processing_image(wordbb, imname, coder)
102 |
103 |             example = _convert_to_example(image_data, shape, bbox, label, imname)
104 |             tfrecord_writer.write(example.SerializeToString())
105 |     print 'Transform to tfrecord finished'
106 |
107 | if __name__ == '__main__':
108 |     run()
109 |
110 |
111 |
112 |
113 |
114 |
--------------------------------------------------------------------------------
/datasets/dataset_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Contains utilities for downloading and converting datasets.""" 16 | from __future__ import absolute_import 17 | from __future__ import division 18 | from __future__ import print_function 19 | 20 | import os 21 | import sys 22 | import tarfile 23 | 24 | from six.moves import urllib 25 | import tensorflow as tf 26 | 27 | LABELS_FILENAME = 'labels.txt' 28 | def norm(x): 29 | if x < 0: 30 | x = 0 31 | else: 32 | if x > 1: 33 | x = 1 34 | return x 35 | 36 | def int64_feature(value): 37 | """Wrapper for inserting int64 features into Example proto. 38 | """ 39 | if not isinstance(value, list): 40 | value = [value] 41 | return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) 42 | 43 | 44 | def float_feature(value): 45 | """Wrapper for inserting float features into Example proto. 46 | """ 47 | if not isinstance(value, list): 48 | value = [value] 49 | return tf.train.Feature(float_list=tf.train.FloatList(value=value)) 50 | 51 | 52 | def bytes_feature(value): 53 | """Wrapper for inserting bytes features into Example proto. 54 | """ 55 | if not isinstance(value, list): 56 | value = [value] 57 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=value)) 58 | 59 | 60 | def image_to_tfexample(image_data, image_format, height, width, class_id): 61 | return tf.train.Example(features=tf.train.Features(feature={ 62 | 'image/encoded': bytes_feature(image_data), 63 | 'image/format': bytes_feature(image_format), 64 | 'image/class/label': int64_feature(class_id), 65 | 'image/height': int64_feature(height), 66 | 'image/width': int64_feature(width), 67 | })) 68 | 69 | 70 | def download_and_uncompress_tarball(tarball_url, dataset_dir): 71 | """Downloads the `tarball_url` and uncompresses it locally. 72 | 73 | Args: 74 | tarball_url: The URL of a tarball file. 75 | dataset_dir: The directory where the temporary files are stored. 76 | """ 77 | filename = tarball_url.split('/')[-1] 78 | filepath = os.path.join(dataset_dir, filename) 79 | 80 | def _progress(count, block_size, total_size): 81 | sys.stdout.write('\r>> Downloading %s %.1f%%' % ( 82 | filename, float(count * block_size) / float(total_size) * 100.0)) 83 | sys.stdout.flush() 84 | filepath, _ = urllib.request.urlretrieve(tarball_url, filepath, _progress) 85 | print() 86 | statinfo = os.stat(filepath) 87 | print('Successfully downloaded', filename, statinfo.st_size, 'bytes.') 88 | tarfile.open(filepath, 'r:gz').extractall(dataset_dir) 89 | 90 | 91 | def write_label_file(labels_to_class_names, dataset_dir, 92 | filename=LABELS_FILENAME): 93 | """Writes a file with the list of class names. 94 | 95 | Args: 96 | labels_to_class_names: A map of (integer) labels to class names. 97 | dataset_dir: The directory in which the labels file should be written. 98 | filename: The filename where the class names are written. 
99 | """ 100 | labels_filename = os.path.join(dataset_dir, filename) 101 | with tf.gfile.Open(labels_filename, 'w') as f: 102 | for label in labels_to_class_names: 103 | class_name = labels_to_class_names[label] 104 | f.write('%d:%s\n' % (label, class_name)) 105 | 106 | 107 | def has_labels(dataset_dir, filename=LABELS_FILENAME): 108 | """Specifies whether or not the dataset directory contains a label map file. 109 | 110 | Args: 111 | dataset_dir: The directory in which the labels file is found. 112 | filename: The filename where the class names are written. 113 | 114 | Returns: 115 | `True` if the labels file exists and `False` otherwise. 116 | """ 117 | return tf.gfile.Exists(os.path.join(dataset_dir, filename)) 118 | 119 | 120 | def read_label_file(dataset_dir, filename=LABELS_FILENAME): 121 | """Reads the labels file and returns a mapping from ID to class name. 122 | 123 | Args: 124 | dataset_dir: The directory in which the labels file is found. 125 | filename: The filename where the class names are written. 126 | 127 | Returns: 128 | A map from a label (integer) to class name. 129 | """ 130 | labels_filename = os.path.join(dataset_dir, filename) 131 | with tf.gfile.Open(labels_filename, 'rb') as f: 132 | lines = f.read() 133 | lines = lines.split(b'\n') 134 | lines = filter(None, lines) 135 | 136 | labels_to_class_names = {} 137 | for line in lines: 138 | index = line.index(b':') 139 | labels_to_class_names[int(line[:index])] = line[index+1:] 140 | return labels_to_class_names 141 | 142 | 143 | class ImageCoder(object): 144 | """Helper class that provides TensorFlow image coding utilities.""" 145 | 146 | def __init__(self): 147 | # Create a single Session to run all image coding calls. 148 | self._sess = tf.Session() 149 | 150 | # Initializes function that converts PNG to JPEG data. 151 | self._png_data = tf.placeholder(dtype=tf.string) 152 | image = tf.image.decode_png(self._png_data, channels=3) 153 | self._png_to_jpeg = tf.image.encode_jpeg(image, format='rgb', quality=100) 154 | 155 | # Initializes function that converts CMYK JPEG data to RGB JPEG data. 156 | self._cmyk_data = tf.placeholder(dtype=tf.string) 157 | image = tf.image.decode_jpeg(self._cmyk_data, channels=0) 158 | self._cmyk_to_rgb = tf.image.encode_jpeg(image, format='rgb', quality=100) 159 | 160 | # Initializes function that decodes RGB JPEG data. 
161 | self._decode_jpeg_data = tf.placeholder(dtype=tf.string) 162 | self._decode_jpeg = tf.image.decode_jpeg(self._decode_jpeg_data, channels=3) 163 | 164 | def png_to_jpeg(self, image_data): 165 | return self._sess.run(self._png_to_jpeg, 166 | feed_dict={self._png_data: image_data}) 167 | 168 | def cmyk_to_rgb(self, image_data): 169 | return self._sess.run(self._cmyk_to_rgb, 170 | feed_dict={self._cmyk_data: image_data}) 171 | 172 | def decode_jpeg(self, image_data): 173 | image = self._sess.run(self._decode_jpeg, 174 | feed_dict={self._decode_jpeg_data: image_data}) 175 | assert len(image.shape) == 3 176 | assert image.shape[2] == 3 177 | return image 178 | -------------------------------------------------------------------------------- /datasets/sythtextprovider.py: -------------------------------------------------------------------------------- 1 | ## an initial version 2 | ## Transform the tfrecord to slim data provider format 3 | 4 | import numpy 5 | import tensorflow as tf 6 | import os 7 | slim = tf.contrib.slim 8 | 9 | 10 | 11 | 12 | ITEMS_TO_DESCRIPTIONS = { 13 | 'image': 'slim.tfexample_decoder.Image', 14 | 'shape': 'shape', 15 | 'height': 'height', 16 | 'width': 'width', 17 | 'object/bbox': 'box', 18 | 'object/label': 'label' 19 | } 20 | SPLITS_TO_SIZES = { 21 | 'train': 4262, 22 | } 23 | NUM_CLASSES = 2 24 | 25 | 26 | 27 | def get_datasets(data_dir,file_pattern = '*.tfrecord'): 28 | file_patterns = os.path.join(data_dir, file_pattern) 29 | print 'file_path: {}'.format(file_patterns) 30 | reader = tf.TFRecordReader 31 | keys_to_features = { 32 | 'image/height': tf.FixedLenFeature([1], tf.int64), 33 | 'image/width': tf.FixedLenFeature([1], tf.int64), 34 | 'image/channels': tf.FixedLenFeature([1], tf.int64), 35 | 'image/shape': tf.FixedLenFeature([3], tf.int64), 36 | 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32), 37 | 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32), 38 | 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32), 39 | 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32), 40 | 'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64), 41 | 'image/format': tf.FixedLenFeature([], tf.string, default_value='jpeg'), 42 | 'image/encoded': tf.FixedLenFeature([], tf.string, default_value=''), 43 | 'image/name': tf.VarLenFeature(dtype = tf.string), 44 | } 45 | 46 | items_to_handlers = { 47 | 'image': slim.tfexample_decoder.Image('image/encoded', 'image/format'), 48 | #'image': slim.tfexample_decoder.Tensor('image/encoded'), 49 | 'shape': slim.tfexample_decoder.Tensor('image/shape'), 50 | 'height': slim.tfexample_decoder.Tensor('image/height'), 51 | 'width': slim.tfexample_decoder.Tensor('image/width'), 52 | 'object/bbox': slim.tfexample_decoder.BoundingBox( 53 | ['xmin', 'ymin', 'xmax', 'ymax'], 'image/object/bbox/'), 54 | 'object/label': slim.tfexample_decoder.Tensor('image/object/bbox/label'), 55 | #'objext/txt': slim.tfexample_decoder.Tensor('image/object/bbox/label_text'), 56 | } 57 | 58 | decoder = slim.tfexample_decoder.TFExampleDecoder( 59 | keys_to_features, items_to_handlers) 60 | 61 | labels_to_names = None 62 | 63 | 64 | return slim.dataset.Dataset( 65 | data_sources=file_patterns, 66 | reader=reader, 67 | decoder=decoder, 68 | num_samples=SPLITS_TO_SIZES['train'], 69 | items_to_descriptions=ITEMS_TO_DESCRIPTIONS, 70 | num_classes=NUM_CLASSES, 71 | labels_to_names=labels_to_names) -------------------------------------------------------------------------------- /datasets/testproviderfailed.py: 
-------------------------------------------------------------------------------- 1 | import datasets.sythtextprovider as sythtext 2 | import tensorflow as tf 3 | slim = tf.contrib.slim 4 | import cv2 5 | #import matplotlib.pyplot as plt 6 | from PIL import Image 7 | from datasets.sythtextprovider import get_datasets 8 | """ 9 | data_dir = '/Users/xiaodiu/Documents/github/projecttextbox/TextBoxes-TensorFlow/data/sythtext/' 10 | file = data_dir + '1.tfrecord' 11 | 12 | 13 | tfrecord_file_queue = tf.train.string_input_producer([file] ,num_epochs = 1,name='queue',shuffle = True) 14 | reader = tf.TFRecordReader() 15 | _, tfrecord_serialized = reader.read(tfrecord_file_queue) 16 | # label and image are stored as bytes but could be stored as 17 | # int64 or float64 values in a serialized tf.Example protobuf. 18 | tfrecord_features = tf.parse_single_example(tfrecord_serialized, 19 | features={ 20 | 'image/height': tf.FixedLenFeature([1], tf.int64), 21 | 'image/width': tf.FixedLenFeature([1], tf.int64), 22 | 'image/channels': tf.FixedLenFeature([1], tf.int64), 23 | 'image/shape': tf.FixedLenFeature([], tf.int64), 24 | 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32), 25 | 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32), 26 | 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32), 27 | 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32), 28 | 'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64), 29 | #'image/object/bbox/label_text' : tf.VarLenFeature(dtype=tf.string), 30 | 'image/format': tf.FixedLenFeature((), tf.string, default_value='jpeg'), 31 | 'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''), 32 | }, name='features') 33 | # image was saved as uint8, so we have to decode as uint8. 34 | 35 | image = tf.decode_raw(tfrecord_features['image/encoded'], tf.uint8) 36 | shape = tf.cast(tfrecord_features['image/shape'], tf.int64) 37 | #image = tf.reshape(image, shape) 38 | height = tf.cast(tfrecord_features['image/height'],tf.int64) 39 | width = tf.cast(tfrecord_features['image/width'],tf.int64) 40 | 41 | """ 42 | dataset_dir = '/Users/xiaodiu/Documents/github/projecttextbox/TextBoxes-TensorFlow/data/sythtext/' 43 | dataset = get_datasets(dataset_dir) 44 | 45 | provider = slim.dataset_data_provider.DatasetDataProvider( 46 | dataset, 47 | num_readers=1, 48 | common_queue_capacity=20 * 32, 49 | common_queue_min=10 * 32, 50 | shuffle=True) 51 | # Get for SSD network: image, labels, bboxes. 
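# (A DatasetDataProvider wraps a parallel reader around the slim dataset;
# provider.get() returns the decoded tensors in the same order as the
# requested item names. A minimal sketch of the same pattern, assuming the
# dataset above is valid:
#   image, bboxes = provider.get(['image', 'object/bbox'])
# where bboxes comes back as [num_boxes, 4], sides in the key order that
# was configured in the BoundingBox handler.)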
52 | [image,shape, height, width,glabels, gbboxes,] = provider.get(['image','shape', 'height', 53 | 'width', 54 | 'object/label', 55 | 'object/bbox']) 56 | #image = tf.decode_raw(image, tf.uint8) 57 | #height = tf.cast(features['height'], tf.int32) 58 | #width = tf.cast(features['width'], tf.int32) 59 | #image = tf.reshape(image, tf.pack([height,width,3])) 60 | 61 | 62 | #print image 63 | print shape 64 | print glabels 65 | print gbboxes 66 | with tf.Session() as sess: 67 | 68 | sess.run(tf.global_variables_initializer()) 69 | sess.run(tf.local_variables_initializer()) 70 | coord = tf.train.Coordinator() 71 | threads = tf.train.start_queue_runners(coord=coord) 72 | #print sess.run(shape) 73 | #img = sess.run(image) 74 | #print img.shape 75 | print sess.run([height,width]) 76 | print sess.run(shape) 77 | print sess.run(gbboxes).shape 78 | print sess.run(glabels) 79 | 80 | 81 | coord.request_stop() 82 | coord.join(threads) 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /deployment/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /nets/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /nets/custom_layers.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 Paul Balanca. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Implement some custom layers, not provided by TensorFlow. 16 | 17 | Trying to follow as much as possible the style/standards used in 18 | tf.contrib.layers 19 | """ 20 | import tensorflow as tf 21 | 22 | from tensorflow.contrib.framework.python.ops import add_arg_scope 23 | from tensorflow.contrib.layers.python.layers import initializers 24 | from tensorflow.contrib.framework.python.ops import variables 25 | from tensorflow.contrib.layers.python.layers import utils 26 | from tensorflow.python.ops import nn 27 | from tensorflow.python.ops import init_ops 28 | from tensorflow.python.ops import variable_scope 29 | 30 | 31 | def abs_smooth(x): 32 | """Smoothed absolute function. Useful to compute an L1 smooth error. 33 | 34 | Define as: 35 | x^2 / 2 if abs(x) < 1 36 | abs(x) - 0.5 if abs(x) > 1 37 | We use here a differentiable definition using min(x) and abs(x). Clearly 38 | not optimal, but good enough for our purpose! 
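    A quick check of the min/abs form: for abs(x) < 1, minx = abs(x) and
    r = 0.5 * ((abs(x) - 1) * abs(x) + abs(x)) = x^2 / 2; for abs(x) >= 1,
    minx = 1 and r = 0.5 * ((abs(x) - 1) + abs(x)) = abs(x) - 0.5, matching
    the two cases above.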
39 | """ 40 | absx = tf.abs(x) 41 | minx = tf.minimum(absx, 1) 42 | r = 0.5 * ((absx - 1) * minx + absx) 43 | return r 44 | 45 | 46 | @add_arg_scope 47 | def l2_normalization( 48 | inputs, 49 | scaling=False, 50 | scale_initializer=init_ops.ones_initializer(), 51 | reuse=None, 52 | variables_collections=None, 53 | outputs_collections=None, 54 | data_format='NHWC', 55 | trainable=True, 56 | scope=None): 57 | """Implement L2 normalization on every feature (i.e. spatial normalization). 58 | 59 | Should be extended in some near future to other dimensions, providing a more 60 | flexible normalization framework. 61 | 62 | Args: 63 | inputs: a 4-D tensor with dimensions [batch_size, height, width, channels]. 64 | scaling: whether or not to add a post scaling operation along the dimensions 65 | which have been normalized. 66 | scale_initializer: An initializer for the weights. 67 | reuse: whether or not the layer and its variables should be reused. To be 68 | able to reuse the layer scope must be given. 69 | variables_collections: optional list of collections for all the variables or 70 | a dictionary containing a different list of collection per variable. 71 | outputs_collections: collection to add the outputs. 72 | data_format: NHWC or NCHW data format. 73 | trainable: If `True` also add variables to the graph collection 74 | `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). 75 | scope: Optional scope for `variable_scope`. 76 | Returns: 77 | A `Tensor` representing the output of the operation. 78 | """ 79 | 80 | with variable_scope.variable_scope( 81 | scope, 'L2Normalization', [inputs], reuse=reuse) as sc: 82 | inputs_shape = inputs.get_shape() 83 | inputs_rank = inputs_shape.ndims 84 | dtype = inputs.dtype.base_dtype 85 | if data_format == 'NHWC': 86 | # norm_dim = tf.range(1, inputs_rank-1) 87 | norm_dim = tf.range(inputs_rank-1, inputs_rank) 88 | params_shape = inputs_shape[-1:] 89 | elif data_format == 'NCHW': 90 | # norm_dim = tf.range(2, inputs_rank) 91 | norm_dim = tf.range(1, 2) 92 | params_shape = (inputs_shape[1]) 93 | 94 | # Normalize along spatial dimensions. 95 | outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12) 96 | # Additional scaling. 97 | if scaling: 98 | scale_collections = utils.get_variable_collections( 99 | variables_collections, 'scale') 100 | scale = variables.model_variable('gamma', 101 | shape=params_shape, 102 | dtype=dtype, 103 | initializer=scale_initializer, 104 | collections=scale_collections, 105 | trainable=trainable) 106 | if data_format == 'NHWC': 107 | outputs = tf.multiply(outputs, scale) 108 | elif data_format == 'NCHW': 109 | scale = tf.expand_dims(scale, axis=-1) 110 | scale = tf.expand_dims(scale, axis=-1) 111 | outputs = tf.multiply(outputs, scale) 112 | # outputs = tf.transpose(outputs, perm=(0, 2, 3, 1)) 113 | 114 | return utils.collect_named_outputs(outputs_collections, 115 | sc.original_name_scope, outputs) 116 | 117 | 118 | @add_arg_scope 119 | def pad2d(inputs, 120 | pad=(0, 0), 121 | mode='CONSTANT', 122 | data_format='NHWC', 123 | trainable=True, 124 | scope=None): 125 | """2D Padding layer, adding a symmetric padding to H and W dimensions. 126 | 127 | Aims to mimic padding in Caffe and MXNet, helping the port of models to 128 | TensorFlow. Tries to follow the naming convention of `tf.contrib.layers`. 129 | 130 | Args: 131 | inputs: 4D input Tensor; 132 | pad: 2-Tuple with padding values for H and W dimensions; 133 | mode: Padding mode. C.f. `tf.pad` 134 | data_format: NHWC or NCHW data format. 
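    For example, pad=(1, 1) on an NHWC tensor of shape [N, 10, 10, C]
    returns shape [N, 12, 12, C], mimicking Caffe's symmetric pad=1 before
    a 'VALID' convolution.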
135 | """ 136 | with tf.name_scope(scope, 'pad2d', [inputs]): 137 | # Padding shape. 138 | if data_format == 'NHWC': 139 | paddings = [[0, 0], [pad[0], pad[0]], [pad[1], pad[1]], [0, 0]] 140 | elif data_format == 'NCHW': 141 | paddings = [[0, 0], [0, 0], [pad[0], pad[0]], [pad[1], pad[1]]] 142 | net = tf.pad(inputs, paddings, mode=mode) 143 | return net 144 | 145 | 146 | @add_arg_scope 147 | def channel_to_last(inputs, 148 | data_format='NHWC', 149 | scope=None): 150 | """Move the channel axis to the last dimension. Allows to 151 | provide a single output format whatever the input data format. 152 | 153 | Args: 154 | inputs: Input Tensor; 155 | data_format: NHWC or NCHW. 156 | Return: 157 | Input in NHWC format. 158 | """ 159 | with tf.name_scope(scope, 'channel_to_last', [inputs]): 160 | if data_format == 'NHWC': 161 | net = inputs 162 | elif data_format == 'NCHW': 163 | net = tf.transpose(inputs, perm=(0, 2, 3, 1)) 164 | return net 165 | -------------------------------------------------------------------------------- /nets/textbox_common.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import numpy as np 4 | import math 5 | 6 | 7 | 8 | 9 | 10 | # =========================================================================== # 11 | # TensorFlow implementation of Text Boxes encoding / decoding. 12 | # =========================================================================== # 13 | 14 | def tf_text_bboxes_encode_layer(bboxes, 15 | anchors_layer, 16 | matching_threshold=0.1, 17 | prior_scaling=[0.1, 0.1, 0.2, 0.2], 18 | dtype=tf.float32): 19 | 20 | """ 21 | Encode groundtruth labels and bounding boxes using Textbox anchors from 22 | one layer. 23 | 24 | Arguments: 25 | bboxes: Nx4 Tensor(float) with bboxes relative coordinates; 26 | anchors_layer: Numpy array with layer anchors; 27 | matching_threshold: Threshold for positive match with groundtruth bboxes; 28 | prior_scaling: Scaling of encoded coordinates. 29 | 30 | Return: 31 | (target_localizations, target_scores): Target Tensors. 32 | # this is a binary problem, so target_scores and target_labels are the same. 33 | """ 34 | # Anchors coordinates and volume. 35 | 36 | yref, xref, href, wref = anchors_layer 37 | print yref.shape 38 | print href.shape 39 | print bboxes.shape 40 | ymin = yref - href / 2. 41 | xmin = xref - wref / 2. 42 | ymax = yref + href / 2. 43 | xmax = xref + wref / 2. 44 | vol_anchors = (xmax - xmin) * (ymax - ymin) 45 | 46 | # Initialize tensors... 47 | shape = (yref.shape[0], yref.shape[1], yref.shape[2], href.size) 48 | # all follow the shape (feat.size, feat.size, 2, 6) 49 | #feat_labels = tf.zeros(shape, dtype=tf.int64) 50 | feat_scores = tf.zeros(shape, dtype=dtype) 51 | 52 | feat_ymin = tf.zeros(shape, dtype=dtype) 53 | feat_xmin = tf.zeros(shape, dtype=dtype) 54 | feat_ymax = tf.ones(shape, dtype=dtype) 55 | feat_xmax = tf.ones(shape, dtype=dtype) 56 | 57 | def jaccard_with_anchors(bbox): 58 | """ 59 | Compute jaccard score between a box and the anchors. 60 | """ 61 | int_ymin = tf.maximum(ymin, bbox[0]) 62 | int_xmin = tf.maximum(xmin, bbox[1]) 63 | int_ymax = tf.minimum(ymax, bbox[2]) 64 | int_xmax = tf.minimum(xmax, bbox[3]) 65 | h = tf.maximum(int_ymax - int_ymin, 0.) 66 | w = tf.maximum(int_xmax - int_xmin, 0.) 67 | # Volumes.
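# (Worked example: an anchor (0, 0, 0.5, 0.5) against a ground-truth box
# (0.25, 0.25, 0.75, 0.75) intersects over [0.25, 0.5] x [0.25, 0.5], so
# inter_vol = 0.0625, union_vol = 0.25 + 0.25 - 0.0625 = 0.4375 and
# jaccard ~= 0.143, which clears the default matching_threshold of 0.1.)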
68 | inter_vol = h * w 69 | union_vol = vol_anchors - inter_vol \ 70 | + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1]) 71 | jaccard = tf.div(inter_vol, union_vol) 72 | return jaccard 73 | 74 | """ 75 | # never use in Textbox 76 | def intersection_with_anchors(bbox): 77 | ''' 78 | Compute intersection between score a box and the anchors. 79 | ''' 80 | int_ymin = tf.maximum(ymin, bbox[0]) 81 | int_xmin = tf.maximum(xmin, bbox[1]) 82 | int_ymax = tf.minimum(ymax, bbox[2]) 83 | int_xmax = tf.minimum(xmax, bbox[3]) 84 | h = tf.maximum(int_ymax - int_ymin, 0.) 85 | w = tf.maximum(int_xmax - int_xmin, 0.) 86 | inter_vol = h * w 87 | scores = tf.div(inter_vol, vol_anchors) 88 | return scores 89 | """ 90 | 91 | def condition(i, feat_scores, 92 | feat_ymin, feat_xmin, feat_ymax, feat_xmax): 93 | """Condition: check label index. 94 | """ 95 | r = tf.less(i, tf.shape(bboxes)[0]) 96 | return r 97 | 98 | def body(i, feat_scores,feat_ymin, feat_xmin, feat_ymax, feat_xmax): 99 | """Body: update feature labels, scores and bboxes. 100 | Follow the original SSD paper for that purpose: 101 | - assign values when jaccard > 0.5; 102 | - only update if beat the score of other bboxes. 103 | """ 104 | # Jaccard score. 105 | bbox = bboxes[i] 106 | jaccard = jaccard_with_anchors(bbox) 107 | # Mask: check threshold + scores + no annotations + num_classes. 108 | mask = tf.greater(jaccard, feat_scores) 109 | mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold)) 110 | #mask = tf.logical_and(mask, feat_scores > -0.5) 111 | #mask = tf.logical_and(mask, label < num_classes) 112 | imask = tf.cast(mask, tf.int64) 113 | fmask = tf.cast(mask, dtype) 114 | # Update values using mask. 115 | #feat_labels = imask * label + (1 - imask) * feat_labels 116 | feat_scores = tf.where(mask, jaccard, feat_scores) 117 | 118 | feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin 119 | feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin 120 | feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax 121 | feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax 122 | 123 | # Check no annotation label: ignore these anchors... 124 | #interscts = intersection_with_anchors(bbox) 125 | #mask = tf.logical_and(interscts > ignore_threshold, 126 | # label == no_annotation_label) 127 | # Replace scores by -1. 128 | #feat_scores = tf.where(mask, -tf.cast(mask, dtype), feat_scores) 129 | 130 | return [i+1, feat_scores, 131 | feat_ymin, feat_xmin, feat_ymax, feat_xmax] 132 | # Main loop definition. 133 | 134 | i = 0 135 | [i,feat_scores, 136 | feat_ymin, feat_xmin, 137 | feat_ymax, feat_xmax] = tf.while_loop(condition, body, 138 | [i, feat_scores, 139 | feat_ymin, feat_xmin, 140 | feat_ymax, feat_xmax]) 141 | ''' 142 | for i, bbox in enumerate(tf.unpack(bboxes, axis=0)): 143 | [i,feat_scores,feat_ymin, 144 | feat_xmin, feat_ymax, feat_xmax] = body(i, feat_scores, 145 | feat_ymin, feat_xmin, 146 | feat_ymax, feat_xmax,bbox) 147 | ''' 148 | # Transform to center / size. 149 | feat_cy = (feat_ymax + feat_ymin) / 2. 150 | feat_cx = (feat_xmax + feat_xmin) / 2. 151 | feat_h = feat_ymax - feat_ymin 152 | feat_w = feat_xmax - feat_xmin 153 | # Encode features. 154 | feat_cy = (feat_cy - yref) / href / prior_scaling[0] 155 | feat_cx = (feat_cx - xref) / wref / prior_scaling[1] 156 | feat_h = tf.log(feat_h / href) / prior_scaling[2] 157 | feat_w = tf.log(feat_w / wref) / prior_scaling[3] 158 | # Use SSD ordering: x / y / w / h instead of ours. 
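# (Worked example with prior_scaling=[0.1, 0.1, 0.2, 0.2]: an anchor
# (yref=0.5, xref=0.5, href=0.2, wref=0.4) matched to a ground-truth box of
# center (0.52, 0.54) and size (0.22, 0.36) encodes to
# cy = 0.02/0.2/0.1 = 1.0, cx = 0.04/0.4/0.1 = 1.0,
# h = log(0.22/0.2)/0.2 ~= 0.477, w = log(0.36/0.4)/0.2 ~= -0.527.)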
159 | feat_localizations = tf.stack([feat_cx, feat_cy, feat_w, feat_h], axis=-1) 160 | return feat_localizations, feat_scores 161 | 162 | 163 | 164 | def tf_text_bboxes_encode(bboxes, 165 | anchors, 166 | matching_threshold=0.1, 167 | prior_scaling=[0.1, 0.1, 0.2, 0.2], 168 | dtype=tf.float32, 169 | scope='text_bboxes_encode'): 170 | """Encode groundtruth labels and bounding boxes using SSD net anchors. 171 | Encoding boxes for all feature layers. 172 | 173 | Arguments: 174 | bboxes: Nx4 Tensor(float) with bboxes relative coordinates; 175 | anchors: List of Numpy array with layer anchors; 176 | matching_threshold: Threshold for positive match with groundtruth bboxes; 177 | prior_scaling: Scaling of encoded coordinates. 178 | 179 | Return: 180 | (target_labels, target_localizations, target_scores): 181 | Each element is a list of target Tensors. 182 | """ 183 | 184 | with tf.name_scope('text_bboxes_encode'): 185 | target_labels = [] 186 | target_localizations = [] 187 | target_scores = [] 188 | for i, anchors_layer in enumerate(anchors): 189 | with tf.name_scope('bboxes_encode_block_%i' % i): 190 | t_loc, t_scores = \ 191 | tf_text_bboxes_encode_layer(bboxes, anchors_layer, 192 | matching_threshold, 193 | prior_scaling, dtype) 194 | target_localizations.append(t_loc) 195 | target_scores.append(t_scores) 196 | return target_localizations, target_scores 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /nets/txtbox_300.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | This framework is based on SSD_tensorlow(https://github.com/balancap/SSD-Tensorflow) 4 | Add descriptions 5 | """ 6 | 7 | import math 8 | from collections import namedtuple 9 | 10 | import numpy as np 11 | import tensorflow as tf 12 | 13 | import tf_extended as tfe 14 | from nets import custom_layers 15 | from nets import textbox_common 16 | 17 | slim = tf.contrib.slim 18 | 19 | # =========================================================================== # 20 | # Text class definition. 21 | # =========================================================================== # 22 | TextboxParams = namedtuple('TextboxParameters', 23 | ['img_shape', 24 | 'num_classes', 25 | 'feat_layers', 26 | 'feat_shapes', 27 | 'scale_range', 28 | 'anchor_ratios', 29 | 'normalizations', 30 | 'prior_scaling', 31 | 'step', 32 | 'scales' 33 | ]) 34 | 35 | class TextboxNet(object): 36 | """ 37 | Implementation of the Textbox 300 network. 38 | 39 | The default features layers with 300x300 image input are: 40 | conv4_3 ==> 38 x 38 41 | fc7 ==> 19 x 19 42 | conv6_2 ==> 10 x 10 43 | conv7_2 ==> 5 x 5 44 | conv8_2 ==> 3 x 3 45 | pool6 ==> 1 x 1 46 | The default image size used to train this network is 300x300. 47 | """ 48 | default_params = TextboxParams( 49 | img_shape=(300, 300), 50 | num_classes=2, 51 | feat_layers=['conv4', 'conv7', 'conv8', 'conv9', 'conv10', 'global'], 52 | feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)], 53 | scale_range=[0.20, 0.90], 54 | anchor_ratios=[1,2,3,5,7,10], 55 | normalizations=[20, -1, -1, -1, -1, -1], 56 | prior_scaling=[0.1, 0.1, 0.2, 0.2], 57 | step = 0.14 , 58 | scales = [0.2, 0.34, 0.48, 0.62, 0.76, 0.90] 59 | ) 60 | 61 | def __init__(self, params=None): 62 | """ 63 | Init the Textbox net with some parameters. Use the default ones 64 | if none provided. 
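    Note that the defaults are self-consistent: step = (0.90 - 0.20) / 5
    = 0.14 and scales = [0.20 + 0.14 * i for i in range(6)] reproduce the
    [0.2, 0.34, 0.48, 0.62, 0.76, 0.90] list above.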
65 | """ 66 | if isinstance(params, TextboxParams): 67 | self.params = params 68 | else: 69 | self.params = self.default_params 70 | #self.params.step = (scale_range[1] - scale_range[0])/ 5 71 | #self.params.scales = [scale_range[0] + i* self.params.step for i in range(6)] 72 | 73 | # ======================================================================= # 74 | def net(self, inputs, 75 | is_training=True, 76 | dropout_keep_prob=0.5, 77 | reuse=None, 78 | scope='text_box_300'): 79 | """ 80 | Text network definition. 81 | """ 82 | r = text_net(inputs, 83 | feat_layers=self.params.feat_layers, 84 | normalizations=self.params.normalizations, 85 | is_training=is_training, 86 | dropout_keep_prob=dropout_keep_prob, 87 | reuse=reuse, 88 | scope=scope) 89 | # Update feature shapes (try at least!) 90 | """ 91 | if update_feat_shapes: 92 | shapes = ssd_feat_shapes_from_net(r[0], self.params.feat_shapes) 93 | self.params = self.params._replace(feat_shapes=shapes) 94 | """ 95 | return r 96 | 97 | def arg_scope(self, weight_decay=0.0005, data_format='NHWC'): 98 | """Network arg_scope. 99 | """ 100 | return ssd_arg_scope(weight_decay, data_format=data_format) 101 | 102 | def arg_scope_caffe(self, caffe_scope): 103 | """Caffe arg_scope used for weights importing. 104 | """ 105 | return ssd_arg_scope_caffe(caffe_scope) 106 | 107 | # ======================================================================= # 108 | ''' 109 | def update_feature_shapes(self, predictions): 110 | """Update feature shapes from predictions collection (Tensor or Numpy 111 | array). 112 | """ 113 | shapes = ssd_feat_shapes_from_net(predictions, self.params.feat_shapes) 114 | self.params = self.params._replace(feat_shapes=shapes) 115 | ''' 116 | 117 | def anchors(self, img_shape, dtype=np.float32): 118 | """Compute the default anchor boxes, given an image shape. 119 | """ 120 | return textbox_achor_all_layers(img_shape, 121 | self.params.feat_shapes, 122 | self.params.anchor_ratios, 123 | self.params.scales, 124 | 0.5, 125 | dtype) 126 | 127 | def bboxes_encode(self, bboxes, anchors, 128 | scope='text_bboxes_encode'): 129 | """Encode labels and bounding boxes. 130 | """ 131 | return textbox_common.tf_text_bboxes_encode( 132 | bboxes, anchors, 133 | matching_threshold=0.1, 134 | prior_scaling=self.params.prior_scaling, 135 | scope=scope) 136 | 137 | def losses(self, logits, localisations, 138 | glocalisations, gscores, 139 | match_threshold=0.1, 140 | negative_ratio=3., 141 | alpha=1., 142 | label_smoothing=0., 143 | scope='ssd_losses'): 144 | """Define the SSD network losses. 145 | """ 146 | return ssd_losses(logits, localisations, 147 | glocalisations, gscores, 148 | match_threshold=match_threshold, 149 | negative_ratio=negative_ratio, 150 | alpha=alpha, 151 | label_smoothing=label_smoothing, 152 | scope=scope) 153 | 154 | 155 | 156 | def text_net(inputs, 157 | feat_layers=TextboxNet.default_params.feat_layers, 158 | normalizations=TextboxNet.default_params.normalizations, 159 | is_training=True, 160 | dropout_keep_prob=0.5, 161 | reuse=None, 162 | scope='text_box_300'): 163 | end_points = {} 164 | with tf.variable_scope(scope, 'text_box_300', [inputs], reuse=reuse): 165 | # Original VGG-16 blocks. 166 | net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1') 167 | end_points['conv1'] = net 168 | net = slim.max_pool2d(net, [2, 2], scope='pool1') 169 | # Block 2. 
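# (For a 300x300 input each 2x2/2 max-pool halves the grid:
# 300 -> 150 (pool1) -> 75 (pool2) -> 38 (pool3) -> 19 (pool4), so the
# 'conv4' end point is captured at 38x38 and 'conv7' at 19x19, matching
# feat_shapes in the default parameters.)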
170 | net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2') 171 | end_points['conv2'] = net 172 | net = slim.max_pool2d(net, [2, 2], scope='pool2') 173 | # Block 3. 174 | net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3') 175 | end_points['conv3'] = net 176 | net = slim.max_pool2d(net, [2, 2], scope='pool3') 177 | # Block 4. 178 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4') 179 | end_points['conv4'] = net 180 | net = slim.max_pool2d(net, [2, 2], scope='pool4') 181 | # Block 5. 182 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5') 183 | end_points['conv5'] = net 184 | net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5') 185 | 186 | # Additional SSD blocks. 187 | # Block 6: dilated 3x3 convolution (rate=6). 188 | net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6') 189 | end_points['conv6'] = net 190 | # Block 7: 1x1 conv. 191 | net = slim.conv2d(net, 1024, [1, 1], scope='conv7') 192 | end_points['conv7'] = net 193 | 194 | # Block 8/9/10/11: 1x1 and 3x3 convolutions stride 2 (except the last ones). 195 | end_point = 'conv8' 196 | with tf.variable_scope(end_point): 197 | net = slim.conv2d(net, 256, [1, 1], scope='conv1x1') 198 | net = custom_layers.pad2d(net, pad=(1, 1)) 199 | net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3', padding='VALID') 200 | end_points[end_point] = net 201 | end_point = 'conv9' 202 | with tf.variable_scope(end_point): 203 | net = slim.conv2d(net, 128, [1, 1], scope='conv1x1') 204 | net = custom_layers.pad2d(net, pad=(1, 1)) 205 | net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3', padding='VALID') 206 | end_points[end_point] = net 207 | end_point = 'conv10' 208 | with tf.variable_scope(end_point): 209 | net = slim.conv2d(net, 128, [1, 1], scope='conv1x1') 210 | net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID') 211 | end_points[end_point] = net 212 | end_point = 'global' 213 | with tf.variable_scope(end_point): 214 | net = slim.conv2d(net, 128, [1, 1], scope='conv1x1') 215 | net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID') 216 | end_points[end_point] = net 217 | 218 | # Prediction and localisations layers. 219 | predictions = [] 220 | logits = [] 221 | localisations = [] 222 | for i, layer in enumerate(feat_layers): 223 | with tf.variable_scope(layer + '_box'): 224 | p, l = text_multibox_layer(layer, 225 | end_points[layer], 226 | normalizations[i]) 227 | #predictions.append(prediction_fn(p)) 228 | logits.append(p) 229 | localisations.append(l) 230 | 231 | return localisations, logits, end_points 232 | 233 | 234 | def text_multibox_layer(layer, 235 | inputs, 236 | normalization=-1): 237 | """ 238 | Construct a multibox layer, return a class and localization predictions. 239 | The main difference between textbox and ssd is the prediction shape: 240 | textbox has a prediction score shape of (38,38,2,6) 241 | and a location shape of (38,38,2,6,4). 242 | Besides, the kernel for the first 5 layers is 1*5 with padding (0,2); 243 | the kernel for the last layer is 1*1 with padding 0. 244 | """ 245 | net = inputs 246 | if normalization > 0: 247 | net = custom_layers.l2_normalization(net, scaling=True) 248 | # Number of anchors. 249 | num_anchors = 6 250 | num_classes = 2 251 | # Location.
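# (Sizing note: every feature-map cell predicts 2 vertical offsets x
# 6 aspect ratios = 12 default boxes, so the localization conv below emits
# 2*6*4 = 48 channels and the score conv 2*6*2 = 24 channels before the
# reshapes to (..., 2, 6, 4) and (..., 2, 6, 2).)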
252 | num_loc_pred = 2*num_anchors * 4 253 | if(layer == 'global'): 254 | loc_pred = slim.conv2d(net, num_loc_pred, [1, 1], activation_fn=None, padding = 'VALID', 255 | scope='conv_loc') 256 | else: 257 | loc_pred = slim.conv2d(net, num_loc_pred, [1, 5], activation_fn=None, padding = 'SAME', 258 | scope='conv_loc') 259 | #loc_pred = custom_layers.channel_to_last(loc_pred) 260 | loc_pred = tf.reshape(loc_pred, loc_pred.get_shape().as_list()[:-1] + [2,num_anchors,4]) 261 | # Class prediction. 262 | scores_pred = 2 * num_anchors * num_classes 263 | if(layer == 'global'): 264 | sco_pred = slim.conv2d(net, scores_pred, [1, 1], activation_fn=None, padding = 'VALID', 265 | scope='conv_cls') 266 | else: 267 | sco_pred = slim.conv2d(net, scores_pred, [1, 5], activation_fn=None, padding = 'SAME', 268 | scope='conv_cls') 269 | #cls_pred = custom_layers.channel_to_last(cls_pred) 270 | sco_pred = tf.reshape(sco_pred, sco_pred.get_shape().as_list()[:-1] + [2,num_anchors,num_classes]) 271 | return sco_pred, loc_pred 272 | 273 | 274 | 275 | ## produce anchor for one layer 276 | # each feature point has 12 default textboxes(6 boxes + 6 offsets boxes) 277 | # aspect ratios = (1,2,3,5,7,10) 278 | # feat_size : 279 | # conv4_3 ==> 38 x 38 280 | # fc7 ==> 19 x 19 281 | # conv6_2 ==> 10 x 10 282 | # conv7_2 ==> 5 x 5 283 | # conv8_2 ==> 3 x 3 284 | # pool6 ==> 1 x 1 285 | 286 | def textbox_anchor_one_layer(img_shape, 287 | feat_size, 288 | ratios, 289 | scale, 290 | offset = 0.5, 291 | dtype=np.float32): 292 | # Follow the papers scheme 293 | # 12 ahchor boxes with out sk' = sqrt(sk * sk+1) 294 | y, x = np.mgrid[0:feat_size[0], 0:feat_size[1]] + 0.5 295 | y = y.astype(dtype) / feat_size[0] 296 | x = x.astype(dtype) / feat_size[1] 297 | x_offset = x 298 | y_offset = y + offset 299 | x_out = np.stack((x, x_offset), -1) 300 | y_out = np.stack((y, y_offset), -1) 301 | y_out = np.expand_dims(y_out, axis=-1) 302 | x_out = np.expand_dims(x_out, axis=-1) 303 | 304 | 305 | # 306 | num_anchors = 6 307 | h = np.zeros((num_anchors, ), dtype=dtype) 308 | w = np.zeros((num_anchors, ), dtype=dtype) 309 | for i ,r in enumerate(ratios): 310 | h[i] = scale / math.sqrt(r) / feat_size[0] 311 | w[i] = scale * math.sqrt(r) / feat_size[1] 312 | return y_out, x_out, h, w 313 | 314 | 315 | 316 | ## produce anchor for all layers 317 | def textbox_achor_all_layers(img_shape, 318 | layers_shape, 319 | anchor_ratios, 320 | scales, 321 | offset=0.5, 322 | dtype=np.float32): 323 | """ 324 | Compute anchor boxes for all feature layers. 325 | """ 326 | layers_anchors = [] 327 | for i, s in enumerate(layers_shape): 328 | anchor_bboxes = textbox_anchor_one_layer(img_shape, s, 329 | anchor_ratios, 330 | scales[i], 331 | offset=offset, dtype=dtype) 332 | layers_anchors.append(anchor_bboxes) 333 | return layers_anchors 334 | 335 | def ssd_arg_scope(weight_decay=0.0005, data_format='NHWC'): 336 | """Defines the VGG arg scope. 337 | 338 | Args: 339 | weight_decay: The l2 regularization coefficient. 340 | 341 | Returns: 342 | An arg_scope. 
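    The scope nests three arg_scopes: L2 weight decay with Xavier
    initialization for conv/fc layers, 'SAME' padding for conv/pool, and a
    shared data_format for the custom pad2d, l2_normalization and
    channel_to_last layers, so callers wrap the network definition once.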
343 | """ 344 | with slim.arg_scope([slim.conv2d, slim.fully_connected], 345 | activation_fn=tf.nn.relu, 346 | weights_regularizer=slim.l2_regularizer(weight_decay), 347 | weights_initializer=tf.contrib.layers.xavier_initializer(), 348 | biases_initializer=tf.zeros_initializer()): 349 | with slim.arg_scope([slim.conv2d, slim.max_pool2d], 350 | padding='SAME', 351 | data_format=data_format): 352 | with slim.arg_scope([custom_layers.pad2d, 353 | custom_layers.l2_normalization, 354 | custom_layers.channel_to_last], 355 | data_format=data_format) as sc: 356 | return sc 357 | 358 | 359 | # =========================================================================== # 360 | # Caffe scope: importing weights at initialization. 361 | # =========================================================================== # 362 | def ssd_arg_scope_caffe(caffe_scope): 363 | """Caffe scope definition. 364 | 365 | Args: 366 | caffe_scope: Caffe scope object with loaded weights. 367 | 368 | Returns: 369 | An arg_scope. 370 | """ 371 | # Default network arg scope. 372 | with slim.arg_scope([slim.conv2d], 373 | activation_fn=tf.nn.relu, 374 | weights_initializer=caffe_scope.conv_weights_init(), 375 | biases_initializer=caffe_scope.conv_biases_init()): 376 | with slim.arg_scope([slim.fully_connected], 377 | activation_fn=tf.nn.relu): 378 | with slim.arg_scope([custom_layers.l2_normalization], 379 | scale_initializer=caffe_scope.l2_norm_scale_init()): 380 | with slim.arg_scope([slim.conv2d, slim.max_pool2d], 381 | padding='SAME') as sc: 382 | return sc 383 | 384 | 385 | # =========================================================================== # 386 | # Text loss function. 387 | # =========================================================================== # 388 | def ssd_losses(logits, localisations, 389 | glocalisations, gscores, 390 | match_threshold=0.1, 391 | negative_ratio=3., 392 | alpha=1., 393 | label_smoothing=0., 394 | scope=None): 395 | """Loss functions for training the text box network. 396 | 397 | 398 | Arguments: 399 | logits: (list of) predictions logits Tensors; 400 | localisations: (list of) localisations Tensors; 401 | glocalisations: (list of) groundtruth localisations Tensors; 402 | gscores: (list of) groundtruth score Tensors; 403 | """ 404 | with tf.name_scope(scope, 'text_loss'): 405 | l_cross_pos = [] 406 | l_cross_neg = [] 407 | l_loc = [] 408 | for i in range(len(logits)): 409 | dtype = logits[i].dtype 410 | with tf.name_scope('block_%i' % i): 411 | # Determine weights Tensor. 412 | pmask = gscores[i] > match_threshold 413 | ipmask = tf.cast(pmask, tf.int32) 414 | fpmask = tf.cast(pmask, dtype) 415 | n_positives = tf.reduce_sum(fpmask) 416 | 417 | # Negative mask 418 | # Number of negative entries to select. 419 | n_neg = tf.cast(negative_ratio * n_positives, tf.int32) 420 | 421 | nvalues = tf.where(tf.cast(1-ipmask,tf.bool), gscores[i], np.zeros(gscores[i].shape)) 422 | nvalues_flat = tf.reshape(nvalues, [-1]) 423 | val, idxes = tf.nn.top_k(nvalues_flat, k=n_neg) 424 | minval = val[-1] 425 | # Final negative mask. 426 | nmask = nvalues > minval 427 | fnmask = tf.cast(nmask, dtype) 428 | inmask = tf.cast(nmask, tf.int32) 429 | # Add cross-entropy loss. 
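# (Hard negative mining, illustrative numbers: with negative_ratio=3, a
# batch yielding n_positives = 100 matched anchors keeps only the
# n_neg = 300 highest-scoring negatives selected by tf.nn.top_k above;
# everything below minval is masked out of the negative loss.)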
430 | with tf.name_scope('cross_entropy_pos'): 431 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[i], 432 | labels=ipmask) 433 | loss = tf.losses.compute_weighted_loss(loss, fpmask) 434 | l_cross_pos.append(loss) 435 | 436 | with tf.name_scope('cross_entropy_neg'): 437 | loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits[i], 438 | labels=inmask) 439 | loss = tf.losses.compute_weighted_loss(loss, fnmask) 440 | l_cross_neg.append(loss) 441 | 442 | # Add localization loss: smooth L1, L2, ... 443 | with tf.name_scope('localization'): 444 | # Weights Tensor: positive mask + random negative. 445 | weights = tf.expand_dims(alpha * fpmask, axis=-1) 446 | loss = custom_layers.abs_smooth(localisations[i] - glocalisations[i]) 447 | loss = tf.losses.compute_weighted_loss(loss, weights) 448 | l_loc.append(loss) 449 | 450 | # Additional total losses... 451 | with tf.name_scope('total'): 452 | total_cross_pos = tf.add_n(l_cross_pos, 'cross_entropy_pos') 453 | total_cross_neg = tf.add_n(l_cross_neg, 'cross_entropy_neg') 454 | total_cross = tf.add(total_cross_pos, total_cross_neg, 'cross_entropy') 455 | total_loc = tf.add_n(l_loc, 'localization') 456 | 457 | # Add to EXTRA LOSSES TF.collection 458 | tf.add_to_collection('EXTRA_LOSSES', total_cross_pos) 459 | tf.add_to_collection('EXTRA_LOSSES', total_cross_neg) 460 | tf.add_to_collection('EXTRA_LOSSES', total_cross) 461 | tf.add_to_collection('EXTRA_LOSSES', total_loc) 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | -------------------------------------------------------------------------------- /processing/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /processing/image_processing.py: -------------------------------------------------------------------------------- 1 | 2 | import tensorflow as tf 3 | import tf_extended as tfe 4 | import os 5 | import matplotlib.pyplot as plt 6 | import skimage.io as skio 7 | import cv2 8 | import numpy as np 9 | 10 | 11 | def image_processing(image, bbox,labels, text_shape,train = True): 12 | Height = text_shape[0] 13 | Width = text_shape[1] 14 | image = tf.image.convert_image_dtype(image, dtype=tf.float32) 15 | if train: 16 | image,labels,bbox = distorted_image(image, Height,labels,Width,bbox) 17 | else: 18 | image = eval_image(image, Height, Width) 19 | 20 | return image, labels, bbox 21 | 22 | def distorted_image(image, height,labels,width,bbox,scope = None): 23 | # Each bounding box has shape [1, num_boxes, box coords] and 24 | # the coordinates are ordered [ymin, xmin, ymax, xmax]. 25 | 26 | # Display the bounding box in the first thread only. 27 | with tf.name_scope(scope, 'distorted_bounding_box_crop', 28 | [image, bbox,height,width]): 29 | 30 | bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box( 31 | tf.shape(image), 32 | bounding_boxes=bbox, 33 | min_object_covered=0.1, 34 | aspect_ratio_range=(0.9,1.1), 35 | area_range=(0.1,1.0), 36 | max_attempts=200, 37 | use_image_if_no_bounding_boxes=True) 38 | 39 | distort_bbox = distort_bbox[0, 0] 40 | 41 | # Crop the image to the specified bounding box. 42 | cropped_image = tf.slice(image, bbox_begin, bbox_size) 43 | # Restore the shape since the dynamic slice loses 3rd dimension. 
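# (tf.slice on the sampled crop loses the static shape; the bilinear
# resize below restores a fixed height x width and set_shape then pins
# [height, width, 3] so the images can be batched.)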
44 | 45 | distorted_image = tf.image.resize_images(cropped_image, [height, width], 46 | method=tf.image.ResizeMethod.BILINEAR) 47 | distorted_image.set_shape([height, width, 3]) 48 | 49 | distorted_image = tf.image.random_flip_left_right(distorted_image) 50 | # Randomly distort the colors. 51 | distorted_image = distort_color(distorted_image) 52 | 53 | 54 | bboxes = tfe.bboxes_resize(distort_bbox, bbox) 55 | print "labels: %s " % (labels) 56 | label, bboxes = tfe.bboxes_filter_overlap(labels, bboxes,threshold = 0.4) 57 | print "bboxes: %s " % (bboxes) 58 | return distorted_image, label, bboxes 59 | 60 | 61 | 62 | def eval_image(image, height, width, scope=None): 63 | """Prepare one image for evaluation. 64 | 65 | Args: 66 | image: 3-D float Tensor 67 | height: integer 68 | width: integer 69 | scope: Optional scope for op_scope. 70 | Returns: 71 | 3-D float Tensor of prepared image. 72 | """ 73 | with tf.name_scope(scope, 'eval_image',[image, height, width]): 74 | # Crop the central region of the image with an area containing 87.5% of 75 | # the original image. 76 | image = tf.image.central_crop(image, central_fraction=0.875) 77 | 78 | # Resize the image to the original height and width. 79 | image = tf.expand_dims(image, 0) 80 | image = tf.image.resize_bilinear(image, [height, width], 81 | align_corners=False) 82 | image = tf.squeeze(image, [0]) 83 | return image 84 | 85 | 86 | def distort_color(image, scope=None): 87 | """Distort the color of the image. 88 | 89 | Each color distortion is non-commutative and thus ordering of the color ops 90 | matters. Ideally we would randomly permute the ordering of the color ops. 91 | Rather then adding that level of complication, we select a distinct ordering 92 | of color ops for each preprocessing thread. 93 | 94 | Args: 95 | image: Tensor containing single image. 96 | thread_id: preprocessing thread ID. 97 | scope: Optional scope for op_scope. 98 | Returns: 99 | color-distorted image 100 | """ 101 | color_ordering = np.random.randint(2) 102 | if color_ordering == 0: 103 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 104 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 105 | image = tf.image.random_hue(image, max_delta=0.2) 106 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 107 | elif color_ordering == 1: 108 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 109 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 110 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 111 | image = tf.image.random_hue(image, max_delta=0.2) 112 | 113 | # The random_* ops do not necessarily clamp. 114 | image = tf.clip_by_value(image, 0.0, 1.0) 115 | return image 116 | -------------------------------------------------------------------------------- /processing/image_processing2.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file pre-process images from a datasets and 3 | output batch iamges and labels(bboxes) 4 | 5 | parse examples from tfrecord 6 | 1.parse_example 7 | 8 | Pre-processing images : 9 | 1. crop and pad images randomly 10 | 2. crop and pad bbox 11 | 3. 
Transform images and bboxes to input/output vectors 12 | 13 | """ 14 | 15 | import tensorflow as tf 16 | import tf_extended as tfe 17 | import os 18 | import matplotlib.pyplot as plt 19 | import skimage.io as skio 20 | import cv2 21 | 22 | 23 | FLAGS = tf.app.flags.FLAGS 24 | 25 | tf.app.flags.DEFINE_integer('batch_size', 1, 26 | """Number of images to process in a batch.""") 27 | tf.app.flags.DEFINE_integer('Height', 300, 28 | """Provide square images of this size.""") 29 | tf.app.flags.DEFINE_integer('Width', 300, 30 | """Provide square images of this size.""") 31 | tf.app.flags.DEFINE_integer('num_preprocess_threads', 4, 32 | """Number of preprocessing threads per tower. """ 33 | """Please make this a multiple of 4.""") 34 | tf.app.flags.DEFINE_integer('num_readers', 1, 35 | """Number of parallel readers during train.""") 36 | 37 | # Images are preprocessed asynchronously using multiple threads specified by 38 | # --num_preprocss_threads and the resulting processed images are stored in a 39 | # random shuffling queue. The shuffling queue dequeues --batch_size images 40 | # for processing on a given Inception tower. A larger shuffling queue guarantees 41 | # better mixing across examples within a batch and results in slightly higher 42 | # predictive performance in a trained model. Empirically, 43 | # --input_queue_memory_factor=16 works well. A value of 16 implies a queue size 44 | # of 1024*16 images. Assuming RGB 299x299 images, this implies a queue size of 45 | # 16GB. If the machine is memory limited, then decrease this factor to 46 | # decrease the CPU memory footprint, accordingly. 47 | 48 | tf.app.flags.DEFINE_integer('input_queue_memory_factor', 1, 49 | """Size of the queue of preprocessed images. """ 50 | """Default is ideal but try smaller values, e.g. """ 51 | """4, 2 or 1, if host memory is constrained. See """ 52 | """comments in code for more details.""") 53 | 54 | 55 | def distorted_inputs(data_files, batch_size=None, num_preprocess_threads=None): 56 | """Generate batches of distorted versions of ImageNet images. 57 | 58 | Use this function as the inputs for training a network. 59 | 60 | Distorting images provides a useful technique for augmenting the data 61 | set during training in order to make the network invariant to aspects 62 | of the image that do not effect the label. 63 | 64 | Args: 65 | dataset: instance of Dataset class specifying the dataset. 66 | batch_size: integer, number of examples in batch 67 | num_preprocess_threads: integer, total number of preprocessing threads but 68 | None defaults to FLAGS.num_preprocess_threads. 69 | 70 | Returns: 71 | images: Images. 4D tensor of size [batch_size, FLAGS.image_size, 72 | FLAGS.image_size, 3]. 73 | labels: 1-D integer Tensor of [batch_size]. 74 | """ 75 | if not batch_size: 76 | batch_size = FLAGS.batch_size 77 | 78 | # Force all input processing onto CPU in order to reserve the GPU for 79 | # the forward inference and back-propagation. 
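# (Keeping decode, distortion and queueing on /cpu:0 means only the final
# dequeued batches are copied to the accelerator; a multi-tower trainer
# would share these batches rather than rebuild the pipeline per GPU.)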
80 | with tf.device('/cpu:0'): 81 | images, box, name = batch_inputs( 82 | data_files, batch_size, train=True, 83 | num_preprocess_threads=num_preprocess_threads, 84 | num_readers=FLAGS.num_readers) 85 | return images, box, name 86 | 87 | def parse_example(example_serialized): 88 | """ 89 | One example proto contains the following fields: 90 | 'image/height': int64_feature(shape[0]), 91 | 'image/width': int64_feature(shape[1]), 92 | 'image/channels': int64_feature(shape[2]), 93 | 'image/shape': int64_feature(shape), 94 | 'image/object/bbox/xmin': float_feature(xmin), 95 | 'image/object/bbox/xmax': float_feature(xmax), 96 | 'image/object/bbox/ymin': float_feature(ymin), 97 | 'image/object/bbox/ymax': float_feature(ymax), 98 | 'image/object/bbox/label': int64_feature(label), 99 | 'image/format': bytes_feature('jpeg'), 100 | 'image/encoded': bytes_feature(image_data.tostring()), 101 | 102 | Input: example_serialized 103 | 104 | Output: 105 | Image_buffer, label, bboxes, name 106 | """ 107 | feature_map = { 108 | 'image/height': tf.FixedLenFeature([1], tf.int64), 109 | 'image/width': tf.FixedLenFeature([1], tf.int64), 110 | 'image/channels': tf.FixedLenFeature([1], tf.int64), 111 | 'image/shape': tf.FixedLenFeature([3], tf.int64), 112 | 'image/object/bbox/xmin': tf.VarLenFeature(dtype=tf.float32), 113 | 'image/object/bbox/ymin': tf.VarLenFeature(dtype=tf.float32), 114 | 'image/object/bbox/xmax': tf.VarLenFeature(dtype=tf.float32), 115 | 'image/object/bbox/ymax': tf.VarLenFeature(dtype=tf.float32), 116 | 'image/object/bbox/label': tf.VarLenFeature(dtype=tf.int64), 117 | 'image/format': tf.FixedLenFeature([], tf.string, default_value='jpeg'), 118 | 'image/encoded': tf.FixedLenFeature([], tf.string, default_value=''), 119 | 'image/name': tf.VarLenFeature(dtype = tf.string), 120 | } 121 | features = tf.parse_single_example(example_serialized, feature_map) 122 | #image = tf.decode_raw(features['image/encoded'], tf.uint8) 123 | xmin = tf.expand_dims(features['image/object/bbox/xmin'].values, 0) 124 | ymin = tf.expand_dims(features['image/object/bbox/ymin'].values, 0) 125 | xmax = tf.expand_dims(features['image/object/bbox/xmax'].values, 0) 126 | ymax = tf.expand_dims(features['image/object/bbox/ymax'].values, 0) 127 | bboxes = tf.concat([ymin, xmin, ymax, xmax],0) 128 | bboxes = tf.expand_dims(bboxes,0) 129 | bboxes = tf.transpose(bboxes, [0,2,1]) 130 | Image_buffer = features['image/encoded'] 131 | label = tf.expand_dims(features['image/object/bbox/label'].values, 0) 132 | height = tf.cast(features['image/height'], dtype=tf.int64) 133 | width = tf.cast(features['image/width'], dtype=tf.int64) 134 | name = tf.cast(features['image/name'], dtype = tf.string) 135 | print "name %s" % (name) 136 | return Image_buffer, label, bboxes, name 137 | 138 | 139 | 140 | def image_processing(image_buffer, bbox,labels, train,thread_id = 0): 141 | image = decode_jpeg(image_buffer) 142 | Height = FLAGS.Height 143 | Width = FLAGS.Width 144 | 145 | if train: 146 | image,labels,bbox = distorted_image(image, Height,labels,Width,bbox,thread_id) 147 | else: 148 | image = eval_image(image, Height, Width) 149 | 150 | return image, labels, bbox 151 | 152 | def distorted_image(image, height,labels,width,bbox,thread_id,scope = None): 153 | # Each bounding box has shape [1, num_boxes, box coords] and 154 | # the coordinates are ordered [ymin, xmin, ymax, xmax]. 155 | 156 | # Display the bounding box in the first thread only.
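# (Only thread_id == 0 emits the image summaries below, so TensorBoard
# shows a single annotated example instead of num_preprocess_threads
# duplicates; all threads still run identical distortion ops.)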
157 | with tf.name_scope(scope, 'distorted_bounding_box_crop', 158 | [image, bbox,height,width]): 159 | if not thread_id: 160 | image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0), 161 | bbox) 162 | tf.summary.image('image_with_bounding_boxes', image_with_box) 163 | 164 | 165 | bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box( 166 | tf.shape(image), 167 | bounding_boxes=bbox, 168 | min_object_covered=0.1, 169 | aspect_ratio_range=(0.9,1.1), 170 | area_range=(0.1,1.0), 171 | max_attempts=200, 172 | use_image_if_no_bounding_boxes=True) 173 | 174 | if not thread_id: 175 | image_with_distorted_box = tf.image.draw_bounding_boxes( 176 | tf.expand_dims(image, 0), distort_bbox) 177 | tf.summary.image('images_with_distorted_bounding_box', 178 | image_with_distorted_box) 179 | 180 | distort_bbox = distort_bbox[0, 0] 181 | 182 | # Crop the image to the specified bounding box. 183 | cropped_image = tf.slice(image, bbox_begin, bbox_size) 184 | # Restore the shape since the dynamic slice loses 3rd dimension. 185 | 186 | distorted_image = tf.image.resize_images(cropped_image, [height, width], 187 | method=tf.image.ResizeMethod.BILINEAR) 188 | distorted_image.set_shape([height, width, 3]) 189 | if not thread_id: 190 | tf.summary.image('cropped_resized_image', 191 | tf.expand_dims(distorted_image, 0)) 192 | distorted_image = tf.image.random_flip_left_right(distorted_image) 193 | # Randomly distort the colors. 194 | distorted_image = distort_color(distorted_image, thread_id) 195 | 196 | if not thread_id: 197 | tf.summary.image('final_distorted_image', 198 | tf.expand_dims(distorted_image, 0)) 199 | # Update bounding boxes: resize and filter out. 200 | 201 | bboxes = tfe.bboxes_resize(distort_bbox, bbox) 202 | print "labels: %s " % (labels) 203 | label, bboxes = tfe.bboxes_filter_overlap(labels, bboxes,threshold = 0.4) 204 | 205 | return distorted_image, label, bboxes 206 | 207 | 208 | 209 | def decode_jpeg(image_buffer, scope=None): 210 | """Decode a JPEG string into one 3-D float image Tensor. 211 | 212 | Args: 213 | image_buffer: scalar string Tensor. 214 | scope: Optional scope for op_scope. 215 | Returns: 216 | 3-D float Tensor with values ranging from [0, 1). 217 | """ 218 | with tf.name_scope(scope, 'decode_jpeg',[image_buffer]): 219 | # Decode the string as an RGB JPEG. 220 | # Note that the resulting image contains an unknown height and width 221 | # that is set dynamically by decode_jpeg. In other words, the height 222 | # and width of image is unknown at compile-time. 223 | image = tf.image.decode_jpeg(image_buffer, channels=3) 224 | # After this point, all image pixels reside in [0,1) 225 | # until the very end, when they're rescaled to (-1, 1). The various 226 | # adjust_* ops all require this range for dtype float. 227 | image = tf.image.convert_image_dtype(image, dtype=tf.float32) 228 | print 'image after decode %s' % (image) 229 | return image 230 | 231 | 232 | def eval_image(image, height, width, scope=None): 233 | """Prepare one image for evaluation. 234 | 235 | Args: 236 | image: 3-D float Tensor 237 | height: integer 238 | width: integer 239 | scope: Optional scope for op_scope. 240 | Returns: 241 | 3-D float Tensor of prepared image. 242 | """ 243 | with tf.name_scope(scope, 'eval_image',[image, height, width]): 244 | # Crop the central region of the image with an area containing 87.5% of 245 | # the original image. 
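# (central_fraction=0.875 keeps 87.5% of each side, e.g. a 400x400 input
# becomes 350x350 before the bilinear resize back to (height, width).)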
246 | image = tf.image.central_crop(image, central_fraction=0.875) 247 | 248 | # Resize the image to the original height and width. 249 | image = tf.expand_dims(image, 0) 250 | image = tf.image.resize_bilinear(image, [height, width], 251 | align_corners=False) 252 | image = tf.squeeze(image, [0]) 253 | return image 254 | 255 | 256 | def distort_color(image, thread_id=0, scope=None): 257 | """Distort the color of the image. 258 | 259 | Each color distortion is non-commutative and thus ordering of the color ops 260 | matters. Ideally we would randomly permute the ordering of the color ops. 261 | Rather then adding that level of complication, we select a distinct ordering 262 | of color ops for each preprocessing thread. 263 | 264 | Args: 265 | image: Tensor containing single image. 266 | thread_id: preprocessing thread ID. 267 | scope: Optional scope for op_scope. 268 | Returns: 269 | color-distorted image 270 | """ 271 | with tf.name_scope( scope, 'distort_color',[image]): 272 | color_ordering = thread_id % 2 273 | 274 | if color_ordering == 0: 275 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 276 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 277 | image = tf.image.random_hue(image, max_delta=0.2) 278 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 279 | elif color_ordering == 1: 280 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 281 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 282 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 283 | image = tf.image.random_hue(image, max_delta=0.2) 284 | 285 | # The random_* ops do not necessarily clamp. 286 | image = tf.clip_by_value(image, 0.0, 1.0) 287 | return image 288 | 289 | 290 | def batch_inputs(data_files, batch_size, train, num_preprocess_threads=None,num_readers=4): 291 | 292 | """Contruct batches of training or evaluation examples from the image dataset. 293 | Args: 294 | dataset: instance of Dataset class specifying the dataset. 295 | See dataset.py for details. 296 | batch_size: integer 297 | train: boolean 298 | num_preprocess_threads: integer, total number of preprocessing threads 299 | num_readers: integer, number of parallel readers 300 | 301 | Returns: 302 | images: 4-D float Tensor of a batch of images 303 | labels: 1-D integer Tensor of [batch_size]. 304 | 305 | Raises: 306 | ValueError: if data is not found 307 | """ 308 | 309 | #print 1 310 | with tf.name_scope('batch_processing'): 311 | if data_files is None: 312 | raise ValueError('No data files found for this dataset') 313 | 314 | # Create filename_queue 315 | if train: 316 | filename_queue = tf.train.string_input_producer(data_files,num_epochs = 2, 317 | shuffle=True, 318 | capacity=16) 319 | else: 320 | filename_queue = tf.train.string_input_producer(data_files, num_epochs = 2, 321 | shuffle=False, 322 | capacity=1) 323 | if num_preprocess_threads is None: 324 | num_preprocess_threads = FLAGS.num_preprocess_threads 325 | 326 | if num_preprocess_threads % 4: 327 | raise ValueError('Please make num_preprocess_threads a multiple ' 328 | 'of 4 (%d % 4 != 0).', num_preprocess_threads) 329 | 330 | if num_readers is None: 331 | num_readers = FLAGS.num_readers 332 | 333 | if num_readers < 1: 334 | raise ValueError('Please make num_readers at least 1') 335 | 336 | # Approximate number of examples per shard. 
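# (Queue sizing with the defaults above: examples_per_shard = 512 and
# --input_queue_memory_factor=1 give min_queue_examples = 512, so the
# training RandomShuffleQueue holds 512 + 3 * batch_size serialized
# examples; raise the factor for better shuffling at more host memory.)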
337 | 338 | examples_per_shard = 512 339 | 340 | # Size the random shuffle queue to balance between good global 341 | # mixing (more examples) and memory use (fewer examples). 342 | # 1 image uses 299*299*3*4 bytes = 1MB 343 | # The default input_queue_memory_factor is 16 implying a shuffling queue 344 | # size: examples_per_shard * 16 * 1MB = 17.6GB 345 | 346 | min_queue_examples = examples_per_shard * FLAGS.input_queue_memory_factor 347 | if train: 348 | examples_queue = tf.RandomShuffleQueue( 349 | capacity=min_queue_examples + 3 * batch_size, 350 | min_after_dequeue=min_queue_examples, 351 | dtypes=[tf.string]) 352 | else: 353 | examples_queue = tf.FIFOQueue( 354 | capacity=examples_per_shard + 3 * batch_size, 355 | dtypes=[tf.string]) 356 | 357 | # Create multiple readers to populate the queue of examples. 358 | if num_readers > 1: 359 | enqueue_ops = [] 360 | for _ in range(num_readers): 361 | reader = tf.TFRecordReader() 362 | _, value = reader.read(filename_queue) 363 | enqueue_ops.append(examples_queue.enqueue([value])) 364 | 365 | tf.train.queue_runner.add_queue_runner( 366 | tf.train.queue_runner.QueueRunner(examples_queue, enqueue_ops)) 367 | example_serialized = examples_queue.dequeue() 368 | else: 369 | reader = tf.TFRecordReader() 370 | _, example_serialized = reader.read(filename_queue) 371 | 372 | images_and_labels = [] 373 | for thread_id in range(num_preprocess_threads): 374 | # Parse a serialized Example proto to extract the image and metadata. 375 | image_buffer, label_index, bbox, name= parse_example(example_serialized) 376 | image,labels,bbox = image_processing(image_buffer, bbox,label_index, 377 | train, thread_id) 378 | 379 | images_and_labels.append([image, bbox[1,:],name]) 380 | 381 | images ,box,names= tf.train.batch_join( 382 | images_and_labels, 383 | batch_size=batch_size, 384 | capacity=2 * num_preprocess_threads * batch_size) 385 | print 'box shape %s' % (box.shape) 386 | 387 | # Reshape images into these desired dimensions. 388 | 389 | print 'image batch phase %s' % (images) 390 | height = FLAGS.Height 391 | width = FLAGS.Width 392 | depth = 3 393 | 394 | #images = tf.cast(images, tf.float32) 395 | #images = tf.reshape(images, shape=[batch_size, height, width, depth]) 396 | 397 | print 'image reshape %s' % (images) 398 | 399 | # Display the training images in the visualizer. 
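# (images is already the batched [batch_size, 300, 300, 3] float tensor
# from batch_join; the commented-out reshape above is redundant because
# each per-thread image carried a static 300x300x3 shape.)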
400 | 
401 |         tf.summary.image('images', images)
402 | 
403 |         return images, box, names
404 | 
405 | 
406 | 
407 | def main(_):
408 |     data_dir = '/Users/xiaodiu/Documents/github/projecttextbox/TextBoxes-TensorFlow/data/sythtext/'
409 |     tf_record_pattern = os.path.join(data_dir, '*.tfrecord')
410 |     data_files = tf.gfile.Glob(tf_record_pattern)
411 |     print data_files
412 |     images, box, name = distorted_inputs(data_files)
413 |     print images.shape
414 | 
415 |     with tf.Session() as sess:
416 |         sess.run(tf.global_variables_initializer())
417 |         sess.run(tf.local_variables_initializer())
418 |         coord = tf.train.Coordinator()
419 |         threads = tf.train.start_queue_runners(coord=coord)
420 |         # Fetch image, box and name tensors in a single run() call so all
421 |         # three come from the same dequeued batch; separate run() calls each
422 |         # advance the input queue and would return mismatched examples.
423 |         img, boxb, name = sess.run([images, box, name])
424 |         print name
425 |         print img.shape
426 |         print img[0, :, :, :]
427 |         #skio.imshow(img[1,:,:,:])
428 |         image = img[0, :, :, :]
429 |         xmin = int(boxb[0, 1] * 300)
430 |         ymin = int(boxb[0, 0] * 300)
431 |         xmax = int(boxb[0, 3] * 300)
432 |         ymax = int(boxb[0, 2] * 300)
433 |         skio.imshow(cv2.rectangle(image, (xmin, ymin), (xmax, ymax), (0, 0, 0)))
434 |         skio.show()
435 |         skio.imshow(skio.imread(data_dir + name[0]))
436 |         skio.show()
437 |         coord.request_stop()
438 |         coord.join(threads)
439 | 
440 | if __name__ == '__main__':
441 |     tf.app.run()
442 | 
443 | 
444 | 
445 | 
446 | 
--------------------------------------------------------------------------------
/processing/ssd_vgg_preprocessing.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2015 Paul Balanca. All Rights Reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Pre-processing images for SSD-type networks.
16 | """
17 | from enum import Enum, IntEnum
18 | import numpy as np
19 | 
20 | import tensorflow as tf
21 | import tf_extended as tfe
22 | 
23 | from tensorflow.python.ops import control_flow_ops
24 | 
25 | from processing import tf_image
26 | 
27 | 
28 | slim = tf.contrib.slim
29 | 
30 | # Resizing strategies.
31 | Resize = IntEnum('Resize', ('NONE',             # Nothing!
32 |                             'CENTRAL_CROP',     # Crop (and pad if necessary).
33 |                             'PAD_AND_RESIZE',   # Pad, and resize to output shape.
34 |                             'WARP_RESIZE'))     # Warp resize.
35 | 
36 | # VGG mean parameters.
37 | _R_MEAN = 123.
38 | _G_MEAN = 117.
39 | _B_MEAN = 104.
40 | 
41 | # Some training pre-processing parameters.
42 | BBOX_CROP_OVERLAP = 0.4         # Minimum overlap to keep a bbox after cropping.
43 | CROP_RATIO_RANGE = (0.8, 1.2)   # Distortion ratio during cropping.
44 | EVAL_SIZE = (300, 300)
45 | 
46 | 
47 | def tf_image_whitened(image, means=[_R_MEAN, _G_MEAN, _B_MEAN]):
48 |     """Subtracts the given means from each image channel.
49 | 
50 |     Returns:
51 |         the centered image.
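 
    Example (a minimal sketch; any float HxWx3 image tensor works):
        img = tf.to_float(tf.ones([300, 300, 3]) * 128.)
        img = tf_image_whitened(img)     # channels are now centered on 0
        img = tf_image_unwhitened(img)   # recovers the original values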
52 | """ 53 | if image.get_shape().ndims != 3: 54 | raise ValueError('Input must be of size [height, width, C>0]') 55 | num_channels = image.get_shape().as_list()[-1] 56 | if len(means) != num_channels: 57 | raise ValueError('len(means) must match the number of channels') 58 | 59 | mean = tf.constant(means, dtype=image.dtype) 60 | image = image - mean 61 | return image 62 | 63 | 64 | def tf_image_unwhitened(image, means=[_R_MEAN, _G_MEAN, _B_MEAN], to_int=True): 65 | """Re-convert to original image distribution, and convert to int if 66 | necessary. 67 | 68 | Returns: 69 | Centered image. 70 | """ 71 | mean = tf.constant(means, dtype=image.dtype) 72 | image = image + mean 73 | if to_int: 74 | image = tf.cast(image, tf.int32) 75 | return image 76 | 77 | 78 | def np_image_unwhitened(image, means=[_R_MEAN, _G_MEAN, _B_MEAN], to_int=True): 79 | """Re-convert to original image distribution, and convert to int if 80 | necessary. Numpy version. 81 | 82 | Returns: 83 | Centered image. 84 | """ 85 | img = np.copy(image) 86 | img += np.array(means, dtype=img.dtype) 87 | if to_int: 88 | img = img.astype(np.uint8) 89 | return img 90 | 91 | 92 | def tf_summary_image(image, bboxes, name='image', unwhitened=False): 93 | """Add image with bounding boxes to summary. 94 | """ 95 | if unwhitened: 96 | image = tf_image_unwhitened(image) 97 | image = tf.expand_dims(image, 0) 98 | bboxes = tf.expand_dims(bboxes, 0) 99 | image_with_box = tf.image.draw_bounding_boxes(image, bboxes) 100 | tf.summary.image(name, image_with_box) 101 | 102 | 103 | def apply_with_random_selector(x, func, num_cases): 104 | """Computes func(x, sel), with sel sampled from [0...num_cases-1]. 105 | 106 | Args: 107 | x: input Tensor. 108 | func: Python function to apply. 109 | num_cases: Python int32, number of cases to sample sel from. 110 | 111 | Returns: 112 | The result of func(x, sel), where func receives the value of the 113 | selector as a python integer, but sel is sampled dynamically. 114 | """ 115 | sel = tf.random_uniform([], maxval=num_cases, dtype=tf.int32) 116 | # Pass the real x only to one of the func calls. 117 | return control_flow_ops.merge([ 118 | func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case) 119 | for case in range(num_cases)])[0] 120 | 121 | 122 | def distort_color(image, color_ordering=0, fast_mode=True, scope=None): 123 | """Distort the color of a Tensor image. 124 | 125 | Each color distortion is non-commutative and thus ordering of the color ops 126 | matters. Ideally we would randomly permute the ordering of the color ops. 127 | Rather then adding that level of complication, we select a distinct ordering 128 | of color ops for each preprocessing thread. 129 | 130 | Args: 131 | image: 3-D Tensor containing single image in [0, 1]. 132 | color_ordering: Python int, a type of distortion (valid values: 0-3). 133 | fast_mode: Avoids slower ops (random_hue and random_contrast) 134 | scope: Optional scope for name_scope. 135 | Returns: 136 | 3-D Tensor color-distorted image on range [0, 1] 137 | Raises: 138 | ValueError: if color_ordering not in [0, 3] 139 | """ 140 | with tf.name_scope(scope, 'distort_color', [image]): 141 | if fast_mode: 142 | if color_ordering == 0: 143 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 144 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 145 | else: 146 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 147 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 
148 |         else:
149 |             if color_ordering == 0:
150 |                 image = tf.image.random_brightness(image, max_delta=32. / 255.)
151 |                 image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
152 |                 image = tf.image.random_hue(image, max_delta=0.2)
153 |                 image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
154 |             elif color_ordering == 1:
155 |                 image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
156 |                 image = tf.image.random_brightness(image, max_delta=32. / 255.)
157 |                 image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
158 |                 image = tf.image.random_hue(image, max_delta=0.2)
159 |             elif color_ordering == 2:
160 |                 image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
161 |                 image = tf.image.random_hue(image, max_delta=0.2)
162 |                 image = tf.image.random_brightness(image, max_delta=32. / 255.)
163 |                 image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
164 |             elif color_ordering == 3:
165 |                 image = tf.image.random_hue(image, max_delta=0.2)
166 |                 image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
167 |                 image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
168 |                 image = tf.image.random_brightness(image, max_delta=32. / 255.)
169 |             else:
170 |                 raise ValueError('color_ordering must be in [0, 3]')
171 |         # The random_* ops do not necessarily clamp.
172 |         return tf.clip_by_value(image, 0.0, 1.0)
173 | 
174 | 
175 | def distorted_bounding_box_crop(image,
176 |                                 labels,
177 |                                 bboxes,
178 |                                 min_object_covered=0.05,
179 |                                 aspect_ratio_range=(0.9, 1.1),
180 |                                 area_range=(0.1, 1.0),
181 |                                 max_attempts=200,
182 |                                 scope=None):
183 |     """Generates a cropped_image using one of the bboxes, randomly distorted.
184 | 
185 |     See `tf.image.sample_distorted_bounding_box` for more documentation.
186 | 
187 |     Args:
188 |         image: 3-D Tensor of image (it will be converted to floats in [0, 1]).
189 |         bboxes: Nx4 float Tensor of bounding boxes, where each coordinate is
190 |             in [0, 1) and the coordinates are arranged as
191 |             [ymin, xmin, ymax, xmax]. If N is 0 then the whole image is
192 |             used.
193 |         min_object_covered: An optional `float`. Defaults to `0.05`. The cropped
194 |             area of the image must contain at least this fraction of any bounding box
195 |             supplied.
196 |         aspect_ratio_range: An optional list of `floats`. The cropped area of the
197 |             image must have an aspect ratio = width / height within this range.
198 |         area_range: An optional list of `floats`. The cropped area of the image
199 |             must contain a fraction of the supplied image within this range.
200 |         max_attempts: An optional `int`. Number of attempts at generating a cropped
201 |             region of the image of the specified constraints. After `max_attempts`
202 |             failures, return the entire image.
203 |         scope: Optional scope for name_scope.
204 |     Returns:
205 |         A tuple, a 3-D Tensor cropped_image and the distorted bbox
206 |     """
207 |     with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bboxes]):
208 |         # Each bounding box has shape [1, num_boxes, box coords] and
209 |         # the coordinates are ordered [ymin, xmin, ymax, xmax].
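        # tf.image.sample_distorted_bounding_box returns (bbox_begin, bbox_size,
        # distort_bbox): the first two feed straight into tf.slice below, while
        # distort_bbox is a [1, 1, 4] Tensor holding the sampled crop window in
        # relative [ymin, xmin, ymax, xmax] coordinates; e.g. a crop of the
        # right half of the image comes back roughly as [[[0.0, 0.5, 1.0, 1.0]]].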
210 |         bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box(
211 |             tf.shape(image),
212 |             bounding_boxes=tf.expand_dims(bboxes, 0),
213 |             min_object_covered=min_object_covered,
214 |             aspect_ratio_range=aspect_ratio_range,
215 |             area_range=area_range,
216 |             max_attempts=max_attempts,
217 |             use_image_if_no_bounding_boxes=True)
218 |         distort_bbox = distort_bbox[0, 0]
219 | 
220 |         # Crop the image to the specified bounding box.
221 |         cropped_image = tf.slice(image, bbox_begin, bbox_size)
222 |         # Restore the shape since the dynamic slice loses 3rd dimension.
223 |         cropped_image.set_shape([None, None, 3])
224 | 
225 |         # Update bounding boxes: resize and filter out.
226 |         bboxes = tfe.bboxes_resize(distort_bbox, bboxes)
227 |         labels, bboxes = tfe.bboxes_filter_overlap(labels, bboxes,
228 |                                                    BBOX_CROP_OVERLAP)
229 |         return cropped_image, labels, bboxes, distort_bbox
230 | 
231 | 
232 | def preprocess_for_train(image, labels, bboxes,
233 |                          out_shape, data_format='NHWC',
234 |                          scope='ssd_preprocessing_train'):
235 |     """Preprocesses the given image for training.
236 | 
237 |     The image is randomly cropped around its bounding boxes, warp-resized to
238 |     `out_shape`, randomly flipped, and color-distorted.
239 | 
240 |     Args:
241 |         image: A `Tensor` representing an image of arbitrary size.
242 |         labels: 1-D Tensor of object labels matching `bboxes`.
243 |         bboxes: Nx4 Tensor of bounding boxes, in relative coordinates ordered
244 |             as [ymin, xmin, ymax, xmax].
245 |         out_shape: Output shape (height, width) after preprocessing.
246 |         data_format: 'NHWC' or 'NCHW'.
247 |         scope: Optional scope for name_scope.
248 | 
249 |     Returns:
250 |         A preprocessed image, with the matching labels and bboxes.
251 |     """
252 |     fast_mode = False
253 |     with tf.name_scope(scope, 'ssd_preprocessing_train', [image, labels, bboxes]):
254 |         if image.get_shape().ndims != 3:
255 |             raise ValueError('Input must be of size [height, width, C>0]')
256 |         # Convert to float scaled [0, 1].
257 |         if image.dtype != tf.float32:
258 |             image = tf.image.convert_image_dtype(image, dtype=tf.float32)
259 |         tf_summary_image(image, bboxes, 'image_with_bboxes')
260 | 
261 |         # # Remove DontCare labels.
262 |         # labels, bboxes = ssd_common.tf_bboxes_filter_labels(out_label,
263 |         #                                                     labels,
264 |         #                                                     bboxes)
265 | 
266 |         # Distort image and bounding boxes.
267 |         dst_image = image
268 |         dst_image, labels, bboxes, distort_bbox = \
269 |             distorted_bounding_box_crop(image, labels, bboxes,
270 |                                         aspect_ratio_range=CROP_RATIO_RANGE)
271 |         # Resize image to output size.
272 |         dst_image = tf_image.resize_image(dst_image, out_shape,
273 |                                           method=tf.image.ResizeMethod.BILINEAR,
274 |                                           align_corners=False)
275 |         tf_summary_image(dst_image, bboxes, 'image_shape_distorted')
276 | 
277 |         # Randomly flip the image horizontally.
278 |         dst_image, bboxes = tf_image.random_flip_left_right(dst_image, bboxes)
279 | 
280 |         # Randomly distort the colors. There are 4 ways to do it.
281 |         dst_image = apply_with_random_selector(
282 |             dst_image,
283 |             lambda x, ordering: distort_color(x, ordering, fast_mode),
284 |             num_cases=4)
285 |         tf_summary_image(dst_image, bboxes, 'image_color_distorted')
286 | 
287 |         # Rescale to VGG input scale.
288 |         image = dst_image * 255.
289 |         image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])
290 |         # Image data format.
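        # For a single 3-D image, NHWC -> NCHW is just moving the channel axis
        # to the front: e.g. a [300, 300, 3] tensor becomes [3, 300, 300] under
        # perm=(2, 0, 1).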
291 |         if data_format == 'NCHW':
292 |             image = tf.transpose(image, perm=(2, 0, 1))
293 |         return image, labels, bboxes
294 | 
295 | 
296 | def preprocess_for_eval(image, labels, bboxes,
297 |                         out_shape=EVAL_SIZE, data_format='NHWC',
298 |                         difficults=None, resize=Resize.WARP_RESIZE,
299 |                         scope='ssd_preprocessing_eval'):
300 |     """Preprocess an image for evaluation.
301 | 
302 |     Args:
303 |         image: A `Tensor` representing an image of arbitrary size.
304 |         out_shape: Output shape after pre-processing (if resize != None)
305 |         resize: Resize strategy.
306 | 
307 |     Returns:
308 |         A preprocessed image.
309 |     """
310 |     with tf.name_scope(scope):
311 |         if image.get_shape().ndims != 3:
312 |             raise ValueError('Input must be of size [height, width, C>0]')
313 | 
314 |         image = tf.to_float(image)
315 |         image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])
316 | 
317 |         # Add image rectangle to bboxes.
318 |         bbox_img = tf.constant([[0., 0., 1., 1.]])
319 |         if bboxes is None:
320 |             bboxes = bbox_img
321 |         else:
322 |             bboxes = tf.concat([bbox_img, bboxes], axis=0)
323 | 
324 |         if resize == Resize.NONE:
325 |             # No resizing...
326 |             pass
327 |         elif resize == Resize.CENTRAL_CROP:
328 |             # Central cropping of the image.
329 |             image, bboxes = tf_image.resize_image_bboxes_with_crop_or_pad(
330 |                 image, bboxes, out_shape[0], out_shape[1])
331 |         elif resize == Resize.PAD_AND_RESIZE:
332 |             # Resize image first: find the correct factor (float division, so the ratio is not floored to 0).
333 |             shape = tf.shape(image)
334 |             factor = tf.minimum(tf.to_double(1.0),
335 |                                 tf.minimum(tf.to_double(out_shape[0]) / tf.to_double(shape[0]),
336 |                                            tf.to_double(out_shape[1]) / tf.to_double(shape[1])))
337 |             resize_shape = factor * tf.to_double(shape[0:2])
338 |             resize_shape = tf.cast(tf.floor(resize_shape), tf.int32)
339 | 
340 |             image = tf_image.resize_image(image, resize_shape,
341 |                                           method=tf.image.ResizeMethod.BILINEAR,
342 |                                           align_corners=False)
343 |             # Pad to expected size.
344 |             image, bboxes = tf_image.resize_image_bboxes_with_crop_or_pad(
345 |                 image, bboxes, out_shape[0], out_shape[1])
346 |         elif resize == Resize.WARP_RESIZE:
347 |             # Warp resize of the image.
348 |             image = tf_image.resize_image(image, out_shape,
349 |                                           method=tf.image.ResizeMethod.BILINEAR,
350 |                                           align_corners=False)
351 | 
352 |         # Split back bounding boxes.
353 |         bbox_img = bboxes[0]
354 |         bboxes = bboxes[1:]
355 |         # Remove difficult boxes.
356 |         if difficults is not None:
357 |             mask = tf.logical_not(tf.cast(difficults, tf.bool))
358 |             labels = tf.boolean_mask(labels, mask)
359 |             bboxes = tf.boolean_mask(bboxes, mask)
360 |         # Image data format.
361 |         if data_format == 'NCHW':
362 |             image = tf.transpose(image, perm=(2, 0, 1))
363 |         return image, labels, bboxes, bbox_img
364 | 
365 | 
366 | def preprocess_image(image,
367 |                      labels,
368 |                      bboxes,
369 |                      out_shape,
370 |                      data_format,
371 |                      is_training=False,
372 |                      **kwargs):
373 |     """Pre-process a given image.
374 | 
375 |     Args:
376 |         image: A `Tensor` representing an image of arbitrary size.
377 |         output_height: The height of the image after preprocessing.
378 |         output_width: The width of the image after preprocessing.
379 |         is_training: `True` if we're preprocessing the image for training and
380 |             `False` otherwise.
381 |         resize_side_min: The lower bound for the smallest side of the image for
382 |             aspect-preserving resizing. If `is_training` is `False`, then this value
383 |             is used for rescaling.
384 |         resize_side_max: The upper bound for the smallest side of the image for
385 |             aspect-preserving resizing. If `is_training` is `False`, this value is
386 |             ignored.
Otherwise, the resize side is sampled from 387 | [resize_size_min, resize_size_max]. 388 | 389 | Returns: 390 | A preprocessed image. 391 | """ 392 | if is_training: 393 | return preprocess_for_train(image, labels, bboxes, 394 | out_shape=out_shape, 395 | data_format=data_format) 396 | else: 397 | return preprocess_for_eval(image, labels, bboxes, 398 | out_shape=out_shape, 399 | data_format=data_format, 400 | **kwargs) 401 | -------------------------------------------------------------------------------- /processing/test_processing.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script will test all functions and scripts in data pre-processing phase 3 | Test functions includes: 4 | image_processing. 5 | data <- tfrecord 6 | image_buffer, label_index, bbox, name <- parse_example 7 | image,labels,bbox <- image_processing 8 | image <- distorted_image 9 | """ 10 | 11 | import tensorflow as tf 12 | import matplotlib.pyplot as plt 13 | import numpy as np 14 | import skimage.io as skio 15 | 16 | import tf_extended as tfe 17 | from image_processing import * 18 | import cv2 19 | 20 | def visualize_bbox(image, bboxes): 21 | """ 22 | Input: image (height, width, channels) 23 | bboxes (numof bboxes, 4) in order(ymin, xmin, ymax, xmax) 24 | range(0,1) 25 | """ 26 | numofbox = bboxes.shape[0] 27 | width = image.shape[1] 28 | height = image.shape[0] 29 | def norm(x): 30 | if x < 0: 31 | x = 0 32 | else: 33 | if x > 1: 34 | x = 1 35 | return x 36 | xmin = [int(norm(i) * width) for i in bboxes[:,1]] 37 | ymin = [int(norm(i) * height) for i in bboxes[:,0]] 38 | ymax = [int(norm(i) * height) for i in bboxes[:,2]] 39 | xmax = [int(norm(i) * width) for i in bboxes[:,3]] 40 | 41 | for i in range(numofbox): 42 | image = cv2.rectangle(image,(xmin[i],ymin[i]), 43 | (xmax[i],ymax[i]),(0,0,0)) 44 | skio.imshow(image) 45 | skio.show() 46 | 47 | 48 | 49 | 50 | if __name__ == "__main__": 51 | data_dir = '/Users/xiaodiu/Documents/github/projecttextbox/TextBoxes-TensorFlow/data/sythtext/' 52 | file_name = data_dir + '1.tfrecord' 53 | ## test if file_name exists 54 | 55 | example = tf.python_io.tf_record_iterator(file_name).next() 56 | image_buffer, label, bboxes, name= parse_example(example) 57 | image,label,bboxes = image_processing(image_buffer, bboxes,label, 58 | train= True, thread_id = 0) 59 | 60 | with tf.Session() as sess: 61 | sess.run(tf.global_variables_initializer()) 62 | Image, label, bboxes = sess.run([image, label, bboxes]) 63 | print label.shape 64 | print bboxes 65 | #print name 66 | #print width 67 | #print height 68 | print Image.shape 69 | visualize_bbox(Image, bboxes) 70 | skio.imshow(Image) 71 | skio.show() 72 | 73 | -------------------------------------------------------------------------------- /processing/tf_image.py: -------------------------------------------------------------------------------- 1 | # Copyright 2015 The TensorFlow Authors and Paul Balanca. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Custom image operations. 16 | Most of the following methods extend TensorFlow image library, and part of 17 | the code is shameless copy-paste of the former! 18 | """ 19 | import tensorflow as tf 20 | 21 | from tensorflow.python.framework import constant_op 22 | from tensorflow.python.framework import dtypes 23 | from tensorflow.python.framework import ops 24 | from tensorflow.python.framework import tensor_shape 25 | from tensorflow.python.framework import tensor_util 26 | from tensorflow.python.ops import array_ops 27 | from tensorflow.python.ops import check_ops 28 | from tensorflow.python.ops import clip_ops 29 | from tensorflow.python.ops import control_flow_ops 30 | from tensorflow.python.ops import gen_image_ops 31 | from tensorflow.python.ops import gen_nn_ops 32 | from tensorflow.python.ops import string_ops 33 | from tensorflow.python.ops import math_ops 34 | from tensorflow.python.ops import random_ops 35 | from tensorflow.python.ops import variables 36 | 37 | 38 | # =========================================================================== # 39 | # Modification of TensorFlow image routines. 40 | # =========================================================================== # 41 | def _assert(cond, ex_type, msg): 42 | """A polymorphic assert, works with tensors and boolean expressions. 43 | If `cond` is not a tensor, behave like an ordinary assert statement, except 44 | that a empty list is returned. If `cond` is a tensor, return a list 45 | containing a single TensorFlow assert op. 46 | Args: 47 | cond: Something evaluates to a boolean value. May be a tensor. 48 | ex_type: The exception class to use. 49 | msg: The error message. 50 | Returns: 51 | A list, containing at most one assert op. 52 | """ 53 | if _is_tensor(cond): 54 | return [control_flow_ops.Assert(cond, [msg])] 55 | else: 56 | if not cond: 57 | raise ex_type(msg) 58 | else: 59 | return [] 60 | 61 | 62 | def _is_tensor(x): 63 | """Returns `True` if `x` is a symbolic tensor-like object. 64 | Args: 65 | x: A python object to check. 66 | Returns: 67 | `True` if `x` is a `tf.Tensor` or `tf.Variable`, otherwise `False`. 68 | """ 69 | return isinstance(x, (ops.Tensor, variables.Variable)) 70 | 71 | 72 | def _ImageDimensions(image): 73 | """Returns the dimensions of an image tensor. 74 | Args: 75 | image: A 3-D Tensor of shape `[height, width, channels]`. 76 | Returns: 77 | A list of `[height, width, channels]` corresponding to the dimensions of the 78 | input image. Dimensions that are statically known are python integers, 79 | otherwise they are integer scalar tensors. 80 | """ 81 | if image.get_shape().is_fully_defined(): 82 | return image.get_shape().as_list() 83 | else: 84 | static_shape = image.get_shape().with_rank(3).as_list() 85 | dynamic_shape = array_ops.unstack(array_ops.shape(image), 3) 86 | return [s if s is not None else d 87 | for s, d in zip(static_shape, dynamic_shape)] 88 | 89 | 90 | def _Check3DImage(image, require_static=True): 91 | """Assert that we are working with properly shaped image. 92 | Args: 93 | image: 3-D Tensor of shape [height, width, channels] 94 | require_static: If `True`, requires that all dimensions of `image` are 95 | known and non-zero. 96 | Raises: 97 | ValueError: if `image.shape` is not a 3-vector. 98 | Returns: 99 | An empty list, if `image` has fully defined dimensions. 
Otherwise, a list 100 | containing an assert op is returned. 101 | """ 102 | try: 103 | image_shape = image.get_shape().with_rank(3) 104 | except ValueError: 105 | raise ValueError("'image' must be three-dimensional.") 106 | if require_static and not image_shape.is_fully_defined(): 107 | raise ValueError("'image' must be fully defined.") 108 | if any(x == 0 for x in image_shape): 109 | raise ValueError("all dims of 'image.shape' must be > 0: %s" % 110 | image_shape) 111 | if not image_shape.is_fully_defined(): 112 | return [check_ops.assert_positive(array_ops.shape(image), 113 | ["all dims of 'image.shape' " 114 | "must be > 0."])] 115 | else: 116 | return [] 117 | 118 | 119 | def fix_image_flip_shape(image, result): 120 | """Set the shape to 3 dimensional if we don't know anything else. 121 | Args: 122 | image: original image size 123 | result: flipped or transformed image 124 | Returns: 125 | An image whose shape is at least None,None,None. 126 | """ 127 | image_shape = image.get_shape() 128 | if image_shape == tensor_shape.unknown_shape(): 129 | result.set_shape([None, None, None]) 130 | else: 131 | result.set_shape(image_shape) 132 | return result 133 | 134 | 135 | # =========================================================================== # 136 | # Image + BBoxes methods: cropping, resizing, flipping, ... 137 | # =========================================================================== # 138 | def bboxes_crop_or_pad(bboxes, 139 | height, width, 140 | offset_y, offset_x, 141 | target_height, target_width): 142 | """Adapt bounding boxes to crop or pad operations. 143 | Coordinates are always supposed to be relative to the image. 144 | 145 | Arguments: 146 | bboxes: Tensor Nx4 with bboxes coordinates [y_min, x_min, y_max, x_max]; 147 | height, width: Original image dimension; 148 | offset_y, offset_x: Offset to apply, 149 | negative if cropping, positive if padding; 150 | target_height, target_width: Target dimension after cropping / padding. 151 | """ 152 | with tf.name_scope('bboxes_crop_or_pad'): 153 | # Rescale bounding boxes in pixels. 154 | scale = tf.cast(tf.stack([height, width, height, width]), bboxes.dtype) 155 | bboxes = bboxes * scale 156 | # Add offset. 157 | offset = tf.cast(tf.stack([offset_y, offset_x, offset_y, offset_x]), bboxes.dtype) 158 | bboxes = bboxes + offset 159 | # Rescale to target dimension. 160 | scale = tf.cast(tf.stack([target_height, target_width, 161 | target_height, target_width]), bboxes.dtype) 162 | bboxes = bboxes / scale 163 | return bboxes 164 | 165 | 166 | def resize_image_bboxes_with_crop_or_pad(image, bboxes, 167 | target_height, target_width): 168 | """Crops and/or pads an image to a target width and height. 169 | Resizes an image to a target width and height by either centrally 170 | cropping the image or padding it evenly with zeros. 171 | 172 | If `width` or `height` is greater than the specified `target_width` or 173 | `target_height` respectively, this op centrally crops along that dimension. 174 | If `width` or `height` is smaller than the specified `target_width` or 175 | `target_height` respectively, this op centrally pads with 0 along that 176 | dimension. 177 | Args: 178 | image: 3-D tensor of shape `[height, width, channels]` 179 | target_height: Target height. 180 | target_width: Target width. 181 | Raises: 182 | ValueError: if `target_height` or `target_width` are zero or negative. 
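 
    Example (a sketch: center-cropping a 400x600 image to 300x300):
        image = tf.zeros([400, 600, 3])
        bboxes = tf.constant([[0.25, 0.25, 0.75, 0.75]])
        out_img, out_boxes = resize_image_bboxes_with_crop_or_pad(
            image, bboxes, 300, 300)
        # out_img is [300, 300, 3]; out_boxes is re-expressed relative to
        # the cropped frame (clipping may still be needed afterwards).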
183 | Returns: 184 | Cropped and/or padded image of shape 185 | `[target_height, target_width, channels]` 186 | """ 187 | with tf.name_scope('resize_with_crop_or_pad'): 188 | image = ops.convert_to_tensor(image, name='image') 189 | 190 | assert_ops = [] 191 | assert_ops += _Check3DImage(image, require_static=False) 192 | assert_ops += _assert(target_width > 0, ValueError, 193 | 'target_width must be > 0.') 194 | assert_ops += _assert(target_height > 0, ValueError, 195 | 'target_height must be > 0.') 196 | 197 | image = control_flow_ops.with_dependencies(assert_ops, image) 198 | # `crop_to_bounding_box` and `pad_to_bounding_box` have their own checks. 199 | # Make sure our checks come first, so that error messages are clearer. 200 | if _is_tensor(target_height): 201 | target_height = control_flow_ops.with_dependencies( 202 | assert_ops, target_height) 203 | if _is_tensor(target_width): 204 | target_width = control_flow_ops.with_dependencies(assert_ops, target_width) 205 | 206 | def max_(x, y): 207 | if _is_tensor(x) or _is_tensor(y): 208 | return math_ops.maximum(x, y) 209 | else: 210 | return max(x, y) 211 | 212 | def min_(x, y): 213 | if _is_tensor(x) or _is_tensor(y): 214 | return math_ops.minimum(x, y) 215 | else: 216 | return min(x, y) 217 | 218 | def equal_(x, y): 219 | if _is_tensor(x) or _is_tensor(y): 220 | return math_ops.equal(x, y) 221 | else: 222 | return x == y 223 | 224 | height, width, _ = _ImageDimensions(image) 225 | width_diff = target_width - width 226 | offset_crop_width = max_(-width_diff // 2, 0) 227 | offset_pad_width = max_(width_diff // 2, 0) 228 | 229 | height_diff = target_height - height 230 | offset_crop_height = max_(-height_diff // 2, 0) 231 | offset_pad_height = max_(height_diff // 2, 0) 232 | 233 | # Maybe crop if needed. 234 | height_crop = min_(target_height, height) 235 | width_crop = min_(target_width, width) 236 | cropped = tf.image.crop_to_bounding_box(image, offset_crop_height, offset_crop_width, 237 | height_crop, width_crop) 238 | bboxes = bboxes_crop_or_pad(bboxes, 239 | height, width, 240 | -offset_crop_height, -offset_crop_width, 241 | height_crop, width_crop) 242 | # Maybe pad if needed. 243 | resized = tf.image.pad_to_bounding_box(cropped, offset_pad_height, offset_pad_width, 244 | target_height, target_width) 245 | bboxes = bboxes_crop_or_pad(bboxes, 246 | height_crop, width_crop, 247 | offset_pad_height, offset_pad_width, 248 | target_height, target_width) 249 | 250 | # In theory all the checks below are redundant. 251 | if resized.get_shape().ndims is None: 252 | raise ValueError('resized contains no shape.') 253 | 254 | resized_height, resized_width, _ = _ImageDimensions(resized) 255 | 256 | assert_ops = [] 257 | assert_ops += _assert(equal_(resized_height, target_height), ValueError, 258 | 'resized height is not correct.') 259 | assert_ops += _assert(equal_(resized_width, target_width), ValueError, 260 | 'resized width is not correct.') 261 | 262 | resized = control_flow_ops.with_dependencies(assert_ops, resized) 263 | return resized, bboxes 264 | 265 | 266 | def resize_image(image, size, 267 | method=tf.image.ResizeMethod.BILINEAR, 268 | align_corners=False): 269 | """Resize an image and bounding boxes. 270 | """ 271 | # Resize image. 
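    # The image is expanded to a [1, H, W, C] batch for tf.image.resize_images
    # and then reshaped back to 3-D; reshaping with the stacked shape also pins
    # down the output's static dimensions when `size` is known, e.g. a
    # (300, 300) `size` yields a [300, 300, C] tensor later graph code can use.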
272 | with tf.name_scope('resize_image'): 273 | height, width, channels = _ImageDimensions(image) 274 | image = tf.expand_dims(image, 0) 275 | image = tf.image.resize_images(image, size, 276 | method, align_corners) 277 | image = tf.reshape(image, tf.stack([size[0], size[1], channels])) 278 | return image 279 | 280 | 281 | def random_flip_left_right(image, bboxes, seed=None): 282 | """Random flip left-right of an image and its bounding boxes. 283 | """ 284 | def flip_bboxes(bboxes): 285 | """Flip bounding boxes coordinates. 286 | """ 287 | bboxes = tf.stack([bboxes[:, 0], 1 - bboxes[:, 3], 288 | bboxes[:, 2], 1 - bboxes[:, 1]], axis=-1) 289 | return bboxes 290 | 291 | # Random flip. Tensorflow implementation. 292 | with tf.name_scope('random_flip_left_right'): 293 | image = ops.convert_to_tensor(image, name='image') 294 | _Check3DImage(image, require_static=False) 295 | uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed) 296 | mirror_cond = math_ops.less(uniform_random, .5) 297 | # Flip image. 298 | result = control_flow_ops.cond(mirror_cond, 299 | lambda: array_ops.reverse_v2(image, [1]), 300 | lambda: image) 301 | # Flip bboxes. 302 | bboxes = control_flow_ops.cond(mirror_cond, 303 | lambda: flip_bboxes(bboxes), 304 | lambda: bboxes) 305 | return fix_image_flip_shape(image, result), bboxes 306 | 307 | -------------------------------------------------------------------------------- /tf_extended/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Paul Balanca. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """TF Extended: additional metrics. 16 | """ 17 | 18 | # pylint: disable=unused-import,line-too-long,g-importing-member,wildcard-import 19 | from tf_extended.metrics import * 20 | from tf_extended.tensors import * 21 | from tf_extended.bboxes import * 22 | from tf_extended.image import * 23 | from tf_extended.math import * 24 | 25 | -------------------------------------------------------------------------------- /tf_extended/bboxes.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Paul Balanca. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | """TF Extended: additional bounding boxes methods. 16 | """ 17 | import numpy as np 18 | import tensorflow as tf 19 | 20 | from tf_extended import tensors as tfe_tensors 21 | from tf_extended import math as tfe_math 22 | 23 | 24 | # =========================================================================== # 25 | # Standard boxes algorithms. 26 | # =========================================================================== # 27 | def bboxes_sort_all_classes(classes, scores, bboxes, top_k=400, scope=None): 28 | """Sort bounding boxes by decreasing order and keep only the top_k. 29 | Assume the input Tensors mix-up objects with different classes. 30 | Assume a batch-type input. 31 | 32 | Args: 33 | classes: Batch x N Tensor containing integer classes. 34 | scores: Batch x N Tensor containing float scores. 35 | bboxes: Batch x N x 4 Tensor containing boxes coordinates. 36 | top_k: Top_k boxes to keep. 37 | Return: 38 | classes, scores, bboxes: Sorted tensors of shape Batch x Top_k. 39 | """ 40 | with tf.name_scope(scope, 'bboxes_sort', [classes, scores, bboxes]): 41 | scores, idxes = tf.nn.top_k(scores, k=top_k, sorted=True) 42 | 43 | # Trick to be able to use tf.gather: map for each element in the batch. 44 | def fn_gather(classes, bboxes, idxes): 45 | cl = tf.gather(classes, idxes) 46 | bb = tf.gather(bboxes, idxes) 47 | return [cl, bb] 48 | r = tf.map_fn(lambda x: fn_gather(x[0], x[1], x[2]), 49 | [classes, bboxes, idxes], 50 | dtype=[classes.dtype, bboxes.dtype], 51 | parallel_iterations=10, 52 | back_prop=False, 53 | swap_memory=False, 54 | infer_shape=True) 55 | classes = r[0] 56 | bboxes = r[1] 57 | return classes, scores, bboxes 58 | 59 | 60 | def bboxes_sort(scores, bboxes, top_k=400, scope=None): 61 | """Sort bounding boxes by decreasing order and keep only the top_k. 62 | If inputs are dictionnaries, assume every key is a different class. 63 | Assume a batch-type input. 64 | 65 | Args: 66 | scores: Batch x N Tensor/Dictionary containing float scores. 67 | bboxes: Batch x N x 4 Tensor/Dictionary containing boxes coordinates. 68 | top_k: Top_k boxes to keep. 69 | Return: 70 | scores, bboxes: Sorted Tensors/Dictionaries of shape Batch x Top_k x 1|4. 71 | """ 72 | # Dictionaries as inputs. 73 | if isinstance(scores, dict) or isinstance(bboxes, dict): 74 | with tf.name_scope(scope, 'bboxes_sort_dict'): 75 | d_scores = {} 76 | d_bboxes = {} 77 | for c in scores.keys(): 78 | s, b = bboxes_sort(scores[c], bboxes[c], top_k=top_k) 79 | d_scores[c] = s 80 | d_bboxes[c] = b 81 | return d_scores, d_bboxes 82 | 83 | # Tensors inputs. 84 | with tf.name_scope(scope, 'bboxes_sort', [scores, bboxes]): 85 | # Sort scores... 86 | scores, idxes = tf.nn.top_k(scores, k=top_k, sorted=True) 87 | 88 | # Trick to be able to use tf.gather: map for each element in the first dim. 89 | def fn_gather(bboxes, idxes): 90 | bb = tf.gather(bboxes, idxes) 91 | return [bb] 92 | r = tf.map_fn(lambda x: fn_gather(x[0], x[1]), 93 | [bboxes, idxes], 94 | dtype=[bboxes.dtype], 95 | parallel_iterations=10, 96 | back_prop=False, 97 | swap_memory=False, 98 | infer_shape=True) 99 | bboxes = r[0] 100 | return scores, bboxes 101 | 102 | 103 | def bboxes_clip(bbox_ref, bboxes, scope=None): 104 | """Clip bounding boxes to a reference box. 105 | Batch-compatible if the first dimension of `bbox_ref` and `bboxes` 106 | can be broadcasted. 107 | 108 | Args: 109 | bbox_ref: Reference bounding box. 
Nx4 or 4 shaped-Tensor; 110 | bboxes: Bounding boxes to clip. Nx4 or 4 shaped-Tensor or dictionary. 111 | Return: 112 | Clipped bboxes. 113 | """ 114 | # Bboxes is dictionary. 115 | if isinstance(bboxes, dict): 116 | with tf.name_scope(scope, 'bboxes_clip_dict'): 117 | d_bboxes = {} 118 | for c in bboxes.keys(): 119 | d_bboxes[c] = bboxes_clip(bbox_ref, bboxes[c]) 120 | return d_bboxes 121 | 122 | # Tensors inputs. 123 | with tf.name_scope(scope, 'bboxes_clip'): 124 | # Easier with transposed bboxes. Especially for broadcasting. 125 | bbox_ref = tf.transpose(bbox_ref) 126 | bboxes = tf.transpose(bboxes) 127 | # Intersection bboxes and reference bbox. 128 | ymin = tf.maximum(bboxes[0], bbox_ref[0]) 129 | xmin = tf.maximum(bboxes[1], bbox_ref[1]) 130 | ymax = tf.minimum(bboxes[2], bbox_ref[2]) 131 | xmax = tf.minimum(bboxes[3], bbox_ref[3]) 132 | bboxes = tf.transpose(tf.stack([ymin, xmin, ymax, xmax], axis=0)) 133 | return bboxes 134 | 135 | 136 | def bboxes_resize(bbox_ref, bboxes, name=None): 137 | """Resize bounding boxes based on a reference bounding box, 138 | assuming that the latter is [0, 0, 1, 1] after transform. Useful for 139 | updating a collection of boxes after cropping an image. 140 | """ 141 | # Bboxes is dictionary. 142 | if isinstance(bboxes, dict): 143 | with tf.name_scope(name, 'bboxes_resize_dict'): 144 | d_bboxes = {} 145 | for c in bboxes.keys(): 146 | d_bboxes[c] = bboxes_resize(bbox_ref, bboxes[c]) 147 | return d_bboxes 148 | 149 | # Tensors inputs. 150 | with tf.name_scope(name, 'bboxes_resize'): 151 | # Translate. 152 | v = tf.stack([bbox_ref[0], bbox_ref[1], bbox_ref[0], bbox_ref[1]]) 153 | bboxes = bboxes - v 154 | # Scale. 155 | s = tf.stack([bbox_ref[2] - bbox_ref[0], 156 | bbox_ref[3] - bbox_ref[1], 157 | bbox_ref[2] - bbox_ref[0], 158 | bbox_ref[3] - bbox_ref[1]]) 159 | bboxes = bboxes / s 160 | return bboxes 161 | 162 | 163 | def bboxes_nms(scores, bboxes, nms_threshold=0.5, keep_top_k=200, scope=None): 164 | """Apply non-maximum selection to bounding boxes. In comparison to TF 165 | implementation, use classes information for matching. 166 | Should only be used on single-entries. Use batch version otherwise. 167 | 168 | Args: 169 | scores: N Tensor containing float scores. 170 | bboxes: N x 4 Tensor containing boxes coordinates. 171 | nms_threshold: Matching threshold in NMS algorithm; 172 | keep_top_k: Number of total object to keep after NMS. 173 | Return: 174 | classes, scores, bboxes Tensors, sorted by score. 175 | Padded with zero if necessary. 176 | """ 177 | with tf.name_scope(scope, 'bboxes_nms_single', [scores, bboxes]): 178 | # Apply NMS algorithm. 179 | idxes = tf.image.non_max_suppression(bboxes, scores, 180 | keep_top_k, nms_threshold) 181 | scores = tf.gather(scores, idxes) 182 | bboxes = tf.gather(bboxes, idxes) 183 | # Pad results. 184 | scores = tfe_tensors.pad_axis(scores, 0, keep_top_k, axis=0) 185 | bboxes = tfe_tensors.pad_axis(bboxes, 0, keep_top_k, axis=0) 186 | return scores, bboxes 187 | 188 | 189 | def bboxes_nms_batch(scores, bboxes, nms_threshold=0.5, keep_top_k=200, 190 | scope=None): 191 | """Apply non-maximum selection to bounding boxes. In comparison to TF 192 | implementation, use classes information for matching. 193 | Use only on batched-inputs. Use zero-padding in order to batch output 194 | results. 195 | 196 | Args: 197 | scores: Batch x N Tensor/Dictionary containing float scores. 198 | bboxes: Batch x N x 4 Tensor/Dictionary containing boxes coordinates. 
199 |         nms_threshold: Matching threshold in NMS algorithm;
200 |         keep_top_k: Number of total object to keep after NMS.
201 |     Return:
202 |         scores, bboxes Tensors/Dictionaries, sorted by score.
203 |             Padded with zero if necessary.
204 |     """
205 |     # Dictionaries as inputs.
206 |     if isinstance(scores, dict) or isinstance(bboxes, dict):
207 |         with tf.name_scope(scope, 'bboxes_nms_batch_dict'):
208 |             d_scores = {}
209 |             d_bboxes = {}
210 |             for c in scores.keys():
211 |                 s, b = bboxes_nms_batch(scores[c], bboxes[c],
212 |                                         nms_threshold=nms_threshold,
213 |                                         keep_top_k=keep_top_k)
214 |                 d_scores[c] = s
215 |                 d_bboxes[c] = b
216 |             return d_scores, d_bboxes
217 | 
218 |     # Tensors inputs.
219 |     with tf.name_scope(scope, 'bboxes_nms_batch'):
220 |         r = tf.map_fn(lambda x: bboxes_nms(x[0], x[1],
221 |                                            nms_threshold, keep_top_k),
222 |                       (scores, bboxes),
223 |                       dtype=(scores.dtype, bboxes.dtype),
224 |                       parallel_iterations=10,
225 |                       back_prop=False,
226 |                       swap_memory=False,
227 |                       infer_shape=True)
228 |         scores, bboxes = r
229 |         return scores, bboxes
230 | 
231 | 
232 | # def bboxes_fast_nms(classes, scores, bboxes,
233 | #                     nms_threshold=0.5, eta=3., num_classes=21,
234 | #                     pad_output=True, scope=None):
235 | #     with tf.name_scope(scope, 'bboxes_fast_nms',
236 | #                        [classes, scores, bboxes]):
237 | 
238 | #         nms_classes = tf.zeros((0,), dtype=classes.dtype)
239 | #         nms_scores = tf.zeros((0,), dtype=scores.dtype)
240 | #         nms_bboxes = tf.zeros((0, 4), dtype=bboxes.dtype)
241 | 
242 | 
243 | def bboxes_matching(label, scores, bboxes,
244 |                     glabels, gbboxes, gdifficults,
245 |                     matching_threshold=0.5, scope=None):
246 |     """Matching a collection of detected boxes with groundtruth values.
247 |     Does not accept batched-inputs.
248 |     The algorithm goes as follows: for every detected box, check
249 |     whether a groundtruth box matches it. If none does, the detection counts
250 |     as a False Positive. If the groundtruth box is already matched to another
251 |     detection, it also counts as a False Positive. We refer to the Pascal VOC
252 |     documentation for the details.
253 | 
254 |     Args:
255 |         label, scores, bboxes: N(x4) Tensors. Detected objects, sorted by score;
256 |         glabels, gbboxes: Groundtruth bounding boxes. May be zero padded, hence
257 |             zero-class objects are ignored.
258 |         matching_threshold: Threshold for a positive match.
259 |     Return: Tuple of:
260 |         n_gbboxes: Scalar Tensor with number of groundtruth boxes (may differ
261 |             from size because of zero padding).
262 |         tp_match: (N,)-shaped boolean Tensor containing the True Positives.
263 |         fp_match: (N,)-shaped boolean Tensor containing the False Positives.
264 |     """
265 |     with tf.name_scope(scope, 'bboxes_matching_single',
266 |                        [scores, bboxes, glabels, gbboxes]):
267 |         rsize = tf.size(scores)
268 |         rshape = tf.shape(scores)
269 |         rlabel = tf.cast(label, glabels.dtype)
270 |         # Number of groundtruth boxes.
271 |         gdifficults = tf.cast(gdifficults, tf.bool)
272 |         n_gbboxes = tf.count_nonzero(tf.logical_and(tf.equal(glabels, label),
273 |                                                     tf.logical_not(gdifficults)))
274 |         # Groundtruth matching arrays.
275 |         gmatch = tf.zeros(tf.shape(glabels), dtype=tf.bool)
276 |         grange = tf.range(tf.size(glabels), dtype=tf.int32)
277 |         # True/False positive matching TensorArrays.
278 |         sdtype = tf.bool
279 |         ta_tp_bool = tf.TensorArray(sdtype, size=rsize, dynamic_size=False, infer_shape=True)
280 |         ta_fp_bool = tf.TensorArray(sdtype, size=rsize, dynamic_size=False, infer_shape=True)
281 | 
282 |         # Loop over returned objects.
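        # The while_loop below walks detections one at a time, in descending
        # score order; in pseudo-Python it does (a sketch of m_body, not extra
        # graph code):
        #   for i in range(num_detections):
        #       j = argmax(jaccard(bboxes[i], gbboxes))   # best groundtruth
        #       TP if jaccard[j] > threshold and gmatch[j] was False;
        #       FP if below threshold or groundtruth j already matched;
        #       difficult groundtruth boxes count as neither, and gmatch[j]
        #       is set once a groundtruth box is claimed.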
282 |         def m_condition(i, ta_tp, ta_fp, gmatch):
283 |             r = tf.less(i, rsize)
284 |             return r
285 | 
286 |         def m_body(i, ta_tp, ta_fp, gmatch):
287 |             # Jaccard score with groundtruth bboxes.
288 |             rbbox = bboxes[i]
289 |             jaccard = bboxes_jaccard(rbbox, gbboxes)
290 |             jaccard = jaccard * tf.cast(tf.equal(glabels, rlabel), dtype=jaccard.dtype)
291 | 
292 |             # Best fit, checking it's above threshold.
293 |             idxmax = tf.cast(tf.argmax(jaccard, axis=0), tf.int32)
294 |             jcdmax = jaccard[idxmax]
295 |             match = jcdmax > matching_threshold
296 |             existing_match = gmatch[idxmax]
297 |             not_difficult = tf.logical_not(gdifficults[idxmax])
298 | 
299 |             # TP: match & no previous match and FP: previous match | no match.
300 |             # If difficult: no record, i.e. FP=False and TP=False.
301 |             tp = tf.logical_and(not_difficult,
302 |                                 tf.logical_and(match, tf.logical_not(existing_match)))
303 |             ta_tp = ta_tp.write(i, tp)
304 |             fp = tf.logical_and(not_difficult,
305 |                                 tf.logical_or(existing_match, tf.logical_not(match)))
306 |             ta_fp = ta_fp.write(i, fp)
307 |             # Update groundtruth match.
308 |             mask = tf.logical_and(tf.equal(grange, idxmax),
309 |                                   tf.logical_and(not_difficult, match))
310 |             gmatch = tf.logical_or(gmatch, mask)
311 | 
312 |             return [i+1, ta_tp, ta_fp, gmatch]
313 |         # Main loop definition.
314 |         i = 0
315 |         [i, ta_tp_bool, ta_fp_bool, gmatch] = \
316 |             tf.while_loop(m_condition, m_body,
317 |                           [i, ta_tp_bool, ta_fp_bool, gmatch],
318 |                           parallel_iterations=1,
319 |                           back_prop=False)
320 |         # TensorArrays to Tensors and reshape.
321 |         tp_match = tf.reshape(ta_tp_bool.stack(), rshape)
322 |         fp_match = tf.reshape(ta_fp_bool.stack(), rshape)
323 | 
324 |         # Some debugging information...
325 |         # tp_match = tf.Print(tp_match,
326 |         #                     [n_gbboxes,
327 |         #                      tf.reduce_sum(tf.cast(tp_match, tf.int64)),
328 |         #                      tf.reduce_sum(tf.cast(fp_match, tf.int64)),
329 |         #                      tf.reduce_sum(tf.cast(gmatch, tf.int64))],
330 |         #                     'Matching (NG, TP, FP, GM): ')
331 |         return n_gbboxes, tp_match, fp_match
332 | 
333 | 
334 | def bboxes_matching_batch(labels, scores, bboxes,
335 |                           glabels, gbboxes, gdifficults,
336 |                           matching_threshold=0.5, scope=None):
337 |     """Matching a collection of detected boxes with groundtruth values.
338 |     Batched-inputs version.
339 | 
340 |     Args:
341 |         labels, scores, bboxes: BxN(x4) Tensors. Detected objects, sorted by score;
342 |         glabels, gbboxes: Groundtruth bounding boxes. May be zero padded, hence
343 |             zero-class objects are ignored.
344 |         matching_threshold: Threshold for a positive match.
345 |     Return: Tuple or Dictionaries with:
346 |         n_gbboxes: Scalar Tensor with number of groundtruth boxes (may differ
347 |             from size because of zero padding).
348 |         tp: (B, N)-shaped boolean Tensor containing the True Positives.
349 |         fp: (B, N)-shaped boolean Tensor containing the False Positives.
350 |     """
351 |     # Dictionaries as inputs.
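    # tf_extended metrics accept either plain Tensors or per-class dictionaries
    # {class: Tensor}; the dict branch below simply recurses per key. A usage
    # sketch, assuming per-class dicts d_scores / d_bboxes from the detection
    # head:
    #   n, tp, fp, sc = bboxes_matching_batch(d_scores.keys(), d_scores,
    #                                         d_bboxes, glabels, gbboxes,
    #                                         gdifficults)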
352 |     if isinstance(scores, dict) or isinstance(bboxes, dict):
353 |         with tf.name_scope(scope, 'bboxes_matching_batch_dict'):
354 |             d_n_gbboxes = {}
355 |             d_tp = {}
356 |             d_fp = {}
357 |             for c in labels:
358 |                 n, tp, fp, _ = bboxes_matching_batch(c, scores[c], bboxes[c],
359 |                                                      glabels, gbboxes, gdifficults,
360 |                                                      matching_threshold)
361 |                 d_n_gbboxes[c] = n
362 |                 d_tp[c] = tp
363 |                 d_fp[c] = fp
364 |             return d_n_gbboxes, d_tp, d_fp, scores
365 | 
366 |     with tf.name_scope(scope, 'bboxes_matching_batch',
367 |                        [scores, bboxes, glabels, gbboxes]):
368 |         r = tf.map_fn(lambda x: bboxes_matching(labels, x[0], x[1],
369 |                                                 x[2], x[3], x[4],
370 |                                                 matching_threshold),
371 |                       (scores, bboxes, glabels, gbboxes, gdifficults),
372 |                       dtype=(tf.int64, tf.bool, tf.bool),
373 |                       parallel_iterations=10,
374 |                       back_prop=False,
375 |                       swap_memory=True,
376 |                       infer_shape=True)
377 |         return r[0], r[1], r[2], scores
378 | 
379 | 
380 | # =========================================================================== #
381 | # Some filtering methods.
382 | # =========================================================================== #
383 | def bboxes_filter_center(labels, bboxes, margins=[0., 0., 0., 0.],
384 |                          scope=None):
385 |     """Filter out bounding boxes whose center are not in
386 |     the rectangle [0, 0, 1, 1] + margins. The margin Tensor
387 |     can be used to enforce or loosen this condition.
388 | 
389 |     Return:
390 |         labels, bboxes: Filtered elements.
391 |     """
392 |     with tf.name_scope(scope, 'bboxes_filter', [labels, bboxes]):
393 |         cy = (bboxes[:, 0] + bboxes[:, 2]) / 2.
394 |         cx = (bboxes[:, 1] + bboxes[:, 3]) / 2.
395 |         mask = tf.greater(cy, margins[0])
396 |         mask = tf.logical_and(mask, tf.greater(cx, margins[1]))
397 |         mask = tf.logical_and(mask, tf.less(cy, 1. + margins[2]))
398 |         mask = tf.logical_and(mask, tf.less(cx, 1. + margins[3]))
399 |         # Boolean masking...
400 |         labels = tf.boolean_mask(labels, mask)
401 |         bboxes = tf.boolean_mask(bboxes, mask)
402 |         return labels, bboxes
403 | 
404 | 
405 | def bboxes_filter_overlap(labels, bboxes, threshold=0.5,
406 |                           scope=None):
407 |     """Filter out bounding boxes based on overlap with reference
408 |     box [0, 0, 1, 1].
409 | 
410 |     Return:
411 |         labels, bboxes: Filtered elements.
412 |     """
413 |     with tf.name_scope(scope, 'bboxes_filter', [labels, bboxes]):
414 |         scores = bboxes_intersection(tf.constant([0, 0, 1, 1], bboxes.dtype),
415 |                                      bboxes)
416 |         mask = scores > threshold
417 |         labels = tf.boolean_mask(labels, mask)
418 |         bboxes = tf.boolean_mask(bboxes, mask)
419 |         return labels, bboxes
420 | 
421 | 
422 | def bboxes_filter_labels(labels, bboxes,
423 |                          out_labels=[], num_classes=np.inf,
424 |                          scope=None):
425 |     """Filter out labels from a collection. Typically used to get rid
426 |     of DontCare elements. Also remove elements based on the number of classes.
427 | 
428 |     Return:
429 |         labels, bboxes: Filtered elements.
430 |     """
431 |     with tf.name_scope(scope, 'bboxes_filter_labels', [labels, bboxes]):
432 |         mask = tf.less(labels, num_classes)
433 |         for l in out_labels:
434 |             mask = tf.logical_and(mask, tf.not_equal(labels, l))
435 |         labels = tf.boolean_mask(labels, mask)
436 |         bboxes = tf.boolean_mask(bboxes, mask)
437 |         return labels, bboxes
438 | 
439 | 
440 | # =========================================================================== #
441 | # Standard boxes computation.
442 | # =========================================================================== # 443 | def bboxes_jaccard(bbox_ref, bboxes, name=None): 444 | """Compute jaccard score between a reference box and a collection 445 | of bounding boxes. 446 | 447 | Args: 448 | bbox_ref: (N, 4) or (4,) Tensor with reference bounding box(es). 449 | bboxes: (N, 4) Tensor, collection of bounding boxes. 450 | Return: 451 | (N,) Tensor with Jaccard scores. 452 | """ 453 | with tf.name_scope(name, 'bboxes_jaccard'): 454 | # Should be more efficient to first transpose. 455 | bboxes = tf.transpose(bboxes) 456 | bbox_ref = tf.transpose(bbox_ref) 457 | # Intersection bbox and volume. 458 | int_ymin = tf.maximum(bboxes[0], bbox_ref[0]) 459 | int_xmin = tf.maximum(bboxes[1], bbox_ref[1]) 460 | int_ymax = tf.minimum(bboxes[2], bbox_ref[2]) 461 | int_xmax = tf.minimum(bboxes[3], bbox_ref[3]) 462 | h = tf.maximum(int_ymax - int_ymin, 0.) 463 | w = tf.maximum(int_xmax - int_xmin, 0.) 464 | # Volumes. 465 | inter_vol = h * w 466 | union_vol = -inter_vol \ 467 | + (bboxes[2] - bboxes[0]) * (bboxes[3] - bboxes[1]) \ 468 | + (bbox_ref[2] - bbox_ref[0]) * (bbox_ref[3] - bbox_ref[1]) 469 | jaccard = tfe_math.safe_divide(inter_vol, union_vol, 'jaccard') 470 | return jaccard 471 | 472 | 473 | def bboxes_intersection(bbox_ref, bboxes, name=None): 474 | """Compute relative intersection between a reference box and a 475 | collection of bounding boxes. Namely, compute the quotient between 476 | intersection area and box area. 477 | 478 | Args: 479 | bbox_ref: (N, 4) or (4,) Tensor with reference bounding box(es). 480 | bboxes: (N, 4) Tensor, collection of bounding boxes. 481 | Return: 482 | (N,) Tensor with relative intersection. 483 | """ 484 | with tf.name_scope(name, 'bboxes_intersection'): 485 | # Should be more efficient to first transpose. 486 | bboxes = tf.transpose(bboxes) 487 | bbox_ref = tf.transpose(bbox_ref) 488 | # Intersection bbox and volume. 489 | int_ymin = tf.maximum(bboxes[0], bbox_ref[0]) 490 | int_xmin = tf.maximum(bboxes[1], bbox_ref[1]) 491 | int_ymax = tf.minimum(bboxes[2], bbox_ref[2]) 492 | int_xmax = tf.minimum(bboxes[3], bbox_ref[3]) 493 | h = tf.maximum(int_ymax - int_ymin, 0.) 494 | w = tf.maximum(int_xmax - int_xmin, 0.) 495 | # Volumes. 496 | inter_vol = h * w 497 | bboxes_vol = (bboxes[2] - bboxes[0]) * (bboxes[3] - bboxes[1]) 498 | scores = tfe_math.safe_divide(inter_vol, bboxes_vol, 'intersection') 499 | return scores 500 | -------------------------------------------------------------------------------- /tf_extended/image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gxd1994/TextBoxes-TensorFlow/7ae19de6c4e7bccaa5695762bd1f0864b9f4e593/tf_extended/image.py -------------------------------------------------------------------------------- /tf_extended/math.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Paul Balanca. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """TF Extended: additional math functions. 16 | """ 17 | import tensorflow as tf 18 | 19 | from tensorflow.python.ops import array_ops 20 | from tensorflow.python.ops import math_ops 21 | from tensorflow.python.framework import dtypes 22 | from tensorflow.python.framework import ops 23 | 24 | 25 | def safe_divide(numerator, denominator, name): 26 | """Divides two values, returning 0 if the denominator is <= 0. 27 | Args: 28 | numerator: A real `Tensor`. 29 | denominator: A real `Tensor`, with dtype matching `numerator`. 30 | name: Name for the returned op. 31 | Returns: 32 | 0 if `denominator` <= 0, else `numerator` / `denominator` 33 | """ 34 | return tf.where( 35 | math_ops.greater(denominator, 0), 36 | math_ops.divide(numerator, denominator), 37 | tf.zeros_like(numerator), 38 | name=name) 39 | 40 | 41 | def cummax(x, reverse=False, name=None): 42 | """Compute the cumulative maximum of the tensor `x` along `axis`. This 43 | operation is similar to the more classic `cumsum`. Only support 1D Tensor 44 | for now. 45 | 46 | Args: 47 | x: A `Tensor`. Must be one of the following types: `float32`, `float64`, 48 | `int64`, `int32`, `uint8`, `uint16`, `int16`, `int8`, `complex64`, 49 | `complex128`, `qint8`, `quint8`, `qint32`, `half`. 50 | axis: A `Tensor` of type `int32` (default: 0). 51 | reverse: A `bool` (default: False). 52 | name: A name for the operation (optional). 53 | Returns: 54 | A `Tensor`. Has the same type as `x`. 55 | """ 56 | with ops.name_scope(name, "Cummax", [x]) as name: 57 | x = ops.convert_to_tensor(x, name="x") 58 | # Not very optimal: should directly integrate reverse into tf.scan. 59 | if reverse: 60 | x = tf.reverse(x, axis=[0]) 61 | # 'Accumlating' maximum: ensure it is always increasing. 62 | cmax = tf.scan(lambda a, y: tf.maximum(a, y), x, 63 | initializer=None, parallel_iterations=1, 64 | back_prop=False, swap_memory=False) 65 | if reverse: 66 | cmax = tf.reverse(cmax, axis=[0]) 67 | return cmax 68 | -------------------------------------------------------------------------------- /tf_extended/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 Paul Balanca. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """TF Extended: additional metrics. 
/tf_extended/metrics.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Paul Balanca. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """TF Extended: additional metrics.
16 | """
17 | import tensorflow as tf
18 | import numpy as np
19 | 
20 | from tensorflow.contrib.framework.python.ops import variables as contrib_variables
21 | from tensorflow.python.framework import dtypes
22 | from tensorflow.python.framework import ops
23 | from tensorflow.python.ops import array_ops
24 | from tensorflow.python.ops import math_ops
25 | from tensorflow.python.ops import nn
26 | from tensorflow.python.ops import state_ops
27 | from tensorflow.python.ops import variable_scope
28 | from tensorflow.python.ops import variables
29 | 
30 | from tf_extended import math as tfe_math
31 | 
32 | 
33 | # =========================================================================== #
34 | # TensorFlow utils
35 | # =========================================================================== #
36 | def _create_local(name, shape, collections=None, validate_shape=True,
37 |                   dtype=dtypes.float32):
38 |     """Creates a new local variable.
39 |     Args:
40 |       name: The name of the new or existing variable.
41 |       shape: Shape of the new or existing variable.
42 |       collections: A list of collection names to which the Variable will be added.
43 |       validate_shape: Whether to validate the shape of the variable.
44 |       dtype: Data type of the variables.
45 |     Returns:
46 |       The created variable.
47 |     """
48 |     # Make sure local variables are added to tf.GraphKeys.LOCAL_VARIABLES
49 |     collections = list(collections or [])
50 |     collections += [ops.GraphKeys.LOCAL_VARIABLES]
51 |     return variables.Variable(
52 |         initial_value=array_ops.zeros(shape, dtype=dtype),
53 |         name=name,
54 |         trainable=False,
55 |         collections=collections,
56 |         validate_shape=validate_shape)
57 | 
58 | 
59 | def _safe_div(numerator, denominator, name):
60 |     """Divides two values, returning 0 if the denominator is <= 0.
61 |     Args:
62 |       numerator: A real `Tensor`.
63 |       denominator: A real `Tensor`, with dtype matching `numerator`.
64 |       name: Name for the returned op.
65 |     Returns:
66 |       0 if `denominator` <= 0, else `numerator` / `denominator`
67 |     """
68 |     return tf.where(
69 |         math_ops.greater(denominator, 0),
70 |         math_ops.divide(numerator, denominator),
71 |         tf.zeros_like(numerator),
72 |         name=name)
73 | 
74 | 
75 | def _broadcast_weights(weights, values):
76 |     """Broadcast `weights` to the same shape as `values`.
77 |     This returns a version of `weights` following the same broadcast rules as
78 |     `mul(weights, values)`. When computing a weighted average, use this function
79 |     to broadcast `weights` before summing them; e.g.,
80 |     `reduce_sum(w * v) / reduce_sum(_broadcast_weights(w, v))`.
81 |     Args:
82 |       weights: `Tensor` whose shape is broadcastable to `values`.
83 |       values: `Tensor` of any shape.
84 |     Returns:
85 |       `weights` broadcast to `values` shape.
86 |     """
87 |     weights_shape = weights.get_shape()
88 |     values_shape = values.get_shape()
89 |     if (weights_shape.is_fully_defined() and
90 |             values_shape.is_fully_defined() and
91 |             weights_shape.is_compatible_with(values_shape)):
92 |         return weights
93 |     return math_ops.multiply(
94 |         weights, array_ops.ones_like(values), name='broadcast_weights')
95 | 
96 | 
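# These three helpers are the scaffolding for the streaming metrics below:
# _create_local builds the growing accumulator variables, _safe_div guards
# the precision/recall ratios, and _broadcast_weights aligns a weight tensor
# with the values it scales. A quick sketch of the broadcast behaviour
# (invented shapes): a (2, 1) weight column against (2, 3) values.
#
#     w = tf.constant([[1.], [2.]])
#     v = tf.zeros((2, 3))
#     bw = _broadcast_weights(w, v)    # -> (2, 3), rows weighted 1. and 2.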
97 | # =========================================================================== #
98 | # TF Extended metrics: TP and FP arrays.
99 | # =========================================================================== #
100 | def precision_recall(num_gbboxes, num_detections, tp, fp, scores,
101 |                      dtype=tf.float64, scope=None):
102 |     """Compute precision and recall from scores, true positive and false
103 |     positive boolean arrays.
104 |     """
105 |     # Input dictionaries: dict outputs as streaming metrics.
106 |     if isinstance(scores, dict):
107 |         d_precision = {}
108 |         d_recall = {}
109 |         for c in num_gbboxes.keys():
110 |             scope = 'precision_recall_%s' % c
111 |             p, r = precision_recall(num_gbboxes[c], num_detections[c],
112 |                                     tp[c], fp[c], scores[c],
113 |                                     dtype, scope)
114 |             d_precision[c] = p
115 |             d_recall[c] = r
116 |         return d_precision, d_recall
117 | 
118 |     # Sort by score.
119 |     with tf.name_scope(scope, 'precision_recall',
120 |                        [num_gbboxes, num_detections, tp, fp, scores]):
121 |         # Sort detections by score.
122 |         scores, idxes = tf.nn.top_k(scores, k=num_detections, sorted=True)
123 |         tp = tf.gather(tp, idxes)
124 |         fp = tf.gather(fp, idxes)
125 |         # Compute recall and precision.
126 |         tp = tf.cumsum(tf.cast(tp, dtype), axis=0)
127 |         fp = tf.cumsum(tf.cast(fp, dtype), axis=0)
128 |         recall = _safe_div(tp, tf.cast(num_gbboxes, dtype), 'recall')
129 |         precision = _safe_div(tp, tp + fp, 'precision')
130 |         return tf.tuple([precision, recall])
131 | 
132 | 
133 | def streaming_tp_fp_arrays(num_gbboxes, tp, fp, scores,
134 |                            remove_zero_scores=True,
135 |                            metrics_collections=None,
136 |                            updates_collections=None,
137 |                            name=None):
138 |     """Streaming computation of True and False Positive arrays. This metric
139 |     also keeps track of scores and the number of groundtruth objects.
140 |     """
141 |     # Input dictionaries: dict outputs as streaming metrics.
142 |     if isinstance(scores, dict) or isinstance(fp, dict):
143 |         d_values = {}
144 |         d_update_ops = {}
145 |         for c in num_gbboxes.keys():
146 |             scope = 'streaming_tp_fp_%s' % c
147 |             v, up = streaming_tp_fp_arrays(num_gbboxes[c], tp[c], fp[c], scores[c],
148 |                                            remove_zero_scores,
149 |                                            metrics_collections,
150 |                                            updates_collections,
151 |                                            name=scope)
152 |             d_values[c] = v
153 |             d_update_ops[c] = up
154 |         return d_values, d_update_ops
155 | 
156 |     # Input Tensors...
157 |     with variable_scope.variable_scope(name, 'streaming_tp_fp',
158 |                                        [num_gbboxes, tp, fp, scores]):
159 |         num_gbboxes = math_ops.to_int64(num_gbboxes)
160 |         scores = math_ops.to_float(scores)
161 |         stype = tf.bool
162 |         tp = tf.cast(tp, stype)
163 |         fp = tf.cast(fp, stype)
164 |         # Reshape TP and FP tensors and clean away 0 class values.
165 |         scores = tf.reshape(scores, [-1])
166 |         tp = tf.reshape(tp, [-1])
167 |         fp = tf.reshape(fp, [-1])
168 |         # Remove entries where TP and FP are both false.
169 |         mask = tf.logical_or(tp, fp)
170 |         if remove_zero_scores:
171 |             rm_threshold = 1e-4
172 |             mask = tf.logical_and(mask, tf.greater(scores, rm_threshold))
173 |             scores = tf.boolean_mask(scores, mask)
174 |             tp = tf.boolean_mask(tp, mask)
175 |             fp = tf.boolean_mask(fp, mask)
176 | 
177 |         # Local variables accumulating information over batches.
178 |         v_nobjects = _create_local('v_num_gbboxes', shape=[], dtype=tf.int64)
179 |         v_ndetections = _create_local('v_num_detections', shape=[], dtype=tf.int32)
180 |         v_scores = _create_local('v_scores', shape=[0, ])
181 |         v_tp = _create_local('v_tp', shape=[0, ], dtype=stype)
182 |         v_fp = _create_local('v_fp', shape=[0, ], dtype=stype)
183 | 
184 |         # Update operations.
185 |         nobjects_op = state_ops.assign_add(v_nobjects,
186 |                                            tf.reduce_sum(num_gbboxes))
187 |         ndetections_op = state_ops.assign_add(v_ndetections,
188 |                                               tf.size(scores, out_type=tf.int32))
189 |         scores_op = state_ops.assign(v_scores, tf.concat([v_scores, scores], axis=0),
190 |                                      validate_shape=False)
191 |         tp_op = state_ops.assign(v_tp, tf.concat([v_tp, tp], axis=0),
192 |                                  validate_shape=False)
193 |         fp_op = state_ops.assign(v_fp, tf.concat([v_fp, fp], axis=0),
194 |                                  validate_shape=False)
195 | 
196 |         # Value and update ops.
197 |         val = (v_nobjects, v_ndetections, v_tp, v_fp, v_scores)
198 |         with ops.control_dependencies([nobjects_op, ndetections_op,
199 |                                        scores_op, tp_op, fp_op]):
200 |             update_op = (nobjects_op, ndetections_op, tp_op, fp_op, scores_op)
201 | 
202 |         if metrics_collections:
203 |             ops.add_to_collections(metrics_collections, val)
204 |         if updates_collections:
205 |             ops.add_to_collections(updates_collections, update_op)
206 |         return val, update_op
207 | 
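# Typical evaluation wiring for the two functions above (a sketch; the match
# step producing the per-detection tp/fp booleans is assumed to exist):
#
#     val, update_op = streaming_tp_fp_arrays(n_gt, tp_match, fp_match, scores)
#     n_gt_acc, n_det_acc, v_tp, v_fp, v_scores = val
#     prec, rec = precision_recall(n_gt_acc, n_det_acc, v_tp, v_fp, v_scores)
#
# Run update_op once per evaluation batch, then evaluate prec/rec (and the
# average precision ops below) after the last batch.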
208 | 
209 | # =========================================================================== #
210 | # Average precision computations.
211 | # =========================================================================== #
212 | def average_precision_voc12(precision, recall, name=None):
213 |     """Compute (interpolated) average precision from precision and recall Tensors.
214 | 
215 |     The implementation follows Pascal 2012 and ILSVRC guidelines.
216 |     See also: https://sanchom.wordpress.com/tag/average-precision/
217 |     """
218 |     with tf.name_scope(name, 'average_precision_voc12', [precision, recall]):
219 |         # Convert to float64 to decrease error on Riemann sums.
220 |         precision = tf.cast(precision, dtype=tf.float64)
221 |         recall = tf.cast(recall, dtype=tf.float64)
222 | 
223 |         # Add bounds values to precision and recall.
224 |         precision = tf.concat([[0.], precision, [0.]], axis=0)
225 |         recall = tf.concat([[0.], recall, [1.]], axis=0)
226 |         # Ensures precision is increasing in reverse order.
227 |         precision = tfe_math.cummax(precision, reverse=True)
228 | 
229 |         # Riemann sums for estimating the integral.
230 |         # mean_pre = (precision[1:] + precision[:-1]) / 2.
231 |         mean_pre = precision[1:]
232 |         diff_rec = recall[1:] - recall[:-1]
233 |         ap = tf.reduce_sum(mean_pre * diff_rec)
234 |         return ap
235 | 
236 | 
237 | def average_precision_voc07(precision, recall, name=None):
238 |     """Compute (interpolated) average precision from precision and recall Tensors.
239 | 
240 |     The implementation follows Pascal 2007 guidelines.
241 |     See also: https://sanchom.wordpress.com/tag/average-precision/
242 |     """
243 |     with tf.name_scope(name, 'average_precision_voc07', [precision, recall]):
244 |         # Convert to float64 to decrease error on cumulated sums.
245 |         precision = tf.cast(precision, dtype=tf.float64)
246 |         recall = tf.cast(recall, dtype=tf.float64)
247 |         # Add zero-limit value to avoid any boundary problem...
248 |         precision = tf.concat([precision, [0.]], axis=0)
249 |         recall = tf.concat([recall, [np.inf]], axis=0)
250 | 
251 |         # Sample the curve at 11 recall points (0.0 to 1.0).
252 |         l_aps = []
253 |         for t in np.arange(0., 1.1, 0.1):
254 |             mask = tf.greater_equal(recall, t)
255 |             v = tf.reduce_max(tf.boolean_mask(precision, mask))
256 |             l_aps.append(v / 11.)
257 |         ap = tf.add_n(l_aps)
258 |         return ap
259 | 
260 | 
261 | def precision_recall_values(xvals, precision, recall, name=None):
262 |     """Compute values on the precision/recall curve.
263 | 
264 |     Args:
265 |       xvals: Python list of recall values (floats);
266 |       precision: 1D Tensor, decreasing.
267 |       recall: 1D Tensor, increasing.
268 |     Return:
269 |       list of precision values.
270 |     """
271 |     with ops.name_scope(name, "precision_recall_values",
272 |                         [precision, recall]) as name:
273 |         # Add bounds values to precision and recall.
274 |         precision = tf.concat([[0.], precision, [0.]], axis=0)
275 |         recall = tf.concat([[0.], recall, [1.]], axis=0)
276 |         precision = tfe_math.cummax(precision, reverse=True)
277 | 
278 |         prec_values = []
279 |         for x in xvals:
280 |             mask = tf.less_equal(recall, x)
281 |             val = tf.reduce_min(tf.boolean_mask(precision, mask))
282 |             prec_values.append(val)
283 |         return tf.tuple(prec_values)
284 | 
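# A toy check of the two AP variants on precision [1.0, 0.5] at recall
# [0.5, 1.0] (invented values; run in a TF 1.x session):
#
#     ap12 = average_precision_voc12(tf.constant([1.0, 0.5]),
#                                    tf.constant([0.5, 1.0]))
#     # Integral of the interpolated curve: 0.5*1.0 + 0.5*0.5 = 0.75.
#     ap07 = average_precision_voc07(tf.constant([1.0, 0.5]),
#                                    tf.constant([0.5, 1.0]))
#     # 11-point average: (6*1.0 + 5*0.5) / 11 ~= 0.773.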
285 | 
286 | # =========================================================================== #
287 | # TF Extended metrics: old stuff!
288 | # =========================================================================== #
289 | def _precision_recall(n_gbboxes, n_detections, scores, tp, fp, scope=None):
290 |     """Compute precision and recall from scores, true positive and false
291 |     positive boolean arrays.
292 |     """
293 |     # Sort by score.
294 |     with tf.name_scope(scope, 'prec_rec', [n_gbboxes, scores, tp, fp]):
295 |         # Sort detections by score.
296 |         scores, idxes = tf.nn.top_k(scores, k=n_detections, sorted=True)
297 |         tp = tf.gather(tp, idxes)
298 |         fp = tf.gather(fp, idxes)
299 |         # Compute recall and precision.
300 |         dtype = tf.float64
301 |         tp = tf.cumsum(tf.cast(tp, dtype), axis=0)
302 |         fp = tf.cumsum(tf.cast(fp, dtype), axis=0)
303 |         recall = _safe_div(tp, tf.cast(n_gbboxes, dtype), 'recall')
304 |         precision = _safe_div(tp, tp + fp, 'precision')
305 | 
306 |         return tf.tuple([precision, recall])
307 | 
308 | 
309 | def streaming_precision_recall_arrays(n_gbboxes, rclasses, rscores,
310 |                                       tp_tensor, fp_tensor,
311 |                                       remove_zero_labels=True,
312 |                                       metrics_collections=None,
313 |                                       updates_collections=None,
314 |                                       name=None):
315 |     """Streaming computation of precision / recall arrays. This metric
316 |     keeps track of boolean True positive and False positive arrays.
317 |     """
318 |     with variable_scope.variable_scope(name, 'stream_precision_recall',
319 |                                        [n_gbboxes, rclasses, tp_tensor, fp_tensor]):
320 |         n_gbboxes = math_ops.to_int64(n_gbboxes)
321 |         rclasses = math_ops.to_int64(rclasses)
322 |         rscores = math_ops.to_float(rscores)
323 | 
324 |         stype = tf.int32
325 |         tp_tensor = tf.cast(tp_tensor, stype)
326 |         fp_tensor = tf.cast(fp_tensor, stype)
327 | 
328 |         # Reshape TP and FP tensors and clean away 0 class values.
329 |         rclasses = tf.reshape(rclasses, [-1])
330 |         rscores = tf.reshape(rscores, [-1])
331 |         tp_tensor = tf.reshape(tp_tensor, [-1])
332 |         fp_tensor = tf.reshape(fp_tensor, [-1])
333 |         if remove_zero_labels:
334 |             mask = tf.greater(rclasses, 0)
335 |             rclasses = tf.boolean_mask(rclasses, mask)
336 |             rscores = tf.boolean_mask(rscores, mask)
337 |             tp_tensor = tf.boolean_mask(tp_tensor, mask)
338 |             fp_tensor = tf.boolean_mask(fp_tensor, mask)
339 | 
340 |         # Local variables accumulating information over batches.
341 |         v_nobjects = _create_local('v_nobjects', shape=[], dtype=tf.int64)
342 |         v_ndetections = _create_local('v_ndetections', shape=[], dtype=tf.int32)
343 |         v_scores = _create_local('v_scores', shape=[0, ])
344 |         v_tp = _create_local('v_tp', shape=[0, ], dtype=stype)
345 |         v_fp = _create_local('v_fp', shape=[0, ], dtype=stype)
346 | 
347 |         # Update operations.
348 |         nobjects_op = state_ops.assign_add(v_nobjects,
349 |                                            tf.reduce_sum(n_gbboxes))
350 |         ndetections_op = state_ops.assign_add(v_ndetections,
351 |                                               tf.size(rscores, out_type=tf.int32))
352 |         scores_op = state_ops.assign(v_scores, tf.concat([v_scores, rscores], axis=0),
353 |                                      validate_shape=False)
354 |         tp_op = state_ops.assign(v_tp, tf.concat([v_tp, tp_tensor], axis=0),
355 |                                  validate_shape=False)
356 |         fp_op = state_ops.assign(v_fp, tf.concat([v_fp, fp_tensor], axis=0),
357 |                                  validate_shape=False)
358 | 
359 |         # Precision and recall computations.
360 |         # r = _precision_recall(nobjects_op, scores_op, tp_op, fp_op, 'value')
361 |         r = _precision_recall(v_nobjects, v_ndetections, v_scores,
362 |                               v_tp, v_fp, 'value')
363 | 
364 |         with ops.control_dependencies([nobjects_op, ndetections_op,
365 |                                        scores_op, tp_op, fp_op]):
366 |             update_op = _precision_recall(nobjects_op, ndetections_op,
367 |                                           scores_op, tp_op, fp_op, 'update_op')
368 | 
369 |         # update_op = tf.Print(update_op,
370 |         #                      [tf.reduce_sum(tf.cast(mask, tf.int64)),
371 |         #                       tf.reduce_sum(tf.cast(mask2, tf.int64)),
372 |         #                       tf.reduce_min(rscores),
373 |         #                       tf.reduce_sum(n_gbboxes)],
374 |         #                      'Metric: ')
375 |         # Some debugging stuff!
376 |         # update_op = tf.Print(update_op,
377 |         #                      [tf.shape(tp_op),
378 |         #                       tf.reduce_sum(tf.cast(tp_op, tf.int64), axis=0)],
379 |         #                      'TP and FP shape: ')
380 |         # update_op[0] = tf.Print(update_op,
381 |         #                      [nobjects_op],
382 |         #                      '# Groundtruth bboxes: ')
383 |         # update_op = tf.Print(update_op,
384 |         #                      [update_op[0][0],
385 |         #                       update_op[0][-1],
386 |         #                       tf.reduce_min(update_op[0]),
387 |         #                       tf.reduce_max(update_op[0]),
388 |         #                       tf.reduce_min(update_op[1]),
389 |         #                       tf.reduce_max(update_op[1])],
390 |         #                      'Precision and recall :')
391 | 
392 |         if metrics_collections:
393 |             ops.add_to_collections(metrics_collections, r)
394 |         if updates_collections:
395 |             ops.add_to_collections(updates_collections, update_op)
396 |         return r, update_op
397 | 
398 | 
--------------------------------------------------------------------------------
/tf_extended/tensors.py:
--------------------------------------------------------------------------------
1 | # Copyright 2017 Paul Balanca. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """TF Extended: additional tensors operations.
16 | """
17 | import tensorflow as tf
18 | 
19 | from tensorflow.contrib.framework.python.ops import variables as contrib_variables
20 | from tensorflow.contrib.metrics.python.ops import set_ops
21 | from tensorflow.python.framework import dtypes
22 | from tensorflow.python.framework import ops
23 | from tensorflow.python.framework import sparse_tensor
24 | from tensorflow.python.ops import array_ops
25 | from tensorflow.python.ops import check_ops
26 | from tensorflow.python.ops import control_flow_ops
27 | from tensorflow.python.ops import math_ops
28 | from tensorflow.python.ops import nn
29 | from tensorflow.python.ops import state_ops
30 | from tensorflow.python.ops import variable_scope
31 | from tensorflow.python.ops import variables
32 | 
33 | 
34 | def get_shape(x, rank=None):
35 |     """Returns the dimensions of a Tensor as a list of integers or scalar tensors.
36 | 
37 |     Args:
38 |       x: N-d Tensor;
39 |       rank: Rank of the Tensor. If None, will try to guess it.
40 |     Returns:
41 |       A list of `[d1, d2, ..., dN]` corresponding to the dimensions of the
42 |       input tensor. Dimensions that are statically known are python integers,
43 |       otherwise they are integer scalar tensors.
44 |     """
45 |     if x.get_shape().is_fully_defined():
46 |         return x.get_shape().as_list()
47 |     else:
48 |         static_shape = x.get_shape()
49 |         if rank is None:
50 |             static_shape = static_shape.as_list()
51 |             rank = len(static_shape)
52 |         else:
53 |             static_shape = x.get_shape().with_rank(rank).as_list()
54 |         dynamic_shape = tf.unstack(tf.shape(x), rank)
55 |         return [s if s is not None else d
56 |                 for s, d in zip(static_shape, dynamic_shape)]
57 | 
58 | 
59 | def pad_axis(x, offset, size, axis=0, name=None):
60 |     """Pad a tensor on an axis, with a given offset and output size.
61 |     The tensor is padded with zero (i.e. CONSTANT mode). Note that if
62 |     `size` is smaller than the existing size plus `offset`, the output
63 |     keeps the larger dimension.
64 | 
65 |     Args:
66 |       x: Tensor to pad;
67 |       offset: Offset to add on the dimension chosen;
68 |       size: Final size of the dimension.
69 |     Return:
70 |       Padded tensor whose dimension on `axis` is `size`, or greater if
71 |       the input vector was larger.
72 |     """
73 |     with tf.name_scope(name, 'pad_axis'):
74 |         shape = get_shape(x)
75 |         rank = len(shape)
76 |         # Padding description.
77 |         new_size = tf.maximum(size-offset-shape[axis], 0)
78 |         pad1 = tf.stack([0]*axis + [offset] + [0]*(rank-axis-1))
79 |         pad2 = tf.stack([0]*axis + [new_size] + [0]*(rank-axis-1))
80 |         paddings = tf.stack([pad1, pad2], axis=1)
81 |         x = tf.pad(x, paddings, mode='CONSTANT')
82 |         # Reshape, to get fully defined shape if possible.
83 |         # TODO: fix with tf.slice
84 |         shape[axis] = size
85 |         x = tf.reshape(x, tf.stack(shape))
86 |         return x
87 | 
88 | 
89 | # def select_at_index(idx, val, t):
90 | #     """Return a tensor.
91 | #     """
92 | #     idx = tf.expand_dims(tf.expand_dims(idx, 0), 0)
93 | #     val = tf.expand_dims(val, 0)
94 | #     t = t + tf.scatter_nd(idx, val, tf.shape(t))
95 | #     return t
96 | 
--------------------------------------------------------------------------------
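pad_axis is the helper that forces a variable number of boxes into fixed-size
tensors so they can be batched. A small sketch with made-up boxes (TF 1.x
session API):

    import tensorflow as tf
    from tf_extended import tensors as tfe_tensors

    boxes = tf.constant([[0.1, 0.1, 0.4, 0.4],
                         [0.2, 0.2, 0.6, 0.6],
                         [0.5, 0.5, 0.9, 0.9]])
    # Pad the first axis out to a fixed budget of 5 rows.
    padded = tfe_tensors.pad_axis(boxes, offset=0, size=5, axis=0)

    with tf.Session() as sess:
        print(sess.run(padded).shape)  # (5, 4): two all-zero rows appended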
/tf_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 Paul Balanca. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Diverse TensorFlow utils, for training, evaluation and so on!
16 | """
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 | import os
21 | from pprint import pprint
22 | 
23 | import tensorflow as tf
24 | from tensorflow.contrib.slim.python.slim.data import parallel_reader
25 | 
26 | slim = tf.contrib.slim
27 | 
28 | 
29 | # =========================================================================== #
30 | # General tools.
31 | # =========================================================================== #
32 | def reshape_list(l, shape=None):
33 |     """Reshape list of (list): 1D to 2D or the other way around.
34 | 
35 |     Args:
36 |       l: List or List of list.
37 |       shape: 1D or 2D shape.
38 |     Return:
39 |       Reshaped list.
40 |     """
41 |     r = []
42 |     if shape is None:
43 |         # Flatten everything.
44 |         for a in l:
45 |             if isinstance(a, (list, tuple)):
46 |                 r = r + list(a)
47 |             else:
48 |                 r.append(a)
49 |     else:
50 |         # Reshape to list of list.
51 |         i = 0
52 |         for s in shape:
53 |             if s == 1:
54 |                 r.append(l[i])
55 |             else:
56 |                 r.append(l[i:i+s])
57 |             i += s
58 |     return r
59 | 
60 | 
61 | # =========================================================================== #
62 | # Training utils.
63 | # =========================================================================== #
64 | def print_configuration(flags, ssd_params, data_sources, save_dir=None):
65 |     """Print the training configuration.
66 |     """
67 |     def print_config(stream=None):
68 |         print('\n# =========================================================================== #', file=stream)
69 |         print('# Training | Evaluation flags:', file=stream)
70 |         print('# =========================================================================== #', file=stream)
71 |         pprint(flags, stream=stream)
72 | 
73 |         print('\n# =========================================================================== #', file=stream)
74 |         print('# SSD net parameters:', file=stream)
75 |         print('# =========================================================================== #', file=stream)
76 |         pprint(dict(ssd_params._asdict()), stream=stream)
77 | 
78 |         print('\n# =========================================================================== #', file=stream)
79 |         print('# Training | Evaluation dataset files:', file=stream)
80 |         print('# =========================================================================== #', file=stream)
81 |         data_files = parallel_reader.get_data_files(data_sources)
82 |         pprint(data_files, stream=stream)
83 |         print('', file=stream)
84 | 
85 |     print_config(None)
86 |     # Save to a text file as well.
87 |     if save_dir is not None:
88 |         if not os.path.exists(save_dir):
89 |             os.makedirs(save_dir)
90 |         path = os.path.join(save_dir, 'training_config.txt')
91 |         with open(path, "w") as out:
92 |             print_config(out)
93 | 
94 | 
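# reshape_list keeps the batching plumbing readable: flatten a nested list of
# (predictions, localisations, ...) tensors before tf.train.batch, then
# rebuild the nesting from the recorded shape afterwards. A quick sketch with
# invented values:
#
#     reshape_list([1, [2, 3], [4, 5, 6]])           # -> [1, 2, 3, 4, 5, 6]
#     reshape_list([1, 2, 3, 4, 5, 6], [1, 2, 3])    # -> [1, [2, 3], [4, 5, 6]]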
95 | def configure_learning_rate(flags, num_samples_per_epoch, global_step):
96 |     """Configures the learning rate.
97 | 
98 |     Args:
99 |       num_samples_per_epoch: The number of samples in each epoch of training.
100 |       global_step: The global_step tensor.
101 |     Returns:
102 |       A `Tensor` representing the learning rate.
103 |     """
104 |     decay_steps = int(num_samples_per_epoch / flags.batch_size *
105 |                       flags.num_epochs_per_decay)
106 | 
107 |     if flags.learning_rate_decay_type == 'exponential':
108 |         return tf.train.exponential_decay(flags.learning_rate,
109 |                                           global_step,
110 |                                           decay_steps,
111 |                                           flags.learning_rate_decay_factor,
112 |                                           staircase=True,
113 |                                           name='exponential_decay_learning_rate')
114 |     elif flags.learning_rate_decay_type == 'fixed':
115 |         return tf.constant(flags.learning_rate, name='fixed_learning_rate')
116 |     elif flags.learning_rate_decay_type == 'polynomial':
117 |         return tf.train.polynomial_decay(flags.learning_rate,
118 |                                          global_step,
119 |                                          decay_steps,
120 |                                          flags.end_learning_rate,
121 |                                          power=1.0,
122 |                                          cycle=False,
123 |                                          name='polynomial_decay_learning_rate')
124 |     else:
125 |         raise ValueError('learning_rate_decay_type [%s] was not recognized' %
126 |                          flags.learning_rate_decay_type)
127 | 
128 | 
129 | def configure_optimizer(flags, learning_rate):
130 |     """Configures the optimizer used for training.
131 | 
132 |     Args:
133 |       learning_rate: A scalar or `Tensor` learning rate.
134 |     Returns:
135 |       An instance of an optimizer.
136 |     """
137 |     if flags.optimizer == 'adadelta':
138 |         optimizer = tf.train.AdadeltaOptimizer(
139 |             learning_rate,
140 |             rho=flags.adadelta_rho,
141 |             epsilon=flags.opt_epsilon)
142 |     elif flags.optimizer == 'adagrad':
143 |         optimizer = tf.train.AdagradOptimizer(
144 |             learning_rate,
145 |             initial_accumulator_value=flags.adagrad_initial_accumulator_value)
146 |     elif flags.optimizer == 'adam':
147 |         optimizer = tf.train.AdamOptimizer(
148 |             learning_rate,
149 |             beta1=flags.adam_beta1,
150 |             beta2=flags.adam_beta2,
151 |             epsilon=flags.opt_epsilon)
152 |     elif flags.optimizer == 'ftrl':
153 |         optimizer = tf.train.FtrlOptimizer(
154 |             learning_rate,
155 |             learning_rate_power=flags.ftrl_learning_rate_power,
156 |             initial_accumulator_value=flags.ftrl_initial_accumulator_value,
157 |             l1_regularization_strength=flags.ftrl_l1,
158 |             l2_regularization_strength=flags.ftrl_l2)
159 |     elif flags.optimizer == 'momentum':
160 |         optimizer = tf.train.MomentumOptimizer(
161 |             learning_rate,
162 |             momentum=flags.momentum,
163 |             name='Momentum')
164 |     elif flags.optimizer == 'rmsprop':
165 |         optimizer = tf.train.RMSPropOptimizer(
166 |             learning_rate,
167 |             decay=flags.rmsprop_decay,
168 |             momentum=flags.rmsprop_momentum,
169 |             epsilon=flags.opt_epsilon)
170 |     elif flags.optimizer == 'sgd':
171 |         optimizer = tf.train.GradientDescentOptimizer(learning_rate)
172 |     else:
173 |         raise ValueError('Optimizer [%s] was not recognized' % flags.optimizer)
174 |     return optimizer
175 | 
176 | 
177 | def add_variables_summaries(learning_rate):
178 |     summaries = []
179 |     for variable in slim.get_model_variables():
180 |         summaries.append(tf.summary.histogram(variable.op.name, variable))
181 |     summaries.append(tf.summary.scalar('training/Learning Rate', learning_rate))
182 |     return summaries
183 | 
184 | 
185 | def update_model_scope(var, ckpt_scope, new_scope):
186 |     # Note: ckpt_scope is unused; the checkpoint scope is hard-coded to 'vgg_16'.
187 |     return var.op.name.replace(new_scope, 'vgg_16')
188 | 
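# Both helpers are driven by command-line flags; a typical call site looks
# like the sketch below (flag container and loss tensor assumed):
#
#     global_step = slim.create_global_step()
#     learning_rate = configure_learning_rate(FLAGS, num_samples, global_step)
#     optimizer = configure_optimizer(FLAGS, learning_rate)
#     train_op = optimizer.minimize(total_loss, global_step=global_step)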
189 | def get_init_fn(flags):
190 |     """Returns a function run by the chief worker to warm-start the training.
191 |     Note that the init_fn is only run when initializing the model during the very
192 |     first global step.
193 | 
194 |     Returns:
195 |       An init function run by the supervisor.
196 |     """
197 |     if flags.checkpoint_path is None:
198 |         return None
199 |     # Warn the user if a checkpoint exists in the train_dir. Then ignore.
200 |     if tf.train.latest_checkpoint(flags.train_dir):
201 |         tf.logging.info(
202 |             'Ignoring --checkpoint_path because a checkpoint already exists in %s'
203 |             % flags.train_dir)
204 |         return None
205 | 
206 |     exclusions = []
207 |     if flags.checkpoint_exclude_scopes:
208 |         exclusions = [scope.strip()
209 |                       for scope in flags.checkpoint_exclude_scopes.split(',')]
210 | 
211 |     # TODO(sguada) variables.filter_variables()
212 |     variables_to_restore = []
213 |     for var in slim.get_model_variables():
214 |         excluded = False
215 |         for exclusion in exclusions:
216 |             if var.op.name.startswith(exclusion):
217 |                 excluded = True
218 |                 break
219 |         if not excluded:
220 |             variables_to_restore.append(var)
221 |     # Change model scope if necessary.
222 |     if flags.checkpoint_model_scope is not None:
223 |         variables_to_restore = \
224 |             {var.op.name.replace(flags.model_name,
225 |                                  flags.checkpoint_model_scope): var
226 |              for var in variables_to_restore}
227 | 
228 | 
229 |     if tf.gfile.IsDirectory(flags.checkpoint_path):
230 |         checkpoint_path = tf.train.latest_checkpoint(flags.checkpoint_path)
231 |     else:
232 |         checkpoint_path = flags.checkpoint_path
233 |     tf.logging.info('Fine-tuning from %s' % checkpoint_path)
234 | 
235 |     return slim.assign_from_checkpoint_fn(
236 |         checkpoint_path,
237 |         variables_to_restore,
238 |         ignore_missing_vars=flags.ignore_missing_vars)
239 | 
240 | 
241 | def get_variables_to_train(flags):
242 |     """Returns a list of variables to train.
243 | 
244 |     Returns:
245 |       A list of variables to train by the optimizer.
246 |     """
247 |     if flags.trainable_scopes is None:
248 |         return tf.trainable_variables()
249 |     else:
250 |         scopes = [scope.strip() for scope in flags.trainable_scopes.split(',')]
251 | 
252 |     variables_to_train = []
253 |     for scope in scopes:
254 |         variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
255 |         variables_to_train.extend(variables)
256 |     return variables_to_train
257 | 
258 | 
259 | # =========================================================================== #
260 | # Evaluation utils.
261 | # =========================================================================== #
262 | 
--------------------------------------------------------------------------------