├── LICENSE ├── README.md ├── Section 1 ├── Section_1-Tensorflow_Image_Captioning.ipynb ├── im2txt │ ├── configuration.py │ ├── data │ │ ├── build_mscoco_data.py │ │ └── download_and_preprocess_mscoco.sh │ ├── evaluate.py │ ├── inference_utils │ │ └── __pycache__ │ │ │ ├── caption_generator.cpython-36.pyc │ │ │ ├── inference_wrapper_base.cpython-36.pyc │ │ │ └── vocabulary.cpython-36.pyc │ ├── inference_wrapper.py │ ├── ops │ │ ├── __pycache__ │ │ │ ├── image_embedding.cpython-36.pyc │ │ │ ├── image_processing.cpython-36.pyc │ │ │ └── inputs.cpython-36.pyc │ │ ├── image_embedding.py │ │ ├── image_embedding_test.py │ │ ├── image_processing.py │ │ └── inputs.py │ ├── show_and_tell_model.py │ ├── show_and_tell_model_test.py │ └── train.py └── test_images │ ├── ballons.jpeg │ ├── bike.jpeg │ ├── fireworks.jpeg │ ├── football.jpeg │ ├── headphones.jpeg │ └── laughing.jpeg ├── Section 2 ├── Packt_CV_w_Py3_adv_projects_Section2_License_plate_recognition.ipynb ├── knn.p └── tests │ ├── Thumbs.db │ ├── p2.jpg │ ├── p5.jpg │ ├── p7.jpg │ └── p9.jpg └── Section 3 ├── AUTHORS ├── config.py ├── dlib_face_recognition_resnet_model_v1.dat ├── models ├── coco │ ├── coco-resnet-101.meta │ ├── download_models.sh │ ├── pairwise │ │ ├── pairwise_stats.mat │ │ ├── spatial_model_cidx_10_12.mat │ │ ├── spatial_model_cidx_10_16.mat │ │ ├── spatial_model_cidx_10_17.mat │ │ ├── spatial_model_cidx_11_12.mat │ │ ├── spatial_model_cidx_11_14.mat │ │ ├── spatial_model_cidx_11_15.mat │ │ ├── spatial_model_cidx_11_16.mat │ │ ├── spatial_model_cidx_12_13.mat │ │ ├── spatial_model_cidx_12_14.mat │ │ ├── spatial_model_cidx_12_15.mat │ │ ├── spatial_model_cidx_12_17.mat │ │ ├── spatial_model_cidx_13_14.mat │ │ ├── spatial_model_cidx_13_16.mat │ │ ├── spatial_model_cidx_13_17.mat │ │ ├── spatial_model_cidx_14_15.mat │ │ ├── spatial_model_cidx_14_16.mat │ │ ├── spatial_model_cidx_14_17.mat │ │ ├── spatial_model_cidx_15_16.mat │ │ ├── spatial_model_cidx_16_17.mat │ │ ├── spatial_model_cidx_1_10.mat │ │ ├── spatial_model_cidx_1_11.mat │ │ ├── spatial_model_cidx_1_12.mat │ │ ├── spatial_model_cidx_1_14.mat │ │ ├── spatial_model_cidx_1_15.mat │ │ ├── spatial_model_cidx_1_17.mat │ │ ├── spatial_model_cidx_1_3.mat │ │ ├── spatial_model_cidx_1_4.mat │ │ ├── spatial_model_cidx_1_8.mat │ │ ├── spatial_model_cidx_1_9.mat │ │ ├── spatial_model_cidx_2_10.mat │ │ ├── spatial_model_cidx_2_11.mat │ │ ├── spatial_model_cidx_2_13.mat │ │ ├── spatial_model_cidx_2_16.mat │ │ ├── spatial_model_cidx_2_3.mat │ │ ├── spatial_model_cidx_2_5.mat │ │ ├── spatial_model_cidx_2_7.mat │ │ ├── spatial_model_cidx_2_9.mat │ │ ├── spatial_model_cidx_3_11.mat │ │ ├── spatial_model_cidx_3_13.mat │ │ ├── spatial_model_cidx_3_14.mat │ │ ├── spatial_model_cidx_3_16.mat │ │ ├── spatial_model_cidx_3_4.mat │ │ ├── spatial_model_cidx_3_5.mat │ │ ├── spatial_model_cidx_3_7.mat │ │ ├── spatial_model_cidx_3_8.mat │ │ ├── spatial_model_cidx_4_10.mat │ │ ├── spatial_model_cidx_4_12.mat │ │ ├── spatial_model_cidx_4_14.mat │ │ ├── spatial_model_cidx_4_15.mat │ │ ├── spatial_model_cidx_4_16.mat │ │ ├── spatial_model_cidx_4_6.mat │ │ ├── spatial_model_cidx_4_7.mat │ │ ├── spatial_model_cidx_4_9.mat │ │ ├── spatial_model_cidx_5_10.mat │ │ ├── spatial_model_cidx_5_11.mat │ │ ├── spatial_model_cidx_5_12.mat │ │ ├── spatial_model_cidx_5_14.mat │ │ ├── spatial_model_cidx_5_6.mat │ │ ├── spatial_model_cidx_5_8.mat │ │ ├── spatial_model_cidx_5_9.mat │ │ ├── spatial_model_cidx_6_10.mat │ │ ├── spatial_model_cidx_6_13.mat │ │ ├── spatial_model_cidx_6_14.mat │ │ ├── 
spatial_model_cidx_6_15.mat │ │ ├── spatial_model_cidx_6_16.mat │ │ ├── spatial_model_cidx_6_17.mat │ │ ├── spatial_model_cidx_6_8.mat │ │ ├── spatial_model_cidx_6_9.mat │ │ ├── spatial_model_cidx_7_10.mat │ │ ├── spatial_model_cidx_7_11.mat │ │ ├── spatial_model_cidx_7_13.mat │ │ ├── spatial_model_cidx_7_15.mat │ │ ├── spatial_model_cidx_7_17.mat │ │ ├── spatial_model_cidx_8_10.mat │ │ ├── spatial_model_cidx_8_12.mat │ │ ├── spatial_model_cidx_8_13.mat │ │ ├── spatial_model_cidx_8_14.mat │ │ ├── spatial_model_cidx_8_15.mat │ │ ├── spatial_model_cidx_8_17.mat │ │ ├── spatial_model_cidx_9_14.mat │ │ ├── spatial_model_cidx_9_16.mat │ │ └── spatial_model_cidx_9_17.mat │ ├── pairwise_coco.tar.gz │ └── train │ │ └── pose_cfg.yaml ├── mpii │ ├── download_models.sh │ ├── mpii-single-resnet-101.index │ ├── mpii-single-resnet-101.meta │ ├── test │ │ └── pose_cfg.yaml │ └── train │ │ └── pose_cfg.yaml └── pretrained │ └── download.sh ├── pexels-photo-712521.jpeg ├── pexels-photo-776615.jpeg └── testcases └── vids ├── boy_walking.mp4 └── sidewalk.mp4 /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Advanced Computer Vision Projects [Video] 2 | This is the code repository for [Advanced Computer Vision Projects [Video]](https://www.packtpub.com/big-data-and-business-intelligence/advanced-computer-vision-projects-video?utm_source=github&utm_medium=repository&utm_campaign=9781788620772), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the video course from start to finish. 3 | ## About the Video Course 4 | Python’s wealth of powerful packages along with its clear syntax make state-of-the art computer vision and machine learning accessible to developers with a variety of backgrounds. This video course will equip you with the tools and skills to utilize the latest and greatest algorithms in computer vision, making applications that weren’t possible until recent years. 5 | 6 | In this course, you’ll continue to use TensorFlow and extend it to generate full captions from images. 
Later, you’ll see how to read license plate text from real-world images using Google’s Tesseract software. Finally, you’ll see how to track human body poses using “DeeperCut” within TensorFlow.

By the end of this course, you’ll have developed an application that can estimate human poses within images, and you’ll be equipped with best practices in computer vision and machine learning.
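As a taste of the Section 2 material (license plate recognition), the snippet below sketches how plate text can be read from Python with Tesseract. It is only an illustrative sketch, not code from this repository: the `pytesseract` wrapper and the `plate.jpg` path are assumptions made for the example.

```python
# Illustrative sketch (not part of this repo): OCR on a cropped plate image.
# Assumes Tesseract plus the pytesseract and opencv-python packages are installed;
# 'plate.jpg' is a placeholder for any cropped license plate image.
import cv2
import pytesseract

img = cv2.imread('plate.jpg')
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Otsu thresholding tends to help Tesseract on high-contrast plates.
_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
# --psm 7 tells Tesseract to treat the image as a single line of text.
text = pytesseract.image_to_string(binary, config='--psm 7')
print(text.strip())
```

In practice the OCR call is the easy part; locating and cropping the plate region in a real-world photo is the harder problem that the Section 2 notebook works through.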

## What You Will Learn

## Instructions and Navigation
### Assumed Knowledge
To fully benefit from the coverage included in this course, you will need:<br/>
This video course is for Python developers who wish to learn the latest cutting-edge algorithms to solve computer vision problems that were impossible until recently.
### Technical Requirements
25 | This course has the following software requirements: 26 | 27 | This course has been tested on the following system configuration: ● OS: Windows 10 ● Processor: Intel i7 4th generation mobile ● Memory: 32 GB ● Hard Disk Space: 1 TB ● Video Card: GeForce GTX 970m 28 | 29 | ## Related Products 30 | * [Computer Vision Projects with Python 3 [Video]](https://www.packtpub.com/big-data-and-business-intelligence/computer-vision-projects-python-3-video?utm_source=github&utm_medium=repository&utm_campaign=9781788835565) 31 | 32 | * [Real-World Machine Learning Projects with Scikit-Learn [Video]](https://www.packtpub.com/big-data-and-business-intelligence/real-world-machine-learning-projects-scikit-learn-video?utm_source=github&utm_medium=repository&utm_campaign=9781789131222) 33 | 34 | * [Java Machine Learning for Computer Vision [Video]](https://www.packtpub.com/big-data-and-business-intelligence/java-machine-learning-computer-vision-video?utm_source=github&utm_medium=repository&utm_campaign=9781789130652) 35 | 36 | -------------------------------------------------------------------------------- /Section 1/Section_1-Tensorflow_Image_Captioning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Section One – Image Captioning with Tensorflow" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# load essential libraries\n", 17 | "import math\n", 18 | "import os\n", 19 | "\n", 20 | "import tensorflow as tf\n", 21 | "\n", 22 | "%pylab inline" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# load Tensorflow/Google Brain base code\n", 32 | "# https://github.com/tensorflow/models/tree/master/research/im2txt\n", 33 | "\n", 34 | "from im2txt import configuration\n", 35 | "from im2txt import inference_wrapper\n", 36 | "from im2txt.inference_utils import caption_generator\n", 37 | "from im2txt.inference_utils import vocabulary" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# tell our function where to find the trained model and vocabulary\n", 47 | "checkpoint_path = './model'\n", 48 | "vocab_file = './model/word_counts.txt'" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# this is the function we'll call to produce our captions \n", 58 | "# given input file name(s) -- separate file names by a ,\n", 59 | "# if more than one\n", 60 | "\n", 61 | "def gen_caption(input_files):\n", 62 | " # only print serious log messages\n", 63 | " tf.logging.set_verbosity(tf.logging.FATAL)\n", 64 | " # load our pretrained model\n", 65 | " g = tf.Graph()\n", 66 | " with g.as_default():\n", 67 | " model = inference_wrapper.InferenceWrapper()\n", 68 | " restore_fn = model.build_graph_from_config(configuration.ModelConfig(),\n", 69 | " checkpoint_path)\n", 70 | " g.finalize()\n", 71 | "\n", 72 | " # Create the vocabulary.\n", 73 | " vocab = vocabulary.Vocabulary(vocab_file)\n", 74 | "\n", 75 | " filenames = []\n", 76 | " for file_pattern in input_files.split(\",\"):\n", 77 | " filenames.extend(tf.gfile.Glob(file_pattern))\n", 78 | " tf.logging.info(\"Running caption generation on %d files matching %s\",\n", 79 | " len(filenames), 
input_files)\n", 80 | "\n", 81 | " with tf.Session(graph=g) as sess:\n", 82 | " # Load the model from checkpoint.\n", 83 | " restore_fn(sess)\n", 84 | "\n", 85 | " # Prepare the caption generator. Here we are implicitly using the default\n", 86 | " # beam search parameters. See caption_generator.py for a description of the\n", 87 | " # available beam search parameters.\n", 88 | " generator = caption_generator.CaptionGenerator(model, vocab)\n", 89 | " \n", 90 | " captionlist = []\n", 91 | "\n", 92 | " for filename in filenames:\n", 93 | " with tf.gfile.GFile(filename, \"rb\") as f:\n", 94 | " image = f.read()\n", 95 | " captions = generator.beam_search(sess, image)\n", 96 | " print(\"Captions for image %s:\" % os.path.basename(filename))\n", 97 | " for i, caption in enumerate(captions):\n", 98 | " # Ignore begin and end words.\n", 99 | " sentence = [vocab.id_to_word(w) for w in caption.sentence[1:-1]]\n", 100 | " sentence = \" \".join(sentence)\n", 101 | " print(\" %d) %s (p=%f)\" % (i, sentence, math.exp(caption.logprob)))\n", 102 | " captionlist.append(sentence)\n", 103 | " return captionlist" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "testfile = 'test_images/ballons.jpeg'\n", 113 | "\n", 114 | "figure()\n", 115 | "imshow(imread(testfile))\n", 116 | "\n", 117 | "capts = gen_caption(testfile)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "input_files = 'test_images/ballons.jpeg,test_images/bike.jpeg,test_images/dog.jpeg,test_images/fireworks.jpeg,test_images/football.jpeg,test_images/giraffes.jpeg,test_images/headphones.jpeg,test_images/laughing.jpeg,test_images/objects.jpeg,test_images/snowboard.jpeg,test_images/surfing.jpeg'\n", 127 | "\n", 128 | "capts = gen_caption(input_files)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "\n", 143 | "\n", 144 | "\n", 145 | "

\n", 146 | "Retraining the image captioner" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 1, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "data": { 156 | "text/plain": [ 157 | "True" 158 | ] 159 | }, 160 | "execution_count": 1, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "# First download pretrained Inception (v3) model\n", 167 | "\n", 168 | "import webbrowser \n", 169 | "webbrowser.open(\"http://download.tensorflow.org/models/inception_v3_2016_08_28.tar.gz\")\n", 170 | "\n", 171 | "# Completely unzip tar.gz file to get inception_v3.ckpt,\n", 172 | "# --recommend storing in im2txt/data directory" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "# Now gather and prepare the mscoco data\n", 182 | "\n", 183 | "# Comment out cd magic command if already in data directory\n", 184 | "%cd im2txt/data\n", 185 | "# This command will take an hour or more to run typically.\n", 186 | "# Note, you will need a lot of HD space (>100 GB)!\n", 187 | "%run build_mscoco_data.py\n", 188 | "\n", 189 | "# At this point you have files in im2txt/data/mscoco/raw-data that you can train\n", 190 | "# on, or you can substitute your own data\n", 191 | "\n", 192 | "%cd .." 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 2, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "# load needed modules\n", 202 | "\n", 203 | "import tensorflow as tf\n", 204 | "\n", 205 | "from im2txt import configuration\n", 206 | "from im2txt import show_and_tell_model" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 3, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# Define (but don't run yet) our captioning training function\n", 216 | "def train():\n", 217 | " model_config = configuration.ModelConfig()\n", 218 | " model_config.input_file_pattern = input_file_pattern\n", 219 | " model_config.inception_checkpoint_file = inception_checkpoint_file\n", 220 | " training_config = configuration.TrainingConfig()\n", 221 | "\n", 222 | " # Create training directory.\n", 223 | " train_dir = train_dir\n", 224 | " if not tf.gfile.IsDirectory(train_dir):\n", 225 | " tf.logging.info(\"Creating training directory: %s\", train_dir)\n", 226 | " tf.gfile.MakeDirs(train_dir)\n", 227 | "\n", 228 | " # Build the TensorFlow graph.\n", 229 | " g = tf.Graph()\n", 230 | " with g.as_default():\n", 231 | " # Build the model.\n", 232 | " model = show_and_tell_model.ShowAndTellModel(\n", 233 | " model_config, mode=\"train\", train_inception=train_inception)\n", 234 | " model.build()\n", 235 | "\n", 236 | " # Set up the learning rate.\n", 237 | " learning_rate_decay_fn = None\n", 238 | " if train_inception:\n", 239 | " learning_rate = tf.constant(training_config.train_inception_learning_rate)\n", 240 | " else:\n", 241 | " learning_rate = tf.constant(training_config.initial_learning_rate)\n", 242 | " if training_config.learning_rate_decay_factor > 0:\n", 243 | " num_batches_per_epoch = (training_config.num_examples_per_epoch /\n", 244 | " model_config.batch_size)\n", 245 | " decay_steps = int(num_batches_per_epoch *\n", 246 | " training_config.num_epochs_per_decay)\n", 247 | "\n", 248 | " def _learning_rate_decay_fn(learning_rate, global_step):\n", 249 | " return tf.train.exponential_decay(\n", 250 | " learning_rate,\n", 251 | " global_step,\n", 252 | " decay_steps=decay_steps,\n", 
253 | " decay_rate=training_config.learning_rate_decay_factor,\n", 254 | " staircase=True)\n", 255 | "\n", 256 | " learning_rate_decay_fn = _learning_rate_decay_fn\n", 257 | "\n", 258 | " # Set up the training ops.\n", 259 | " train_op = tf.contrib.layers.optimize_loss(\n", 260 | " loss=model.total_loss,\n", 261 | " global_step=model.global_step,\n", 262 | " learning_rate=learning_rate,\n", 263 | " optimizer=training_config.optimizer,\n", 264 | " clip_gradients=training_config.clip_gradients,\n", 265 | " learning_rate_decay_fn=learning_rate_decay_fn)\n", 266 | "\n", 267 | " # Set up the Saver for saving and restoring model checkpoints.\n", 268 | " saver = tf.train.Saver(max_to_keep=training_config.max_checkpoints_to_keep)\n", 269 | "\n", 270 | " # Run training.\n", 271 | " tf.contrib.slim.learning.train(\n", 272 | " train_op,\n", 273 | " train_dir,\n", 274 | " log_every_n_steps=log_every_n_steps,\n", 275 | " graph=g,\n", 276 | " global_step=model.global_step,\n", 277 | " number_of_steps=number_of_steps,\n", 278 | " init_fn=model.init_fn,\n", 279 | " saver=saver)\n", 280 | "\n" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "# Initial training\n", 290 | "input_file_pattern = 'im2txt/data/mscoco/train-?????-of-00256'\n", 291 | "\n", 292 | "# change these if you put your stuff somewhere else\n", 293 | "inception_checkpoint_file = 'im2txt/data/inception_v3.ckpt'\n", 294 | "train_dir = 'im2txt/model'\n", 295 | "\n", 296 | "# Don't train inception for initial run\n", 297 | "train_inception = False\n", 298 | "number_of_steps = 1000000\n", 299 | "log_every_n_steps = 1\n", 300 | "\n", 301 | "# Now run the training (warning: takes days-to-weeks!!!)\n", 302 | "train()" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "# Fine tuning\n", 312 | "input_file_pattern = 'im2txt/data/mscoco/train-?????-of-00256'\n", 313 | "\n", 314 | "# change these if you put your stuff somewhere else\n", 315 | "inception_checkpoint_file = 'im2txt/data/inception_v3.ckpt'\n", 316 | "train_dir = 'im2txt/model'\n", 317 | "\n", 318 | "# This will refine our results\n", 319 | "train_inception = True\n", 320 | "number_of_steps = 3000000\n", 321 | "log_every_n_steps = 1\n", 322 | "\n", 323 | "# Now run the training (warning: takes even longer than initial training!!!)\n", 324 | "train()" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "# If you completed this, you can go back to the start of this notebook and \n", 334 | "# point checkpoint_path and vocab_file to your generated files." 
335 | ] 336 | } 337 | ], 338 | "metadata": { 339 | "kernelspec": { 340 | "display_name": "Python 3", 341 | "language": "python", 342 | "name": "python3" 343 | }, 344 | "language_info": { 345 | "codemirror_mode": { 346 | "name": "ipython", 347 | "version": 3 348 | }, 349 | "file_extension": ".py", 350 | "mimetype": "text/x-python", 351 | "name": "python", 352 | "nbconvert_exporter": "python", 353 | "pygments_lexer": "ipython3", 354 | "version": "3.6.5" 355 | } 356 | }, 357 | "nbformat": 4, 358 | "nbformat_minor": 2 359 | } 360 | -------------------------------------------------------------------------------- /Section 1/im2txt/configuration.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Image-to-text model and training configurations.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | class ModelConfig(object): 24 | """Wrapper class for model hyperparameters.""" 25 | 26 | def __init__(self): 27 | """Sets the default model hyperparameters.""" 28 | # File pattern of sharded TFRecord file containing SequenceExample protos. 29 | # Must be provided in training and evaluation modes. 30 | self.input_file_pattern = None 31 | 32 | # Image format ("jpeg" or "png"). 33 | self.image_format = "jpeg" 34 | 35 | # Approximate number of values per input shard. Used to ensure sufficient 36 | # mixing between shards in training. 37 | self.values_per_input_shard = 2300 38 | # Minimum number of shards to keep in the input queue. 39 | self.input_queue_capacity_factor = 2 40 | # Number of threads for prefetching SequenceExample protos. 41 | self.num_input_reader_threads = 1 42 | 43 | # Name of the SequenceExample context feature containing image data. 44 | self.image_feature_name = "image/data" 45 | # Name of the SequenceExample feature list containing integer captions. 46 | self.caption_feature_name = "image/caption_ids" 47 | 48 | # Number of unique words in the vocab (plus 1, for ). 49 | # The default value is larger than the expected actual vocab size to allow 50 | # for differences between tokenizer versions used in preprocessing. There is 51 | # no harm in using a value greater than the actual vocab size, but using a 52 | # value less than the actual vocab size will result in an error. 53 | self.vocab_size = 12000 54 | 55 | # Number of threads for image preprocessing. Should be a multiple of 2. 56 | self.num_preprocess_threads = 4 57 | 58 | # Batch size. 59 | self.batch_size = 32 60 | 61 | # File containing an Inception v3 checkpoint to initialize the variables 62 | # of the Inception model. Must be provided when starting training for the 63 | # first time. 
64 | self.inception_checkpoint_file = None 65 | 66 | # Dimensions of Inception v3 input images. 67 | self.image_height = 299 68 | self.image_width = 299 69 | 70 | # Scale used to initialize model variables. 71 | self.initializer_scale = 0.08 72 | 73 | # LSTM input and output dimensionality, respectively. 74 | self.embedding_size = 512 75 | self.num_lstm_units = 512 76 | 77 | # If < 1.0, the dropout keep probability applied to LSTM variables. 78 | self.lstm_dropout_keep_prob = 0.7 79 | 80 | 81 | class TrainingConfig(object): 82 | """Wrapper class for training hyperparameters.""" 83 | 84 | def __init__(self): 85 | """Sets the default training hyperparameters.""" 86 | # Number of examples per epoch of training data. 87 | self.num_examples_per_epoch = 586363 88 | 89 | # Optimizer for training the model. 90 | self.optimizer = "SGD" 91 | 92 | # Learning rate for the initial phase of training. 93 | self.initial_learning_rate = 2.0 94 | self.learning_rate_decay_factor = 0.5 95 | self.num_epochs_per_decay = 8.0 96 | 97 | # Learning rate when fine tuning the Inception v3 parameters. 98 | self.train_inception_learning_rate = 0.0005 99 | 100 | # If not None, clip gradients to this value. 101 | self.clip_gradients = 5.0 102 | 103 | # How many model checkpoints to keep. 104 | self.max_checkpoints_to_keep = 5 105 | -------------------------------------------------------------------------------- /Section 1/im2txt/data/build_mscoco_data.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Converts MSCOCO data to TFRecord file format with SequenceExample protos. 16 | 17 | The MSCOCO images are expected to reside in JPEG files located in the following 18 | directory structure: 19 | 20 | train_image_dir/COCO_train2014_000000000151.jpg 21 | train_image_dir/COCO_train2014_000000000260.jpg 22 | ... 23 | 24 | and 25 | 26 | val_image_dir/COCO_val2014_000000000042.jpg 27 | val_image_dir/COCO_val2014_000000000073.jpg 28 | ... 29 | 30 | The MSCOCO annotations JSON files are expected to reside in train_captions_file 31 | and val_captions_file respectively. 32 | 33 | This script converts the combined MSCOCO data into sharded data files consisting 34 | of 256, 4 and 8 TFRecord files, respectively: 35 | 36 | output_dir/train-00000-of-00256 37 | output_dir/train-00001-of-00256 38 | ... 39 | output_dir/train-00255-of-00256 40 | 41 | and 42 | 43 | output_dir/val-00000-of-00004 44 | ... 45 | output_dir/val-00003-of-00004 46 | 47 | and 48 | 49 | output_dir/test-00000-of-00008 50 | ... 51 | output_dir/test-00007-of-00008 52 | 53 | Each TFRecord file contains ~2300 records. Each record within the TFRecord file 54 | is a serialized SequenceExample proto consisting of precisely one image-caption 55 | pair. 
Note that each image has multiple captions (usually 5) and therefore each 56 | image is replicated multiple times in the TFRecord files. 57 | 58 | The SequenceExample proto contains the following fields: 59 | 60 | context: 61 | image/image_id: integer MSCOCO image identifier 62 | image/data: string containing JPEG encoded image in RGB colorspace 63 | 64 | feature_lists: 65 | image/caption: list of strings containing the (tokenized) caption words 66 | image/caption_ids: list of integer ids corresponding to the caption words 67 | 68 | The captions are tokenized using the NLTK (http://www.nltk.org/) word tokenizer. 69 | The vocabulary of word identifiers is constructed from the sorted list (by 70 | descending frequency) of word tokens in the training set. Only tokens appearing 71 | at least 4 times are considered; all other words get the "unknown" word id. 72 | 73 | NOTE: This script will consume around 100GB of disk space because each image 74 | in the MSCOCO dataset is replicated ~5 times (once per caption) in the output. 75 | This is done for two reasons: 76 | 1. In order to better shuffle the training data. 77 | 2. It makes it easier to perform asynchronous preprocessing of each image in 78 | TensorFlow. 79 | 80 | Running this script using 16 threads may take around 1 hour on a HP Z420. 81 | """ 82 | 83 | from __future__ import absolute_import 84 | from __future__ import division 85 | from __future__ import print_function 86 | 87 | from collections import Counter 88 | from collections import namedtuple 89 | from datetime import datetime 90 | import json 91 | import os.path 92 | import random 93 | import sys 94 | import threading 95 | 96 | 97 | 98 | import nltk.tokenize 99 | import numpy as np 100 | from six.moves import xrange 101 | import tensorflow as tf 102 | 103 | tf.flags.DEFINE_string("train_image_dir", "/tmp/train2014/", 104 | "Training image directory.") 105 | tf.flags.DEFINE_string("val_image_dir", "/tmp/val2014", 106 | "Validation image directory.") 107 | 108 | tf.flags.DEFINE_string("train_captions_file", "/tmp/captions_train2014.json", 109 | "Training captions JSON file.") 110 | tf.flags.DEFINE_string("val_captions_file", "/tmp/captions_val2014.json", 111 | "Validation captions JSON file.") 112 | 113 | tf.flags.DEFINE_string("output_dir", "/tmp/", "Output data directory.") 114 | 115 | tf.flags.DEFINE_integer("train_shards", 256, 116 | "Number of shards in training TFRecord files.") 117 | tf.flags.DEFINE_integer("val_shards", 4, 118 | "Number of shards in validation TFRecord files.") 119 | tf.flags.DEFINE_integer("test_shards", 8, 120 | "Number of shards in testing TFRecord files.") 121 | 122 | tf.flags.DEFINE_string("start_word", "", 123 | "Special word added to the beginning of each sentence.") 124 | tf.flags.DEFINE_string("end_word", "", 125 | "Special word added to the end of each sentence.") 126 | tf.flags.DEFINE_string("unknown_word", "", 127 | "Special word meaning 'unknown'.") 128 | tf.flags.DEFINE_integer("min_word_count", 4, 129 | "The minimum number of occurrences of each word in the " 130 | "training set for inclusion in the vocabulary.") 131 | tf.flags.DEFINE_string("word_counts_output_file", "/tmp/word_counts.txt", 132 | "Output vocabulary file of word counts.") 133 | 134 | tf.flags.DEFINE_integer("num_threads", 8, 135 | "Number of threads to preprocess the images.") 136 | 137 | FLAGS = tf.flags.FLAGS 138 | 139 | ImageMetadata = namedtuple("ImageMetadata", 140 | ["image_id", "filename", "captions"]) 141 | 142 | 143 | class Vocabulary(object): 144 | """Simple 
vocabulary wrapper.""" 145 | 146 | def __init__(self, vocab, unk_id): 147 | """Initializes the vocabulary. 148 | 149 | Args: 150 | vocab: A dictionary of word to word_id. 151 | unk_id: Id of the special 'unknown' word. 152 | """ 153 | self._vocab = vocab 154 | self._unk_id = unk_id 155 | 156 | def word_to_id(self, word): 157 | """Returns the integer id of a word string.""" 158 | if word in self._vocab: 159 | return self._vocab[word] 160 | else: 161 | return self._unk_id 162 | 163 | 164 | class ImageDecoder(object): 165 | """Helper class for decoding images in TensorFlow.""" 166 | 167 | def __init__(self): 168 | # Create a single TensorFlow Session for all image decoding calls. 169 | self._sess = tf.Session() 170 | 171 | # TensorFlow ops for JPEG decoding. 172 | self._encoded_jpeg = tf.placeholder(dtype=tf.string) 173 | self._decode_jpeg = tf.image.decode_jpeg(self._encoded_jpeg, channels=3) 174 | 175 | def decode_jpeg(self, encoded_jpeg): 176 | image = self._sess.run(self._decode_jpeg, 177 | feed_dict={self._encoded_jpeg: encoded_jpeg}) 178 | assert len(image.shape) == 3 179 | assert image.shape[2] == 3 180 | return image 181 | 182 | 183 | def _int64_feature(value): 184 | """Wrapper for inserting an int64 Feature into a SequenceExample proto.""" 185 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 186 | 187 | 188 | def _bytes_feature(value): 189 | """Wrapper for inserting a bytes Feature into a SequenceExample proto.""" 190 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[str(value)])) 191 | 192 | 193 | def _int64_feature_list(values): 194 | """Wrapper for inserting an int64 FeatureList into a SequenceExample proto.""" 195 | return tf.train.FeatureList(feature=[_int64_feature(v) for v in values]) 196 | 197 | 198 | def _bytes_feature_list(values): 199 | """Wrapper for inserting a bytes FeatureList into a SequenceExample proto.""" 200 | return tf.train.FeatureList(feature=[_bytes_feature(v) for v in values]) 201 | 202 | 203 | def _to_sequence_example(image, decoder, vocab): 204 | """Builds a SequenceExample proto for an image-caption pair. 205 | 206 | Args: 207 | image: An ImageMetadata object. 208 | decoder: An ImageDecoder object. 209 | vocab: A Vocabulary object. 210 | 211 | Returns: 212 | A SequenceExample proto. 213 | """ 214 | with tf.gfile.FastGFile(image.filename, "r") as f: 215 | encoded_image = f.read() 216 | 217 | try: 218 | decoder.decode_jpeg(encoded_image) 219 | except (tf.errors.InvalidArgumentError, AssertionError): 220 | print("Skipping file with invalid JPEG data: %s" % image.filename) 221 | return 222 | 223 | context = tf.train.Features(feature={ 224 | "image/image_id": _int64_feature(image.image_id), 225 | "image/data": _bytes_feature(encoded_image), 226 | }) 227 | 228 | assert len(image.captions) == 1 229 | caption = image.captions[0] 230 | caption_ids = [vocab.word_to_id(word) for word in caption] 231 | feature_lists = tf.train.FeatureLists(feature_list={ 232 | "image/caption": _bytes_feature_list(caption), 233 | "image/caption_ids": _int64_feature_list(caption_ids) 234 | }) 235 | sequence_example = tf.train.SequenceExample( 236 | context=context, feature_lists=feature_lists) 237 | 238 | return sequence_example 239 | 240 | 241 | def _process_image_files(thread_index, ranges, name, images, decoder, vocab, 242 | num_shards): 243 | """Processes and saves a subset of images as TFRecord files in one thread. 244 | 245 | Args: 246 | thread_index: Integer thread identifier within [0, len(ranges)]. 
247 | ranges: A list of pairs of integers specifying the ranges of the dataset to 248 | process in parallel. 249 | name: Unique identifier specifying the dataset. 250 | images: List of ImageMetadata. 251 | decoder: An ImageDecoder object. 252 | vocab: A Vocabulary object. 253 | num_shards: Integer number of shards for the output files. 254 | """ 255 | # Each thread produces N shards where N = num_shards / num_threads. For 256 | # instance, if num_shards = 128, and num_threads = 2, then the first thread 257 | # would produce shards [0, 64). 258 | num_threads = len(ranges) 259 | assert not num_shards % num_threads 260 | num_shards_per_batch = int(num_shards / num_threads) 261 | 262 | shard_ranges = np.linspace(ranges[thread_index][0], ranges[thread_index][1], 263 | num_shards_per_batch + 1).astype(int) 264 | num_images_in_thread = ranges[thread_index][1] - ranges[thread_index][0] 265 | 266 | counter = 0 267 | for s in xrange(num_shards_per_batch): 268 | # Generate a sharded version of the file name, e.g. 'train-00002-of-00010' 269 | shard = thread_index * num_shards_per_batch + s 270 | output_filename = "%s-%.5d-of-%.5d" % (name, shard, num_shards) 271 | output_file = os.path.join(FLAGS.output_dir, output_filename) 272 | writer = tf.python_io.TFRecordWriter(output_file) 273 | 274 | shard_counter = 0 275 | images_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int) 276 | for i in images_in_shard: 277 | image = images[i] 278 | 279 | sequence_example = _to_sequence_example(image, decoder, vocab) 280 | if sequence_example is not None: 281 | writer.write(sequence_example.SerializeToString()) 282 | shard_counter += 1 283 | counter += 1 284 | 285 | if not counter % 1000: 286 | print("%s [thread %d]: Processed %d of %d items in thread batch." % 287 | (datetime.now(), thread_index, counter, num_images_in_thread)) 288 | sys.stdout.flush() 289 | 290 | writer.close() 291 | print("%s [thread %d]: Wrote %d image-caption pairs to %s" % 292 | (datetime.now(), thread_index, shard_counter, output_file)) 293 | sys.stdout.flush() 294 | shard_counter = 0 295 | print("%s [thread %d]: Wrote %d image-caption pairs to %d shards." % 296 | (datetime.now(), thread_index, counter, num_shards_per_batch)) 297 | sys.stdout.flush() 298 | 299 | 300 | def _process_dataset(name, images, vocab, num_shards): 301 | """Processes a complete data set and saves it as a TFRecord. 302 | 303 | Args: 304 | name: Unique identifier specifying the dataset. 305 | images: List of ImageMetadata. 306 | vocab: A Vocabulary object. 307 | num_shards: Integer number of shards for the output files. 308 | """ 309 | # Break up each image into a separate entity for each caption. 310 | images = [ImageMetadata(image.image_id, image.filename, [caption]) 311 | for image in images for caption in image.captions] 312 | 313 | # Shuffle the ordering of images. Make the randomization repeatable. 314 | random.seed(12345) 315 | random.shuffle(images) 316 | 317 | # Break the images into num_threads batches. Batch i is defined as 318 | # images[ranges[i][0]:ranges[i][1]]. 319 | num_threads = min(num_shards, FLAGS.num_threads) 320 | spacing = np.linspace(0, len(images), num_threads + 1).astype(np.int) 321 | ranges = [] 322 | threads = [] 323 | for i in xrange(len(spacing) - 1): 324 | ranges.append([spacing[i], spacing[i + 1]]) 325 | 326 | # Create a mechanism for monitoring when all threads are finished. 327 | coord = tf.train.Coordinator() 328 | 329 | # Create a utility for decoding JPEG images to run sanity checks. 
330 | decoder = ImageDecoder() 331 | 332 | # Launch a thread for each batch. 333 | print("Launching %d threads for spacings: %s" % (num_threads, ranges)) 334 | for thread_index in xrange(len(ranges)): 335 | args = (thread_index, ranges, name, images, decoder, vocab, num_shards) 336 | t = threading.Thread(target=_process_image_files, args=args) 337 | t.start() 338 | threads.append(t) 339 | 340 | # Wait for all the threads to terminate. 341 | coord.join(threads) 342 | print("%s: Finished processing all %d image-caption pairs in data set '%s'." % 343 | (datetime.now(), len(images), name)) 344 | 345 | 346 | def _create_vocab(captions): 347 | """Creates the vocabulary of word to word_id. 348 | 349 | The vocabulary is saved to disk in a text file of word counts. The id of each 350 | word in the file is its corresponding 0-based line number. 351 | 352 | Args: 353 | captions: A list of lists of strings. 354 | 355 | Returns: 356 | A Vocabulary object. 357 | """ 358 | print("Creating vocabulary.") 359 | counter = Counter() 360 | for c in captions: 361 | counter.update(c) 362 | print("Total words:", len(counter)) 363 | 364 | # Filter uncommon words and sort by descending count. 365 | word_counts = [x for x in counter.items() if x[1] >= FLAGS.min_word_count] 366 | word_counts.sort(key=lambda x: x[1], reverse=True) 367 | print("Words in vocabulary:", len(word_counts)) 368 | 369 | # Write out the word counts file. 370 | with tf.gfile.FastGFile(FLAGS.word_counts_output_file, "w") as f: 371 | f.write("\n".join(["%s %d" % (w, c) for w, c in word_counts])) 372 | print("Wrote vocabulary file:", FLAGS.word_counts_output_file) 373 | 374 | # Create the vocabulary dictionary. 375 | reverse_vocab = [x[0] for x in word_counts] 376 | unk_id = len(reverse_vocab) 377 | vocab_dict = dict([(x, y) for (y, x) in enumerate(reverse_vocab)]) 378 | vocab = Vocabulary(vocab_dict, unk_id) 379 | 380 | return vocab 381 | 382 | 383 | def _process_caption(caption): 384 | """Processes a caption string into a list of tonenized words. 385 | 386 | Args: 387 | caption: A string caption. 388 | 389 | Returns: 390 | A list of strings; the tokenized caption. 391 | """ 392 | tokenized_caption = [FLAGS.start_word] 393 | tokenized_caption.extend(nltk.tokenize.word_tokenize(caption.lower())) 394 | tokenized_caption.append(FLAGS.end_word) 395 | return tokenized_caption 396 | 397 | 398 | def _load_and_process_metadata(captions_file, image_dir): 399 | """Loads image metadata from a JSON file and processes the captions. 400 | 401 | Args: 402 | captions_file: JSON file containing caption annotations. 403 | image_dir: Directory containing the image files. 404 | 405 | Returns: 406 | A list of ImageMetadata. 407 | """ 408 | with tf.gfile.FastGFile(captions_file, "r") as f: 409 | caption_data = json.load(f) 410 | 411 | # Extract the filenames. 412 | id_to_filename = [(x["id"], x["file_name"]) for x in caption_data["images"]] 413 | 414 | # Extract the captions. Each image_id is associated with multiple captions. 
415 | id_to_captions = {} 416 | for annotation in caption_data["annotations"]: 417 | image_id = annotation["image_id"] 418 | caption = annotation["caption"] 419 | id_to_captions.setdefault(image_id, []) 420 | id_to_captions[image_id].append(caption) 421 | 422 | assert len(id_to_filename) == len(id_to_captions) 423 | assert set([x[0] for x in id_to_filename]) == set(id_to_captions.keys()) 424 | print("Loaded caption metadata for %d images from %s" % 425 | (len(id_to_filename), captions_file)) 426 | 427 | # Process the captions and combine the data into a list of ImageMetadata. 428 | print("Processing captions.") 429 | image_metadata = [] 430 | num_captions = 0 431 | for image_id, base_filename in id_to_filename: 432 | filename = os.path.join(image_dir, base_filename) 433 | captions = [_process_caption(c) for c in id_to_captions[image_id]] 434 | image_metadata.append(ImageMetadata(image_id, filename, captions)) 435 | num_captions += len(captions) 436 | print("Finished processing %d captions for %d images in %s" % 437 | (num_captions, len(id_to_filename), captions_file)) 438 | 439 | return image_metadata 440 | 441 | 442 | def main(unused_argv): 443 | def _is_valid_num_shards(num_shards): 444 | """Returns True if num_shards is compatible with FLAGS.num_threads.""" 445 | return num_shards < FLAGS.num_threads or not num_shards % FLAGS.num_threads 446 | 447 | assert _is_valid_num_shards(FLAGS.train_shards), ( 448 | "Please make the FLAGS.num_threads commensurate with FLAGS.train_shards") 449 | assert _is_valid_num_shards(FLAGS.val_shards), ( 450 | "Please make the FLAGS.num_threads commensurate with FLAGS.val_shards") 451 | assert _is_valid_num_shards(FLAGS.test_shards), ( 452 | "Please make the FLAGS.num_threads commensurate with FLAGS.test_shards") 453 | 454 | if not tf.gfile.IsDirectory(FLAGS.output_dir): 455 | tf.gfile.MakeDirs(FLAGS.output_dir) 456 | 457 | # Load image metadata from caption files. 458 | mscoco_train_dataset = _load_and_process_metadata(FLAGS.train_captions_file, 459 | FLAGS.train_image_dir) 460 | mscoco_val_dataset = _load_and_process_metadata(FLAGS.val_captions_file, 461 | FLAGS.val_image_dir) 462 | 463 | # Redistribute the MSCOCO data as follows: 464 | # train_dataset = 100% of mscoco_train_dataset + 85% of mscoco_val_dataset. 465 | # val_dataset = 5% of mscoco_val_dataset (for validation during training). 466 | # test_dataset = 10% of mscoco_val_dataset (for final evaluation). 467 | train_cutoff = int(0.85 * len(mscoco_val_dataset)) 468 | val_cutoff = int(0.90 * len(mscoco_val_dataset)) 469 | train_dataset = mscoco_train_dataset + mscoco_val_dataset[0:train_cutoff] 470 | val_dataset = mscoco_val_dataset[train_cutoff:val_cutoff] 471 | test_dataset = mscoco_val_dataset[val_cutoff:] 472 | 473 | # Create vocabulary from the training captions. 474 | train_captions = [c for image in train_dataset for c in image.captions] 475 | vocab = _create_vocab(train_captions) 476 | 477 | _process_dataset("train", train_dataset, vocab, FLAGS.train_shards) 478 | _process_dataset("val", val_dataset, vocab, FLAGS.val_shards) 479 | _process_dataset("test", test_dataset, vocab, FLAGS.test_shards) 480 | 481 | 482 | if __name__ == "__main__": 483 | tf.app.run() 484 | -------------------------------------------------------------------------------- /Section 1/im2txt/data/download_and_preprocess_mscoco.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ============================================================================== 16 | 17 | # Script to download and preprocess the MSCOCO data set. 18 | # 19 | # The outputs of this script are sharded TFRecord files containing serialized 20 | # SequenceExample protocol buffers. See build_mscoco_data.py for details of how 21 | # the SequenceExample protocol buffers are constructed. 22 | # 23 | # usage: 24 | # ./download_and_preprocess_mscoco.sh 25 | set -e 26 | 27 | if [ -z "$1" ]; then 28 | echo "usage download_and_preproces_mscoco.sh [data dir]" 29 | exit 30 | fi 31 | 32 | if [ "$(uname)" == "Darwin" ]; then 33 | UNZIP="tar -xf" 34 | else 35 | UNZIP="unzip -nq" 36 | fi 37 | 38 | # Create the output directories. 39 | OUTPUT_DIR="${1%/}" 40 | SCRATCH_DIR="${OUTPUT_DIR}/raw-data" 41 | mkdir -p "${OUTPUT_DIR}" 42 | mkdir -p "${SCRATCH_DIR}" 43 | CURRENT_DIR=$(pwd) 44 | WORK_DIR="$0.runfiles/im2txt/im2txt" 45 | 46 | # Helper function to download and unpack a .zip file. 47 | function download_and_unzip() { 48 | local BASE_URL=${1} 49 | local FILENAME=${2} 50 | 51 | if [ ! -f ${FILENAME} ]; then 52 | echo "Downloading ${FILENAME} to $(pwd)" 53 | wget -nd -c "${BASE_URL}/${FILENAME}" 54 | else 55 | echo "Skipping download of ${FILENAME}" 56 | fi 57 | echo "Unzipping ${FILENAME}" 58 | ${UNZIP} ${FILENAME} 59 | } 60 | 61 | cd ${SCRATCH_DIR} 62 | 63 | # Download the images. 64 | BASE_IMAGE_URL="http://msvocds.blob.core.windows.net/coco2014" 65 | 66 | TRAIN_IMAGE_FILE="train2014.zip" 67 | #download_and_unzip ${BASE_IMAGE_URL} ${TRAIN_IMAGE_FILE} 68 | TRAIN_IMAGE_DIR="${SCRATCH_DIR}/train2014" 69 | 70 | VAL_IMAGE_FILE="val2014.zip" 71 | #download_and_unzip ${BASE_IMAGE_URL} ${VAL_IMAGE_FILE} 72 | VAL_IMAGE_DIR="${SCRATCH_DIR}/val2014" 73 | 74 | # Download the captions. 75 | BASE_CAPTIONS_URL="http://msvocds.blob.core.windows.net/annotations-1-0-3" 76 | CAPTIONS_FILE="captions_train-val2014.zip" 77 | #download_and_unzip ${BASE_CAPTIONS_URL} ${CAPTIONS_FILE} 78 | TRAIN_CAPTIONS_FILE="${SCRATCH_DIR}/annotations/captions_train2014.json" 79 | VAL_CAPTIONS_FILE="${SCRATCH_DIR}/annotations/captions_val2014.json" 80 | 81 | # Build TFRecords of the image data. 
82 | cd "${CURRENT_DIR}" 83 | #BUILD_SCRIPT="${WORK_DIR}/build_mscoco_data" 84 | 85 | echo $TRAIN_IMAGE_DIR 86 | echo $VAL_IMAGE_DIR 87 | echo $TRAIN_CAPTIONS_FILE 88 | echo $VAL_CAPTIONS_FILE 89 | echo $OUTPUT_DIR 90 | 91 | #BUILD_SCRIPT=./build_mscoco_data 92 | #"${BUILD_SCRIPT}" \ 93 | #--train_image_dir="${TRAIN_IMAGE_DIR}" \ 94 | #--val_image_dir="${VAL_IMAGE_DIR}" \ 95 | #--train_captions_file="${TRAIN_CAPTIONS_FILE}" \ 96 | #--val_captions_file="${VAL_CAPTIONS_FILE}" \ 97 | #--output_dir="${OUTPUT_DIR}" \ 98 | #--word_counts_output_file="${OUTPUT_DIR}/word_counts.txt" \ 99 | 100 | 101 | echo python build_mscoco_data.py --train_image_dir="${TRAIN_IMAGE_DIR}" \ 102 | --val_image_dir="${VAL_IMAGE_DIR}" \ 103 | --train_captions_file="${TRAIN_CAPTIONS_FILE}" \ 104 | --val_captions_file="${VAL_CAPTIONS_FILE}" \ 105 | --output_dir="${OUTPUT_DIR}" \ 106 | --word_counts_output_file="${OUTPUT_DIR}/word_counts.txt" \ 107 | -------------------------------------------------------------------------------- /Section 1/im2txt/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Evaluate the model. 17 | 18 | This script should be run concurrently with training so that summaries show up 19 | in TensorBoard. 20 | """ 21 | 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | 26 | import math 27 | import os.path 28 | import time 29 | 30 | 31 | import numpy as np 32 | import tensorflow as tf 33 | 34 | from im2txt import configuration 35 | from im2txt import show_and_tell_model 36 | 37 | FLAGS = tf.flags.FLAGS 38 | 39 | tf.flags.DEFINE_string("input_file_pattern", "", 40 | "File pattern of sharded TFRecord input files.") 41 | tf.flags.DEFINE_string("checkpoint_dir", "", 42 | "Directory containing model checkpoints.") 43 | tf.flags.DEFINE_string("eval_dir", "", "Directory to write event logs.") 44 | 45 | tf.flags.DEFINE_integer("eval_interval_secs", 600, 46 | "Interval between evaluation runs.") 47 | tf.flags.DEFINE_integer("num_eval_examples", 10132, 48 | "Number of examples for evaluation.") 49 | 50 | tf.flags.DEFINE_integer("min_global_step", 5000, 51 | "Minimum global step to run evaluation.") 52 | 53 | tf.logging.set_verbosity(tf.logging.INFO) 54 | 55 | 56 | def evaluate_model(sess, model, global_step, summary_writer, summary_op): 57 | """Computes perplexity-per-word over the evaluation dataset. 58 | 59 | Summaries and perplexity-per-word are written out to the eval directory. 60 | 61 | Args: 62 | sess: Session object. 63 | model: Instance of ShowAndTellModel; the model to evaluate. 64 | global_step: Integer; global step of the model checkpoint. 65 | summary_writer: Instance of FileWriter. 66 | summary_op: Op for generating model summaries. 
67 | """ 68 | # Log model summaries on a single batch. 69 | summary_str = sess.run(summary_op) 70 | summary_writer.add_summary(summary_str, global_step) 71 | 72 | # Compute perplexity over the entire dataset. 73 | num_eval_batches = int( 74 | math.ceil(FLAGS.num_eval_examples / model.config.batch_size)) 75 | 76 | start_time = time.time() 77 | sum_losses = 0. 78 | sum_weights = 0. 79 | for i in range(num_eval_batches): 80 | cross_entropy_losses, weights = sess.run([ 81 | model.target_cross_entropy_losses, 82 | model.target_cross_entropy_loss_weights 83 | ]) 84 | sum_losses += np.sum(cross_entropy_losses * weights) 85 | sum_weights += np.sum(weights) 86 | if not i % 100: 87 | tf.logging.info("Computed losses for %d of %d batches.", i + 1, 88 | num_eval_batches) 89 | eval_time = time.time() - start_time 90 | 91 | perplexity = math.exp(sum_losses / sum_weights) 92 | tf.logging.info("Perplexity = %f (%.2g sec)", perplexity, eval_time) 93 | 94 | # Log perplexity to the FileWriter. 95 | summary = tf.Summary() 96 | value = summary.value.add() 97 | value.simple_value = perplexity 98 | value.tag = "Perplexity" 99 | summary_writer.add_summary(summary, global_step) 100 | 101 | # Write the Events file to the eval directory. 102 | summary_writer.flush() 103 | tf.logging.info("Finished processing evaluation at global step %d.", 104 | global_step) 105 | 106 | 107 | def run_once(model, saver, summary_writer, summary_op): 108 | """Evaluates the latest model checkpoint. 109 | 110 | Args: 111 | model: Instance of ShowAndTellModel; the model to evaluate. 112 | saver: Instance of tf.train.Saver for restoring model Variables. 113 | summary_writer: Instance of FileWriter. 114 | summary_op: Op for generating model summaries. 115 | """ 116 | model_path = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) 117 | if not model_path: 118 | tf.logging.info("Skipping evaluation. No checkpoint found in: %s", 119 | FLAGS.checkpoint_dir) 120 | return 121 | 122 | with tf.Session() as sess: 123 | # Load model from checkpoint. 124 | tf.logging.info("Loading model from checkpoint: %s", model_path) 125 | saver.restore(sess, model_path) 126 | global_step = tf.train.global_step(sess, model.global_step.name) 127 | tf.logging.info("Successfully loaded %s at global step = %d.", 128 | os.path.basename(model_path), global_step) 129 | if global_step < FLAGS.min_global_step: 130 | tf.logging.info("Skipping evaluation. Global step = %d < %d", global_step, 131 | FLAGS.min_global_step) 132 | return 133 | 134 | # Start the queue runners. 135 | coord = tf.train.Coordinator() 136 | threads = tf.train.start_queue_runners(coord=coord) 137 | 138 | # Run evaluation on the latest checkpoint. 139 | try: 140 | evaluate_model( 141 | sess=sess, 142 | model=model, 143 | global_step=global_step, 144 | summary_writer=summary_writer, 145 | summary_op=summary_op) 146 | except Exception as e: # pylint: disable=broad-except 147 | tf.logging.error("Evaluation failed.") 148 | coord.request_stop(e) 149 | 150 | coord.request_stop() 151 | coord.join(threads, stop_grace_period_secs=10) 152 | 153 | 154 | def run(): 155 | """Runs evaluation in a loop, and logs summaries to TensorBoard.""" 156 | # Create the evaluation directory if it doesn't exist. 157 | eval_dir = FLAGS.eval_dir 158 | if not tf.gfile.IsDirectory(eval_dir): 159 | tf.logging.info("Creating eval directory: %s", eval_dir) 160 | tf.gfile.MakeDirs(eval_dir) 161 | 162 | g = tf.Graph() 163 | with g.as_default(): 164 | # Build the model for evaluation. 
165 | model_config = configuration.ModelConfig() 166 | model_config.input_file_pattern = FLAGS.input_file_pattern 167 | model = show_and_tell_model.ShowAndTellModel(model_config, mode="eval") 168 | model.build() 169 | 170 | # Create the Saver to restore model Variables. 171 | saver = tf.train.Saver() 172 | 173 | # Create the summary operation and the summary writer. 174 | summary_op = tf.summary.merge_all() 175 | summary_writer = tf.summary.FileWriter(eval_dir) 176 | 177 | g.finalize() 178 | 179 | # Run a new evaluation run every eval_interval_secs. 180 | while True: 181 | start = time.time() 182 | tf.logging.info("Starting evaluation at " + time.strftime( 183 | "%Y-%m-%d-%H:%M:%S", time.localtime())) 184 | run_once(model, saver, summary_writer, summary_op) 185 | time_to_next_eval = start + FLAGS.eval_interval_secs - time.time() 186 | if time_to_next_eval > 0: 187 | time.sleep(time_to_next_eval) 188 | 189 | 190 | def main(unused_argv): 191 | assert FLAGS.input_file_pattern, "--input_file_pattern is required" 192 | assert FLAGS.checkpoint_dir, "--checkpoint_dir is required" 193 | assert FLAGS.eval_dir, "--eval_dir is required" 194 | run() 195 | 196 | 197 | if __name__ == "__main__": 198 | tf.app.run() 199 | -------------------------------------------------------------------------------- /Section 1/im2txt/inference_utils/__pycache__/caption_generator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/im2txt/inference_utils/__pycache__/caption_generator.cpython-36.pyc -------------------------------------------------------------------------------- /Section 1/im2txt/inference_utils/__pycache__/inference_wrapper_base.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/im2txt/inference_utils/__pycache__/inference_wrapper_base.cpython-36.pyc -------------------------------------------------------------------------------- /Section 1/im2txt/inference_utils/__pycache__/vocabulary.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/im2txt/inference_utils/__pycache__/vocabulary.cpython-36.pyc -------------------------------------------------------------------------------- /Section 1/im2txt/inference_wrapper.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Model wrapper class for performing inference with a ShowAndTellModel.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | 24 | from im2txt import show_and_tell_model 25 | from im2txt.inference_utils import inference_wrapper_base 26 | 27 | 28 | class InferenceWrapper(inference_wrapper_base.InferenceWrapperBase): 29 | """Model wrapper class for performing inference with a ShowAndTellModel.""" 30 | 31 | def __init__(self): 32 | super(InferenceWrapper, self).__init__() 33 | 34 | def build_model(self, model_config): 35 | model = show_and_tell_model.ShowAndTellModel(model_config, mode="inference") 36 | model.build() 37 | return model 38 | 39 | def feed_image(self, sess, encoded_image): 40 | initial_state = sess.run(fetches="lstm/initial_state:0", 41 | feed_dict={"image_feed:0": encoded_image}) 42 | return initial_state 43 | 44 | def inference_step(self, sess, input_feed, state_feed): 45 | softmax_output, state_output = sess.run( 46 | fetches=["softmax:0", "lstm/state:0"], 47 | feed_dict={ 48 | "input_feed:0": input_feed, 49 | "lstm/state_feed:0": state_feed, 50 | }) 51 | return softmax_output, state_output, None 52 | -------------------------------------------------------------------------------- /Section 1/im2txt/ops/__pycache__/image_embedding.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/im2txt/ops/__pycache__/image_embedding.cpython-36.pyc -------------------------------------------------------------------------------- /Section 1/im2txt/ops/__pycache__/image_processing.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/im2txt/ops/__pycache__/image_processing.cpython-36.pyc -------------------------------------------------------------------------------- /Section 1/im2txt/ops/__pycache__/inputs.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/im2txt/ops/__pycache__/inputs.cpython-36.pyc -------------------------------------------------------------------------------- /Section 1/im2txt/ops/image_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Image embedding ops.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | import tensorflow as tf 24 | 25 | from tensorflow.contrib.slim.python.slim.nets.inception_v3 import inception_v3_base 26 | 27 | slim = tf.contrib.slim 28 | 29 | 30 | def inception_v3(images, 31 | trainable=True, 32 | is_training=True, 33 | weight_decay=0.00004, 34 | stddev=0.1, 35 | dropout_keep_prob=0.8, 36 | use_batch_norm=True, 37 | batch_norm_params=None, 38 | add_summaries=True, 39 | scope="InceptionV3"): 40 | """Builds an Inception V3 subgraph for image embeddings. 41 | 42 | Args: 43 | images: A float32 Tensor of shape [batch, height, width, channels]. 44 | trainable: Whether the inception submodel should be trainable or not. 45 | is_training: Boolean indicating training mode or not. 46 | weight_decay: Coefficient for weight regularization. 47 | stddev: The standard deviation of the trunctated normal weight initializer. 48 | dropout_keep_prob: Dropout keep probability. 49 | use_batch_norm: Whether to use batch normalization. 50 | batch_norm_params: Parameters for batch normalization. See 51 | tf.contrib.layers.batch_norm for details. 52 | add_summaries: Whether to add activation summaries. 53 | scope: Optional Variable scope. 54 | 55 | Returns: 56 | end_points: A dictionary of activations from inception_v3 layers. 57 | """ 58 | # Only consider the inception model to be in training mode if it's trainable. 59 | is_inception_model_training = trainable and is_training 60 | 61 | if use_batch_norm: 62 | # Default parameters for batch normalization. 63 | if not batch_norm_params: 64 | batch_norm_params = { 65 | "is_training": is_inception_model_training, 66 | "trainable": trainable, 67 | # Decay for the moving averages. 68 | "decay": 0.9997, 69 | # Epsilon to prevent 0s in variance. 70 | "epsilon": 0.001, 71 | # Collection containing the moving mean and moving variance. 72 | "variables_collections": { 73 | "beta": None, 74 | "gamma": None, 75 | "moving_mean": ["moving_vars"], 76 | "moving_variance": ["moving_vars"], 77 | } 78 | } 79 | else: 80 | batch_norm_params = None 81 | 82 | if trainable: 83 | weights_regularizer = tf.contrib.layers.l2_regularizer(weight_decay) 84 | else: 85 | weights_regularizer = None 86 | 87 | with tf.variable_scope(scope, "InceptionV3", [images]) as scope: 88 | with slim.arg_scope( 89 | [slim.conv2d, slim.fully_connected], 90 | weights_regularizer=weights_regularizer, 91 | trainable=trainable): 92 | with slim.arg_scope( 93 | [slim.conv2d], 94 | weights_initializer=tf.truncated_normal_initializer(stddev=stddev), 95 | activation_fn=tf.nn.relu, 96 | normalizer_fn=slim.batch_norm, 97 | normalizer_params=batch_norm_params): 98 | net, end_points = inception_v3_base(images, scope=scope) 99 | with tf.variable_scope("logits"): 100 | shape = net.get_shape() 101 | net = slim.avg_pool2d(net, shape[1:3], padding="VALID", scope="pool") 102 | net = slim.dropout( 103 | net, 104 | keep_prob=dropout_keep_prob, 105 | is_training=is_inception_model_training, 106 | scope="dropout") 107 | net = slim.flatten(net, scope="flatten") 108 | 109 | # Add summaries. 
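# One activation summary is added per Inception end point, so the image tower
# can be inspected in TensorBoard alongside the caption-model summaries.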
110 | if add_summaries: 111 | for v in end_points.values(): 112 | tf.contrib.layers.summaries.summarize_activation(v) 113 | 114 | return net 115 | -------------------------------------------------------------------------------- /Section 1/im2txt/ops/image_embedding_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Tests for tensorflow_models.im2txt.ops.image_embedding.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | import tensorflow as tf 24 | 25 | from im2txt.ops import image_embedding 26 | 27 | 28 | class InceptionV3Test(tf.test.TestCase): 29 | 30 | def setUp(self): 31 | super(InceptionV3Test, self).setUp() 32 | 33 | batch_size = 4 34 | height = 299 35 | width = 299 36 | num_channels = 3 37 | self._images = tf.placeholder(tf.float32, 38 | [batch_size, height, width, num_channels]) 39 | self._batch_size = batch_size 40 | 41 | def _countInceptionParameters(self): 42 | """Counts the number of parameters in the inception model at top scope.""" 43 | counter = {} 44 | for v in tf.global_variables(): 45 | name_tokens = v.op.name.split("/") 46 | if name_tokens[0] == "InceptionV3": 47 | name = "InceptionV3/" + name_tokens[1] 48 | num_params = v.get_shape().num_elements() 49 | assert num_params 50 | counter[name] = counter.get(name, 0) + num_params 51 | return counter 52 | 53 | def _verifyParameterCounts(self): 54 | """Verifies the number of parameters in the inception model.""" 55 | param_counts = self._countInceptionParameters() 56 | expected_param_counts = { 57 | "InceptionV3/Conv2d_1a_3x3": 960, 58 | "InceptionV3/Conv2d_2a_3x3": 9312, 59 | "InceptionV3/Conv2d_2b_3x3": 18624, 60 | "InceptionV3/Conv2d_3b_1x1": 5360, 61 | "InceptionV3/Conv2d_4a_3x3": 138816, 62 | "InceptionV3/Mixed_5b": 256368, 63 | "InceptionV3/Mixed_5c": 277968, 64 | "InceptionV3/Mixed_5d": 285648, 65 | "InceptionV3/Mixed_6a": 1153920, 66 | "InceptionV3/Mixed_6b": 1298944, 67 | "InceptionV3/Mixed_6c": 1692736, 68 | "InceptionV3/Mixed_6d": 1692736, 69 | "InceptionV3/Mixed_6e": 2143872, 70 | "InceptionV3/Mixed_7a": 1699584, 71 | "InceptionV3/Mixed_7b": 5047872, 72 | "InceptionV3/Mixed_7c": 6080064, 73 | } 74 | self.assertDictEqual(expected_param_counts, param_counts) 75 | 76 | def _assertCollectionSize(self, expected_size, collection): 77 | actual_size = len(tf.get_collection(collection)) 78 | if expected_size != actual_size: 79 | self.fail("Found %d items in collection %s (expected %d)." 
% 80 | (actual_size, collection, expected_size)) 81 | 82 | def testTrainableTrueIsTrainingTrue(self): 83 | embeddings = image_embedding.inception_v3( 84 | self._images, trainable=True, is_training=True) 85 | self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list()) 86 | 87 | self._verifyParameterCounts() 88 | self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES) 89 | self._assertCollectionSize(188, tf.GraphKeys.TRAINABLE_VARIABLES) 90 | self._assertCollectionSize(188, tf.GraphKeys.UPDATE_OPS) 91 | self._assertCollectionSize(94, tf.GraphKeys.REGULARIZATION_LOSSES) 92 | self._assertCollectionSize(0, tf.GraphKeys.LOSSES) 93 | self._assertCollectionSize(23, tf.GraphKeys.SUMMARIES) 94 | 95 | def testTrainableTrueIsTrainingFalse(self): 96 | embeddings = image_embedding.inception_v3( 97 | self._images, trainable=True, is_training=False) 98 | self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list()) 99 | 100 | self._verifyParameterCounts() 101 | self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES) 102 | self._assertCollectionSize(188, tf.GraphKeys.TRAINABLE_VARIABLES) 103 | self._assertCollectionSize(0, tf.GraphKeys.UPDATE_OPS) 104 | self._assertCollectionSize(94, tf.GraphKeys.REGULARIZATION_LOSSES) 105 | self._assertCollectionSize(0, tf.GraphKeys.LOSSES) 106 | self._assertCollectionSize(23, tf.GraphKeys.SUMMARIES) 107 | 108 | def testTrainableFalseIsTrainingTrue(self): 109 | embeddings = image_embedding.inception_v3( 110 | self._images, trainable=False, is_training=True) 111 | self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list()) 112 | 113 | self._verifyParameterCounts() 114 | self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES) 115 | self._assertCollectionSize(0, tf.GraphKeys.TRAINABLE_VARIABLES) 116 | self._assertCollectionSize(0, tf.GraphKeys.UPDATE_OPS) 117 | self._assertCollectionSize(0, tf.GraphKeys.REGULARIZATION_LOSSES) 118 | self._assertCollectionSize(0, tf.GraphKeys.LOSSES) 119 | self._assertCollectionSize(23, tf.GraphKeys.SUMMARIES) 120 | 121 | def testTrainableFalseIsTrainingFalse(self): 122 | embeddings = image_embedding.inception_v3( 123 | self._images, trainable=False, is_training=False) 124 | self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list()) 125 | 126 | self._verifyParameterCounts() 127 | self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES) 128 | self._assertCollectionSize(0, tf.GraphKeys.TRAINABLE_VARIABLES) 129 | self._assertCollectionSize(0, tf.GraphKeys.UPDATE_OPS) 130 | self._assertCollectionSize(0, tf.GraphKeys.REGULARIZATION_LOSSES) 131 | self._assertCollectionSize(0, tf.GraphKeys.LOSSES) 132 | self._assertCollectionSize(23, tf.GraphKeys.SUMMARIES) 133 | 134 | 135 | if __name__ == "__main__": 136 | tf.test.main() 137 | -------------------------------------------------------------------------------- /Section 1/im2txt/ops/image_processing.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Helper functions for image preprocessing.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | import tensorflow as tf 24 | 25 | 26 | def distort_image(image, thread_id): 27 | """Perform random distortions on an image. 28 | 29 | Args: 30 | image: A float32 Tensor of shape [height, width, 3] with values in [0, 1). 31 | thread_id: Preprocessing thread id used to select the ordering of color 32 | distortions. There should be a multiple of 2 preprocessing threads. 33 | 34 | Returns: 35 | distorted_image: A float32 Tensor of shape [height, width, 3] with values in 36 | [0, 1]. 37 | """ 38 | # Randomly flip horizontally. 39 | with tf.name_scope("flip_horizontal", values=[image]): 40 | image = tf.image.random_flip_left_right(image) 41 | 42 | # Randomly distort the colors based on thread id. 43 | color_ordering = thread_id % 2 44 | with tf.name_scope("distort_color", values=[image]): 45 | if color_ordering == 0: 46 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 47 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 48 | image = tf.image.random_hue(image, max_delta=0.032) 49 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 50 | elif color_ordering == 1: 51 | image = tf.image.random_brightness(image, max_delta=32. / 255.) 52 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5) 53 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5) 54 | image = tf.image.random_hue(image, max_delta=0.032) 55 | 56 | # The random_* ops do not necessarily clamp. 57 | image = tf.clip_by_value(image, 0.0, 1.0) 58 | 59 | return image 60 | 61 | 62 | def process_image(encoded_image, 63 | is_training, 64 | height, 65 | width, 66 | resize_height=346, 67 | resize_width=346, 68 | thread_id=0, 69 | image_format="jpeg"): 70 | """Decode an image, resize and apply random distortions. 71 | 72 | In training, images are distorted slightly differently depending on thread_id. 73 | 74 | Args: 75 | encoded_image: String Tensor containing the image. 76 | is_training: Boolean; whether preprocessing for training or eval. 77 | height: Height of the output image. 78 | width: Width of the output image. 79 | resize_height: If > 0, resize height before crop to final dimensions. 80 | resize_width: If > 0, resize width before crop to final dimensions. 81 | thread_id: Preprocessing thread id used to select the ordering of color 82 | distortions. There should be a multiple of 2 preprocessing threads. 83 | image_format: "jpeg" or "png". 84 | 85 | Returns: 86 | A float32 Tensor of shape [height, width, 3] with values in [-1, 1]. 87 | 88 | Raises: 89 | ValueError: If image_format is invalid. 90 | """ 91 | # Helper function to log an image summary to the visualizer. Summaries are 92 | # only logged in thread 0. 
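# Restricting summaries to thread 0 keeps a single copy of each image summary
# per step instead of one copy per preprocessing thread.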
93 | def image_summary(name, image): 94 | if not thread_id: 95 | tf.summary.image(name, tf.expand_dims(image, 0)) 96 | 97 | # Decode image into a float32 Tensor of shape [?, ?, 3] with values in [0, 1). 98 | with tf.name_scope("decode", values=[encoded_image]): 99 | if image_format == "jpeg": 100 | image = tf.image.decode_jpeg(encoded_image, channels=3) 101 | elif image_format == "png": 102 | image = tf.image.decode_png(encoded_image, channels=3) 103 | else: 104 | raise ValueError("Invalid image format: %s" % image_format) 105 | image = tf.image.convert_image_dtype(image, dtype=tf.float32) 106 | image_summary("original_image", image) 107 | 108 | # Resize image. 109 | assert (resize_height > 0) == (resize_width > 0) 110 | if resize_height: 111 | image = tf.image.resize_images(image, 112 | size=[resize_height, resize_width], 113 | method=tf.image.ResizeMethod.BILINEAR) 114 | 115 | # Crop to final dimensions. 116 | if is_training: 117 | image = tf.random_crop(image, [height, width, 3]) 118 | else: 119 | # Central crop, assuming resize_height > height, resize_width > width. 120 | image = tf.image.resize_image_with_crop_or_pad(image, height, width) 121 | 122 | image_summary("resized_image", image) 123 | 124 | # Randomly distort the image. 125 | if is_training: 126 | image = distort_image(image, thread_id) 127 | 128 | image_summary("final_image", image) 129 | 130 | # Rescale to [-1,1] instead of [0, 1] 131 | image = tf.subtract(image, 0.5) 132 | image = tf.multiply(image, 2.0) 133 | return image 134 | -------------------------------------------------------------------------------- /Section 1/im2txt/ops/inputs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Input ops.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | import tensorflow as tf 24 | 25 | 26 | def parse_sequence_example(serialized, image_feature, caption_feature): 27 | """Parses a tensorflow.SequenceExample into an image and caption. 28 | 29 | Args: 30 | serialized: A scalar string Tensor; a single serialized SequenceExample. 31 | image_feature: Name of SequenceExample context feature containing image 32 | data. 33 | caption_feature: Name of SequenceExample feature list containing integer 34 | captions. 35 | 36 | Returns: 37 | encoded_image: A scalar string Tensor containing a JPEG encoded image. 38 | caption: A 1-D uint64 Tensor with dynamically specified length. 
39 | """ 40 | context, sequence = tf.parse_single_sequence_example( 41 | serialized, 42 | context_features={ 43 | image_feature: tf.FixedLenFeature([], dtype=tf.string) 44 | }, 45 | sequence_features={ 46 | caption_feature: tf.FixedLenSequenceFeature([], dtype=tf.int64), 47 | }) 48 | 49 | encoded_image = context[image_feature] 50 | caption = sequence[caption_feature] 51 | return encoded_image, caption 52 | 53 | 54 | def prefetch_input_data(reader, 55 | file_pattern, 56 | is_training, 57 | batch_size, 58 | values_per_shard, 59 | input_queue_capacity_factor=16, 60 | num_reader_threads=1, 61 | shard_queue_name="filename_queue", 62 | value_queue_name="input_queue"): 63 | """Prefetches string values from disk into an input queue. 64 | 65 | In training the capacity of the queue is important because a larger queue 66 | means better mixing of training examples between shards. The minimum number of 67 | values kept in the queue is values_per_shard * input_queue_capacity_factor, 68 | where input_queue_memory factor should be chosen to trade-off better mixing 69 | with memory usage. 70 | 71 | Args: 72 | reader: Instance of tf.ReaderBase. 73 | file_pattern: Comma-separated list of file patterns (e.g. 74 | /tmp/train_data-?????-of-00100). 75 | is_training: Boolean; whether prefetching for training or eval. 76 | batch_size: Model batch size used to determine queue capacity. 77 | values_per_shard: Approximate number of values per shard. 78 | input_queue_capacity_factor: Minimum number of values to keep in the queue 79 | in multiples of values_per_shard. See comments above. 80 | num_reader_threads: Number of reader threads to fill the queue. 81 | shard_queue_name: Name for the shards filename queue. 82 | value_queue_name: Name for the values input queue. 83 | 84 | Returns: 85 | A Queue containing prefetched string values. 86 | """ 87 | data_files = [] 88 | for pattern in file_pattern.split(","): 89 | data_files.extend(tf.gfile.Glob(pattern)) 90 | if not data_files: 91 | tf.logging.fatal("Found no input files matching %s", file_pattern) 92 | else: 93 | tf.logging.info("Prefetching values from %d files matching %s", 94 | len(data_files), file_pattern) 95 | 96 | if is_training: 97 | filename_queue = tf.train.string_input_producer( 98 | data_files, shuffle=True, capacity=16, name=shard_queue_name) 99 | min_queue_examples = values_per_shard * input_queue_capacity_factor 100 | capacity = min_queue_examples + 100 * batch_size 101 | values_queue = tf.RandomShuffleQueue( 102 | capacity=capacity, 103 | min_after_dequeue=min_queue_examples, 104 | dtypes=[tf.string], 105 | name="random_" + value_queue_name) 106 | else: 107 | filename_queue = tf.train.string_input_producer( 108 | data_files, shuffle=False, capacity=1, name=shard_queue_name) 109 | capacity = values_per_shard + 3 * batch_size 110 | values_queue = tf.FIFOQueue( 111 | capacity=capacity, dtypes=[tf.string], name="fifo_" + value_queue_name) 112 | 113 | enqueue_ops = [] 114 | for _ in range(num_reader_threads): 115 | _, value = reader.read(filename_queue) 116 | enqueue_ops.append(values_queue.enqueue([value])) 117 | tf.train.queue_runner.add_queue_runner(tf.train.queue_runner.QueueRunner( 118 | values_queue, enqueue_ops)) 119 | tf.summary.scalar( 120 | "queue/%s/fraction_of_%d_full" % (values_queue.name, capacity), 121 | tf.cast(values_queue.size(), tf.float32) * (1. 
/ capacity)) 122 | 123 | return values_queue 124 | 125 | 126 | def batch_with_dynamic_pad(images_and_captions, 127 | batch_size, 128 | queue_capacity, 129 | add_summaries=True): 130 | """Batches input images and captions. 131 | 132 | This function splits the caption into an input sequence and a target sequence, 133 | where the target sequence is the input sequence right-shifted by 1. Input and 134 | target sequences are batched and padded up to the maximum length of sequences 135 | in the batch. A mask is created to distinguish real words from padding words. 136 | 137 | Example: 138 | Actual captions in the batch ('-' denotes padded character): 139 | [ 140 | [ 1 2 3 4 5 ], 141 | [ 1 2 3 4 - ], 142 | [ 1 2 3 - - ], 143 | ] 144 | 145 | input_seqs: 146 | [ 147 | [ 1 2 3 4 ], 148 | [ 1 2 3 - ], 149 | [ 1 2 - - ], 150 | ] 151 | 152 | target_seqs: 153 | [ 154 | [ 2 3 4 5 ], 155 | [ 2 3 4 - ], 156 | [ 2 3 - - ], 157 | ] 158 | 159 | mask: 160 | [ 161 | [ 1 1 1 1 ], 162 | [ 1 1 1 0 ], 163 | [ 1 1 0 0 ], 164 | ] 165 | 166 | Args: 167 | images_and_captions: A list of pairs [image, caption], where image is a 168 | Tensor of shape [height, width, channels] and caption is a 1-D Tensor of 169 | any length. Each pair will be processed and added to the queue in a 170 | separate thread. 171 | batch_size: Batch size. 172 | queue_capacity: Queue capacity. 173 | add_summaries: If true, add caption length summaries. 174 | 175 | Returns: 176 | images: A Tensor of shape [batch_size, height, width, channels]. 177 | input_seqs: An int32 Tensor of shape [batch_size, padded_length]. 178 | target_seqs: An int32 Tensor of shape [batch_size, padded_length]. 179 | mask: An int32 0/1 Tensor of shape [batch_size, padded_length]. 180 | """ 181 | enqueue_list = [] 182 | for image, caption in images_and_captions: 183 | caption_length = tf.shape(caption)[0] 184 | input_length = tf.expand_dims(tf.subtract(caption_length, 1), 0) 185 | 186 | input_seq = tf.slice(caption, [0], input_length) 187 | target_seq = tf.slice(caption, [1], input_length) 188 | indicator = tf.ones(input_length, dtype=tf.int32) 189 | enqueue_list.append([image, input_seq, target_seq, indicator]) 190 | 191 | images, input_seqs, target_seqs, mask = tf.train.batch_join( 192 | enqueue_list, 193 | batch_size=batch_size, 194 | capacity=queue_capacity, 195 | dynamic_pad=True, 196 | name="batch_and_pad") 197 | 198 | if add_summaries: 199 | lengths = tf.add(tf.reduce_sum(mask, 1), 1) 200 | tf.summary.scalar("caption_length/batch_min", tf.reduce_min(lengths)) 201 | tf.summary.scalar("caption_length/batch_max", tf.reduce_max(lengths)) 202 | tf.summary.scalar("caption_length/batch_mean", tf.reduce_mean(lengths)) 203 | 204 | return images, input_seqs, target_seqs, mask 205 | -------------------------------------------------------------------------------- /Section 1/im2txt/show_and_tell_model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Image-to-text implementation based on http://arxiv.org/abs/1411.4555. 17 | 18 | "Show and Tell: A Neural Image Caption Generator" 19 | Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan 20 | """ 21 | 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | 26 | 27 | import tensorflow as tf 28 | 29 | from im2txt.ops import image_embedding 30 | from im2txt.ops import image_processing 31 | from im2txt.ops import inputs as input_ops 32 | 33 | 34 | class ShowAndTellModel(object): 35 | """Image-to-text implementation based on http://arxiv.org/abs/1411.4555. 36 | 37 | "Show and Tell: A Neural Image Caption Generator" 38 | Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan 39 | """ 40 | 41 | def __init__(self, config, mode, train_inception=False): 42 | """Basic setup. 43 | 44 | Args: 45 | config: Object containing configuration parameters. 46 | mode: "train", "eval" or "inference". 47 | train_inception: Whether the inception submodel variables are trainable. 48 | """ 49 | assert mode in ["train", "eval", "inference"] 50 | self.config = config 51 | self.mode = mode 52 | self.train_inception = train_inception 53 | 54 | # Reader for the input data. 55 | self.reader = tf.TFRecordReader() 56 | 57 | # To match the "Show and Tell" paper we initialize all variables with a 58 | # random uniform initializer. 59 | self.initializer = tf.random_uniform_initializer( 60 | minval=-self.config.initializer_scale, 61 | maxval=self.config.initializer_scale) 62 | 63 | # A float32 Tensor with shape [batch_size, height, width, channels]. 64 | self.images = None 65 | 66 | # An int32 Tensor with shape [batch_size, padded_length]. 67 | self.input_seqs = None 68 | 69 | # An int32 Tensor with shape [batch_size, padded_length]. 70 | self.target_seqs = None 71 | 72 | # An int32 0/1 Tensor with shape [batch_size, padded_length]. 73 | self.input_mask = None 74 | 75 | # A float32 Tensor with shape [batch_size, embedding_size]. 76 | self.image_embeddings = None 77 | 78 | # A float32 Tensor with shape [batch_size, padded_length, embedding_size]. 79 | self.seq_embeddings = None 80 | 81 | # A float32 scalar Tensor; the total loss for the trainer to optimize. 82 | self.total_loss = None 83 | 84 | # A float32 Tensor with shape [batch_size * padded_length]. 85 | self.target_cross_entropy_losses = None 86 | 87 | # A float32 Tensor with shape [batch_size * padded_length]. 88 | self.target_cross_entropy_loss_weights = None 89 | 90 | # Collection of variables from the inception submodel. 91 | self.inception_variables = [] 92 | 93 | # Function to restore the inception submodel from checkpoint. 94 | self.init_fn = None 95 | 96 | # Global step Tensor. 97 | self.global_step = None 98 | 99 | def is_training(self): 100 | """Returns true if the model is built for training mode.""" 101 | return self.mode == "train" 102 | 103 | def process_image(self, encoded_image, thread_id=0): 104 | """Decodes and processes an image string. 105 | 106 | Args: 107 | encoded_image: A scalar string Tensor; the encoded image. 108 | thread_id: Preprocessing thread id used to select the ordering of color 109 | distortions. 110 | 111 | Returns: 112 | A float32 Tensor of shape [height, width, 3]; the processed image. 
113 | """ 114 | return image_processing.process_image(encoded_image, 115 | is_training=self.is_training(), 116 | height=self.config.image_height, 117 | width=self.config.image_width, 118 | thread_id=thread_id, 119 | image_format=self.config.image_format) 120 | 121 | def build_inputs(self): 122 | """Input prefetching, preprocessing and batching. 123 | 124 | Outputs: 125 | self.images 126 | self.input_seqs 127 | self.target_seqs (training and eval only) 128 | self.input_mask (training and eval only) 129 | """ 130 | if self.mode == "inference": 131 | # In inference mode, images and inputs are fed via placeholders. 132 | image_feed = tf.placeholder(dtype=tf.string, shape=[], name="image_feed") 133 | input_feed = tf.placeholder(dtype=tf.int64, 134 | shape=[None], # batch_size 135 | name="input_feed") 136 | 137 | # Process image and insert batch dimensions. 138 | images = tf.expand_dims(self.process_image(image_feed), 0) 139 | input_seqs = tf.expand_dims(input_feed, 1) 140 | 141 | # No target sequences or input mask in inference mode. 142 | target_seqs = None 143 | input_mask = None 144 | else: 145 | # Prefetch serialized SequenceExample protos. 146 | input_queue = input_ops.prefetch_input_data( 147 | self.reader, 148 | self.config.input_file_pattern, 149 | is_training=self.is_training(), 150 | batch_size=self.config.batch_size, 151 | values_per_shard=self.config.values_per_input_shard, 152 | input_queue_capacity_factor=self.config.input_queue_capacity_factor, 153 | num_reader_threads=self.config.num_input_reader_threads) 154 | 155 | # Image processing and random distortion. Split across multiple threads 156 | # with each thread applying a slightly different distortion. 157 | assert self.config.num_preprocess_threads % 2 == 0 158 | images_and_captions = [] 159 | for thread_id in range(self.config.num_preprocess_threads): 160 | serialized_sequence_example = input_queue.dequeue() 161 | encoded_image, caption = input_ops.parse_sequence_example( 162 | serialized_sequence_example, 163 | image_feature=self.config.image_feature_name, 164 | caption_feature=self.config.caption_feature_name) 165 | image = self.process_image(encoded_image, thread_id=thread_id) 166 | images_and_captions.append([image, caption]) 167 | 168 | # Batch inputs. 169 | queue_capacity = (2 * self.config.num_preprocess_threads * 170 | self.config.batch_size) 171 | images, input_seqs, target_seqs, input_mask = ( 172 | input_ops.batch_with_dynamic_pad(images_and_captions, 173 | batch_size=self.config.batch_size, 174 | queue_capacity=queue_capacity)) 175 | 176 | self.images = images 177 | self.input_seqs = input_seqs 178 | self.target_seqs = target_seqs 179 | self.input_mask = input_mask 180 | 181 | def build_image_embeddings(self): 182 | """Builds the image model subgraph and generates image embeddings. 183 | 184 | Inputs: 185 | self.images 186 | 187 | Outputs: 188 | self.image_embeddings 189 | """ 190 | inception_output = image_embedding.inception_v3( 191 | self.images, 192 | trainable=self.train_inception, 193 | is_training=self.is_training()) 194 | self.inception_variables = tf.get_collection( 195 | tf.GraphKeys.GLOBAL_VARIABLES, scope="InceptionV3") 196 | 197 | # Map inception output into embedding space. 
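# A single fully connected layer (no activation) projects the 2048-d pooled
# Inception features down to config.embedding_size, the same dimensionality
# as the word embeddings fed to the LSTM.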
198 | with tf.variable_scope("image_embedding") as scope: 199 | image_embeddings = tf.contrib.layers.fully_connected( 200 | inputs=inception_output, 201 | num_outputs=self.config.embedding_size, 202 | activation_fn=None, 203 | weights_initializer=self.initializer, 204 | biases_initializer=None, 205 | scope=scope) 206 | 207 | # Save the embedding size in the graph. 208 | tf.constant(self.config.embedding_size, name="embedding_size") 209 | 210 | self.image_embeddings = image_embeddings 211 | 212 | def build_seq_embeddings(self): 213 | """Builds the input sequence embeddings. 214 | 215 | Inputs: 216 | self.input_seqs 217 | 218 | Outputs: 219 | self.seq_embeddings 220 | """ 221 | with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"): 222 | embedding_map = tf.get_variable( 223 | name="map", 224 | shape=[self.config.vocab_size, self.config.embedding_size], 225 | initializer=self.initializer) 226 | seq_embeddings = tf.nn.embedding_lookup(embedding_map, self.input_seqs) 227 | 228 | self.seq_embeddings = seq_embeddings 229 | 230 | def build_model(self): 231 | """Builds the model. 232 | 233 | Inputs: 234 | self.image_embeddings 235 | self.seq_embeddings 236 | self.target_seqs (training and eval only) 237 | self.input_mask (training and eval only) 238 | 239 | Outputs: 240 | self.total_loss (training and eval only) 241 | self.target_cross_entropy_losses (training and eval only) 242 | self.target_cross_entropy_loss_weights (training and eval only) 243 | """ 244 | # This LSTM cell has biases and outputs tanh(new_c) * sigmoid(o), but the 245 | # modified LSTM in the "Show and Tell" paper has no biases and outputs 246 | # new_c * sigmoid(o). 247 | lstm_cell = tf.contrib.rnn.BasicLSTMCell( 248 | num_units=self.config.num_lstm_units, state_is_tuple=True) 249 | if self.mode == "train": 250 | lstm_cell = tf.contrib.rnn.DropoutWrapper( 251 | lstm_cell, 252 | input_keep_prob=self.config.lstm_dropout_keep_prob, 253 | output_keep_prob=self.config.lstm_dropout_keep_prob) 254 | 255 | with tf.variable_scope("lstm", initializer=self.initializer) as lstm_scope: 256 | # Feed the image embeddings to set the initial LSTM state. 257 | zero_state = lstm_cell.zero_state( 258 | batch_size=self.image_embeddings.get_shape()[0], dtype=tf.float32) 259 | _, initial_state = lstm_cell(self.image_embeddings, zero_state) 260 | 261 | # Allow the LSTM variables to be reused. 262 | lstm_scope.reuse_variables() 263 | 264 | if self.mode == "inference": 265 | # In inference mode, use concatenated states for convenient feeding and 266 | # fetching. 267 | tf.concat(axis=1, values=initial_state, name="initial_state") 268 | 269 | # Placeholder for feeding a batch of concatenated states. 270 | state_feed = tf.placeholder(dtype=tf.float32, 271 | shape=[None, sum(lstm_cell.state_size)], 272 | name="state_feed") 273 | state_tuple = tf.split(value=state_feed, num_or_size_splits=2, axis=1) 274 | 275 | # Run a single LSTM step. 276 | lstm_outputs, state_tuple = lstm_cell( 277 | inputs=tf.squeeze(self.seq_embeddings, axis=[1]), 278 | state=state_tuple) 279 | 280 | # Concatentate the resulting state. 281 | tf.concat(axis=1, values=state_tuple, name="state") 282 | else: 283 | # Run the batch of sequence embeddings through the LSTM. 284 | sequence_length = tf.reduce_sum(self.input_mask, 1) 285 | lstm_outputs, _ = tf.nn.dynamic_rnn(cell=lstm_cell, 286 | inputs=self.seq_embeddings, 287 | sequence_length=sequence_length, 288 | initial_state=initial_state, 289 | dtype=tf.float32, 290 | scope=lstm_scope) 291 | 292 | # Stack batches vertically. 
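# Fold the [batch, time, num_units] LSTM outputs into a single
# [batch * time, num_units] matrix so that one fully connected layer can
# produce per-timestep vocabulary logits.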
293 | lstm_outputs = tf.reshape(lstm_outputs, [-1, lstm_cell.output_size]) 294 | 295 | with tf.variable_scope("logits") as logits_scope: 296 | logits = tf.contrib.layers.fully_connected( 297 | inputs=lstm_outputs, 298 | num_outputs=self.config.vocab_size, 299 | activation_fn=None, 300 | weights_initializer=self.initializer, 301 | scope=logits_scope) 302 | 303 | if self.mode == "inference": 304 | tf.nn.softmax(logits, name="softmax") 305 | else: 306 | targets = tf.reshape(self.target_seqs, [-1]) 307 | weights = tf.to_float(tf.reshape(self.input_mask, [-1])) 308 | 309 | # Compute losses. 310 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets, 311 | logits=logits) 312 | batch_loss = tf.div(tf.reduce_sum(tf.multiply(losses, weights)), 313 | tf.reduce_sum(weights), 314 | name="batch_loss") 315 | tf.losses.add_loss(batch_loss) 316 | total_loss = tf.losses.get_total_loss() 317 | 318 | # Add summaries. 319 | tf.summary.scalar("losses/batch_loss", batch_loss) 320 | tf.summary.scalar("losses/total_loss", total_loss) 321 | for var in tf.trainable_variables(): 322 | tf.summary.histogram("parameters/" + var.op.name, var) 323 | 324 | self.total_loss = total_loss 325 | self.target_cross_entropy_losses = losses # Used in evaluation. 326 | self.target_cross_entropy_loss_weights = weights # Used in evaluation. 327 | 328 | def setup_inception_initializer(self): 329 | """Sets up the function to restore inception variables from checkpoint.""" 330 | if self.mode != "inference": 331 | # Restore inception variables only. 332 | saver = tf.train.Saver(self.inception_variables) 333 | 334 | def restore_fn(sess): 335 | tf.logging.info("Restoring Inception variables from checkpoint file %s", 336 | self.config.inception_checkpoint_file) 337 | saver.restore(sess, self.config.inception_checkpoint_file) 338 | 339 | self.init_fn = restore_fn 340 | 341 | def setup_global_step(self): 342 | """Sets up the global step Tensor.""" 343 | global_step = tf.Variable( 344 | initial_value=0, 345 | name="global_step", 346 | trainable=False, 347 | collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES]) 348 | 349 | self.global_step = global_step 350 | 351 | def build(self): 352 | """Creates all ops for training and evaluation.""" 353 | self.build_inputs() 354 | self.build_image_embeddings() 355 | self.build_seq_embeddings() 356 | self.build_model() 357 | self.setup_inception_initializer() 358 | self.setup_global_step() 359 | -------------------------------------------------------------------------------- /Section 1/im2txt/show_and_tell_model_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Tests for tensorflow_models.im2txt.show_and_tell_model.""" 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | 23 | import numpy as np 24 | import tensorflow as tf 25 | 26 | from im2txt import configuration 27 | from im2txt import show_and_tell_model 28 | 29 | 30 | class ShowAndTellModel(show_and_tell_model.ShowAndTellModel): 31 | """Subclass of ShowAndTellModel without the disk I/O.""" 32 | 33 | def build_inputs(self): 34 | if self.mode == "inference": 35 | # Inference mode doesn't read from disk, so defer to parent. 36 | return super(ShowAndTellModel, self).build_inputs() 37 | else: 38 | # Replace disk I/O with random Tensors. 39 | self.images = tf.random_uniform( 40 | shape=[self.config.batch_size, self.config.image_height, 41 | self.config.image_width, 3], 42 | minval=-1, 43 | maxval=1) 44 | self.input_seqs = tf.random_uniform( 45 | [self.config.batch_size, 15], 46 | minval=0, 47 | maxval=self.config.vocab_size, 48 | dtype=tf.int64) 49 | self.target_seqs = tf.random_uniform( 50 | [self.config.batch_size, 15], 51 | minval=0, 52 | maxval=self.config.vocab_size, 53 | dtype=tf.int64) 54 | self.input_mask = tf.ones_like(self.input_seqs) 55 | 56 | 57 | class ShowAndTellModelTest(tf.test.TestCase): 58 | 59 | def setUp(self): 60 | super(ShowAndTellModelTest, self).setUp() 61 | self._model_config = configuration.ModelConfig() 62 | 63 | def _countModelParameters(self): 64 | """Counts the number of parameters in the model at top level scope.""" 65 | counter = {} 66 | for v in tf.global_variables(): 67 | name = v.op.name.split("/")[0] 68 | num_params = v.get_shape().num_elements() 69 | assert num_params 70 | counter[name] = counter.get(name, 0) + num_params 71 | return counter 72 | 73 | def _checkModelParameters(self): 74 | """Verifies the number of parameters in the model.""" 75 | param_counts = self._countModelParameters() 76 | expected_param_counts = { 77 | "InceptionV3": 21802784, 78 | # inception_output_size * embedding_size 79 | "image_embedding": 1048576, 80 | # vocab_size * embedding_size 81 | "seq_embedding": 6144000, 82 | # (embedding_size + num_lstm_units + 1) * 4 * num_lstm_units 83 | "lstm": 2099200, 84 | # (num_lstm_units + 1) * vocab_size 85 | "logits": 6156000, 86 | "global_step": 1, 87 | } 88 | self.assertDictEqual(expected_param_counts, param_counts) 89 | 90 | def _checkOutputs(self, expected_shapes, feed_dict=None): 91 | """Verifies that the model produces expected outputs. 92 | 93 | Args: 94 | expected_shapes: A dict mapping Tensor or Tensor name to expected output 95 | shape. 96 | feed_dict: Values of Tensors to feed into Session.run(). 97 | """ 98 | fetches = expected_shapes.keys() 99 | 100 | with self.test_session() as sess: 101 | sess.run(tf.global_variables_initializer()) 102 | outputs = sess.run(fetches, feed_dict) 103 | 104 | for index, output in enumerate(outputs): 105 | tensor = fetches[index] 106 | expected = expected_shapes[tensor] 107 | actual = output.shape 108 | if expected != actual: 109 | self.fail("Tensor %s has shape %s (expected %s)." 
% 110 | (tensor, actual, expected)) 111 | 112 | def testBuildForTraining(self): 113 | model = ShowAndTellModel(self._model_config, mode="train") 114 | model.build() 115 | 116 | self._checkModelParameters() 117 | 118 | expected_shapes = { 119 | # [batch_size, image_height, image_width, 3] 120 | model.images: (32, 299, 299, 3), 121 | # [batch_size, sequence_length] 122 | model.input_seqs: (32, 15), 123 | # [batch_size, sequence_length] 124 | model.target_seqs: (32, 15), 125 | # [batch_size, sequence_length] 126 | model.input_mask: (32, 15), 127 | # [batch_size, embedding_size] 128 | model.image_embeddings: (32, 512), 129 | # [batch_size, sequence_length, embedding_size] 130 | model.seq_embeddings: (32, 15, 512), 131 | # Scalar 132 | model.total_loss: (), 133 | # [batch_size * sequence_length] 134 | model.target_cross_entropy_losses: (480,), 135 | # [batch_size * sequence_length] 136 | model.target_cross_entropy_loss_weights: (480,), 137 | } 138 | self._checkOutputs(expected_shapes) 139 | 140 | def testBuildForEval(self): 141 | model = ShowAndTellModel(self._model_config, mode="eval") 142 | model.build() 143 | 144 | self._checkModelParameters() 145 | 146 | expected_shapes = { 147 | # [batch_size, image_height, image_width, 3] 148 | model.images: (32, 299, 299, 3), 149 | # [batch_size, sequence_length] 150 | model.input_seqs: (32, 15), 151 | # [batch_size, sequence_length] 152 | model.target_seqs: (32, 15), 153 | # [batch_size, sequence_length] 154 | model.input_mask: (32, 15), 155 | # [batch_size, embedding_size] 156 | model.image_embeddings: (32, 512), 157 | # [batch_size, sequence_length, embedding_size] 158 | model.seq_embeddings: (32, 15, 512), 159 | # Scalar 160 | model.total_loss: (), 161 | # [batch_size * sequence_length] 162 | model.target_cross_entropy_losses: (480,), 163 | # [batch_size * sequence_length] 164 | model.target_cross_entropy_loss_weights: (480,), 165 | } 166 | self._checkOutputs(expected_shapes) 167 | 168 | def testBuildForInference(self): 169 | model = ShowAndTellModel(self._model_config, mode="inference") 170 | model.build() 171 | 172 | self._checkModelParameters() 173 | 174 | # Test feeding an image to get the initial LSTM state. 175 | images_feed = np.random.rand(1, 299, 299, 3) 176 | feed_dict = {model.images: images_feed} 177 | expected_shapes = { 178 | # [batch_size, embedding_size] 179 | model.image_embeddings: (1, 512), 180 | # [batch_size, 2 * num_lstm_units] 181 | "lstm/initial_state:0": (1, 1024), 182 | } 183 | self._checkOutputs(expected_shapes, feed_dict) 184 | 185 | # Test feeding a batch of inputs and LSTM states to get softmax output and 186 | # LSTM states. 187 | input_feed = np.random.randint(0, 10, size=3) 188 | state_feed = np.random.rand(3, 1024) 189 | feed_dict = {"input_feed:0": input_feed, "lstm/state_feed:0": state_feed} 190 | expected_shapes = { 191 | # [batch_size, 2 * num_lstm_units] 192 | "lstm/state:0": (3, 1024), 193 | # [batch_size, vocab_size] 194 | "softmax:0": (3, 12000), 195 | } 196 | self._checkOutputs(expected_shapes, feed_dict) 197 | 198 | 199 | if __name__ == "__main__": 200 | tf.test.main() 201 | -------------------------------------------------------------------------------- /Section 1/im2txt/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Train the model.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | 22 | import tensorflow as tf 23 | 24 | from im2txt import configuration 25 | from im2txt import show_and_tell_model 26 | 27 | FLAGS = tf.app.flags.FLAGS 28 | 29 | tf.flags.DEFINE_string("input_file_pattern", "", 30 | "File pattern of sharded TFRecord input files.") 31 | tf.flags.DEFINE_string("inception_checkpoint_file", "", 32 | "Path to a pretrained inception_v3 model.") 33 | tf.flags.DEFINE_string("train_dir", "", 34 | "Directory for saving and loading model checkpoints.") 35 | tf.flags.DEFINE_boolean("train_inception", False, 36 | "Whether to train inception submodel variables.") 37 | tf.flags.DEFINE_integer("number_of_steps", 1000000, "Number of training steps.") 38 | tf.flags.DEFINE_integer("log_every_n_steps", 1, 39 | "Frequency at which loss and global step are logged.") 40 | 41 | tf.logging.set_verbosity(tf.logging.INFO) 42 | 43 | 44 | def main(unused_argv): 45 | assert FLAGS.input_file_pattern, "--input_file_pattern is required" 46 | assert FLAGS.train_dir, "--train_dir is required" 47 | 48 | model_config = configuration.ModelConfig() 49 | model_config.input_file_pattern = FLAGS.input_file_pattern 50 | model_config.inception_checkpoint_file = FLAGS.inception_checkpoint_file 51 | training_config = configuration.TrainingConfig() 52 | 53 | # Create training directory. 54 | train_dir = FLAGS.train_dir 55 | if not tf.gfile.IsDirectory(train_dir): 56 | tf.logging.info("Creating training directory: %s", train_dir) 57 | tf.gfile.MakeDirs(train_dir) 58 | 59 | # Build the TensorFlow graph. 60 | g = tf.Graph() 61 | with g.as_default(): 62 | # Build the model. 63 | model = show_and_tell_model.ShowAndTellModel( 64 | model_config, mode="train", train_inception=FLAGS.train_inception) 65 | model.build() 66 | 67 | # Set up the learning rate. 68 | learning_rate_decay_fn = None 69 | if FLAGS.train_inception: 70 | learning_rate = tf.constant(training_config.train_inception_learning_rate) 71 | else: 72 | learning_rate = tf.constant(training_config.initial_learning_rate) 73 | if training_config.learning_rate_decay_factor > 0: 74 | num_batches_per_epoch = (training_config.num_examples_per_epoch / 75 | model_config.batch_size) 76 | decay_steps = int(num_batches_per_epoch * 77 | training_config.num_epochs_per_decay) 78 | 79 | def _learning_rate_decay_fn(learning_rate, global_step): 80 | return tf.train.exponential_decay( 81 | learning_rate, 82 | global_step, 83 | decay_steps=decay_steps, 84 | decay_rate=training_config.learning_rate_decay_factor, 85 | staircase=True) 86 | 87 | learning_rate_decay_fn = _learning_rate_decay_fn 88 | 89 | # Set up the training ops. 
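# optimize_loss bundles gradient computation, gradient clipping and the
# (optionally decayed) learning rate into a single training op.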
90 | train_op = tf.contrib.layers.optimize_loss( 91 | loss=model.total_loss, 92 | global_step=model.global_step, 93 | learning_rate=learning_rate, 94 | optimizer=training_config.optimizer, 95 | clip_gradients=training_config.clip_gradients, 96 | learning_rate_decay_fn=learning_rate_decay_fn) 97 | 98 | # Set up the Saver for saving and restoring model checkpoints. 99 | saver = tf.train.Saver(max_to_keep=training_config.max_checkpoints_to_keep) 100 | 101 | # Run training. 102 | tf.contrib.slim.learning.train( 103 | train_op, 104 | train_dir, 105 | log_every_n_steps=FLAGS.log_every_n_steps, 106 | graph=g, 107 | global_step=model.global_step, 108 | number_of_steps=FLAGS.number_of_steps, 109 | init_fn=model.init_fn, 110 | saver=saver) 111 | 112 | 113 | if __name__ == "__main__": 114 | tf.app.run() 115 | -------------------------------------------------------------------------------- /Section 1/test_images/ballons.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/test_images/ballons.jpeg -------------------------------------------------------------------------------- /Section 1/test_images/bike.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/test_images/bike.jpeg -------------------------------------------------------------------------------- /Section 1/test_images/fireworks.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/test_images/fireworks.jpeg -------------------------------------------------------------------------------- /Section 1/test_images/football.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/test_images/football.jpeg -------------------------------------------------------------------------------- /Section 1/test_images/headphones.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/test_images/headphones.jpeg -------------------------------------------------------------------------------- /Section 1/test_images/laughing.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/test_images/laughing.jpeg -------------------------------------------------------------------------------- /Section 2/knn.p: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 2/knn.p -------------------------------------------------------------------------------- /Section 2/tests/Thumbs.db: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 2/tests/Thumbs.db -------------------------------------------------------------------------------- /Section 2/tests/p2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 2/tests/p2.jpg -------------------------------------------------------------------------------- /Section 2/tests/p5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 2/tests/p5.jpg -------------------------------------------------------------------------------- /Section 2/tests/p7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 2/tests/p7.jpg -------------------------------------------------------------------------------- /Section 2/tests/p9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 2/tests/p9.jpg -------------------------------------------------------------------------------- /Section 3/AUTHORS: -------------------------------------------------------------------------------- 1 | Eldar Insafutdinov, github.com/eldar 2 | Mikhaylo Andriluka, github.com/andriluka 3 | Mihai Fieraru, github.com/fierarufmihai -------------------------------------------------------------------------------- /Section 3/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pprint 3 | import logging 4 | 5 | import yaml 6 | from easydict import EasyDict as edict 7 | 8 | import default_config 9 | 10 | 11 | cfg = default_config.cfg 12 | 13 | 14 | def _merge_a_into_b(a, b): 15 | """Merge config dictionary a into config dictionary b, clobbering the 16 | options in b whenever they are also specified in a. 17 | """ 18 | if type(a) is not edict: 19 | return 20 | 21 | for k, v in a.items(): 22 | # a must specify keys that are in b 23 | #if k not in b: 24 | # raise KeyError('{} is not a valid config key'.format(k)) 25 | 26 | # recursively merge dicts 27 | if type(v) is edict: 28 | try: 29 | _merge_a_into_b(a[k], b[k]) 30 | except: 31 | print('Error under config key: {}'.format(k)) 32 | raise 33 | else: 34 | b[k] = v 35 | 36 | 37 | def cfg_from_file(filename): 38 | """Load a config from file filename and merge it into the default options. 
39 | """ 40 | with open(filename, 'r') as f: 41 | yaml_cfg = edict(yaml.load(f)) 42 | 43 | _merge_a_into_b(yaml_cfg, cfg) 44 | 45 | logging.info("Config:\n"+pprint.pformat(cfg)) 46 | return cfg 47 | 48 | 49 | def load_config(filename = "pose_cfg.yaml"): 50 | if 'POSE_PARAM_PATH' in os.environ: 51 | filename = os.environ['POSE_PARAM_PATH'] + '/' + filename 52 | return cfg_from_file(filename) 53 | 54 | 55 | if __name__ == "__main__": 56 | print(load_config()) -------------------------------------------------------------------------------- /Section 3/dlib_face_recognition_resnet_model_v1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/dlib_face_recognition_resnet_model_v1.dat -------------------------------------------------------------------------------- /Section 3/models/coco/coco-resnet-101.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/coco-resnet-101.meta -------------------------------------------------------------------------------- /Section 3/models/coco/download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/coco-resnet-101.data-00000-of-00001 4 | curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/coco-resnet-101.meta 5 | curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/coco-resnet-101.index 6 | 7 | curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/pairwise_coco.tar.gz 8 | tar xvzf pairwise_coco.tar.gz 9 | -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/pairwise_stats.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/pairwise_stats.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_10_12.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_10_12.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_10_16.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_10_16.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_10_17.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_10_17.mat 
-------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_11_12.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_11_12.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_11_14.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_11_14.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_11_15.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_11_15.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_11_16.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_11_16.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_12_13.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_12_13.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_12_14.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_12_14.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_12_15.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_12_15.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_12_17.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_12_17.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_13_14.mat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_13_14.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_13_16.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_13_16.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_13_17.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_13_17.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_14_15.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_14_15.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_14_16.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_14_16.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_14_17.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_14_17.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_15_16.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_15_16.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_16_17.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_16_17.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_1_10.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_10.mat -------------------------------------------------------------------------------- /Section 
3/models/coco/pairwise/spatial_model_cidx_1_11.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_11.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_1_12.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_12.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_1_14.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_14.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_1_15.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_15.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_1_17.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_17.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_1_3.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_3.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_1_4.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_4.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_1_8.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_8.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_1_9.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 
3/models/coco/pairwise/spatial_model_cidx_1_9.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_2_10.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_10.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_2_11.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_11.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_2_13.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_13.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_2_16.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_16.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_2_3.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_3.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_2_5.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_5.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_2_7.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_7.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_2_9.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_9.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_3_11.mat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_11.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_3_13.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_13.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_3_14.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_14.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_3_16.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_16.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_3_4.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_4.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_3_5.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_5.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_3_7.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_7.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_3_8.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_8.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_4_10.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_10.mat -------------------------------------------------------------------------------- /Section 
3/models/coco/pairwise/spatial_model_cidx_4_12.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_12.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_4_14.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_14.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_4_15.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_15.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_4_16.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_16.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_4_6.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_6.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_4_7.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_7.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_4_9.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_9.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_5_10.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_5_10.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_5_11.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 
3/models/coco/pairwise/spatial_model_cidx_5_11.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_5_12.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_5_12.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_5_14.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_5_14.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_5_6.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_5_6.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_5_8.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_5_8.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_5_9.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_5_9.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_6_10.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_10.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_6_13.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_13.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_6_14.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_14.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_6_15.mat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_15.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_6_16.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_16.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_6_17.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_17.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_6_8.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_8.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_6_9.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_9.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_7_10.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_7_10.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_7_11.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_7_11.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_7_13.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_7_13.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_7_15.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_7_15.mat -------------------------------------------------------------------------------- /Section 
3/models/coco/pairwise/spatial_model_cidx_7_17.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_7_17.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_8_10.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_8_10.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_8_12.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_8_12.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_8_13.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_8_13.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_8_14.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_8_14.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_8_15.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_8_15.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_8_17.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_8_17.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_9_14.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_9_14.mat -------------------------------------------------------------------------------- /Section 3/models/coco/pairwise/spatial_model_cidx_9_16.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 
3/models/coco/pairwise/spatial_model_cidx_9_16.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_9_17.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_9_17.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise_coco.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise_coco.tar.gz
--------------------------------------------------------------------------------
/Section 3/models/coco/train/pose_cfg.yaml:
--------------------------------------------------------------------------------
dataset: /path/to/coco/dataset
dataset_phase: train2014
dataset_ann: person_keypoints  # 'image_info' or 'person_keypoints'
dataset_type: coco
coco_only_images_with_people: true

global_scale: 1.0
pos_dist_thresh: 17
scale_jitter_lo: 0.85
scale_jitter_up: 1.15

net_type: resnet_101
init_weights: ../../pretrained/resnet_v1_101.ckpt

location_refinement: true
locref_huber_loss: true
locref_loss_weight: 0.05
locref_stdev: 7.2801

pairwise_predict: true
pairwise_huber_loss: true
pairwise_loss_weight: 0.05
pairwise_stats_fn: ../pairwise/pairwise_stats.mat

intermediate_supervision: true
intermediate_supervision_layer: 12

max_input_size: 850
multi_step:
- [0.005, 10000]
- [0.02, 450000]
- [0.002, 750000]
- [0.0005, 1050000]
- [0.0002, 1550000]
- [0.00005, 1800000]
display_iters: 20
save_iters: 200000

mirror: true
--------------------------------------------------------------------------------
/Section 3/models/mpii/download_models.sh:
--------------------------------------------------------------------------------
#!/bin/sh

curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/mpii-single-resnet-101.data-00000-of-00001
curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/mpii-single-resnet-101.meta
curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/mpii-single-resnet-101.index
--------------------------------------------------------------------------------
/Section 3/models/mpii/mpii-single-resnet-101.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/mpii/mpii-single-resnet-101.index
--------------------------------------------------------------------------------
/Section 3/models/mpii/mpii-single-resnet-101.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/mpii/mpii-single-resnet-101.meta
--------------------------------------------------------------------------------
/Section 3/models/mpii/test/pose_cfg.yaml:
--------------------------------------------------------------------------------
dataset: dataset-test.mat
dataset_type: "mpii"
global_scale: 1.0
init_weights: models/mpii/snapshot-1030000
location_refinement: true
locref_stdev: 7.2801
net_type: resnet_101
scoremap_dir: test
--------------------------------------------------------------------------------
/Section 3/models/mpii/train/pose_cfg.yaml:
--------------------------------------------------------------------------------
dataset: /path/to/dataset.mat
dataset_type: "mpii"

pos_dist_thresh: 17
global_scale: 0.8452830189
scale_jitter_lo: 0.85
scale_jitter_up: 1.15

net_type: resnet_101
init_weights: ../../pretrained/resnet_v1_101.ckpt

location_refinement: true
locref_huber_loss: true
locref_loss_weight: 0.05
locref_stdev: 7.2801

intermediate_supervision: true
intermediate_supervision_layer: 12

max_input_size: 850
multi_step:
- [0.005, 10000]
- [0.02, 430000]
- [0.002, 730000]
- [0.001, 1030000]
display_iters: 20
save_iters: 60000

mirror: true
--------------------------------------------------------------------------------
/Section 3/models/pretrained/download.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# Listed here https://github.com/tensorflow/models/tree/master/slim#pre-trained-models

curl http://download.tensorflow.org/models/resnet_v1_50_2016_08_28.tar.gz | tar xvz
curl http://download.tensorflow.org/models/resnet_v1_101_2016_08_28.tar.gz | tar xvz
--------------------------------------------------------------------------------
/Section 3/pexels-photo-712521.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/pexels-photo-712521.jpeg
--------------------------------------------------------------------------------
/Section 3/pexels-photo-776615.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/pexels-photo-776615.jpeg
--------------------------------------------------------------------------------
/Section 3/testcases/vids/boy_walking.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/testcases/vids/boy_walking.mp4
--------------------------------------------------------------------------------
/Section 3/testcases/vids/sidewalk.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/testcases/vids/sidewalk.mp4
--------------------------------------------------------------------------------
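The config loader in Section 3/config.py and the pose_cfg.yaml files listed above are meant to be used together: load_config() resolves a YAML file (optionally via the POSE_PARAM_PATH environment variable) and cfg_from_file() merges it into the library defaults. The snippet below is a minimal, illustrative sketch of that flow, not a file from this repository; it assumes the working directory is Section 3, that the default_config module imported by config.py is importable (it is not part of this listing), and that PyYAML and easydict are installed.

```python
# Illustrative usage sketch (not a repository file). Assumes the working
# directory is "Section 3", where config.py and the models/ tree live.
import os

from config import cfg_from_file, load_config  # loader shown in Section 3/config.py

# Option 1: point POSE_PARAM_PATH at the directory that holds pose_cfg.yaml
# and let load_config() find it.
os.environ["POSE_PARAM_PATH"] = "models/mpii/test"
cfg = load_config()  # merges models/mpii/test/pose_cfg.yaml into the defaults

# Option 2: merge a specific config file directly.
cfg = cfg_from_file("models/coco/train/pose_cfg.yaml")

# The merged config is an EasyDict, so options set in the YAML are
# reachable as attributes.
print(cfg.net_type)      # resnet_101
print(cfg.global_scale)  # 1.0 for the coco train config
```

Because _merge_a_into_b() overwrites the defaults key by key, each pose_cfg.yaml only needs to list the options that differ from default_config; everything else keeps its default value.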