├── LICENSE
├── README.md
├── Section 1
├── Section_1-Tensorflow_Image_Captioning.ipynb
├── im2txt
│ ├── configuration.py
│ ├── data
│ │ ├── build_mscoco_data.py
│ │ └── download_and_preprocess_mscoco.sh
│ ├── evaluate.py
│ ├── inference_utils
│ │ └── __pycache__
│ │ │ ├── caption_generator.cpython-36.pyc
│ │ │ ├── inference_wrapper_base.cpython-36.pyc
│ │ │ └── vocabulary.cpython-36.pyc
│ ├── inference_wrapper.py
│ ├── ops
│ │ ├── __pycache__
│ │ │ ├── image_embedding.cpython-36.pyc
│ │ │ ├── image_processing.cpython-36.pyc
│ │ │ └── inputs.cpython-36.pyc
│ │ ├── image_embedding.py
│ │ ├── image_embedding_test.py
│ │ ├── image_processing.py
│ │ └── inputs.py
│ ├── show_and_tell_model.py
│ ├── show_and_tell_model_test.py
│ └── train.py
└── test_images
│ ├── ballons.jpeg
│ ├── bike.jpeg
│ ├── fireworks.jpeg
│ ├── football.jpeg
│ ├── headphones.jpeg
│ └── laughing.jpeg
├── Section 2
├── Packt_CV_w_Py3_adv_projects_Section2_License_plate_recognition.ipynb
├── knn.p
└── tests
│ ├── Thumbs.db
│ ├── p2.jpg
│ ├── p5.jpg
│ ├── p7.jpg
│ └── p9.jpg
└── Section 3
├── AUTHORS
├── config.py
├── dlib_face_recognition_resnet_model_v1.dat
├── models
├── coco
│ ├── coco-resnet-101.meta
│ ├── download_models.sh
│ ├── pairwise
│ │ ├── pairwise_stats.mat
│ │ ├── spatial_model_cidx_10_12.mat
│ │ ├── spatial_model_cidx_10_16.mat
│ │ ├── spatial_model_cidx_10_17.mat
│ │ ├── spatial_model_cidx_11_12.mat
│ │ ├── spatial_model_cidx_11_14.mat
│ │ ├── spatial_model_cidx_11_15.mat
│ │ ├── spatial_model_cidx_11_16.mat
│ │ ├── spatial_model_cidx_12_13.mat
│ │ ├── spatial_model_cidx_12_14.mat
│ │ ├── spatial_model_cidx_12_15.mat
│ │ ├── spatial_model_cidx_12_17.mat
│ │ ├── spatial_model_cidx_13_14.mat
│ │ ├── spatial_model_cidx_13_16.mat
│ │ ├── spatial_model_cidx_13_17.mat
│ │ ├── spatial_model_cidx_14_15.mat
│ │ ├── spatial_model_cidx_14_16.mat
│ │ ├── spatial_model_cidx_14_17.mat
│ │ ├── spatial_model_cidx_15_16.mat
│ │ ├── spatial_model_cidx_16_17.mat
│ │ ├── spatial_model_cidx_1_10.mat
│ │ ├── spatial_model_cidx_1_11.mat
│ │ ├── spatial_model_cidx_1_12.mat
│ │ ├── spatial_model_cidx_1_14.mat
│ │ ├── spatial_model_cidx_1_15.mat
│ │ ├── spatial_model_cidx_1_17.mat
│ │ ├── spatial_model_cidx_1_3.mat
│ │ ├── spatial_model_cidx_1_4.mat
│ │ ├── spatial_model_cidx_1_8.mat
│ │ ├── spatial_model_cidx_1_9.mat
│ │ ├── spatial_model_cidx_2_10.mat
│ │ ├── spatial_model_cidx_2_11.mat
│ │ ├── spatial_model_cidx_2_13.mat
│ │ ├── spatial_model_cidx_2_16.mat
│ │ ├── spatial_model_cidx_2_3.mat
│ │ ├── spatial_model_cidx_2_5.mat
│ │ ├── spatial_model_cidx_2_7.mat
│ │ ├── spatial_model_cidx_2_9.mat
│ │ ├── spatial_model_cidx_3_11.mat
│ │ ├── spatial_model_cidx_3_13.mat
│ │ ├── spatial_model_cidx_3_14.mat
│ │ ├── spatial_model_cidx_3_16.mat
│ │ ├── spatial_model_cidx_3_4.mat
│ │ ├── spatial_model_cidx_3_5.mat
│ │ ├── spatial_model_cidx_3_7.mat
│ │ ├── spatial_model_cidx_3_8.mat
│ │ ├── spatial_model_cidx_4_10.mat
│ │ ├── spatial_model_cidx_4_12.mat
│ │ ├── spatial_model_cidx_4_14.mat
│ │ ├── spatial_model_cidx_4_15.mat
│ │ ├── spatial_model_cidx_4_16.mat
│ │ ├── spatial_model_cidx_4_6.mat
│ │ ├── spatial_model_cidx_4_7.mat
│ │ ├── spatial_model_cidx_4_9.mat
│ │ ├── spatial_model_cidx_5_10.mat
│ │ ├── spatial_model_cidx_5_11.mat
│ │ ├── spatial_model_cidx_5_12.mat
│ │ ├── spatial_model_cidx_5_14.mat
│ │ ├── spatial_model_cidx_5_6.mat
│ │ ├── spatial_model_cidx_5_8.mat
│ │ ├── spatial_model_cidx_5_9.mat
│ │ ├── spatial_model_cidx_6_10.mat
│ │ ├── spatial_model_cidx_6_13.mat
│ │ ├── spatial_model_cidx_6_14.mat
│ │ ├── spatial_model_cidx_6_15.mat
│ │ ├── spatial_model_cidx_6_16.mat
│ │ ├── spatial_model_cidx_6_17.mat
│ │ ├── spatial_model_cidx_6_8.mat
│ │ ├── spatial_model_cidx_6_9.mat
│ │ ├── spatial_model_cidx_7_10.mat
│ │ ├── spatial_model_cidx_7_11.mat
│ │ ├── spatial_model_cidx_7_13.mat
│ │ ├── spatial_model_cidx_7_15.mat
│ │ ├── spatial_model_cidx_7_17.mat
│ │ ├── spatial_model_cidx_8_10.mat
│ │ ├── spatial_model_cidx_8_12.mat
│ │ ├── spatial_model_cidx_8_13.mat
│ │ ├── spatial_model_cidx_8_14.mat
│ │ ├── spatial_model_cidx_8_15.mat
│ │ ├── spatial_model_cidx_8_17.mat
│ │ ├── spatial_model_cidx_9_14.mat
│ │ ├── spatial_model_cidx_9_16.mat
│ │ └── spatial_model_cidx_9_17.mat
│ ├── pairwise_coco.tar.gz
│ └── train
│ │ └── pose_cfg.yaml
├── mpii
│ ├── download_models.sh
│ ├── mpii-single-resnet-101.index
│ ├── mpii-single-resnet-101.meta
│ ├── test
│ │ └── pose_cfg.yaml
│ └── train
│ │ └── pose_cfg.yaml
└── pretrained
│ └── download.sh
├── pexels-photo-712521.jpeg
├── pexels-photo-776615.jpeg
└── testcases
└── vids
├── boy_walking.mp4
└── sidewalk.mp4
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Packt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Advanced Computer Vision Projects [Video]
2 | This is the code repository for [Advanced Computer Vision Projects [Video]](https://www.packtpub.com/big-data-and-business-intelligence/advanced-computer-vision-projects-video?utm_source=github&utm_medium=repository&utm_campaign=9781788620772), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the video course from start to finish.
3 | ## About the Video Course
4 | Python’s wealth of powerful packages, along with its clear syntax, makes state-of-the-art computer vision and machine learning accessible to developers from a variety of backgrounds. This video course will equip you with the tools and skills to utilize the latest and greatest algorithms in computer vision, building applications that weren’t possible until recent years.
5 |
6 | In this course, you’ll continue to use TensorFlow and extend it to generate full captions from images. Later, you’ll see how to read license plate text from real-world images using Google’s Tesseract software. Finally, you’ll see how to track human body poses using “DeeperCut” within TensorFlow.
7 |
8 | By the end of this course, you’ll have developed an application that can estimate human poses within images, and you’ll be ready to tackle real-world computer vision problems using machine learning best practices.
9 |
10 |
11 | ## What You Will Learn
12 |
13 | - Apply LSTMs to automated image captioning
14 | - Know how to read text from real-world images
15 | - See how to extract human pose data from images
16 | - Understand the TensorFlow workflow model
17 |
18 |
19 | ## Instructions and Navigation
20 | ### Assumed Knowledge
21 | To fully benefit from the coverage included in this course, you will need prior experience with Python.
22 | This video course is for Python developers who wish to learn the latest cutting-edge algorithms for solving computer vision problems that were impossible until recently.
23 | ### Technical Requirements
24 | This course has the following software requirements: Python 3 with Jupyter Notebook and TensorFlow, plus the section-specific dependencies introduced in the videos (such as Google’s Tesseract OCR).
25 |
26 |
27 | This course has been tested on the following system configuration: ● OS: Windows 10 ● Processor: Intel i7 4th generation mobile ● Memory: 32 GB ● Hard Disk Space: 1 TB ● Video Card: GeForce GTX 970M
28 |
29 | ## Related Products
30 | * [Computer Vision Projects with Python 3 [Video]](https://www.packtpub.com/big-data-and-business-intelligence/computer-vision-projects-python-3-video?utm_source=github&utm_medium=repository&utm_campaign=9781788835565)
31 |
32 | * [Real-World Machine Learning Projects with Scikit-Learn [Video]](https://www.packtpub.com/big-data-and-business-intelligence/real-world-machine-learning-projects-scikit-learn-video?utm_source=github&utm_medium=repository&utm_campaign=9781789131222)
33 |
34 | * [Java Machine Learning for Computer Vision [Video]](https://www.packtpub.com/big-data-and-business-intelligence/java-machine-learning-computer-vision-video?utm_source=github&utm_medium=repository&utm_campaign=9781789130652)
35 |
36 |
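37 |
38 | ### Example: Reading Text with Tesseract
39 | Section 2 extracts license plate text with Google’s Tesseract. The snippet below is only a minimal sketch of that kind of OCR call; it assumes you have installed the Tesseract engine plus the `pytesseract` and `opencv-python` packages, and `plate.jpg` is a placeholder file name. The Section 2 notebook contains the actual pipeline.
40 |
41 | ```python
42 | import cv2
43 | import pytesseract
44 |
45 | # Load a cropped plate image and convert it to grayscale for more reliable OCR.
46 | image = cv2.imread("plate.jpg")
47 | gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
48 |
49 | # Otsu thresholding separates the characters from the background.
50 | _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
51 |
52 | # --psm 7 tells Tesseract to treat the image as a single line of text.
53 | text = pytesseract.image_to_string(binary, config="--psm 7")
54 | print("Detected text:", text.strip())
55 | ```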
--------------------------------------------------------------------------------
/Section 1/Section_1-Tensorflow_Image_Captioning.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Section One – Image Captioning with Tensorflow"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": null,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# load essential libraries\n",
17 | "import math\n",
18 | "import os\n",
19 | "\n",
20 | "import tensorflow as tf\n",
21 | "\n",
22 | "%pylab inline"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "# load Tensorflow/Google Brain base code\n",
32 | "# https://github.com/tensorflow/models/tree/master/research/im2txt\n",
33 | "\n",
34 | "from im2txt import configuration\n",
35 | "from im2txt import inference_wrapper\n",
36 | "from im2txt.inference_utils import caption_generator\n",
37 | "from im2txt.inference_utils import vocabulary"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "# tell our function where to find the trained model and vocabulary\n",
47 | "checkpoint_path = './model'\n",
48 | "vocab_file = './model/word_counts.txt'"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": null,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "# this is the function we'll call to produce our captions \n",
58 | "# given input file name(s) -- separate file names by a ,\n",
59 | "# if more than one\n",
60 | "\n",
61 | "def gen_caption(input_files):\n",
62 | " # only print serious log messages\n",
63 | " tf.logging.set_verbosity(tf.logging.FATAL)\n",
64 | " # load our pretrained model\n",
65 | " g = tf.Graph()\n",
66 | " with g.as_default():\n",
67 | " model = inference_wrapper.InferenceWrapper()\n",
68 | " restore_fn = model.build_graph_from_config(configuration.ModelConfig(),\n",
69 | " checkpoint_path)\n",
70 | " g.finalize()\n",
71 | "\n",
72 | " # Create the vocabulary.\n",
73 | " vocab = vocabulary.Vocabulary(vocab_file)\n",
74 | "\n",
75 | " filenames = []\n",
76 | " for file_pattern in input_files.split(\",\"):\n",
77 | " filenames.extend(tf.gfile.Glob(file_pattern))\n",
78 | " tf.logging.info(\"Running caption generation on %d files matching %s\",\n",
79 | " len(filenames), input_files)\n",
80 | "\n",
81 | " with tf.Session(graph=g) as sess:\n",
82 | " # Load the model from checkpoint.\n",
83 | " restore_fn(sess)\n",
84 | "\n",
85 | " # Prepare the caption generator. Here we are implicitly using the default\n",
86 | " # beam search parameters. See caption_generator.py for a description of the\n",
87 | " # available beam search parameters.\n",
88 | " generator = caption_generator.CaptionGenerator(model, vocab)\n",
89 | " \n",
90 | " captionlist = []\n",
91 | "\n",
92 | " for filename in filenames:\n",
93 | " with tf.gfile.GFile(filename, \"rb\") as f:\n",
94 | " image = f.read()\n",
95 | " captions = generator.beam_search(sess, image)\n",
96 | " print(\"Captions for image %s:\" % os.path.basename(filename))\n",
97 | " for i, caption in enumerate(captions):\n",
98 | " # Ignore begin and end words.\n",
99 | " sentence = [vocab.id_to_word(w) for w in caption.sentence[1:-1]]\n",
100 | " sentence = \" \".join(sentence)\n",
101 | " print(\" %d) %s (p=%f)\" % (i, sentence, math.exp(caption.logprob)))\n",
102 | " captionlist.append(sentence)\n",
103 | " return captionlist"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "testfile = 'test_images/ballons.jpeg'\n",
113 | "\n",
114 | "figure()\n",
115 | "imshow(imread(testfile))\n",
116 | "\n",
117 | "capts = gen_caption(testfile)"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": null,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "input_files = 'test_images/ballons.jpeg,test_images/bike.jpeg,test_images/dog.jpeg,test_images/fireworks.jpeg,test_images/football.jpeg,test_images/giraffes.jpeg,test_images/headphones.jpeg,test_images/laughing.jpeg,test_images/objects.jpeg,test_images/snowboard.jpeg,test_images/surfing.jpeg'\n",
127 | "\n",
128 | "capts = gen_caption(input_files)"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": null,
134 | "metadata": {},
135 | "outputs": [],
136 | "source": []
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "\n",
143 | "\n",
144 | "\n",
145 | "\n",
146 | "Retraining the image captioner"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 1,
152 | "metadata": {},
153 | "outputs": [
154 | {
155 | "data": {
156 | "text/plain": [
157 | "True"
158 | ]
159 | },
160 | "execution_count": 1,
161 | "metadata": {},
162 | "output_type": "execute_result"
163 | }
164 | ],
165 | "source": [
166 | "# First download pretrained Inception (v3) model\n",
167 | "\n",
168 | "import webbrowser \n",
169 | "webbrowser.open(\"http://download.tensorflow.org/models/inception_v3_2016_08_28.tar.gz\")\n",
170 | "\n",
171 | "# Completely unzip tar.gz file to get inception_v3.ckpt,\n",
172 | "# --recommend storing in im2txt/data directory"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": null,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "# Now gather and prepare the mscoco data\n",
182 | "\n",
183 | "# Comment out cd magic command if already in data directory\n",
184 | "%cd im2txt/data\n",
185 | "# This command will take an hour or more to run typically.\n",
186 | "# Note, you will need a lot of HD space (>100 GB)!\n",
187 | "%run build_mscoco_data.py\n",
188 | "\n",
189 | "# At this point you have files in im2txt/data/mscoco/raw-data that you can train\n",
190 | "# on, or you can substitute your own data\n",
191 | "\n",
192 | "%cd .."
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 2,
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "# load needed modules\n",
202 | "\n",
203 | "import tensorflow as tf\n",
204 | "\n",
205 | "from im2txt import configuration\n",
206 | "from im2txt import show_and_tell_model"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 3,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "# Define (but don't run yet) our captioning training function\n",
216 | "def train():\n",
217 | " model_config = configuration.ModelConfig()\n",
218 | " model_config.input_file_pattern = input_file_pattern\n",
219 | " model_config.inception_checkpoint_file = inception_checkpoint_file\n",
220 | " training_config = configuration.TrainingConfig()\n",
221 | "\n",
222 | " # Create training directory.\n",
223 | " train_dir = train_dir\n",
224 | " if not tf.gfile.IsDirectory(train_dir):\n",
225 | " tf.logging.info(\"Creating training directory: %s\", train_dir)\n",
226 | " tf.gfile.MakeDirs(train_dir)\n",
227 | "\n",
228 | " # Build the TensorFlow graph.\n",
229 | " g = tf.Graph()\n",
230 | " with g.as_default():\n",
231 | " # Build the model.\n",
232 | " model = show_and_tell_model.ShowAndTellModel(\n",
233 | " model_config, mode=\"train\", train_inception=train_inception)\n",
234 | " model.build()\n",
235 | "\n",
236 | " # Set up the learning rate.\n",
237 | " learning_rate_decay_fn = None\n",
238 | " if train_inception:\n",
239 | " learning_rate = tf.constant(training_config.train_inception_learning_rate)\n",
240 | " else:\n",
241 | " learning_rate = tf.constant(training_config.initial_learning_rate)\n",
242 | " if training_config.learning_rate_decay_factor > 0:\n",
243 | " num_batches_per_epoch = (training_config.num_examples_per_epoch /\n",
244 | " model_config.batch_size)\n",
245 | " decay_steps = int(num_batches_per_epoch *\n",
246 | " training_config.num_epochs_per_decay)\n",
247 | "\n",
248 | " def _learning_rate_decay_fn(learning_rate, global_step):\n",
249 | " return tf.train.exponential_decay(\n",
250 | " learning_rate,\n",
251 | " global_step,\n",
252 | " decay_steps=decay_steps,\n",
253 | " decay_rate=training_config.learning_rate_decay_factor,\n",
254 | " staircase=True)\n",
255 | "\n",
256 | " learning_rate_decay_fn = _learning_rate_decay_fn\n",
257 | "\n",
258 | " # Set up the training ops.\n",
259 | " train_op = tf.contrib.layers.optimize_loss(\n",
260 | " loss=model.total_loss,\n",
261 | " global_step=model.global_step,\n",
262 | " learning_rate=learning_rate,\n",
263 | " optimizer=training_config.optimizer,\n",
264 | " clip_gradients=training_config.clip_gradients,\n",
265 | " learning_rate_decay_fn=learning_rate_decay_fn)\n",
266 | "\n",
267 | " # Set up the Saver for saving and restoring model checkpoints.\n",
268 | " saver = tf.train.Saver(max_to_keep=training_config.max_checkpoints_to_keep)\n",
269 | "\n",
270 | " # Run training.\n",
271 | " tf.contrib.slim.learning.train(\n",
272 | " train_op,\n",
273 | " train_dir,\n",
274 | " log_every_n_steps=log_every_n_steps,\n",
275 | " graph=g,\n",
276 | " global_step=model.global_step,\n",
277 | " number_of_steps=number_of_steps,\n",
278 | " init_fn=model.init_fn,\n",
279 | " saver=saver)\n",
280 | "\n"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "metadata": {},
287 | "outputs": [],
288 | "source": [
289 | "# Initial training\n",
290 | "input_file_pattern = 'im2txt/data/mscoco/train-?????-of-00256'\n",
291 | "\n",
292 | "# change these if you put your stuff somewhere else\n",
293 | "inception_checkpoint_file = 'im2txt/data/inception_v3.ckpt'\n",
294 | "train_dir = 'im2txt/model'\n",
295 | "\n",
296 | "# Don't train inception for initial run\n",
297 | "train_inception = False\n",
298 | "number_of_steps = 1000000\n",
299 | "log_every_n_steps = 1\n",
300 | "\n",
301 | "# Now run the training (warning: takes days-to-weeks!!!)\n",
302 | "train()"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": null,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "# Fine tuning\n",
312 | "input_file_pattern = 'im2txt/data/mscoco/train-?????-of-00256'\n",
313 | "\n",
314 | "# change these if you put your stuff somewhere else\n",
315 | "inception_checkpoint_file = 'im2txt/data/inception_v3.ckpt'\n",
316 | "train_dir = 'im2txt/model'\n",
317 | "\n",
318 | "# This will refine our results\n",
319 | "train_inception = True\n",
320 | "number_of_steps = 3000000\n",
321 | "log_every_n_steps = 1\n",
322 | "\n",
323 | "# Now run the training (warning: takes even longer than initial training!!!)\n",
324 | "train()"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": null,
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "# If you completed this, you can go back to the start of this notebook and \n",
334 | "# point checkpoint_path and vocab_file to your generated files."
335 | ]
336 | }
337 | ],
338 | "metadata": {
339 | "kernelspec": {
340 | "display_name": "Python 3",
341 | "language": "python",
342 | "name": "python3"
343 | },
344 | "language_info": {
345 | "codemirror_mode": {
346 | "name": "ipython",
347 | "version": 3
348 | },
349 | "file_extension": ".py",
350 | "mimetype": "text/x-python",
351 | "name": "python",
352 | "nbconvert_exporter": "python",
353 | "pygments_lexer": "ipython3",
354 | "version": "3.6.5"
355 | }
356 | },
357 | "nbformat": 4,
358 | "nbformat_minor": 2
359 | }
360 |
--------------------------------------------------------------------------------
/Section 1/im2txt/configuration.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Image-to-text model and training configurations."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 |
23 | class ModelConfig(object):
24 | """Wrapper class for model hyperparameters."""
25 |
26 | def __init__(self):
27 | """Sets the default model hyperparameters."""
28 | # File pattern of sharded TFRecord file containing SequenceExample protos.
29 | # Must be provided in training and evaluation modes.
30 | self.input_file_pattern = None
31 |
32 | # Image format ("jpeg" or "png").
33 | self.image_format = "jpeg"
34 |
35 | # Approximate number of values per input shard. Used to ensure sufficient
36 | # mixing between shards in training.
37 | self.values_per_input_shard = 2300
38 | # Minimum number of shards to keep in the input queue.
39 | self.input_queue_capacity_factor = 2
40 | # Number of threads for prefetching SequenceExample protos.
41 | self.num_input_reader_threads = 1
42 |
43 | # Name of the SequenceExample context feature containing image data.
44 | self.image_feature_name = "image/data"
45 | # Name of the SequenceExample feature list containing integer captions.
46 | self.caption_feature_name = "image/caption_ids"
47 |
48 | # Number of unique words in the vocab (plus 1, for <UNK>).
49 | # The default value is larger than the expected actual vocab size to allow
50 | # for differences between tokenizer versions used in preprocessing. There is
51 | # no harm in using a value greater than the actual vocab size, but using a
52 | # value less than the actual vocab size will result in an error.
53 | self.vocab_size = 12000
54 |
55 | # Number of threads for image preprocessing. Should be a multiple of 2.
56 | self.num_preprocess_threads = 4
57 |
58 | # Batch size.
59 | self.batch_size = 32
60 |
61 | # File containing an Inception v3 checkpoint to initialize the variables
62 | # of the Inception model. Must be provided when starting training for the
63 | # first time.
64 | self.inception_checkpoint_file = None
65 |
66 | # Dimensions of Inception v3 input images.
67 | self.image_height = 299
68 | self.image_width = 299
69 |
70 | # Scale used to initialize model variables.
71 | self.initializer_scale = 0.08
72 |
73 | # LSTM input and output dimensionality, respectively.
74 | self.embedding_size = 512
75 | self.num_lstm_units = 512
76 |
77 | # If < 1.0, the dropout keep probability applied to LSTM variables.
78 | self.lstm_dropout_keep_prob = 0.7
79 |
80 |
81 | class TrainingConfig(object):
82 | """Wrapper class for training hyperparameters."""
83 |
84 | def __init__(self):
85 | """Sets the default training hyperparameters."""
86 | # Number of examples per epoch of training data.
87 | self.num_examples_per_epoch = 586363
88 |
89 | # Optimizer for training the model.
90 | self.optimizer = "SGD"
91 |
92 | # Learning rate for the initial phase of training.
93 | self.initial_learning_rate = 2.0
94 | self.learning_rate_decay_factor = 0.5
95 | self.num_epochs_per_decay = 8.0
96 |
97 | # Learning rate when fine tuning the Inception v3 parameters.
98 | self.train_inception_learning_rate = 0.0005
99 |
100 | # If not None, clip gradients to this value.
101 | self.clip_gradients = 5.0
102 |
103 | # How many model checkpoints to keep.
104 | self.max_checkpoints_to_keep = 5
105 |
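106 |
107 | # Illustrative usage sketch (not part of the upstream TensorFlow file): the
108 | # Section 1 notebook overrides these defaults before training, roughly as
109 | # follows (the paths are placeholders):
110 | #
111 | #   model_config = ModelConfig()
112 | #   model_config.input_file_pattern = "im2txt/data/mscoco/train-?????-of-00256"
113 | #   model_config.inception_checkpoint_file = "im2txt/data/inception_v3.ckpt"
114 | #   training_config = TrainingConfig()  # SGD, initial learning rate 2.0, etc.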
--------------------------------------------------------------------------------
/Section 1/im2txt/data/build_mscoco_data.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Converts MSCOCO data to TFRecord file format with SequenceExample protos.
16 |
17 | The MSCOCO images are expected to reside in JPEG files located in the following
18 | directory structure:
19 |
20 | train_image_dir/COCO_train2014_000000000151.jpg
21 | train_image_dir/COCO_train2014_000000000260.jpg
22 | ...
23 |
24 | and
25 |
26 | val_image_dir/COCO_val2014_000000000042.jpg
27 | val_image_dir/COCO_val2014_000000000073.jpg
28 | ...
29 |
30 | The MSCOCO annotations JSON files are expected to reside in train_captions_file
31 | and val_captions_file respectively.
32 |
33 | This script converts the combined MSCOCO data into sharded data files consisting
34 | of 256, 4 and 8 TFRecord files, respectively:
35 |
36 | output_dir/train-00000-of-00256
37 | output_dir/train-00001-of-00256
38 | ...
39 | output_dir/train-00255-of-00256
40 |
41 | and
42 |
43 | output_dir/val-00000-of-00004
44 | ...
45 | output_dir/val-00003-of-00004
46 |
47 | and
48 |
49 | output_dir/test-00000-of-00008
50 | ...
51 | output_dir/test-00007-of-00008
52 |
53 | Each TFRecord file contains ~2300 records. Each record within the TFRecord file
54 | is a serialized SequenceExample proto consisting of precisely one image-caption
55 | pair. Note that each image has multiple captions (usually 5) and therefore each
56 | image is replicated multiple times in the TFRecord files.
57 |
58 | The SequenceExample proto contains the following fields:
59 |
60 | context:
61 | image/image_id: integer MSCOCO image identifier
62 | image/data: string containing JPEG encoded image in RGB colorspace
63 |
64 | feature_lists:
65 | image/caption: list of strings containing the (tokenized) caption words
66 | image/caption_ids: list of integer ids corresponding to the caption words
67 |
68 | The captions are tokenized using the NLTK (http://www.nltk.org/) word tokenizer.
69 | The vocabulary of word identifiers is constructed from the sorted list (by
70 | descending frequency) of word tokens in the training set. Only tokens appearing
71 | at least 4 times are considered; all other words get the "unknown" word id.
72 |
73 | NOTE: This script will consume around 100GB of disk space because each image
74 | in the MSCOCO dataset is replicated ~5 times (once per caption) in the output.
75 | This is done for two reasons:
76 | 1. In order to better shuffle the training data.
77 | 2. It makes it easier to perform asynchronous preprocessing of each image in
78 | TensorFlow.
79 |
80 | Running this script using 16 threads may take around 1 hour on a HP Z420.
81 | """
82 |
83 | from __future__ import absolute_import
84 | from __future__ import division
85 | from __future__ import print_function
86 |
87 | from collections import Counter
88 | from collections import namedtuple
89 | from datetime import datetime
90 | import json
91 | import os.path
92 | import random
93 | import sys
94 | import threading
95 |
96 |
97 |
98 | import nltk.tokenize
99 | import numpy as np
100 | from six.moves import xrange
101 | import tensorflow as tf
102 |
103 | tf.flags.DEFINE_string("train_image_dir", "/tmp/train2014/",
104 | "Training image directory.")
105 | tf.flags.DEFINE_string("val_image_dir", "/tmp/val2014",
106 | "Validation image directory.")
107 |
108 | tf.flags.DEFINE_string("train_captions_file", "/tmp/captions_train2014.json",
109 | "Training captions JSON file.")
110 | tf.flags.DEFINE_string("val_captions_file", "/tmp/captions_val2014.json",
111 | "Validation captions JSON file.")
112 |
113 | tf.flags.DEFINE_string("output_dir", "/tmp/", "Output data directory.")
114 |
115 | tf.flags.DEFINE_integer("train_shards", 256,
116 | "Number of shards in training TFRecord files.")
117 | tf.flags.DEFINE_integer("val_shards", 4,
118 | "Number of shards in validation TFRecord files.")
119 | tf.flags.DEFINE_integer("test_shards", 8,
120 | "Number of shards in testing TFRecord files.")
121 |
122 | tf.flags.DEFINE_string("start_word", "",
123 | "Special word added to the beginning of each sentence.")
124 | tf.flags.DEFINE_string("end_word", "",
125 | "Special word added to the end of each sentence.")
126 | tf.flags.DEFINE_string("unknown_word", "",
127 | "Special word meaning 'unknown'.")
128 | tf.flags.DEFINE_integer("min_word_count", 4,
129 | "The minimum number of occurrences of each word in the "
130 | "training set for inclusion in the vocabulary.")
131 | tf.flags.DEFINE_string("word_counts_output_file", "/tmp/word_counts.txt",
132 | "Output vocabulary file of word counts.")
133 |
134 | tf.flags.DEFINE_integer("num_threads", 8,
135 | "Number of threads to preprocess the images.")
136 |
137 | FLAGS = tf.flags.FLAGS
138 |
139 | ImageMetadata = namedtuple("ImageMetadata",
140 | ["image_id", "filename", "captions"])
141 |
142 |
143 | class Vocabulary(object):
144 | """Simple vocabulary wrapper."""
145 |
146 | def __init__(self, vocab, unk_id):
147 | """Initializes the vocabulary.
148 |
149 | Args:
150 | vocab: A dictionary of word to word_id.
151 | unk_id: Id of the special 'unknown' word.
152 | """
153 | self._vocab = vocab
154 | self._unk_id = unk_id
155 |
156 | def word_to_id(self, word):
157 | """Returns the integer id of a word string."""
158 | if word in self._vocab:
159 | return self._vocab[word]
160 | else:
161 | return self._unk_id
162 |
163 |
164 | class ImageDecoder(object):
165 | """Helper class for decoding images in TensorFlow."""
166 |
167 | def __init__(self):
168 | # Create a single TensorFlow Session for all image decoding calls.
169 | self._sess = tf.Session()
170 |
171 | # TensorFlow ops for JPEG decoding.
172 | self._encoded_jpeg = tf.placeholder(dtype=tf.string)
173 | self._decode_jpeg = tf.image.decode_jpeg(self._encoded_jpeg, channels=3)
174 |
175 | def decode_jpeg(self, encoded_jpeg):
176 | image = self._sess.run(self._decode_jpeg,
177 | feed_dict={self._encoded_jpeg: encoded_jpeg})
178 | assert len(image.shape) == 3
179 | assert image.shape[2] == 3
180 | return image
181 |
182 |
183 | def _int64_feature(value):
184 | """Wrapper for inserting an int64 Feature into a SequenceExample proto."""
185 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
186 |
187 |
188 | def _bytes_feature(value):
189 | """Wrapper for inserting a bytes Feature into a SequenceExample proto."""
190 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value if isinstance(value, bytes) else str(value).encode()]))
191 |
192 |
193 | def _int64_feature_list(values):
194 | """Wrapper for inserting an int64 FeatureList into a SequenceExample proto."""
195 | return tf.train.FeatureList(feature=[_int64_feature(v) for v in values])
196 |
197 |
198 | def _bytes_feature_list(values):
199 | """Wrapper for inserting a bytes FeatureList into a SequenceExample proto."""
200 | return tf.train.FeatureList(feature=[_bytes_feature(v) for v in values])
201 |
202 |
203 | def _to_sequence_example(image, decoder, vocab):
204 | """Builds a SequenceExample proto for an image-caption pair.
205 |
206 | Args:
207 | image: An ImageMetadata object.
208 | decoder: An ImageDecoder object.
209 | vocab: A Vocabulary object.
210 |
211 | Returns:
212 | A SequenceExample proto.
213 | """
214 | with tf.gfile.FastGFile(image.filename, "rb") as f:
215 | encoded_image = f.read()
216 |
217 | try:
218 | decoder.decode_jpeg(encoded_image)
219 | except (tf.errors.InvalidArgumentError, AssertionError):
220 | print("Skipping file with invalid JPEG data: %s" % image.filename)
221 | return
222 |
223 | context = tf.train.Features(feature={
224 | "image/image_id": _int64_feature(image.image_id),
225 | "image/data": _bytes_feature(encoded_image),
226 | })
227 |
228 | assert len(image.captions) == 1
229 | caption = image.captions[0]
230 | caption_ids = [vocab.word_to_id(word) for word in caption]
231 | feature_lists = tf.train.FeatureLists(feature_list={
232 | "image/caption": _bytes_feature_list(caption),
233 | "image/caption_ids": _int64_feature_list(caption_ids)
234 | })
235 | sequence_example = tf.train.SequenceExample(
236 | context=context, feature_lists=feature_lists)
237 |
238 | return sequence_example
239 |
240 |
241 | def _process_image_files(thread_index, ranges, name, images, decoder, vocab,
242 | num_shards):
243 | """Processes and saves a subset of images as TFRecord files in one thread.
244 |
245 | Args:
246 | thread_index: Integer thread identifier within [0, len(ranges)].
247 | ranges: A list of pairs of integers specifying the ranges of the dataset to
248 | process in parallel.
249 | name: Unique identifier specifying the dataset.
250 | images: List of ImageMetadata.
251 | decoder: An ImageDecoder object.
252 | vocab: A Vocabulary object.
253 | num_shards: Integer number of shards for the output files.
254 | """
255 | # Each thread produces N shards where N = num_shards / num_threads. For
256 | # instance, if num_shards = 128, and num_threads = 2, then the first thread
257 | # would produce shards [0, 64).
258 | num_threads = len(ranges)
259 | assert not num_shards % num_threads
260 | num_shards_per_batch = int(num_shards / num_threads)
261 |
262 | shard_ranges = np.linspace(ranges[thread_index][0], ranges[thread_index][1],
263 | num_shards_per_batch + 1).astype(int)
264 | num_images_in_thread = ranges[thread_index][1] - ranges[thread_index][0]
265 |
266 | counter = 0
267 | for s in xrange(num_shards_per_batch):
268 | # Generate a sharded version of the file name, e.g. 'train-00002-of-00010'
269 | shard = thread_index * num_shards_per_batch + s
270 | output_filename = "%s-%.5d-of-%.5d" % (name, shard, num_shards)
271 | output_file = os.path.join(FLAGS.output_dir, output_filename)
272 | writer = tf.python_io.TFRecordWriter(output_file)
273 |
274 | shard_counter = 0
275 | images_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int)
276 | for i in images_in_shard:
277 | image = images[i]
278 |
279 | sequence_example = _to_sequence_example(image, decoder, vocab)
280 | if sequence_example is not None:
281 | writer.write(sequence_example.SerializeToString())
282 | shard_counter += 1
283 | counter += 1
284 |
285 | if not counter % 1000:
286 | print("%s [thread %d]: Processed %d of %d items in thread batch." %
287 | (datetime.now(), thread_index, counter, num_images_in_thread))
288 | sys.stdout.flush()
289 |
290 | writer.close()
291 | print("%s [thread %d]: Wrote %d image-caption pairs to %s" %
292 | (datetime.now(), thread_index, shard_counter, output_file))
293 | sys.stdout.flush()
294 | shard_counter = 0
295 | print("%s [thread %d]: Wrote %d image-caption pairs to %d shards." %
296 | (datetime.now(), thread_index, counter, num_shards_per_batch))
297 | sys.stdout.flush()
298 |
299 |
300 | def _process_dataset(name, images, vocab, num_shards):
301 | """Processes a complete data set and saves it as a TFRecord.
302 |
303 | Args:
304 | name: Unique identifier specifying the dataset.
305 | images: List of ImageMetadata.
306 | vocab: A Vocabulary object.
307 | num_shards: Integer number of shards for the output files.
308 | """
309 | # Break up each image into a separate entity for each caption.
310 | images = [ImageMetadata(image.image_id, image.filename, [caption])
311 | for image in images for caption in image.captions]
312 |
313 | # Shuffle the ordering of images. Make the randomization repeatable.
314 | random.seed(12345)
315 | random.shuffle(images)
316 |
317 | # Break the images into num_threads batches. Batch i is defined as
318 | # images[ranges[i][0]:ranges[i][1]].
319 | num_threads = min(num_shards, FLAGS.num_threads)
320 | spacing = np.linspace(0, len(images), num_threads + 1).astype(np.int)
321 | ranges = []
322 | threads = []
323 | for i in xrange(len(spacing) - 1):
324 | ranges.append([spacing[i], spacing[i + 1]])
325 |
326 | # Create a mechanism for monitoring when all threads are finished.
327 | coord = tf.train.Coordinator()
328 |
329 | # Create a utility for decoding JPEG images to run sanity checks.
330 | decoder = ImageDecoder()
331 |
332 | # Launch a thread for each batch.
333 | print("Launching %d threads for spacings: %s" % (num_threads, ranges))
334 | for thread_index in xrange(len(ranges)):
335 | args = (thread_index, ranges, name, images, decoder, vocab, num_shards)
336 | t = threading.Thread(target=_process_image_files, args=args)
337 | t.start()
338 | threads.append(t)
339 |
340 | # Wait for all the threads to terminate.
341 | coord.join(threads)
342 | print("%s: Finished processing all %d image-caption pairs in data set '%s'." %
343 | (datetime.now(), len(images), name))
344 |
345 |
346 | def _create_vocab(captions):
347 | """Creates the vocabulary of word to word_id.
348 |
349 | The vocabulary is saved to disk in a text file of word counts. The id of each
350 | word in the file is its corresponding 0-based line number.
351 |
352 | Args:
353 | captions: A list of lists of strings.
354 |
355 | Returns:
356 | A Vocabulary object.
357 | """
358 | print("Creating vocabulary.")
359 | counter = Counter()
360 | for c in captions:
361 | counter.update(c)
362 | print("Total words:", len(counter))
363 |
364 | # Filter uncommon words and sort by descending count.
365 | word_counts = [x for x in counter.items() if x[1] >= FLAGS.min_word_count]
366 | word_counts.sort(key=lambda x: x[1], reverse=True)
367 | print("Words in vocabulary:", len(word_counts))
368 |
369 | # Write out the word counts file.
370 | with tf.gfile.FastGFile(FLAGS.word_counts_output_file, "w") as f:
371 | f.write("\n".join(["%s %d" % (w, c) for w, c in word_counts]))
372 | print("Wrote vocabulary file:", FLAGS.word_counts_output_file)
373 |
374 | # Create the vocabulary dictionary.
375 | reverse_vocab = [x[0] for x in word_counts]
376 | unk_id = len(reverse_vocab)
377 | vocab_dict = dict([(x, y) for (y, x) in enumerate(reverse_vocab)])
378 | vocab = Vocabulary(vocab_dict, unk_id)
379 |
380 | return vocab
381 |
382 |
383 | def _process_caption(caption):
384 | """Processes a caption string into a list of tonenized words.
385 |
386 | Args:
387 | caption: A string caption.
388 |
389 | Returns:
390 | A list of strings; the tokenized caption.
391 | """
392 | tokenized_caption = [FLAGS.start_word]
393 | tokenized_caption.extend(nltk.tokenize.word_tokenize(caption.lower()))
394 | tokenized_caption.append(FLAGS.end_word)
395 | return tokenized_caption
396 |
397 |
398 | def _load_and_process_metadata(captions_file, image_dir):
399 | """Loads image metadata from a JSON file and processes the captions.
400 |
401 | Args:
402 | captions_file: JSON file containing caption annotations.
403 | image_dir: Directory containing the image files.
404 |
405 | Returns:
406 | A list of ImageMetadata.
407 | """
408 | with tf.gfile.FastGFile(captions_file, "r") as f:
409 | caption_data = json.load(f)
410 |
411 | # Extract the filenames.
412 | id_to_filename = [(x["id"], x["file_name"]) for x in caption_data["images"]]
413 |
414 | # Extract the captions. Each image_id is associated with multiple captions.
415 | id_to_captions = {}
416 | for annotation in caption_data["annotations"]:
417 | image_id = annotation["image_id"]
418 | caption = annotation["caption"]
419 | id_to_captions.setdefault(image_id, [])
420 | id_to_captions[image_id].append(caption)
421 |
422 | assert len(id_to_filename) == len(id_to_captions)
423 | assert set([x[0] for x in id_to_filename]) == set(id_to_captions.keys())
424 | print("Loaded caption metadata for %d images from %s" %
425 | (len(id_to_filename), captions_file))
426 |
427 | # Process the captions and combine the data into a list of ImageMetadata.
428 | print("Processing captions.")
429 | image_metadata = []
430 | num_captions = 0
431 | for image_id, base_filename in id_to_filename:
432 | filename = os.path.join(image_dir, base_filename)
433 | captions = [_process_caption(c) for c in id_to_captions[image_id]]
434 | image_metadata.append(ImageMetadata(image_id, filename, captions))
435 | num_captions += len(captions)
436 | print("Finished processing %d captions for %d images in %s" %
437 | (num_captions, len(id_to_filename), captions_file))
438 |
439 | return image_metadata
440 |
441 |
442 | def main(unused_argv):
443 | def _is_valid_num_shards(num_shards):
444 | """Returns True if num_shards is compatible with FLAGS.num_threads."""
445 | return num_shards < FLAGS.num_threads or not num_shards % FLAGS.num_threads
446 |
447 | assert _is_valid_num_shards(FLAGS.train_shards), (
448 | "Please make the FLAGS.num_threads commensurate with FLAGS.train_shards")
449 | assert _is_valid_num_shards(FLAGS.val_shards), (
450 | "Please make the FLAGS.num_threads commensurate with FLAGS.val_shards")
451 | assert _is_valid_num_shards(FLAGS.test_shards), (
452 | "Please make the FLAGS.num_threads commensurate with FLAGS.test_shards")
453 |
454 | if not tf.gfile.IsDirectory(FLAGS.output_dir):
455 | tf.gfile.MakeDirs(FLAGS.output_dir)
456 |
457 | # Load image metadata from caption files.
458 | mscoco_train_dataset = _load_and_process_metadata(FLAGS.train_captions_file,
459 | FLAGS.train_image_dir)
460 | mscoco_val_dataset = _load_and_process_metadata(FLAGS.val_captions_file,
461 | FLAGS.val_image_dir)
462 |
463 | # Redistribute the MSCOCO data as follows:
464 | # train_dataset = 100% of mscoco_train_dataset + 85% of mscoco_val_dataset.
465 | # val_dataset = 5% of mscoco_val_dataset (for validation during training).
466 | # test_dataset = 10% of mscoco_val_dataset (for final evaluation).
467 | train_cutoff = int(0.85 * len(mscoco_val_dataset))
468 | val_cutoff = int(0.90 * len(mscoco_val_dataset))
469 | train_dataset = mscoco_train_dataset + mscoco_val_dataset[0:train_cutoff]
470 | val_dataset = mscoco_val_dataset[train_cutoff:val_cutoff]
471 | test_dataset = mscoco_val_dataset[val_cutoff:]
472 |
473 | # Create vocabulary from the training captions.
474 | train_captions = [c for image in train_dataset for c in image.captions]
475 | vocab = _create_vocab(train_captions)
476 |
477 | _process_dataset("train", train_dataset, vocab, FLAGS.train_shards)
478 | _process_dataset("val", val_dataset, vocab, FLAGS.val_shards)
479 | _process_dataset("test", test_dataset, vocab, FLAGS.test_shards)
480 |
481 |
482 | if __name__ == "__main__":
483 | tf.app.run()
484 |
--------------------------------------------------------------------------------
/Section 1/im2txt/data/download_and_preprocess_mscoco.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | # ==============================================================================
16 |
17 | # Script to download and preprocess the MSCOCO data set.
18 | #
19 | # The outputs of this script are sharded TFRecord files containing serialized
20 | # SequenceExample protocol buffers. See build_mscoco_data.py for details of how
21 | # the SequenceExample protocol buffers are constructed.
22 | #
23 | # usage:
24 | # ./download_and_preprocess_mscoco.sh [data dir]
25 | set -e
26 |
27 | if [ -z "$1" ]; then
28 | echo "usage download_and_preproces_mscoco.sh [data dir]"
29 | exit
30 | fi
31 |
32 | if [ "$(uname)" == "Darwin" ]; then
33 | UNZIP="tar -xf"
34 | else
35 | UNZIP="unzip -nq"
36 | fi
37 |
38 | # Create the output directories.
39 | OUTPUT_DIR="${1%/}"
40 | SCRATCH_DIR="${OUTPUT_DIR}/raw-data"
41 | mkdir -p "${OUTPUT_DIR}"
42 | mkdir -p "${SCRATCH_DIR}"
43 | CURRENT_DIR=$(pwd)
44 | WORK_DIR="$0.runfiles/im2txt/im2txt"
45 |
46 | # Helper function to download and unpack a .zip file.
47 | function download_and_unzip() {
48 | local BASE_URL=${1}
49 | local FILENAME=${2}
50 |
51 | if [ ! -f ${FILENAME} ]; then
52 | echo "Downloading ${FILENAME} to $(pwd)"
53 | wget -nd -c "${BASE_URL}/${FILENAME}"
54 | else
55 | echo "Skipping download of ${FILENAME}"
56 | fi
57 | echo "Unzipping ${FILENAME}"
58 | ${UNZIP} ${FILENAME}
59 | }
60 |
61 | cd ${SCRATCH_DIR}
62 |
63 | # Download the images.
64 | BASE_IMAGE_URL="http://msvocds.blob.core.windows.net/coco2014"
65 |
66 | TRAIN_IMAGE_FILE="train2014.zip"
67 | #download_and_unzip ${BASE_IMAGE_URL} ${TRAIN_IMAGE_FILE}
68 | TRAIN_IMAGE_DIR="${SCRATCH_DIR}/train2014"
69 |
70 | VAL_IMAGE_FILE="val2014.zip"
71 | #download_and_unzip ${BASE_IMAGE_URL} ${VAL_IMAGE_FILE}
72 | VAL_IMAGE_DIR="${SCRATCH_DIR}/val2014"
73 |
74 | # Download the captions.
75 | BASE_CAPTIONS_URL="http://msvocds.blob.core.windows.net/annotations-1-0-3"
76 | CAPTIONS_FILE="captions_train-val2014.zip"
77 | #download_and_unzip ${BASE_CAPTIONS_URL} ${CAPTIONS_FILE}
78 | TRAIN_CAPTIONS_FILE="${SCRATCH_DIR}/annotations/captions_train2014.json"
79 | VAL_CAPTIONS_FILE="${SCRATCH_DIR}/annotations/captions_val2014.json"
80 |
81 | # Build TFRecords of the image data.
82 | cd "${CURRENT_DIR}"
83 | #BUILD_SCRIPT="${WORK_DIR}/build_mscoco_data"
84 |
85 | echo $TRAIN_IMAGE_DIR
86 | echo $VAL_IMAGE_DIR
87 | echo $TRAIN_CAPTIONS_FILE
88 | echo $VAL_CAPTIONS_FILE
89 | echo $OUTPUT_DIR
90 |
91 | #BUILD_SCRIPT=./build_mscoco_data
92 | #"${BUILD_SCRIPT}" \
93 | #--train_image_dir="${TRAIN_IMAGE_DIR}" \
94 | #--val_image_dir="${VAL_IMAGE_DIR}" \
95 | #--train_captions_file="${TRAIN_CAPTIONS_FILE}" \
96 | #--val_captions_file="${VAL_CAPTIONS_FILE}" \
97 | #--output_dir="${OUTPUT_DIR}" \
98 | #--word_counts_output_file="${OUTPUT_DIR}/word_counts.txt" \
99 |
100 |
101 | echo python build_mscoco_data.py --train_image_dir="${TRAIN_IMAGE_DIR}" \
102 | --val_image_dir="${VAL_IMAGE_DIR}" \
103 | --train_captions_file="${TRAIN_CAPTIONS_FILE}" \
104 | --val_captions_file="${VAL_CAPTIONS_FILE}" \
105 | --output_dir="${OUTPUT_DIR}" \
106 | --word_counts_output_file="${OUTPUT_DIR}/word_counts.txt" \
107 |
--------------------------------------------------------------------------------
/Section 1/im2txt/evaluate.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Evaluate the model.
17 |
18 | This script should be run concurrently with training so that summaries show up
19 | in TensorBoard.
20 | """
21 |
22 | from __future__ import absolute_import
23 | from __future__ import division
24 | from __future__ import print_function
25 |
26 | import math
27 | import os.path
28 | import time
29 |
30 |
31 | import numpy as np
32 | import tensorflow as tf
33 |
34 | from im2txt import configuration
35 | from im2txt import show_and_tell_model
36 |
37 | FLAGS = tf.flags.FLAGS
38 |
39 | tf.flags.DEFINE_string("input_file_pattern", "",
40 | "File pattern of sharded TFRecord input files.")
41 | tf.flags.DEFINE_string("checkpoint_dir", "",
42 | "Directory containing model checkpoints.")
43 | tf.flags.DEFINE_string("eval_dir", "", "Directory to write event logs.")
44 |
45 | tf.flags.DEFINE_integer("eval_interval_secs", 600,
46 | "Interval between evaluation runs.")
47 | tf.flags.DEFINE_integer("num_eval_examples", 10132,
48 | "Number of examples for evaluation.")
49 |
50 | tf.flags.DEFINE_integer("min_global_step", 5000,
51 | "Minimum global step to run evaluation.")
52 |
53 | tf.logging.set_verbosity(tf.logging.INFO)
54 |
55 |
56 | def evaluate_model(sess, model, global_step, summary_writer, summary_op):
57 | """Computes perplexity-per-word over the evaluation dataset.
58 |
59 | Summaries and perplexity-per-word are written out to the eval directory.
60 |
61 | Args:
62 | sess: Session object.
63 | model: Instance of ShowAndTellModel; the model to evaluate.
64 | global_step: Integer; global step of the model checkpoint.
65 | summary_writer: Instance of FileWriter.
66 | summary_op: Op for generating model summaries.
67 | """
68 | # Log model summaries on a single batch.
69 | summary_str = sess.run(summary_op)
70 | summary_writer.add_summary(summary_str, global_step)
71 |
72 | # Compute perplexity over the entire dataset.
73 | num_eval_batches = int(
74 | math.ceil(FLAGS.num_eval_examples / model.config.batch_size))
75 |
76 | start_time = time.time()
77 | sum_losses = 0.
78 | sum_weights = 0.
79 | for i in range(num_eval_batches):
80 | cross_entropy_losses, weights = sess.run([
81 | model.target_cross_entropy_losses,
82 | model.target_cross_entropy_loss_weights
83 | ])
84 | sum_losses += np.sum(cross_entropy_losses * weights)
85 | sum_weights += np.sum(weights)
86 | if not i % 100:
87 | tf.logging.info("Computed losses for %d of %d batches.", i + 1,
88 | num_eval_batches)
89 | eval_time = time.time() - start_time
90 |
91 | perplexity = math.exp(sum_losses / sum_weights)
92 | tf.logging.info("Perplexity = %f (%.2g sec)", perplexity, eval_time)
93 |
94 | # Log perplexity to the FileWriter.
95 | summary = tf.Summary()
96 | value = summary.value.add()
97 | value.simple_value = perplexity
98 | value.tag = "Perplexity"
99 | summary_writer.add_summary(summary, global_step)
100 |
101 | # Write the Events file to the eval directory.
102 | summary_writer.flush()
103 | tf.logging.info("Finished processing evaluation at global step %d.",
104 | global_step)
105 |
106 |
107 | def run_once(model, saver, summary_writer, summary_op):
108 | """Evaluates the latest model checkpoint.
109 |
110 | Args:
111 | model: Instance of ShowAndTellModel; the model to evaluate.
112 | saver: Instance of tf.train.Saver for restoring model Variables.
113 | summary_writer: Instance of FileWriter.
114 | summary_op: Op for generating model summaries.
115 | """
116 | model_path = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
117 | if not model_path:
118 | tf.logging.info("Skipping evaluation. No checkpoint found in: %s",
119 | FLAGS.checkpoint_dir)
120 | return
121 |
122 | with tf.Session() as sess:
123 | # Load model from checkpoint.
124 | tf.logging.info("Loading model from checkpoint: %s", model_path)
125 | saver.restore(sess, model_path)
126 | global_step = tf.train.global_step(sess, model.global_step.name)
127 | tf.logging.info("Successfully loaded %s at global step = %d.",
128 | os.path.basename(model_path), global_step)
129 | if global_step < FLAGS.min_global_step:
130 | tf.logging.info("Skipping evaluation. Global step = %d < %d", global_step,
131 | FLAGS.min_global_step)
132 | return
133 |
134 | # Start the queue runners.
135 | coord = tf.train.Coordinator()
136 | threads = tf.train.start_queue_runners(coord=coord)
137 |
138 | # Run evaluation on the latest checkpoint.
139 | try:
140 | evaluate_model(
141 | sess=sess,
142 | model=model,
143 | global_step=global_step,
144 | summary_writer=summary_writer,
145 | summary_op=summary_op)
146 | except Exception as e: # pylint: disable=broad-except
147 | tf.logging.error("Evaluation failed.")
148 | coord.request_stop(e)
149 |
150 | coord.request_stop()
151 | coord.join(threads, stop_grace_period_secs=10)
152 |
153 |
154 | def run():
155 | """Runs evaluation in a loop, and logs summaries to TensorBoard."""
156 | # Create the evaluation directory if it doesn't exist.
157 | eval_dir = FLAGS.eval_dir
158 | if not tf.gfile.IsDirectory(eval_dir):
159 | tf.logging.info("Creating eval directory: %s", eval_dir)
160 | tf.gfile.MakeDirs(eval_dir)
161 |
162 | g = tf.Graph()
163 | with g.as_default():
164 | # Build the model for evaluation.
165 | model_config = configuration.ModelConfig()
166 | model_config.input_file_pattern = FLAGS.input_file_pattern
167 | model = show_and_tell_model.ShowAndTellModel(model_config, mode="eval")
168 | model.build()
169 |
170 | # Create the Saver to restore model Variables.
171 | saver = tf.train.Saver()
172 |
173 | # Create the summary operation and the summary writer.
174 | summary_op = tf.summary.merge_all()
175 | summary_writer = tf.summary.FileWriter(eval_dir)
176 |
177 | g.finalize()
178 |
179 | # Run a new evaluation run every eval_interval_secs.
180 | while True:
181 | start = time.time()
182 | tf.logging.info("Starting evaluation at " + time.strftime(
183 | "%Y-%m-%d-%H:%M:%S", time.localtime()))
184 | run_once(model, saver, summary_writer, summary_op)
185 | time_to_next_eval = start + FLAGS.eval_interval_secs - time.time()
186 | if time_to_next_eval > 0:
187 | time.sleep(time_to_next_eval)
188 |
189 |
190 | def main(unused_argv):
191 | assert FLAGS.input_file_pattern, "--input_file_pattern is required"
192 | assert FLAGS.checkpoint_dir, "--checkpoint_dir is required"
193 | assert FLAGS.eval_dir, "--eval_dir is required"
194 | run()
195 |
196 |
197 | if __name__ == "__main__":
198 | tf.app.run()
199 |
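200 |
201 | # Example invocation (a sketch, not part of the upstream file): run this script
202 | # alongside training so the perplexity summaries show up in TensorBoard.
203 | # The paths below are placeholders matching the Section 1 notebook's layout.
204 | #
205 | #   python evaluate.py \
206 | #     --input_file_pattern="im2txt/data/mscoco/val-?????-of-00004" \
207 | #     --checkpoint_dir="im2txt/model" \
208 | #     --eval_dir="im2txt/model/eval"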
--------------------------------------------------------------------------------
/Section 1/im2txt/inference_utils/__pycache__/caption_generator.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/im2txt/inference_utils/__pycache__/caption_generator.cpython-36.pyc
--------------------------------------------------------------------------------
/Section 1/im2txt/inference_utils/__pycache__/inference_wrapper_base.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/im2txt/inference_utils/__pycache__/inference_wrapper_base.cpython-36.pyc
--------------------------------------------------------------------------------
/Section 1/im2txt/inference_utils/__pycache__/vocabulary.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/im2txt/inference_utils/__pycache__/vocabulary.cpython-36.pyc
--------------------------------------------------------------------------------
/Section 1/im2txt/inference_wrapper.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Model wrapper class for performing inference with a ShowAndTellModel."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 |
23 |
24 | from im2txt import show_and_tell_model
25 | from im2txt.inference_utils import inference_wrapper_base
26 |
27 |
28 | class InferenceWrapper(inference_wrapper_base.InferenceWrapperBase):
29 | """Model wrapper class for performing inference with a ShowAndTellModel."""
30 |
31 | def __init__(self):
32 | super(InferenceWrapper, self).__init__()
33 |
34 | def build_model(self, model_config):
35 | model = show_and_tell_model.ShowAndTellModel(model_config, mode="inference")
36 | model.build()
37 | return model
38 |
39 | def feed_image(self, sess, encoded_image):
40 | initial_state = sess.run(fetches="lstm/initial_state:0",
41 | feed_dict={"image_feed:0": encoded_image})
42 | return initial_state
43 |
44 | def inference_step(self, sess, input_feed, state_feed):
45 | softmax_output, state_output = sess.run(
46 | fetches=["softmax:0", "lstm/state:0"],
47 | feed_dict={
48 | "input_feed:0": input_feed,
49 | "lstm/state_feed:0": state_feed,
50 | })
51 | return softmax_output, state_output, None
52 |
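
A minimal decoding sketch (not part of the file), assuming a trained checkpoint and hypothetical start/end-of-sentence word ids; it drives feed_image() and inference_step() with plain greedy argmax rather than the repository's beam-search caption generator (present here only as a compiled caption_generator module):

import numpy as np
import tensorflow as tf

from im2txt import configuration
from im2txt import inference_wrapper

start_id, end_id = 1, 2  # hypothetical vocabulary ids for the start and end tokens

g = tf.Graph()
with g.as_default():
    model = inference_wrapper.InferenceWrapper()
    model.build_model(configuration.ModelConfig())
    saver = tf.train.Saver()

with tf.Session(graph=g) as sess:
    saver.restore(sess, "/path/to/model.ckpt-2000000")  # assumed checkpoint path
    encoded_image = tf.gfile.GFile("test_images/bike.jpeg", "rb").read()
    state = model.feed_image(sess, encoded_image)        # fetches "lstm/initial_state:0"
    caption, word = [], np.array([start_id])
    for _ in range(20):                                  # cap the caption length
        softmax, state, _ = model.inference_step(sess, word, state)
        word_id = int(np.argmax(softmax[0]))             # greedy pick
        if word_id == end_id:
            break
        caption.append(word_id)
        word = np.array([word_id])
    print(caption)  # word ids; map them to words with the vocabulary file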
--------------------------------------------------------------------------------
/Section 1/im2txt/ops/__pycache__/image_embedding.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/im2txt/ops/__pycache__/image_embedding.cpython-36.pyc
--------------------------------------------------------------------------------
/Section 1/im2txt/ops/__pycache__/image_processing.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/im2txt/ops/__pycache__/image_processing.cpython-36.pyc
--------------------------------------------------------------------------------
/Section 1/im2txt/ops/__pycache__/inputs.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/im2txt/ops/__pycache__/inputs.cpython-36.pyc
--------------------------------------------------------------------------------
/Section 1/im2txt/ops/image_embedding.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Image embedding ops."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 |
23 | import tensorflow as tf
24 |
25 | from tensorflow.contrib.slim.python.slim.nets.inception_v3 import inception_v3_base
26 |
27 | slim = tf.contrib.slim
28 |
29 |
30 | def inception_v3(images,
31 | trainable=True,
32 | is_training=True,
33 | weight_decay=0.00004,
34 | stddev=0.1,
35 | dropout_keep_prob=0.8,
36 | use_batch_norm=True,
37 | batch_norm_params=None,
38 | add_summaries=True,
39 | scope="InceptionV3"):
40 | """Builds an Inception V3 subgraph for image embeddings.
41 |
42 | Args:
43 | images: A float32 Tensor of shape [batch, height, width, channels].
44 | trainable: Whether the inception submodel should be trainable or not.
45 | is_training: Boolean indicating training mode or not.
46 | weight_decay: Coefficient for weight regularization.
47 | stddev: The standard deviation of the truncated normal weight initializer.
48 | dropout_keep_prob: Dropout keep probability.
49 | use_batch_norm: Whether to use batch normalization.
50 | batch_norm_params: Parameters for batch normalization. See
51 | tf.contrib.layers.batch_norm for details.
52 | add_summaries: Whether to add activation summaries.
53 | scope: Optional Variable scope.
54 |
55 | Returns:
56 | end_points: A dictionary of activations from inception_v3 layers.
57 | """
58 | # Only consider the inception model to be in training mode if it's trainable.
59 | is_inception_model_training = trainable and is_training
60 |
61 | if use_batch_norm:
62 | # Default parameters for batch normalization.
63 | if not batch_norm_params:
64 | batch_norm_params = {
65 | "is_training": is_inception_model_training,
66 | "trainable": trainable,
67 | # Decay for the moving averages.
68 | "decay": 0.9997,
69 | # Epsilon to prevent 0s in variance.
70 | "epsilon": 0.001,
71 | # Collection containing the moving mean and moving variance.
72 | "variables_collections": {
73 | "beta": None,
74 | "gamma": None,
75 | "moving_mean": ["moving_vars"],
76 | "moving_variance": ["moving_vars"],
77 | }
78 | }
79 | else:
80 | batch_norm_params = None
81 |
82 | if trainable:
83 | weights_regularizer = tf.contrib.layers.l2_regularizer(weight_decay)
84 | else:
85 | weights_regularizer = None
86 |
87 | with tf.variable_scope(scope, "InceptionV3", [images]) as scope:
88 | with slim.arg_scope(
89 | [slim.conv2d, slim.fully_connected],
90 | weights_regularizer=weights_regularizer,
91 | trainable=trainable):
92 | with slim.arg_scope(
93 | [slim.conv2d],
94 | weights_initializer=tf.truncated_normal_initializer(stddev=stddev),
95 | activation_fn=tf.nn.relu,
96 | normalizer_fn=slim.batch_norm,
97 | normalizer_params=batch_norm_params):
98 | net, end_points = inception_v3_base(images, scope=scope)
99 | with tf.variable_scope("logits"):
100 | shape = net.get_shape()
101 | net = slim.avg_pool2d(net, shape[1:3], padding="VALID", scope="pool")
102 | net = slim.dropout(
103 | net,
104 | keep_prob=dropout_keep_prob,
105 | is_training=is_inception_model_training,
106 | scope="dropout")
107 | net = slim.flatten(net, scope="flatten")
108 |
109 | # Add summaries.
110 | if add_summaries:
111 | for v in end_points.values():
112 | tf.contrib.layers.summaries.summarize_activation(v)
113 |
114 | return net
115 |
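
A short usage sketch (not part of the file): inception_v3() maps a batch of 299x299 RGB images to 2048-dimensional feature vectors, matching the shapes asserted in image_embedding_test.py below; the batch size here is an arbitrary choice.

import tensorflow as tf

from im2txt.ops import image_embedding

# Placeholder batch of four 299x299 RGB images.
images = tf.placeholder(tf.float32, shape=[4, 299, 299, 3])

# Frozen Inception weights (trainable=False) in eval mode.
embeddings = image_embedding.inception_v3(images, trainable=False, is_training=False)
print(embeddings.get_shape().as_list())  # [4, 2048]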
--------------------------------------------------------------------------------
/Section 1/im2txt/ops/image_embedding_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Tests for tensorflow_models.im2txt.ops.image_embedding."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 |
23 | import tensorflow as tf
24 |
25 | from im2txt.ops import image_embedding
26 |
27 |
28 | class InceptionV3Test(tf.test.TestCase):
29 |
30 | def setUp(self):
31 | super(InceptionV3Test, self).setUp()
32 |
33 | batch_size = 4
34 | height = 299
35 | width = 299
36 | num_channels = 3
37 | self._images = tf.placeholder(tf.float32,
38 | [batch_size, height, width, num_channels])
39 | self._batch_size = batch_size
40 |
41 | def _countInceptionParameters(self):
42 | """Counts the number of parameters in the inception model at top scope."""
43 | counter = {}
44 | for v in tf.global_variables():
45 | name_tokens = v.op.name.split("/")
46 | if name_tokens[0] == "InceptionV3":
47 | name = "InceptionV3/" + name_tokens[1]
48 | num_params = v.get_shape().num_elements()
49 | assert num_params
50 | counter[name] = counter.get(name, 0) + num_params
51 | return counter
52 |
53 | def _verifyParameterCounts(self):
54 | """Verifies the number of parameters in the inception model."""
55 | param_counts = self._countInceptionParameters()
56 | expected_param_counts = {
57 | "InceptionV3/Conv2d_1a_3x3": 960,
58 | "InceptionV3/Conv2d_2a_3x3": 9312,
59 | "InceptionV3/Conv2d_2b_3x3": 18624,
60 | "InceptionV3/Conv2d_3b_1x1": 5360,
61 | "InceptionV3/Conv2d_4a_3x3": 138816,
62 | "InceptionV3/Mixed_5b": 256368,
63 | "InceptionV3/Mixed_5c": 277968,
64 | "InceptionV3/Mixed_5d": 285648,
65 | "InceptionV3/Mixed_6a": 1153920,
66 | "InceptionV3/Mixed_6b": 1298944,
67 | "InceptionV3/Mixed_6c": 1692736,
68 | "InceptionV3/Mixed_6d": 1692736,
69 | "InceptionV3/Mixed_6e": 2143872,
70 | "InceptionV3/Mixed_7a": 1699584,
71 | "InceptionV3/Mixed_7b": 5047872,
72 | "InceptionV3/Mixed_7c": 6080064,
73 | }
74 | self.assertDictEqual(expected_param_counts, param_counts)
75 |
76 | def _assertCollectionSize(self, expected_size, collection):
77 | actual_size = len(tf.get_collection(collection))
78 | if expected_size != actual_size:
79 | self.fail("Found %d items in collection %s (expected %d)." %
80 | (actual_size, collection, expected_size))
81 |
82 | def testTrainableTrueIsTrainingTrue(self):
83 | embeddings = image_embedding.inception_v3(
84 | self._images, trainable=True, is_training=True)
85 | self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list())
86 |
87 | self._verifyParameterCounts()
88 | self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES)
89 | self._assertCollectionSize(188, tf.GraphKeys.TRAINABLE_VARIABLES)
90 | self._assertCollectionSize(188, tf.GraphKeys.UPDATE_OPS)
91 | self._assertCollectionSize(94, tf.GraphKeys.REGULARIZATION_LOSSES)
92 | self._assertCollectionSize(0, tf.GraphKeys.LOSSES)
93 | self._assertCollectionSize(23, tf.GraphKeys.SUMMARIES)
94 |
95 | def testTrainableTrueIsTrainingFalse(self):
96 | embeddings = image_embedding.inception_v3(
97 | self._images, trainable=True, is_training=False)
98 | self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list())
99 |
100 | self._verifyParameterCounts()
101 | self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES)
102 | self._assertCollectionSize(188, tf.GraphKeys.TRAINABLE_VARIABLES)
103 | self._assertCollectionSize(0, tf.GraphKeys.UPDATE_OPS)
104 | self._assertCollectionSize(94, tf.GraphKeys.REGULARIZATION_LOSSES)
105 | self._assertCollectionSize(0, tf.GraphKeys.LOSSES)
106 | self._assertCollectionSize(23, tf.GraphKeys.SUMMARIES)
107 |
108 | def testTrainableFalseIsTrainingTrue(self):
109 | embeddings = image_embedding.inception_v3(
110 | self._images, trainable=False, is_training=True)
111 | self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list())
112 |
113 | self._verifyParameterCounts()
114 | self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES)
115 | self._assertCollectionSize(0, tf.GraphKeys.TRAINABLE_VARIABLES)
116 | self._assertCollectionSize(0, tf.GraphKeys.UPDATE_OPS)
117 | self._assertCollectionSize(0, tf.GraphKeys.REGULARIZATION_LOSSES)
118 | self._assertCollectionSize(0, tf.GraphKeys.LOSSES)
119 | self._assertCollectionSize(23, tf.GraphKeys.SUMMARIES)
120 |
121 | def testTrainableFalseIsTrainingFalse(self):
122 | embeddings = image_embedding.inception_v3(
123 | self._images, trainable=False, is_training=False)
124 | self.assertEqual([self._batch_size, 2048], embeddings.get_shape().as_list())
125 |
126 | self._verifyParameterCounts()
127 | self._assertCollectionSize(376, tf.GraphKeys.GLOBAL_VARIABLES)
128 | self._assertCollectionSize(0, tf.GraphKeys.TRAINABLE_VARIABLES)
129 | self._assertCollectionSize(0, tf.GraphKeys.UPDATE_OPS)
130 | self._assertCollectionSize(0, tf.GraphKeys.REGULARIZATION_LOSSES)
131 | self._assertCollectionSize(0, tf.GraphKeys.LOSSES)
132 | self._assertCollectionSize(23, tf.GraphKeys.SUMMARIES)
133 |
134 |
135 | if __name__ == "__main__":
136 | tf.test.main()
137 |
--------------------------------------------------------------------------------
/Section 1/im2txt/ops/image_processing.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Helper functions for image preprocessing."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 |
23 | import tensorflow as tf
24 |
25 |
26 | def distort_image(image, thread_id):
27 | """Perform random distortions on an image.
28 |
29 | Args:
30 | image: A float32 Tensor of shape [height, width, 3] with values in [0, 1).
31 | thread_id: Preprocessing thread id used to select the ordering of color
32 | distortions. The number of preprocessing threads should be a multiple of 2.
33 |
34 | Returns:
35 | distorted_image: A float32 Tensor of shape [height, width, 3] with values in
36 | [0, 1].
37 | """
38 | # Randomly flip horizontally.
39 | with tf.name_scope("flip_horizontal", values=[image]):
40 | image = tf.image.random_flip_left_right(image)
41 |
42 | # Randomly distort the colors based on thread id.
43 | color_ordering = thread_id % 2
44 | with tf.name_scope("distort_color", values=[image]):
45 | if color_ordering == 0:
46 | image = tf.image.random_brightness(image, max_delta=32. / 255.)
47 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
48 | image = tf.image.random_hue(image, max_delta=0.032)
49 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
50 | elif color_ordering == 1:
51 | image = tf.image.random_brightness(image, max_delta=32. / 255.)
52 | image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
53 | image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
54 | image = tf.image.random_hue(image, max_delta=0.032)
55 |
56 | # The random_* ops do not necessarily clamp.
57 | image = tf.clip_by_value(image, 0.0, 1.0)
58 |
59 | return image
60 |
61 |
62 | def process_image(encoded_image,
63 | is_training,
64 | height,
65 | width,
66 | resize_height=346,
67 | resize_width=346,
68 | thread_id=0,
69 | image_format="jpeg"):
70 | """Decode an image, resize and apply random distortions.
71 |
72 | In training, images are distorted slightly differently depending on thread_id.
73 |
74 | Args:
75 | encoded_image: String Tensor containing the image.
76 | is_training: Boolean; whether preprocessing for training or eval.
77 | height: Height of the output image.
78 | width: Width of the output image.
79 | resize_height: If > 0, resize height before crop to final dimensions.
80 | resize_width: If > 0, resize width before crop to final dimensions.
81 | thread_id: Preprocessing thread id used to select the ordering of color
82 | distortions. The number of preprocessing threads should be a multiple of 2.
83 | image_format: "jpeg" or "png".
84 |
85 | Returns:
86 | A float32 Tensor of shape [height, width, 3] with values in [-1, 1].
87 |
88 | Raises:
89 | ValueError: If image_format is invalid.
90 | """
91 | # Helper function to log an image summary to the visualizer. Summaries are
92 | # only logged in thread 0.
93 | def image_summary(name, image):
94 | if not thread_id:
95 | tf.summary.image(name, tf.expand_dims(image, 0))
96 |
97 | # Decode image into a float32 Tensor of shape [?, ?, 3] with values in [0, 1).
98 | with tf.name_scope("decode", values=[encoded_image]):
99 | if image_format == "jpeg":
100 | image = tf.image.decode_jpeg(encoded_image, channels=3)
101 | elif image_format == "png":
102 | image = tf.image.decode_png(encoded_image, channels=3)
103 | else:
104 | raise ValueError("Invalid image format: %s" % image_format)
105 | image = tf.image.convert_image_dtype(image, dtype=tf.float32)
106 | image_summary("original_image", image)
107 |
108 | # Resize image.
109 | assert (resize_height > 0) == (resize_width > 0)
110 | if resize_height:
111 | image = tf.image.resize_images(image,
112 | size=[resize_height, resize_width],
113 | method=tf.image.ResizeMethod.BILINEAR)
114 |
115 | # Crop to final dimensions.
116 | if is_training:
117 | image = tf.random_crop(image, [height, width, 3])
118 | else:
119 | # Central crop, assuming resize_height > height, resize_width > width.
120 | image = tf.image.resize_image_with_crop_or_pad(image, height, width)
121 |
122 | image_summary("resized_image", image)
123 |
124 | # Randomly distort the image.
125 | if is_training:
126 | image = distort_image(image, thread_id)
127 |
128 | image_summary("final_image", image)
129 |
130 | # Rescale to [-1, 1] instead of [0, 1].
131 | image = tf.subtract(image, 0.5)
132 | image = tf.multiply(image, 2.0)
133 | return image
134 |
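
A usage sketch (not part of the file), assuming one of the repository's test images is available locally; in eval mode process_image() decodes the JPEG, resizes to 346x346, center-crops to the requested size, and rescales values to [-1, 1]:

import tensorflow as tf

from im2txt.ops import image_processing

encoded = tf.gfile.GFile("test_images/football.jpeg", "rb").read()  # assumed local path
image_tensor = image_processing.process_image(
    tf.constant(encoded), is_training=False, height=299, width=299)

with tf.Session() as sess:
    image = sess.run(image_tensor)
print(image.shape, image.min(), image.max())  # (299, 299, 3), values within [-1, 1]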
--------------------------------------------------------------------------------
/Section 1/im2txt/ops/inputs.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Input ops."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 |
23 | import tensorflow as tf
24 |
25 |
26 | def parse_sequence_example(serialized, image_feature, caption_feature):
27 | """Parses a tensorflow.SequenceExample into an image and caption.
28 |
29 | Args:
30 | serialized: A scalar string Tensor; a single serialized SequenceExample.
31 | image_feature: Name of SequenceExample context feature containing image
32 | data.
33 | caption_feature: Name of SequenceExample feature list containing integer
34 | captions.
35 |
36 | Returns:
37 | encoded_image: A scalar string Tensor containing a JPEG encoded image.
38 | caption: A 1-D int64 Tensor with dynamically specified length.
39 | """
40 | context, sequence = tf.parse_single_sequence_example(
41 | serialized,
42 | context_features={
43 | image_feature: tf.FixedLenFeature([], dtype=tf.string)
44 | },
45 | sequence_features={
46 | caption_feature: tf.FixedLenSequenceFeature([], dtype=tf.int64),
47 | })
48 |
49 | encoded_image = context[image_feature]
50 | caption = sequence[caption_feature]
51 | return encoded_image, caption
52 |
53 |
54 | def prefetch_input_data(reader,
55 | file_pattern,
56 | is_training,
57 | batch_size,
58 | values_per_shard,
59 | input_queue_capacity_factor=16,
60 | num_reader_threads=1,
61 | shard_queue_name="filename_queue",
62 | value_queue_name="input_queue"):
63 | """Prefetches string values from disk into an input queue.
64 |
65 | In training the capacity of the queue is important because a larger queue
66 | means better mixing of training examples between shards. The minimum number of
67 | values kept in the queue is values_per_shard * input_queue_capacity_factor,
68 | where input_queue_capacity_factor should be chosen to trade off better mixing
69 | with memory usage.
70 |
71 | Args:
72 | reader: Instance of tf.ReaderBase.
73 | file_pattern: Comma-separated list of file patterns (e.g.
74 | /tmp/train_data-?????-of-00100).
75 | is_training: Boolean; whether prefetching for training or eval.
76 | batch_size: Model batch size used to determine queue capacity.
77 | values_per_shard: Approximate number of values per shard.
78 | input_queue_capacity_factor: Minimum number of values to keep in the queue
79 | in multiples of values_per_shard. See comments above.
80 | num_reader_threads: Number of reader threads to fill the queue.
81 | shard_queue_name: Name for the shards filename queue.
82 | value_queue_name: Name for the values input queue.
83 |
84 | Returns:
85 | A Queue containing prefetched string values.
86 | """
87 | data_files = []
88 | for pattern in file_pattern.split(","):
89 | data_files.extend(tf.gfile.Glob(pattern))
90 | if not data_files:
91 | tf.logging.fatal("Found no input files matching %s", file_pattern)
92 | else:
93 | tf.logging.info("Prefetching values from %d files matching %s",
94 | len(data_files), file_pattern)
95 |
96 | if is_training:
97 | filename_queue = tf.train.string_input_producer(
98 | data_files, shuffle=True, capacity=16, name=shard_queue_name)
99 | min_queue_examples = values_per_shard * input_queue_capacity_factor
100 | capacity = min_queue_examples + 100 * batch_size
101 | values_queue = tf.RandomShuffleQueue(
102 | capacity=capacity,
103 | min_after_dequeue=min_queue_examples,
104 | dtypes=[tf.string],
105 | name="random_" + value_queue_name)
106 | else:
107 | filename_queue = tf.train.string_input_producer(
108 | data_files, shuffle=False, capacity=1, name=shard_queue_name)
109 | capacity = values_per_shard + 3 * batch_size
110 | values_queue = tf.FIFOQueue(
111 | capacity=capacity, dtypes=[tf.string], name="fifo_" + value_queue_name)
112 |
113 | enqueue_ops = []
114 | for _ in range(num_reader_threads):
115 | _, value = reader.read(filename_queue)
116 | enqueue_ops.append(values_queue.enqueue([value]))
117 | tf.train.queue_runner.add_queue_runner(tf.train.queue_runner.QueueRunner(
118 | values_queue, enqueue_ops))
119 | tf.summary.scalar(
120 | "queue/%s/fraction_of_%d_full" % (values_queue.name, capacity),
121 | tf.cast(values_queue.size(), tf.float32) * (1. / capacity))
122 |
123 | return values_queue
124 |
125 |
126 | def batch_with_dynamic_pad(images_and_captions,
127 | batch_size,
128 | queue_capacity,
129 | add_summaries=True):
130 | """Batches input images and captions.
131 |
132 | This function splits the caption into an input sequence and a target sequence,
133 | where the target sequence is the input sequence right-shifted by 1. Input and
134 | target sequences are batched and padded up to the maximum length of sequences
135 | in the batch. A mask is created to distinguish real words from padding words.
136 |
137 | Example:
138 | Actual captions in the batch ('-' denotes padded character):
139 | [
140 | [ 1 2 3 4 5 ],
141 | [ 1 2 3 4 - ],
142 | [ 1 2 3 - - ],
143 | ]
144 |
145 | input_seqs:
146 | [
147 | [ 1 2 3 4 ],
148 | [ 1 2 3 - ],
149 | [ 1 2 - - ],
150 | ]
151 |
152 | target_seqs:
153 | [
154 | [ 2 3 4 5 ],
155 | [ 2 3 4 - ],
156 | [ 2 3 - - ],
157 | ]
158 |
159 | mask:
160 | [
161 | [ 1 1 1 1 ],
162 | [ 1 1 1 0 ],
163 | [ 1 1 0 0 ],
164 | ]
165 |
166 | Args:
167 | images_and_captions: A list of pairs [image, caption], where image is a
168 | Tensor of shape [height, width, channels] and caption is a 1-D Tensor of
169 | any length. Each pair will be processed and added to the queue in a
170 | separate thread.
171 | batch_size: Batch size.
172 | queue_capacity: Queue capacity.
173 | add_summaries: If true, add caption length summaries.
174 |
175 | Returns:
176 | images: A Tensor of shape [batch_size, height, width, channels].
177 | input_seqs: An int32 Tensor of shape [batch_size, padded_length].
178 | target_seqs: An int32 Tensor of shape [batch_size, padded_length].
179 | mask: An int32 0/1 Tensor of shape [batch_size, padded_length].
180 | """
181 | enqueue_list = []
182 | for image, caption in images_and_captions:
183 | caption_length = tf.shape(caption)[0]
184 | input_length = tf.expand_dims(tf.subtract(caption_length, 1), 0)
185 |
186 | input_seq = tf.slice(caption, [0], input_length)
187 | target_seq = tf.slice(caption, [1], input_length)
188 | indicator = tf.ones(input_length, dtype=tf.int32)
189 | enqueue_list.append([image, input_seq, target_seq, indicator])
190 |
191 | images, input_seqs, target_seqs, mask = tf.train.batch_join(
192 | enqueue_list,
193 | batch_size=batch_size,
194 | capacity=queue_capacity,
195 | dynamic_pad=True,
196 | name="batch_and_pad")
197 |
198 | if add_summaries:
199 | lengths = tf.add(tf.reduce_sum(mask, 1), 1)
200 | tf.summary.scalar("caption_length/batch_min", tf.reduce_min(lengths))
201 | tf.summary.scalar("caption_length/batch_max", tf.reduce_max(lengths))
202 | tf.summary.scalar("caption_length/batch_mean", tf.reduce_mean(lengths))
203 |
204 | return images, input_seqs, target_seqs, mask
205 |
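
A small sketch (not part of the file) of the SequenceExample layout that parse_sequence_example() expects: one context feature with the encoded image bytes and one feature list with the integer caption. The feature names mirror the defaults used elsewhere in im2txt ("image/data" and "image/caption_ids") and are assumptions here.

import tensorflow as tf

def make_sequence_example(encoded_jpeg, caption_ids,
                          image_feature="image/data",            # assumed feature name
                          caption_feature="image/caption_ids"):  # assumed feature name
    """Builds a SequenceExample in the format parse_sequence_example() reads."""
    context = tf.train.Features(feature={
        image_feature: tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[encoded_jpeg])),
    })
    caption = tf.train.FeatureList(feature=[
        tf.train.Feature(int64_list=tf.train.Int64List(value=[word_id]))
        for word_id in caption_ids
    ])
    feature_lists = tf.train.FeatureLists(feature_list={caption_feature: caption})
    return tf.train.SequenceExample(context=context, feature_lists=feature_lists)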
--------------------------------------------------------------------------------
/Section 1/im2txt/show_and_tell_model.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Image-to-text implementation based on http://arxiv.org/abs/1411.4555.
17 |
18 | "Show and Tell: A Neural Image Caption Generator"
19 | Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan
20 | """
21 |
22 | from __future__ import absolute_import
23 | from __future__ import division
24 | from __future__ import print_function
25 |
26 |
27 | import tensorflow as tf
28 |
29 | from im2txt.ops import image_embedding
30 | from im2txt.ops import image_processing
31 | from im2txt.ops import inputs as input_ops
32 |
33 |
34 | class ShowAndTellModel(object):
35 | """Image-to-text implementation based on http://arxiv.org/abs/1411.4555.
36 |
37 | "Show and Tell: A Neural Image Caption Generator"
38 | Oriol Vinyals, Alexander Toshev, Samy Bengio, Dumitru Erhan
39 | """
40 |
41 | def __init__(self, config, mode, train_inception=False):
42 | """Basic setup.
43 |
44 | Args:
45 | config: Object containing configuration parameters.
46 | mode: "train", "eval" or "inference".
47 | train_inception: Whether the inception submodel variables are trainable.
48 | """
49 | assert mode in ["train", "eval", "inference"]
50 | self.config = config
51 | self.mode = mode
52 | self.train_inception = train_inception
53 |
54 | # Reader for the input data.
55 | self.reader = tf.TFRecordReader()
56 |
57 | # To match the "Show and Tell" paper we initialize all variables with a
58 | # random uniform initializer.
59 | self.initializer = tf.random_uniform_initializer(
60 | minval=-self.config.initializer_scale,
61 | maxval=self.config.initializer_scale)
62 |
63 | # A float32 Tensor with shape [batch_size, height, width, channels].
64 | self.images = None
65 |
66 | # An int32 Tensor with shape [batch_size, padded_length].
67 | self.input_seqs = None
68 |
69 | # An int32 Tensor with shape [batch_size, padded_length].
70 | self.target_seqs = None
71 |
72 | # An int32 0/1 Tensor with shape [batch_size, padded_length].
73 | self.input_mask = None
74 |
75 | # A float32 Tensor with shape [batch_size, embedding_size].
76 | self.image_embeddings = None
77 |
78 | # A float32 Tensor with shape [batch_size, padded_length, embedding_size].
79 | self.seq_embeddings = None
80 |
81 | # A float32 scalar Tensor; the total loss for the trainer to optimize.
82 | self.total_loss = None
83 |
84 | # A float32 Tensor with shape [batch_size * padded_length].
85 | self.target_cross_entropy_losses = None
86 |
87 | # A float32 Tensor with shape [batch_size * padded_length].
88 | self.target_cross_entropy_loss_weights = None
89 |
90 | # Collection of variables from the inception submodel.
91 | self.inception_variables = []
92 |
93 | # Function to restore the inception submodel from checkpoint.
94 | self.init_fn = None
95 |
96 | # Global step Tensor.
97 | self.global_step = None
98 |
99 | def is_training(self):
100 | """Returns true if the model is built for training mode."""
101 | return self.mode == "train"
102 |
103 | def process_image(self, encoded_image, thread_id=0):
104 | """Decodes and processes an image string.
105 |
106 | Args:
107 | encoded_image: A scalar string Tensor; the encoded image.
108 | thread_id: Preprocessing thread id used to select the ordering of color
109 | distortions.
110 |
111 | Returns:
112 | A float32 Tensor of shape [height, width, 3]; the processed image.
113 | """
114 | return image_processing.process_image(encoded_image,
115 | is_training=self.is_training(),
116 | height=self.config.image_height,
117 | width=self.config.image_width,
118 | thread_id=thread_id,
119 | image_format=self.config.image_format)
120 |
121 | def build_inputs(self):
122 | """Input prefetching, preprocessing and batching.
123 |
124 | Outputs:
125 | self.images
126 | self.input_seqs
127 | self.target_seqs (training and eval only)
128 | self.input_mask (training and eval only)
129 | """
130 | if self.mode == "inference":
131 | # In inference mode, images and inputs are fed via placeholders.
132 | image_feed = tf.placeholder(dtype=tf.string, shape=[], name="image_feed")
133 | input_feed = tf.placeholder(dtype=tf.int64,
134 | shape=[None], # batch_size
135 | name="input_feed")
136 |
137 | # Process image and insert batch dimensions.
138 | images = tf.expand_dims(self.process_image(image_feed), 0)
139 | input_seqs = tf.expand_dims(input_feed, 1)
140 |
141 | # No target sequences or input mask in inference mode.
142 | target_seqs = None
143 | input_mask = None
144 | else:
145 | # Prefetch serialized SequenceExample protos.
146 | input_queue = input_ops.prefetch_input_data(
147 | self.reader,
148 | self.config.input_file_pattern,
149 | is_training=self.is_training(),
150 | batch_size=self.config.batch_size,
151 | values_per_shard=self.config.values_per_input_shard,
152 | input_queue_capacity_factor=self.config.input_queue_capacity_factor,
153 | num_reader_threads=self.config.num_input_reader_threads)
154 |
155 | # Image processing and random distortion. Split across multiple threads
156 | # with each thread applying a slightly different distortion.
157 | assert self.config.num_preprocess_threads % 2 == 0
158 | images_and_captions = []
159 | for thread_id in range(self.config.num_preprocess_threads):
160 | serialized_sequence_example = input_queue.dequeue()
161 | encoded_image, caption = input_ops.parse_sequence_example(
162 | serialized_sequence_example,
163 | image_feature=self.config.image_feature_name,
164 | caption_feature=self.config.caption_feature_name)
165 | image = self.process_image(encoded_image, thread_id=thread_id)
166 | images_and_captions.append([image, caption])
167 |
168 | # Batch inputs.
169 | queue_capacity = (2 * self.config.num_preprocess_threads *
170 | self.config.batch_size)
171 | images, input_seqs, target_seqs, input_mask = (
172 | input_ops.batch_with_dynamic_pad(images_and_captions,
173 | batch_size=self.config.batch_size,
174 | queue_capacity=queue_capacity))
175 |
176 | self.images = images
177 | self.input_seqs = input_seqs
178 | self.target_seqs = target_seqs
179 | self.input_mask = input_mask
180 |
181 | def build_image_embeddings(self):
182 | """Builds the image model subgraph and generates image embeddings.
183 |
184 | Inputs:
185 | self.images
186 |
187 | Outputs:
188 | self.image_embeddings
189 | """
190 | inception_output = image_embedding.inception_v3(
191 | self.images,
192 | trainable=self.train_inception,
193 | is_training=self.is_training())
194 | self.inception_variables = tf.get_collection(
195 | tf.GraphKeys.GLOBAL_VARIABLES, scope="InceptionV3")
196 |
197 | # Map inception output into embedding space.
198 | with tf.variable_scope("image_embedding") as scope:
199 | image_embeddings = tf.contrib.layers.fully_connected(
200 | inputs=inception_output,
201 | num_outputs=self.config.embedding_size,
202 | activation_fn=None,
203 | weights_initializer=self.initializer,
204 | biases_initializer=None,
205 | scope=scope)
206 |
207 | # Save the embedding size in the graph.
208 | tf.constant(self.config.embedding_size, name="embedding_size")
209 |
210 | self.image_embeddings = image_embeddings
211 |
212 | def build_seq_embeddings(self):
213 | """Builds the input sequence embeddings.
214 |
215 | Inputs:
216 | self.input_seqs
217 |
218 | Outputs:
219 | self.seq_embeddings
220 | """
221 | with tf.variable_scope("seq_embedding"), tf.device("/cpu:0"):
222 | embedding_map = tf.get_variable(
223 | name="map",
224 | shape=[self.config.vocab_size, self.config.embedding_size],
225 | initializer=self.initializer)
226 | seq_embeddings = tf.nn.embedding_lookup(embedding_map, self.input_seqs)
227 |
228 | self.seq_embeddings = seq_embeddings
229 |
230 | def build_model(self):
231 | """Builds the model.
232 |
233 | Inputs:
234 | self.image_embeddings
235 | self.seq_embeddings
236 | self.target_seqs (training and eval only)
237 | self.input_mask (training and eval only)
238 |
239 | Outputs:
240 | self.total_loss (training and eval only)
241 | self.target_cross_entropy_losses (training and eval only)
242 | self.target_cross_entropy_loss_weights (training and eval only)
243 | """
244 | # This LSTM cell has biases and outputs tanh(new_c) * sigmoid(o), but the
245 | # modified LSTM in the "Show and Tell" paper has no biases and outputs
246 | # new_c * sigmoid(o).
247 | lstm_cell = tf.contrib.rnn.BasicLSTMCell(
248 | num_units=self.config.num_lstm_units, state_is_tuple=True)
249 | if self.mode == "train":
250 | lstm_cell = tf.contrib.rnn.DropoutWrapper(
251 | lstm_cell,
252 | input_keep_prob=self.config.lstm_dropout_keep_prob,
253 | output_keep_prob=self.config.lstm_dropout_keep_prob)
254 |
255 | with tf.variable_scope("lstm", initializer=self.initializer) as lstm_scope:
256 | # Feed the image embeddings to set the initial LSTM state.
257 | zero_state = lstm_cell.zero_state(
258 | batch_size=self.image_embeddings.get_shape()[0], dtype=tf.float32)
259 | _, initial_state = lstm_cell(self.image_embeddings, zero_state)
260 |
261 | # Allow the LSTM variables to be reused.
262 | lstm_scope.reuse_variables()
263 |
264 | if self.mode == "inference":
265 | # In inference mode, use concatenated states for convenient feeding and
266 | # fetching.
267 | tf.concat(axis=1, values=initial_state, name="initial_state")
268 |
269 | # Placeholder for feeding a batch of concatenated states.
270 | state_feed = tf.placeholder(dtype=tf.float32,
271 | shape=[None, sum(lstm_cell.state_size)],
272 | name="state_feed")
273 | state_tuple = tf.split(value=state_feed, num_or_size_splits=2, axis=1)
274 |
275 | # Run a single LSTM step.
276 | lstm_outputs, state_tuple = lstm_cell(
277 | inputs=tf.squeeze(self.seq_embeddings, axis=[1]),
278 | state=state_tuple)
279 |
280 | # Concatenate the resulting state.
281 | tf.concat(axis=1, values=state_tuple, name="state")
282 | else:
283 | # Run the batch of sequence embeddings through the LSTM.
284 | sequence_length = tf.reduce_sum(self.input_mask, 1)
285 | lstm_outputs, _ = tf.nn.dynamic_rnn(cell=lstm_cell,
286 | inputs=self.seq_embeddings,
287 | sequence_length=sequence_length,
288 | initial_state=initial_state,
289 | dtype=tf.float32,
290 | scope=lstm_scope)
291 |
292 | # Stack batches vertically.
293 | lstm_outputs = tf.reshape(lstm_outputs, [-1, lstm_cell.output_size])
294 |
295 | with tf.variable_scope("logits") as logits_scope:
296 | logits = tf.contrib.layers.fully_connected(
297 | inputs=lstm_outputs,
298 | num_outputs=self.config.vocab_size,
299 | activation_fn=None,
300 | weights_initializer=self.initializer,
301 | scope=logits_scope)
302 |
303 | if self.mode == "inference":
304 | tf.nn.softmax(logits, name="softmax")
305 | else:
306 | targets = tf.reshape(self.target_seqs, [-1])
307 | weights = tf.to_float(tf.reshape(self.input_mask, [-1]))
308 |
309 | # Compute losses.
310 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=targets,
311 | logits=logits)
312 | batch_loss = tf.div(tf.reduce_sum(tf.multiply(losses, weights)),
313 | tf.reduce_sum(weights),
314 | name="batch_loss")
315 | tf.losses.add_loss(batch_loss)
316 | total_loss = tf.losses.get_total_loss()
317 |
318 | # Add summaries.
319 | tf.summary.scalar("losses/batch_loss", batch_loss)
320 | tf.summary.scalar("losses/total_loss", total_loss)
321 | for var in tf.trainable_variables():
322 | tf.summary.histogram("parameters/" + var.op.name, var)
323 |
324 | self.total_loss = total_loss
325 | self.target_cross_entropy_losses = losses # Used in evaluation.
326 | self.target_cross_entropy_loss_weights = weights # Used in evaluation.
327 |
328 | def setup_inception_initializer(self):
329 | """Sets up the function to restore inception variables from checkpoint."""
330 | if self.mode != "inference":
331 | # Restore inception variables only.
332 | saver = tf.train.Saver(self.inception_variables)
333 |
334 | def restore_fn(sess):
335 | tf.logging.info("Restoring Inception variables from checkpoint file %s",
336 | self.config.inception_checkpoint_file)
337 | saver.restore(sess, self.config.inception_checkpoint_file)
338 |
339 | self.init_fn = restore_fn
340 |
341 | def setup_global_step(self):
342 | """Sets up the global step Tensor."""
343 | global_step = tf.Variable(
344 | initial_value=0,
345 | name="global_step",
346 | trainable=False,
347 | collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])
348 |
349 | self.global_step = global_step
350 |
351 | def build(self):
352 | """Creates all ops for training and evaluation."""
353 | self.build_inputs()
354 | self.build_image_embeddings()
355 | self.build_seq_embeddings()
356 | self.build_model()
357 | self.setup_inception_initializer()
358 | self.setup_global_step()
359 |
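
A build sketch (not part of the file), assuming default configuration values and a placeholder shard pattern; in "train" mode build() wires the full pipeline and exposes the tensors consumed by train.py and evaluate.py:

import tensorflow as tf

from im2txt import configuration
from im2txt import show_and_tell_model

model_config = configuration.ModelConfig()
model_config.input_file_pattern = "/path/to/train-?????-of-00256"  # assumed shard layout

g = tf.Graph()
with g.as_default():
    model = show_and_tell_model.ShowAndTellModel(
        model_config, mode="train", train_inception=False)
    model.build()
    # model.image_embeddings: [batch_size, embedding_size]
    # model.seq_embeddings:   [batch_size, padded_length, embedding_size]
    # model.total_loss:       scalar minimized by train.py
    # model.init_fn:          restores the Inception checkpoint before training starts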
--------------------------------------------------------------------------------
/Section 1/im2txt/show_and_tell_model_test.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 |
16 | """Tests for tensorflow_models.im2txt.show_and_tell_model."""
17 |
18 | from __future__ import absolute_import
19 | from __future__ import division
20 | from __future__ import print_function
21 |
22 |
23 | import numpy as np
24 | import tensorflow as tf
25 |
26 | from im2txt import configuration
27 | from im2txt import show_and_tell_model
28 |
29 |
30 | class ShowAndTellModel(show_and_tell_model.ShowAndTellModel):
31 | """Subclass of ShowAndTellModel without the disk I/O."""
32 |
33 | def build_inputs(self):
34 | if self.mode == "inference":
35 | # Inference mode doesn't read from disk, so defer to parent.
36 | return super(ShowAndTellModel, self).build_inputs()
37 | else:
38 | # Replace disk I/O with random Tensors.
39 | self.images = tf.random_uniform(
40 | shape=[self.config.batch_size, self.config.image_height,
41 | self.config.image_width, 3],
42 | minval=-1,
43 | maxval=1)
44 | self.input_seqs = tf.random_uniform(
45 | [self.config.batch_size, 15],
46 | minval=0,
47 | maxval=self.config.vocab_size,
48 | dtype=tf.int64)
49 | self.target_seqs = tf.random_uniform(
50 | [self.config.batch_size, 15],
51 | minval=0,
52 | maxval=self.config.vocab_size,
53 | dtype=tf.int64)
54 | self.input_mask = tf.ones_like(self.input_seqs)
55 |
56 |
57 | class ShowAndTellModelTest(tf.test.TestCase):
58 |
59 | def setUp(self):
60 | super(ShowAndTellModelTest, self).setUp()
61 | self._model_config = configuration.ModelConfig()
62 |
63 | def _countModelParameters(self):
64 | """Counts the number of parameters in the model at top level scope."""
65 | counter = {}
66 | for v in tf.global_variables():
67 | name = v.op.name.split("/")[0]
68 | num_params = v.get_shape().num_elements()
69 | assert num_params
70 | counter[name] = counter.get(name, 0) + num_params
71 | return counter
72 |
73 | def _checkModelParameters(self):
74 | """Verifies the number of parameters in the model."""
75 | param_counts = self._countModelParameters()
76 | expected_param_counts = {
77 | "InceptionV3": 21802784,
78 | # inception_output_size * embedding_size
79 | "image_embedding": 1048576,
80 | # vocab_size * embedding_size
81 | "seq_embedding": 6144000,
82 | # (embedding_size + num_lstm_units + 1) * 4 * num_lstm_units
83 | "lstm": 2099200,
84 | # (num_lstm_units + 1) * vocab_size
85 | "logits": 6156000,
86 | "global_step": 1,
87 | }
88 | self.assertDictEqual(expected_param_counts, param_counts)
89 |
90 | def _checkOutputs(self, expected_shapes, feed_dict=None):
91 | """Verifies that the model produces expected outputs.
92 |
93 | Args:
94 | expected_shapes: A dict mapping Tensor or Tensor name to expected output
95 | shape.
96 | feed_dict: Values of Tensors to feed into Session.run().
97 | """
98 | fetches = expected_shapes.keys()
99 |
100 | with self.test_session() as sess:
101 | sess.run(tf.global_variables_initializer())
102 | outputs = sess.run(fetches, feed_dict)
103 |
104 | for index, output in enumerate(outputs):
105 | tensor = fetches[index]
106 | expected = expected_shapes[tensor]
107 | actual = output.shape
108 | if expected != actual:
109 | self.fail("Tensor %s has shape %s (expected %s)." %
110 | (tensor, actual, expected))
111 |
112 | def testBuildForTraining(self):
113 | model = ShowAndTellModel(self._model_config, mode="train")
114 | model.build()
115 |
116 | self._checkModelParameters()
117 |
118 | expected_shapes = {
119 | # [batch_size, image_height, image_width, 3]
120 | model.images: (32, 299, 299, 3),
121 | # [batch_size, sequence_length]
122 | model.input_seqs: (32, 15),
123 | # [batch_size, sequence_length]
124 | model.target_seqs: (32, 15),
125 | # [batch_size, sequence_length]
126 | model.input_mask: (32, 15),
127 | # [batch_size, embedding_size]
128 | model.image_embeddings: (32, 512),
129 | # [batch_size, sequence_length, embedding_size]
130 | model.seq_embeddings: (32, 15, 512),
131 | # Scalar
132 | model.total_loss: (),
133 | # [batch_size * sequence_length]
134 | model.target_cross_entropy_losses: (480,),
135 | # [batch_size * sequence_length]
136 | model.target_cross_entropy_loss_weights: (480,),
137 | }
138 | self._checkOutputs(expected_shapes)
139 |
140 | def testBuildForEval(self):
141 | model = ShowAndTellModel(self._model_config, mode="eval")
142 | model.build()
143 |
144 | self._checkModelParameters()
145 |
146 | expected_shapes = {
147 | # [batch_size, image_height, image_width, 3]
148 | model.images: (32, 299, 299, 3),
149 | # [batch_size, sequence_length]
150 | model.input_seqs: (32, 15),
151 | # [batch_size, sequence_length]
152 | model.target_seqs: (32, 15),
153 | # [batch_size, sequence_length]
154 | model.input_mask: (32, 15),
155 | # [batch_size, embedding_size]
156 | model.image_embeddings: (32, 512),
157 | # [batch_size, sequence_length, embedding_size]
158 | model.seq_embeddings: (32, 15, 512),
159 | # Scalar
160 | model.total_loss: (),
161 | # [batch_size * sequence_length]
162 | model.target_cross_entropy_losses: (480,),
163 | # [batch_size * sequence_length]
164 | model.target_cross_entropy_loss_weights: (480,),
165 | }
166 | self._checkOutputs(expected_shapes)
167 |
168 | def testBuildForInference(self):
169 | model = ShowAndTellModel(self._model_config, mode="inference")
170 | model.build()
171 |
172 | self._checkModelParameters()
173 |
174 | # Test feeding an image to get the initial LSTM state.
175 | images_feed = np.random.rand(1, 299, 299, 3)
176 | feed_dict = {model.images: images_feed}
177 | expected_shapes = {
178 | # [batch_size, embedding_size]
179 | model.image_embeddings: (1, 512),
180 | # [batch_size, 2 * num_lstm_units]
181 | "lstm/initial_state:0": (1, 1024),
182 | }
183 | self._checkOutputs(expected_shapes, feed_dict)
184 |
185 | # Test feeding a batch of inputs and LSTM states to get softmax output and
186 | # LSTM states.
187 | input_feed = np.random.randint(0, 10, size=3)
188 | state_feed = np.random.rand(3, 1024)
189 | feed_dict = {"input_feed:0": input_feed, "lstm/state_feed:0": state_feed}
190 | expected_shapes = {
191 | # [batch_size, 2 * num_lstm_units]
192 | "lstm/state:0": (3, 1024),
193 | # [batch_size, vocab_size]
194 | "softmax:0": (3, 12000),
195 | }
196 | self._checkOutputs(expected_shapes, feed_dict)
197 |
198 |
199 | if __name__ == "__main__":
200 | tf.test.main()
201 |
--------------------------------------------------------------------------------
/Section 1/im2txt/train.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | # ==============================================================================
15 | """Train the model."""
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 |
22 | import tensorflow as tf
23 |
24 | from im2txt import configuration
25 | from im2txt import show_and_tell_model
26 |
27 | FLAGS = tf.app.flags.FLAGS
28 |
29 | tf.flags.DEFINE_string("input_file_pattern", "",
30 | "File pattern of sharded TFRecord input files.")
31 | tf.flags.DEFINE_string("inception_checkpoint_file", "",
32 | "Path to a pretrained inception_v3 model.")
33 | tf.flags.DEFINE_string("train_dir", "",
34 | "Directory for saving and loading model checkpoints.")
35 | tf.flags.DEFINE_boolean("train_inception", False,
36 | "Whether to train inception submodel variables.")
37 | tf.flags.DEFINE_integer("number_of_steps", 1000000, "Number of training steps.")
38 | tf.flags.DEFINE_integer("log_every_n_steps", 1,
39 | "Frequency at which loss and global step are logged.")
40 |
41 | tf.logging.set_verbosity(tf.logging.INFO)
42 |
43 |
44 | def main(unused_argv):
45 | assert FLAGS.input_file_pattern, "--input_file_pattern is required"
46 | assert FLAGS.train_dir, "--train_dir is required"
47 |
48 | model_config = configuration.ModelConfig()
49 | model_config.input_file_pattern = FLAGS.input_file_pattern
50 | model_config.inception_checkpoint_file = FLAGS.inception_checkpoint_file
51 | training_config = configuration.TrainingConfig()
52 |
53 | # Create training directory.
54 | train_dir = FLAGS.train_dir
55 | if not tf.gfile.IsDirectory(train_dir):
56 | tf.logging.info("Creating training directory: %s", train_dir)
57 | tf.gfile.MakeDirs(train_dir)
58 |
59 | # Build the TensorFlow graph.
60 | g = tf.Graph()
61 | with g.as_default():
62 | # Build the model.
63 | model = show_and_tell_model.ShowAndTellModel(
64 | model_config, mode="train", train_inception=FLAGS.train_inception)
65 | model.build()
66 |
67 | # Set up the learning rate.
68 | learning_rate_decay_fn = None
69 | if FLAGS.train_inception:
70 | learning_rate = tf.constant(training_config.train_inception_learning_rate)
71 | else:
72 | learning_rate = tf.constant(training_config.initial_learning_rate)
73 | if training_config.learning_rate_decay_factor > 0:
74 | num_batches_per_epoch = (training_config.num_examples_per_epoch /
75 | model_config.batch_size)
76 | decay_steps = int(num_batches_per_epoch *
77 | training_config.num_epochs_per_decay)
78 |
79 | def _learning_rate_decay_fn(learning_rate, global_step):
80 | return tf.train.exponential_decay(
81 | learning_rate,
82 | global_step,
83 | decay_steps=decay_steps,
84 | decay_rate=training_config.learning_rate_decay_factor,
85 | staircase=True)
86 |
87 | learning_rate_decay_fn = _learning_rate_decay_fn
88 |
89 | # Set up the training ops.
90 | train_op = tf.contrib.layers.optimize_loss(
91 | loss=model.total_loss,
92 | global_step=model.global_step,
93 | learning_rate=learning_rate,
94 | optimizer=training_config.optimizer,
95 | clip_gradients=training_config.clip_gradients,
96 | learning_rate_decay_fn=learning_rate_decay_fn)
97 |
98 | # Set up the Saver for saving and restoring model checkpoints.
99 | saver = tf.train.Saver(max_to_keep=training_config.max_checkpoints_to_keep)
100 |
101 | # Run training.
102 | tf.contrib.slim.learning.train(
103 | train_op,
104 | train_dir,
105 | log_every_n_steps=FLAGS.log_every_n_steps,
106 | graph=g,
107 | global_step=model.global_step,
108 | number_of_steps=FLAGS.number_of_steps,
109 | init_fn=model.init_fn,
110 | saver=saver)
111 |
112 |
113 | if __name__ == "__main__":
114 | tf.app.run()
115 |
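
A worked sketch (not part of the file) of the staircase decay schedule configured above; the numbers are assumed here for illustration and the real defaults live in configuration.py:

# Assumed illustrative values.
initial_learning_rate = 2.0
learning_rate_decay_factor = 0.5
num_examples_per_epoch = 586363
batch_size = 32
num_epochs_per_decay = 8.0

num_batches_per_epoch = num_examples_per_epoch / batch_size
decay_steps = int(num_batches_per_epoch * num_epochs_per_decay)  # 146590 steps per decay

def decayed_learning_rate(global_step):
    # Same formula as tf.train.exponential_decay with staircase=True.
    return initial_learning_rate * learning_rate_decay_factor ** (global_step // decay_steps)

print(decayed_learning_rate(0))                # 2.0
print(decayed_learning_rate(decay_steps))      # 1.0
print(decayed_learning_rate(3 * decay_steps))  # 0.25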
--------------------------------------------------------------------------------
/Section 1/test_images/ballons.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/test_images/ballons.jpeg
--------------------------------------------------------------------------------
/Section 1/test_images/bike.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/test_images/bike.jpeg
--------------------------------------------------------------------------------
/Section 1/test_images/fireworks.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/test_images/fireworks.jpeg
--------------------------------------------------------------------------------
/Section 1/test_images/football.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/test_images/football.jpeg
--------------------------------------------------------------------------------
/Section 1/test_images/headphones.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/test_images/headphones.jpeg
--------------------------------------------------------------------------------
/Section 1/test_images/laughing.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 1/test_images/laughing.jpeg
--------------------------------------------------------------------------------
/Section 2/knn.p:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 2/knn.p
--------------------------------------------------------------------------------
/Section 2/tests/Thumbs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 2/tests/Thumbs.db
--------------------------------------------------------------------------------
/Section 2/tests/p2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 2/tests/p2.jpg
--------------------------------------------------------------------------------
/Section 2/tests/p5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 2/tests/p5.jpg
--------------------------------------------------------------------------------
/Section 2/tests/p7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 2/tests/p7.jpg
--------------------------------------------------------------------------------
/Section 2/tests/p9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 2/tests/p9.jpg
--------------------------------------------------------------------------------
/Section 3/AUTHORS:
--------------------------------------------------------------------------------
1 | Eldar Insafutdinov, github.com/eldar
2 | Mikhaylo Andriluka, github.com/andriluka
3 | Mihai Fieraru, github.com/fierarufmihai
--------------------------------------------------------------------------------
/Section 3/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pprint
3 | import logging
4 |
5 | import yaml
6 | from easydict import EasyDict as edict
7 |
8 | import default_config
9 |
10 |
11 | cfg = default_config.cfg
12 |
13 |
14 | def _merge_a_into_b(a, b):
15 | """Merge config dictionary a into config dictionary b, clobbering the
16 | options in b whenever they are also specified in a.
17 | """
18 | if type(a) is not edict:
19 | return
20 |
21 | for k, v in a.items():
22 | # a must specify keys that are in b
23 | #if k not in b:
24 | # raise KeyError('{} is not a valid config key'.format(k))
25 |
26 | # recursively merge dicts
27 | if type(v) is edict:
28 | try:
29 | _merge_a_into_b(a[k], b[k])
30 | except:
31 | print('Error under config key: {}'.format(k))
32 | raise
33 | else:
34 | b[k] = v
35 |
36 |
37 | def cfg_from_file(filename):
38 | """Load a config from file filename and merge it into the default options.
39 | """
40 | with open(filename, 'r') as f:
41 |         yaml_cfg = edict(yaml.safe_load(f))  # safe_load: yaml.load() without a Loader fails on newer PyYAML
42 |
43 | _merge_a_into_b(yaml_cfg, cfg)
44 |
45 | logging.info("Config:\n"+pprint.pformat(cfg))
46 | return cfg
47 |
48 |
49 | def load_config(filename = "pose_cfg.yaml"):
50 | if 'POSE_PARAM_PATH' in os.environ:
51 | filename = os.environ['POSE_PARAM_PATH'] + '/' + filename
52 | return cfg_from_file(filename)
53 |
54 |
55 | if __name__ == "__main__":
56 | print(load_config())
--------------------------------------------------------------------------------
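Note on config.py above: it merges a YAML file over the defaults in default_config.cfg via _merge_a_into_b() and exposes cfg_from_file() and load_config(). A minimal usage sketch; the YAML path is just an example taken from the files later in this listing:

    # sketch: load one of the pose_cfg.yaml files over the defaults
    from config import cfg_from_file

    cfg = cfg_from_file("models/mpii/test/pose_cfg.yaml")
    print(cfg.net_type)      # resnet_101
    print(cfg.global_scale)  # 1.0
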
/Section 3/dlib_face_recognition_resnet_model_v1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/dlib_face_recognition_resnet_model_v1.dat
--------------------------------------------------------------------------------
/Section 3/models/coco/coco-resnet-101.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/coco-resnet-101.meta
--------------------------------------------------------------------------------
/Section 3/models/coco/download_models.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/coco-resnet-101.data-00000-of-00001
4 | curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/coco-resnet-101.meta
5 | curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/coco-resnet-101.index
6 |
7 | curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/pairwise_coco.tar.gz
8 | tar xvzf pairwise_coco.tar.gz
9 |
--------------------------------------------------------------------------------
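Note: the script above fetches the three pieces of a TensorFlow v1 checkpoint (coco-resnet-101.meta / .index / .data-00000-of-00001) plus the pairwise models archive. A hedged sketch of restoring that checkpoint with the TF1 API; the actual loading path used by the Section 3 code may differ:

    # illustrative only: restore the downloaded coco-resnet-101 checkpoint
    # (TF1-style API; use the tf.compat.v1 equivalents under TensorFlow 2)
    import tensorflow as tf

    with tf.Session() as sess:
        saver = tf.train.import_meta_graph("models/coco/coco-resnet-101.meta")
        saver.restore(sess, "models/coco/coco-resnet-101")  # prefix of the .index/.data files
        print("restored", len(tf.global_variables()), "variables")
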
/Section 3/models/coco/pairwise/pairwise_stats.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/pairwise_stats.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_10_12.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_10_12.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_10_16.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_10_16.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_10_17.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_10_17.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_11_12.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_11_12.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_11_14.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_11_14.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_11_15.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_11_15.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_11_16.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_11_16.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_12_13.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_12_13.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_12_14.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_12_14.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_12_15.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_12_15.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_12_17.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_12_17.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_13_14.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_13_14.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_13_16.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_13_16.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_13_17.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_13_17.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_14_15.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_14_15.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_14_16.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_14_16.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_14_17.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_14_17.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_15_16.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_15_16.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_16_17.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_16_17.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_1_10.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_10.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_1_11.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_11.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_1_12.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_12.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_1_14.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_14.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_1_15.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_15.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_1_17.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_17.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_1_3.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_3.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_1_4.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_4.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_1_8.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_8.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_1_9.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_1_9.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_2_10.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_10.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_2_11.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_11.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_2_13.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_13.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_2_16.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_16.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_2_3.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_3.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_2_5.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_5.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_2_7.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_7.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_2_9.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_2_9.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_3_11.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_11.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_3_13.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_13.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_3_14.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_14.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_3_16.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_16.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_3_4.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_4.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_3_5.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_5.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_3_7.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_7.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_3_8.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_3_8.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_4_10.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_10.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_4_12.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_12.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_4_14.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_14.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_4_15.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_15.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_4_16.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_16.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_4_6.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_6.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_4_7.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_7.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_4_9.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_4_9.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_5_10.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_5_10.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_5_11.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_5_11.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_5_12.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_5_12.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_5_14.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_5_14.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_5_6.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_5_6.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_5_8.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_5_8.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_5_9.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_5_9.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_6_10.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_10.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_6_13.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_13.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_6_14.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_14.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_6_15.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_15.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_6_16.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_16.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_6_17.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_17.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_6_8.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_8.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_6_9.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_6_9.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_7_10.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_7_10.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_7_11.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_7_11.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_7_13.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_7_13.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_7_15.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_7_15.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_7_17.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_7_17.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_8_10.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_8_10.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_8_12.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_8_12.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_8_13.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_8_13.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_8_14.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_8_14.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_8_15.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_8_15.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_8_17.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_8_17.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_9_14.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_9_14.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_9_16.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_9_16.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise/spatial_model_cidx_9_17.mat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise/spatial_model_cidx_9_17.mat
--------------------------------------------------------------------------------
/Section 3/models/coco/pairwise_coco.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/coco/pairwise_coco.tar.gz
--------------------------------------------------------------------------------
/Section 3/models/coco/train/pose_cfg.yaml:
--------------------------------------------------------------------------------
1 | dataset: /path/to/coco/dataset
2 | dataset_phase: train2014
3 | dataset_ann: person_keypoints # 'image_info' or 'person_keypoints'
4 | dataset_type: coco
5 | coco_only_images_with_people: true
6 |
7 | global_scale: 1.0
8 | pos_dist_thresh: 17
9 | scale_jitter_lo: 0.85
10 | scale_jitter_up: 1.15
11 |
12 | net_type: resnet_101
13 | init_weights: ../../pretrained/resnet_v1_101.ckpt
14 |
15 | location_refinement: true
16 | locref_huber_loss: true
17 | locref_loss_weight: 0.05
18 | locref_stdev: 7.2801
19 |
20 | pairwise_predict: true
21 | pairwise_huber_loss: true
22 | pairwise_loss_weight: 0.05
23 | pairwise_stats_fn: ../pairwise/pairwise_stats.mat
24 |
25 | intermediate_supervision: true
26 | intermediate_supervision_layer: 12
27 |
28 | max_input_size: 850
29 | multi_step:
30 | - [0.005, 10000]
31 | - [0.02, 450000]
32 | - [0.002, 750000]
33 | - [0.0005, 1050000]
34 | - [0.0002, 1550000]
35 | - [0.00005, 1800000]
36 | display_iters: 20
37 | save_iters: 200000
38 |
39 | mirror: true
40 |
--------------------------------------------------------------------------------
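Note on the schedule above: multi_step pairs a learning rate with the iteration up to which it applies (the usual reading in DeeperCut-style training loops); under that assumption the schedule can be read with the small, purely illustrative helper below:

    # hypothetical helper, not part of the repository: look up the learning rate
    # implied by the multi_step entries in pose_cfg.yaml
    MULTI_STEP = [
        (0.005, 10000),
        (0.02, 450000),
        (0.002, 750000),
        (0.0005, 1050000),
        (0.0002, 1550000),
        (0.00005, 1800000),
    ]

    def learning_rate(iteration):
        for lr, until in MULTI_STEP:
            if iteration <= until:
                return lr
        return MULTI_STEP[-1][0]  # keep the final rate past the last boundary

    print(learning_rate(5000))    # 0.005
    print(learning_rate(500000))  # 0.002
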
/Section 3/models/mpii/download_models.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/mpii-single-resnet-101.data-00000-of-00001
4 | curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/mpii-single-resnet-101.meta
5 | curl -L -O https://datasets.d2.mpi-inf.mpg.de/deepercut-models-tensorflow/mpii-single-resnet-101.index
6 |
--------------------------------------------------------------------------------
/Section 3/models/mpii/mpii-single-resnet-101.index:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/mpii/mpii-single-resnet-101.index
--------------------------------------------------------------------------------
/Section 3/models/mpii/mpii-single-resnet-101.meta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/models/mpii/mpii-single-resnet-101.meta
--------------------------------------------------------------------------------
/Section 3/models/mpii/test/pose_cfg.yaml:
--------------------------------------------------------------------------------
1 | dataset: dataset-test.mat
2 | dataset_type: "mpii"
3 | global_scale: 1.0
4 | init_weights: models/mpii/snapshot-1030000
5 | location_refinement: true
6 | locref_stdev: 7.2801
7 | net_type: resnet_101
8 | scoremap_dir: test
9 |
--------------------------------------------------------------------------------
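Note: config.py (earlier in this listing) resolves a relative pose_cfg.yaml through the POSE_PARAM_PATH environment variable, so this test configuration can be selected without copying it. A minimal sketch, assuming the working directory is Section 3:

    # sketch: point config.load_config() at the mpii test configuration
    import os
    os.environ["POSE_PARAM_PATH"] = "models/mpii/test"

    from config import load_config
    cfg = load_config()        # reads models/mpii/test/pose_cfg.yaml
    print(cfg.init_weights)    # models/mpii/snapshot-1030000
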
/Section 3/models/mpii/train/pose_cfg.yaml:
--------------------------------------------------------------------------------
1 | dataset: /path/to/dataset.mat
2 | dataset_type: "mpii"
3 |
4 | pos_dist_thresh: 17
5 | global_scale: 0.8452830189
6 | scale_jitter_lo: 0.85
7 | scale_jitter_up: 1.15
8 |
9 | net_type: resnet_101
10 | init_weights: ../../pretrained/resnet_v1_101.ckpt
11 |
12 | location_refinement: true
13 | locref_huber_loss: true
14 | locref_loss_weight: 0.05
15 | locref_stdev: 7.2801
16 |
17 | intermediate_supervision: true
18 | intermediate_supervision_layer: 12
19 |
20 | max_input_size: 850
21 | multi_step:
22 | - [0.005, 10000]
23 | - [0.02, 430000]
24 | - [0.002, 730000]
25 | - [0.001, 1030000]
26 | display_iters: 20
27 | save_iters: 60000
28 |
29 | mirror: true
30 |
--------------------------------------------------------------------------------
/Section 3/models/pretrained/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | # Listed here https://github.com/tensorflow/models/tree/master/slim#pre-trained-models
4 |
5 | curl http://download.tensorflow.org/models/resnet_v1_50_2016_08_28.tar.gz | tar xvz
6 | curl http://download.tensorflow.org/models/resnet_v1_101_2016_08_28.tar.gz | tar xvz
7 |
--------------------------------------------------------------------------------
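Note: the train configs above point at the extracted weights as init_weights: ../../pretrained/resnet_v1_101.ckpt. A quick, illustrative way to confirm the download by listing the variables stored in the checkpoint (TF1 checkpoint-reader API; under TensorFlow 2 use tf.compat.v1 or tf.train.load_checkpoint):

    # illustrative only: inspect the downloaded ResNet-101 checkpoint
    import tensorflow as tf

    reader = tf.train.NewCheckpointReader("resnet_v1_101.ckpt")
    shapes = reader.get_variable_to_shape_map()
    for name in sorted(shapes)[:10]:
        print(name, shapes[name])
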
/Section 3/pexels-photo-712521.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/pexels-photo-712521.jpeg
--------------------------------------------------------------------------------
/Section 3/pexels-photo-776615.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/pexels-photo-776615.jpeg
--------------------------------------------------------------------------------
/Section 3/testcases/vids/boy_walking.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/testcases/vids/boy_walking.mp4
--------------------------------------------------------------------------------
/Section 3/testcases/vids/sidewalk.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Computer-Vision-Projects/14b07cf240e2b63ca9bb2c32cc8a94ec78c818ee/Section 3/testcases/vids/sidewalk.mp4
--------------------------------------------------------------------------------