├── .gitignore ├── 1. O'Reilly Training.ipynb ├── 2. O'Reilly Generate.ipynb ├── 3. O'Reilly Generate from image.ipynb ├── README.md ├── data └── README.md ├── dockerfiles ├── Dockerfile ├── Dockerfile.cpu ├── Dockerfile.gpu └── requirements.txt └── models ├── tensorflow └── .keep └── tf_final ├── checkpoint ├── model-500.data-00000-of-00001 ├── model-500.index └── model-500.meta /.gitignore: -------------------------------------------------------------------------------- 1 | *.jpg 2 | *.npy 3 | *.h5 4 | *.json 5 | *.mat 6 | -------------------------------------------------------------------------------- /1. O'Reilly Training.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Training a caption generator\n", 11 | "This notebook implements the Show and Tell caption generation model described in our corresponding article. The key portions of this notebook are loading the data with `get_data`, processing the text data with `preProBuildWordVocab`, building the `Caption_Generator` in `train` and tracking our progress.\n", 12 | "\n", 13 | "*Note:* create a directory to save your tensorflow models and assign this directory path to the `model_path` variable." 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": { 20 | "collapsed": false, 21 | "deletable": true, 22 | "editable": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "import math\n", 27 | "import os\n", 28 | "import tensorflow as tf\n", 29 | "import numpy as np\n", 30 | "import pandas as pd\n", 31 | "import pickle\n", 32 | "import cv2\n", 33 | "import skimage\n", 34 | "import pickle as pkl\n", 35 | "\n", 36 | "import tensorflow.python.platform\n", 37 | "from keras.preprocessing import sequence\n", 38 | "from collections import Counter" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "# Downloading Data\n", 46 | "As mentioned in the README, in order to run this notebook, you will need VGG-16 image embeddings for the Flickr-30K dataset. These image embeddings are available from our [Google Drive](https://drive.google.com/file/d/0B5o40yxdA9PqTnJuWGVkcFlqcG8/view?usp=sharing).\n", 47 | "\n", 48 | "Additionally, you will need the corresponding captions for these images (`results_20130124.token`), which can also be downloaded from our [Google Drive](https://drive.google.com/file/d/0B2vTU3h54lTydXFjSVM5T2t4WmM/view?usp=sharing).\n", 49 | "\n", 50 | "Place all of these downloads in the `./data/` folder.\n", 51 | "\n", 52 | "The feature embeddings will be in `./data/feats.npy` and the embeddings' corresponding captions will be saved to `./data/results_20130124.token` ." 
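A quick way to confirm the downloads are in place before going further: the minimal sketch below is not part of the original notebook and simply assumes the two files were saved under `./data/` with the names given above.

```
import os
import numpy as np
import pandas as pd

feature_path = './data/feats.npy'
annotation_path = './data/results_20130124.token'

for path in (feature_path, annotation_path):
    print(path, 'found' if os.path.exists(path) else 'MISSING')

# feats is expected to hold one 4096-dimensional VGG-16 feature vector per caption,
# aligned row-for-row with the caption file loaded below.
feats = np.load(feature_path, 'r')   # 'r' memory-maps the array read-only
annotations = pd.read_table(annotation_path, sep='\t', header=None, names=['image', 'caption'])
print(feats.shape, len(annotations))
```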
53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "collapsed": true, 60 | "deletable": true, 61 | "editable": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "model_path = './models/tensorflow'\n", 66 | "model_path_transfer = './models/tf_final'\n", 67 | "feature_path = './data/feats.npy'\n", 68 | "annotation_path = './data/results_20130124.token'" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "deletable": true, 75 | "editable": true 76 | }, 77 | "source": [ 78 | "## Loading data\n", 79 | "Parse the image embedding features from the Flickr30k dataset `./data/feats.npy`, and load the caption data via `pandas` from `./data/results_20130124.token`" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": true, 87 | "deletable": true, 88 | "editable": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "def get_data(annotation_path, feature_path):\n", 93 | " annotations = pd.read_table(annotation_path, sep='\\t', header=None, names=['image', 'caption'])\n", 94 | " return np.load(feature_path,'r'), annotations['caption'].values" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": { 101 | "collapsed": false, 102 | "deletable": true, 103 | "editable": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "feats, captions = get_data(annotation_path, feature_path)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": false, 115 | "deletable": true, 116 | "editable": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "print(feats.shape)\n", 121 | "print(captions.shape)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": false, 129 | "deletable": true, 130 | "editable": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "print(captions[0])" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": { 141 | "collapsed": true, 142 | "deletable": true, 143 | "editable": true 144 | }, 145 | "outputs": [], 146 | "source": [ 147 | "def preProBuildWordVocab(sentence_iterator, word_count_threshold=30): # function from Andre Karpathy's NeuralTalk\n", 148 | " print('preprocessing %d word vocab' % (word_count_threshold, ))\n", 149 | " word_counts = {}\n", 150 | " nsents = 0\n", 151 | " for sent in sentence_iterator:\n", 152 | " nsents += 1\n", 153 | " for w in sent.lower().split(' '):\n", 154 | " word_counts[w] = word_counts.get(w, 0) + 1\n", 155 | " vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]\n", 156 | " print('preprocessed words %d -> %d' % (len(word_counts), len(vocab)))\n", 157 | "\n", 158 | " ixtoword = {}\n", 159 | " ixtoword[0] = '.' 
\n", 160 | " wordtoix = {}\n", 161 | " wordtoix['#START#'] = 0 \n", 162 | " ix = 1\n", 163 | " for w in vocab:\n", 164 | " wordtoix[w] = ix\n", 165 | " ixtoword[ix] = w\n", 166 | " ix += 1\n", 167 | "\n", 168 | " word_counts['.'] = nsents\n", 169 | " bias_init_vector = np.array([1.0*word_counts[ixtoword[i]] for i in ixtoword])\n", 170 | " bias_init_vector /= np.sum(bias_init_vector) \n", 171 | " bias_init_vector = np.log(bias_init_vector)\n", 172 | " bias_init_vector -= np.max(bias_init_vector) \n", 173 | " return wordtoix, ixtoword, bias_init_vector.astype(np.float32)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "collapsed": false, 181 | "deletable": true, 182 | "editable": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "class Caption_Generator():\n", 187 | " def __init__(self, dim_in, dim_embed, dim_hidden, batch_size, n_lstm_steps, n_words, init_b):\n", 188 | "\n", 189 | " self.dim_in = dim_in\n", 190 | " self.dim_embed = dim_embed\n", 191 | " self.dim_hidden = dim_hidden\n", 192 | " self.batch_size = batch_size\n", 193 | " self.n_lstm_steps = n_lstm_steps\n", 194 | " self.n_words = n_words\n", 195 | " \n", 196 | " # declare the variables to be used for our word embeddings\n", 197 | " with tf.device(\"/cpu:0\"):\n", 198 | " self.word_embedding = tf.Variable(tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1), name='word_embedding')\n", 199 | "\n", 200 | " self.embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='embedding_bias')\n", 201 | " \n", 202 | " # declare the LSTM itself\n", 203 | " self.lstm = tf.contrib.rnn.BasicLSTMCell(dim_hidden)\n", 204 | " \n", 205 | " # declare the variables to be used to embed the image feature embedding to the word embedding space\n", 206 | " self.img_embedding = tf.Variable(tf.random_uniform([dim_in, dim_hidden], -0.1, 0.1), name='img_embedding')\n", 207 | " self.img_embedding_bias = tf.Variable(tf.zeros([dim_hidden]), name='img_embedding_bias')\n", 208 | "\n", 209 | " # declare the variables to go from an LSTM output to a word encoding output\n", 210 | " self.word_encoding = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='word_encoding')\n", 211 | " # initialize this bias variable from the preProBuildWordVocab output\n", 212 | " self.word_encoding_bias = tf.Variable(init_b, name='word_encoding_bias')\n", 213 | "\n", 214 | " def build_model(self):\n", 215 | " # declaring the placeholders for our extracted image feature vectors, our caption, and our mask\n", 216 | " # (describes how long our caption is with an array of 0/1 values of length `maxlen` \n", 217 | " img = tf.placeholder(tf.float32, [self.batch_size, self.dim_in])\n", 218 | " caption_placeholder = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])\n", 219 | " mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])\n", 220 | " \n", 221 | " # getting an initial LSTM embedding from our image_imbedding\n", 222 | " image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias\n", 223 | " \n", 224 | " # setting initial state of our LSTM\n", 225 | " state = self.lstm.zero_state(self.batch_size, dtype=tf.float32)\n", 226 | "\n", 227 | " total_loss = 0.0\n", 228 | " with tf.variable_scope(\"RNN\"):\n", 229 | " for i in range(self.n_lstm_steps): \n", 230 | " if i > 0:\n", 231 | " #if this isn’t the first iteration of our LSTM we need to get the word_embedding corresponding\n", 232 | " # to the (i-1)th word in our caption \n", 233 | " with 
tf.device(\"/cpu:0\"):\n", 234 | " current_embedding = tf.nn.embedding_lookup(self.word_embedding, caption_placeholder[:,i-1]) + self.embedding_bias\n", 235 | " else:\n", 236 | " #if this is the first iteration of our LSTM we utilize the embedded image as our input \n", 237 | " current_embedding = image_embedding\n", 238 | " if i > 0: \n", 239 | " # allows us to reuse the LSTM tensor variable on each iteration\n", 240 | " tf.get_variable_scope().reuse_variables()\n", 241 | "\n", 242 | " out, state = self.lstm(current_embedding, state)\n", 243 | "\n", 244 | " \n", 245 | " if i > 0:\n", 246 | " #get the one-hot representation of the next word in our caption \n", 247 | " labels = tf.expand_dims(caption_placeholder[:, i], 1)\n", 248 | " ix_range=tf.range(0, self.batch_size, 1)\n", 249 | " ixs = tf.expand_dims(ix_range, 1)\n", 250 | " concat = tf.concat([ixs, labels],1)\n", 251 | " onehot = tf.sparse_to_dense(\n", 252 | " concat, tf.stack([self.batch_size, self.n_words]), 1.0, 0.0)\n", 253 | "\n", 254 | "\n", 255 | " #perform a softmax classification to generate the next word in the caption\n", 256 | " logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias\n", 257 | " xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=onehot)\n", 258 | " xentropy = xentropy * mask[:,i]\n", 259 | "\n", 260 | " loss = tf.reduce_sum(xentropy)\n", 261 | " total_loss += loss\n", 262 | "\n", 263 | " total_loss = total_loss / tf.reduce_sum(mask[:,1:])\n", 264 | " return total_loss, img, caption_placeholder, mask\n" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "collapsed": false, 272 | "deletable": true, 273 | "editable": true 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "### Parameters ###\n", 278 | "dim_embed = 256\n", 279 | "dim_hidden = 256\n", 280 | "dim_in = 4096\n", 281 | "batch_size = 128\n", 282 | "momentum = 0.9\n", 283 | "n_epochs = 150\n", 284 | "\n", 285 | "def train(learning_rate=0.001, continue_training=False, transfer=True):\n", 286 | " \n", 287 | " tf.reset_default_graph()\n", 288 | "\n", 289 | " feats, captions = get_data(annotation_path, feature_path)\n", 290 | " wordtoix, ixtoword, init_b = preProBuildWordVocab(captions)\n", 291 | "\n", 292 | " np.save('data/ixtoword', ixtoword)\n", 293 | "\n", 294 | " index = (np.arange(len(feats)).astype(int))\n", 295 | " np.random.shuffle(index)\n", 296 | "\n", 297 | "\n", 298 | " sess = tf.InteractiveSession()\n", 299 | " n_words = len(wordtoix)\n", 300 | " maxlen = np.max( [x for x in map(lambda x: len(x.split(' ')), captions) ] )\n", 301 | " caption_generator = Caption_Generator(dim_in, dim_hidden, dim_embed, batch_size, maxlen+2, n_words, init_b)\n", 302 | "\n", 303 | " loss, image, sentence, mask = caption_generator.build_model()\n", 304 | "\n", 305 | " saver = tf.train.Saver(max_to_keep=100)\n", 306 | " global_step=tf.Variable(0,trainable=False)\n", 307 | " learning_rate = tf.train.exponential_decay(learning_rate, global_step,\n", 308 | " int(len(index)/batch_size), 0.95)\n", 309 | " train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)\n", 310 | " tf.global_variables_initializer().run()\n", 311 | "\n", 312 | " if continue_training:\n", 313 | " if not transfer:\n", 314 | " saver.restore(sess,tf.train.latest_checkpoint(model_path))\n", 315 | " else:\n", 316 | " saver.restore(sess,tf.train.latest_checkpoint(model_path_transfer))\n", 317 | " losses=[]\n", 318 | " for epoch in range(n_epochs):\n", 319 | " for start, end in zip( range(0, 
len(index), batch_size), range(batch_size, len(index), batch_size)):\n", 320 | "\n", 321 | " current_feats = feats[index[start:end]]\n", 322 | " current_captions = captions[index[start:end]]\n", 323 | " current_caption_ind = [x for x in map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ')[:-1] if word in wordtoix], current_captions)]\n", 324 | "\n", 325 | " current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=maxlen+1)\n", 326 | " current_caption_matrix = np.hstack( [np.full( (len(current_caption_matrix),1), 0), current_caption_matrix] )\n", 327 | "\n", 328 | " current_mask_matrix = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))\n", 329 | " nonzeros = np.array([x for x in map(lambda x: (x != 0).sum()+2, current_caption_matrix )])\n", 330 | "\n", 331 | " for ind, row in enumerate(current_mask_matrix):\n", 332 | " row[:nonzeros[ind]] = 1\n", 333 | "\n", 334 | " _, loss_value = sess.run([train_op, loss], feed_dict={\n", 335 | " image: current_feats.astype(np.float32),\n", 336 | " sentence : current_caption_matrix.astype(np.int32),\n", 337 | " mask : current_mask_matrix.astype(np.float32)\n", 338 | " })\n", 339 | "\n", 340 | " print(\"Current Cost: \", loss_value, \"\\t Epoch {}/{}\".format(epoch, n_epochs), \"\\t Iter {}/{}\".format(start,len(feats)))\n", 341 | " print(\"Saving the model from epoch: \", epoch)\n", 342 | " saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": false, 350 | "deletable": true, 351 | "editable": true 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "try:\n", 356 | " #train(.001,False,False) #train from scratch\n", 357 | " train(.001,True,True) #continue training from pretrained weights @epoch500\n", 358 | " #train(.001,True,False) #train from previously saved weights \n", 359 | "except KeyboardInterrupt:\n", 360 | " print('Exiting Training')" 361 | ] 362 | } 363 | ], 364 | "metadata": { 365 | "anaconda-cloud": {}, 366 | "kernelspec": { 367 | "display_name": "Python [conda root]", 368 | "language": "python", 369 | "name": "conda-root-py" 370 | }, 371 | "language_info": { 372 | "codemirror_mode": { 373 | "name": "ipython", 374 | "version": 2 375 | }, 376 | "file_extension": ".py", 377 | "mimetype": "text/x-python", 378 | "name": "python", 379 | "nbconvert_exporter": "python", 380 | "pygments_lexer": "ipython2", 381 | "version": "2.7.12" 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 1 386 | } 387 | -------------------------------------------------------------------------------- /2. O'Reilly Generate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Generating captions from Flickr-30K feature embeddings\n", 11 | "This notebook generates captions from the Flickr-30K dataset. This allows us to analyze the quality of captions generated by the LSTM we trained in the `O'Reilly Training.ipynb` notebook. Unfortunately, we can not see the images associated with the Feature embedding. 
For that please look at the `O'Reilly Generate from images.ipynb` notebook.\n", 12 | "\n", 13 | "*Note:* The `model_path` variable should be the same as the `model_path` variable set in `O'Reilly Training.ipynb`\n", 14 | "\n", 15 | "# Make sure to run the 'O'Reilly Training.ipynb' notebook for at least one epoch before running this notebook." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": { 22 | "collapsed": false, 23 | "deletable": true, 24 | "editable": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import math\n", 29 | "import os\n", 30 | "import tensorflow as tf\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "import pickle\n", 34 | "\n", 35 | "import tensorflow.python.platform\n", 36 | "from keras.preprocessing import sequence\n", 37 | "from collections import Counter" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "# Downloading Data\n", 45 | "As mentioned in the README, in order to run this notebook, you will need VGG-16 image embeddings for the Flickr-30K dataset. These image embeddings are available from our [Google Drive](https://drive.google.com/file/d/0B5o40yxdA9PqTnJuWGVkcFlqcG8/view?usp=sharing).\n", 46 | "\n", 47 | "Additionally, you will need the corresponding captions for these images (`results_20130124.token`), which can also be downloaded from our [Google Drive](https://drive.google.com/file/d/0B2vTU3h54lTydXFjSVM5T2t4WmM/view?usp=sharing).\n", 48 | "\n", 49 | "(**Note**: These are the same downloads from `O'Reilly Training.ipynb`. If you downloaded them while working through that notebook there is no need to download them again)\n", 50 | "\n", 51 | "Place all of these downloads in the `./data/` folder.\n", 52 | "\n", 53 | "The feature embeddings will be in `./data/feats.npy` and the embeddings' corresponding captions will be saved to `./data/results_20130124.token` ." 
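Because Flickr-30K provides five reference captions per image, it can be handy to pull them up next to a generated caption when judging its quality. The helper below is illustrative (it is not part of the original notebook) and assumes entries in the `image` column of the token file look like `1000092795.jpg#0`, i.e. `<image name>#<caption number>`.

```
import pandas as pd

def reference_captions(annotation_path, row_idx):
    # Return every ground-truth caption for the image referenced by row `row_idx`.
    annotations = pd.read_table(annotation_path, sep='\t', header=None, names=['image', 'caption'])
    image_name = annotations['image'][row_idx].split('#')[0]
    same_image = annotations['image'].str.startswith(image_name)
    return annotations['caption'][same_image].tolist()

# Example usage: print(reference_captions('./data/results_20130124.token', 0))
```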
54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": true, 61 | "deletable": true, 62 | "editable": true 63 | }, 64 | "outputs": [], 65 | "source": [ 66 | "model_path = './models/tensorflow'\n", 67 | "feature_path = './data/feats.npy'\n", 68 | "annotation_path = './data/results_20130124.token'" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": true, 76 | "deletable": true, 77 | "editable": true 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "### Set Hyperparameters ###\n", 82 | "dim_embed = 256\n", 83 | "dim_hidden = 256\n", 84 | "dim_in = 4096\n", 85 | "batch_size = 1\n", 86 | "learning_rate = 0.001\n", 87 | "momentum = 0.9\n", 88 | "n_epochs = 25" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": true, 96 | "deletable": true, 97 | "editable": true 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "def get_data(annotation_path, feature_path):\n", 102 | " annotations = pd.read_table(annotation_path, sep='\\t', header=None, names=['image', 'caption'])\n", 103 | " return np.load(feature_path,'r'), annotations['caption'].values" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": false, 111 | "deletable": true, 112 | "editable": true 113 | }, 114 | "outputs": [], 115 | "source": [ 116 | "feats, captions = get_data(annotation_path, feature_path)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": false, 124 | "deletable": true, 125 | "editable": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "print(feats.shape)\n", 130 | "print(captions.shape)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": false, 138 | "deletable": true, 139 | "editable": true 140 | }, 141 | "outputs": [], 142 | "source": [ 143 | "print(captions[0])" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": { 150 | "collapsed": false, 151 | "deletable": true, 152 | "editable": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "class Caption_Generator():\n", 157 | " def __init__(self, dim_in, dim_embed, dim_hidden, batch_size, n_lstm_steps, n_words, init_b=None):\n", 158 | "\n", 159 | " self.dim_in = dim_in\n", 160 | " self.dim_embed = dim_embed\n", 161 | " self.dim_hidden = dim_hidden\n", 162 | " self.batch_size = batch_size\n", 163 | " self.n_lstm_steps = n_lstm_steps\n", 164 | " self.n_words = n_words\n", 165 | " \n", 166 | " # declare the variables to be used for our word embeddings\n", 167 | " with tf.device(\"/cpu:0\"):\n", 168 | " self.word_embedding = tf.Variable(tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1), name='word_embedding')\n", 169 | "\n", 170 | " self.embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='embedding_bias')\n", 171 | " \n", 172 | " # declare the LSTM itself\n", 173 | " self.lstm = tf.contrib.rnn.BasicLSTMCell(dim_hidden)\n", 174 | " \n", 175 | " # declare the variables to be used to embed the image feature embedding to the word embedding space\n", 176 | " self.img_embedding = tf.Variable(tf.random_uniform([dim_in, dim_hidden], -0.1, 0.1), name='img_embedding')\n", 177 | " self.img_embedding_bias = tf.Variable(tf.zeros([dim_hidden]), name='img_embedding_bias')\n", 178 | "\n", 179 | " # declare the variables to 
go from an LSTM output to a word encoding output\n", 180 | " self.word_encoding = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='word_encoding')\n", 181 | " \n", 182 | " # optional initialization setter for encoding bias variable \n", 183 | " if init_b is not None:\n", 184 | " self.word_encoding_bias = tf.Variable(init_b, name='word_encoding_bias')\n", 185 | " else:\n", 186 | " self.word_encoding_bias = tf.Variable(tf.zeros([n_words]), name='word_encoding_bias')\n", 187 | "\n", 188 | " def build_model(self):\n", 189 | " # declaring the placeholders for our extracted image feature vectors, our caption, and our mask\n", 190 | " # (describes how long our caption is with an array of 0/1 values of length `maxlen` \n", 191 | " img = tf.placeholder(tf.float32, [self.batch_size, self.dim_in])\n", 192 | " caption_placeholder = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])\n", 193 | " mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])\n", 194 | " \n", 195 | " # getting an initial LSTM embedding from our image_imbedding\n", 196 | " image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias\n", 197 | " \n", 198 | " # setting initial state of our LSTM\n", 199 | " state = self.lstm.zero_state(self.batch_size, dtype=tf.float32)\n", 200 | "\n", 201 | " total_loss = 0.0\n", 202 | " with tf.variable_scope(\"RNN\"):\n", 203 | " for i in range(self.n_lstm_steps): \n", 204 | " if i > 0:\n", 205 | " # if this isn’t the first iteration of our LSTM we need to get the word_embedding corresponding\n", 206 | " # to the (i-1)th word in our caption \n", 207 | " with tf.device(\"/cpu:0\"):\n", 208 | " current_embedding = tf.nn.embedding_lookup(self.word_embedding, caption_placeholder[:,i-1]) + self.embedding_bias\n", 209 | " else:\n", 210 | " #if this is the first iteration of our LSTM we utilize the embedded image as our input \n", 211 | " current_embedding = image_embedding\n", 212 | " if i > 0: \n", 213 | " # allows us to reuse the LSTM tensor variable on each iteration\n", 214 | " tf.get_variable_scope().reuse_variables()\n", 215 | "\n", 216 | " out, state = self.lstm(current_embedding, state)\n", 217 | "\n", 218 | " \n", 219 | " if i > 0:\n", 220 | " #get the one-hot representation of the next word in our caption \n", 221 | " labels = tf.expand_dims(caption_placeholder[:, i], 1)\n", 222 | " ix_range=tf.range(0, self.batch_size, 1)\n", 223 | " ixs = tf.expand_dims(ix_range, 1)\n", 224 | " concat = tf.concat([ixs, labels],1)\n", 225 | " onehot = tf.sparse_to_dense(\n", 226 | " concat, tf.stack([self.batch_size, self.n_words]), 1.0, 0.0)\n", 227 | "\n", 228 | "\n", 229 | " #perform a softmax classification to generate the next word in the caption\n", 230 | " logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias\n", 231 | " xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=onehot)\n", 232 | " xentropy = xentropy * mask[:,i]\n", 233 | "\n", 234 | " loss = tf.reduce_sum(xentropy)\n", 235 | " total_loss += loss\n", 236 | "\n", 237 | " total_loss = total_loss / tf.reduce_sum(mask[:,1:])\n", 238 | " return total_loss, img, caption_placeholder, mask\n", 239 | "\n", 240 | "\n", 241 | " def build_generator(self, maxlen, batchsize=1):\n", 242 | " #same setup as `build_model` function \n", 243 | " img = tf.placeholder(tf.float32, [self.batch_size, self.dim_in])\n", 244 | " image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias\n", 245 | " state = 
self.lstm.zero_state(batchsize,dtype=tf.float32)\n", 246 | "\n", 247 | " #declare list to hold the words of our generated captions\n", 248 | " all_words = []\n", 249 | " with tf.variable_scope(\"RNN\"):\n", 250 | " # in the first iteration we have no previous word, so we directly pass in the image embedding\n", 251 | " # and set the `previous_word` to the embedding of the start token ([0]) for the future iterations\n", 252 | " output, state = self.lstm(image_embedding, state)\n", 253 | " previous_word = tf.nn.embedding_lookup(self.word_embedding, [0]) + self.embedding_bias\n", 254 | "\n", 255 | " for i in range(maxlen):\n", 256 | " tf.get_variable_scope().reuse_variables()\n", 257 | "\n", 258 | " out, state = self.lstm(previous_word, state)\n", 259 | "\n", 260 | "\n", 261 | " # get a get maximum probability word and it's encoding from the output of the LSTM\n", 262 | " logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias\n", 263 | " best_word = tf.argmax(logit, 1)\n", 264 | "\n", 265 | " with tf.device(\"/cpu:0\"):\n", 266 | " # get the embedding of the best_word to use as input to the next iteration of our LSTM \n", 267 | " previous_word = tf.nn.embedding_lookup(self.word_embedding, best_word)\n", 268 | "\n", 269 | " previous_word += self.embedding_bias\n", 270 | "\n", 271 | " all_words.append(best_word)\n", 272 | "\n", 273 | " return img, all_words" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "collapsed": false, 281 | "deletable": true, 282 | "editable": true 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "if not os.path.exists('data/ixtoword.npy'):\n", 287 | " print ('You must run 1. O\\'reilly Training.ipynb first.')\n", 288 | "else:\n", 289 | " ixtoword = np.load('data/ixtoword.npy').tolist()\n", 290 | " n_words = len(ixtoword)\n", 291 | " maxlen=15\n", 292 | " \n", 293 | " tf.reset_default_graph()\n", 294 | " sess = tf.InteractiveSession()\n", 295 | " \n", 296 | " caption_generator = Caption_Generator(dim_in, dim_hidden, dim_embed, batch_size, maxlen+2, n_words)\n", 297 | "\n", 298 | " image, generated_words = caption_generator.build_generator(maxlen=maxlen)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": {}, 304 | "source": [ 305 | "## Note:\n", 306 | "In order for the call to `saver.restore(sess, tf.train.latest_checkpoint(model_path))` to work you must have run `O' Reilly Training.ipynb` for 1 full epoch. This is because the call in `O' Reilly Training.ipynb` to save the graph to `model_path` only occurs after successfully completing one full epoch of training. If you would like to run this notebook to sanity check the code, uncomment `sanity_check=True`." 
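If you would like that requirement to fail loudly instead of surfacing as a cryptic restore error, a small guard can be added before the call to `saver.restore` (a sketch, not part of the original notebook): `tf.train.latest_checkpoint` simply returns `None` when the directory holds no checkpoint yet.

```
# Fail early with a readable message if training has not produced a checkpoint yet.
saved_path = tf.train.latest_checkpoint(model_path)
if saved_path is None:
    raise RuntimeError('No checkpoint found in %s; run "1. O\'Reilly Training.ipynb" '
                       'for at least one full epoch first.' % model_path)
```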
307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true, 314 | "deletable": true, 315 | "editable": true 316 | }, 317 | "outputs": [], 318 | "source": [ 319 | "def test(sess,image,generated_words,ixtoword,idx=0): # Naive greedy search\n", 320 | "\n", 321 | " feats, captions = get_data(annotation_path, feature_path)\n", 322 | " feat = np.array([feats[idx]])\n", 323 | " \n", 324 | " saver = tf.train.Saver()\n", 325 | " sanity_check= False\n", 326 | " # sanity_check=True\n", 327 | " if not sanity_check:\n", 328 | " saved_path=tf.train.latest_checkpoint(model_path)\n", 329 | " saver.restore(sess, saved_path)\n", 330 | " else:\n", 331 | " tf.global_variables_initializer().run()\n", 332 | "\n", 333 | " generated_word_index= sess.run(generated_words, feed_dict={image:feat})\n", 334 | " generated_word_index = np.hstack(generated_word_index)\n", 335 | "\n", 336 | " generated_sentence = [ixtoword[x] for x in generated_word_index]\n", 337 | " print(generated_sentence)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": { 344 | "collapsed": false, 345 | "deletable": true, 346 | "editable": true 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "test(sess,image,generated_words,ixtoword,1)" 351 | ] 352 | } 353 | ], 354 | "metadata": { 355 | "anaconda-cloud": {}, 356 | "kernelspec": { 357 | "display_name": "Python [conda root]", 358 | "language": "python", 359 | "name": "conda-root-py" 360 | }, 361 | "language_info": { 362 | "codemirror_mode": { 363 | "name": "ipython", 364 | "version": 2 365 | }, 366 | "file_extension": ".py", 367 | "mimetype": "text/x-python", 368 | "name": "python", 369 | "nbconvert_exporter": "python", 370 | "pygments_lexer": "ipython2", 371 | "version": "2.7.12" 372 | } 373 | }, 374 | "nbformat": 4, 375 | "nbformat_minor": 1 376 | } 377 | -------------------------------------------------------------------------------- /3. O'Reilly Generate from image.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Generating captions from images\n", 11 | "This notebook builds on the previous caption generation notebook. The key difference however is that the image feature embedding is generated from an image passed through the VGG-16 network, as opposed to just pulling the feature embedding from an already precomputed set of feature embeddings for the Flickr-30K dataset. This allows the user to generate captions for their own images in addition to generating captions for the Flickr-30K images.\n", 12 | "\n", 13 | "*Note:* The `model_path` variable should be the same as the `model_path` variable set in `O'Reilly Training.ipynb`\n", 14 | "\n", 15 | "# Make sure to run the 'O'Reilly Training.ipynb' notebook for at least one epoch before running this notebook." 
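As a bird's-eye view of what the cells below assemble, the pipeline is roughly the outline sketched here (the names refer to functions and tensors defined later in this notebook; this block is illustrative only).

```
# Illustrative outline; see the cells below for the real implementation.
#
#   raw image file
#     -> read_image(image_path)                  # crop/resize to a [1, 224, 224, 3] array
#     -> VGG-16 graph, tensor "import/Relu_1:0"  # 4096-dimensional fc7 feature vector
#     -> Caption_Generator.build_generator(...)  # greedy LSTM decoding into word indices
#     -> ixtoword lookup                         # word indices -> caption text
```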
16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": { 22 | "collapsed": false, 23 | "deletable": true, 24 | "editable": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import math\n", 29 | "import os\n", 30 | "import tensorflow as tf\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "import pickle\n", 34 | "import cv2\n", 35 | "import skimage\n", 36 | "\n", 37 | "import tensorflow.python.platform\n", 38 | "from keras.preprocessing import sequence\n", 39 | "from collections import Counter" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "deletable": true, 46 | "editable": true 47 | }, 48 | "source": [ 49 | "# Downloading Data\n", 50 | "In order to run this notebook you will need to download a pretrained TensorFlow model for [VGG-16](https://drive.google.com/file/d/0B2vTU3h54lTyaDczbFhsZFpsUGs/view?usp=sharing) generated from the original Caffe model from the VGG-16 paper. \n", 51 | "\n", 52 | "Place this download in the `./data/` folder.\n", 53 | "\n", 54 | "The graph model should now be saved at `./data/vgg16-20160129.tfmodel` ." 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true, 62 | "deletable": true, 63 | "editable": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "model_path = './models/tensorflow'\n", 68 | "vgg_path = './data/vgg16-20160129.tfmodel'" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "deletable": true, 75 | "editable": true 76 | }, 77 | "source": [ 78 | "# Pick your image\n", 79 | "Set `image_path` to point to the image you'd like to generate a caption for." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": false, 87 | "deletable": true, 88 | "editable": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "image_path = './image_path.jpg'" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": true, 100 | "deletable": true, 101 | "editable": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "dim_embed = 256\n", 106 | "dim_hidden = 256\n", 107 | "dim_in = 4096\n", 108 | "batch_size = 1\n", 109 | "learning_rate = 0.001\n", 110 | "momentum = 0.9\n", 111 | "n_epochs = 25" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": true, 119 | "deletable": true, 120 | "editable": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "class Caption_Generator():\n", 125 | " def __init__(self, dim_in, dim_embed, dim_hidden, batch_size, n_lstm_steps, n_words, init_b=None):\n", 126 | "\n", 127 | " self.dim_in = dim_in\n", 128 | " self.dim_embed = dim_embed\n", 129 | " self.dim_hidden = dim_hidden\n", 130 | " self.batch_size = batch_size\n", 131 | " self.n_lstm_steps = n_lstm_steps\n", 132 | " self.n_words = n_words\n", 133 | " \n", 134 | " # declare the variables to be used for our word embeddings\n", 135 | " with tf.device(\"/cpu:0\"):\n", 136 | " self.word_embedding = tf.Variable(tf.random_uniform([self.n_words, self.dim_embed], -0.1, 0.1), name='word_embedding')\n", 137 | "\n", 138 | " self.embedding_bias = tf.Variable(tf.zeros([dim_embed]), name='embedding_bias')\n", 139 | " \n", 140 | " # declare the LSTM itself\n", 141 | " self.lstm = tf.contrib.rnn.BasicLSTMCell(dim_hidden)\n", 142 | " \n", 143 | " # declare the variables to be used to embed the image feature embedding to the 
word embedding space\n", 144 | " self.img_embedding = tf.Variable(tf.random_uniform([dim_in, dim_hidden], -0.1, 0.1), name='img_embedding')\n", 145 | " self.img_embedding_bias = tf.Variable(tf.zeros([dim_hidden]), name='img_embedding_bias')\n", 146 | "\n", 147 | " # declare the variables to go from an LSTM output to a word encoding output\n", 148 | " self.word_encoding = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='word_encoding')\n", 149 | " \n", 150 | " # optional initialization setter for encoding bias variable \n", 151 | " if init_b is not None:\n", 152 | " self.word_encoding_bias = tf.Variable(init_b, name='word_encoding_bias')\n", 153 | " else:\n", 154 | " self.word_encoding_bias = tf.Variable(tf.zeros([n_words]), name='word_encoding_bias')\n", 155 | "\n", 156 | " def build_model(self):\n", 157 | " # declaring the placeholders for our extracted image feature vectors, our caption, and our mask\n", 158 | " # (describes how long our caption is with an array of 0/1 values of length `maxlen` \n", 159 | " img = tf.placeholder(tf.float32, [self.batch_size, self.dim_in])\n", 160 | " caption_placeholder = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])\n", 161 | " mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])\n", 162 | " \n", 163 | " # getting an initial LSTM embedding from our image_imbedding\n", 164 | " image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias\n", 165 | " \n", 166 | " # setting initial state of our LSTM\n", 167 | " state = self.lstm.zero_state(self.batch_size, dtype=tf.float32)\n", 168 | "\n", 169 | " total_loss = 0.0\n", 170 | " with tf.variable_scope(\"RNN\"):\n", 171 | " for i in range(self.n_lstm_steps): \n", 172 | " if i > 0:\n", 173 | " #if this isn’t the first iteration of our LSTM we need to get the word_embedding corresponding\n", 174 | " # to the (i-1)th word in our caption \n", 175 | " with tf.device(\"/cpu:0\"):\n", 176 | " current_embedding = tf.nn.embedding_lookup(self.word_embedding, caption_placeholder[:,i-1]) + self.embedding_bias\n", 177 | " else:\n", 178 | " #if this is the first iteration of our LSTM we utilize the embedded image as our input \n", 179 | " current_embedding = image_embedding\n", 180 | " if i > 0: \n", 181 | " # allows us to reuse the LSTM tensor variable on each iteration\n", 182 | " tf.get_variable_scope().reuse_variables()\n", 183 | "\n", 184 | " out, state = self.lstm(current_embedding, state)\n", 185 | "\n", 186 | " \n", 187 | " if i > 0:\n", 188 | " #get the one-hot representation of the next word in our caption \n", 189 | " labels = tf.expand_dims(caption_placeholder[:, i], 1)\n", 190 | " ix_range=tf.range(0, self.batch_size, 1)\n", 191 | " ixs = tf.expand_dims(ix_range, 1)\n", 192 | " concat = tf.concat([ixs, labels],1)\n", 193 | " onehot = tf.sparse_to_dense(\n", 194 | " concat, tf.stack([self.batch_size, self.n_words]), 1.0, 0.0)\n", 195 | "\n", 196 | "\n", 197 | " #perform a softmax classification to generate the next word in the caption\n", 198 | " logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias\n", 199 | " xentropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=onehot)\n", 200 | " xentropy = xentropy * mask[:,i]\n", 201 | "\n", 202 | " loss = tf.reduce_sum(xentropy)\n", 203 | " total_loss += loss\n", 204 | "\n", 205 | " total_loss = total_loss / tf.reduce_sum(mask[:,1:])\n", 206 | " return total_loss, img, caption_placeholder, mask\n", 207 | "\n", 208 | "\n", 209 | " def build_generator(self, maxlen, 
batchsize=1):\n", 210 | " #same setup as `build_model` function \n", 211 | " img = tf.placeholder(tf.float32, [self.batch_size, self.dim_in])\n", 212 | " image_embedding = tf.matmul(img, self.img_embedding) + self.img_embedding_bias\n", 213 | " state = self.lstm.zero_state(batchsize,dtype=tf.float32)\n", 214 | "\n", 215 | " #declare list to hold the words of our generated captions\n", 216 | " all_words = []\n", 217 | " with tf.variable_scope(\"RNN\"):\n", 218 | " # in the first iteration we have no previous word, so we directly pass in the image embedding\n", 219 | " # and set the `previous_word` to the embedding of the start token ([0]) for the future iterations\n", 220 | " output, state = self.lstm(image_embedding, state)\n", 221 | " previous_word = tf.nn.embedding_lookup(self.word_embedding, [0]) + self.embedding_bias\n", 222 | "\n", 223 | " for i in range(maxlen):\n", 224 | " tf.get_variable_scope().reuse_variables()\n", 225 | "\n", 226 | " out, state = self.lstm(previous_word, state)\n", 227 | "\n", 228 | "\n", 229 | " # get a one-hot word encoding from the output of the LSTM\n", 230 | " logit = tf.matmul(out, self.word_encoding) + self.word_encoding_bias\n", 231 | " best_word = tf.argmax(logit, 1)\n", 232 | "\n", 233 | " with tf.device(\"/cpu:0\"):\n", 234 | " # get the embedding of the best_word to use as input to the next iteration of our LSTM \n", 235 | " previous_word = tf.nn.embedding_lookup(self.word_embedding, best_word)\n", 236 | "\n", 237 | " previous_word += self.embedding_bias\n", 238 | "\n", 239 | " all_words.append(best_word)\n", 240 | "\n", 241 | " return img, all_words" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "collapsed": false, 249 | "deletable": true, 250 | "editable": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "if not os.path.exists('data/ixtoword.npy'):\n", 255 | " print ('You must run 1. 
O\\'reilly Training.ipynb first.')\n", 256 | "else:\n", 257 | " tf.reset_default_graph()\n", 258 | " with open(vgg_path,'rb') as f:\n", 259 | " fileContent = f.read()\n", 260 | " graph_def = tf.GraphDef()\n", 261 | " graph_def.ParseFromString(fileContent)\n", 262 | "\n", 263 | " images = tf.placeholder(\"float32\", [1, 224, 224, 3])\n", 264 | " tf.import_graph_def(graph_def, input_map={\"images\":images})\n", 265 | "\n", 266 | " ixtoword = np.load('data/ixtoword.npy').tolist()\n", 267 | " n_words = len(ixtoword)\n", 268 | " maxlen=15\n", 269 | " graph = tf.get_default_graph()\n", 270 | " sess = tf.InteractiveSession(graph=graph)\n", 271 | " caption_generator = Caption_Generator(dim_in, dim_hidden, dim_embed, batch_size, maxlen+2, n_words)\n", 272 | " graph = tf.get_default_graph()\n", 273 | "\n", 274 | " image, generated_words = caption_generator.build_generator(maxlen=maxlen)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "collapsed": true, 282 | "deletable": true, 283 | "editable": true 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "def crop_image(x, target_height=227, target_width=227, as_float=True):\n", 288 | " image = cv2.imread(x)\n", 289 | " if as_float:\n", 290 | " image = image.astype(np.float32)\n", 291 | "\n", 292 | " if len(image.shape) == 2:\n", 293 | " image = np.tile(image[:,:,None], 3)\n", 294 | " elif len(image.shape) == 4:\n", 295 | " image = image[:,:,:,0]\n", 296 | "\n", 297 | " height, width, rgb = image.shape\n", 298 | " if width == height:\n", 299 | " resized_image = cv2.resize(image, (target_height,target_width))\n", 300 | "\n", 301 | " elif height < width:\n", 302 | " resized_image = cv2.resize(image, (int(width * float(target_height)/height), target_width))\n", 303 | " cropping_length = int((resized_image.shape[1] - target_height) / 2)\n", 304 | " resized_image = resized_image[:,cropping_length:resized_image.shape[1] - cropping_length]\n", 305 | "\n", 306 | " else:\n", 307 | " resized_image = cv2.resize(image, (target_height, int(height * float(target_width) / width)))\n", 308 | " cropping_length = int((resized_image.shape[0] - target_width) / 2)\n", 309 | " resized_image = resized_image[cropping_length:resized_image.shape[0] - cropping_length,:]\n", 310 | "\n", 311 | " return cv2.resize(resized_image, (target_height, target_width))" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": { 318 | "collapsed": true, 319 | "deletable": true, 320 | "editable": true 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "def read_image(path):\n", 325 | "\n", 326 | " img = crop_image(path, target_height=224, target_width=224)\n", 327 | " if img.shape[2] == 4:\n", 328 | " img = img[:,:,:3]\n", 329 | "\n", 330 | " img = img[None, ...]\n", 331 | " return img" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": { 337 | "deletable": true, 338 | "editable": true 339 | }, 340 | "source": [ 341 | "## Note:\n", 342 | "In order for the call to `saver.restore(sess, tf.train.latest_checkpoint(model_path))` to work you must have run `O' Reilly Training.ipynb` for 1 full epoch. This is because the call in `O' Reilly Training.ipynb` to save the graph to `model_path` only occurs after successfully completing one full epoch of training. If you would like to run this notebook to sanity check the code, uncomment `sanity_check=True`." 
343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": { 349 | "collapsed": true, 350 | "deletable": true, 351 | "editable": true 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "def test(sess,image,generated_words,ixtoword,test_image_path=0): # Naive greedy search\n", 356 | "\n", 357 | " \n", 358 | "\n", 359 | " feat = read_image(test_image_path)\n", 360 | " fc7 = sess.run(graph.get_tensor_by_name(\"import/Relu_1:0\"), feed_dict={images:feat})\n", 361 | "\n", 362 | " saver = tf.train.Saver()\n", 363 | " sanity_check=False\n", 364 | " # sanity_check=True\n", 365 | " if not sanity_check:\n", 366 | " saved_path=tf.train.latest_checkpoint(model_path)\n", 367 | " saver.restore(sess, saved_path)\n", 368 | " else:\n", 369 | " tf.global_variables_initializer().run()\n", 370 | "\n", 371 | " generated_word_index= sess.run(generated_words, feed_dict={image:fc7})\n", 372 | " generated_word_index = np.hstack(generated_word_index)\n", 373 | " generated_words = [ixtoword[x] for x in generated_word_index]\n", 374 | " punctuation = np.argmax(np.array(generated_words) == '.')+1\n", 375 | "\n", 376 | " generated_words = generated_words[:punctuation]\n", 377 | " generated_sentence = ' '.join(generated_words)\n", 378 | " print(generated_sentence)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": { 385 | "collapsed": false, 386 | "deletable": true, 387 | "editable": true 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "test(sess,image,generated_words,ixtoword, image_path)" 392 | ] 393 | } 394 | ], 395 | "metadata": { 396 | "anaconda-cloud": {}, 397 | "kernelspec": { 398 | "display_name": "Python [conda root]", 399 | "language": "python", 400 | "name": "conda-root-py" 401 | }, 402 | "language_info": { 403 | "codemirror_mode": { 404 | "name": "ipython", 405 | "version": 2 406 | }, 407 | "file_extension": ".py", 408 | "mimetype": "text/x-python", 409 | "name": "python", 410 | "nbconvert_exporter": "python", 411 | "pygments_lexer": "ipython2", 412 | "version": "2.7.12" 413 | } 414 | }, 415 | "nbformat": 4, 416 | "nbformat_minor": 1 417 | } 418 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Caption This! 2 | 3 | This repository contains source code corresponding to our article ["Caption this, with TensorFlow!"]( https://www.oreilly.com/learning/caption-this-with-tensorflow) 4 | 5 | 6 | # Git Basics 7 | 1. Go to your home directory by opening your terminal and entering `cd ~` 8 | 9 | 2. Clone the repository by entering 10 | 11 | ``` 12 | git clone https://github.com/mlberkeley/oreilly-captions.git 13 | ``` 14 | 15 | # Docker (highly recommended) 16 | Install Docker using the platform-specific installation instructions for Docker [here](https://docs.docker.com/engine/installation/#platform-support-matrix). Our iPython notebooks are compatible with TensorFlow 1.0. 17 | 18 | ### Option A: Use our pre-built Docker image from Docker Hub 19 | 20 | 3. After installing Docker, pull a prebuilt image from our Docker Hub by entering: 21 | 22 | ``` 23 | docker pull mlatberkeley/showandtell 24 | ``` 25 | 26 | You will need a Docker Hub account in order to pull the image (get one [here](https://hub.docker.com/)). 
If it's your first time pulling a Docker image from Docker Hub you will need to log in to your Docker Hub account from your terminal with `docker login`, and follow the username and password prompt. 27 | 28 | 4. To run the pulled image (after cloning and downloading the repository) enter 29 | 30 | ``` 31 | docker run -it -p 8888:8888 -v <path to repo>:/root mlatberkeley/showandtell 32 | ``` 33 | 34 | where `<path to repo>` should be the __absolute path__ to your cloned repository. If you followed our **Git Basics** section the path should be `~/oreilly-captions`. 35 | 36 | 5. After building, starting, and attaching to the appropriate Docker container, run the provided Jupyter notebooks by entering 37 | 38 | ``` 39 | jupyter notebook --ip 0.0.0.0 40 | ``` 41 | 42 | and navigate to [http://0.0.0.0:8888](http://0.0.0.0:8888) in your browser. 43 | 44 | ### Option B: Download and build your own Docker image from our GitHub repo 45 | If you want to build a GPU- or CPU-based Docker image of your own, you can use the Dockerfiles provided in the `/dockerfiles/` subdirectory of our GitHub repo. 46 | 47 | 3. After cloning the repo to your machine, enter 48 | 49 | ``` 50 | docker build -t showandtell_<arch> -f ./dockerfiles/Dockerfile.<arch> ./dockerfiles/ 51 | ``` 52 | 53 | where `<arch>` is either `gpu` or `cpu`. (Note that, in order to run these files on your GPU, you'll need to have a compatible GPU, with drivers installed and configured properly [as described in TensorFlow's documentation](https://www.tensorflow.org/install/).) 54 | 55 | 4. Run the Docker image by entering 56 | 57 | ``` 58 | docker run -it -p 8888:8888 -v <path to repo>:/root showandtell_<arch> 59 | ``` 60 | 61 | where `<arch>` is either `gpu` or `cpu`, depending on the image you built in the last step. 62 | 63 | 5. After building, starting, and attaching to the appropriate Docker container, run the provided Jupyter notebooks by entering 64 | 65 | ``` 66 | jupyter notebook --ip 0.0.0.0 67 | ``` 68 | 69 | and navigate to [http://0.0.0.0:8888](http://0.0.0.0:8888) in your browser. 70 | 71 | **Note** 72 | If you are using Docker Toolbox as opposed to native Docker, you will have to navigate to the daemon IP address (instead of 0.0.0.0) provided right after starting the Docker Quickstart Terminal (for us this was 192.168.99.100) in order to use Jupyter. 73 | 74 | ### Debugging docker 75 | If you receive an error of the form: 76 | 77 | ``` 78 | WARNING: Error loading config file:/home/rp/.docker/config.json - stat /home/rp/.docker/config.json: permission denied 79 | Got permission denied while trying to connect to the Docker daemon socket at unix:///var/run/docker.sock: Get http://%2Fvar%2Frun%2Fdocker.sock/v1.26/images/json: dial unix /var/run/docker.sock: connect: permission denied 80 | ``` 81 | 82 | It's most likely because you installed Docker using sudo permissions with a package manager such as `brew` or `apt-get`. To solve this `permission denied` error, simply run docker with `sudo` (i.e. run `docker` commands as `sudo docker <command>` instead of just `docker <command>`). 83 | 84 | # The Notebooks 85 | There are three notebooks: 86 | * `1. O'Reilly Training.ipynb` - Contains code to train a TensorFlow caption generator from VGG-16 image feature embeddings, as described in our article. *Note:* you must run this notebook's `train` method before running any of the other notebooks in order to generate a mapping between integers and our vocabulary's words that will be reused in the other notebooks. 87 | * `2. O'Reilly Generate.ipynb` - Contains the same code as `1. 
O'Reilly Training.ipynb` except it introduces functionality to generate captions from an image embedding (as opposed to just being able to train on captions). Functions as a sanity check for the quality of the captions we are generating. 88 | * `3. O'Reilly Generate from image.ipynb` - Builds on the previous notebook, except instead of feeding an image embedding to our caption generation model, it first feeds an image to the VGG-16 Convolutional Neural Network to generate an image feature embedding. This gives us an end-to-end pipeline for going from an image to a caption. 89 | * In order to run the test notebook, edit the image path in the notebook (more details in the `.ipynb` itself). 90 | 91 | # Additional Downloads: 92 | In order to run the first two notebooks, you will need VGG-16 image embeddings for the Flickr-30K dataset. These image embeddings are available from our [Google Drive](https://drive.google.com/file/d/0B5o40yxdA9PqTnJuWGVkcFlqcG8/view?usp=sharing). 93 | 94 | Additionally, you will need the corresponding captions for these images (`results_20130124.token`), which can also be downloaded from our [Google Drive](https://drive.google.com/file/d/0B2vTU3h54lTydXFjSVM5T2t4WmM/view?usp=sharing). 95 | 96 | In order to run the `3. O'Reilly Generate from image.ipynb` notebook you will need to download a pretrained TensorFlow model for [VGG-16](https://drive.google.com/file/d/0B2vTU3h54lTyaDczbFhsZFpsUGs/view?usp=sharing), generated from the original Caffe model from the VGG-16 paper. 97 | 98 | Place all of these downloads in the `./data/` directory. 99 | 100 | # Pretrained Weights: 101 | We've trained the caption generator (without training VGG-16 end-to-end) for 500 epochs and placed the resulting checkpoint files in `./models/tf_final`. You should see an average reconstruction loss of ~1.75-1.85. 
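To pick up training from these weights rather than starting from scratch, call `train` in `1. O'Reilly Training.ipynb` with `continue_training=True` and `transfer=True`; this is the call that ships uncommented in that notebook's final cell:

```
train(.001, True, True)   # continue training from the epoch-500 weights in ./models/tf_final
```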
102 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | #Import necessary data files 2 | Add caption data, image embeddings, and a vgg16 model as instructued in `../README.md` 3 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile: -------------------------------------------------------------------------------- 1 | # Set the base image to Ubuntu 2 | FROM tensorflow/tensorflow:latest-py3 3 | 4 | # File Author / Maintainer 5 | MAINTAINER Raul Puri 6 | 7 | # Install git and TF dependencies 8 | RUN apt-get update && \ 9 | apt-get install -y --no-install-recommends libboost-all-dev && \ 10 | apt-get install -y software-properties-common \ 11 | git \ 12 | wget \ 13 | cmake \ 14 | python-zmq \ 15 | python-dev \ 16 | libzmq3-dev \ 17 | libssl-dev \ 18 | libgflags-dev \ 19 | libgoogle-glog-dev \ 20 | liblmdb-dev \ 21 | libatlas-base-dev \ 22 | libblas-dev \ 23 | liblapack-dev \ 24 | libgflags-dev \ 25 | libgoogle-glog-dev \ 26 | liblmdb-dev \ 27 | libprotobuf-dev \ 28 | libleveldb-dev \ 29 | libsnappy-dev \ 30 | libopencv-dev \ 31 | libhdf5-serial-dev \ 32 | protobuf-compiler 33 | 34 | COPY requirements.txt /root/ 35 | 36 | RUN pip install keras nose Cython 37 | RUN pip install -r /root/requirements.txt 38 | RUN rm /root/requirements.txt 39 | 40 | 41 | WORKDIR /root 42 | 43 | CMD ["/bin/bash"] 44 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.cpu: -------------------------------------------------------------------------------- 1 | # Set the base image to Ubuntu 2 | FROM tensorflow/tensorflow:latest-py3 3 | 4 | # File Author / Maintainer 5 | MAINTAINER Raul Puri 6 | 7 | # Install git and TF dependencies 8 | RUN apt-get update && \ 9 | apt-get install -y --no-install-recommends libboost-all-dev && \ 10 | apt-get install -y software-properties-common \ 11 | git \ 12 | wget \ 13 | cmake \ 14 | python-zmq \ 15 | python-dev \ 16 | libzmq3-dev \ 17 | libssl-dev \ 18 | libgflags-dev \ 19 | libgoogle-glog-dev \ 20 | liblmdb-dev \ 21 | libatlas-base-dev \ 22 | libblas-dev \ 23 | liblapack-dev \ 24 | libgflags-dev \ 25 | libgoogle-glog-dev \ 26 | liblmdb-dev \ 27 | libprotobuf-dev \ 28 | libleveldb-dev \ 29 | libsnappy-dev \ 30 | libopencv-dev \ 31 | libhdf5-serial-dev \ 32 | protobuf-compiler 33 | 34 | COPY requirements.txt /root/ 35 | 36 | RUN pip install keras nose Cython 37 | RUN pip install -r /root/requirements.txt 38 | RUN rm /root/requirements.txt 39 | 40 | 41 | WORKDIR /root 42 | 43 | CMD ["/bin/bash"] 44 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.gpu: -------------------------------------------------------------------------------- 1 | 2 | # Set the base image to Ubuntu 3 | FROM tensorflow/tensorflow:latest-gpu-py3 4 | 5 | # File Author / Maintainer 6 | MAINTAINER Raul Puri 7 | 8 | RUN apt-get update && \ 9 | apt-get install -y --no-install-recommends libboost-all-dev && \ 10 | apt-get install -y software-properties-common \ 11 | git \ 12 | wget \ 13 | cmake \ 14 | python-zmq \ 15 | python-dev \ 16 | libzmq3-dev \ 17 | libssl-dev \ 18 | libgflags-dev \ 19 | libgoogle-glog-dev \ 20 | liblmdb-dev \ 21 | libatlas-base-dev \ 22 | libblas-dev \ 23 | liblapack-dev \ 24 | libgflags-dev \ 25 | libgoogle-glog-dev \ 26 | liblmdb-dev \ 27 | libprotobuf-dev \ 28 | libleveldb-dev \ 29 | libsnappy-dev \ 30 | 
libopencv-dev \ 31 | libhdf5-serial-dev \ 32 | protobuf-compiler \ 33 | python-tk 34 | 35 | COPY requirements.txt /root/ 36 | 37 | RUN pip install keras nose Cython 38 | RUN pip install -r /root/requirements.txt 39 | RUN rm /root/requirements.txt 40 | 41 | WORKDIR /root 42 | 43 | CMD ["/bin/bash"] 44 | 45 | -------------------------------------------------------------------------------- /dockerfiles/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | scikit-learn 3 | scikit-image 4 | opencv-python -------------------------------------------------------------------------------- /models/tensorflow/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlberkeley/oreilly-captions/1a75dfd99062f90ea2d930ed577f344c0a3b11bc/models/tensorflow/.keep -------------------------------------------------------------------------------- /models/tf_final/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "model-500" 2 | all_model_checkpoint_paths: "model-500" 3 | -------------------------------------------------------------------------------- /models/tf_final/model-500.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlberkeley/oreilly-captions/1a75dfd99062f90ea2d930ed577f344c0a3b11bc/models/tf_final/model-500.data-00000-of-00001 -------------------------------------------------------------------------------- /models/tf_final/model-500.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlberkeley/oreilly-captions/1a75dfd99062f90ea2d930ed577f344c0a3b11bc/models/tf_final/model-500.index -------------------------------------------------------------------------------- /models/tf_final/model-500.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlberkeley/oreilly-captions/1a75dfd99062f90ea2d930ed577f344c0a3b11bc/models/tf_final/model-500.meta --------------------------------------------------------------------------------
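If you would like to confirm that the bundled checkpoint is intact, one option (a sketch, assuming the TensorFlow 1.x installation used throughout this repository) is to list the variables it stores:

```
import tensorflow as tf

# Inspect the epoch-500 checkpoint shipped in ./models/tf_final.
reader = tf.train.NewCheckpointReader('./models/tf_final/model-500')
for name, shape in sorted(reader.get_variable_to_shape_map().items()):
    print(name, shape)
```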