├── .gitignore
├── result.jpg
├── ixtoword.npy
├── acoustic-guitar-player.jpg
├── make_flickr_dataset.py
├── LICENSE
├── Readme.md
├── cnn_util.py
└── model.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
models/*
data/*
--------------------------------------------------------------------------------
/result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jazzsaxmafia/show_and_tell.tensorflow/HEAD/result.jpg
--------------------------------------------------------------------------------
/ixtoword.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jazzsaxmafia/show_and_tell.tensorflow/HEAD/ixtoword.npy
--------------------------------------------------------------------------------
/acoustic-guitar-player.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jazzsaxmafia/show_and_tell.tensorflow/HEAD/acoustic-guitar-player.jpg
--------------------------------------------------------------------------------
/make_flickr_dataset.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import os
import cPickle
from cnn_util import *

# Machine-specific paths: point these at your own VGG-19 caffemodel, deploy
# prototxt, flickr30k token file and image directory.
vgg_model = '/home/taeksoo/Package/caffe/models/vgg/VGG_ILSVRC_19_layers.caffemodel'
vgg_deploy = '/home/taeksoo/Package/caffe/models/vgg/VGG_ILSVRC_19_layers_deploy.prototxt'

annotation_path = './data/results_20130124.token'
flickr_image_path = '../show_attend_and_tell/images/flickr30k-images/'
feat_path = './data/feats.npy'

cnn = CNN(model=vgg_model, deploy=vgg_deploy, width=224, height=224)

annotations = pd.read_table(annotation_path, sep='\t', header=None, names=['image', 'caption'])
annotations['image_num'] = annotations['image'].map(lambda x: x.split('#')[1])
annotations['image'] = annotations['image'].map(lambda x: os.path.join(flickr_image_path, x.split('#')[0]))

if not os.path.exists(feat_path):
    feats = cnn.get_features(annotations['image'].values)
    np.save(feat_path, feats)
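
# Usage sketch (assumes pycaffe is importable and the paths above point at
# files that exist on your machine):
#   $ python make_flickr_dataset.py
# This writes one 4096-d VGG fc7 feature per line of the token file to
# ./data/feats.npy; get_caption_data() in model.py later loads that file for
# training.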
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2016, Taeksoo Kim
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
# Neural Caption Generator
* TensorFlow implementation of "Show and Tell" http://arxiv.org/abs/1411.4555
* Borrowed some code and ideas from Andrej Karpathy's NeuralTalk.
* You need the flickr30k data (images and annotations).

### Code
* make_flickr_dataset.py : extracts VGG FC7 features of the flickr30k images and saves them to './data/feats.npy'
* model.py : TensorFlow version of the caption generator

#### Usage
* Download the Flickr30k dataset.
* Extract VGG features of the Flickr30k images (make_flickr_dataset.py).
* Train: run train() in model.py.
* Test: run test() or test_tf() in model.py (see the sketch below).
  * Parameters: the VGG FC7 feature of the test image and the trained model path.
* Once you have downloaded the TensorFlow VGG net (one of the links below), you don't need Caffe for testing.
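
A rough usage sketch (illustrative only: `guitar_player.npy` is an FC7 feature you save yourself, and the `model-1` / `model-72` checkpoint names depend on which epoch you trained to or which pretrained file you downloaded):

```python
# Run each step in a fresh Python session to avoid graph/variable-name clashes.
import model

# 1. Train on the pre-extracted FC7 features (./data/feats.npy) and the
#    flickr30k token file; checkpoints are written to ./models/tensorflow.
model.train()

# 2. Caption from a saved FC7 feature (.npy) using a trained checkpoint.
model.test(test_feat='./guitar_player.npy',
           model_path='./models/tensorflow/model-1')

# 3. Caption a raw image, extracting FC7 with the TensorFlow VGG net
#    (./data/vgg16.tfmodel) instead of Caffe.
model.test_tf(test_image_path='./acoustic-guitar-player.jpg',
              model_path='./models/model-72')
```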
#### Downloading data/trained model
* Extracted FC7 features: [download](https://drive.google.com/file/d/0B5o40yxdA9PqTnJuWGVkcFlqcG8/view?usp=sharing)
  * Used by train() in model.py; with this you can skip the feature-extraction step.
* Pretrained model: [download](https://drive.google.com/file/d/0B5o40yxdA9PqeW4wY0wwZXhrZkE/view?usp=sharing)
  * Used by test() and test_tf() in model.py. If you do not have time for training, or just want to try the captioning, download this and run the test functions.
* TensorFlow VGG net: [download](https://drive.google.com/file/d/0B5o40yxdA9PqSGtVODN0UUlaWTg/view?usp=sharing)
  * Used by test_tf() in model.py.
* Along with the files above, you also need the flickr30k annotation data (results_20130124.token) from [this link](http://shannon.cs.illinois.edu/DenotationGraph/) for training.

![alt tag](https://github.com/jazzsaxmafia/show_and_tell.tensorflow/blob/master/result.jpg)

### License
* BSD license
--------------------------------------------------------------------------------
/cnn_util.py:
--------------------------------------------------------------------------------
import caffe
import cv2
import numpy as np
import skimage
import skimage.io


def crop_image(x, target_height=227, target_width=227, as_float=True):
    image = skimage.io.imread(x)
    if as_float:
        image = skimage.img_as_float(image).astype(np.float32)

    if len(image.shape) == 2:
        # greyscale image: replicate the single channel three times
        image = np.tile(image[:, :, None], 3)
    elif len(image.shape) == 4:
        image = image[:, :, :, 0]

    height, width, rgb = image.shape
    if width == height:
        resized_image = cv2.resize(image, (target_width, target_height))

    elif height < width:
        # wide image: scale the height to target_height, then center-crop the width
        # (note that cv2.resize takes its size argument as (width, height))
        resized_image = cv2.resize(image, (int(width * float(target_height) / height), target_height))
        cropping_length = int((resized_image.shape[1] - target_width) / 2)
        resized_image = resized_image[:, cropping_length:resized_image.shape[1] - cropping_length]

    else:
        # tall image: scale the width to target_width, then center-crop the height
        resized_image = cv2.resize(image, (target_width, int(height * float(target_width) / width)))
        cropping_length = int((resized_image.shape[0] - target_height) / 2)
        resized_image = resized_image[cropping_length:resized_image.shape[0] - cropping_length, :]

    return cv2.resize(resized_image, (target_width, target_height))


# Default paths for the reference CaffeNet; make_flickr_dataset.py passes the
# VGG-19 model/deploy files in explicitly.
deploy = '/home/taeksoo/Package/caffe/models/bvlc_reference_caffenet/deploy.prototxt'
model = '/home/taeksoo/Package/caffe/models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'
mean = '/home/taeksoo/Package/caffe/python/caffe/imagenet/ilsvrc_2012_mean.npy'


class CNN(object):

    def __init__(self, deploy=deploy, model=model, mean=mean, batch_size=10, width=227, height=227):
        self.deploy = deploy
        self.model = model
        self.mean = mean

        self.batch_size = batch_size
        self.net, self.transformer = self.get_net()
        self.net.blobs['data'].reshape(self.batch_size, 3, height, width)

        self.width = width
        self.height = height

    def get_net(self):
        caffe.set_mode_gpu()
        net = caffe.Net(self.deploy, self.model, caffe.TEST)

        transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
        transformer.set_transpose('data', (2, 0, 1))
        transformer.set_mean('data', np.load(self.mean).mean(1).mean(1))
        transformer.set_raw_scale('data', 255)
        transformer.set_channel_swap('data', (2, 1, 0))

        return net, transformer

    def get_features(self, image_list, layers='fc7', layer_sizes=[4096]):
        iter_until = len(image_list) + self.batch_size
        all_feats = np.zeros([len(image_list)] + layer_sizes)

        for start, end in zip(range(0, iter_until, self.batch_size),
                              range(self.batch_size, iter_until, self.batch_size)):

            image_batch_file = image_list[start:end]
            image_batch = np.array(map(lambda x: crop_image(x, target_width=self.width,
                                                            target_height=self.height), image_batch_file))

            caffe_in = np.zeros(np.array(image_batch.shape)[[0, 3, 1, 2]], dtype=np.float32)

            for idx, in_ in enumerate(image_batch):
                caffe_in[idx] = self.transformer.preprocess('data', in_)

            out = self.net.forward_all(blobs=[layers], **{'data': caffe_in})
            feats = out[layers]

            all_feats[start:end] = feats

        return all_feats
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
#-*- coding: utf-8 -*-
import math
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import cPickle

from tensorflow.models.rnn import rnn_cell
import tensorflow.python.platform
from keras.preprocessing import sequence
from collections import Counter
from cnn_util import *


class Caption_Generator():
    def init_weight(self, dim_in, dim_out, name=None, stddev=1.0):
        return tf.Variable(tf.truncated_normal([dim_in, dim_out], stddev=stddev / math.sqrt(float(dim_in))), name=name)

    def init_bias(self, dim_out, name=None):
        return tf.Variable(tf.zeros([dim_out]), name=name)

    def __init__(self, dim_image, dim_embed, dim_hidden, batch_size, n_lstm_steps, n_words, bias_init_vector=None):

        self.dim_image = np.int(dim_image)
        self.dim_embed = np.int(dim_embed)
        self.dim_hidden = np.int(dim_hidden)
        self.batch_size = np.int(batch_size)
        self.n_lstm_steps = np.int(n_lstm_steps)
        self.n_words = np.int(n_words)

        # keep the word embedding table on the CPU
        with tf.device("/cpu:0"):
            self.Wemb = tf.Variable(tf.random_uniform([n_words, dim_embed], -0.1, 0.1), name='Wemb')

        self.bemb = self.init_bias(dim_embed, name='bemb')

        self.lstm = rnn_cell.BasicLSTMCell(dim_hidden)

        self.encode_img_W = tf.Variable(tf.random_uniform([dim_image, dim_hidden], -0.1, 0.1), name='encode_img_W')
        self.encode_img_b = self.init_bias(dim_hidden, name='encode_img_b')

        self.embed_word_W = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='embed_word_W')

        if bias_init_vector is not None:
            self.embed_word_b = tf.Variable(bias_init_vector.astype(np.float32), name='embed_word_b')
        else:
            self.embed_word_b = self.init_bias(n_words, name='embed_word_b')

    def build_model(self):

        image = tf.placeholder(tf.float32, [self.batch_size, self.dim_image])
        sentence = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])
        mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])

        image_emb = tf.matmul(image, self.encode_img_W) + self.encode_img_b  # (batch_size, dim_hidden)

        state = tf.zeros([self.batch_size, self.lstm.state_size])

        loss = 0.0
        with tf.variable_scope("RNN"):
            for i in range(self.n_lstm_steps):  # n_lstm_steps = maxlen + 2 during training
                if i == 0:
                    current_emb = image_emb
                else:
                    with tf.device("/cpu:0"):
                        current_emb = tf.nn.embedding_lookup(self.Wemb, sentence[:, i-1]) + self.bemb

                if i > 0:
                    tf.get_variable_scope().reuse_variables()

                output, state = self.lstm(current_emb, state)  # (batch_size, dim_hidden)

                if i > 0:  # the token right after the image is #START#; it is not predicted, so no loss at i == 0
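                    # The block below builds dense one-hot targets for step i
                    # with sparse_to_dense, scores them against the LSTM output,
                    # and masks out padded positions so only real words
                    # contribute to the loss.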
                    labels = tf.expand_dims(sentence[:, i], 1)  # (batch_size, 1)
                    indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
                    concated = tf.concat(1, [indices, labels])
                    onehot_labels = tf.sparse_to_dense(
                            concated, tf.pack([self.batch_size, self.n_words]), 1.0, 0.0)  # (batch_size, n_words)

                    logit_words = tf.matmul(output, self.embed_word_W) + self.embed_word_b  # (batch_size, n_words)
                    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logit_words, onehot_labels)
                    cross_entropy = cross_entropy * mask[:, i]

                    current_loss = tf.reduce_sum(cross_entropy)
                    loss = loss + current_loss

        loss = loss / tf.reduce_sum(mask[:, 1:])
        return loss, image, sentence, mask

    def build_generator(self, maxlen):
        image = tf.placeholder(tf.float32, [1, self.dim_image])
        image_emb = tf.matmul(image, self.encode_img_W) + self.encode_img_b

        state = tf.zeros([1, self.lstm.state_size])
        generated_words = []

        with tf.variable_scope("RNN"):
            # the image embedding is fed as the first input; decoding then
            # starts from the #START# token (index 0)
            output, state = self.lstm(image_emb, state)
            last_word = tf.nn.embedding_lookup(self.Wemb, [0]) + self.bemb

            for i in range(maxlen):
                tf.get_variable_scope().reuse_variables()

                output, state = self.lstm(last_word, state)

                logit_words = tf.matmul(output, self.embed_word_W) + self.embed_word_b
                max_prob_word = tf.argmax(logit_words, 1)

                with tf.device("/cpu:0"):
                    last_word = tf.nn.embedding_lookup(self.Wemb, max_prob_word)

                last_word += self.bemb

                generated_words.append(max_prob_word)

        return image, generated_words


def get_caption_data(annotation_path, feat_path):
    feats = np.load(feat_path)
    annotations = pd.read_table(annotation_path, sep='\t', header=None, names=['image', 'caption'])
    captions = annotations['caption'].values

    return feats, captions

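
# Example of the mapping produced below (indices are illustrative): if the
# frequent words were {'a', 'dog', 'runs'}, wordtoix would look like
# {'#START#': 0, 'a': 1, 'dog': 2, 'runs': 3} and ixtoword like
# {0: '.', 1: 'a', 2: 'dog', 3: 'runs'}; index 0 encodes the start token on the
# input side and the end-of-sentence '.' on the output side. bias_init_vector
# holds shifted log word frequencies and is used to initialise embed_word_b.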
def preProBuildWordVocab(sentence_iterator, word_count_threshold=30):  # borrowed this function from NeuralTalk
    print 'preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold, )
    word_counts = {}
    nsents = 0
    for sent in sentence_iterator:
        nsents += 1
        for w in sent.lower().split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1
    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print 'filtered words from %d to %d' % (len(word_counts), len(vocab))

    ixtoword = {}
    ixtoword[0] = '.'  # period at the end of the sentence; make the first dimension the end token
    wordtoix = {}
    wordtoix['#START#'] = 0  # make the first vector the start token
    ix = 1
    for w in vocab:
        wordtoix[w] = ix
        ixtoword[ix] = w
        ix += 1

    word_counts['.'] = nsents
    bias_init_vector = np.array([1.0 * word_counts[ixtoword[i]] for i in ixtoword])
    bias_init_vector /= np.sum(bias_init_vector)  # normalize to frequencies
    bias_init_vector = np.log(bias_init_vector)
    bias_init_vector -= np.max(bias_init_vector)  # shift to a nice numeric range
    return wordtoix, ixtoword, bias_init_vector


##################### Training parameters ######################

dim_embed = 256
dim_hidden = 256
dim_image = 4096
batch_size = 128

n_epochs = 1000
################################################################
####################### Path parameters ########################
model_path = './models/tensorflow'
vgg_path = './data/vgg16.tfmodel'
data_path = './data'
feat_path = './data/feats.npy'
annotation_path = os.path.join(data_path, 'results_20130124.token')
################################################################


def train():

    learning_rate = 0.001
    feats, captions = get_caption_data(annotation_path, feat_path)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions)

    np.save('data/ixtoword', ixtoword)

    # shuffle features and captions together
    index = np.arange(len(feats))
    np.random.shuffle(index)

    feats = feats[index]
    captions = captions[index]

    sess = tf.InteractiveSession()
    n_words = len(wordtoix)
    maxlen = np.max(map(lambda x: len(x.split(' ')), captions))
    caption_generator = Caption_Generator(
            dim_image=dim_image,
            dim_hidden=dim_hidden,
            dim_embed=dim_embed,
            batch_size=batch_size,
            n_lstm_steps=maxlen+2,
            n_words=n_words,
            bias_init_vector=bias_init_vector)

    loss, image, sentence, mask = caption_generator.build_model()

    saver = tf.train.Saver(max_to_keep=50)
    # feed the learning rate through a placeholder so the per-epoch decay at
    # the bottom of the epoch loop actually reaches the optimizer
    lr = tf.placeholder(tf.float32, shape=[])
    train_op = tf.train.AdamOptimizer(lr).minimize(loss)
    tf.initialize_all_variables().run()

    for epoch in range(n_epochs):
        for start, end in zip(
                range(0, len(feats), batch_size),
                range(batch_size, len(feats), batch_size)):

            current_feats = feats[start:end]
            current_captions = captions[start:end]

            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ')[:-1] if word in wordtoix], current_captions)

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=maxlen+1)
            current_caption_matrix = np.hstack([np.full((len(current_caption_matrix), 1), 0), current_caption_matrix]).astype(int)

            current_mask_matrix = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array(map(lambda x: (x != 0).sum() + 2, current_caption_matrix))  # +2 -> #START# and '.'
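            # Mark the first nonzeros[ind] positions of each mask row as valid:
            # the leading #START# column, every real word, and one position for
            # the end-of-sentence '.' (index 0). The remaining padded positions
            # stay zero and are excluded from the loss.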
            for ind, row in enumerate(current_mask_matrix):
                row[:nonzeros[ind]] = 1

            _, loss_value = sess.run([train_op, loss], feed_dict={
                image: current_feats,
                sentence: current_caption_matrix,
                mask: current_mask_matrix,
                lr: learning_rate
                })

            print "Current Cost: ", loss_value

        print "Epoch ", epoch, " is done. Saving the model ... "
        saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
        learning_rate *= 0.95


def test(test_feat='./guitar_player.npy', model_path='./models/tensorflow/model-1', maxlen=30):  # naive greedy search

    ixtoword = np.load('data/ixtoword.npy').tolist()
    n_words = len(ixtoword)

    feat = [np.load(test_feat)]
    sess = tf.InteractiveSession()
    caption_generator = Caption_Generator(
            dim_image=dim_image,
            dim_hidden=dim_hidden,
            dim_embed=dim_embed,
            batch_size=batch_size,
            n_lstm_steps=maxlen,
            n_words=n_words)

    image, generated_words = caption_generator.build_generator(maxlen=maxlen)
    # Important: restore only after build_generator. I used to restore right
    # after constructing Caption_Generator, but TensorFlow's LSTM creates its
    # weights only once the cell has been called, so restoring earlier misses
    # the LSTM variables.
    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    generated_word_index = sess.run(generated_words, feed_dict={image: feat})
    generated_word_index = np.hstack(generated_word_index)

    generated_words = [ixtoword[x] for x in generated_word_index]
    punctuation = np.argmax(np.array(generated_words) == '.') + 1

    generated_sentence = ' '.join(generated_words[:punctuation])
    print generated_sentence
    return generated_sentence


def read_image(path):

    img = crop_image(path, target_height=224, target_width=224)
    if img.shape[2] == 4:
        # drop the alpha channel if present
        img = img[:, :, :3]

    img = img[None, ...]
    return img


def test_tf(test_image_path=None, model_path='./models/model-72', maxlen=30):
    with open(vgg_path, 'rb') as f:
        fileContent = f.read()
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(fileContent)

    images = tf.placeholder("float32", [1, 224, 224, 3])
    tf.import_graph_def(graph_def, input_map={"images": images})

    ixtoword = np.load('./data/ixtoword.npy').tolist()
    n_words = len(ixtoword)

    image_val = read_image(test_image_path)
    sess = tf.InteractiveSession()

    caption_generator = Caption_Generator(
            dim_image=dim_image,
            dim_hidden=dim_hidden,
            dim_embed=dim_embed,
            batch_size=batch_size,
            n_lstm_steps=maxlen,
            n_words=n_words)

    graph = tf.get_default_graph()
    fc7 = sess.run(graph.get_tensor_by_name("import/fc7_relu:0"), feed_dict={images: image_val})

    fc7_tf, generated_words = caption_generator.build_generator(maxlen=maxlen)

    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    generated_word_index = sess.run(generated_words, feed_dict={fc7_tf: fc7})
    generated_word_index = np.hstack(generated_word_index)

    generated_words = [ixtoword[x] for x in generated_word_index]
    punctuation = np.argmax(np.array(generated_words) == '.') + 1

    generated_words = generated_words[:punctuation]
    generated_sentence = ' '.join(generated_words)
    print generated_sentence
--------------------------------------------------------------------------------