├── .gitignore
├── result.jpg
├── ixtoword.npy
├── acoustic-guitar-player.jpg
├── make_flickr_dataset.py
├── LICENSE
├── Readme.md
├── cnn_util.py
└── model.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
models/*
data/*
--------------------------------------------------------------------------------
/result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jazzsaxmafia/show_and_tell.tensorflow/HEAD/result.jpg
--------------------------------------------------------------------------------
/ixtoword.npy:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jazzsaxmafia/show_and_tell.tensorflow/HEAD/ixtoword.npy
--------------------------------------------------------------------------------
/acoustic-guitar-player.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jazzsaxmafia/show_and_tell.tensorflow/HEAD/acoustic-guitar-player.jpg
--------------------------------------------------------------------------------
/make_flickr_dataset.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
import os
import cPickle
from cnn_util import *

# Machine-specific paths: point these at your own VGG-19 caffemodel, deploy
# prototxt, flickr30k token file and image directory.
vgg_model = '/home/taeksoo/Package/caffe/models/vgg/VGG_ILSVRC_19_layers.caffemodel'
vgg_deploy = '/home/taeksoo/Package/caffe/models/vgg/VGG_ILSVRC_19_layers_deploy.prototxt'

annotation_path = './data/results_20130124.token'
flickr_image_path = '../show_attend_and_tell/images/flickr30k-images/'
feat_path = './data/feats.npy'

cnn = CNN(model=vgg_model, deploy=vgg_deploy, width=224, height=224)

annotations = pd.read_table(annotation_path, sep='\t', header=None, names=['image', 'caption'])
annotations['image_num'] = annotations['image'].map(lambda x: x.split('#')[1])
annotations['image'] = annotations['image'].map(lambda x: os.path.join(flickr_image_path, x.split('#')[0]))

if not os.path.exists(feat_path):
    feats = cnn.get_features(annotations['image'].values)
    np.save(feat_path, feats)
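
# Usage sketch (assumes pycaffe is importable and the paths above point at
# files that exist on your machine):
#   $ python make_flickr_dataset.py
# This writes one 4096-d VGG fc7 feature per line of the token file to
# ./data/feats.npy; get_caption_data() in model.py later loads that file for
# training.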
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2016, Taeksoo Kim
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
# Neural Caption Generator
* TensorFlow implementation of "Show and Tell" http://arxiv.org/abs/1411.4555
* Borrowed some code and ideas from Andrej Karpathy's NeuralTalk.
* You need the flickr30k data (images and annotations).

### Code
* make_flickr_dataset.py : extracts VGG FC7 features of the flickr30k images and saves them to './data/feats.npy'
* model.py : TensorFlow version of the caption generator

#### Usage
* Download the Flickr30k dataset.
* Extract VGG features of the Flickr30k images (make_flickr_dataset.py).
* Train: run train() in model.py.
* Test: run test() or test_tf() in model.py (see the sketch below).
  * Parameters: the VGG FC7 feature of the test image and the trained model path.
* Once you have downloaded the TensorFlow VGG net (one of the links below), you don't need Caffe for testing.
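
A rough usage sketch (illustrative only: `guitar_player.npy` is an FC7 feature you save yourself, and the `model-1` / `model-72` checkpoint names depend on which epoch you trained to or which pretrained file you downloaded):

```python
# Run each step in a fresh Python session to avoid graph/variable-name clashes.
import model

# 1. Train on the pre-extracted FC7 features (./data/feats.npy) and the
#    flickr30k token file; checkpoints are written to ./models/tensorflow.
model.train()

# 2. Caption from a saved FC7 feature (.npy) using a trained checkpoint.
model.test(test_feat='./guitar_player.npy',
           model_path='./models/tensorflow/model-1')

# 3. Caption a raw image, extracting FC7 with the TensorFlow VGG net
#    (./data/vgg16.tfmodel) instead of Caffe.
model.test_tf(test_image_path='./acoustic-guitar-player.jpg',
              model_path='./models/model-72')
```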
#### Downloading data/trained model
* Extracted FC7 features: [download](https://drive.google.com/file/d/0B5o40yxdA9PqTnJuWGVkcFlqcG8/view?usp=sharing)
  * Used by train() in model.py; with this you can skip the feature-extraction step.
* Pretrained model: [download](https://drive.google.com/file/d/0B5o40yxdA9PqeW4wY0wwZXhrZkE/view?usp=sharing)
  * Used by test() and test_tf() in model.py. If you do not have time for training, or just want to try the captioning, download this and run the test functions.
* TensorFlow VGG net: [download](https://drive.google.com/file/d/0B5o40yxdA9PqSGtVODN0UUlaWTg/view?usp=sharing)
  * Used by test_tf() in model.py.
* Along with the files above, you also need the flickr30k annotation data (results_20130124.token) from [this link](http://shannon.cs.illinois.edu/DenotationGraph/) for training.

![alt tag](https://github.com/jazzsaxmafia/show_and_tell.tensorflow/blob/master/result.jpg)

### License
* BSD license
--------------------------------------------------------------------------------
/cnn_util.py:
--------------------------------------------------------------------------------
import caffe
import cv2
import numpy as np
import skimage
import skimage.io


def crop_image(x, target_height=227, target_width=227, as_float=True):
    image = skimage.io.imread(x)
    if as_float:
        image = skimage.img_as_float(image).astype(np.float32)

    if len(image.shape) == 2:
        # greyscale image: replicate the single channel three times
        image = np.tile(image[:, :, None], 3)
    elif len(image.shape) == 4:
        image = image[:, :, :, 0]

    height, width, rgb = image.shape
    if width == height:
        resized_image = cv2.resize(image, (target_width, target_height))

    elif height < width:
        # wide image: scale the height to target_height, then center-crop the width
        # (note that cv2.resize takes its size argument as (width, height))
        resized_image = cv2.resize(image, (int(width * float(target_height) / height), target_height))
        cropping_length = int((resized_image.shape[1] - target_width) / 2)
        resized_image = resized_image[:, cropping_length:resized_image.shape[1] - cropping_length]

    else:
        # tall image: scale the width to target_width, then center-crop the height
        resized_image = cv2.resize(image, (target_width, int(height * float(target_width) / width)))
        cropping_length = int((resized_image.shape[0] - target_height) / 2)
        resized_image = resized_image[cropping_length:resized_image.shape[0] - cropping_length, :]

    return cv2.resize(resized_image, (target_width, target_height))


# Default paths for the reference CaffeNet; make_flickr_dataset.py passes the
# VGG-19 model/deploy files in explicitly.
deploy = '/home/taeksoo/Package/caffe/models/bvlc_reference_caffenet/deploy.prototxt'
model = '/home/taeksoo/Package/caffe/models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'
mean = '/home/taeksoo/Package/caffe/python/caffe/imagenet/ilsvrc_2012_mean.npy'


class CNN(object):

    def __init__(self, deploy=deploy, model=model, mean=mean, batch_size=10, width=227, height=227):
        self.deploy = deploy
        self.model = model
        self.mean = mean

        self.batch_size = batch_size
        self.net, self.transformer = self.get_net()
        self.net.blobs['data'].reshape(self.batch_size, 3, height, width)

        self.width = width
        self.height = height

    def get_net(self):
        caffe.set_mode_gpu()
        net = caffe.Net(self.deploy, self.model, caffe.TEST)

        transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
        transformer.set_transpose('data', (2, 0, 1))
        transformer.set_mean('data', np.load(self.mean).mean(1).mean(1))
        transformer.set_raw_scale('data', 255)
        transformer.set_channel_swap('data', (2, 1, 0))

        return net, transformer

    def get_features(self, image_list, layers='fc7', layer_sizes=[4096]):
        iter_until = len(image_list) + self.batch_size
        all_feats = np.zeros([len(image_list)] + layer_sizes)

        for start, end in zip(range(0, iter_until, self.batch_size),
                              range(self.batch_size, iter_until, self.batch_size)):

            image_batch_file = image_list[start:end]
            image_batch = np.array(map(lambda x: crop_image(x, target_width=self.width,
                                                            target_height=self.height), image_batch_file))

            caffe_in = np.zeros(np.array(image_batch.shape)[[0, 3, 1, 2]], dtype=np.float32)

            for idx, in_ in enumerate(image_batch):
                caffe_in[idx] = self.transformer.preprocess('data', in_)

            out = self.net.forward_all(blobs=[layers], **{'data': caffe_in})
            feats = out[layers]

            all_feats[start:end] = feats

        return all_feats
--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
#-*- coding: utf-8 -*-
import math
import os
import tensorflow as tf
import numpy as np
import pandas as pd
import cPickle

from tensorflow.models.rnn import rnn_cell
import tensorflow.python.platform
from keras.preprocessing import sequence
from collections import Counter
from cnn_util import *


class Caption_Generator():
    def init_weight(self, dim_in, dim_out, name=None, stddev=1.0):
        return tf.Variable(tf.truncated_normal([dim_in, dim_out], stddev=stddev / math.sqrt(float(dim_in))), name=name)

    def init_bias(self, dim_out, name=None):
        return tf.Variable(tf.zeros([dim_out]), name=name)

    def __init__(self, dim_image, dim_embed, dim_hidden, batch_size, n_lstm_steps, n_words, bias_init_vector=None):

        self.dim_image = np.int(dim_image)
        self.dim_embed = np.int(dim_embed)
        self.dim_hidden = np.int(dim_hidden)
        self.batch_size = np.int(batch_size)
        self.n_lstm_steps = np.int(n_lstm_steps)
        self.n_words = np.int(n_words)

        # keep the word embedding table on the CPU
        with tf.device("/cpu:0"):
            self.Wemb = tf.Variable(tf.random_uniform([n_words, dim_embed], -0.1, 0.1), name='Wemb')

        self.bemb = self.init_bias(dim_embed, name='bemb')

        self.lstm = rnn_cell.BasicLSTMCell(dim_hidden)

        self.encode_img_W = tf.Variable(tf.random_uniform([dim_image, dim_hidden], -0.1, 0.1), name='encode_img_W')
        self.encode_img_b = self.init_bias(dim_hidden, name='encode_img_b')

        self.embed_word_W = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='embed_word_W')

        if bias_init_vector is not None:
            self.embed_word_b = tf.Variable(bias_init_vector.astype(np.float32), name='embed_word_b')
        else:
            self.embed_word_b = self.init_bias(n_words, name='embed_word_b')

    def build_model(self):

        image = tf.placeholder(tf.float32, [self.batch_size, self.dim_image])
        sentence = tf.placeholder(tf.int32, [self.batch_size, self.n_lstm_steps])
        mask = tf.placeholder(tf.float32, [self.batch_size, self.n_lstm_steps])

        image_emb = tf.matmul(image, self.encode_img_W) + self.encode_img_b  # (batch_size, dim_hidden)

        state = tf.zeros([self.batch_size, self.lstm.state_size])

        loss = 0.0
        with tf.variable_scope("RNN"):
            for i in range(self.n_lstm_steps):  # n_lstm_steps = maxlen + 2 during training
                if i == 0:
                    current_emb = image_emb
                else:
                    with tf.device("/cpu:0"):
                        current_emb = tf.nn.embedding_lookup(self.Wemb, sentence[:, i-1]) + self.bemb

                if i > 0:
                    tf.get_variable_scope().reuse_variables()

                output, state = self.lstm(current_emb, state)  # (batch_size, dim_hidden)

                if i > 0:  # the token right after the image is #START#; it is not predicted, so no loss at i == 0
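                    # The block below builds dense one-hot targets for step i
                    # with sparse_to_dense, scores them against the LSTM output,
                    # and masks out padded positions so only real words
                    # contribute to the loss.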
                    labels = tf.expand_dims(sentence[:, i], 1)  # (batch_size, 1)
                    indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
                    concated = tf.concat(1, [indices, labels])
                    onehot_labels = tf.sparse_to_dense(
                            concated, tf.pack([self.batch_size, self.n_words]), 1.0, 0.0)  # (batch_size, n_words)

                    logit_words = tf.matmul(output, self.embed_word_W) + self.embed_word_b  # (batch_size, n_words)
                    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logit_words, onehot_labels)
                    cross_entropy = cross_entropy * mask[:, i]

                    current_loss = tf.reduce_sum(cross_entropy)
                    loss = loss + current_loss

        loss = loss / tf.reduce_sum(mask[:, 1:])
        return loss, image, sentence, mask

    def build_generator(self, maxlen):
        image = tf.placeholder(tf.float32, [1, self.dim_image])
        image_emb = tf.matmul(image, self.encode_img_W) + self.encode_img_b

        state = tf.zeros([1, self.lstm.state_size])
        generated_words = []

        with tf.variable_scope("RNN"):
            # the image embedding is fed as the first input; decoding then
            # starts from the #START# token (index 0)
            output, state = self.lstm(image_emb, state)
            last_word = tf.nn.embedding_lookup(self.Wemb, [0]) + self.bemb

            for i in range(maxlen):
                tf.get_variable_scope().reuse_variables()

                output, state = self.lstm(last_word, state)

                logit_words = tf.matmul(output, self.embed_word_W) + self.embed_word_b
                max_prob_word = tf.argmax(logit_words, 1)

                with tf.device("/cpu:0"):
                    last_word = tf.nn.embedding_lookup(self.Wemb, max_prob_word)

                last_word += self.bemb

                generated_words.append(max_prob_word)

        return image, generated_words


def get_caption_data(annotation_path, feat_path):
    feats = np.load(feat_path)
    annotations = pd.read_table(annotation_path, sep='\t', header=None, names=['image', 'caption'])
    captions = annotations['caption'].values

    return feats, captions

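
# Example of the mapping produced below (indices are illustrative): if the
# frequent words were {'a', 'dog', 'runs'}, wordtoix would look like
# {'#START#': 0, 'a': 1, 'dog': 2, 'runs': 3} and ixtoword like
# {0: '.', 1: 'a', 2: 'dog', 3: 'runs'}; index 0 encodes the start token on the
# input side and the end-of-sentence '.' on the output side. bias_init_vector
# holds shifted log word frequencies and is used to initialise embed_word_b.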
def preProBuildWordVocab(sentence_iterator, word_count_threshold=30):  # borrowed this function from NeuralTalk
    print 'preprocessing word counts and creating vocab based on word count threshold %d' % (word_count_threshold, )
    word_counts = {}
    nsents = 0
    for sent in sentence_iterator:
        nsents += 1
        for w in sent.lower().split(' '):
            word_counts[w] = word_counts.get(w, 0) + 1
    vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold]
    print 'filtered words from %d to %d' % (len(word_counts), len(vocab))

    ixtoword = {}
    ixtoword[0] = '.'  # period at the end of the sentence; make the first dimension the end token
    wordtoix = {}
    wordtoix['#START#'] = 0  # make the first vector the start token
    ix = 1
    for w in vocab:
        wordtoix[w] = ix
        ixtoword[ix] = w
        ix += 1

    word_counts['.'] = nsents
    bias_init_vector = np.array([1.0 * word_counts[ixtoword[i]] for i in ixtoword])
    bias_init_vector /= np.sum(bias_init_vector)  # normalize to frequencies
    bias_init_vector = np.log(bias_init_vector)
    bias_init_vector -= np.max(bias_init_vector)  # shift to a nice numeric range
    return wordtoix, ixtoword, bias_init_vector


##################### Training parameters ######################

dim_embed = 256
dim_hidden = 256
dim_image = 4096
batch_size = 128

n_epochs = 1000
################################################################
####################### Path parameters ########################
model_path = './models/tensorflow'
vgg_path = './data/vgg16.tfmodel'
data_path = './data'
feat_path = './data/feats.npy'
annotation_path = os.path.join(data_path, 'results_20130124.token')
################################################################


def train():

    learning_rate = 0.001
    feats, captions = get_caption_data(annotation_path, feat_path)
    wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions)

    np.save('data/ixtoword', ixtoword)

    # shuffle features and captions together
    index = np.arange(len(feats))
    np.random.shuffle(index)

    feats = feats[index]
    captions = captions[index]

    sess = tf.InteractiveSession()
    n_words = len(wordtoix)
    maxlen = np.max(map(lambda x: len(x.split(' ')), captions))
    caption_generator = Caption_Generator(
            dim_image=dim_image,
            dim_hidden=dim_hidden,
            dim_embed=dim_embed,
            batch_size=batch_size,
            n_lstm_steps=maxlen+2,
            n_words=n_words,
            bias_init_vector=bias_init_vector)

    loss, image, sentence, mask = caption_generator.build_model()

    saver = tf.train.Saver(max_to_keep=50)
    # feed the learning rate through a placeholder so the per-epoch decay at
    # the bottom of the epoch loop actually reaches the optimizer
    lr = tf.placeholder(tf.float32, shape=[])
    train_op = tf.train.AdamOptimizer(lr).minimize(loss)
    tf.initialize_all_variables().run()

    for epoch in range(n_epochs):
        for start, end in zip(
                range(0, len(feats), batch_size),
                range(batch_size, len(feats), batch_size)):

            current_feats = feats[start:end]
            current_captions = captions[start:end]

            current_caption_ind = map(lambda cap: [wordtoix[word] for word in cap.lower().split(' ')[:-1] if word in wordtoix], current_captions)

            current_caption_matrix = sequence.pad_sequences(current_caption_ind, padding='post', maxlen=maxlen+1)
            current_caption_matrix = np.hstack([np.full((len(current_caption_matrix), 1), 0), current_caption_matrix]).astype(int)

            current_mask_matrix = np.zeros((current_caption_matrix.shape[0], current_caption_matrix.shape[1]))
            nonzeros = np.array(map(lambda x: (x != 0).sum() + 2, current_caption_matrix))  # +2 -> #START# and '.'
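            # Mark the first nonzeros[ind] positions of each mask row as valid:
            # the leading #START# column, every real word, and one position for
            # the end-of-sentence '.' (index 0). The remaining padded positions
            # stay zero and are excluded from the loss.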
            for ind, row in enumerate(current_mask_matrix):
                row[:nonzeros[ind]] = 1

            _, loss_value = sess.run([train_op, loss], feed_dict={
                image: current_feats,
                sentence: current_caption_matrix,
                mask: current_mask_matrix,
                lr: learning_rate
                })

            print "Current Cost: ", loss_value

        print "Epoch ", epoch, " is done. Saving the model ... "
        saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
        learning_rate *= 0.95


def test(test_feat='./guitar_player.npy', model_path='./models/tensorflow/model-1', maxlen=30):  # naive greedy search

    ixtoword = np.load('data/ixtoword.npy').tolist()
    n_words = len(ixtoword)

    feat = [np.load(test_feat)]
    sess = tf.InteractiveSession()
    caption_generator = Caption_Generator(
            dim_image=dim_image,
            dim_hidden=dim_hidden,
            dim_embed=dim_embed,
            batch_size=batch_size,
            n_lstm_steps=maxlen,
            n_words=n_words)

    image, generated_words = caption_generator.build_generator(maxlen=maxlen)
    # Important: restore only after build_generator. I used to restore right
    # after constructing Caption_Generator, but TensorFlow's LSTM creates its
    # weights only once the cell has been called, so restoring earlier misses
    # the LSTM variables.
    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    generated_word_index = sess.run(generated_words, feed_dict={image: feat})
    generated_word_index = np.hstack(generated_word_index)

    generated_words = [ixtoword[x] for x in generated_word_index]
    punctuation = np.argmax(np.array(generated_words) == '.') + 1

    generated_sentence = ' '.join(generated_words[:punctuation])
    print generated_sentence
    return generated_sentence


def read_image(path):

    img = crop_image(path, target_height=224, target_width=224)
    if img.shape[2] == 4:
        # drop the alpha channel if present
        img = img[:, :, :3]

    img = img[None, ...]
    return img


def test_tf(test_image_path=None, model_path='./models/model-72', maxlen=30):
    with open(vgg_path, 'rb') as f:
        fileContent = f.read()
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(fileContent)

    images = tf.placeholder("float32", [1, 224, 224, 3])
    tf.import_graph_def(graph_def, input_map={"images": images})

    ixtoword = np.load('./data/ixtoword.npy').tolist()
    n_words = len(ixtoword)

    image_val = read_image(test_image_path)
    sess = tf.InteractiveSession()

    caption_generator = Caption_Generator(
            dim_image=dim_image,
            dim_hidden=dim_hidden,
            dim_embed=dim_embed,
            batch_size=batch_size,
            n_lstm_steps=maxlen,
            n_words=n_words)

    graph = tf.get_default_graph()
    fc7 = sess.run(graph.get_tensor_by_name("import/fc7_relu:0"), feed_dict={images: image_val})

    fc7_tf, generated_words = caption_generator.build_generator(maxlen=maxlen)

    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    generated_word_index = sess.run(generated_words, feed_dict={fc7_tf: fc7})
    generated_word_index = np.hstack(generated_word_index)

    generated_words = [ixtoword[x] for x in generated_word_index]
    punctuation = np.argmax(np.array(generated_words) == '.') + 1

    generated_words = generated_words[:punctuation]
    generated_sentence = ' '.join(generated_words)
    print generated_sentence
--------------------------------------------------------------------------------