├── README.md
├── data
│   ├── test
│   │   ├── answers.txt
│   │   ├── img_ids.txt
│   │   ├── questions.txt
│   │   └── types.txt
│   └── train
│       ├── answers.txt
│       ├── img_ids.txt
│       ├── questions.txt
│       └── types.txt
├── imageQA_demo.gif
├── preprocessing
│   ├── 1_data2pickle.py
│   ├── 2_makeDict.py
│   ├── 3_modifyPkl.py
│   ├── 4_downloadImage.py
│   ├── 5_getCNNFeature.py
│   └── cnn.py
├── tensorflow_simple
│   ├── imageQA_tensorflow.py
│   └── web
│       ├── cnn.pkl
│       ├── cnn.py
│       ├── cnn.pyc
│       ├── cnn4web.py
│       ├── imageQA_tensorflow.py
│       ├── imageQA_tensorflow.pyc
│       ├── images
│       │   └── moodo.jpg
│       ├── server.py
│       └── templates
│           └── iqa.html
├── theano_attention
│   ├── cnn.py
│   ├── cnn.pyc
│   ├── main.py
│   ├── models.py
│   ├── models.pyc
│   ├── optimizer.py
│   ├── optimizer.pyc
│   └── prediction.py
└── theano_simple
    ├── cnn.py
    ├── main.py
    ├── models.py
    ├── optimizer.py
    └── prediction.py

/README.md:
--------------------------------------------------------------------------------

1 | # Image Question Answering
2 |
3 | reference paper : http://arxiv.org/abs/1505.02074
4 |
5 | dataset : http://www.cs.toronto.edu/~mren/imageqa/data/cocoqa/
6 |
7 |

8 | theano
9 | - simple : fc7 features with RNNs
10 |
11 | - attention : conv5_4 features with RNNs and soft attention (similar to 'Show, Attend and Tell', http://arxiv.org/abs/1502.03044)
12 |
13 |
14 |
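a rough numpy sketch of the soft-attention step (shapes and weight names follow `gru_layer` in `theano_attention/models.py`; weights here are random stand-ins and the word-to-context term is omitted):

```python
import numpy as np

rng = np.random.RandomState(0)
regions, dim_ctx, dim = 196, 512, 1024      # 14x14 conv5_4 grid, GRU hidden size

ctx = rng.randn(regions, dim_ctx)           # region features of one image
h = rng.randn(dim)                          # previous GRU hidden state

W_ctx_att = rng.uniform(-0.1, 0.1, (dim_ctx, dim_ctx))
b_ctx_att = np.zeros(dim_ctx)
W_dim_att = rng.uniform(-0.1, 0.1, (dim, dim_ctx))
U_att = rng.uniform(-0.1, 0.1, (dim_ctx, 1))
c_att = np.zeros(1)

proj = np.tanh(ctx.dot(W_ctx_att) + b_ctx_att + h.dot(W_dim_att))  # regions * dim_ctx
e = (proj.dot(U_att) + c_att).ravel()              # one energy per region
alpha = np.exp(e - e.max()); alpha /= alpha.sum()  # softmax over the 196 regions
ctx_hat = alpha.dot(ctx)                           # expected context fed to the GRU step
print(alpha.shape, ctx_hat.shape)                  # (196,), (512,)
```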

15 | tensorflow
16 | - simple : fc7 features with RNNs
17 |
18 |
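both 'simple' variants share one wiring: the fc7 feature is linearly embedded and fed as the first RNN step, the question words follow, and the masked mean of the hidden states feeds a softmax over single-word answers. a minimal runnable numpy sketch with toy sizes and random weights (real sizes are 4096/1024/12047/430):

```python
import numpy as np

rng = np.random.RandomState(0)
dim_ictx, dim, n_vocab, y_size = 8, 4, 20, 5          # toy sizes

def init(*shape):
    return rng.uniform(-0.1, 0.1, shape)

W_iemb, b_iemb = init(dim_ictx, dim), np.zeros(dim)   # image embedding
W_wemb = init(n_vocab, dim)                           # word embedding table
Wz, Uz, Wr, Ur, Wh, Uh = [init(dim, dim) for _ in range(6)]
W_pred, b_pred = init(dim, y_size), np.zeros(y_size)  # answer classifier

def sigmoid(v):
    return 1. / (1. + np.exp(-v))

def gru_step(x, h):
    z = sigmoid(x.dot(Wz) + h.dot(Uz))                # update gate
    r = sigmoid(x.dot(Wr) + h.dot(Ur))                # reset gate
    return z * h + (1. - z) * np.tanh(x.dot(Wh) + (r * h).dot(Uh))

fc7 = rng.randn(dim_ictx)                             # stand-in CNN feature
q_idx = [3, 11, 7]                                    # stand-in question word indices

inputs = [fc7.dot(W_iemb) + b_iemb] + [W_wemb[i] for i in q_idx]
h = np.zeros(dim)
hiddens = []
for x in inputs:                                      # image first, then the words
    h = gru_step(x, h)
    hiddens.append(h)

h_mean = np.mean(hiddens, axis=0)                     # masked mean (all steps valid here)
logits = h_mean.dot(W_pred) + b_pred
probs = np.exp(logits - logits.max()); probs /= probs.sum()
print('predicted answer index:', int(probs.argmax()))
```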



19 | ![demo](imageQA_demo.gif)
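rough run order (a sketch; every script hard-codes absolute paths, so edit those first, and the web demo under `tensorflow_simple/web/` additionally expects a trained checkpoint):

```
python preprocessing/1_data2pickle.py    # txt files -> data/train.pkl, data/test.pkl
python preprocessing/2_makeDict.py       # word/answer dictionaries -> data/dict.pkl
python preprocessing/3_modifyPkl.py      # add index-encoded 'q' / 'a' columns
python preprocessing/4_downloadImage.py  # collect the MS COCO images
python preprocessing/5_getCNNFeature.py  # VGG fc7 features -> train_vgg.pkl, test_vgg.pkl
python tensorflow_simple/imageQA_tensorflow.py   # train / test (toggle is_train in main)
```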

20 | 21 | -------------------------------------------------------------------------------- /imageQA_demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seankim902/imageQA/25f70ba1dc013eac6f7c734830f04663fd7b58f2/imageQA_demo.gif -------------------------------------------------------------------------------- /preprocessing/1_data2pickle.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pandas as pd 4 | import os 5 | 6 | ### test data to pickle 7 | os.chdir('/Users/seonhoon/Desktop/workspace_python/ImageQA/data/test') 8 | test=pd.DataFrame() 9 | 10 | data=[] 11 | with open('img_ids.txt') as f: 12 | for line in f: 13 | data.append(line.split('\n')[0]) 14 | test['img_id']=data 15 | 16 | data=[] 17 | with open('questions.txt') as f: 18 | for line in f: 19 | data.append(line.split('\n')[0]) 20 | test['question']=data 21 | 22 | data=[] 23 | with open('answers.txt') as f: 24 | for line in f: 25 | data.append(line.split('\n')[0]) 26 | test['answer']=data 27 | 28 | data=[] 29 | with open('types.txt') as f: 30 | for line in f: 31 | data.append(line.split('\n')[0]) 32 | test['type']=data 33 | 34 | test.to_pickle('../test.pkl') 35 | 36 | 37 | ### train data to pickle 38 | os.chdir('/Users/seonhoon/Desktop/workspace_python/ImageQA/data/train') 39 | train=pd.DataFrame() 40 | 41 | data=[] 42 | with open('img_ids.txt') as f: 43 | for line in f: 44 | data.append(line.split('\n')[0]) 45 | train['img_id']=data 46 | 47 | data=[] 48 | with open('questions.txt') as f: 49 | for line in f: 50 | data.append(line.split('\n')[0]) 51 | train['question']=data 52 | 53 | data=[] 54 | with open('answers.txt') as f: 55 | for line in f: 56 | data.append(line.split('\n')[0]) 57 | train['answer']=data 58 | 59 | data=[] 60 | with open('types.txt') as f: 61 | for line in f: 62 | data.append(line.split('\n')[0]) 63 | train['type']=data 64 | 65 | train.to_pickle('../train.pkl') 66 | -------------------------------------------------------------------------------- /preprocessing/2_makeDict.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | import os 4 | import cPickle 5 | 6 | os.chdir('/Users/seonhoon/Desktop/workspace_python/ImageQA/data/') 7 | 8 | train=pd.read_pickle('train.pkl') 9 | test=pd.read_pickle('test.pkl') 10 | 11 | questions = train['question'].tolist()+test['question'].tolist() 12 | questions = [question.split() for question in questions] 13 | 14 | tokens=[] 15 | 16 | for question in questions: 17 | for word in question: 18 | tokens.append(word) 19 | 20 | tokens = list(set(tokens)) 21 | tokens.sort() 22 | 23 | idx2word = dict([(i,k) for i,k in enumerate(tokens)]) 24 | word2idx = dict([(k,i) for i,k in enumerate(tokens)]) 25 | 26 | 27 | 28 | 29 | answers = list(set(train['answer'].tolist()+test['answer'].tolist())) 30 | answers.sort() 31 | 32 | idx2answer = dict([(i,k) for i,k in enumerate(answers)]) 33 | answer2idx = dict([(k,i) for i,k in enumerate(answers)]) 34 | 35 | with open('dict.pkl', 'wb') as f: 36 | cPickle.dump([idx2word, word2idx, idx2answer, answer2idx], f) -------------------------------------------------------------------------------- /preprocessing/3_modifyPkl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pandas as pd 4 | import os 5 | import cPickle 6 | 7 | 
os.chdir('/Users/seonhoon/Desktop/workspace_python/ImageQA/data/') 8 | 9 | 10 | with open('dict.pkl', 'rb') as f: 11 | dict=cPickle.load(f) 12 | idx2word=dict[0] 13 | word2idx=dict[1] 14 | idx2answer=dict[2] 15 | answer2idx=dict[3] 16 | 17 | train=pd.read_pickle('train.pkl') 18 | test=pd.read_pickle('test.pkl') 19 | 20 | def getIndex(x, type='question'): 21 | xs=x.split() 22 | idx=[] 23 | getIdx=word2idx 24 | if type=='answer': 25 | getIdx=answer2idx 26 | for x in xs: 27 | idx.append(getIdx[x]) 28 | return idx 29 | 30 | train['q']=train['question'].apply(lambda x : getIndex(x)) 31 | train['a']=train['answer'].apply(lambda x : getIndex(x,type='answer')) 32 | test['q']=test['question'].apply(lambda x : getIndex(x)) 33 | test['a']=test['answer'].apply(lambda x : getIndex(x,type='answer')) 34 | 35 | train.to_pickle('train.pkl') 36 | test.to_pickle('test.pkl') 37 | -------------------------------------------------------------------------------- /preprocessing/4_downloadImage.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import urllib 4 | import pandas as pd 5 | import os 6 | import shutil 7 | 8 | os.chdir("/home/seonhoon/Desktop/workspace/ImageQA/data/images/") 9 | 10 | 11 | 12 | i=0 13 | 14 | def downloadImage(id, location): 15 | global i 16 | i=i+1 17 | if i%1000==0 : 18 | print 'iter : ', i 19 | fn=id.zfill(12) 20 | if os.path.isfile('/home/seonhoon/Downloads/test2014/COCO_test2014_'+fn+'.jpg'): 21 | print 'shutil.copy2 test ', id 22 | shutil.copy2('/home/seonhoon/Downloads/test2014/COCO_test2014_'+fn+'.jpg', location+'/'+id+".jpg") 23 | elif os.path.isfile('/home/seonhoon/Downloads/train2014/COCO_train2014_'+fn+'.jpg'): 24 | print 'shutil.copy2 train ', id 25 | shutil.copy2('/home/seonhoon/Downloads/train2014/COCO_train2014_'+fn+'.jpg', location+'/'+id+".jpg") 26 | elif os.path.isfile(location+'/'+id+".jpg"): 27 | print 'location ', id 28 | else: 29 | print 'download ', id 30 | urllib.urlretrieve("http://mscoco.org/images/"+id, location+'/'+id+".jpg") 31 | 32 | #train=pd.read_pickle('/home/seonhoon/Desktop/workspace/ImageQA/data/train.pkl') 33 | 34 | #train['img_id'].apply(lambda x : downloadImage(x, 'train')) 35 | 36 | test=pd.read_pickle('/home/seonhoon/Desktop/workspace/ImageQA/data/test.pkl') 37 | 38 | test['img_id'].apply(lambda x : downloadImage(x, 'test')) -------------------------------------------------------------------------------- /preprocessing/5_getCNNFeature.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | from cnn import * 5 | import pandas as pd 6 | import os 7 | 8 | 9 | os.chdir('/home/seonhoon/Desktop/workspace/ImageQA/data/') 10 | train=pd.read_pickle('train.pkl') 11 | test=pd.read_pickle('test.pkl') 12 | 13 | 14 | os.chdir('/home/seonhoon/Desktop/workspace/ImageQA/data/caffemodel/vgg16') 15 | 16 | #/home/seonhoon/Desktop/workspace/ImageQA/data/images/train/xxx.jpg 17 | img_folder='/home/seonhoon/Desktop/workspace/ImageQA/data/images/' 18 | 19 | # train 20 | print 'train .. ' 21 | imglist = train['img_id'].apply(lambda x : img_folder+'train/'+x+'.jpg') 22 | imglist = imglist.tolist() 23 | cnn = CNN() 24 | featurelist = cnn.get_features(imglist) 25 | train['cnn_feature']=0 26 | train['cnn_feature']=train['cnn_feature'].astype(object) 27 | for i in range(len(train)): 28 | train.loc[i,'cnn_feature']=featurelist[i] 29 | 30 | 31 | 32 | #test 33 | print 'test .. 
' 34 | imglist = test['img_id'].apply(lambda x : img_folder+'test/'+x+'.jpg') 35 | imglist = imglist.tolist() 36 | cnn = CNN() 37 | featurelist = cnn.get_features(imglist) 38 | test['cnn_feature']=0 39 | test['cnn_feature']=test['cnn_feature'].astype(object) 40 | for i in range(len(test)): 41 | test.loc[i,'cnn_feature']=featurelist[i] 42 | 43 | # save pickles with cnn features 44 | os.chdir('/home/seonhoon/Desktop/workspace/ImageQA/data/') 45 | train.to_pickle('train_vgg.pkl') 46 | test.to_pickle('test_vgg.pkl') 47 | -------------------------------------------------------------------------------- /preprocessing/cnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import caffe 5 | import numpy as np 6 | from scipy.misc import imread, imresize 7 | 8 | class CNN(object): 9 | 10 | def __init__(self, deploy='VGG_ILSVRC_16_layers_deploy.prototxt', model='VGG_ILSVRC_16_layers.caffemodel'): 11 | caffe.set_mode_gpu() 12 | self.net = caffe.Net(deploy, model, caffe.TEST) 13 | 14 | if model.startswith('VGG_ILSVRC_16_layers'): 15 | self.mean = np.array([103.939, 116.779, 123.68]) 16 | 17 | 18 | def get_batch_features(self, in_data, net, layer): 19 | out = net.forward(blobs=[layer], **{net.inputs[0]: in_data}) 20 | features = out[layer]#.squeeze(axis=(2,3)) 21 | return features 22 | 23 | def get_features(self, filelist, layer='fc7'): 24 | 25 | N, channel, height, width = self.net.blobs[self.net.inputs[0]].data.shape 26 | feature = self.net.blobs[layer].data.shape[1] 27 | n_files = len(filelist) 28 | img_height, img_width, _ = imread(filelist[0]).shape 29 | all_features = np.zeros((n_files, feature)) 30 | for i in range(0, n_files, N): 31 | in_data = np.zeros((N, channel, height, width), dtype=np.float32) 32 | 33 | batch_range = range(i, min(i+N, n_files)) 34 | batch_filelist = [filelist[j] for j in batch_range] 35 | 36 | batch_images = np.zeros((len(batch_range), 3, height, width)) 37 | for j,file in enumerate(batch_filelist): 38 | im = imread(file) 39 | if len(im.shape) == 2: 40 | im = np.tile(im[:,:,np.newaxis], (1,1,3)) 41 | im = im[:,:,(2,1,0)] # RGB -> BGR 42 | im = imresize(im, (height, width), 'bicubic') # resize first (imresize rescales its input to uint8) 43 | im = im - self.mean # mean subtraction after resizing 44 | im = np.transpose(im, (2, 0, 1)) # get channel in correct dimension 45 | batch_images[j,:,:,:] = im 46 | 47 | in_data[0:len(batch_range), :, :, :] = batch_images 48 | 49 | features = self.get_batch_features(in_data, self.net, layer) 50 | 51 | for j in range(len(batch_range)): 52 | all_features[i+j,:] = features[j,:] 53 | 54 | print 'Done %d/%d files' % (i+len(batch_range), len(filelist)) 55 | 56 | return all_features 57 | -------------------------------------------------------------------------------- /tensorflow_simple/imageQA_tensorflow.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import time 5 | import pandas as pd 6 | import numpy as np 7 | 8 | from keras.utils import np_utils 9 | 10 | #import tensorflow.python.platform 11 | import tensorflow as tf 12 | from tensorflow.models.rnn import rnn_cell 13 | 14 | 15 | # To do 16 | ''' 17 | last mini-batch size issue 18 | 19 | 20 | ''' 21 | 22 | def prepare_data(seqs_x, maxlen=None): 23 | lengths_x = [len(s) for s in seqs_x] 24 | 25 | n_samples = len(seqs_x) 26 | if maxlen is None: 27 | maxlen = np.max(lengths_x) + 1 28 | 29 | x = np.zeros((n_samples, maxlen)).astype('int32') 30 | x_mask = np.zeros((n_samples, maxlen)).astype('float32') 31 | 32 | for idx, s_x in 
enumerate(seqs_x): 33 | x[idx, :lengths_x[idx]] = s_x 34 | x_mask[idx, :lengths_x[idx]+1] = 1. # Adding 1, for image 35 | 36 | return x, x_mask 37 | 38 | 39 | def get_minibatch_indices(n, batch_size, shuffle=False): 40 | 41 | idx_list = np.arange(n, dtype="int32") 42 | 43 | if shuffle: 44 | np.random.shuffle(idx_list) 45 | 46 | minibatches = [] 47 | minibatch_start = 0 48 | for i in range(n // batch_size): 49 | minibatches.append(idx_list[minibatch_start: 50 | minibatch_start + batch_size]) 51 | minibatch_start += batch_size 52 | # if (minibatch_start != n): # last mini-batch issue !!! 53 | # minibatches.append(idx_list[minibatch_start:]) 54 | return minibatches 55 | 56 | 57 | 58 | class ImageQA(object): 59 | 60 | def __init__(self, config): 61 | 62 | self.config = config 63 | 64 | self.vocab_size = vocab_size = config.vocab_size 65 | self.y_size = y_size = config.y_size 66 | 67 | self.batch_size = batch_size = config.batch_size 68 | self.steps = config.steps 69 | 70 | self.layers = layers = config.layers 71 | 72 | self.dim_ictx = dim_ictx = config.dim_ictx 73 | self.dim_iemb = dim_iemb = config.dim_iemb 74 | self.dim_wemb = dim_wemb = config.dim_wemb 75 | self.dim_hidden = dim_hidden = config.dim_hidden 76 | 77 | self.lr = tf.Variable(config.lr, trainable=False) 78 | 79 | rnn_type = config.rnn_type 80 | if rnn_type == 'gru': 81 | rnn_ = rnn_cell.GRUCell(dim_hidden) 82 | elif rnn_type == 'lstm': 83 | rnn_ = rnn_cell.BasicLSTMCell(dim_hidden) 84 | 85 | if layers is not None: 86 | self.my_rnn = my_rnn = rnn_cell.MultiRNNCell([rnn_] * layers) 87 | self.init_state = my_rnn.zero_state(batch_size, tf.float32) 88 | else: 89 | self.my_rnn = my_rnn = rnn_ 90 | self.init_state = tf.zeros([batch_size, my_rnn.state_size]) 91 | 92 | self.W_iemb = tf.get_variable("W_iemb", [dim_ictx, dim_iemb]) 93 | self.b_iemb = tf.get_variable("b_iemb", [dim_iemb]) 94 | with tf.device("/cpu:0"): 95 | self.W_wemb = tf.get_variable("W_wemb", [vocab_size, dim_wemb]) 96 | 97 | if config.is_birnn : # concat seems to work better than add.. 98 | self.W_pred = tf.get_variable("W_pred", [dim_hidden * 2, y_size]) 99 | else : 100 | self.W_pred = tf.get_variable("W_pred", [dim_hidden, y_size]) 101 | 102 | self.b_pred = tf.get_variable("b_pred", [y_size]) 103 | 104 | 105 | def build_model(self): 106 | 107 | x = tf.placeholder(tf.int32, [self.batch_size, self.steps]) 108 | x_mask = tf.placeholder(tf.float32, [self.batch_size, self.steps]) 109 | y = tf.placeholder(tf.float32, [self.batch_size, self.y_size]) 110 | img = tf.placeholder(tf.float32, [self.batch_size, self.dim_ictx]) 111 | 112 | 113 | 114 | with tf.device("/cpu:0"): 115 | inputs = tf.split(1, self.steps, tf.nn.embedding_lookup(self.W_wemb, x)) 116 | # sample * steps * dim -> split -> sample * 1 * dim 117 | inputs = [tf.squeeze(input_, [1]) for input_ in inputs] 118 | # [sample * dim, sample * dim, sample * dim, ... 
] 119 | img_emb = tf.nn.xw_plus_b(img, self.W_iemb, self.b_iemb) 120 | inputs = [img_emb]+inputs[:-1] # -1 is for img 121 | 122 | 123 | 124 | hiddens = [] 125 | states = [] 126 | 127 | 128 | state = self.init_state 129 | with tf.variable_scope("RNN", reuse=None): 130 | for i in range(len(inputs)): 131 | if i == 0: 132 | (hidden, state) = self.my_rnn(inputs[i], state) 133 | else: 134 | m = x_mask[:, i] 135 | 136 | tf.get_variable_scope().reuse_variables() 137 | (prev_hidden, prev_state) = (hidden, state) # for masking 138 | (hidden, state) = self.my_rnn(inputs[i], state) 139 | 140 | m_1 = tf.expand_dims(m,1) 141 | m_1 = tf.tile(m_1, [1, self.dim_hidden]) 142 | m_0 = tf.expand_dims(1. - m,1) 143 | m_0 = tf.tile(m_0, [1, self.dim_hidden]) 144 | hidden = tf.add(tf.mul(m_1, hidden), tf.mul(m_0, prev_hidden)) 145 | state = tf.add(tf.mul(m_1, state), tf.mul(m_0, prev_state)) 146 | hiddens.append(hidden) 147 | states.append(state) 148 | 149 | if self.config.is_birnn : 150 | rhiddens = [] 151 | rstates = [] 152 | rx_mask = tf.reverse(x_mask,[False, True]) 153 | rinputs = inputs[::-1] 154 | state = self.init_state 155 | with tf.variable_scope("rRNN", reuse=None): 156 | for i in range(len(rinputs)): 157 | if i == 0: 158 | (hidden, state) = self.my_rnn(rinputs[i], state) # feed the reversed inputs here as well 159 | else: 160 | m = rx_mask[:, i] 161 | 162 | tf.get_variable_scope().reuse_variables() 163 | (prev_hidden, prev_state) = (hidden, state) # for masking 164 | (hidden, state) = self.my_rnn(rinputs[i], state) 165 | m_1 = tf.expand_dims(m,1) 166 | m_1 = tf.tile(m_1, [1, self.dim_hidden]) 167 | m_0 = tf.expand_dims(1. - m,1) 168 | m_0 = tf.tile(m_0, [1, self.dim_hidden]) 169 | hidden = tf.add(tf.mul(m_1, hidden), tf.mul(m_0, prev_hidden)) 170 | state = tf.add(tf.mul(m_1, state), tf.mul(m_0, prev_state)) 171 | rhiddens.append(hidden) 172 | rstates.append(state) 173 | 174 | 175 | hiddens = tf.concat(2, [tf.pack(hiddens), tf.pack(rhiddens[::-1])]) 176 | #hiddens = tf.add(tf.pack(hiddens), tf.pack(rhiddens[::-1])) 177 | hiddens = tf.unpack(hiddens) 178 | 179 | ''' 180 | mean or last hidden -> logit_hidden : sample * dim 181 | ''' 182 | # 1. last hidden 183 | #logit_hidden = hiddens[-1] #tf.reduce_mean(hiddens, 0) 184 | # 2. 
mean of hiddens 185 | x_mask_t = tf.transpose(x_mask) 186 | x_mask_t_denom = tf.expand_dims(tf.reduce_sum(x_mask_t, 0), 1) 187 | x_mask_t_denom = tf.tile(x_mask_t_denom, [1, self.W_pred.get_shape().dims[0].value]) 188 | x_mask_t = tf.expand_dims(x_mask_t, 2) 189 | x_mask_t = tf.tile(x_mask_t, [1, 1, self.W_pred.get_shape().dims[0].value]) 190 | hiddens = tf.pack(hiddens) 191 | logit_hidden = tf.reduce_sum(tf.mul(hiddens, x_mask_t), 0) 192 | logit_hidden = tf.div(logit_hidden, x_mask_t_denom) 193 | 194 | 195 | if self.config.dropout is not None: 196 | logit_hidden = tf.nn.dropout(logit_hidden, self.config.dropout) 197 | 198 | 199 | logits = tf.nn.xw_plus_b(logit_hidden, self.W_pred, self.b_pred) 200 | 201 | probs = tf.nn.softmax(logits) 202 | prediction = tf.argmax(probs, 1) 203 | correct_prediction = tf.equal(prediction, tf.argmax(y,1)) 204 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 205 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, y) 206 | loss = tf.reduce_mean(cross_entropy) 207 | train_op = tf.train.AdamOptimizer(self.lr).minimize(loss) 208 | return x, x_mask, y, img, loss, train_op, accuracy, prediction 209 | 210 | 211 | def train(): 212 | 213 | 214 | config = get_config() 215 | 216 | os.chdir('/home/seonhoon/Desktop/workspace/ImageQA/data/') 217 | train = pd.read_pickle('train_vgg.pkl') 218 | train_x = [ q for q in train['q'] ] 219 | train_y = [ a[0] for a in train['a'] ] 220 | train_y = np.array(train_y)[:,None] 221 | train_y = np_utils.to_categorical(train_y, config.y_size).astype('float32') 222 | train_x , train_x_mask = prepare_data(train_x, config.steps) 223 | train_x_img = np.array([ img.tolist() for img in train['cnn_feature'] ]).astype('float32') 224 | 225 | n_train = len(train_x) 226 | 227 | print 'train_x :', train_x.shape 228 | print 'train_x_mask :', train_x_mask.shape 229 | print 'train_x_img :', train_x_img.shape 230 | print 'train_y :',train_y.shape 231 | 232 | if config.valid_epoch is not None: 233 | valid=pd.read_pickle('test_vgg.pkl') 234 | valid_x=[ q for q in valid['q'] ] 235 | valid_y=[ a[0] for a in valid['a'] ] 236 | valid_y=np.array(valid_y)[:,None] 237 | valid_y = np_utils.to_categorical(valid_y, config.y_size).astype('float32') 238 | valid_x , valid_x_mask = prepare_data(valid_x, config.steps) 239 | valid_x_img = np.array([ img.tolist() for img in valid['cnn_feature'] ]).astype('float32') 240 | n_valid = len(valid_x) 241 | valid_batch_indices=get_minibatch_indices(n_valid, config.batch_size, shuffle=False) 242 | 243 | print 'valid_x :', valid_x.shape 244 | print 'valid_x_mask :', valid_x_mask.shape 245 | print 'valid_x_img :', valid_x_img.shape 246 | print 'valid_y :',valid_y.shape 247 | 248 | 249 | 250 | with tf.Session() as sess: 251 | 252 | 253 | initializer = tf.random_normal_initializer(0, 0.1) 254 | with tf.variable_scope("model", reuse=None, initializer=initializer): 255 | model = ImageQA(config = config) 256 | 257 | x, x_mask, y, img, loss, train_op, accuracy, prediction = model.build_model() 258 | 259 | saver = tf.train.Saver() 260 | sess.run(tf.initialize_all_variables()) 261 | 262 | for i in range(config.epoch): 263 | start = time.time() 264 | lr_decay = config.lr_decay ** max(i - config.decay_epoch, 0.0) 265 | sess.run(tf.assign(model.lr, config.lr * lr_decay)) 266 | 267 | 268 | batch_indices=get_minibatch_indices(n_train, config.batch_size, shuffle=True) 269 | 270 | preds = [] 271 | for j, indices in enumerate(batch_indices): 272 | 273 | x_ = np.array([ train_x[k,:] for k in indices]) 274 | x_mask_ = 
np.array([ train_x_mask[k,:] for k in indices]) 275 | y_ = np.array([ train_y[k,:] for k in indices]) 276 | img_ = np.array([ train_x_img[k,:] for k in indices]) 277 | 278 | 279 | 280 | cost, _, acc, pred = sess.run([loss, train_op, accuracy, prediction], 281 | {x: x_, 282 | x_mask: x_mask_, 283 | y: y_, 284 | img : img_}) 285 | preds = preds + pred.tolist() 286 | if j % 99 == 0 : 287 | print 'cost : ', cost, ', accuracy : ', acc, ', iter : ', j+1, ' in epoch : ',i+1 288 | print 'cost : ', cost, ', accuracy : ', acc, ', iter : ', j+1, ' in epoch : ',i+1,' elapsed time : ', int(time.time()-start) 289 | if config.valid_epoch is not None: # for validation 290 | best_accuracy = 0. 291 | 292 | if (i+1) % config.valid_epoch == 0: 293 | val_preds = [] 294 | for j, indices in enumerate(valid_batch_indices): 295 | x_ = np.array([ valid_x[k,:] for k in indices]) 296 | x_mask_ = np.array([ valid_x_mask[k,:] for k in indices]) 297 | y_ = np.array([ valid_y[k,:] for k in indices]) 298 | img_ = np.array([ valid_x_img[k,:] for k in indices]) 299 | 300 | pred = sess.run(prediction, 301 | {x: x_, 302 | x_mask: x_mask_, 303 | y: y_, 304 | img : img_}) 305 | 306 | val_preds = val_preds + pred.tolist() 307 | valid_acc = np.mean(np.equal(val_preds, np.argmax(valid_y,1))) 308 | print '##### valid accuracy : ', valid_acc, ' after epoch ', i+1 309 | if valid_acc > best_accuracy and i >= 10: 310 | best_accuracy = valid_acc 311 | saver.save(sess, config.model_ckpt_path, global_step=int(best_accuracy*100)) 312 | 313 | 314 | 315 | def test(): 316 | 317 | config = get_config() 318 | 319 | os.chdir('/home/seonhoon/Desktop/workspace/ImageQA/data/') 320 | 321 | test=pd.read_pickle('test_vgg.pkl') 322 | test_x=[ q for q in test['q'] ] 323 | test_y=[ a[0] for a in test['a'] ] 324 | test_y=np.array(test_y)[:,None] 325 | test_y = np_utils.to_categorical(test_y, config.y_size).astype('float32') 326 | test_x , test_x_mask = prepare_data(test_x, config.steps) 327 | test_x_img = np.array([ img.tolist() for img in test['cnn_feature'] ]).astype('float32') 328 | n_test = len(test_x) 329 | test_batch_indices=get_minibatch_indices(n_test, config.batch_size, shuffle=False) 330 | 331 | 332 | 333 | with tf.Session() as sess: 334 | 335 | 336 | with tf.variable_scope("model", reuse=None): 337 | model = ImageQA(config = config) 338 | 339 | x, x_mask, y, img, _, _, _, prediction = model.build_model() 340 | saver = tf.train.Saver() 341 | ckpt = tf.train.get_checkpoint_state(os.path.dirname(config.model_ckpt_path)) 342 | sess.run(tf.initialize_all_variables()) 343 | saver.restore(sess, ckpt.model_checkpoint_path) 344 | test_preds = [] 345 | for j, indices in enumerate(test_batch_indices): 346 | x_ = np.array([test_x[k,:] for k in indices]) 347 | x_mask_ = np.array([ test_x_mask[k,:] for k in indices]) 348 | y_ = np.array([ test_y[k,:] for k in indices]) 349 | img_ = np.array([ test_x_img[k,:] for k in indices]) 350 | 351 | pred = sess.run(prediction, 352 | {x: x_, 353 | x_mask: x_mask_, 354 | y: y_, 355 | img : img_}) 356 | test_preds = test_preds + pred.tolist() 357 | 358 | 359 | test_acc = np.mean(np.equal(test_preds, np.argmax(test_y,1))) 360 | print 'test accuracy :', test_acc 361 | 362 | 363 | def test_sample(test_x, test_x_maks, test_x_img): 364 | 365 | config = get_config() 366 | config.batch_size = 1 367 | 368 | with tf.Session() as sess: 369 | with tf.variable_scope("model", reuse=None): 370 | model = ImageQA(config = config) 371 | 372 | x, x_mask, _, img, _, _, _, prediction = model.build_model() 373 | saver = tf.train.Saver() 374 | 
ckpt = tf.train.get_checkpoint_state(os.path.dirname(config.model_ckpt_path)) 375 | saver.restore(sess, ckpt.model_checkpoint_path) 376 | 377 | x_ = test_x 378 | x_mask_ = test_x_maks 379 | img_ = test_x_img 380 | 381 | pred = sess.run(prediction, 382 | {x: x_, 383 | x_mask: x_mask_, 384 | img : img_}) 385 | 386 | return pred 387 | 388 | 389 | 390 | 391 | def get_config(): 392 | class Config1(object): 393 | vocab_size = 12047 394 | y_size = 430 395 | batch_size = 364 #703 #s28 396 | steps = 60 397 | 398 | dim_ictx = 4096 399 | dim_iemb = 1024 # image embedding 400 | dim_wemb = 1024 # word embedding 401 | dim_hidden = 1024 402 | epoch = 100 403 | 404 | lr = 0.001 405 | lr_decay = 0.9 406 | decay_epoch = 333. # no lr decay if this exceeds the number of epochs. 407 | 408 | dropout = 0.4 409 | 410 | rnn_type = 'gru' 411 | layers = None # it doesn't work yet 412 | is_birnn = True #False 413 | valid_epoch = 1 # or None 414 | model_ckpt_path = '/home/seonhoon/Desktop/workspace/ImageQA/version_tensorflow/model/model.ckpt' 415 | return Config1() 416 | 417 | def main(_): 418 | 419 | 420 | is_train = False # if False then test 421 | 422 | if is_train : 423 | train() 424 | 425 | else: 426 | test() 427 | 428 | 429 | if __name__ == "__main__": 430 | tf.app.run() 431 | 432 | 433 | 434 | 435 | 436 | -------------------------------------------------------------------------------- /tensorflow_simple/web/cnn.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seankim902/imageQA/25f70ba1dc013eac6f7c734830f04663fd7b58f2/tensorflow_simple/web/cnn.pkl -------------------------------------------------------------------------------- /tensorflow_simple/web/cnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import caffe 4 | import numpy as np 5 | from scipy.misc import imread, imresize 6 | 7 | class CNN(object): 8 | 9 | def __init__(self, deploy='/home/seonhoon/Desktop/caffemodel/vgg16/VGG_ILSVRC_16_layers_deploy.prototxt', 10 | model='/home/seonhoon/Desktop/caffemodel/vgg16/VGG_ILSVRC_16_layers.caffemodel'): 11 | caffe.set_mode_gpu() 12 | self.net = caffe.Net(deploy, model, caffe.TEST) 13 | 14 | #if model.startswith('VGG'): 15 | self.mean = np.array([103.939, 116.779, 123.68]) 16 | 17 | 18 | def get_batch_features(self, in_data, net, layer): 19 | out = net.forward(blobs=[layer], **{net.inputs[0]: in_data}) 20 | features = out[layer]#.squeeze(axis=(2,3)) 21 | return features 22 | 23 | def get_features(self, filelist, layer='fc7'): 24 | 25 | N, channel, height, width = self.net.blobs[self.net.inputs[0]].data.shape 26 | n_files = len(filelist) 27 | 28 | 29 | if str(layer).startswith('fc'): 30 | feature = self.net.blobs[layer].data.shape[1] 31 | all_features = np.zeros((n_files, feature)) 32 | else : 33 | feature1, feature2, feature3 = self.net.blobs[layer].data.shape[1], self.net.blobs[layer].data.shape[2], self.net.blobs[layer].data.shape[3] 34 | all_features = np.zeros((n_files, feature1, feature2, feature3)) 35 | 36 | 37 | for i in range(0, n_files, N): 38 | in_data = np.zeros((N, channel, height, width), dtype=np.float32) 39 | 40 | batch_range = range(i, min(i+N, n_files)) 41 | batch_filelist = [filelist[j] for j in batch_range] 42 | 43 | batch_images = np.zeros((len(batch_range), 3, height, width)) 44 | for j,file in enumerate(batch_filelist): 45 | im = imread(file) 46 | if len(im.shape) == 2: 47 | im = np.tile(im[:,:,np.newaxis], (1,1,3)) 48 | im = im[:,:,(2,1,0)] # RGB -> BGR 49 | im = imresize(im, (height, width), 'bicubic') # resize first (imresize rescales its input to uint8)
50 | im = im - self.mean # mean subtraction after resizing 51 | im = np.transpose(im, (2, 0, 1)) # get channel in correct dimension 52 | batch_images[j,:,:,:] = im 53 | 54 | in_data[0:len(batch_range), :, :, :] = batch_images 55 | 56 | features = self.get_batch_features(in_data, self.net, layer) 57 | 58 | for j in range(len(batch_range)): 59 | all_features[i+j,:] = features[j,:] 60 | 61 | #print 'Done %d/%d files' % (i+len(batch_range), len(filelist)) 62 | 63 | return all_features 64 | -------------------------------------------------------------------------------- /tensorflow_simple/web/cnn.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seankim902/imageQA/25f70ba1dc013eac6f7c734830f04663fd7b58f2/tensorflow_simple/web/cnn.pyc -------------------------------------------------------------------------------- /tensorflow_simple/web/cnn4web.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from cnn import CNN 4 | import pandas as pd 5 | 6 | 7 | cnn = CNN() 8 | data=pd.DataFrame() 9 | 10 | data['cnn_feature']=0 11 | data['cnn_feature']=data['cnn_feature'].astype(object) 12 | 13 | featurelist = cnn.get_features(['/home/seonhoon/Desktop/workspace/ImageQA/version_tensorflow/web/images/moodo.jpg'], layer='fc7') 14 | data.loc[0,'cnn_feature']=featurelist[0].flatten() 15 | 16 | 17 | data.to_pickle('/home/seonhoon/Desktop/workspace/ImageQA/version_tensorflow/web/cnn.pkl') 18 | -------------------------------------------------------------------------------- /tensorflow_simple/web/imageQA_tensorflow.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import time 5 | import pandas as pd 6 | import numpy as np 7 | 8 | from keras.utils import np_utils 9 | 10 | import tensorflow.python.platform 11 | import tensorflow as tf 12 | from tensorflow.models.rnn import rnn_cell 13 | 14 | 15 | # To do 16 | ''' 17 | last mini-batch size issue 18 | 19 | 20 | ''' 21 | 22 | def prepare_data(seqs_x, maxlen=None): 23 | lengths_x = [len(s) for s in seqs_x] 24 | 25 | n_samples = len(seqs_x) 26 | if maxlen is None: 27 | maxlen = np.max(lengths_x) + 1 28 | 29 | x = np.zeros((n_samples, maxlen)).astype('int32') 30 | x_mask = np.zeros((n_samples, maxlen)).astype('float32') 31 | 32 | for idx, s_x in enumerate(seqs_x): 33 | x[idx, :lengths_x[idx]] = s_x 34 | x_mask[idx, :lengths_x[idx]+1] = 1. # Adding 1, for image 35 | 36 | return x, x_mask 37 | 38 | 39 | def get_minibatch_indices(n, batch_size, shuffle=False): 40 | 41 | idx_list = np.arange(n, dtype="int32") 42 | 43 | if shuffle: 44 | np.random.shuffle(idx_list) 45 | 46 | minibatches = [] 47 | minibatch_start = 0 48 | for i in range(n // batch_size): 49 | minibatches.append(idx_list[minibatch_start: 50 | minibatch_start + batch_size]) 51 | minibatch_start += batch_size 52 | # if (minibatch_start != n): # last mini-batch issue !!! 
53 | # minibatches.append(idx_list[minibatch_start:]) 54 | return minibatches 55 | 56 | 57 | 58 | class ImageQA(object): 59 | 60 | def __init__(self, config): 61 | 62 | self.config = config 63 | 64 | self.vocab_size = vocab_size = config.vocab_size 65 | self.y_size = y_size = config.y_size 66 | 67 | self.batch_size = batch_size = config.batch_size 68 | self.steps = config.steps 69 | 70 | self.layers = layers = config.layers 71 | 72 | self.dim_ictx = dim_ictx = config.dim_ictx 73 | self.dim_iemb = dim_iemb = config.dim_iemb 74 | self.dim_wemb = dim_wemb = config.dim_wemb 75 | self.dim_hidden = dim_hidden = config.dim_hidden 76 | 77 | self.lr = tf.Variable(config.lr, trainable=False) 78 | 79 | rnn_type = config.rnn_type 80 | if rnn_type == 'gru': 81 | rnn_ = rnn_cell.GRUCell(dim_hidden) 82 | elif rnn_type == 'lstm': 83 | rnn_ = rnn_cell.BasicLSTMCell(dim_hidden) 84 | 85 | if layers is not None: 86 | self.my_rnn = my_rnn = rnn_cell.MultiRNNCell([rnn_] * layers) 87 | self.init_state = my_rnn.zero_state(batch_size, tf.float32) 88 | else: 89 | self.my_rnn = my_rnn = rnn_ 90 | self.init_state = tf.zeros([batch_size, my_rnn.state_size]) 91 | 92 | self.W_iemb = tf.get_variable("W_iemb", [dim_ictx, dim_iemb]) 93 | self.b_iemb = tf.get_variable("b_iemb", [dim_iemb]) 94 | with tf.device("/cpu:0"): 95 | self.W_wemb = tf.get_variable("W_wemb", [vocab_size, dim_wemb]) 96 | 97 | if config.is_birnn : # concat seems to work better than add.. 98 | self.W_pred = tf.get_variable("W_pred", [dim_hidden * 2, y_size]) 99 | else : 100 | self.W_pred = tf.get_variable("W_pred", [dim_hidden, y_size]) 101 | 102 | self.b_pred = tf.get_variable("b_pred", [y_size]) 103 | 104 | 105 | def build_model(self): 106 | 107 | x = tf.placeholder(tf.int32, [self.batch_size, self.steps]) 108 | x_mask = tf.placeholder(tf.float32, [self.batch_size, self.steps]) 109 | y = tf.placeholder(tf.float32, [self.batch_size, self.y_size]) 110 | img = tf.placeholder(tf.float32, [self.batch_size, self.dim_ictx]) 111 | 112 | 113 | 114 | with tf.device("/cpu:0"): 115 | inputs = tf.split(1, self.steps, tf.nn.embedding_lookup(self.W_wemb, x)) 116 | # sample * steps * dim -> split -> sample * 1 * dim 117 | inputs = [tf.squeeze(input_, [1]) for input_ in inputs] 118 | # [sample * dim, sample * dim, sample * dim, ... ] 119 | img_emb = tf.nn.xw_plus_b(img, self.W_iemb, self.b_iemb) 120 | inputs = [img_emb]+inputs[:-1] # -1 is for img 121 | 122 | 123 | 124 | hiddens = [] 125 | states = [] 126 | 127 | 128 | state = self.init_state 129 | with tf.variable_scope("RNN", reuse=None): 130 | for i in range(len(inputs)): 131 | if i == 0: 132 | (hidden, state) = self.my_rnn(inputs[i], state) 133 | else: 134 | m = x_mask[:, i] 135 | 136 | tf.get_variable_scope().reuse_variables() 137 | (prev_hidden, prev_state) = (hidden, state) # for masking 138 | (hidden, state) = self.my_rnn(inputs[i], state) 139 | 140 | m_1 = tf.expand_dims(m,1) 141 | m_1 = tf.tile(m_1, [1, self.dim_hidden]) 142 | m_0 = tf.expand_dims(1. 
- m,1) 143 | m_0 = tf.tile(m_0, [1, self.dim_hidden]) 144 | hidden = tf.add(tf.mul(m_1, hidden), tf.mul(m_0, prev_hidden)) 145 | state = tf.add(tf.mul(m_1, state), tf.mul(m_0, prev_state)) 146 | hiddens.append(hidden) 147 | states.append(state) 148 | 149 | if self.config.is_birnn : 150 | rhiddens = [] 151 | rstates = [] 152 | rx_mask = tf.reverse(x_mask,[False, True]) 153 | rinputs = inputs[::-1] 154 | state = self.init_state 155 | with tf.variable_scope("rRNN", reuse=None): 156 | for i in range(len(rinputs)): 157 | if i == 0: 158 | (hidden, state) = self.my_rnn(rinputs[i], state) # feed the reversed inputs here as well 159 | else: 160 | m = rx_mask[:, i] 161 | 162 | tf.get_variable_scope().reuse_variables() 163 | (prev_hidden, prev_state) = (hidden, state) # for masking 164 | (hidden, state) = self.my_rnn(rinputs[i], state) 165 | m_1 = tf.expand_dims(m,1) 166 | m_1 = tf.tile(m_1, [1, self.dim_hidden]) 167 | m_0 = tf.expand_dims(1. - m,1) 168 | m_0 = tf.tile(m_0, [1, self.dim_hidden]) 169 | hidden = tf.add(tf.mul(m_1, hidden), tf.mul(m_0, prev_hidden)) 170 | state = tf.add(tf.mul(m_1, state), tf.mul(m_0, prev_state)) 171 | rhiddens.append(hidden) 172 | rstates.append(state) 173 | 174 | 175 | hiddens = tf.concat(2, [tf.pack(hiddens), tf.pack(rhiddens[::-1])]) 176 | #hiddens = tf.add(tf.pack(hiddens), tf.pack(rhiddens[::-1])) 177 | hiddens = tf.unpack(hiddens) 178 | 179 | ''' 180 | mean or last hidden -> logit_hidden : sample * dim 181 | ''' 182 | # 1. last hidden 183 | #logit_hidden = hiddens[-1] #tf.reduce_mean(hiddens, 0) 184 | # 2. mean of hiddens 185 | x_mask_t = tf.transpose(x_mask) 186 | x_mask_t_denom = tf.expand_dims(tf.reduce_sum(x_mask_t, 0), 1) 187 | x_mask_t_denom = tf.tile(x_mask_t_denom, [1, self.W_pred.get_shape().dims[0].value]) 188 | x_mask_t = tf.expand_dims(x_mask_t, 2) 189 | x_mask_t = tf.tile(x_mask_t, [1, 1, self.W_pred.get_shape().dims[0].value]) 190 | hiddens = tf.pack(hiddens) 191 | logit_hidden = tf.reduce_sum(tf.mul(hiddens, x_mask_t), 0) 192 | logit_hidden = tf.div(logit_hidden, x_mask_t_denom) 193 | 194 | 195 | if self.config.dropout is not None: 196 | logit_hidden = tf.nn.dropout(logit_hidden, self.config.dropout) 197 | 198 | 199 | logits = tf.nn.xw_plus_b(logit_hidden, self.W_pred, self.b_pred) 200 | 201 | probs = tf.nn.softmax(logits) 202 | prediction = tf.argmax(probs, 1) 203 | correct_prediction = tf.equal(prediction, tf.argmax(y,1)) 204 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 205 | cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, y) 206 | loss = tf.reduce_mean(cross_entropy) 207 | train_op = tf.train.AdamOptimizer(self.lr).minimize(loss) 208 | return x, x_mask, y, img, loss, train_op, accuracy, prediction 209 | 210 | 211 | def train(): 212 | 213 | 214 | config = get_config() 215 | 216 | os.chdir('/home/seonhoon/Desktop/workspace/ImageQA/data/') 217 | train = pd.read_pickle('train_vgg.pkl') 218 | train_x = [ q for q in train['q'] ] 219 | train_y = [ a[0] for a in train['a'] ] 220 | train_y = np.array(train_y)[:,None] 221 | train_y = np_utils.to_categorical(train_y, config.y_size).astype('float32') 222 | train_x , train_x_mask = prepare_data(train_x, config.steps) 223 | train_x_img = np.array([ img.tolist() for img in train['cnn_feature'] ]).astype('float32') 224 | 225 | n_train = len(train_x) 226 | 227 | print 'train_x :', train_x.shape 228 | print 'train_x_mask :', train_x_mask.shape 229 | print 'train_x_img :', train_x_img.shape 230 | print 'train_y :',train_y.shape 231 | 232 | if config.valid_epoch is not None: 233 | 
valid=pd.read_pickle('test_vgg.pkl') 234 | valid_x=[ q for q in valid['q'] ] 235 | valid_y=[ a[0] for a in valid['a'] ] 236 | valid_y=np.array(valid_y)[:,None] 237 | valid_y = np_utils.to_categorical(valid_y, config.y_size).astype('float32') 238 | valid_x , valid_x_mask = prepare_data(valid_x, config.steps) 239 | valid_x_img = np.array([ img.tolist() for img in valid['cnn_feature'] ]).astype('float32') 240 | n_valid = len(valid_x) 241 | valid_batch_indices=get_minibatch_indices(n_valid, config.batch_size, shuffle=False) 242 | 243 | print 'valid_x :', valid_x.shape 244 | print 'valid_x_mask :', valid_x_mask.shape 245 | print 'valid_x_img :', valid_x_img.shape 246 | print 'valid_y :',valid_y.shape 247 | 248 | 249 | 250 | with tf.Session() as sess: 251 | 252 | 253 | initializer = tf.random_normal_initializer(0, 0.1) 254 | with tf.variable_scope("model", reuse=None, initializer=initializer): 255 | model = ImageQA(config = config) 256 | 257 | x, x_mask, y, img, loss, train_op, accuracy, prediction = model.build_model() 258 | 259 | saver = tf.train.Saver() 260 | sess.run(tf.initialize_all_variables()) 261 | 262 | for i in range(config.epoch): 263 | start = time.time() 264 | lr_decay = config.lr_decay ** max(i - config.decay_epoch, 0.0) 265 | sess.run(tf.assign(model.lr, config.lr * lr_decay)) 266 | 267 | 268 | batch_indices=get_minibatch_indices(n_train, config.batch_size, shuffle=True) 269 | 270 | preds = [] 271 | for j, indices in enumerate(batch_indices): 272 | 273 | x_ = np.array([ train_x[k,:] for k in indices]) 274 | x_mask_ = np.array([ train_x_mask[k,:] for k in indices]) 275 | y_ = np.array([ train_y[k,:] for k in indices]) 276 | img_ = np.array([ train_x_img[k,:] for k in indices]) 277 | 278 | 279 | 280 | cost, _, acc, pred = sess.run([loss, train_op, accuracy, prediction], 281 | {x: x_, 282 | x_mask: x_mask_, 283 | y: y_, 284 | img : img_}) 285 | preds = preds + pred.tolist() 286 | if j % 99 == 0 : 287 | print 'cost : ', cost, ', accuracy : ', acc, ', iter : ', j+1, ' in epoch : ',i+1 288 | print 'cost : ', cost, ', accuracy : ', acc, ', iter : ', j+1, ' in epoch : ',i+1,' elapsed time : ', int(time.time()-start) 289 | if config.valid_epoch is not None: # for validation 290 | best_accuracy = 0. 
291 | 292 | if (i+1) % config.valid_epoch == 0: 293 | val_preds = [] 294 | for j, indices in enumerate(valid_batch_indices): 295 | x_ = np.array([ valid_x[k,:] for k in indices]) 296 | x_mask_ = np.array([ valid_x_mask[k,:] for k in indices]) 297 | y_ = np.array([ valid_y[k,:] for k in indices]) 298 | img_ = np.array([ valid_x_img[k,:] for k in indices]) 299 | 300 | pred = sess.run(prediction, 301 | {x: x_, 302 | x_mask: x_mask_, 303 | y: y_, 304 | img : img_}) 305 | 306 | val_preds = val_preds + pred.tolist() 307 | valid_acc = np.mean(np.equal(val_preds, np.argmax(valid_y,1))) 308 | print '##### valid accuracy : ', valid_acc, ' after epoch ', i+1 309 | if valid_acc > best_accuracy and i >= 10: 310 | best_accuracy = valid_acc 311 | saver.save(sess, config.model_ckpt_path, global_step=int(best_accuracy*100)) 312 | 313 | 314 | 315 | def test(): 316 | 317 | config = get_config() 318 | 319 | os.chdir('/home/seonhoon/Desktop/workspace/ImageQA/data/') 320 | 321 | test=pd.read_pickle('test_vgg.pkl') 322 | test_x=[ q for q in test['q'] ] 323 | test_y=[ a[0] for a in test['a'] ] 324 | test_y=np.array(test_y)[:,None] 325 | test_y = np_utils.to_categorical(test_y, config.y_size).astype('float32') 326 | test_x , test_x_mask = prepare_data(test_x, config.steps) 327 | test_x_img = np.array([ img.tolist() for img in test['cnn_feature'] ]).astype('float32') 328 | n_test = len(test_x) 329 | test_batch_indices=get_minibatch_indices(n_test, config.batch_size, shuffle=False) 330 | 331 | 332 | 333 | with tf.Session() as sess: 334 | 335 | 336 | with tf.variable_scope("model", reuse=None): 337 | model = ImageQA(config = config) 338 | 339 | x, x_mask, y, img, _, _, _, prediction = model.build_model() 340 | saver = tf.train.Saver() 341 | ckpt = tf.train.get_checkpoint_state(os.path.dirname(config.model_ckpt_path)) 342 | sess.run(tf.initialize_all_variables()) 343 | saver.restore(sess, ckpt.model_checkpoint_path) 344 | test_preds = [] 345 | for j, indices in enumerate(test_batch_indices): 346 | x_ = np.array([test_x[k,:] for k in indices]) 347 | x_mask_ = np.array([ test_x_mask[k,:] for k in indices]) 348 | y_ = np.array([ test_y[k,:] for k in indices]) 349 | img_ = np.array([ test_x_img[k,:] for k in indices]) 350 | 351 | pred = sess.run(prediction, 352 | {x: x_, 353 | x_mask: x_mask_, 354 | y: y_, 355 | img : img_}) 356 | test_preds = test_preds + pred.tolist() 357 | 358 | 359 | test_acc = np.mean(np.equal(test_preds, np.argmax(test_y,1))) 360 | print 'test accuracy :', test_acc 361 | 362 | def test_sample(test_x, test_x_maks, test_x_img): 363 | 364 | config = get_config() 365 | config.batch_size = 1 366 | 367 | with tf.Session() as sess: 368 | with tf.variable_scope("model", reuse=None): 369 | model = ImageQA(config = config) 370 | 371 | x, x_mask, _, img, _, _, _, prediction = model.build_model() 372 | saver = tf.train.Saver() 373 | ckpt = tf.train.get_checkpoint_state(os.path.dirname(config.model_ckpt_path)) 374 | saver.restore(sess, ckpt.model_checkpoint_path) 375 | 376 | x_ = test_x 377 | x_mask_ = test_x_maks 378 | img_ = test_x_img 379 | 380 | pred = sess.run(prediction, 381 | {x: x_, 382 | x_mask: x_mask_, 383 | img : img_}) 384 | 385 | return pred 386 | 387 | 388 | 389 | 390 | def get_config(): 391 | class Config1(object): 392 | vocab_size = 12047 393 | y_size = 430 394 | batch_size = 364 #703 #s28 395 | steps = 60 396 | 397 | dim_ictx = 4096 398 | dim_iemb = 1024 # image embedding 399 | dim_wemb = 1024 # word embedding 400 | dim_hidden = 1024 401 | epoch = 100 402 | 403 | lr = 0.001 404 | lr_decay = 0.9 
405 | decay_epoch = 333. # no lr decay if this exceeds the number of epochs. 406 | 407 | dropout = 0.4 408 | 409 | rnn_type = 'gru' 410 | layers = None # doesn't work yet 411 | is_birnn = True #False 412 | valid_epoch = 1 # or None 413 | model_ckpt_path = '/home/seonhoon/Desktop/workspace/ImageQA/version_tensorflow/model/model.ckpt' 414 | return Config1() 415 | 416 | def main(_): 417 | 418 | 419 | is_train = False # if False then test 420 | 421 | if is_train : 422 | train() 423 | 424 | else: 425 | test() 426 | 427 | 428 | if __name__ == "__main__": 429 | tf.app.run() 430 | 431 | 432 | 433 | 434 | 435 | -------------------------------------------------------------------------------- /tensorflow_simple/web/imageQA_tensorflow.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seankim902/imageQA/25f70ba1dc013eac6f7c734830f04663fd7b58f2/tensorflow_simple/web/imageQA_tensorflow.pyc -------------------------------------------------------------------------------- /tensorflow_simple/web/images/moodo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seankim902/imageQA/25f70ba1dc013eac6f7c734830f04663fd7b58f2/tensorflow_simple/web/images/moodo.jpg -------------------------------------------------------------------------------- /tensorflow_simple/web/server.py: -------------------------------------------------------------------------------- 1 | 2 | from flask import Flask 3 | from flask import request 4 | from flask import render_template 5 | 6 | import cPickle 7 | import pandas as pd 8 | import numpy as np 9 | import sys 10 | import os 11 | 12 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | 14 | from imageQA_tensorflow import * 15 | 16 | app = Flask(__name__, static_url_path = "/images", static_folder='images') 17 | 18 | config = get_config() 19 | 20 | 21 | # http://127.0.0.1:5000/ 22 | @app.route('/') 23 | def my_form(): 24 | return render_template('iqa.html') 25 | 26 | 27 | @app.route('/', methods=['POST']) 28 | def my_form_post(answer=None): 29 | 30 | filename='/home/seonhoon/Desktop/workspace/ImageQA/data/dict.pkl' 31 | 32 | with open(filename, 'rb') as fp: 33 | idx2word, word2idx, idx2answer, answer2idx = cPickle.load(fp) 34 | 35 | text = request.form['text'] 36 | print text 37 | 38 | question=text.split() 39 | 40 | q_idx=[] 41 | for i in range(len(question)): 42 | q_idx.append(word2idx[question[i]]) 43 | q_idx=np.array(q_idx) 44 | 45 | print q_idx 46 | 47 | #running caffe and tensorflow seems not so easy simultaneously 48 | pd.read_pickle('/home/seonhoon/Desktop/workspace/ImageQA_Web/cnn.pkl') 49 | x_img = np.array([pd.read_pickle('/home/seonhoon/Desktop/workspace/ImageQA_Web/cnn.pkl')['cnn_feature'][0].tolist()]) 50 | 51 | 52 | 53 | x , x_mask = prepare_data([q_idx], config.steps) 54 | 55 | 56 | y = test_sample(x, x_mask, x_img) 57 | 58 | print idx2answer[y[0]] 59 | 60 | params = {'answer' : idx2answer[y[0]], 'text' : text} 61 | 62 | return render_template('iqa.html', **params) 63 | 64 | 65 | if __name__ == '__main__': 66 | app.run() 67 | -------------------------------------------------------------------------------- /tensorflow_simple/web/templates/iqa.html: -------------------------------------------------------------------------------- 1 | <!DOCTYPE html> 2 | <html> 3 | <head> 4 | 5 | 6 | 7 | 8 | <title>Image Question Answering</title> 9 | </head> 10 | 11 | <body> 12 | 

13 | <img src="/images/moodo.jpg" width="320">
14 | 
15 | <form method="post" action="/">
16 | 
17 | <input type="text" name="text" value="{{ text }}" size="60">
18 | 
19 | <input type="submit" value="ask">
20 | 
21 | </form>
22 | 
23 | 
24 | 
25 | {% if answer %}
26 | the answer is {{answer}}
27 | {% endif %} 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /theano_attention/cnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import caffe 5 | import numpy as np 6 | from scipy.misc import imread, imresize 7 | 8 | class CNN(object): 9 | 10 | def __init__(self, deploy='VGG_ILSVRC_16_layers_deploy.prototxt', model='VGG_ILSVRC_16_layers.caffemodel'): 11 | caffe.set_mode_gpu() 12 | self.net = caffe.Net(deploy, model, caffe.TEST) 13 | 14 | #if model.startswith('VGG'): 15 | self.mean = np.array([103.939, 116.779, 123.68]) 16 | 17 | 18 | def get_batch_features(self, in_data, net, layer): 19 | out = net.forward(blobs=[layer], **{net.inputs[0]: in_data}) 20 | features = out[layer]#.squeeze(axis=(2,3)) 21 | return features 22 | 23 | def get_features(self, filelist, layer='fc7'): 24 | 25 | N, channel, height, width = self.net.blobs[self.net.inputs[0]].data.shape 26 | n_files = len(filelist) 27 | 28 | 29 | if str(layer).startswith('fc'): 30 | feature = self.net.blobs[layer].data.shape[1] 31 | all_features = np.zeros((n_files, feature)) 32 | else : 33 | feature1, feature2, feature3 = self.net.blobs[layer].data.shape[1], self.net.blobs[layer].data.shape[2], self.net.blobs[layer].data.shape[3] 34 | all_features = np.zeros((n_files, feature1, feature2, feature3)) 35 | 36 | 37 | for i in range(0, n_files, N): 38 | in_data = np.zeros((N, channel, height, width), dtype=np.float32) 39 | 40 | batch_range = range(i, min(i+N, n_files)) 41 | batch_filelist = [filelist[j] for j in batch_range] 42 | 43 | batch_images = np.zeros((len(batch_range), 3, height, width)) 44 | for j,file in enumerate(batch_filelist): 45 | im = imread(file) 46 | if len(im.shape) == 2: 47 | im = np.tile(im[:,:,np.newaxis], (1,1,3)) 48 | im = im[:,:,(2,1,0)] # RGB -> BGR 49 | im = imresize(im, (height, width), 'bicubic') # resize first (imresize rescales its input to uint8) 50 | im = im - self.mean # mean subtraction after resizing 51 | im = np.transpose(im, (2, 0, 1)) # get channel in correct dimension 52 | batch_images[j,:,:,:] = im 53 | 54 | in_data[0:len(batch_range), :, :, :] = batch_images 55 | 56 | features = self.get_batch_features(in_data, self.net, layer) 57 | 58 | for j in range(len(batch_range)): 59 | all_features[i+j,:] = features[j,:] 60 | 61 | #print 'Done %d/%d files' % (i+len(batch_range), len(filelist)) 62 | 63 | return all_features 64 | -------------------------------------------------------------------------------- /theano_attention/cnn.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seankim902/imageQA/25f70ba1dc013eac6f7c734830f04663fd7b58f2/theano_attention/cnn.pyc -------------------------------------------------------------------------------- /theano_attention/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | from models import RNN_GRU, RNN_LSTM, BIRNN_GRU 8 | from keras.utils import np_utils 9 | 10 | from sklearn.cross_validation import train_test_split 11 | 12 | 13 | def prepare_data(seqs_x, maxlen=None): 14 | lengths_x = [len(s) for s in seqs_x] 15 | 16 | n_samples = len(seqs_x) 17 | if maxlen is None: 18 | maxlen = np.max(lengths_x) + 1 19 | 20 | x = np.zeros((maxlen, n_samples)).astype('int32') 21 | x_mask = np.zeros((maxlen, n_samples)).astype('float32') 22 | 23 | for idx, s_x in enumerate(seqs_x): 24 | 
x[:lengths_x[idx],idx] = s_x 25 | #x_mask[:lengths_x[idx],idx] = 1. 26 | x_mask[:lengths_x[idx],idx] = 1. 27 | 28 | return x, x_mask 29 | 30 | 31 | 32 | def main(): 33 | 34 | 35 | 36 | 37 | 38 | os.chdir('/home/seonhoon/Desktop/workspace/ImageQA/data/') 39 | 40 | n_vocab = 12047 41 | y_vocab = 430 42 | dim_word = 1024 43 | dim = 1024 44 | dim_ctx = 512 45 | maxlen = 60 46 | 47 | train=pd.read_pickle('train.pkl') 48 | # train, valid = train_test_split(train, test_size=0.2) 49 | valid=pd.read_pickle('test.pkl') 50 | 51 | train_x=[ q for q in train['q'] ] 52 | train_y=[ a[0] for a in train['a'] ] 53 | train_y=np.array(train_y)[:,None] 54 | train_y = np_utils.to_categorical(train_y, y_vocab).astype('int32') 55 | train_x , train_x_mask = prepare_data(train_x, maxlen) 56 | 57 | 58 | img_folder='/home/seonhoon/Desktop/workspace/ImageQA/data/images/' 59 | train_imgs = train['img_id'].apply(lambda x : img_folder+'train/'+x+'.jpg') 60 | train_imgs = train_imgs.tolist() 61 | 62 | valid_x=[ q for q in valid['q'] ] 63 | valid_y=[ a[0] for a in valid['a'] ] 64 | valid_x , valid_x_mask = prepare_data(valid_x, maxlen) 65 | 66 | valid_imgs = valid['img_id'].apply(lambda x : img_folder+'test/'+x+'.jpg') 67 | valid_imgs = valid_imgs.tolist() 68 | 69 | print 'train x :', train_x.shape 70 | print 'train x_mask:', train_x_mask.shape 71 | print 'train y : ', train_y.shape 72 | print 'train imgs :', len(train_imgs) 73 | 74 | print 'valid x :', valid_x.shape 75 | print 'valid x_mask:', valid_x_mask.shape 76 | #print 'valid y : ', valid_y.shape 77 | print 'valid imgs :', len(valid_imgs) 78 | 79 | model = BIRNN_GRU(n_vocab, y_vocab, dim_word, dim, dim_ctx) 80 | 81 | model.train(train_x, train_x_mask, train_imgs, train_y, 82 | valid_x, valid_x_mask, valid_imgs, valid_y, valid=2, 83 | lr=0.008, dropout=0.4, batch_size=512, epoch=60, save=4) 84 | 85 | if __name__ == '__main__': 86 | main() 87 | 88 | -------------------------------------------------------------------------------- /theano_attention/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import theano 3 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 4 | import theano.tensor as T 5 | from keras import initializations 6 | 7 | from cnn import * 8 | 9 | import optimizer 10 | import cPickle 11 | import time 12 | import numpy as np 13 | 14 | 15 | 16 | 17 | 18 | def save_tparams(tparams, path): 19 | with open(path,'wb') as f: 20 | for params in tparams: 21 | cPickle.dump(params.get_value(), f, protocol=cPickle.HIGHEST_PROTOCOL) 22 | 23 | def load_tparams(tparams, path): 24 | with open(path,'rb') as f: 25 | for i in range(len(tparams)): 26 | tparams[i].set_value(cPickle.load(f)) 27 | return tparams 28 | 29 | def get_minibatch_indices(n, batch_size, shuffle=False): 30 | 31 | idx_list = np.arange(n, dtype="int32") 32 | 33 | if shuffle: 34 | np.random.shuffle(idx_list) 35 | 36 | minibatches = [] 37 | minibatch_start = 0 38 | for i in range(n // batch_size): 39 | minibatches.append(idx_list[minibatch_start: 40 | minibatch_start + batch_size]) 41 | minibatch_start += batch_size 42 | if (minibatch_start != n): 43 | minibatches.append(idx_list[minibatch_start:]) 44 | return minibatches 45 | 46 | def get_minibatch_cnn_features(cnn, filelist) : 47 | minibatch_size = len(filelist) 48 | cnn = cnn 49 | featurelist = cnn.get_features(filelist, layer='conv5_4') # sample * 512 * 14 * 14 50 | featurelist = featurelist.reshape(minibatch_size, 512, -1).swapaxes(1, 2) # sample * 512 * 196 
-> sample * 196 * 512 51 | featurelist = np.array(featurelist, dtype='float32') 52 | return featurelist 53 | 54 | 55 | def dropout_layer(state_before, use_noise, trng, p): 56 | ratio = 1. - p 57 | proj = T.switch(use_noise, 58 | state_before * trng.binomial(state_before.shape, p=ratio, n=1, dtype=state_before.dtype), # for training.. 59 | state_before * ratio) 60 | return proj 61 | 62 | 63 | class RNN_GRU: 64 | def __init__(self, n_vocab, y_vocab, dim_word, dim, dim_ctx): 65 | 66 | self.n_vocab = n_vocab # 12047 67 | self.y_vocab = y_vocab # 430 68 | self.dim_word = dim_word # 1024 69 | self.dim = dim # 1024 70 | self.dim_ctx = dim_ctx # 512 71 | 72 | ### initial context 73 | self.W_ctx_init = initializations.uniform((self.dim_ctx, self.dim)) 74 | self.b_ctx_init = initializations.zero((self.dim)) 75 | 76 | 77 | ### forward : img_dim to context 78 | self.W_ctx_att = initializations.uniform((self.dim_ctx, self.dim_ctx)) 79 | self.b_ctx_att = initializations.zero((self.dim_ctx)) 80 | 81 | ### forward : hidden_dim to context 82 | self.W_dim_att = initializations.uniform((self.dim, self.dim_ctx)) 83 | 84 | ### context energy 85 | self.U_att = initializations.uniform((self.dim_ctx, 1)) 86 | self.c_att = initializations.zero((1)) 87 | 88 | 89 | 90 | ### Word Embedding ### 91 | self.W_emb = initializations.uniform((self.n_vocab, self.dim_word)) 92 | 93 | ### enc forward GRU ### 94 | self.W_gru_ctx = initializations.uniform((self.dim_word, self.dim_ctx)) 95 | self.b_gru_ctx = initializations.zero((self.dim_ctx)) 96 | 97 | 98 | self.W_gru = initializations.uniform((self.dim_word, self.dim * 2)) 99 | self.U_gru = initializations.uniform((self.dim, self.dim * 2)) 100 | self.b_gru = initializations.zero((self.dim * 2)) 101 | self.U_gru_ctx = initializations.uniform((self.dim_ctx, self.dim * 2)) 102 | 103 | self.W_gru_cdd = initializations.uniform((self.dim_word, self.dim)) # cdd : candidate 104 | self.U_gru_cdd = initializations.uniform((self.dim, self.dim)) 105 | self.b_gru_cdd = initializations.zero((self.dim)) 106 | self.U_gru_cdd_ctx = initializations.uniform((self.dim_ctx, self.dim)) 107 | 108 | ### prediction ### 109 | self.W_pred = initializations.uniform((self.dim, self.y_vocab)) 110 | self.b_pred = initializations.zero((self.y_vocab)) 111 | 112 | 113 | self.params = [self.W_ctx_init, self.b_ctx_init, 114 | self.W_ctx_att, self.b_ctx_att, 115 | self.W_dim_att, 116 | self.U_att, self.c_att, 117 | self.W_emb, 118 | self.W_gru_ctx, self.b_gru_ctx, 119 | self.W_gru, self.U_gru, self.b_gru, self.U_gru_ctx, 120 | self.W_gru_cdd, self.U_gru_cdd, self.b_gru_cdd, self.U_gru_cdd_ctx, 121 | self.W_pred, self.b_pred] 122 | 123 | def gru_layer(self, state_below, mask=None, context=None, init_state=None): 124 | #state_below : step * sample * dim 125 | nsteps = state_below.shape[0] 126 | n_samples = state_below.shape[1] 127 | 128 | dim = self.dim 129 | 130 | def _slice(_x, n, dim): 131 | if _x.ndim == 3: 132 | print '_x.ndim : ' , _x.ndim 133 | return _x[:, :, n*dim:(n+1)*dim] 134 | return _x[:, n*dim:(n+1)*dim] 135 | 136 | if init_state is None : 137 | init_state = T.alloc(0., n_samples, dim) 138 | 139 | # context forward 140 | proj_ctx = T.dot(context, self.W_ctx_att) + self.b_ctx_att 141 | #context : sample * annotation * dim_ctx 142 | 143 | # step * samples * dim 144 | state_below_ = T.dot(state_below, self.W_gru) + self.b_gru 145 | state_belowx = T.dot(state_below, self.W_gru_cdd) + self.b_gru_cdd 146 | state_belowc = T.dot(state_below, self.W_gru_ctx) + self.b_gru_ctx 147 | 148 | def _step(m_, xc, x_, 
xx_, h_, a_, as_, proj_ctx):
149 |             '''
150 |             m_ : (samples,)
151 |             x_, h_ : samples * dimensions
152 |             '''
153 | 
154 |             pstate_ = T.dot(h_, self.W_dim_att) # sample * dim_ctx
155 |             proj_ctx_ = proj_ctx + pstate_[:, None, :] # sample * annotation * dim_ctx
156 |             proj_ctx_ = proj_ctx_ + xc[:, None, :]
157 |             # TODO: revisit the x-to-ctx term later. -- state_belowc
158 | 
159 | 
160 |             proj_ctx_ = T.tanh(proj_ctx_)
161 | 
162 |             alpha = T.dot(proj_ctx_, self.U_att)+self.c_att
163 |             alpha = T.nnet.softmax(alpha.reshape([alpha.shape[0], alpha.shape[1]])) # softmax
164 |             # alpha = sample * annotation(prob)
165 |             ctx_ = (context * alpha[:,:,None]).sum(1) # sample * expected_dim_ctx
166 |             alpha_sample = alpha # you can return something else reasonable here to debug
167 | 
168 | 
169 |             preact = T.dot(h_, self.U_gru)
170 |             preact += x_ # samples * 1024
171 |             preact += T.dot(ctx_, self.U_gru_ctx)
172 | 
173 |             r = T.nnet.sigmoid(_slice(preact, 0, dim))
174 |             u = T.nnet.sigmoid(_slice(preact, 1, dim))
175 | 
176 |             preactx = T.dot(h_, self.U_gru_cdd)
177 |             preactx *= r
178 |             preactx += xx_ # samples * 512
179 |             preactx += T.dot(ctx_, self.U_gru_cdd_ctx)
180 | 
181 |             h = T.tanh(preactx)
182 | 
183 |             h = u * h_ + (1. - u) * h
184 |             h = m_[:,None] * h + (1. - m_)[:,None] * h_ # m_[:,None] : samples * 1
185 | 
186 |             return [h, alpha, alpha_sample, ctx_]#, r, u, preact, preactx
187 | 
188 |         seqs = [mask, state_belowc, state_below_, state_belowx]
189 |         outputs_info = [init_state,
190 |                         T.alloc(0., n_samples, proj_ctx.shape[1]),
191 |                         T.alloc(0., n_samples, proj_ctx.shape[1]),
192 |                         None]
193 |         rval, updates = theano.scan(_step,
194 |                                     sequences=seqs,
195 |                                     outputs_info = outputs_info,
196 |                                     non_sequences = [proj_ctx],
197 |                                     name='gru_layer',
198 |                                     n_steps=nsteps)
199 |         return rval
200 | 
201 | 
202 |     def build_model(self, lr=0.001, dropout=None):
203 | 
204 |         trng = RandomStreams(1234)
205 |         use_noise = theano.shared(np.float32(0.))
206 | 
207 |         # description string: #words x #samples
208 | 
209 | 
210 |         x = T.matrix('x', dtype = 'int32') # step * samples
211 |         x_mask = T.matrix('x_mask', dtype='float32') # step * samples
212 |         y = T.matrix('y', dtype = 'int32') # sample * emb
213 |         ctx = T.tensor3('ctx', dtype = 'float32') # sample * annotation * dim
214 | 
215 |         n_timesteps = x.shape[0]
216 |         n_samples = x.shape[1]
217 | 
218 | 
219 |         emb = self.W_emb[x.flatten()]
220 | 
221 |         emb = emb.reshape([n_timesteps, n_samples, self.dim_word])
222 | 
223 |         ctx0 = ctx
224 |         ctx_mean = ctx0.mean(1)
225 | 
226 |         init_state = T.dot(ctx_mean, self.W_ctx_init) + self.b_ctx_init
227 | 
228 | 
229 |         # proj : the sequence of GRU hidden states
230 |         proj = self.gru_layer(emb, mask=x_mask, context=ctx, init_state=init_state)
231 |         proj_h = proj[0]
232 | 
233 |         # masked mean over the hidden states
234 |         proj_h = (proj_h * x_mask[:, :, None]).sum(axis=0)
235 |         proj_h = proj_h / x_mask.sum(axis=0)[:, None] # sample * dim
236 | 
237 |         # last hidden state (alternative)
238 |         #proj = proj[-1] # sample * dim
239 | 
240 |         if dropout is not None :
241 |             proj_h = dropout_layer(proj_h, use_noise, trng, dropout)
242 | 
243 | 
244 | 
245 |         output = T.dot(proj_h, self.W_pred) + self.b_pred
246 | 
247 |         probs = T.nnet.softmax(output)
248 |         prediction = probs.argmax(axis=1)
249 | 
250 |         ## avoid NaN
251 |         epsilon = 1.0e-9
252 |         probs = T.clip(probs, epsilon, 1.0 - epsilon)
253 |         probs /= probs.sum(axis=-1, keepdims=True)
254 |         ## avoid NaN
255 | 
256 | 
257 |         cost = T.nnet.categorical_crossentropy(probs, y)
258 |         cost = T.mean(cost)
259 | 
260 |         updates = optimizer.adam(cost=cost, params=self.params, lr=lr)
261 | 
262 |         return trng, use_noise, x, x_mask, ctx,
y, cost, updates, prediction 263 | 264 | 265 | 266 | def train(self, train_x, train_mask_x, train_imgs, train_y, 267 | valid_x=None, valid_mask_x=None, valid_imgs=None, valid_y=None, 268 | valid=None, 269 | lr=0.001, 270 | dropout=None, 271 | batch_size=16, 272 | epoch=100, 273 | save=None): 274 | 275 | cnn = CNN(deploy='/home/seonhoon/Desktop/caffemodel/vgg19/VGG_ILSVRC_19_layers_deploy.prototxt', model='/home/seonhoon/Desktop/caffemodel/vgg19/VGG_ILSVRC_19_layers.caffemodel') 276 | 277 | n_train = train_x.shape[1] 278 | 279 | trng, use_noise, x, x_mask, ctx, y, cost, updates, prediction = self.build_model(lr, dropout) 280 | # x : step * sample * dim 281 | # x_mask : step * sample 282 | # ctx : sample * annotation * dim_ctx 283 | # y : sample * emb 284 | 285 | 286 | train_model = theano.function(inputs=[x, x_mask, ctx, y], 287 | outputs=cost, 288 | updates=updates) 289 | 290 | if valid is not None: 291 | valid_model = theano.function(inputs=[x, x_mask, ctx], 292 | outputs=prediction) 293 | valid_batch_indices = get_minibatch_indices(valid_x.shape[1], batch_size) 294 | 295 | 296 | best_cost = np.inf 297 | best_cost_epoch = 0 298 | best_params = self.params 299 | for i in xrange(epoch): 300 | start_time = time.time() 301 | use_noise.set_value(1.) 302 | batch_indices=get_minibatch_indices(n_train, batch_size, shuffle=True) 303 | 304 | 305 | 306 | for j, indices in enumerate(batch_indices): 307 | minibatch_avg_cost = 0 308 | 309 | x = [ train_x[:,t] for t in indices] 310 | x = np.transpose(x) 311 | x_mask = [ train_mask_x[:,t] for t in indices] 312 | x_mask = np.transpose(x_mask) 313 | y = [ train_y[t,:] for t in indices] 314 | y = np.array(y) 315 | batch_imgs = [ train_imgs[t] for t in indices] 316 | ctx = get_minibatch_cnn_features(cnn, batch_imgs) 317 | 318 | minibatch_avg_cost = train_model(x, x_mask, ctx, y) 319 | print ' minibatch cost : ' , minibatch_avg_cost, ' , minibatch :', (j+1), ' in epoch \'', (i+1) 320 | 321 | if minibatch_avg_cost < best_cost : 322 | best_cost = minibatch_avg_cost 323 | best_cost_epoch = i+1 324 | best_params = self.params 325 | end_time = time.time() 326 | print 'cost : ' , minibatch_avg_cost, ' in epoch \'', (i+1) ,'\' ', '[ ', int(end_time-start_time), 'sec ] [ best_cost : ', best_cost, ' in epoch \'', best_cost_epoch, '\' ]' 327 | 328 | 329 | 330 | # validation 331 | if valid is not None: 332 | if (i+1) % valid == 0: 333 | use_noise.set_value(0.) 
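                    # use_noise gates the dropout_layer switch above: it is set
                    # to 1. during training so a fresh binomial mask is sampled,
                    # and to 0. here so activations are instead scaled by
                    # (1 - dropout), keeping their expected magnitude unchanged
                    # at evaluation time.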
334 | 335 | valid_prediction = [] 336 | for k, valid_indices in enumerate(valid_batch_indices): 337 | 338 | val_x = [ valid_x[:,t] for t in valid_indices] 339 | val_x = np.transpose(val_x) 340 | val_x_mask = [ valid_mask_x[:,t] for t in valid_indices] 341 | val_x_mask = np.transpose(val_x_mask) 342 | val_batch_imgs = [ valid_imgs[t] for t in valid_indices] 343 | val_ctx = get_minibatch_cnn_features(cnn, val_batch_imgs) 344 | 345 | valid_prediction += valid_model(val_x, val_x_mask, val_ctx).tolist() 346 | correct = 0 347 | for l in range(len(valid_prediction)): 348 | if valid_prediction[l]==valid_y[l] : 349 | correct += 1 350 | print '## valid accuracy : ', float(correct) / len(valid_prediction) 351 | 352 | 353 | # save model 354 | if save is not None: 355 | if (i+1) % save == 0: 356 | print 'save best param..', 357 | save_tparams(best_params, 'model.pkl') 358 | print 'Done' 359 | 360 | 361 | def prediction(self, test_x, test_mask_x, test_img, test_y, 362 | # valid_x=None, valid_mask_x=None, 363 | #valid_y=None, valid_mask_y=None, 364 | # optimizer=None, 365 | lr=0.001, 366 | batch_size=16, 367 | epoch=100, 368 | save=None): 369 | 370 | load_tparams(self.params, 'model.pkl') 371 | test_shared_x = theano.shared(np.asarray(test_x, dtype='int32'), borrow=True) 372 | #test_shared_y = theano.shared(np.asarray(test_y, dtype='int32'), borrow=True) 373 | test_shared_mask_x = theano.shared(np.asarray(test_mask_x, dtype='float32'), borrow=True) 374 | test_shared_img = theano.shared(np.asarray(test_img, dtype='float32'), borrow=True) 375 | 376 | n_test = test_x.shape[1] 377 | n_test_batches = int(np.ceil(1.0 * n_test / batch_size)) 378 | 379 | index = T.lscalar('index') # index to a case 380 | final_index = T.lscalar('final_index') 381 | 382 | 383 | trng, use_noise, x, x_mask, img, y, _, _, prediction = self.build_model(lr) 384 | 385 | batch_start = index * batch_size 386 | batch_stop = T.minimum(final_index, (index + 1) * batch_size) 387 | 388 | test_model = theano.function(inputs=[index, final_index], 389 | outputs=prediction, 390 | givens={ 391 | x: test_shared_x[:,batch_start:batch_stop], 392 | x_mask: test_shared_mask_x[:,batch_start:batch_stop], 393 | img: test_shared_img[batch_start:batch_stop,:]}) 394 | 395 | 396 | prediction = [] 397 | for minibatch_idx in xrange(n_test_batches): 398 | print minibatch_idx, ' / ' , n_test_batches 399 | prediction += test_model(minibatch_idx, n_test).tolist() 400 | return prediction 401 | 402 | 403 | 404 | 405 | 406 | class RNN_LSTM: 407 | def __init__(self, n_vocab, y_vocab, dim_word, dim, dim_ctx): 408 | 409 | self.n_vocab = n_vocab # 12047 410 | self.y_vocab = y_vocab # 430 411 | self.dim_word = dim_word # 1024 412 | self.dim = dim # 1024 413 | self.dim_ctx = dim_ctx # 512 414 | 415 | ### initial context 416 | self.W_hidden_init = initializations.uniform((self.dim_ctx, self.dim)) 417 | self.b_hidden_init = initializations.zero((self.dim)) 418 | self.W_memory_init = initializations.uniform((self.dim_ctx, self.dim)) 419 | self.b_memory_init = initializations.zero((self.dim)) 420 | 421 | 422 | ### forward : img_dim to context 423 | self.W_ctx_att = initializations.uniform((self.dim_ctx, self.dim_ctx)) 424 | self.b_ctx_att = initializations.zero((self.dim_ctx)) 425 | 426 | ### forward : hidden_dim to context 427 | self.W_dim_att = initializations.uniform((self.dim, self.dim_ctx)) 428 | 429 | ### context energy 430 | self.U_att = initializations.uniform((self.dim_ctx, 1)) 431 | self.c_att = initializations.zero((1)) 432 | 433 | 434 | 435 | ### Word Embedding ### 
436 |         self.W_emb = initializations.uniform((self.n_vocab, self.dim_word))
437 | 
438 |         ### enc forward LSTM ###
439 |         self.W_lstm_ctx = initializations.uniform((self.dim_word, self.dim_ctx))
440 |         self.b_lstm_ctx = initializations.zero((self.dim_ctx))
441 | 
442 | 
443 |         self.W_lstm = initializations.uniform((self.dim_word, self.dim * 4))
444 |         self.U_lstm = initializations.uniform((self.dim, self.dim * 4))
445 |         self.b_lstm = initializations.zero((self.dim * 4))
446 |         self.U_lstm_ctx = initializations.uniform((self.dim_ctx, self.dim * 4))
447 | 
448 | 
449 | 
450 |         ### prediction ###
451 |         self.W_pred = initializations.uniform((self.dim, self.y_vocab))
452 |         self.b_pred = initializations.zero((self.y_vocab))
453 | 
454 | 
455 |         self.params = [self.W_hidden_init, self.b_hidden_init, self.W_memory_init, self.b_memory_init,
456 |                        self.W_ctx_att, self.b_ctx_att,
457 |                        self.W_dim_att,
458 |                        self.U_att, self.c_att,
459 |                        self.W_emb,
460 |                        # self.W_lstm_ctx, self.b_lstm_ctx,
461 |                        self.W_lstm, self.U_lstm, self.b_lstm, self.U_lstm_ctx,
462 |                        self.W_pred, self.b_pred]
463 | 
464 |     def lstm_layer(self, state_below, mask=None, context=None, init_state=None, init_memory=None):
465 |         #state_below : step * sample * dim
466 |         nsteps = state_below.shape[0]
467 |         n_samples = state_below.shape[1]
468 | 
469 |         dim = self.dim
470 | 
471 |         def _slice(_x, n, dim):
472 |             if _x.ndim == 3:
473 |                 print '_x.ndim : ' , _x.ndim
474 |                 return _x[:, :, n*dim:(n+1)*dim]
475 |             return _x[:, n*dim:(n+1)*dim]
476 | 
477 |         if init_state is None :
478 |             init_state = T.alloc(0., n_samples, dim)
479 |         if init_memory is None:
480 |             init_memory = T.alloc(0., n_samples, dim)
481 |         # context forward
482 |         proj_ctx = T.dot(context, self.W_ctx_att) + self.b_ctx_att
483 |         #context : sample * annotation * dim_ctx
484 | 
485 |         # step * samples * dim
486 |         state_below_ = T.dot(state_below, self.W_lstm) + self.b_lstm
487 |         #state_belowc = T.dot(state_below, self.W_lstm_ctx) + self.b_lstm_ctx
488 | 
489 |         def _step(m_, x_, h_, c_, a_, as_, proj_ctx):
490 |             '''
491 |             m_ : (samples,)
492 |             x_, h_ : samples * dimensions
493 |             '''
494 | 
495 |             pstate_ = T.dot(h_, self.W_dim_att) # sample * dim_ctx
496 |             proj_ctx_ = proj_ctx + pstate_[:, None, :] # sample * annotation * dim_ctx
497 |             #proj_ctx_ = proj_ctx_ + xc[:, None, :]
498 |             # TODO: try adding an x-to-ctx term later. -- state_belowc
499 | 
500 | 
501 |             proj_ctx_ = T.tanh(proj_ctx_)
502 | 
503 |             alpha = T.dot(proj_ctx_, self.U_att)+self.c_att
504 |             alpha = T.nnet.softmax(alpha.reshape([alpha.shape[0], alpha.shape[1]])) # softmax
505 |             # alpha = sample * annotation(prob)
506 |             ctx_ = (context * alpha[:,:,None]).sum(1) # sample * expected_dim_ctx
507 |             alpha_sample = alpha # you can return something else reasonable here to debug
508 | 
509 | 
510 |             preact = T.dot(h_, self.U_lstm)
511 |             preact += x_ # samples * 1024
512 |             preact += T.dot(ctx_, self.U_lstm_ctx)
513 | 
514 |             i = _slice(preact, 0, dim)
515 |             f = _slice(preact, 1, dim)
516 |             o = _slice(preact, 2, dim)
517 | 
518 |             i = T.nnet.sigmoid(i)
519 |             f = T.nnet.sigmoid(f)
520 |             o = T.nnet.sigmoid(o)
521 |             c = T.tanh(_slice(preact, 3, dim))
522 | 
523 |             # compute the new memory/hidden state
524 |             # if the mask is 0, just copy the previous state
525 |             c = f * c_ + i * c
526 |             c = m_[:,None] * c + (1. - m_)[:,None] * c_
527 | 
528 |             h = o * T.tanh(c)
529 |             h = m_[:,None] * h + (1. - m_)[:,None] * h_
530 | 
531 | 
532 |             return [h, c, alpha, alpha_sample, ctx_]#, r, u, preact, preactx
533 | 
534 |         seqs = [mask, state_below_]
535 |         outputs_info = [init_state,
536 |                         init_memory,
537 |                         T.alloc(0., n_samples, proj_ctx.shape[1]),
538 |                         T.alloc(0., n_samples, proj_ctx.shape[1]),
539 |                         None]
540 |         rval, updates = theano.scan(_step,
541 |                                     sequences=seqs,
542 |                                     outputs_info = outputs_info,
543 |                                     non_sequences = [proj_ctx],
544 |                                     name='lstm_layer',
545 |                                     n_steps=nsteps)
546 |         return rval
547 | 
548 | 
549 |     def build_model(self, lr=0.001, dropout=None):
550 | 
551 |         trng = RandomStreams(1234)
552 |         use_noise = theano.shared(np.float32(0.))
553 | 
554 |         # description string: #words x #samples
555 | 
556 | 
557 |         x = T.matrix('x', dtype = 'int32') # step * samples
558 |         x_mask = T.matrix('x_mask', dtype='float32') # step * samples
559 |         y = T.matrix('y', dtype = 'int32') # sample * emb
560 |         ctx = T.tensor3('ctx', dtype = 'float32') # sample * annotation * dim
561 | 
562 |         n_timesteps = x.shape[0]
563 |         n_samples = x.shape[1]
564 | 
565 | 
566 |         emb = self.W_emb[x.flatten()]
567 | 
568 |         emb = emb.reshape([n_timesteps, n_samples, self.dim_word])
569 | 
570 |         ctx0 = ctx
571 |         ctx_mean = ctx0.mean(1)
572 | 
573 |         init_state = T.dot(ctx_mean, self.W_hidden_init) + self.b_hidden_init
574 |         init_memory = T.dot(ctx_mean, self.W_memory_init) + self.b_memory_init
575 | 
576 | 
577 |         # proj : the sequence of LSTM hidden states
578 |         proj = self.lstm_layer(emb, mask=x_mask, context=ctx, init_state=init_state, init_memory=init_memory)
579 |         proj_h = proj[0]
580 | 
581 |         # masked mean over the hidden states
582 |         proj_h = (proj_h * x_mask[:, :, None]).sum(axis=0)
583 |         proj_h = proj_h / x_mask.sum(axis=0)[:, None] # sample * dim
584 | 
585 |         # last hidden state (alternative)
586 |         #proj_h = proj_h[-1] # sample * dim
587 | 
588 | 
589 |         if dropout is not None :
590 |             proj_h = dropout_layer(proj_h, use_noise, trng, dropout)
591 | 
592 | 
593 |         output = T.dot(proj_h, self.W_pred) + self.b_pred
594 | 
595 |         probs = T.nnet.softmax(output)
596 |         prediction = probs.argmax(axis=1)
597 | 
598 |         ## avoid NaN
599 |         epsilon = 1.0e-9
600 |         probs = T.clip(probs, epsilon, 1.0 - epsilon)
601 |         probs /= probs.sum(axis=-1, keepdims=True)
602 |         ## avoid NaN
603 | 
604 | 
605 |         cost = T.nnet.categorical_crossentropy(probs, y)
606 |         cost = T.mean(cost)
607 | 
608 |         updates = optimizer.adam(cost=cost, params=self.params, lr=lr)
609 | 
610 |         return trng, use_noise, x, x_mask, ctx, y, cost, updates, prediction
611 | 
612 | 
613 | 
614 |     def train(self, train_x, train_mask_x, train_imgs, train_y,
615 |               valid_x=None, valid_mask_x=None, valid_imgs=None, valid_y=None,
616 |               valid=None,
617 |               lr=0.001,
618 |               dropout=None,
619 |               batch_size=16,
620 |               epoch=100,
621 |               save=None):
622 | 
623 |         cnn = CNN(deploy='/home/seonhoon/Desktop/caffemodel/vgg19/VGG_ILSVRC_19_layers_deploy.prototxt', model='/home/seonhoon/Desktop/caffemodel/vgg19/VGG_ILSVRC_19_layers.caffemodel')
624 | 
625 |         n_train = train_x.shape[1]
626 | 
627 |         trng, use_noise, x, x_mask, ctx, y, cost, updates, prediction = self.build_model(lr, dropout)
628 |         # x : step * sample * dim
629 |         # x_mask : step * sample
630 |         # ctx : sample * annotation * dim_ctx
631 |         # y : sample * emb
632 | 
633 | 
634 |         train_model = theano.function(inputs=[x, x_mask, ctx, y],
635 |                                       outputs=cost,
636 |                                       updates=updates)
637 | 
638 |         if valid is not None:
639 |             valid_model = theano.function(inputs=[x, x_mask, ctx],
640 |                                           outputs=prediction)
641 |             valid_batch_indices = get_minibatch_indices(valid_x.shape[1], batch_size)
642 | 
643 | 
644 |         best_cost = np.inf
645 |         best_cost_epoch =
0 646 | best_params = self.params 647 | for i in xrange(epoch): 648 | start_time = time.time() 649 | use_noise.set_value(1.) 650 | batch_indices=get_minibatch_indices(n_train, batch_size, shuffle=True) 651 | 652 | 653 | 654 | 655 | 656 | for j, indices in enumerate(batch_indices): 657 | minibatch_avg_cost = 0 658 | 659 | x = [ train_x[:,t] for t in indices] 660 | x = np.transpose(x) 661 | x_mask = [ train_mask_x[:,t] for t in indices] 662 | x_mask = np.transpose(x_mask) 663 | y = [ train_y[t,:] for t in indices] 664 | y = np.array(y) 665 | batch_imgs = [ train_imgs[t] for t in indices] 666 | ctx = get_minibatch_cnn_features(cnn, batch_imgs) 667 | 668 | minibatch_avg_cost = train_model(x, x_mask, ctx, y) 669 | if j % 15 == 0 : 670 | print ' minibatch cost : ' , minibatch_avg_cost, ' , minibatch :', (j+1), ' in epoch \'', (i+1) 671 | 672 | if minibatch_avg_cost < best_cost : 673 | best_cost = minibatch_avg_cost 674 | best_cost_epoch = i+1 675 | best_params = self.params 676 | end_time = time.time() 677 | print 'cost : ' , minibatch_avg_cost, ' in epoch \'', (i+1) ,'\' ', '[ ', int(end_time-start_time), 'sec ] [ best_cost : ', best_cost, ' in epoch \'', best_cost_epoch, '\' ]' 678 | 679 | 680 | # validation 681 | if valid is not None: 682 | if (i+1) % valid == 0: 683 | use_noise.set_value(0.) 684 | 685 | valid_prediction = [] 686 | for k, valid_indices in enumerate(valid_batch_indices): 687 | 688 | val_x = [ valid_x[:,t] for t in valid_indices] 689 | val_x = np.transpose(val_x) 690 | val_x_mask = [ valid_mask_x[:,t] for t in valid_indices] 691 | val_x_mask = np.transpose(val_x_mask) 692 | val_batch_imgs = [ valid_imgs[t] for t in valid_indices] 693 | val_ctx = get_minibatch_cnn_features(cnn, val_batch_imgs) 694 | 695 | valid_prediction += valid_model(val_x, val_x_mask, val_ctx).tolist() 696 | correct = 0 697 | for l in range(len(valid_prediction)): 698 | if valid_prediction[l]==valid_y[l] : 699 | correct += 1 700 | print '## valid accuracy : ', float(correct) / len(valid_prediction) 701 | 702 | 703 | # save model 704 | if save is not None: 705 | if (i+1) % save == 0: 706 | print 'save best param..', 707 | save_tparams(best_params, 'model.pkl') 708 | print 'Done' 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | class BIRNN_GRU: 717 | def __init__(self, n_vocab, y_vocab, dim_word, dim, dim_ctx): 718 | 719 | self.n_vocab = n_vocab # 12047 720 | self.y_vocab = y_vocab # 430 721 | self.dim_word = dim_word # 1024 722 | self.dim = dim # 1024 723 | self.dim_ctx = dim_ctx # 512 724 | 725 | ### initial context 726 | self.W_ctx_init = initializations.uniform((self.dim_ctx, self.dim)) 727 | self.b_ctx_init = initializations.zero((self.dim)) 728 | 729 | 730 | ### forward : img_dim to context 731 | self.W_ctx_att = initializations.uniform((self.dim_ctx, self.dim_ctx)) 732 | self.b_ctx_att = initializations.zero((self.dim_ctx)) 733 | 734 | ### forward : hidden_dim to context 735 | self.W_dim_att = initializations.uniform((self.dim, self.dim_ctx)) 736 | 737 | ### context energy 738 | self.U_att = initializations.uniform((self.dim_ctx, 1)) 739 | self.c_att = initializations.zero((1)) 740 | 741 | 742 | 743 | ### Word Embedding ### 744 | self.W_emb = initializations.uniform((self.n_vocab, self.dim_word)) 745 | 746 | ### enc forward GRU ### 747 | self.W_gru_ctx = initializations.uniform((self.dim_word, self.dim_ctx)) 748 | self.b_gru_ctx = initializations.zero((self.dim_ctx)) 749 | 750 | 751 | self.W_gru = initializations.uniform((self.dim_word, self.dim * 2)) 752 | self.U_gru = initializations.uniform((self.dim, 
self.dim * 2))
753 |         self.b_gru = initializations.zero((self.dim * 2))
754 |         self.U_gru_ctx = initializations.uniform((self.dim_ctx, self.dim * 2))
755 | 
756 |         self.W_gru_cdd = initializations.uniform((self.dim_word, self.dim)) # cdd : candidate
757 |         self.U_gru_cdd = initializations.uniform((self.dim, self.dim))
758 |         self.b_gru_cdd = initializations.zero((self.dim))
759 |         self.U_gru_cdd_ctx = initializations.uniform((self.dim_ctx, self.dim))
760 | 
761 |         ### prediction ###
762 |         self.W_pred = initializations.uniform((self.dim * 2, self.y_vocab))
763 |         self.b_pred = initializations.zero((self.y_vocab))
764 | 
765 | 
766 |         self.params = [self.W_ctx_init, self.b_ctx_init,
767 |                        self.W_ctx_att, self.b_ctx_att,
768 |                        self.W_dim_att,
769 |                        self.U_att, self.c_att,
770 |                        self.W_emb,
771 |                        self.W_gru_ctx, self.b_gru_ctx,
772 |                        self.W_gru, self.U_gru, self.b_gru, self.U_gru_ctx,
773 |                        self.W_gru_cdd, self.U_gru_cdd, self.b_gru_cdd, self.U_gru_cdd_ctx,
774 |                        self.W_pred, self.b_pred]
775 | 
776 |     def gru_layer(self, state_below, mask=None, context=None, init_state=None):
777 |         #state_below : step * sample * dim
778 |         nsteps = state_below.shape[0]
779 |         n_samples = state_below.shape[1]
780 | 
781 |         dim = self.dim
782 | 
783 |         def _slice(_x, n, dim):
784 |             if _x.ndim == 3:
785 |                 print '_x.ndim : ' , _x.ndim
786 |                 return _x[:, :, n*dim:(n+1)*dim]
787 |             return _x[:, n*dim:(n+1)*dim]
788 | 
789 |         if init_state is None :
790 |             init_state = T.alloc(0., n_samples, dim)
791 | 
792 |         # context forward
793 |         proj_ctx = T.dot(context, self.W_ctx_att) + self.b_ctx_att
794 |         #context : sample * annotation * dim_ctx
795 | 
796 |         # step * samples * dim
797 |         state_below_ = T.dot(state_below, self.W_gru) + self.b_gru
798 |         state_belowx = T.dot(state_below, self.W_gru_cdd) + self.b_gru_cdd
799 |         state_belowc = T.dot(state_below, self.W_gru_ctx) + self.b_gru_ctx
800 | 
801 |         def _step(m_, xc, x_, xx_, h_, a_, as_, proj_ctx):
802 |             '''
803 |             m_ : (samples,)
804 |             x_, h_ : samples * dimensions
805 |             '''
806 | 
807 |             pstate_ = T.dot(h_, self.W_dim_att) # sample * dim_ctx
808 |             proj_ctx_ = proj_ctx + pstate_[:, None, :] # sample * annotation * dim_ctx
809 |             proj_ctx_ = proj_ctx_ + xc[:, None, :]
810 |             # TODO: revisit the x-to-ctx term later. -- state_belowc
811 | 
812 | 
813 |             proj_ctx_ = T.tanh(proj_ctx_)
814 | 
815 |             alpha = T.dot(proj_ctx_, self.U_att)+self.c_att
816 |             alpha = T.nnet.softmax(alpha.reshape([alpha.shape[0], alpha.shape[1]])) # softmax
817 |             # alpha = sample * annotation(prob)
818 |             ctx_ = (context * alpha[:,:,None]).sum(1) # sample * expected_dim_ctx
819 |             alpha_sample = alpha # you can return something else reasonable here to debug
820 | 
821 | 
822 |             preact = T.dot(h_, self.U_gru)
823 |             preact += x_ # samples * 1024
824 |             preact += T.dot(ctx_, self.U_gru_ctx)
825 | 
826 |             r = T.nnet.sigmoid(_slice(preact, 0, dim))
827 |             u = T.nnet.sigmoid(_slice(preact, 1, dim))
828 | 
829 |             preactx = T.dot(h_, self.U_gru_cdd)
830 |             preactx *= r
831 |             preactx += xx_ # samples * 512
832 |             preactx += T.dot(ctx_, self.U_gru_cdd_ctx)
833 | 
834 |             h = T.tanh(preactx)
835 | 
836 |             h = u * h_ + (1. - u) * h
837 |             h = m_[:,None] * h + (1. - m_)[:,None] * h_ # m_[:,None] : samples * 1
838 | 
839 |             return [h, alpha, alpha_sample, ctx_]#, r, u, preact, preactx
840 | 
841 |         seqs = [mask, state_belowc, state_below_, state_belowx]
842 |         outputs_info = [init_state,
843 |                         T.alloc(0., n_samples, proj_ctx.shape[1]),
844 |                         T.alloc(0., n_samples, proj_ctx.shape[1]),
845 |                         None]
846 |         rval, updates = theano.scan(_step,
847 |                                     sequences=seqs,
848 |                                     outputs_info = outputs_info,
849 |                                     non_sequences = [proj_ctx],
850 |                                     name='gru_layer',
851 |                                     n_steps=nsteps)
852 |         return rval
853 | 
854 | 
855 |     def build_model(self, lr=0.001, dropout=None):
856 |         def concatenate(tensor_list, axis=0): # dynamic-shape concat along `axis`; note theano.tensor is imported as T
857 |             concat_size = sum(tt.shape[axis] for tt in tensor_list)
858 |             output_shape = ()
859 |             for k in range(axis):
860 |                 output_shape += (tensor_list[0].shape[k],)
861 |             output_shape += (concat_size,)
862 |             for k in range(axis + 1, tensor_list[0].ndim):
863 |                 output_shape += (tensor_list[0].shape[k],)
864 |             out = T.zeros(output_shape)
865 |             offset = 0
866 |             for tt in tensor_list:
867 |                 indices = ()
868 |                 for k in range(axis):
869 |                     indices += (slice(None),)
870 |                 indices += (slice(offset, offset + tt.shape[axis]),)
871 |                 for k in range(axis + 1, tensor_list[0].ndim):
872 |                     indices += (slice(None),)
873 | 
874 |                 out = T.set_subtensor(out[indices], tt)
875 |                 offset += tt.shape[axis]
876 | 
877 |             return out
878 | 
879 | 
880 | 
881 |         trng = RandomStreams(1234)
882 |         use_noise = theano.shared(np.float32(0.))
883 | 
884 |         # description string: #words x #samples
885 | 
886 | 
887 |         x = T.matrix('x', dtype = 'int32') # step * samples
888 |         x_mask = T.matrix('x_mask', dtype='float32') # step * samples
889 |         y = T.matrix('y', dtype = 'int32') # sample * emb
890 |         ctx = T.tensor3('ctx', dtype = 'float32') # sample * annotation * dim
891 | 
892 |         n_timesteps = x.shape[0]
893 |         n_samples = x.shape[1]
894 | 
895 |         xr = x[::-1]
896 |         xr_mask = x_mask[::-1]
897 | 
898 |         emb = self.W_emb[x.flatten()]
899 |         emb = emb.reshape([n_timesteps, n_samples, self.dim_word])
900 | 
901 |         embr = self.W_emb[xr.flatten()]
902 |         embr = embr.reshape([n_timesteps, n_samples, self.dim_word])
903 | 
904 |         ctx0 = ctx
905 |         ctx_mean = ctx0.mean(1)
906 | 
907 |         init_state = T.dot(ctx_mean, self.W_ctx_init) + self.b_ctx_init
908 | 
909 | 
910 |         # proj : the sequence of forward GRU hidden states
911 |         proj = self.gru_layer(emb, mask=x_mask, context=ctx, init_state=init_state)
912 |         proj_h = proj[0]
913 | 
914 |         projr = self.gru_layer(embr, mask=xr_mask, context=ctx, init_state=init_state)
915 |         projr_h = projr[0]
916 | 
917 | 
918 |         concat_proj_h = concatenate([proj_h, projr_h[::-1]], axis=proj_h.ndim-1)
919 |         # step_ctx : step * samples * (dim*2)
920 |         concat_proj_h = (concat_proj_h * x_mask[:,:,None]).sum(0) / x_mask.sum(0)[:,None]
921 |         # step_ctx_mean : samples * (dim*2)
922 | 
923 | 
924 |         if dropout is not None :
925 |             concat_proj_h = dropout_layer(concat_proj_h, use_noise, trng, dropout)
926 | 
927 | 
928 | 
929 |         output = T.dot(concat_proj_h, self.W_pred) + self.b_pred
930 | 
931 |         probs = T.nnet.softmax(output)
932 |         prediction = probs.argmax(axis=1)
933 | 
934 |         ## avoid NaN
935 |         epsilon = 1.0e-9
936 |         probs = T.clip(probs, epsilon, 1.0 - epsilon)
937 |         probs /= probs.sum(axis=-1, keepdims=True)
938 |         ## avoid NaN
939 | 
940 | 
941 |         cost = T.nnet.categorical_crossentropy(probs, y)
942 |         cost = T.mean(cost)
943 | 
944 |         updates = optimizer.adam(cost=cost, params=self.params, lr=lr)
945 | 
946 |         return trng, use_noise, x, x_mask, ctx, y, cost, updates, prediction
947 | 
948 | 
949 | 
950 |     def train(self, train_x, train_mask_x, train_imgs,
train_y, 951 | valid_x=None, valid_mask_x=None, valid_imgs=None, valid_y=None, 952 | valid=None, 953 | lr=0.001, 954 | dropout=None, 955 | batch_size=16, 956 | epoch=100, 957 | save=None): 958 | 959 | cnn = CNN(deploy='/home/seonhoon/Desktop/caffemodel/vgg19/VGG_ILSVRC_19_layers_deploy.prototxt', model='/home/seonhoon/Desktop/caffemodel/vgg19/VGG_ILSVRC_19_layers.caffemodel') 960 | 961 | n_train = train_x.shape[1] 962 | 963 | trng, use_noise, x, x_mask, ctx, y, cost, updates, prediction = self.build_model(lr, dropout) 964 | # x : step * sample * dim 965 | # x_mask : step * sample 966 | # ctx : sample * annotation * dim_ctx 967 | # y : sample * emb 968 | 969 | 970 | train_model = theano.function(inputs=[x, x_mask, ctx, y], 971 | outputs=cost, 972 | updates=updates) 973 | 974 | if valid is not None: 975 | valid_model = theano.function(inputs=[x, x_mask, ctx], 976 | outputs=prediction) 977 | valid_batch_indices = get_minibatch_indices(valid_x.shape[1], batch_size) 978 | 979 | 980 | best_cost = np.inf 981 | best_cost_epoch = 0 982 | best_params = self.params 983 | for i in xrange(epoch): 984 | start_time = time.time() 985 | use_noise.set_value(1.) 986 | batch_indices=get_minibatch_indices(n_train, batch_size, shuffle=True) 987 | 988 | 989 | 990 | for j, indices in enumerate(batch_indices): 991 | minibatch_avg_cost = 0 992 | 993 | x = [ train_x[:,t] for t in indices] 994 | x = np.transpose(x) 995 | x_mask = [ train_mask_x[:,t] for t in indices] 996 | x_mask = np.transpose(x_mask) 997 | y = [ train_y[t,:] for t in indices] 998 | y = np.array(y) 999 | batch_imgs = [ train_imgs[t] for t in indices] 1000 | ctx = get_minibatch_cnn_features(cnn, batch_imgs) 1001 | 1002 | minibatch_avg_cost = train_model(x, x_mask, ctx, y) 1003 | print ' minibatch cost : ' , minibatch_avg_cost, ' , minibatch :', (j+1), ' in epoch \'', (i+1) 1004 | 1005 | if minibatch_avg_cost < best_cost : 1006 | best_cost = minibatch_avg_cost 1007 | best_cost_epoch = i+1 1008 | best_params = self.params 1009 | end_time = time.time() 1010 | print 'cost : ' , minibatch_avg_cost, ' in epoch \'', (i+1) ,'\' ', '[ ', int(end_time-start_time), 'sec ] [ best_cost : ', best_cost, ' in epoch \'', best_cost_epoch, '\' ]' 1011 | 1012 | 1013 | 1014 | # validation 1015 | if valid is not None: 1016 | if (i+1) % valid == 0: 1017 | use_noise.set_value(0.) 
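                    # Evaluation runs with dropout disabled; the prediction-only
                    # valid_model below accumulates predictions across validation
                    # minibatches, which are then compared element-wise against
                    # valid_y to report a single accuracy figure.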
1018 | 1019 | valid_prediction = [] 1020 | for k, valid_indices in enumerate(valid_batch_indices): 1021 | 1022 | val_x = [ valid_x[:,t] for t in valid_indices] 1023 | val_x = np.transpose(val_x) 1024 | val_x_mask = [ valid_mask_x[:,t] for t in valid_indices] 1025 | val_x_mask = np.transpose(val_x_mask) 1026 | val_batch_imgs = [ valid_imgs[t] for t in valid_indices] 1027 | val_ctx = get_minibatch_cnn_features(cnn, val_batch_imgs) 1028 | 1029 | valid_prediction += valid_model(val_x, val_x_mask, val_ctx).tolist() 1030 | correct = 0 1031 | for l in range(len(valid_prediction)): 1032 | if valid_prediction[l]==valid_y[l] : 1033 | correct += 1 1034 | print '## valid accuracy : ', float(correct) / len(valid_prediction) 1035 | 1036 | 1037 | # save model 1038 | if save is not None: 1039 | if (i+1) % save == 0: 1040 | print 'save best param..', 1041 | save_tparams(best_params, 'model.pkl') 1042 | print 'Done' 1043 | 1044 | 1045 | 1046 | 1047 | 1048 | 1049 | 1050 | 1051 | 1052 | 1053 | 1054 | 1055 | 1056 | 1057 | 1058 | -------------------------------------------------------------------------------- /theano_attention/models.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seankim902/imageQA/25f70ba1dc013eac6f7c734830f04663fd7b58f2/theano_attention/models.pyc -------------------------------------------------------------------------------- /theano_attention/optimizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import theano 4 | import theano.tensor as T 5 | 6 | import numpy as np 7 | 8 | 9 | def sgd(cost, params, lr ): 10 | grads = T.grad(cost, params) 11 | updates = [] 12 | for param, grad in zip(params, grads): 13 | updates.append((param, param - lr*grad)) 14 | 15 | return updates 16 | 17 | def adam( cost, params, lr=0.001, b1=0.9, b2=0.999, e=1e-8): 18 | updates = [] 19 | grads = T.grad(cost, params) 20 | i = theano.shared(np.float32(0.)) 21 | i_t = i + 1. 22 | fix1 = 1. - (1. - b1)**i_t 23 | fix2 = 1. - (1. - b2)**i_t 24 | lr_t = lr * (T.sqrt(fix2) / fix1) 25 | for p, g in zip(params, grads): 26 | m = theano.shared(p.get_value() * 0.) 27 | v = theano.shared(p.get_value() * 0.) 28 | m_t = (b1 * g) + ((1. - b1) * m) 29 | v_t = (b2 * T.sqr(g)) + ((1. 
- b2) * v) 30 | g_t = m_t / (T.sqrt(v_t) + e) 31 | p_t = p - (lr_t * g_t) 32 | updates.append((m, m_t)) 33 | updates.append((v, v_t)) 34 | updates.append((p, p_t)) 35 | updates.append((i, i_t)) 36 | return updates -------------------------------------------------------------------------------- /theano_attention/optimizer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/seankim902/imageQA/25f70ba1dc013eac6f7c734830f04663fd7b58f2/theano_attention/optimizer.pyc -------------------------------------------------------------------------------- /theano_attention/prediction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | from models import RNN 8 | from keras.utils import np_utils 9 | 10 | def prepare_data(seqs_x, maxlen=None): 11 | lengths_x = [len(s) for s in seqs_x] 12 | 13 | n_samples = len(seqs_x) 14 | if maxlen is None: 15 | maxlen = np.max(lengths_x) + 1 16 | 17 | x = np.zeros((maxlen, n_samples)).astype('int32') 18 | x_mask = np.zeros((maxlen, n_samples)).astype('float32') 19 | 20 | for idx, s_x in enumerate(seqs_x): 21 | x[:lengths_x[idx],idx] = s_x 22 | #x_mask[:lengths_x[idx],idx] = 1. 23 | x_mask[:lengths_x[idx],idx] = 1. 24 | 25 | return x, x_mask 26 | 27 | def main(): 28 | 29 | os.chdir('/home/seonhoon/Desktop/workspace/ImageQA/data/') 30 | 31 | n_vocab = 12047 32 | y_vocab = 430 33 | dim_word = 1024 34 | dim = 1024 35 | maxlen = 60 36 | 37 | test=pd.read_pickle('test_vgg.pkl') 38 | 39 | test_x=[ q for q in test['q'] ] 40 | test_y=[ a[0] for a in test['a'] ] 41 | test_y_original=test_y 42 | test_y=np.array(test_y)[:,None] 43 | test_y = np_utils.to_categorical(test_y, y_vocab) 44 | test_x , test_x_mask = prepare_data(test_x, maxlen) 45 | test_x_img = [ img.tolist() for img in test['cnn_feature'] ] 46 | 47 | 48 | print 'x :', test_x.shape 49 | print 'x_mask:', test_x_mask.shape 50 | print 'y : ', test_y.shape 51 | model = RNN(n_vocab, y_vocab, dim_word, dim) 52 | 53 | pred_y = model.prediction(test_x, test_x_mask, test_x_img, test_y, batch_size=2048) 54 | 55 | print pred_y[:10], len(pred_y) 56 | print test_y_original[:10], len(test_y_original) 57 | 58 | correct = 0 59 | for i in range(len(pred_y)): 60 | if pred_y[i]==test_y_original[i] : 61 | correct += 1 62 | print 'accuracy : ', float(correct) / len(pred_y) 63 | if __name__ == '__main__': 64 | main() 65 | 66 | -------------------------------------------------------------------------------- /theano_simple/cnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import caffe 5 | import numpy as np 6 | from scipy.misc import imread, imresize 7 | 8 | class CNN(object): 9 | 10 | def __init__(self, deploy='VGG_ILSVRC_16_layers_deploy.prototxt', model='VGG_ILSVRC_16_layers.caffemodel'): 11 | caffe.set_mode_gpu() 12 | self.net = caffe.Net(deploy, model, caffe.TEST) 13 | 14 | #if model.startswith('VGG'): 15 | self.mean = np.array([103.939, 116.779, 123.68]) 16 | 17 | 18 | def get_batch_features(self, in_data, net, layer): 19 | out = net.forward(blobs=[layer], **{net.inputs[0]: in_data}) 20 | features = out[layer]#.squeeze(axis=(2,3)) 21 | return features 22 | 23 | def get_features(self, filelist, layer='fc7'): 24 | 25 | N, channel, height, width = self.net.blobs[self.net.inputs[0]].data.shape 26 | n_files = len(filelist) 27 | 28 | 29 | if str(layer).startswith('fc'): 
30 | feature = self.net.blobs[layer].data.shape[1] 31 | all_features = np.zeros((n_files, feature)) 32 | else : 33 | feature1, feature2, feature3 = self.net.blobs[layer].data.shape[1], self.net.blobs[layer].data.shape[2], self.net.blobs[layer].data.shape[3] 34 | all_features = np.zeros((n_files, feature1, feature2, feature3)) 35 | 36 | 37 | for i in range(0, n_files, N): 38 | in_data = np.zeros((N, channel, height, width), dtype=np.float32) 39 | 40 | batch_range = range(i, min(i+N, n_files)) 41 | batch_filelist = [filelist[j] for j in batch_range] 42 | 43 | batch_images = np.zeros((len(batch_range), 3, height, width)) 44 | for j,file in enumerate(batch_filelist): 45 | im = imread(file) 46 | if len(im.shape) == 2: 47 | im = np.tile(im[:,:,np.newaxis], (1,1,3)) 48 | im = im[:,:,(2,1,0)] # RGB -> BGR 49 | im = im - self.mean # mean subtraction 50 | im = imresize(im, (height, width), 'bicubic') # resize 51 | im = np.transpose(im, (2, 0, 1)) # get channel in correct dimension 52 | batch_images[j,:,:,:] = im 53 | 54 | in_data[0:len(batch_range), :, :, :] = batch_images 55 | 56 | features = self.get_batch_features(in_data, self.net, layer) 57 | 58 | for j in range(len(batch_range)): 59 | all_features[i+j,:] = features[j,:] 60 | 61 | #print 'Done %d/%d files' % (i+len(batch_range), len(filelist)) 62 | 63 | return all_features 64 | -------------------------------------------------------------------------------- /theano_simple/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | from models import RNN 8 | from keras.utils import np_utils 9 | 10 | 11 | def prepare_data(seqs_x, maxlen=None): 12 | lengths_x = [len(s) for s in seqs_x] 13 | 14 | n_samples = len(seqs_x) 15 | if maxlen is None: 16 | maxlen = np.max(lengths_x) + 1 17 | 18 | x = np.zeros((maxlen, n_samples)).astype('int32') 19 | x_mask = np.zeros((maxlen, n_samples)).astype('float32') 20 | 21 | for idx, s_x in enumerate(seqs_x): 22 | x[:lengths_x[idx],idx] = s_x 23 | x_mask[:lengths_x[idx],idx] = 1. 24 | #x_mask[:lengths_x[idx]+1,idx] = 1. 
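    # x is a maxlen x n_samples int32 matrix of padded word indices;
    # x_mask marks real tokens with 1. so the GRU step copies the previous
    # hidden state across padded positions instead of updating it.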
25 | 26 | return x, x_mask 27 | 28 | 29 | 30 | def main(): 31 | os.chdir('/home/seonhoon/Desktop/workspace/ImageQA/data/') 32 | 33 | n_vocab = 12047 34 | y_vocab = 430 35 | dim_word = 1024 36 | dim = 1024 37 | maxlen = 60 38 | 39 | train=pd.read_pickle('train_vgg.pkl') 40 | 41 | 42 | 43 | train_x=[ q for q in train['q'] ] 44 | train_y=[ a[0] for a in train['a'] ] 45 | train_y=np.array(train_y)[:,None] 46 | train_y = np_utils.to_categorical(train_y, y_vocab).astype('int32') 47 | train_x , train_x_mask = prepare_data(train_x, maxlen) 48 | train_x_img = np.array([ img.tolist() for img in train['cnn_feature'] ]).astype('float32') 49 | 50 | 51 | print 'x :', train_x.shape 52 | print 'x_mask:', train_x_mask.shape 53 | print 'x_img:', train_x_img.shape 54 | print 'y : ', train_y.shape 55 | model = RNN(n_vocab, y_vocab, dim_word, dim) 56 | 57 | model.train(train_x, train_x_mask, train_x_img, train_y, batch_size=512, epoch=50, save=15) 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /theano_simple/models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import theano 3 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 4 | import theano.tensor as T 5 | from keras import activations, initializations 6 | 7 | import optimizer 8 | import cPickle 9 | 10 | import numpy as np 11 | 12 | 13 | def save_tparams(tparams, path): 14 | with open(path,'wb') as f: 15 | for params in tparams: 16 | cPickle.dump(params.get_value(), f, protocol=cPickle.HIGHEST_PROTOCOL) 17 | 18 | def load_tparams(tparams, path): 19 | with open(path,'rb') as f: 20 | for i in range(len(tparams)): 21 | tparams[i].set_value(cPickle.load(f)) 22 | return tparams 23 | 24 | def get_minibatch_indices(n, batch_size, shuffle=False): 25 | 26 | idx_list = np.arange(n, dtype="int32") 27 | 28 | if shuffle: 29 | np.random.shuffle(idx_list) 30 | 31 | minibatches = [] 32 | minibatch_start = 0 33 | for i in range(n // batch_size): 34 | minibatches.append(idx_list[minibatch_start: 35 | minibatch_start + batch_size]) 36 | minibatch_start += batch_size 37 | if (minibatch_start != n): 38 | minibatches.append(idx_list[minibatch_start:]) 39 | return minibatches 40 | 41 | class RNN: 42 | def __init__(self, n_vocab, y_vocab, dim_word, dim): 43 | self.n_vocab = n_vocab # 12047 44 | self.y_vocab = y_vocab # 430 45 | self.dim_word = dim_word # 1024 46 | self.dim = dim # 512 47 | 48 | 49 | ### image Embedding 50 | self.W_img_emb = initializations.uniform((4096, self.dim)) 51 | self.b_img_emb = initializations.zero((self.dim)) 52 | 53 | 54 | ### Word Embedding ### 55 | self.W_emb = initializations.uniform((self.n_vocab, self.dim_word)) 56 | 57 | ### enc forward GRU ### 58 | self.W_gru = initializations.uniform((self.dim_word, self.dim * 2)) 59 | self.U_gru = initializations.uniform((self.dim, self.dim * 2)) 60 | self.b_gru = initializations.zero((self.dim * 2)) 61 | self.W_gru_cdd = initializations.uniform((self.dim_word, self.dim)) # cdd : candidate 62 | self.U_gru_cdd = initializations.uniform((self.dim, self.dim)) 63 | self.b_gru_cdd = initializations.zero((self.dim)) 64 | ### prediction ### 65 | self.W_pred = initializations.uniform((self.dim, self.y_vocab)) 66 | self.b_pred = initializations.zero((self.y_vocab)) 67 | 68 | 69 | self.params = [self.W_img_emb, self.b_img_emb, 70 | self.W_emb, 71 | self.W_gru, self.U_gru, self.b_gru, 72 | self.W_gru_cdd, self.U_gru_cdd, self.b_gru_cdd, 73 | 
self.W_pred, self.b_pred]
74 | 
75 |     def gru_layer(self, state_below, init_state, mask=None):
76 |         #state_below : step * sample * dim
77 |         nsteps = state_below.shape[0]
78 |         n_samples = state_below.shape[1]
79 | 
80 |         dim = self.dim
81 | 
82 |         def _slice(_x, n, dim):
83 |             if _x.ndim == 3:
84 |                 print '_x.ndim : ' , _x.ndim
85 |                 return _x[:, :, n*dim:(n+1)*dim]
86 |             return _x[:, n*dim:(n+1)*dim]
87 | 
88 |         # step * samples * dim
89 |         state_below_ = T.dot(state_below, self.W_gru) + self.b_gru
90 |         state_belowx = T.dot(state_below, self.W_gru_cdd) + self.b_gru_cdd
91 | 
92 |         def _step(m_, x_, xx_, h_, U, Ux):
93 |             '''
94 |             m_ : (samples,)
95 |             x_, h_ : samples * dimensions
96 |             '''
97 |             preact = T.dot(h_, U)
98 |             preact += x_ # samples * 1024
99 | 
100 |             r = T.nnet.sigmoid(_slice(preact, 0, dim))
101 |             u = T.nnet.sigmoid(_slice(preact, 1, dim))
102 | 
103 |             preactx = T.dot(h_, Ux)
104 |             preactx = preactx * r
105 |             preactx = preactx + xx_ # samples * 512
106 | 
107 |             h = T.tanh(preactx)
108 | 
109 |             h = u * h_ + (1. - u) * h
110 |             h = m_[:,None] * h + (1. - m_)[:,None] * h_ # m_[:,None] : samples * 1
111 | 
112 |             return h#, r, u, preact, preactx
113 |         seqs = [mask, state_below_, state_belowx]
114 | 
115 |         rval, updates = theano.scan(_step,
116 |                                     sequences=seqs,
117 |                                     outputs_info = [init_state], #T.alloc(0., n_samples, dim)],
118 |                                     non_sequences = [self.U_gru, self.U_gru_cdd],
119 |                                     name='gru_layer',
120 |                                     n_steps=nsteps)
121 |         return rval
122 | 
123 | 
124 |     def build_model(self, lr=0.001):
125 | 
126 |         trng = RandomStreams(1234)
127 |         use_noise = theano.shared(np.float32(0.))
128 | 
129 |         # description string: #words x #samples
130 | 
131 | 
132 |         x = T.matrix('x', dtype = 'int32')
133 |         x_mask = T.matrix('x_mask', dtype='float32')
134 |         y = T.matrix('y', dtype = 'int32')
135 |         img = T.matrix('img', dtype = 'float32')
136 | 
137 |         n_timesteps = x.shape[0]
138 |         n_samples = x.shape[1]
139 | 
140 |         init_state = T.dot(img, self.W_img_emb) + self.b_img_emb
141 |         emb = self.W_emb[x.flatten()]
142 | 
143 |         emb = emb.reshape([n_timesteps, n_samples, self.dim_word])
144 |         # proj : the sequence of GRU hidden states
145 |         proj = self.gru_layer(emb, init_state, mask=x_mask)
146 | 
147 | 
148 |         # masked mean over the hidden states (alternative)
149 |         #proj = (proj * x_mask[:, :, None]).sum(axis=0)
150 |         #proj = proj / x_mask.sum(axis=0)[:, None] # sample * dim
151 | 
152 |         # last hidden state
153 |         proj = proj[-1] # sample * dim
154 | 
155 | 
156 | 
157 | 
158 |         output = T.dot(proj, self.W_pred) + self.b_pred
159 | 
160 |         probs = T.nnet.softmax(output)
161 |         prediction = probs.argmax(axis=1)
162 |         cost = T.nnet.categorical_crossentropy(probs, y)
163 |         cost = T.mean(cost)
164 | 
165 |         updates = optimizer.adam(cost=cost, params=self.params, lr=lr)
166 | 
167 |         return x, x_mask, img, y, cost, updates, prediction
168 | 
169 | 
170 | 
171 | 
172 |     def train(self, train_x, train_mask_x, train_img, train_y,
173 |               lr=0.001,
174 |               batch_size=16,
175 |               epoch=100,
176 |               save=None):
177 | 
178 | 
179 |         n_train = train_x.shape[1]
180 | 
181 |         x, x_mask, img, y, cost, updates, prediction = self.build_model(lr)
182 |         # x : step * sample * dim
183 |         # x_mask : step * sample
184 |         # y : sample * emb
185 | 
186 | 
187 |         train_model = theano.function(inputs=[x, x_mask, img, y],
188 |                                       outputs=cost,
189 |                                       updates=updates)
190 | 
191 | 
192 |         #valid_batch_indices = self.get_minibatch_indices(valid_x.shape[1], batch_size)
193 |         for i in xrange(epoch):
194 | 
195 |             batch_indices=get_minibatch_indices(n_train, batch_size, shuffle=True)
196 | 
197 |             for j, indices in enumerate(batch_indices):
198 | 
199 |                 x = [ train_x[:,t] for t
in indices] 200 | x = np.transpose(x) 201 | x_mask = [ train_mask_x[:,t] for t in indices] 202 | x_mask = np.transpose(x_mask) 203 | y = [ train_y[t,:] for t in indices] 204 | y = np.array(y) 205 | img = [ train_img[t,:] for t in indices] 206 | img = np.array(img) 207 | 208 | minibatch_avg_cost = train_model(x, x_mask, img, y) 209 | print 'cost : ' , minibatch_avg_cost, ' [ mini batch \'', j+1, '\' in epoch \'', (i+1) ,'\' ]' 210 | 211 | # if valid is not None: 212 | # if (i+1) % valid == 0: 213 | # valid_accuracy = self.pred_accuracy(f_prediction, prepare_data, valid, kf_valid) 214 | if save is not None: 215 | if (i+1) % save == 0: 216 | print 'save param..', 217 | save_tparams(self.params, 'model.pkl') 218 | print 'Done' 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | def prediction(self, test_x, test_mask_x, test_img, test_y, 228 | # valid_x=None, valid_mask_x=None, 229 | #valid_y=None, valid_mask_y=None, 230 | # optimizer=None, 231 | lr=0.001, 232 | batch_size=16, 233 | epoch=100, 234 | save=None): 235 | 236 | load_tparams(self.params, 'model.pkl') 237 | test_shared_x = theano.shared(np.asarray(test_x, dtype='int32'), borrow=True) 238 | #test_shared_y = theano.shared(np.asarray(test_y, dtype='int32'), borrow=True) 239 | test_shared_mask_x = theano.shared(np.asarray(test_mask_x, dtype='float32'), borrow=True) 240 | test_shared_img = theano.shared(np.asarray(test_img, dtype='float32'), borrow=True) 241 | 242 | n_test = test_x.shape[1] 243 | n_test_batches = int(np.ceil(1.0 * n_test / batch_size)) 244 | 245 | index = T.lscalar('index') # index to a case 246 | final_index = T.lscalar('final_index') 247 | 248 | 249 | x, x_mask, img, y, _, _, prediction = self.build_model(lr) 250 | 251 | batch_start = index * batch_size 252 | batch_stop = T.minimum(final_index, (index + 1) * batch_size) 253 | 254 | test_model = theano.function(inputs=[index, final_index], 255 | outputs=prediction, 256 | givens={ 257 | x: test_shared_x[:,batch_start:batch_stop], 258 | x_mask: test_shared_mask_x[:,batch_start:batch_stop], 259 | img: test_shared_img[batch_start:batch_stop,:]}) 260 | 261 | 262 | prediction = [] 263 | for minibatch_idx in xrange(n_test_batches): 264 | print minibatch_idx, ' / ' , n_test_batches 265 | prediction += test_model(minibatch_idx, n_test).tolist() 266 | return prediction -------------------------------------------------------------------------------- /theano_simple/optimizer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import theano 4 | import theano.tensor as T 5 | 6 | import numpy as np 7 | 8 | 9 | def sgd(cost, params, lr ): 10 | grads = T.grad(cost, params) 11 | updates = [] 12 | for param, grad in zip(params, grads): 13 | updates.append((param, param - lr*grad)) 14 | 15 | return updates 16 | 17 | def adam( cost, params, lr=0.001, b1=0.9, b2=0.999, e=1e-8): 18 | updates = [] 19 | grads = T.grad(cost, params) 20 | i = theano.shared(np.float32(0.)) 21 | i_t = i + 1. 22 | fix1 = 1. - (1. - b1)**i_t 23 | fix2 = 1. - (1. - b2)**i_t 24 | lr_t = lr * (T.sqrt(fix2) / fix1) 25 | for p, g in zip(params, grads): 26 | m = theano.shared(p.get_value() * 0.) 27 | v = theano.shared(p.get_value() * 0.) 28 | m_t = (b1 * g) + ((1. - b1) * m) 29 | v_t = (b2 * T.sqr(g)) + ((1. 
- b2) * v) 30 | g_t = m_t / (T.sqrt(v_t) + e) 31 | p_t = p - (lr_t * g_t) 32 | updates.append((m, m_t)) 33 | updates.append((v, v_t)) 34 | updates.append((p, p_t)) 35 | updates.append((i, i_t)) 36 | return updates -------------------------------------------------------------------------------- /theano_simple/prediction.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | import os 5 | import pandas as pd 6 | import numpy as np 7 | from models import RNN 8 | from keras.utils import np_utils 9 | 10 | def prepare_data(seqs_x, maxlen=None): 11 | lengths_x = [len(s) for s in seqs_x] 12 | 13 | n_samples = len(seqs_x) 14 | if maxlen is None: 15 | maxlen = np.max(lengths_x) + 1 16 | 17 | x = np.zeros((maxlen, n_samples)).astype('int32') 18 | x_mask = np.zeros((maxlen, n_samples)).astype('float32') 19 | 20 | for idx, s_x in enumerate(seqs_x): 21 | x[:lengths_x[idx],idx] = s_x 22 | x_mask[:lengths_x[idx],idx] = 1. 23 | #x_mask[:lengths_x[idx]+1,idx] = 1. 24 | 25 | return x, x_mask 26 | 27 | def main(): 28 | 29 | os.chdir('/home/seonhoon/Desktop/workspace/ImageQA/data/') 30 | 31 | n_vocab = 12047 32 | y_vocab = 430 33 | dim_word = 1024 34 | dim = 1024 35 | maxlen = 60 36 | 37 | test=pd.read_pickle('test_vgg.pkl') 38 | 39 | test_x=[ q for q in test['q'] ] 40 | test_y=[ a[0] for a in test['a'] ] 41 | test_y_original=test_y 42 | test_y=np.array(test_y)[:,None] 43 | test_y = np_utils.to_categorical(test_y, y_vocab) 44 | test_x , test_x_mask = prepare_data(test_x, maxlen) 45 | test_x_img = [ img.tolist() for img in test['cnn_feature'] ] 46 | 47 | 48 | print 'x :', test_x.shape 49 | print 'x_mask:', test_x_mask.shape 50 | print 'y : ', test_y.shape 51 | model = RNN(n_vocab, y_vocab, dim_word, dim) 52 | 53 | pred_y = model.prediction(test_x, test_x_mask, test_x_img, test_y, batch_size=2048) 54 | 55 | print pred_y[:10], len(pred_y) 56 | print test_y_original[:10], len(test_y_original) 57 | 58 | correct = 0 59 | for i in range(len(pred_y)): 60 | if pred_y[i]==test_y_original[i] : 61 | correct += 1 62 | print 'accuracy : ', float(correct) / len(pred_y) 63 | if __name__ == '__main__': 64 | main() --------------------------------------------------------------------------------
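For reference, a minimal single-parameter NumPy sketch of the update rule that adam() in both optimizer.py files implements (illustrative only, not part of the repo; note that b1 and b2 here weight the incoming gradient, so they act as the complements of the usual Adam decay rates, and fix1/fix2 are the bias corrections consistent with that convention):

import numpy as np

def adam_step(p, g, m, v, i, lr=0.001, b1=0.9, b2=0.999, e=1e-8):
    # mirrors optimizer.adam: m_t/v_t weight the new gradient by b1/b2,
    # fix1/fix2 are the matching bias corrections, and lr_t rescales lr
    i_t = i + 1.
    fix1 = 1. - (1. - b1) ** i_t
    fix2 = 1. - (1. - b2) ** i_t
    lr_t = lr * (np.sqrt(fix2) / fix1)
    m_t = (b1 * g) + ((1. - b1) * m)
    v_t = (b2 * g ** 2) + ((1. - b2) * v)
    p_t = p - lr_t * m_t / (np.sqrt(v_t) + e)
    return p_t, m_t, v_t, i_t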