├── .gitignore
├── CNN Deep Residual
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception (char)
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception (char+word)
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception (word)
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception + Bottleneck
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception + Gate (tanh)
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception + Gate
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception + Residual
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Non-Inception
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── LSTM (baseline)
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── README.md
├── fastText (char+word)
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
└── fastText (word)
    ├── config.py
    ├── qlstm_solver.prototxt
    ├── train_att_bc.py
    ├── visualize_tools.py
    ├── vqa_data_provider_layer.py
    └── write_to_log.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

.DS_Store
--------------------------------------------------------------------------------
/CNN Deep Residual/config.py:
--------------------------------------------------------------------------------
GPU_ID = 9
BATCH_SIZE = 32
VAL_BATCH_SIZE = 32
NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
#MAX_WORDS_IN_QUESTION = 22
MAX_CHARS_IN_QUESTION = 100
MAX_ITERATIONS = 1000000
PRINT_INTERVAL = 1000
VALIDATE_INTERVAL = 110000 # We train on 'train' and test on 'val'. Set this to the total number of training iterations so validation runs once at the end; that validation accuracy is then the test accuracy.

# what data to use for training
TRAIN_DATA_SPLITS = 'train'

# what data to use for the vocabulary
QUESTION_VOCAB_SPACE = 'train'
ANSWER_VOCAB_SPACE = 'train'

# vqa tools - get from https://github.com/VT-vision-lab/VQA
VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools'
VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools'

# location of the data
VQA_PREFIX = '/tempspace/zwang6/VQA/'
GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/'
DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/'

DATA_PATHS = {
    'train': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_'
    },
    'val': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_'
    },
    'test-dev': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    'test': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    # TODO it would be nice if genome also followed the same file format as vqa
    'genome': {
        'genome_file': GENOME_PREFIX + '/question_answers_prepro.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/'
    }
}
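
A quick orientation on how this config is consumed (a minimal sketch; the '/tempspace' paths above are site-specific, and the 'questions' key follows the layout of the official VQA v1 question JSON files):

    import json
    import config

    with open(config.DATA_PATHS['val']['ques_file']) as f:
        ques = json.load(f)
    print len(ques['questions'])  # Python 2 print, matching the rest of this repo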
--------------------------------------------------------------------------------
/CNN Deep Residual/qlstm_solver.prototxt:
--------------------------------------------------------------------------------
# The train/test net protocol buffer definition
train_net: "./result/proto_train.prototxt"
test_net: "./result/proto_test.prototxt"

max_iter: 1000000
display: 5000
snapshot: 5000
snapshot_prefix: "./result/"

# The base learning rate, momentum and the weight decay of the network.
solver_type: ADAM
base_lr: 0.0007
momentum: 0.9
momentum2: 0.999
weight_decay: 0.000
lr_policy: "fixed"
test_iter: 1
test_interval: 10000000

# accumulate gradients
iter_size: 2
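
For reference, this solver file is consumed from pycaffe exactly as in train_att_bc.py; a minimal sketch, assuming the repo's Caffe build. Note that with BATCH_SIZE = 32 in config.py and iter_size: 2 here, gradients are accumulated over two mini-batches, so each Adam update sees an effective batch of 64.

    import caffe
    import config

    caffe.set_device(config.GPU_ID)
    caffe.set_mode_gpu()
    solver = caffe.get_solver('./qlstm_solver.prototxt')
    solver.step(1)  # one update = two accumulated forward/backward passes (iter_size: 2)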
--------------------------------------------------------------------------------
/CNN Deep Residual/visualize_tools.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import json
import re
import shutil
from PIL import Image
from PIL import ImageFont, ImageDraw

import caffe
from caffe import layers as L
from caffe import params as P

from vqa_data_provider_layer import VQADataProvider
from vqa_data_provider_layer import VQADataProviderLayer

import config
sys.path.append(config.VQA_TOOLS_PATH)
sys.path.append(config.VQA_EVAL_TOOLS_PATH)

from vqaTools.vqa import VQA
from vqaEvaluation.vqaEval import VQAEval

from write_to_log import write_log

def visualize_failures(stat_list, mode):

    def save_qtype(qtype_list, save_filename, mode):

        if mode == 'val':
            savepath = os.path.join('./eval', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014'
        elif mode == 'test-dev':
            savepath = os.path.join('./test-dev', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        elif mode == 'test':
            savepath = os.path.join('./test', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        else:
            raise Exception('Unsupported mode')
        if os.path.exists(savepath): shutil.rmtree(savepath)
        if not os.path.exists(savepath): os.makedirs(savepath)

        for qt in qtype_list:
            count = 0
            for t_question in stat_list:
                #print count, t_question
                if count < 40/len(qtype_list):
                    t_question_list = t_question['q_list']
                    # save only questions whose first two words match this question type
                    if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
                        saveflag = True
                    else:
                        saveflag = False

                    if saveflag:
                        t_iid = t_question['iid']
                        if mode == 'val':
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
                        elif mode in ('test-dev', 'test'):
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))

                        # for caption
                        #print t_iid
                        #annIds = caps.getAnnIds(t_iid)
                        #anns = caps.loadAnns(annIds)
                        #cap_list = [ann['caption'] for ann in anns]
                        ans_list = t_question['ans_list']
                        draw = ImageDraw.Draw(t_img)
                        for i in range(len(ans_list)):
                            try:
                                draw.text((10, 10*i), str(ans_list[i]))
                            except Exception:
                                pass

                        ans = t_question['answer']
                        pred = t_question['pred']
                        if ans == -1:
                            pre = ''
                        elif ans == pred:
                            pre = 'correct '
                        else:
                            pre = 'failure '
                        #print ' aaa ', ans, pred
                        ans = re.sub('/', ' ', str(ans))
                        pred = re.sub('/', ' ', str(pred))
                        img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
                            str(ans) + ' p_' + str(pred) + '.png'
                        count += 1
                        write_log(os.path.join(savepath, img_title), 'visualize_log.txt')
                        t_img.save(os.path.join(savepath, img_title))

    write_log('saving colors', 'visualize_log.txt')
    qt_color_list = [['what','color']]
    save_qtype(qt_color_list, 'colors', mode)

    write_log('saving what is', 'visualize_log.txt')
    qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
    save_qtype(qt_whatis_list, 'whatis', mode)

    write_log('saving is', 'visualize_log.txt')
    qt_is_list = [['is','the'], ['is','this'],['is','there']]
    save_qtype(qt_is_list, 'is', mode)

    write_log('saving how many', 'visualize_log.txt')
    qt_howmany_list = [['how','many']]
    save_qtype(qt_howmany_list, 'howmany', mode)

def exec_validation(device_id, mode, it='', visualize=False):

    caffe.set_device(device_id)
    caffe.set_mode_gpu()
    net = caffe.Net('./result/proto_test.prototxt',\
                    './result/tmp.caffemodel',\
                    caffe.TEST)

    dp = VQADataProvider(mode=mode, batchsize=config.VAL_BATCH_SIZE)
    total_questions = len(dp.getQuesIds())
    epoch = 0

    pred_list = []
    testloss_list = []
    stat_list = []

    while epoch == 0:
        t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
        net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0))
        net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0))
        net.blobs['img_feature'].data[...] = t_img_feature
        net.blobs['label'].data[...] = t_answer
        #net.blobs['glove'].data[...] = t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2))
        net.forward()
        t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
        t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
        testloss_list.append(net.blobs['loss'].data)
        for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
            pred_list.append({u'answer': pred, u'question_id': int(dp.getStrippedQuesId(qid))})
            if visualize:
                q_list = dp.seq_to_list(dp.getQuesStr(qid))
                if mode in ('test-dev', 'test'):
                    ans_str = ''
                    ans_list = ['']*10
                else:
                    ans_str = dp.vec_to_answer(ans)
                    ans_list = [dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
                stat_list.append({\
                    'qid': qid,
                    'q_list': q_list,
                    'iid': iid,
                    'answer': ans_str,
                    'ans_list': ans_list,
                    'pred': pred})
        #percent = 100 * float(len(pred_list)) / total_questions
        #sys.stdout.write('\r' + ('%.2f' % percent) + '%')
        #sys.stdout.flush()

    mean_testloss = np.array(testloss_list).mean()

    if mode == 'val':
        valFile = './result/val2014_resfile'
        with open(valFile, 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)
        annFile = config.DATA_PATHS['val']['ans_file']
        quesFile = config.DATA_PATHS['val']['ques_file']
        vqa = VQA(annFile, quesFile)
        vqaRes = vqa.loadRes(valFile, quesFile)
        vqaEval = VQAEval(vqa, vqaRes, n=2)
        vqaEval.evaluate()
        acc_overall = vqaEval.accuracy['overall']
        acc_perQuestionType = vqaEval.accuracy['perQuestionType']
        acc_perAnswerType = vqaEval.accuracy['perAnswerType']
        return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
    elif mode == 'test-dev':
        filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results'
        with open(filename+'.json', 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)
    elif mode == 'test':
        filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results'
        with open(filename+'.json', 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)

def drawgraph(results, save_question_type_graphs=False):
    # 0:it
    # 1:trainloss
    # 2:testloss
    # 3:oa_acc
    # 4:qt_acc
    # 5:at_acc

    # training curve
    it = np.array([l[0] for l in results])
    loss = np.array([l[1] for l in results])
    valloss = np.array([l[2] for l in results])
    valacc = np.array([l[3] for l in results])

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax2 = ax1.twinx()

    ax1.plot(it, loss, color='blue', label='train loss')
    ax1.plot(it, valloss, '--', color='blue', label='test loss')
    ax2.plot(it, valacc, color='red', label='acc on val')
    plt.legend(loc='lower left')

    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Loss Value')
    ax2.set_ylabel('Accuracy on Val [%]')

    plt.savefig('./learning_curve max_%2.2f.png' % valacc.max())
    plt.clf()
    plt.close("all")

    # question type
    it = np.array([l[0] for l in results])
    oa_acc = np.array([l[3] for l in results])
    qt_dic_list = [l[4] for l in results]

    def draw_qt_acc(target_key_list, figname):
        fig = plt.figure()
        for k in target_key_list:
            write_log(str(k) + str(type(k)), 'visualize_log.txt')
            t_val = np.array([qt_dic[k] for qt_dic in qt_dic_list])
            plt.plot(it, t_val, label=str(k))
        plt.legend(fontsize='small')
        plt.ylim(0, 100.)
        #plt.legend(prop={'size':6})

        plt.xlabel('Iterations')
        plt.ylabel('Accuracy on Val [%]')

        plt.savefig(figname, dpi=200)
        plt.clf()
        plt.close("all")

    if save_question_type_graphs:
        s_keys = sorted(qt_dic_list[0].keys())
        draw_qt_acc(s_keys[0:13]+[s_keys[31],], './ind_qt_are.png')
        draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
        draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
        draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
        draw_qt_acc(['what color is the','what color are the','what color is',\
            'what color','what is the color of the'], './qt_color.png')
        draw_qt_acc(['how many','how','how many people are',\
            'how many people are in'], './qt_number.png')
        draw_qt_acc(['who is','why','why is the','where is the','where are the',\
            'which'], './qt_who_why_where_which.png')
        draw_qt_acc(['what is the man','is the man','are they','is he',\
            'is the woman','is this person','what is the woman','is the person',\
            'what is the person'], './qt_human.png')
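
The 'val' branch of exec_validation above delegates scoring to the official VQA evaluation tools. A minimal standalone sketch of the same flow, assuming a result file in the format written above (a JSON list of {'answer', 'question_id'} dicts):

    import sys
    import config
    sys.path.append(config.VQA_TOOLS_PATH)
    sys.path.append(config.VQA_EVAL_TOOLS_PATH)
    from vqaTools.vqa import VQA
    from vqaEvaluation.vqaEval import VQAEval

    vqa = VQA(config.DATA_PATHS['val']['ans_file'], config.DATA_PATHS['val']['ques_file'])
    vqaRes = vqa.loadRes('./result/val2014_resfile', config.DATA_PATHS['val']['ques_file'])
    vqaEval = VQAEval(vqa, vqaRes, n=2)  # n: decimal precision of reported accuracies
    vqaEval.evaluate()
    print vqaEval.accuracy['overall']    # plus 'perQuestionType' / 'perAnswerType' breakdowns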
--------------------------------------------------------------------------------
/CNN Deep Residual/write_to_log.py:
--------------------------------------------------------------------------------
def write_log(text, filename):  # renamed from 'str' to avoid shadowing the builtin
    with open(filename, 'a') as f:
        f.write(text + "\n")
--------------------------------------------------------------------------------
/CNN Inception (char)/config.py:
--------------------------------------------------------------------------------
GPU_ID = 10
BATCH_SIZE = 32
VAL_BATCH_SIZE = 32
NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
# MAX_WORDS_IN_QUESTION = 22
MAX_CHARS_IN_QUESTION = 100
MAX_ITERATIONS = 1000000
PRINT_INTERVAL = 1000
VALIDATE_INTERVAL = 150000 # We train on 'train' and test on 'val'. Set this to the total number of training iterations so validation runs once at the end; that validation accuracy is then the test accuracy.

# what data to use for training
TRAIN_DATA_SPLITS = 'train'

# what data to use for the vocabulary
QUESTION_VOCAB_SPACE = 'train'
ANSWER_VOCAB_SPACE = 'train'

# vqa tools - get from https://github.com/VT-vision-lab/VQA
VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools'
VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools'

# location of the data
VQA_PREFIX = '/tempspace/zwang6/VQA/'
GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/'
DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/'

DATA_PATHS = {
    'train': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_'
    },
    'val': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_'
    },
    'test-dev': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    'test': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    # TODO it would be nice if genome also followed the same file format as vqa
    'genome': {
        'genome_file': GENOME_PREFIX + '/question_answers_prepro.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/'
    }
}
--------------------------------------------------------------------------------
/CNN Inception (char)/qlstm_solver.prototxt:
--------------------------------------------------------------------------------
# The train/test net protocol buffer definition
train_net: "./result/proto_train.prototxt"
test_net: "./result/proto_test.prototxt"

max_iter: 1000000
display: 5000
snapshot: 5000
snapshot_prefix: "./result/"

# The base learning rate, momentum and the weight decay of the network.
solver_type: ADAM
base_lr: 0.0007
momentum: 0.9
momentum2: 0.999
weight_decay: 0.000
lr_policy: "fixed"
test_iter: 1
test_interval: 10000000

# accumulate gradients
iter_size: 2
--------------------------------------------------------------------------------
/CNN Inception (char)/visualize_tools.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import json
import re
import shutil
from PIL import Image
from PIL import ImageFont, ImageDraw

import caffe
from caffe import layers as L
from caffe import params as P

from vqa_data_provider_layer import VQADataProvider
from vqa_data_provider_layer import VQADataProviderLayer

import config
sys.path.append(config.VQA_TOOLS_PATH)
sys.path.append(config.VQA_EVAL_TOOLS_PATH)

from vqaTools.vqa import VQA
from vqaEvaluation.vqaEval import VQAEval

from write_to_log import write_log

def visualize_failures(stat_list, mode):

    def save_qtype(qtype_list, save_filename, mode):

        if mode == 'val':
            savepath = os.path.join('./eval', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014'
        elif mode == 'test-dev':
            savepath = os.path.join('./test-dev', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        elif mode == 'test':
            savepath = os.path.join('./test', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        else:
            raise Exception('Unsupported mode')
        if os.path.exists(savepath): shutil.rmtree(savepath)
        if not os.path.exists(savepath): os.makedirs(savepath)

        for qt in qtype_list:
            count = 0
            for t_question in stat_list:
                #print count, t_question
                if count < 40/len(qtype_list):
                    t_question_list = t_question['q_list']
                    # save only questions whose first two words match this question type
                    if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
                        saveflag = True
                    else:
                        saveflag = False

                    if saveflag:
                        t_iid = t_question['iid']
                        if mode == 'val':
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
                        elif mode in ('test-dev', 'test'):
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))

                        # for caption
                        #print t_iid
                        #annIds = caps.getAnnIds(t_iid)
                        #anns = caps.loadAnns(annIds)
                        #cap_list = [ann['caption'] for ann in anns]
                        ans_list = t_question['ans_list']
                        draw = ImageDraw.Draw(t_img)
                        for i in range(len(ans_list)):
                            try:
                                draw.text((10, 10*i), str(ans_list[i]))
                            except Exception:
                                pass

                        ans = t_question['answer']
                        pred = t_question['pred']
                        if ans == -1:
                            pre = ''
                        elif ans == pred:
                            pre = 'correct '
                        else:
                            pre = 'failure '
                        #print ' aaa ', ans, pred
                        ans = re.sub('/', ' ', str(ans))
                        pred = re.sub('/', ' ', str(pred))
                        img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
                            str(ans) + ' p_' + str(pred) + '.png'
                        count += 1
                        write_log(os.path.join(savepath, img_title), 'visualize_log.txt')
                        t_img.save(os.path.join(savepath, img_title))

    write_log('saving colors', 'visualize_log.txt')
    qt_color_list = [['what','color']]
    save_qtype(qt_color_list, 'colors', mode)

    write_log('saving what is', 'visualize_log.txt')
    qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
    save_qtype(qt_whatis_list, 'whatis', mode)

    write_log('saving is', 'visualize_log.txt')
    qt_is_list = [['is','the'], ['is','this'],['is','there']]
    save_qtype(qt_is_list, 'is', mode)

    write_log('saving how many', 'visualize_log.txt')
    qt_howmany_list = [['how','many']]
    save_qtype(qt_howmany_list, 'howmany', mode)

def exec_validation(device_id, mode, it='', visualize=False):

    caffe.set_device(device_id)
    caffe.set_mode_gpu()
    net = caffe.Net('./result/proto_test.prototxt',\
                    './result/tmp.caffemodel',\
                    caffe.TEST)

    dp = VQADataProvider(mode=mode, batchsize=config.VAL_BATCH_SIZE)
    total_questions = len(dp.getQuesIds())
    epoch = 0

    pred_list = []
    testloss_list = []
    stat_list = []

    while epoch == 0:
        t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
        net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0))
        net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0))
        net.blobs['img_feature'].data[...] = t_img_feature
        net.blobs['label'].data[...] = t_answer
        #net.blobs['glove'].data[...] = t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2))
        net.forward()
        t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
        t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
        testloss_list.append(net.blobs['loss'].data)
        for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
            pred_list.append({u'answer': pred, u'question_id': int(dp.getStrippedQuesId(qid))})
            if visualize:
                q_list = dp.seq_to_list(dp.getQuesStr(qid))
                if mode in ('test-dev', 'test'):
                    ans_str = ''
                    ans_list = ['']*10
                else:
                    ans_str = dp.vec_to_answer(ans)
                    ans_list = [dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
                stat_list.append({\
                    'qid': qid,
                    'q_list': q_list,
                    'iid': iid,
                    'answer': ans_str,
                    'ans_list': ans_list,
                    'pred': pred})
        #percent = 100 * float(len(pred_list)) / total_questions
        #sys.stdout.write('\r' + ('%.2f' % percent) + '%')
        #sys.stdout.flush()

    mean_testloss = np.array(testloss_list).mean()

    if mode == 'val':
        valFile = './result/val2014_resfile'
        with open(valFile, 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)
        annFile = config.DATA_PATHS['val']['ans_file']
        quesFile = config.DATA_PATHS['val']['ques_file']
        vqa = VQA(annFile, quesFile)
        vqaRes = vqa.loadRes(valFile, quesFile)
        vqaEval = VQAEval(vqa, vqaRes, n=2)
        vqaEval.evaluate()
        acc_overall = vqaEval.accuracy['overall']
        acc_perQuestionType = vqaEval.accuracy['perQuestionType']
        acc_perAnswerType = vqaEval.accuracy['perAnswerType']
        return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
    elif mode == 'test-dev':
        filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results'
        with open(filename+'.json', 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)
    elif mode == 'test':
        filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results'
        with open(filename+'.json', 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)

def drawgraph(results, save_question_type_graphs=False):
    # 0:it
    # 1:trainloss
    # 2:testloss
    # 3:oa_acc
    # 4:qt_acc
    # 5:at_acc

    # training curve
    it = np.array([l[0] for l in results])
    loss = np.array([l[1] for l in results])
    valloss = np.array([l[2] for l in results])
    valacc = np.array([l[3] for l in results])

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax2 = ax1.twinx()

    ax1.plot(it, loss, color='blue', label='train loss')
    ax1.plot(it, valloss, '--', color='blue', label='test loss')
    ax2.plot(it, valacc, color='red', label='acc on val')
    plt.legend(loc='lower left')

    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Loss Value')
    ax2.set_ylabel('Accuracy on Val [%]')

    plt.savefig('./learning_curve max_%2.2f.png' % valacc.max())
    plt.clf()
    plt.close("all")

    # question type
    it = np.array([l[0] for l in results])
    oa_acc = np.array([l[3] for l in results])
    qt_dic_list = [l[4] for l in results]

    def draw_qt_acc(target_key_list, figname):
        fig = plt.figure()
        for k in target_key_list:
            write_log(str(k) + str(type(k)), 'visualize_log.txt')
            t_val = np.array([qt_dic[k] for qt_dic in qt_dic_list])
            plt.plot(it, t_val, label=str(k))
        plt.legend(fontsize='small')
        plt.ylim(0, 100.)
        #plt.legend(prop={'size':6})

        plt.xlabel('Iterations')
        plt.ylabel('Accuracy on Val [%]')

        plt.savefig(figname, dpi=200)
        plt.clf()
        plt.close("all")

    if save_question_type_graphs:
        s_keys = sorted(qt_dic_list[0].keys())
        draw_qt_acc(s_keys[0:13]+[s_keys[31],], './ind_qt_are.png')
        draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
        draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
        draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
        draw_qt_acc(['what color is the','what color are the','what color is',\
            'what color','what is the color of the'], './qt_color.png')
        draw_qt_acc(['how many','how','how many people are',\
            'how many people are in'], './qt_number.png')
        draw_qt_acc(['who is','why','why is the','where is the','where are the',\
            'which'], './qt_who_why_where_which.png')
        draw_qt_acc(['what is the man','is the man','are they','is he',\
            'is the woman','is this person','what is the woman','is the person',\
            'what is the person'], './qt_human.png')
--------------------------------------------------------------------------------
/CNN Inception (char)/write_to_log.py:
--------------------------------------------------------------------------------
def write_log(text, filename):  # renamed from 'str' to avoid shadowing the builtin
    with open(filename, 'a') as f:
        f.write(text + "\n")
--------------------------------------------------------------------------------
/CNN Inception (char+word)/config.py:
--------------------------------------------------------------------------------
GPU_ID = 7
BATCH_SIZE = 32
VAL_BATCH_SIZE = 32
NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
MAX_WORDS_IN_QUESTION = 22
LENGTH_OF_LONGEST_WORD = 17
#MAX_CHARS_IN_QUESTION = 100
MAX_ITERATIONS = 1000000
PRINT_INTERVAL = 1000
VALIDATE_INTERVAL = 160000 # We train on 'train' and test on 'val'. Set this to the total number of training iterations so validation runs once at the end; that validation accuracy is then the test accuracy.

# what data to use for training
TRAIN_DATA_SPLITS = 'train'

# what data to use for the vocabulary
QUESTION_VOCAB_SPACE = 'train'
ANSWER_VOCAB_SPACE = 'train'

# vqa tools - get from https://github.com/VT-vision-lab/VQA
VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools'
VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools'

# location of the data
VQA_PREFIX = '/tempspace/zwang6/VQA/'
GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/'
DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/'

DATA_PATHS = {
    'train': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_'
    },
    'val': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_'
    },
    'test-dev': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    'test': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    # TODO it would be nice if genome also followed the same file format as vqa
    'genome': {
        'genome_file': GENOME_PREFIX + '/question_answers_prepro.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/'
    }
}
--------------------------------------------------------------------------------
/CNN Inception (char+word)/qlstm_solver.prototxt:
--------------------------------------------------------------------------------
# The train/test net protocol buffer definition
train_net: "./result/proto_train.prototxt"
test_net: "./result/proto_test.prototxt"

max_iter: 1000000
display: 5000
snapshot: 5000
snapshot_prefix: "./result/"

# The base learning rate, momentum and the weight decay of the network.
solver_type: ADAM
base_lr: 0.0007
momentum: 0.9
momentum2: 0.999
weight_decay: 0.000
lr_policy: "fixed"
test_iter: 1
test_interval: 10000000

# accumulate gradients
iter_size: 2
--------------------------------------------------------------------------------
/CNN Inception (char+word)/visualize_tools.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import json
import re
import shutil
from PIL import Image
from PIL import ImageFont, ImageDraw

import caffe
from caffe import layers as L
from caffe import params as P

from vqa_data_provider_layer import VQADataProvider
from vqa_data_provider_layer import VQADataProviderLayer

import config
sys.path.append(config.VQA_TOOLS_PATH)
sys.path.append(config.VQA_EVAL_TOOLS_PATH)

from vqaTools.vqa import VQA
from vqaEvaluation.vqaEval import VQAEval

from write_to_log import write_log

def visualize_failures(stat_list, mode):

    def save_qtype(qtype_list, save_filename, mode):

        if mode == 'val':
            savepath = os.path.join('./eval', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014'
        elif mode == 'test-dev':
            savepath = os.path.join('./test-dev', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        elif mode == 'test':
            savepath = os.path.join('./test', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        else:
            raise Exception('Unsupported mode')
        if os.path.exists(savepath): shutil.rmtree(savepath)
        if not os.path.exists(savepath): os.makedirs(savepath)

        for qt in qtype_list:
            count = 0
            for t_question in stat_list:
                #print count, t_question
                if count < 40/len(qtype_list):
                    t_question_list = t_question['q_list']
                    # save only questions whose first two words match this question type
                    if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
                        saveflag = True
                    else:
                        saveflag = False

                    if saveflag:
                        t_iid = t_question['iid']
                        if mode == 'val':
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
                        elif mode in ('test-dev', 'test'):
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))

                        # for caption
                        #print t_iid
                        #annIds = caps.getAnnIds(t_iid)
                        #anns = caps.loadAnns(annIds)
                        #cap_list = [ann['caption'] for ann in anns]
                        ans_list = t_question['ans_list']
                        draw = ImageDraw.Draw(t_img)
                        for i in range(len(ans_list)):
                            try:
                                draw.text((10, 10*i), str(ans_list[i]))
                            except Exception:
                                pass

                        ans = t_question['answer']
                        pred = t_question['pred']
                        if ans == -1:
                            pre = ''
                        elif ans == pred:
                            pre = 'correct '
                        else:
                            pre = 'failure '
                        #print ' aaa ', ans, pred
                        ans = re.sub('/', ' ', str(ans))
                        pred = re.sub('/', ' ', str(pred))
                        img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
                            str(ans) + ' p_' + str(pred) + '.png'
                        count += 1
                        write_log(os.path.join(savepath, img_title), 'visualize_log.txt')
                        t_img.save(os.path.join(savepath, img_title))

    write_log('saving colors', 'visualize_log.txt')
    qt_color_list = [['what','color']]
    save_qtype(qt_color_list, 'colors', mode)

    write_log('saving what is', 'visualize_log.txt')
    qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
    save_qtype(qt_whatis_list, 'whatis', mode)

    write_log('saving is', 'visualize_log.txt')
    qt_is_list = [['is','the'], ['is','this'],['is','there']]
    save_qtype(qt_is_list, 'is', mode)

    write_log('saving how many', 'visualize_log.txt')
    qt_howmany_list = [['how','many']]
    save_qtype(qt_howmany_list, 'howmany', mode)

def exec_validation(device_id, mode, it='', visualize=False):

    caffe.set_device(device_id)
    caffe.set_mode_gpu()
    net = caffe.Net('./result/proto_test.prototxt',\
                    './result/tmp.caffemodel',\
                    caffe.TEST)

    dp = VQADataProvider(mode=mode, batchsize=config.VAL_BATCH_SIZE)
    total_questions = len(dp.getQuesIds())
    epoch = 0

    pred_list = []
    testloss_list = []
    stat_list = []

    while epoch == 0:
        t_word, t_cont, t_word_c, t_cont_c, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
        net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0))
        net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0))
        net.blobs['data1'].data[...] = t_word_c
        net.blobs['cont1'].data[...] = t_cont_c
        net.blobs['img_feature'].data[...] = t_img_feature
        net.blobs['label'].data[...] = t_answer
        #net.blobs['glove'].data[...] = t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2))
        net.forward()
        t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
        t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
        testloss_list.append(net.blobs['loss'].data)
        for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
            pred_list.append({u'answer': pred, u'question_id': int(dp.getStrippedQuesId(qid))})
            if visualize:
                q_list = dp.seq_to_list(dp.getQuesStr(qid))
                if mode in ('test-dev', 'test'):
                    ans_str = ''
                    ans_list = ['']*10
                else:
                    ans_str = dp.vec_to_answer(ans)
                    ans_list = [dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
                stat_list.append({\
                    'qid': qid,
                    'q_list': q_list,
                    'iid': iid,
                    'answer': ans_str,
                    'ans_list': ans_list,
                    'pred': pred})
        #percent = 100 * float(len(pred_list)) / total_questions
        #sys.stdout.write('\r' + ('%.2f' % percent) + '%')
        #sys.stdout.flush()

    mean_testloss = np.array(testloss_list).mean()

    if mode == 'val':
        valFile = './result/val2014_resfile'
        with open(valFile, 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)
        annFile = config.DATA_PATHS['val']['ans_file']
        quesFile = config.DATA_PATHS['val']['ques_file']
        vqa = VQA(annFile, quesFile)
        vqaRes = vqa.loadRes(valFile, quesFile)
        vqaEval = VQAEval(vqa, vqaRes, n=2)
        vqaEval.evaluate()
        acc_overall = vqaEval.accuracy['overall']
        acc_perQuestionType = vqaEval.accuracy['perQuestionType']
        acc_perAnswerType = vqaEval.accuracy['perAnswerType']
        return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
    elif mode == 'test-dev':
        filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results'
        with open(filename+'.json', 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)
    elif mode == 'test':
        filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results'
        with open(filename+'.json', 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)

def drawgraph(results, save_question_type_graphs=False):
    # 0:it
    # 1:trainloss
    # 2:testloss
    # 3:oa_acc
    # 4:qt_acc
    # 5:at_acc

    # training curve
    it = np.array([l[0] for l in results])
    loss = np.array([l[1] for l in results])
    valloss = np.array([l[2] for l in results])
    valacc = np.array([l[3] for l in results])

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax2 = ax1.twinx()

    ax1.plot(it, loss, color='blue', label='train loss')
    ax1.plot(it, valloss, '--', color='blue', label='test loss')
    ax2.plot(it, valacc, color='red', label='acc on val')
    plt.legend(loc='lower left')

    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Loss Value')
    ax2.set_ylabel('Accuracy on Val [%]')

    plt.savefig('./learning_curve max_%2.2f.png' % valacc.max())
    plt.clf()
    plt.close("all")

    # question type
    it = np.array([l[0] for l in results])
    oa_acc = np.array([l[3] for l in results])
    qt_dic_list = [l[4] for l in results]

    def draw_qt_acc(target_key_list, figname):
        fig = plt.figure()
        for k in target_key_list:
            write_log(str(k) + str(type(k)), 'visualize_log.txt')
            t_val = np.array([qt_dic[k] for qt_dic in qt_dic_list])
            plt.plot(it, t_val, label=str(k))
        plt.legend(fontsize='small')
        plt.ylim(0, 100.)
        #plt.legend(prop={'size':6})

        plt.xlabel('Iterations')
        plt.ylabel('Accuracy on Val [%]')

        plt.savefig(figname, dpi=200)
        plt.clf()
        plt.close("all")

    if save_question_type_graphs:
        s_keys = sorted(qt_dic_list[0].keys())
        draw_qt_acc(s_keys[0:13]+[s_keys[31],], './ind_qt_are.png')
        draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
        draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
        draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
        draw_qt_acc(['what color is the','what color are the','what color is',\
            'what color','what is the color of the'], './qt_color.png')
        draw_qt_acc(['how many','how','how many people are',\
            'how many people are in'], './qt_number.png')
        draw_qt_acc(['who is','why','why is the','where is the','where are the',\
            'which'], './qt_who_why_where_which.png')
        draw_qt_acc(['what is the man','is the man','are they','is he',\
            'is the woman','is this person','what is the woman','is the person',\
            'what is the person'], './qt_human.png')
--------------------------------------------------------------------------------
/CNN Inception (char+word)/write_to_log.py:
--------------------------------------------------------------------------------
def write_log(text, filename):  # renamed from 'str' to avoid shadowing the builtin
    with open(filename, 'a') as f:
        f.write(text + "\n")
--------------------------------------------------------------------------------
/CNN Inception (word)/config.py:
--------------------------------------------------------------------------------
GPU_ID = 8
BATCH_SIZE = 32
VAL_BATCH_SIZE = 32
NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
MAX_WORDS_IN_QUESTION = 22 # Do not crop
MAX_ITERATIONS = 1000000
PRINT_INTERVAL = 1000
VALIDATE_INTERVAL = 140000 # We train on 'train' and test on 'val'. Set this to the total number of training iterations so validation runs once at the end; that validation accuracy is then the test accuracy.

# what data to use for training
TRAIN_DATA_SPLITS = 'train'

# what data to use for the vocabulary
QUESTION_VOCAB_SPACE = 'train'
ANSWER_VOCAB_SPACE = 'train'

# vqa tools - get from https://github.com/VT-vision-lab/VQA
VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools'
VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools'

# location of the data
VQA_PREFIX = '/tempspace/zwang6/VQA/'
GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/'
DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/'

DATA_PATHS = {
    'train': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_'
    },
    'val': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_'
    },
    'test-dev': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    'test': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    # TODO it would be nice if genome also followed the same file format as vqa
    'genome': {
        'genome_file': GENOME_PREFIX + '/question_answers_prepro.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/'
    }
}
--------------------------------------------------------------------------------
/CNN Inception (word)/qlstm_solver.prototxt:
--------------------------------------------------------------------------------
# The train/test net protocol buffer definition
train_net: "./result/proto_train.prototxt"
test_net: "./result/proto_test.prototxt"

max_iter: 1000000
display: 5000
snapshot: 5000
snapshot_prefix: "./result/"

# The base learning rate, momentum and the weight decay of the network.
solver_type: ADAM
base_lr: 0.0007
momentum: 0.9
momentum2: 0.999
weight_decay: 0.000
lr_policy: "fixed"
test_iter: 1
test_interval: 10000000

# accumulate gradients
iter_size: 2
--------------------------------------------------------------------------------
/CNN Inception (word)/train_att_bc.py:
--------------------------------------------------------------------------------
import matplotlib
matplotlib.use('Agg')
import os
import sys
import numpy as np
import json
import matplotlib.pyplot as plt
from write_to_log import write_log

import caffe
from caffe import layers as L
from caffe import params as P

from vqa_data_provider_layer import VQADataProvider
from visualize_tools import exec_validation, drawgraph
import config


def qlstm(mode, batchsize, T, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode':mode, 'batchsize':batchsize})
    # n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\
    #     module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 )
    n.data, n.cont, n.img_feature, n.label = L.Python(\
        module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=4 )

    # word embedding (static + dynamic)
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    # n.embed = L.TanH(n.embed_ba)
    n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(dict(axis=0)))
    n.embed_scale_resh = L.Reshape(n.embed_scale,\
        reshape_param=dict(\
            shape=dict(dim=[batchsize, 1, T, 300])))

    # convolution
    n.word_feature_2 = L.Convolution(n.embed_scale_resh, kernel_h=2, kernel_w=300, stride=1, num_output=512, pad_h=1, pad_w=0, weight_filler=dict(type='xavier')) # N x 512 x (T+1) x 1
    n.word_feature_3 = L.Convolution(n.embed_scale_resh, kernel_h=3, kernel_w=300, stride=1, num_output=512, pad_h=2, pad_w=0, weight_filler=dict(type='xavier'))
    n.word_feature_4 = L.Convolution(n.embed_scale_resh, kernel_h=4, kernel_w=300, stride=1, num_output=512, pad_h=3, pad_w=0, weight_filler=dict(type='xavier'))
    n.word_feature_5 = L.Convolution(n.embed_scale_resh, kernel_h=5, kernel_w=300, stride=1, num_output=512, pad_h=4, pad_w=0, weight_filler=dict(type='xavier'))
    n.word_relu_2 = L.ReLU(n.word_feature_2)
    n.word_relu_3 = L.ReLU(n.word_feature_3)
    n.word_relu_4 = L.ReLU(n.word_feature_4)
    n.word_relu_5 = L.ReLU(n.word_feature_5)
    n.word_vec_2 = L.Pooling(n.word_relu_2, kernel_h=T+1, kernel_w=1, stride=T+1, pool=P.Pooling.MAX) # N x 512 x 1 x 1
    n.word_vec_3 = L.Pooling(n.word_relu_3, kernel_h=T+2, kernel_w=1, stride=T+2, pool=P.Pooling.MAX)
    n.word_vec_4 = L.Pooling(n.word_relu_4, kernel_h=T+3, kernel_w=1, stride=T+3, pool=P.Pooling.MAX)
    n.word_vec_5 = L.Pooling(n.word_relu_5, kernel_h=T+4, kernel_w=1, stride=T+4, pool=P.Pooling.MAX)
    word_vec = [n.word_vec_2, n.word_vec_3, n.word_vec_4, n.word_vec_5]
    n.concat_vec = L.Concat(*word_vec, concat_param={'axis': 1}) # N x 2048 x 1 x 1
    n.concat_vec_dropped = L.Dropout(n.concat_vec, dropout_param={'dropout_ratio': 0.5})
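
    # Shape note for the multi-width text CNN above: with kernel_h=k and
    # pad_h=k-1, the convolution output height is T + 2*(k-1) - k + 1 = T + k - 1,
    # which matches each branch's pooling kernel (T+1, ..., T+4), so every branch
    # max-pools to a single 512-d vector and the concat is N x 2048 x 1 x 1.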

    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped, axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh, compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2, dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0, weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14*14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax, reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy)
    n.att_feature0_resh = L.Reshape(n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and question features with compact bilinear pooling
    n.att_feature_resh = L.Reshape(n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.concat_vec_dropped,
                                      compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2, dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000, weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
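
# Data flow of qlstm above: the text CNN encodes the question into a 2048-d
# vector, which is tiled to 14 x 14 and fused with the 2048 x 14 x 14 image
# features by compact bilinear pooling; two soft attention maps computed from
# the fused features each pool the image map into a 2048-d vector, and the
# concatenated 4096-d attended feature is fused with the question vector by a
# second compact bilinear pooling before the 3000-way softmax classifier.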

def make_answer_vocab(adic, vocab_size):
    """
    Returns a dictionary that maps answers to indices.
    """
    adict = {'': 0}
    nadict = {'': 1000000}
    vid = 1
    for qid in adic.keys():
        answer_obj = adic[qid]
        answer_list = [ans['answer'] for ans in answer_obj]

        for q_ans in answer_list:
            # create dict
            if q_ans in adict:
                nadict[q_ans] += 1
            else:
                nadict[q_ans] = 1
                adict[q_ans] = vid
                vid += 1

    # debug
    nalist = []
    for k, v in sorted(nadict.items(), key=lambda x: x[1]):
        nalist.append((k, v))

    # keep only the vocab_size most frequent answers; everything rarer is dropped
    n_del_ans = 0
    n_valid_ans = 0
    adict_nid = {}
    for i, w in enumerate(nalist[:-vocab_size]):
        del adict[w[0]]
        n_del_ans += w[1]
    for i, w in enumerate(nalist[-vocab_size:]):
        n_valid_ans += w[1]
        adict_nid[w[0]] = i

    return adict_nid

def make_question_vocab(qdic):
    """
    Returns a dictionary that maps words to indices.
    """
    vdict = {'': 0}
    vid = 1
    for qid in qdic.keys():
        # sequence to list
        q_str = qdic[qid]['qstr']
        q_list = VQADataProvider.seq_to_list(q_str)

        # create dict
        for w in q_list:
            if w not in vdict:
                vdict[w] = vid
                vid += 1

    return vdict

def make_vocab_files():
    """
    Produce the question and answer vocabulary files.
    """
    write_log('making question vocab... ' + config.QUESTION_VOCAB_SPACE, 'log.txt')
    qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE)
    question_vocab = make_question_vocab(qdic)
    write_log('making answer vocab... ' + config.ANSWER_VOCAB_SPACE, 'log.txt')
    _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE)
    answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS)
    return question_vocab, answer_vocab

def main():
    if not os.path.exists('./result'):
        os.makedirs('./result')

    question_vocab, answer_vocab = {}, {}
    if os.path.exists('./result/vdict.json') and os.path.exists('./result/adict.json'):
        write_log('restoring vocab', 'log.txt')
        with open('./result/vdict.json', 'r') as f:
            question_vocab = json.load(f)
        with open('./result/adict.json', 'r') as f:
            answer_vocab = json.load(f)
    else:
        question_vocab, answer_vocab = make_vocab_files()
        with open('./result/vdict.json', 'w') as f:
            json.dump(question_vocab, f)
        with open('./result/adict.json', 'w') as f:
            json.dump(answer_vocab, f)

    write_log('question vocab size: ' + str(len(question_vocab)), 'log.txt')
    write_log('answer vocab size: ' + str(len(answer_vocab)), 'log.txt')

    with open('./result/proto_train.prototxt', 'w') as f:
        f.write(str(qlstm(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \
            config.MAX_WORDS_IN_QUESTION, len(question_vocab))))

    with open('./result/proto_test.prototxt', 'w') as f:
        f.write(str(qlstm('val', config.VAL_BATCH_SIZE, \
            config.MAX_WORDS_IN_QUESTION, len(question_vocab))))

    caffe.set_device(config.GPU_ID)
    caffe.set_mode_gpu()
    solver = caffe.get_solver('./qlstm_solver.prototxt')

    train_loss = np.zeros(config.MAX_ITERATIONS)
    # results = []

    for it in range(config.MAX_ITERATIONS):
        solver.step(1)

        # store the train loss
        train_loss[it] = solver.net.blobs['loss'].data

        if it != 0 and it % config.PRINT_INTERVAL == 0:
            write_log('------------------------------------', 'log.txt')
            write_log('Iteration: ' + str(it), 'log.txt')
            c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean()
            write_log('Train loss: ' + str(c_mean_loss), 'log.txt')
        if it != 0 and it % config.VALIDATE_INTERVAL == 0: # actually test
            solver.test_nets[0].save('./result/tmp.caffemodel')
            write_log('Validating...', 'log.txt')
            test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.GPU_ID, 'val', it=it)
            write_log('Iteration: ' + str(it), 'log.txt')
            write_log('Test loss: ' + str(test_loss), 'log.txt')
            write_log('Overall Accuracy: ' + str(acc_overall), 'log.txt')
            write_log('Per Question Type Accuracy is the following:', 'log.txt')
            for quesType in acc_per_ques:
                write_log("%s : %.02f" % (quesType, acc_per_ques[quesType]), 'log.txt')
            write_log('Per Answer Type Accuracy is the following:', 'log.txt')
            for ansType in acc_per_ans:
                write_log("%s : %.02f" % (ansType, acc_per_ans[ansType]), 'log.txt')
            # results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans])
            # best_result_idx = np.array([x[3] for x in results]).argmax()
            # write_log('Best accuracy of ' + str(results[best_result_idx][3]) + ' was at iteration ' + str(results[best_result_idx][0]), 'log.txt')
            # drawgraph(results)

if __name__ == '__main__':
    main()
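
The bookkeeping in make_answer_vocab above boils down to keeping the vocab_size most frequent answers; a self-contained toy illustration (hypothetical data, with ids assigned in a different order than the function's frequency sort):

    from collections import Counter

    answers = ['yes', 'no', 'yes', '2', '2', 'yes', 'no', 'red']  # toy data
    vocab_size = 3
    counts = Counter(answers)
    adict = {}
    for i, (ans, _) in enumerate(counts.most_common(vocab_size)):
        adict[ans] = i
    # keeps 'yes', 'no' and '2'; 'red' (count 1) falls outside the vocabulary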
--------------------------------------------------------------------------------
/CNN Inception (word)/visualize_tools.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import json
import re
import shutil
from PIL import Image
from PIL import ImageFont, ImageDraw

import caffe
from caffe import layers as L
from caffe import params as P

from vqa_data_provider_layer import VQADataProvider
from vqa_data_provider_layer import VQADataProviderLayer

import config
sys.path.append(config.VQA_TOOLS_PATH)
sys.path.append(config.VQA_EVAL_TOOLS_PATH)

from vqaTools.vqa import VQA
from vqaEvaluation.vqaEval import VQAEval

from write_to_log import write_log

def visualize_failures(stat_list, mode):

    def save_qtype(qtype_list, save_filename, mode):

        if mode == 'val':
            savepath = os.path.join('./eval', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014'
        elif mode == 'test-dev':
            savepath = os.path.join('./test-dev', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        elif mode == 'test':
            savepath = os.path.join('./test', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        else:
            raise Exception('Unsupported mode')
        if os.path.exists(savepath): shutil.rmtree(savepath)
        if not os.path.exists(savepath): os.makedirs(savepath)

        for qt in qtype_list:
            count = 0
            for t_question in stat_list:
                #print count, t_question
                if count < 40/len(qtype_list):
                    t_question_list = t_question['q_list']
                    # save only questions whose first two words match this question type
                    if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
                        saveflag = True
                    else:
                        saveflag = False

                    if saveflag:
                        t_iid = t_question['iid']
                        if mode == 'val':
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
                        elif mode in ('test-dev', 'test'):
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))

                        # for caption
                        #print t_iid
                        #annIds = caps.getAnnIds(t_iid)
                        #anns = caps.loadAnns(annIds)
                        #cap_list = [ann['caption'] for ann in anns]
                        ans_list = t_question['ans_list']
                        draw = ImageDraw.Draw(t_img)
                        for i in range(len(ans_list)):
                            try:
                                draw.text((10, 10*i), str(ans_list[i]))
                            except Exception:
                                pass

                        ans = t_question['answer']
                        pred = t_question['pred']
                        if ans == -1:
                            pre = ''
                        elif ans == pred:
                            pre = 'correct '
                        else:
                            pre = 'failure '
                        #print ' aaa ', ans, pred
                        ans = re.sub('/', ' ', str(ans))
                        pred = re.sub('/', ' ', str(pred))
                        img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
                            str(ans) + ' p_' + str(pred) + '.png'
                        count += 1
                        write_log(os.path.join(savepath, img_title), 'visualize_log.txt')
                        t_img.save(os.path.join(savepath, img_title))

    write_log('saving colors', 'visualize_log.txt')
    qt_color_list = [['what','color']]
    save_qtype(qt_color_list, 'colors', mode)

    write_log('saving what is', 'visualize_log.txt')
    qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
    save_qtype(qt_whatis_list, 'whatis', mode)

    write_log('saving is', 'visualize_log.txt')
    qt_is_list = [['is','the'], ['is','this'],['is','there']]
    save_qtype(qt_is_list, 'is', mode)

    write_log('saving how many', 'visualize_log.txt')
    qt_howmany_list = [['how','many']]
    save_qtype(qt_howmany_list, 'howmany', mode)
a_' + \ 99 | str(ans) + ' p_' + str(pred) + '.png' 100 | count += 1 101 | write_log(os.path.join(savepath,img_title), 'visualize_log.txt') 102 | t_img.save(os.path.join(savepath,img_title)) 103 | 104 | write_log('saving colors', 'visualize_log.txt') 105 | qt_color_list = [['what','color']] 106 | save_qtype(qt_color_list, 'colors', mode) 107 | 108 | write_log('saving what is', 'visualize_log.txt') 109 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 110 | save_qtype(qt_whatis_list, 'whatis', mode) 111 | 112 | write_log('saving is', 'visualize_log.txt') 113 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 114 | save_qtype(qt_is_list, 'is', mode) 115 | 116 | write_log('saving how many', 'visualize_log.txt') 117 | qt_howmany_list =[['how','many']] 118 | save_qtype(qt_howmany_list, 'howmany', mode) 119 | 120 | def exec_validation(device_id, mode, it='', visualize=False): 121 | 122 | caffe.set_device(device_id) 123 | caffe.set_mode_gpu() 124 | net = caffe.Net('./result/proto_test.prototxt',\ 125 | './result/tmp.caffemodel',\ 126 | caffe.TEST) 127 | 128 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE) 129 | total_questions = len(dp.getQuesIds()) 130 | epoch = 0 131 | 132 | pred_list = [] 133 | testloss_list = [] 134 | stat_list = [] 135 | 136 | while epoch == 0: 137 | # t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 138 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 139 | net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0)) 140 | net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0)) 141 | net.blobs['img_feature'].data[...] = t_img_feature 142 | net.blobs['label'].data[...] = t_answer 143 | # net.blobs['glove'].data[...] 
= t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 146 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 147 | testloss_list.append(net.blobs['loss'].data) 148 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 149 | pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 150 | if visualize: 151 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 152 | if mode == 'test-dev' or 'test': 153 | ans_str = '' 154 | ans_list = ['']*10 155 | else: 156 | ans_str = dp.vec_to_answer(ans) 157 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 158 | stat_list.append({\ 159 | 'qid' : qid, 160 | 'q_list' : q_list, 161 | 'iid' : iid, 162 | 'answer': ans_str, 163 | 'ans_list': ans_list, 164 | 'pred' : pred }) 165 | # percent = 100 * float(len(pred_list)) / total_questions 166 | # sys.stdout.write('\r' + ('%.2f' % percent) + '%') 167 | # sys.stdout.flush() 168 | 169 | 170 | 171 | mean_testloss = np.array(testloss_list).mean() 172 | 173 | if mode == 'val': 174 | valFile = './result/val2014_resfile' 175 | with open(valFile, 'w') as f: 176 | json.dump(pred_list, f) 177 | if visualize: 178 | visualize_failures(stat_list,mode) 179 | annFile = config.DATA_PATHS['val']['ans_file'] 180 | quesFile = config.DATA_PATHS['val']['ques_file'] 181 | vqa = VQA(annFile, quesFile) 182 | vqaRes = vqa.loadRes(valFile, quesFile) 183 | vqaEval = VQAEval(vqa, vqaRes, n=2) 184 | vqaEval.evaluate() 185 | acc_overall = vqaEval.accuracy['overall'] 186 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 187 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 188 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 189 | elif mode == 'test-dev': 190 | filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results' 191 | with open(filename+'.json', 'w') as f: 192 | json.dump(pred_list, f) 193 | if visualize: 194 | visualize_failures(stat_list,mode) 195 | elif mode == 'test': 196 | filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(pred_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | 202 | def drawgraph(results, save_question_type_graphs=False): 203 | # 0:it 204 | # 1:trainloss 205 | # 2:testloss 206 | # 3:oa_acc 207 | # 4:qt_acc 208 | # 5:at_acc 209 | 210 | # training curve 211 | it = np.array([l[0] for l in results]) 212 | loss = np.array([l[1] for l in results]) 213 | valloss = np.array([l[2] for l in results]) 214 | valacc = np.array([l[3] for l in results]) 215 | 216 | fig = plt.figure() 217 | ax1 = fig.add_subplot(111) 218 | ax2 = ax1.twinx() 219 | 220 | ax1.plot(it,loss, color='blue', label='train loss') 221 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 222 | ax2.plot(it,valacc, color='red', label='acc on val') 223 | plt.legend(loc='lower left') 224 | 225 | ax1.set_xlabel('Iterations') 226 | ax1.set_ylabel('Loss Value') 227 | ax2.set_ylabel('Accuracy on Val [%]') 228 | 229 | plt.savefig('./learning_curve max_%2.2f.png'%valacc.max()) 230 | plt.clf() 231 | plt.close("all") 232 | 233 | # question type 234 | it = np.array([l[0] for l in results]) 235 | oa_acc = np.array([l[3] for l in results]) 236 | qt_dic_list = [l[4] for l in results] 237 | 238 | def draw_qt_acc(target_key_list, figname): 239 | fig = plt.figure() 240 | for k in target_key_list: 241 | 
write_log(str(k) + str(type(k)), 'visualize_log.txt') 242 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 243 | plt.plot(it,t_val,label=str(k)) 244 | plt.legend(fontsize='small') 245 | plt.ylim(0,100.) 246 | #plt.legend(prop={'size':6}) 247 | 248 | plt.xlabel('Iterations') 249 | plt.ylabel('Accuracy on Val [%]') 250 | 251 | plt.savefig(figname,dpi=200) 252 | plt.clf() 253 | plt.close("all") 254 | 255 | if save_question_type_graphs: 256 | s_keys = sorted(qt_dic_list[0].keys()) 257 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 258 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 259 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 260 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 261 | draw_qt_acc(['what color is the','what color are the','what color is',\ 262 | 'what color','what is the color of the'],'./qt_color.png') 263 | draw_qt_acc(['how many','how','how many people are',\ 264 | 'how many people are in'],'./qt_number.png') 265 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 266 | 'which'],'./qt_who_why_where_which.png') 267 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 268 | 'is the woman','is this person','what is the woman','is the person',\ 269 | 'what is the person'],'./qt_human.png') 270 | 271 | 272 | -------------------------------------------------------------------------------- /CNN Inception (word)/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /CNN Inception + Bottleneck/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 10 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop 6 | MAX_ITERATIONS = 1000000 7 | PRINT_INTERVAL = 1000 8 | VALIDATE_INTERVAL = 120000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy. 
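# For reference: a minimal sketch of how train_att_bc.py in this directory
# consumes the three interval constants above (log_mean_train_loss is a
# stand-in name for the inline logging code):
#
#     for it in range(MAX_ITERATIONS):                  # 1,000,000 solver steps
#         solver.step(1)
#         if it != 0 and it % PRINT_INTERVAL == 0:      # every 1,000 iters
#             log_mean_train_loss(train_loss, it)       # mean over the window
#         if it != 0 and it % VALIDATE_INTERVAL == 0:   # every 120,000 iters
#             exec_validation(GPU_ID, 'val', it=it)     # full pass over 'val'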
9 | 10 | # what data to use for training 11 | TRAIN_DATA_SPLITS = 'train' 12 | 13 | # what data to use for the vocabulary 14 | QUESTION_VOCAB_SPACE = 'train' 15 | ANSWER_VOCAB_SPACE = 'train' 16 | 17 | # vqa tools - get from https://github.com/VT-vision-lab/VQA 18 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools' 19 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools' 20 | 21 | # location of the data 22 | VQA_PREFIX = '/tempspace/zwang6/VQA/' 23 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/' 24 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/' 25 | 26 | DATA_PATHS = { 27 | 'train': { 28 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json', 29 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json', 30 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_' 31 | }, 32 | 'val': { 33 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json', 34 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json', 35 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_' 36 | }, 37 | 'test-dev': { 38 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json', 39 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 40 | }, 41 | 'test': { 42 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json', 43 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 44 | }, 45 | # TODO it would be nice if genome also followed the same file format as vqa 46 | 'genome': { 47 | 'genome_file': GENOME_PREFIX + '/question_answers_prepro.json', 48 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/' 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /CNN Inception + Bottleneck/qlstm_solver.prototxt: -------------------------------------------------------------------------------- 1 | # The train/test net protocol buffer definition 2 | train_net: "./result/proto_train.prototxt" 3 | test_net: "./result/proto_test.prototxt" 4 | 5 | max_iter: 1000000 6 | display: 5000 7 | snapshot: 5000 8 | snapshot_prefix: "./result/" 9 | 10 | # The base learning rate, momentum and the weight decay of the network. 
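# Worked example: with iter_size: 2 (bottom of this file), gradients from two
# forward/backward passes are accumulated before each update, so the effective
# batch size is iter_size * BATCH_SIZE = 2 * 32 = 64 questions per Adam step.
# The momentum and momentum2 fields below serve as Adam's beta1 and beta2.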
11 | solver_type: ADAM 12 | base_lr: 0.0007 13 | momentum: 0.9 14 | momentum2: 0.999 15 | weight_decay: 0.000 16 | lr_policy: "fixed" 17 | test_iter: 1 18 | test_interval: 10000000 19 | 20 | # accumulate gradients 21 | iter_size: 2 22 | -------------------------------------------------------------------------------- /CNN Inception + Bottleneck/train_att_bc.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import os 4 | import sys 5 | import numpy as np 6 | import json 7 | import matplotlib.pyplot as plt 8 | from write_to_log import write_log 9 | 10 | import caffe 11 | from caffe import layers as L 12 | from caffe import params as P 13 | 14 | from vqa_data_provider_layer import VQADataProvider 15 | from visualize_tools import exec_validation, drawgraph 16 | import config 17 | 18 | 19 | def qlstm(mode, batchsize, T, question_vocab_size): 20 | n = caffe.NetSpec() 21 | mode_str = json.dumps({'mode':mode, 'batchsize':batchsize}) 22 | # n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\ 23 | # module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 ) 24 | n.data, n.cont, n.img_feature, n.label = L.Python(\ 25 | module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=4 ) 26 | 27 | # word embedding 28 | n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \ 29 | weight_filler=dict(type='uniform',min=-0.08,max=0.08)) 30 | # n.embed = L.TanH(n.embed_ba) 31 | n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(dict(axis=0))) 32 | n.embed_scale_resh = L.Reshape(n.embed_scale,\ 33 | reshape_param=dict(\ 34 | shape=dict(dim=[batchsize,1,T,-1]))) 35 | 36 | # convolution 37 | n.word_feature_3_1 = L.Convolution(n.embed_scale_resh, kernel_h=1, kernel_w=300, stride=1, num_output=256, pad_h=0, pad_w=0, weight_filler=dict(type='xavier')) 38 | n.word_relu_3_1_r = L.ReLU(n.word_feature_3_1) 39 | n.word_feature_3_2 = L.Convolution(n.word_relu_3_1_r, kernel_h=3, kernel_w=1, stride=1, num_output=256, pad_h=1, pad_w=0, weight_filler=dict(type='xavier')) 40 | n.word_relu_3_2_r = L.ReLU(n.word_feature_3_2) 41 | n.word_feature_3 = L.Convolution(n.word_relu_3_2_r, kernel_h=1, kernel_w=1, stride=1, num_output=1024, pad_h=0, pad_w=0, weight_filler=dict(type='xavier')) 42 | 43 | n.word_feature_5_1 = L.Convolution(n.embed_scale_resh, kernel_h=1, kernel_w=300, stride=1, num_output=256, pad_h=0, pad_w=0, weight_filler=dict(type='xavier')) 44 | n.word_relu_5_1_r = L.ReLU(n.word_feature_5_1) 45 | n.word_feature_5_2 = L.Convolution(n.word_relu_5_1_r, kernel_h=5, kernel_w=1, stride=1, num_output=256, pad_h=2, pad_w=0, weight_filler=dict(type='xavier')) 46 | n.word_relu_5_2_r = L.ReLU(n.word_feature_5_2) 47 | n.word_feature_5 = L.Convolution(n.word_relu_5_2_r, kernel_h=1, kernel_w=1, stride=1, num_output=1024, pad_h=0, pad_w=0, weight_filler=dict(type='xavier')) 48 | 49 | n.word_relu_3 = L.ReLU(n.word_feature_3) 50 | n.word_relu_5 = L.ReLU(n.word_feature_5) 51 | 52 | n.word_vec_3 = L.Pooling(n.word_relu_3, kernel_h=T, kernel_w=1, stride=T, pool=P.Pooling.MAX) 53 | n.word_vec_5 = L.Pooling(n.word_relu_5, kernel_h=T, kernel_w=1, stride=T, pool=P.Pooling.MAX) 54 | 55 | word_vec = [n.word_vec_3, n.word_vec_5] 56 | n.concat_vec = L.Concat(*word_vec, concat_param={'axis': 1}) # N x 2*d_w x 1 x 1 57 | n.concat_vec_dropped = L.Dropout(n.concat_vec,dropout_param={'dropout_ratio':0.5}) 58 | 59 | n.q_emb_tanh_droped_resh_tiled_1 = 
L.Tile(n.concat_vec_dropped, axis=2, tiles=14) 60 | n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14) 61 | n.i_emb_tanh_droped_resh = L.Reshape(n.img_feature,reshape_param=dict(shape=dict(dim=[-1,2048,14,14]))) 62 | n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh, compact_bilinear_param=dict(num_output=16000,sum_pool=False)) 63 | n.blcf_sign_sqrt = L.SignedSqrt(n.blcf) 64 | n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt) 65 | n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,dropout_param={'dropout_ratio':0.1}) 66 | 67 | # multi-channel attention 68 | n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier')) 69 | n.att_conv1_relu = L.ReLU(n.att_conv1) 70 | n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0, weight_filler=dict(type='xavier')) 71 | n.att_reshaped = L.Reshape(n.att_conv2,reshape_param=dict(shape=dict(dim=[-1,2,14*14]))) 72 | n.att_softmax = L.Softmax(n.att_reshaped, axis=2) 73 | n.att = L.Reshape(n.att_softmax,reshape_param=dict(shape=dict(dim=[-1,2,14,14]))) 74 | att_maps = L.Slice(n.att, ntop=2, slice_param={'axis':1}) 75 | n.att_map0 = att_maps[0] 76 | n.att_map1 = att_maps[1] 77 | dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1) 78 | n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy) 79 | n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy) 80 | n.att_feature0_resh = L.Reshape(n.att_feature0, reshape_param=dict(shape=dict(dim=[-1,2048]))) 81 | n.att_feature1_resh = L.Reshape(n.att_feature1, reshape_param=dict(shape=dict(dim=[-1,2048]))) 82 | n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh) 83 | 84 | # merge attention and lstm with compact bilinear pooling 85 | n.att_feature_resh = L.Reshape(n.att_feature, reshape_param=dict(shape=dict(dim=[-1,4096,1,1]))) 86 | #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1]))) 87 | n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.concat_vec_dropped, 88 | compact_bilinear_param=dict(num_output=16000,sum_pool=False)) 89 | n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm) 90 | n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt) 91 | 92 | n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2, dropout_param={'dropout_ratio':0.1}) 93 | n.bc_dropped_resh = L.Reshape(n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000]))) 94 | 95 | n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000, weight_filler=dict(type='xavier')) 96 | n.loss = L.SoftmaxWithLoss(n.prediction, n.label) 97 | return n.to_proto() 98 | 99 | def make_answer_vocab(adic, vocab_size): 100 | """ 101 | Returns a dictionary that maps words to indices. 
102 | """ 103 | adict = {'':0} 104 | nadict = {'':1000000} 105 | vid = 1 106 | for qid in adic.keys(): 107 | answer_obj = adic[qid] 108 | answer_list = [ans['answer'] for ans in answer_obj] 109 | 110 | for q_ans in answer_list: 111 | # create dict 112 | if adict.has_key(q_ans): 113 | nadict[q_ans] += 1 114 | else: 115 | nadict[q_ans] = 1 116 | adict[q_ans] = vid 117 | vid +=1 118 | 119 | # debug 120 | nalist = [] 121 | for k,v in sorted(nadict.items(), key=lambda x:x[1]): 122 | nalist.append((k,v)) 123 | 124 | # remove words that appear less than once 125 | n_del_ans = 0 126 | n_valid_ans = 0 127 | adict_nid = {} 128 | for i, w in enumerate(nalist[:-vocab_size]): 129 | del adict[w[0]] 130 | n_del_ans += w[1] 131 | for i, w in enumerate(nalist[-vocab_size:]): 132 | n_valid_ans += w[1] 133 | adict_nid[w[0]] = i 134 | 135 | return adict_nid 136 | 137 | def make_question_vocab(qdic): 138 | """ 139 | Returns a dictionary that maps words to indices. 140 | """ 141 | vdict = {'':0} 142 | vid = 1 143 | for qid in qdic.keys(): 144 | # sequence to list 145 | q_str = qdic[qid]['qstr'] 146 | q_list = VQADataProvider.seq_to_list(q_str) 147 | 148 | # create dict 149 | for w in q_list: 150 | if not vdict.has_key(w): 151 | vdict[w] = vid 152 | vid +=1 153 | 154 | return vdict 155 | 156 | def make_vocab_files(): 157 | """ 158 | Produce the question and answer vocabulary files. 159 | """ 160 | write_log('making question vocab... ' + config.QUESTION_VOCAB_SPACE, 'log.txt') 161 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE) 162 | question_vocab = make_question_vocab(qdic) 163 | write_log('making answer vocab... ' + config.ANSWER_VOCAB_SPACE, 'log.txt') 164 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE) 165 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS) 166 | return question_vocab, answer_vocab 167 | 168 | def main(): 169 | if not os.path.exists('./result'): 170 | os.makedirs('./result') 171 | 172 | question_vocab, answer_vocab = {}, {} 173 | if os.path.exists('./result/vdict.json') and os.path.exists('./result/adict.json'): 174 | write_log('restoring vocab', 'log.txt') 175 | with open('./result/vdict.json','r') as f: 176 | question_vocab = json.load(f) 177 | with open('./result/adict.json','r') as f: 178 | answer_vocab = json.load(f) 179 | else: 180 | question_vocab, answer_vocab = make_vocab_files() 181 | with open('./result/vdict.json','w') as f: 182 | json.dump(question_vocab, f) 183 | with open('./result/adict.json','w') as f: 184 | json.dump(answer_vocab, f) 185 | 186 | write_log('question vocab size: '+ str(len(question_vocab)), 'log.txt') 187 | write_log('answer vocab size: '+ str(len(answer_vocab)), 'log.txt') 188 | 189 | with open('./result/proto_train.prototxt', 'w') as f: 190 | f.write(str(qlstm(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \ 191 | config.MAX_WORDS_IN_QUESTION, len(question_vocab)))) 192 | 193 | with open('./result/proto_test.prototxt', 'w') as f: 194 | f.write(str(qlstm('val', config.VAL_BATCH_SIZE, \ 195 | config.MAX_WORDS_IN_QUESTION, len(question_vocab)))) 196 | 197 | caffe.set_device(config.GPU_ID) 198 | caffe.set_mode_gpu() 199 | solver = caffe.get_solver('./qlstm_solver.prototxt') 200 | 201 | train_loss = np.zeros(config.MAX_ITERATIONS) 202 | # results = [] 203 | 204 | for it in range(config.MAX_ITERATIONS): 205 | solver.step(1) 206 | 207 | # store the train loss 208 | train_loss[it] = solver.net.blobs['loss'].data 209 | 210 | if it != 0 and it % config.PRINT_INTERVAL == 0: 211 | 
write_log('------------------------------------', 'log.txt') 212 | write_log('Iteration: ' + str(it), 'log.txt') 213 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean() 214 | write_log('Train loss: ' + str(c_mean_loss), 'log.txt') 215 | if it != 0 and it % config.VALIDATE_INTERVAL == 0: # actually test 216 | solver.test_nets[0].save('./result/tmp.caffemodel') 217 | write_log('Validating...', 'log.txt') 218 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.GPU_ID, 'val', it=it) 219 | write_log('Iteration: ' + str(it), 'log.txt') 220 | write_log('Test loss: ' + str(test_loss), 'log.txt') 221 | write_log('Overall Accuracy: ' + str(acc_overall), 'log.txt') 222 | write_log('Per Question Type Accuracy is the following:', 'log.txt') 223 | for quesType in acc_per_ques: 224 | write_log("%s : %.02f" % (quesType, acc_per_ques[quesType]), 'log.txt') 225 | write_log('Per Answer Type Accuracy is the following:', 'log.txt') 226 | for ansType in acc_per_ans: 227 | write_log("%s : %.02f" % (ansType, acc_per_ans[ansType]), 'log.txt') 228 | # results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans]) 229 | # best_result_idx = np.array([x[3] for x in results]).argmax() 230 | # write_log('Best accuracy of ' + str(results[best_result_idx][3]) + ' was at iteration ' + str(results[best_result_idx][0]), 'log.txt') 231 | # drawgraph(results) 232 | 233 | if __name__ == '__main__': 234 | main() 235 | -------------------------------------------------------------------------------- /CNN Inception + Bottleneck/visualize_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_provider_layer import VQADataProvider 16 | from vqa_data_provider_layer import VQADataProviderLayer 17 | 18 | import config 19 | sys.path.append(config.VQA_TOOLS_PATH) 20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 21 | 22 | from vqaTools.vqa import VQA 23 | from vqaEvaluation.vqaEval import VQAEval 24 | 25 | from write_to_log import write_log 26 | 27 | def visualize_failures(stat_list,mode): 28 | 29 | def save_qtype(qtype_list, save_filename, mode): 30 | 31 | if mode == 'val': 32 | savepath = os.path.join('./eval', save_filename) 33 | # TODO 34 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014' 35 | elif mode == 'test-dev': 36 | savepath = os.path.join('./test-dev', save_filename) 37 | # TODO 38 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 39 | elif mode == 'test': 40 | savepath = os.path.join('./test', save_filename) 41 | # TODO 42 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 43 | else: 44 | raise Exception('Unsupported mode') 45 | if os.path.exists(savepath): shutil.rmtree(savepath) 46 | if not os.path.exists(savepath): os.makedirs(savepath) 47 | 48 | for qt in qtype_list: 49 | count = 0 50 | for t_question in stat_list: 51 | #print count, t_question 52 | if count < 40/len(qtype_list): 53 | t_question_list = t_question['q_list'] 54 | saveflag = False 55 | #print 'debug****************************' 56 | #print qt 57 | #print t_question_list 58 | #print t_question_list[0] == qt[0] 59 | #print t_question_list[1] == qt[1] 60 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 61 | 
saveflag = True 62 | else: 63 | saveflag = False 64 | 65 | if saveflag == True: 66 | t_iid = t_question['iid'] 67 | if mode == 'val': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 70 | elif mode == 'test-dev' or 'test': 71 | t_img = Image.open(os.path.join(img_pre, \ 72 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 73 | 74 | # for caption 75 | #print t_iid 76 | #annIds = caps.getAnnIds(t_iid) 77 | #anns = caps.loadAnns(annIds) 78 | #cap_list = [ann['caption'] for ann in anns] 79 | ans_list = t_question['ans_list'] 80 | draw = ImageDraw.Draw(t_img) 81 | for i in range(len(ans_list)): 82 | try: 83 | draw.text((10,10*i), str(ans_list[i])) 84 | except: 85 | pass 86 | 87 | ans = t_question['answer'] 88 | pred = t_question['pred'] 89 | if ans == -1: 90 | pre = '' 91 | elif ans == pred: 92 | pre = 'correct ' 93 | else: 94 | pre = 'failure ' 95 | #print ' aaa ', ans, pred 96 | ans = re.sub( '/', ' ', str(ans)) 97 | pred = re.sub( '/', ' ', str(pred)) 98 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \ 99 | str(ans) + ' p_' + str(pred) + '.png' 100 | count += 1 101 | write_log(os.path.join(savepath,img_title), 'visualize_log.txt') 102 | t_img.save(os.path.join(savepath,img_title)) 103 | 104 | write_log('saving colors', 'visualize_log.txt') 105 | qt_color_list = [['what','color']] 106 | save_qtype(qt_color_list, 'colors', mode) 107 | 108 | write_log('saving what is', 'visualize_log.txt') 109 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 110 | save_qtype(qt_whatis_list, 'whatis', mode) 111 | 112 | write_log('saving is', 'visualize_log.txt') 113 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 114 | save_qtype(qt_is_list, 'is', mode) 115 | 116 | write_log('saving how many', 'visualize_log.txt') 117 | qt_howmany_list =[['how','many']] 118 | save_qtype(qt_howmany_list, 'howmany', mode) 119 | 120 | def exec_validation(device_id, mode, it='', visualize=False): 121 | 122 | caffe.set_device(device_id) 123 | caffe.set_mode_gpu() 124 | net = caffe.Net('./result/proto_test.prototxt',\ 125 | './result/tmp.caffemodel',\ 126 | caffe.TEST) 127 | 128 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE) 129 | total_questions = len(dp.getQuesIds()) 130 | epoch = 0 131 | 132 | pred_list = [] 133 | testloss_list = [] 134 | stat_list = [] 135 | 136 | while epoch == 0: 137 | # t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 138 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 139 | net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0)) 140 | net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0)) 141 | net.blobs['img_feature'].data[...] = t_img_feature 142 | net.blobs['label'].data[...] = t_answer 143 | # net.blobs['glove'].data[...] 
= t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 146 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 147 | testloss_list.append(net.blobs['loss'].data) 148 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 149 | pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 150 | if visualize: 151 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 152 | if mode == 'test-dev' or 'test': 153 | ans_str = '' 154 | ans_list = ['']*10 155 | else: 156 | ans_str = dp.vec_to_answer(ans) 157 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 158 | stat_list.append({\ 159 | 'qid' : qid, 160 | 'q_list' : q_list, 161 | 'iid' : iid, 162 | 'answer': ans_str, 163 | 'ans_list': ans_list, 164 | 'pred' : pred }) 165 | # percent = 100 * float(len(pred_list)) / total_questions 166 | # sys.stdout.write('\r' + ('%.2f' % percent) + '%') 167 | # sys.stdout.flush() 168 | 169 | 170 | 171 | mean_testloss = np.array(testloss_list).mean() 172 | 173 | if mode == 'val': 174 | valFile = './result/val2014_resfile' 175 | with open(valFile, 'w') as f: 176 | json.dump(pred_list, f) 177 | if visualize: 178 | visualize_failures(stat_list,mode) 179 | annFile = config.DATA_PATHS['val']['ans_file'] 180 | quesFile = config.DATA_PATHS['val']['ques_file'] 181 | vqa = VQA(annFile, quesFile) 182 | vqaRes = vqa.loadRes(valFile, quesFile) 183 | vqaEval = VQAEval(vqa, vqaRes, n=2) 184 | vqaEval.evaluate() 185 | acc_overall = vqaEval.accuracy['overall'] 186 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 187 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 188 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 189 | elif mode == 'test-dev': 190 | filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results' 191 | with open(filename+'.json', 'w') as f: 192 | json.dump(pred_list, f) 193 | if visualize: 194 | visualize_failures(stat_list,mode) 195 | elif mode == 'test': 196 | filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(pred_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | 202 | def drawgraph(results, save_question_type_graphs=False): 203 | # 0:it 204 | # 1:trainloss 205 | # 2:testloss 206 | # 3:oa_acc 207 | # 4:qt_acc 208 | # 5:at_acc 209 | 210 | # training curve 211 | it = np.array([l[0] for l in results]) 212 | loss = np.array([l[1] for l in results]) 213 | valloss = np.array([l[2] for l in results]) 214 | valacc = np.array([l[3] for l in results]) 215 | 216 | fig = plt.figure() 217 | ax1 = fig.add_subplot(111) 218 | ax2 = ax1.twinx() 219 | 220 | ax1.plot(it,loss, color='blue', label='train loss') 221 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 222 | ax2.plot(it,valacc, color='red', label='acc on val') 223 | plt.legend(loc='lower left') 224 | 225 | ax1.set_xlabel('Iterations') 226 | ax1.set_ylabel('Loss Value') 227 | ax2.set_ylabel('Accuracy on Val [%]') 228 | 229 | plt.savefig('./learning_curve max_%2.2f.png'%valacc.max()) 230 | plt.clf() 231 | plt.close("all") 232 | 233 | # question type 234 | it = np.array([l[0] for l in results]) 235 | oa_acc = np.array([l[3] for l in results]) 236 | qt_dic_list = [l[4] for l in results] 237 | 238 | def draw_qt_acc(target_key_list, figname): 239 | fig = plt.figure() 240 | for k in target_key_list: 241 | 
write_log(str(k) + str(type(k)), 'visualize_log.txt') 242 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 243 | plt.plot(it,t_val,label=str(k)) 244 | plt.legend(fontsize='small') 245 | plt.ylim(0,100.) 246 | #plt.legend(prop={'size':6}) 247 | 248 | plt.xlabel('Iterations') 249 | plt.ylabel('Accuracy on Val [%]') 250 | 251 | plt.savefig(figname,dpi=200) 252 | plt.clf() 253 | plt.close("all") 254 | 255 | if save_question_type_graphs: 256 | s_keys = sorted(qt_dic_list[0].keys()) 257 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 258 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 259 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 260 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 261 | draw_qt_acc(['what color is the','what color are the','what color is',\ 262 | 'what color','what is the color of the'],'./qt_color.png') 263 | draw_qt_acc(['how many','how','how many people are',\ 264 | 'how many people are in'],'./qt_number.png') 265 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 266 | 'which'],'./qt_who_why_where_which.png') 267 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 268 | 'is the woman','is this person','what is the woman','is the person',\ 269 | 'what is the person'],'./qt_human.png') 270 | 271 | 272 | -------------------------------------------------------------------------------- /CNN Inception + Bottleneck/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /CNN Inception + Gate (tanh)/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 10 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop 6 | MAX_ITERATIONS = 1000000 7 | PRINT_INTERVAL = 1000 8 | VALIDATE_INTERVAL = 110000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy. 
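# For reference, a sketch of what NUM_OUTPUT_UNITS controls: make_answer_vocab
# in train_att_bc.py counts answer frequencies over the training annotations
# and keeps only the 3,000 most frequent answers as output classes, roughly:
#
#     nalist = sorted(nadict.items(), key=lambda x: x[1])   # rare -> frequent
#     kept = nalist[-NUM_OUTPUT_UNITS:]                     # top 3,000 answers
#     adict_nid = {ans: i for i, (ans, cnt) in enumerate(kept)}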
9 | 10 | # what data to use for training 11 | TRAIN_DATA_SPLITS = 'train' 12 | 13 | # what data to use for the vocabulary 14 | QUESTION_VOCAB_SPACE = 'train' 15 | ANSWER_VOCAB_SPACE = 'train' 16 | 17 | # vqa tools - get from https://github.com/VT-vision-lab/VQA 18 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools' 19 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools' 20 | 21 | # location of the data 22 | VQA_PREFIX = '/tempspace/zwang6/VQA/' 23 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/' 24 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/' 25 | 26 | DATA_PATHS = { 27 | 'train': { 28 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json', 29 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json', 30 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_' 31 | }, 32 | 'val': { 33 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json', 34 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json', 35 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_' 36 | }, 37 | 'test-dev': { 38 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json', 39 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 40 | }, 41 | 'test': { 42 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json', 43 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 44 | }, 45 | # TODO it would be nice if genome also followed the same file format as vqa 46 | 'genome': { 47 | 'genome_file': GENOME_PREFIX + '/question_answers_prepro.json', 48 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/' 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /CNN Inception + Gate (tanh)/qlstm_solver.prototxt: -------------------------------------------------------------------------------- 1 | # The train/test net protocol buffer definition 2 | train_net: "./result/proto_train.prototxt" 3 | test_net: "./result/proto_test.prototxt" 4 | 5 | max_iter: 1000000 6 | display: 5000 7 | snapshot: 5000 8 | snapshot_prefix: "./result/" 9 | 10 | # The base learning rate, momentum and the weight decay of the network. 
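# Note: lr_policy "fixed" keeps base_lr constant at 0.0007 for the entire run,
# and test_iter / test_interval are set so Caffe's built-in test phase is
# effectively never reached; validation is instead driven manually from
# train_att_bc.py via exec_validation at VALIDATE_INTERVAL.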
11 | solver_type: ADAM 12 | base_lr: 0.0007 13 | momentum: 0.9 14 | momentum2: 0.999 15 | weight_decay: 0.000 16 | lr_policy: "fixed" 17 | test_iter: 1 18 | test_interval: 10000000 19 | 20 | # accumulate gradients 21 | iter_size: 2 22 | -------------------------------------------------------------------------------- /CNN Inception + Gate (tanh)/visualize_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_provider_layer import VQADataProvider 16 | from vqa_data_provider_layer import VQADataProviderLayer 17 | 18 | import config 19 | sys.path.append(config.VQA_TOOLS_PATH) 20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 21 | 22 | from vqaTools.vqa import VQA 23 | from vqaEvaluation.vqaEval import VQAEval 24 | 25 | from write_to_log import write_log 26 | 27 | def visualize_failures(stat_list,mode): 28 | 29 | def save_qtype(qtype_list, save_filename, mode): 30 | 31 | if mode == 'val': 32 | savepath = os.path.join('./eval', save_filename) 33 | # TODO 34 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014' 35 | elif mode == 'test-dev': 36 | savepath = os.path.join('./test-dev', save_filename) 37 | # TODO 38 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 39 | elif mode == 'test': 40 | savepath = os.path.join('./test', save_filename) 41 | # TODO 42 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 43 | else: 44 | raise Exception('Unsupported mode') 45 | if os.path.exists(savepath): shutil.rmtree(savepath) 46 | if not os.path.exists(savepath): os.makedirs(savepath) 47 | 48 | for qt in qtype_list: 49 | count = 0 50 | for t_question in stat_list: 51 | #print count, t_question 52 | if count < 40/len(qtype_list): 53 | t_question_list = t_question['q_list'] 54 | saveflag = False 55 | #print 'debug****************************' 56 | #print qt 57 | #print t_question_list 58 | #print t_question_list[0] == qt[0] 59 | #print t_question_list[1] == qt[1] 60 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 61 | saveflag = True 62 | else: 63 | saveflag = False 64 | 65 | if saveflag == True: 66 | t_iid = t_question['iid'] 67 | if mode == 'val': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 70 | elif mode == 'test-dev' or 'test': 71 | t_img = Image.open(os.path.join(img_pre, \ 72 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 73 | 74 | # for caption 75 | #print t_iid 76 | #annIds = caps.getAnnIds(t_iid) 77 | #anns = caps.loadAnns(annIds) 78 | #cap_list = [ann['caption'] for ann in anns] 79 | ans_list = t_question['ans_list'] 80 | draw = ImageDraw.Draw(t_img) 81 | for i in range(len(ans_list)): 82 | try: 83 | draw.text((10,10*i), str(ans_list[i])) 84 | except: 85 | pass 86 | 87 | ans = t_question['answer'] 88 | pred = t_question['pred'] 89 | if ans == -1: 90 | pre = '' 91 | elif ans == pred: 92 | pre = 'correct ' 93 | else: 94 | pre = 'failure ' 95 | #print ' aaa ', ans, pred 96 | ans = re.sub( '/', ' ', str(ans)) 97 | pred = re.sub( '/', ' ', str(pred)) 98 | img_title = pre + str(' '.join(t_question_list)) + '. 
a_' + \ 99 | str(ans) + ' p_' + str(pred) + '.png' 100 | count += 1 101 | write_log(os.path.join(savepath,img_title), 'visualize_log.txt') 102 | t_img.save(os.path.join(savepath,img_title)) 103 | 104 | write_log('saving colors', 'visualize_log.txt') 105 | qt_color_list = [['what','color']] 106 | save_qtype(qt_color_list, 'colors', mode) 107 | 108 | write_log('saving what is', 'visualize_log.txt') 109 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 110 | save_qtype(qt_whatis_list, 'whatis', mode) 111 | 112 | write_log('saving is', 'visualize_log.txt') 113 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 114 | save_qtype(qt_is_list, 'is', mode) 115 | 116 | write_log('saving how many', 'visualize_log.txt') 117 | qt_howmany_list =[['how','many']] 118 | save_qtype(qt_howmany_list, 'howmany', mode) 119 | 120 | def exec_validation(device_id, mode, it='', visualize=False): 121 | 122 | caffe.set_device(device_id) 123 | caffe.set_mode_gpu() 124 | net = caffe.Net('./result/proto_test.prototxt',\ 125 | './result/tmp.caffemodel',\ 126 | caffe.TEST) 127 | 128 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE) 129 | total_questions = len(dp.getQuesIds()) 130 | epoch = 0 131 | 132 | pred_list = [] 133 | testloss_list = [] 134 | stat_list = [] 135 | 136 | while epoch == 0: 137 | # t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 138 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 139 | net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0)) 140 | net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0)) 141 | net.blobs['img_feature'].data[...] = t_img_feature 142 | net.blobs['label'].data[...] = t_answer 143 | # net.blobs['glove'].data[...] 
= t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 146 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 147 | testloss_list.append(net.blobs['loss'].data) 148 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 149 | pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 150 | if visualize: 151 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 152 | if mode == 'test-dev' or 'test': 153 | ans_str = '' 154 | ans_list = ['']*10 155 | else: 156 | ans_str = dp.vec_to_answer(ans) 157 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 158 | stat_list.append({\ 159 | 'qid' : qid, 160 | 'q_list' : q_list, 161 | 'iid' : iid, 162 | 'answer': ans_str, 163 | 'ans_list': ans_list, 164 | 'pred' : pred }) 165 | # percent = 100 * float(len(pred_list)) / total_questions 166 | # sys.stdout.write('\r' + ('%.2f' % percent) + '%') 167 | # sys.stdout.flush() 168 | 169 | 170 | 171 | mean_testloss = np.array(testloss_list).mean() 172 | 173 | if mode == 'val': 174 | valFile = './result/val2014_resfile' 175 | with open(valFile, 'w') as f: 176 | json.dump(pred_list, f) 177 | if visualize: 178 | visualize_failures(stat_list,mode) 179 | annFile = config.DATA_PATHS['val']['ans_file'] 180 | quesFile = config.DATA_PATHS['val']['ques_file'] 181 | vqa = VQA(annFile, quesFile) 182 | vqaRes = vqa.loadRes(valFile, quesFile) 183 | vqaEval = VQAEval(vqa, vqaRes, n=2) 184 | vqaEval.evaluate() 185 | acc_overall = vqaEval.accuracy['overall'] 186 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 187 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 188 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 189 | elif mode == 'test-dev': 190 | filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results' 191 | with open(filename+'.json', 'w') as f: 192 | json.dump(pred_list, f) 193 | if visualize: 194 | visualize_failures(stat_list,mode) 195 | elif mode == 'test': 196 | filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(pred_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | 202 | def drawgraph(results, save_question_type_graphs=False): 203 | # 0:it 204 | # 1:trainloss 205 | # 2:testloss 206 | # 3:oa_acc 207 | # 4:qt_acc 208 | # 5:at_acc 209 | 210 | # training curve 211 | it = np.array([l[0] for l in results]) 212 | loss = np.array([l[1] for l in results]) 213 | valloss = np.array([l[2] for l in results]) 214 | valacc = np.array([l[3] for l in results]) 215 | 216 | fig = plt.figure() 217 | ax1 = fig.add_subplot(111) 218 | ax2 = ax1.twinx() 219 | 220 | ax1.plot(it,loss, color='blue', label='train loss') 221 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 222 | ax2.plot(it,valacc, color='red', label='acc on val') 223 | plt.legend(loc='lower left') 224 | 225 | ax1.set_xlabel('Iterations') 226 | ax1.set_ylabel('Loss Value') 227 | ax2.set_ylabel('Accuracy on Val [%]') 228 | 229 | plt.savefig('./learning_curve max_%2.2f.png'%valacc.max()) 230 | plt.clf() 231 | plt.close("all") 232 | 233 | # question type 234 | it = np.array([l[0] for l in results]) 235 | oa_acc = np.array([l[3] for l in results]) 236 | qt_dic_list = [l[4] for l in results] 237 | 238 | def draw_qt_acc(target_key_list, figname): 239 | fig = plt.figure() 240 | for k in target_key_list: 241 | 
write_log(str(k) + str(type(k)), 'visualize_log.txt') 242 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 243 | plt.plot(it,t_val,label=str(k)) 244 | plt.legend(fontsize='small') 245 | plt.ylim(0,100.) 246 | #plt.legend(prop={'size':6}) 247 | 248 | plt.xlabel('Iterations') 249 | plt.ylabel('Accuracy on Val [%]') 250 | 251 | plt.savefig(figname,dpi=200) 252 | plt.clf() 253 | plt.close("all") 254 | 255 | if save_question_type_graphs: 256 | s_keys = sorted(qt_dic_list[0].keys()) 257 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 258 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 259 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 260 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 261 | draw_qt_acc(['what color is the','what color are the','what color is',\ 262 | 'what color','what is the color of the'],'./qt_color.png') 263 | draw_qt_acc(['how many','how','how many people are',\ 264 | 'how many people are in'],'./qt_number.png') 265 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 266 | 'which'],'./qt_who_why_where_which.png') 267 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 268 | 'is the woman','is this person','what is the woman','is the person',\ 269 | 'what is the person'],'./qt_human.png') 270 | 271 | 272 | -------------------------------------------------------------------------------- /CNN Inception + Gate (tanh)/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /CNN Inception + Gate/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 7 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop 6 | MAX_ITERATIONS = 1000000 7 | PRINT_INTERVAL = 1000 8 | VALIDATE_INTERVAL = 150000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy. 
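# For reference, the assumed shape flow for one batch under these settings,
# following the qlstm() network definition used by this repo's train scripts
# (T = MAX_WORDS_IN_QUESTION; padding to fixed length T is assumed):
#
#     data     (32, 22)           word indices, one row per question
#     Embed    (32, 22, 300)      300-d word embeddings
#     Reshape  (32, 1, 22, 300)   one-channel input for the text CNN
#     Conv     kernel 1x300       collapses the embedding axis per word
#     Pool     kernel Tx1, max    one fixed-length vector per question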
9 | 10 | # what data to use for training 11 | TRAIN_DATA_SPLITS = 'train' 12 | 13 | # what data to use for the vocabulary 14 | QUESTION_VOCAB_SPACE = 'train' 15 | ANSWER_VOCAB_SPACE = 'train' 16 | 17 | # vqa tools - get from https://github.com/VT-vision-lab/VQA 18 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools' 19 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools' 20 | 21 | # location of the data 22 | VQA_PREFIX = '/tempspace/zwang6/VQA/' 23 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/' 24 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/' 25 | 26 | DATA_PATHS = { 27 | 'train': { 28 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json', 29 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json', 30 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_' 31 | }, 32 | 'val': { 33 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json', 34 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json', 35 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_' 36 | }, 37 | 'test-dev': { 38 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json', 39 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 40 | }, 41 | 'test': { 42 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json', 43 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 44 | }, 45 | # TODO it would be nice if genome also followed the same file format as vqa 46 | 'genome': { 47 | 'genome_file': GENOME_PREFIX + '/question_answers_prepro.json', 48 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/' 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /CNN Inception + Gate/qlstm_solver.prototxt: -------------------------------------------------------------------------------- 1 | # The train/test net protocol buffer definition 2 | train_net: "./result/proto_train.prototxt" 3 | test_net: "./result/proto_test.prototxt" 4 | 5 | max_iter: 1000000 6 | display: 5000 7 | snapshot: 5000 8 | snapshot_prefix: "./result/" 9 | 10 | # The base learning rate, momentum and the weight decay of the network. 
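# For reference, the standard Adam update that solver_type: ADAM implements
# (lr = base_lr, beta1 = momentum, beta2 = momentum2, eps ~ 1e-8 by default):
#
#     m <- beta1 * m + (1 - beta1) * grad
#     v <- beta2 * v + (1 - beta2) * grad^2
#     w <- w - lr * m_hat / (sqrt(v_hat) + eps)   # m_hat, v_hat bias-corrected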
11 | solver_type: ADAM 12 | base_lr: 0.0007 13 | momentum: 0.9 14 | momentum2: 0.999 15 | weight_decay: 0.000 16 | lr_policy: "fixed" 17 | test_iter: 1 18 | test_interval: 10000000 19 | 20 | # accumulate gradients 21 | iter_size: 2 22 | -------------------------------------------------------------------------------- /CNN Inception + Gate/visualize_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_provider_layer import VQADataProvider 16 | from vqa_data_provider_layer import VQADataProviderLayer 17 | 18 | import config 19 | sys.path.append(config.VQA_TOOLS_PATH) 20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 21 | 22 | from vqaTools.vqa import VQA 23 | from vqaEvaluation.vqaEval import VQAEval 24 | 25 | from write_to_log import write_log 26 | 27 | def visualize_failures(stat_list,mode): 28 | 29 | def save_qtype(qtype_list, save_filename, mode): 30 | 31 | if mode == 'val': 32 | savepath = os.path.join('./eval', save_filename) 33 | # TODO 34 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014' 35 | elif mode == 'test-dev': 36 | savepath = os.path.join('./test-dev', save_filename) 37 | # TODO 38 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 39 | elif mode == 'test': 40 | savepath = os.path.join('./test', save_filename) 41 | # TODO 42 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 43 | else: 44 | raise Exception('Unsupported mode') 45 | if os.path.exists(savepath): shutil.rmtree(savepath) 46 | if not os.path.exists(savepath): os.makedirs(savepath) 47 | 48 | for qt in qtype_list: 49 | count = 0 50 | for t_question in stat_list: 51 | #print count, t_question 52 | if count < 40/len(qtype_list): 53 | t_question_list = t_question['q_list'] 54 | saveflag = False 55 | #print 'debug****************************' 56 | #print qt 57 | #print t_question_list 58 | #print t_question_list[0] == qt[0] 59 | #print t_question_list[1] == qt[1] 60 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 61 | saveflag = True 62 | else: 63 | saveflag = False 64 | 65 | if saveflag == True: 66 | t_iid = t_question['iid'] 67 | if mode == 'val': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 70 | elif mode == 'test-dev' or 'test': 71 | t_img = Image.open(os.path.join(img_pre, \ 72 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 73 | 74 | # for caption 75 | #print t_iid 76 | #annIds = caps.getAnnIds(t_iid) 77 | #anns = caps.loadAnns(annIds) 78 | #cap_list = [ann['caption'] for ann in anns] 79 | ans_list = t_question['ans_list'] 80 | draw = ImageDraw.Draw(t_img) 81 | for i in range(len(ans_list)): 82 | try: 83 | draw.text((10,10*i), str(ans_list[i])) 84 | except: 85 | pass 86 | 87 | ans = t_question['answer'] 88 | pred = t_question['pred'] 89 | if ans == -1: 90 | pre = '' 91 | elif ans == pred: 92 | pre = 'correct ' 93 | else: 94 | pre = 'failure ' 95 | #print ' aaa ', ans, pred 96 | ans = re.sub( '/', ' ', str(ans)) 97 | pred = re.sub( '/', ' ', str(pred)) 98 | img_title = pre + str(' '.join(t_question_list)) + '. 
a_' + \ 99 | str(ans) + ' p_' + str(pred) + '.png' 100 | count += 1 101 | write_log(os.path.join(savepath,img_title), 'visualize_log.txt') 102 | t_img.save(os.path.join(savepath,img_title)) 103 | 104 | write_log('saving colors', 'visualize_log.txt') 105 | qt_color_list = [['what','color']] 106 | save_qtype(qt_color_list, 'colors', mode) 107 | 108 | write_log('saving what is', 'visualize_log.txt') 109 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 110 | save_qtype(qt_whatis_list, 'whatis', mode) 111 | 112 | write_log('saving is', 'visualize_log.txt') 113 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 114 | save_qtype(qt_is_list, 'is', mode) 115 | 116 | write_log('saving how many', 'visualize_log.txt') 117 | qt_howmany_list =[['how','many']] 118 | save_qtype(qt_howmany_list, 'howmany', mode) 119 | 120 | def exec_validation(device_id, mode, it='', visualize=False): 121 | 122 | caffe.set_device(device_id) 123 | caffe.set_mode_gpu() 124 | net = caffe.Net('./result/proto_test.prototxt',\ 125 | './result/tmp.caffemodel',\ 126 | caffe.TEST) 127 | 128 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE) 129 | total_questions = len(dp.getQuesIds()) 130 | epoch = 0 131 | 132 | pred_list = [] 133 | testloss_list = [] 134 | stat_list = [] 135 | 136 | while epoch == 0: 137 | # t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 138 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 139 | net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0)) 140 | net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0)) 141 | net.blobs['img_feature'].data[...] = t_img_feature 142 | net.blobs['label'].data[...] = t_answer 143 | # net.blobs['glove'].data[...] 
= t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 146 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 147 | testloss_list.append(net.blobs['loss'].data) 148 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 149 | pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 150 | if visualize: 151 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 152 | if mode == 'test-dev' or 'test': 153 | ans_str = '' 154 | ans_list = ['']*10 155 | else: 156 | ans_str = dp.vec_to_answer(ans) 157 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 158 | stat_list.append({\ 159 | 'qid' : qid, 160 | 'q_list' : q_list, 161 | 'iid' : iid, 162 | 'answer': ans_str, 163 | 'ans_list': ans_list, 164 | 'pred' : pred }) 165 | # percent = 100 * float(len(pred_list)) / total_questions 166 | # sys.stdout.write('\r' + ('%.2f' % percent) + '%') 167 | # sys.stdout.flush() 168 | 169 | 170 | 171 | mean_testloss = np.array(testloss_list).mean() 172 | 173 | if mode == 'val': 174 | valFile = './result/val2014_resfile' 175 | with open(valFile, 'w') as f: 176 | json.dump(pred_list, f) 177 | if visualize: 178 | visualize_failures(stat_list,mode) 179 | annFile = config.DATA_PATHS['val']['ans_file'] 180 | quesFile = config.DATA_PATHS['val']['ques_file'] 181 | vqa = VQA(annFile, quesFile) 182 | vqaRes = vqa.loadRes(valFile, quesFile) 183 | vqaEval = VQAEval(vqa, vqaRes, n=2) 184 | vqaEval.evaluate() 185 | acc_overall = vqaEval.accuracy['overall'] 186 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 187 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 188 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 189 | elif mode == 'test-dev': 190 | filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results' 191 | with open(filename+'.json', 'w') as f: 192 | json.dump(pred_list, f) 193 | if visualize: 194 | visualize_failures(stat_list,mode) 195 | elif mode == 'test': 196 | filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(pred_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | 202 | def drawgraph(results, save_question_type_graphs=False): 203 | # 0:it 204 | # 1:trainloss 205 | # 2:testloss 206 | # 3:oa_acc 207 | # 4:qt_acc 208 | # 5:at_acc 209 | 210 | # training curve 211 | it = np.array([l[0] for l in results]) 212 | loss = np.array([l[1] for l in results]) 213 | valloss = np.array([l[2] for l in results]) 214 | valacc = np.array([l[3] for l in results]) 215 | 216 | fig = plt.figure() 217 | ax1 = fig.add_subplot(111) 218 | ax2 = ax1.twinx() 219 | 220 | ax1.plot(it,loss, color='blue', label='train loss') 221 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 222 | ax2.plot(it,valacc, color='red', label='acc on val') 223 | plt.legend(loc='lower left') 224 | 225 | ax1.set_xlabel('Iterations') 226 | ax1.set_ylabel('Loss Value') 227 | ax2.set_ylabel('Accuracy on Val [%]') 228 | 229 | plt.savefig('./learning_curve max_%2.2f.png'%valacc.max()) 230 | plt.clf() 231 | plt.close("all") 232 | 233 | # question type 234 | it = np.array([l[0] for l in results]) 235 | oa_acc = np.array([l[3] for l in results]) 236 | qt_dic_list = [l[4] for l in results] 237 | 238 | def draw_qt_acc(target_key_list, figname): 239 | fig = plt.figure() 240 | for k in target_key_list: 241 | 
write_log(str(k) + str(type(k)), 'visualize_log.txt') 242 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 243 | plt.plot(it,t_val,label=str(k)) 244 | plt.legend(fontsize='small') 245 | plt.ylim(0,100.) 246 | #plt.legend(prop={'size':6}) 247 | 248 | plt.xlabel('Iterations') 249 | plt.ylabel('Accuracy on Val [%]') 250 | 251 | plt.savefig(figname,dpi=200) 252 | plt.clf() 253 | plt.close("all") 254 | 255 | if save_question_type_graphs: 256 | s_keys = sorted(qt_dic_list[0].keys()) 257 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 258 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 259 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 260 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 261 | draw_qt_acc(['what color is the','what color are the','what color is',\ 262 | 'what color','what is the color of the'],'./qt_color.png') 263 | draw_qt_acc(['how many','how','how many people are',\ 264 | 'how many people are in'],'./qt_number.png') 265 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 266 | 'which'],'./qt_who_why_where_which.png') 267 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 268 | 'is the woman','is this person','what is the woman','is the person',\ 269 | 'what is the person'],'./qt_human.png') 270 | 271 | 272 | -------------------------------------------------------------------------------- /CNN Inception + Gate/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /CNN Inception + Residual/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 6 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop 6 | MAX_ITERATIONS = 1000000 7 | PRINT_INTERVAL = 1000 8 | VALIDATE_INTERVAL = 130000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy. 
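# Hedged sketch of how the two intervals above are consumed (mirroring the
# training loop in this folder's train_att_bc.py; names come from this file):
#
#   for it in range(MAX_ITERATIONS):
#       solver.step(1)
#       if it != 0 and it % PRINT_INTERVAL == 0:       # log mean train loss
#           ...
#       if it != 0 and it % VALIDATE_INTERVAL == 0:    # run exec_validation
#           ...
#
# With VALIDATE_INTERVAL = 130000, the first validation fires at iteration
# 130000; per the README, training is stopped there with Control + C, so that
# validation accuracy is what gets reported as the test accuracy.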
9 | 10 | # what data to use for training 11 | TRAIN_DATA_SPLITS = 'train' 12 | 13 | # what data to use for the vocabulary 14 | QUESTION_VOCAB_SPACE = 'train' 15 | ANSWER_VOCAB_SPACE = 'train' 16 | 17 | # vqa tools - get from https://github.com/VT-vision-lab/VQA 18 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools' 19 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools' 20 | 21 | # location of the data 22 | VQA_PREFIX = '/tempspace/zwang6/VQA/' 23 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/' 24 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/' 25 | 26 | DATA_PATHS = { 27 | 'train': { 28 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json', 29 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json', 30 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_' 31 | }, 32 | 'val': { 33 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json', 34 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json', 35 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_' 36 | }, 37 | 'test-dev': { 38 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json', 39 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 40 | }, 41 | 'test': { 42 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json', 43 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 44 | }, 45 | # TODO it would be nice if genome also followed the same file format as vqa 46 | 'genome': { 47 | 'genome_file': GENOME_PREFIX + '/question_answers_prepro.json', 48 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/' 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /CNN Inception + Residual/qlstm_solver.prototxt: -------------------------------------------------------------------------------- 1 | # The train/test net protocol buffer definition 2 | train_net: "./result/proto_train.prototxt" 3 | test_net: "./result/proto_test.prototxt" 4 | 5 | max_iter: 1000000 6 | display: 5000 7 | snapshot: 5000 8 | snapshot_prefix: "./result/" 9 | 10 | # The base learning rate, momentum and the weight decay of the network. 11 | solver_type: ADAM 12 | base_lr: 0.0007 13 | momentum: 0.9 14 | momentum2: 0.999 15 | weight_decay: 0.000 16 | lr_policy: "fixed" 17 | test_iter: 1 18 | test_interval: 10000000 19 | 20 | # accumulate gradients 21 | iter_size: 2 22 | -------------------------------------------------------------------------------- /CNN Inception + Residual/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /CNN Non-Inception/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 9 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop 6 | MAX_ITERATIONS = 1000000 7 | PRINT_INTERVAL = 1000 8 | VALIDATE_INTERVAL = 160000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. 
Then the validation accuracy is the test accuracy. 9 | 10 | # what data to use for training 11 | TRAIN_DATA_SPLITS = 'train' 12 | 13 | # what data to use for the vocabulary 14 | QUESTION_VOCAB_SPACE = 'train' 15 | ANSWER_VOCAB_SPACE = 'train' 16 | 17 | # vqa tools - get from https://github.com/VT-vision-lab/VQA 18 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools' 19 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools' 20 | 21 | # location of the data 22 | VQA_PREFIX = '/tempspace/zwang6/VQA/' 23 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/' 24 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/' 25 | 26 | DATA_PATHS = { 27 | 'train': { 28 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json', 29 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json', 30 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_' 31 | }, 32 | 'val': { 33 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json', 34 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json', 35 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_' 36 | }, 37 | 'test-dev': { 38 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json', 39 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 40 | }, 41 | 'test': { 42 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json', 43 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 44 | }, 45 | # TODO it would be nice if genome also followed the same file format as vqa 46 | 'genome': { 47 | 'genome_file': GENOME_PREFIX + '/question_answers_prepro.json', 48 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/' 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /CNN Non-Inception/qlstm_solver.prototxt: -------------------------------------------------------------------------------- 1 | # The train/test net protocol buffer definition 2 | train_net: "./result/proto_train.prototxt" 3 | test_net: "./result/proto_test.prototxt" 4 | 5 | max_iter: 1000000 6 | display: 5000 7 | snapshot: 5000 8 | snapshot_prefix: "./result/" 9 | 10 | # The base learning rate, momentum and the weight decay of the network. 
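# A hedged reading of the settings below: Adam with a fixed base_lr of 0.0007
# (momentum and momentum2 map to Adam's beta1 = 0.9 and beta2 = 0.999) and no
# weight decay. Because iter_size is 2, gradients from two forward/backward
# passes are accumulated before each weight update, so with BATCH_SIZE = 32
# from this folder's config.py the effective batch size is 32 * 2 = 64.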
11 | solver_type: ADAM 12 | base_lr: 0.0007 13 | momentum: 0.9 14 | momentum2: 0.999 15 | weight_decay: 0.000 16 | lr_policy: "fixed" 17 | test_iter: 1 18 | test_interval: 10000000 19 | 20 | # accumulate gradients 21 | iter_size: 2 22 | -------------------------------------------------------------------------------- /CNN Non-Inception/train_att_bc.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import os 4 | import sys 5 | import numpy as np 6 | import json 7 | import matplotlib.pyplot as plt 8 | from write_to_log import write_log 9 | 10 | import caffe 11 | from caffe import layers as L 12 | from caffe import params as P 13 | 14 | from vqa_data_provider_layer import VQADataProvider 15 | from visualize_tools import exec_validation, drawgraph 16 | import config 17 | 18 | 19 | def qlstm(mode, batchsize, T, question_vocab_size): 20 | n = caffe.NetSpec() 21 | mode_str = json.dumps({'mode':mode, 'batchsize':batchsize}) 22 | # n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\ 23 | # module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 ) 24 | n.data, n.cont, n.img_feature, n.label = L.Python(\ 25 | module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=4 ) 26 | 27 | # word embedding 28 | n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \ 29 | weight_filler=dict(type='uniform',min=-0.08,max=0.08)) 30 | # n.embed = L.TanH(n.embed_ba) 31 | n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(dict(axis=0))) 32 | n.embed_scale_resh = L.Reshape(n.embed_scale,\ 33 | reshape_param=dict(\ 34 | shape=dict(dim=[batchsize,1,T,-1]))) 35 | 36 | # convolution 37 | n.word_feature_3 = L.Convolution(n.embed_scale_resh, kernel_h=3, kernel_w=300, stride=1, num_output=2048, pad_h=1, pad_w=0, weight_filler=dict(type='xavier')) 38 | n.word_relu_3 = L.ReLU(n.word_feature_3) 39 | n.word_vec_3 = L.Pooling(n.word_relu_3, kernel_h=T, kernel_w=1, stride=T, pool=P.Pooling.MAX) 40 | n.concat_vec_dropped = L.Dropout(n.word_vec_3,dropout_param={'dropout_ratio':0.5}) 41 | 42 | n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped, axis=2, tiles=14) 43 | n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14) 44 | n.i_emb_tanh_droped_resh = L.Reshape(n.img_feature,reshape_param=dict(shape=dict(dim=[-1,2048,14,14]))) 45 | n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh, compact_bilinear_param=dict(num_output=16000,sum_pool=False)) 46 | n.blcf_sign_sqrt = L.SignedSqrt(n.blcf) 47 | n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt) 48 | n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,dropout_param={'dropout_ratio':0.1}) 49 | 50 | # multi-channel attention 51 | n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier')) 52 | n.att_conv1_relu = L.ReLU(n.att_conv1) 53 | n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0, weight_filler=dict(type='xavier')) 54 | n.att_reshaped = L.Reshape(n.att_conv2,reshape_param=dict(shape=dict(dim=[-1,2,14*14]))) 55 | n.att_softmax = L.Softmax(n.att_reshaped, axis=2) 56 | n.att = L.Reshape(n.att_softmax,reshape_param=dict(shape=dict(dim=[-1,2,14,14]))) 57 | att_maps = L.Slice(n.att, ntop=2, slice_param={'axis':1}) 58 | n.att_map0 = att_maps[0] 59 | n.att_map1 = att_maps[1] 60 | dummy = 
L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1) 61 | n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy) 62 | n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy) 63 | n.att_feature0_resh = L.Reshape(n.att_feature0, reshape_param=dict(shape=dict(dim=[-1,2048]))) 64 | n.att_feature1_resh = L.Reshape(n.att_feature1, reshape_param=dict(shape=dict(dim=[-1,2048]))) 65 | n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh) 66 | 67 | # merge attention and lstm with compact bilinear pooling 68 | n.att_feature_resh = L.Reshape(n.att_feature, reshape_param=dict(shape=dict(dim=[-1,4096,1,1]))) 69 | #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1]))) 70 | n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.concat_vec_dropped, 71 | compact_bilinear_param=dict(num_output=16000,sum_pool=False)) 72 | n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm) 73 | n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt) 74 | 75 | n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2, dropout_param={'dropout_ratio':0.1}) 76 | n.bc_dropped_resh = L.Reshape(n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000]))) 77 | 78 | n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000, weight_filler=dict(type='xavier')) 79 | n.loss = L.SoftmaxWithLoss(n.prediction, n.label) 80 | return n.to_proto() 81 | 82 | def make_answer_vocab(adic, vocab_size): 83 | """ 84 | Returns a dictionary that maps words to indices. 85 | """ 86 | adict = {'':0} 87 | nadict = {'':1000000} 88 | vid = 1 89 | for qid in adic.keys(): 90 | answer_obj = adic[qid] 91 | answer_list = [ans['answer'] for ans in answer_obj] 92 | 93 | for q_ans in answer_list: 94 | # create dict 95 | if adict.has_key(q_ans): 96 | nadict[q_ans] += 1 97 | else: 98 | nadict[q_ans] = 1 99 | adict[q_ans] = vid 100 | vid +=1 101 | 102 | # debug 103 | nalist = [] 104 | for k,v in sorted(nadict.items(), key=lambda x:x[1]): 105 | nalist.append((k,v)) 106 | 107 | # remove words that appear less than once 108 | n_del_ans = 0 109 | n_valid_ans = 0 110 | adict_nid = {} 111 | for i, w in enumerate(nalist[:-vocab_size]): 112 | del adict[w[0]] 113 | n_del_ans += w[1] 114 | for i, w in enumerate(nalist[-vocab_size:]): 115 | n_valid_ans += w[1] 116 | adict_nid[w[0]] = i 117 | 118 | return adict_nid 119 | 120 | def make_question_vocab(qdic): 121 | """ 122 | Returns a dictionary that maps words to indices. 123 | """ 124 | vdict = {'':0} 125 | vid = 1 126 | for qid in qdic.keys(): 127 | # sequence to list 128 | q_str = qdic[qid]['qstr'] 129 | q_list = VQADataProvider.seq_to_list(q_str) 130 | 131 | # create dict 132 | for w in q_list: 133 | if not vdict.has_key(w): 134 | vdict[w] = vid 135 | vid +=1 136 | 137 | return vdict 138 | 139 | def make_vocab_files(): 140 | """ 141 | Produce the question and answer vocabulary files. 142 | """ 143 | write_log('making question vocab... ' + config.QUESTION_VOCAB_SPACE, 'log.txt') 144 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE) 145 | question_vocab = make_question_vocab(qdic) 146 | write_log('making answer vocab... 
' + config.ANSWER_VOCAB_SPACE, 'log.txt') 147 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE) 148 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS) 149 | return question_vocab, answer_vocab 150 | 151 | def main(): 152 | if not os.path.exists('./result'): 153 | os.makedirs('./result') 154 | 155 | question_vocab, answer_vocab = {}, {} 156 | if os.path.exists('./result/vdict.json') and os.path.exists('./result/adict.json'): 157 | write_log('restoring vocab', 'log.txt') 158 | with open('./result/vdict.json','r') as f: 159 | question_vocab = json.load(f) 160 | with open('./result/adict.json','r') as f: 161 | answer_vocab = json.load(f) 162 | else: 163 | question_vocab, answer_vocab = make_vocab_files() 164 | with open('./result/vdict.json','w') as f: 165 | json.dump(question_vocab, f) 166 | with open('./result/adict.json','w') as f: 167 | json.dump(answer_vocab, f) 168 | 169 | write_log('question vocab size: '+ str(len(question_vocab)), 'log.txt') 170 | write_log('answer vocab size: '+ str(len(answer_vocab)), 'log.txt') 171 | 172 | with open('./result/proto_train.prototxt', 'w') as f: 173 | f.write(str(qlstm(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \ 174 | config.MAX_WORDS_IN_QUESTION, len(question_vocab)))) 175 | 176 | with open('./result/proto_test.prototxt', 'w') as f: 177 | f.write(str(qlstm('val', config.VAL_BATCH_SIZE, \ 178 | config.MAX_WORDS_IN_QUESTION, len(question_vocab)))) 179 | 180 | caffe.set_device(config.GPU_ID) 181 | caffe.set_mode_gpu() 182 | solver = caffe.get_solver('./qlstm_solver.prototxt') 183 | 184 | train_loss = np.zeros(config.MAX_ITERATIONS) 185 | # results = [] 186 | 187 | for it in range(config.MAX_ITERATIONS): 188 | solver.step(1) 189 | 190 | # store the train loss 191 | train_loss[it] = solver.net.blobs['loss'].data 192 | 193 | if it != 0 and it % config.PRINT_INTERVAL == 0: 194 | write_log('------------------------------------', 'log.txt') 195 | write_log('Iteration: ' + str(it), 'log.txt') 196 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean() 197 | write_log('Train loss: ' + str(c_mean_loss), 'log.txt') 198 | if it != 0 and it % config.VALIDATE_INTERVAL == 0: # acutually test 199 | solver.test_nets[0].save('./result/tmp.caffemodel') 200 | write_log('Validating...', 'log.txt') 201 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.GPU_ID, 'val', it=it) 202 | write_log('Iteration: ' + str(it), 'log.txt') 203 | write_log('Test loss: ' + str(test_loss), 'log.txt') 204 | write_log('Overall Accuracy: ' + str(acc_overall), 'log.txt') 205 | write_log('Per Question Type Accuracy is the following:', 'log.txt') 206 | for quesType in acc_per_ques: 207 | write_log("%s : %.02f" % (quesType, acc_per_ques[quesType]), 'log.txt') 208 | write_log('Per Answer Type Accuracy is the following:', 'log.txt') 209 | for ansType in acc_per_ans: 210 | write_log("%s : %.02f" % (ansType, acc_per_ans[ansType]), 'log.txt') 211 | # results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans]) 212 | # best_result_idx = np.array([x[3] for x in results]).argmax() 213 | # write_log('Best accuracy of ' + str(results[best_result_idx][3]) + ' was at iteration ' + str(results[best_result_idx][0]), 'log.txt') 214 | # drawgraph(results) 215 | 216 | if __name__ == '__main__': 217 | main() 218 | -------------------------------------------------------------------------------- /CNN Non-Inception/visualize_tools.py: -------------------------------------------------------------------------------- 1 | import numpy 
as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_provider_layer import VQADataProvider 16 | from vqa_data_provider_layer import VQADataProviderLayer 17 | 18 | import config 19 | sys.path.append(config.VQA_TOOLS_PATH) 20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 21 | 22 | from vqaTools.vqa import VQA 23 | from vqaEvaluation.vqaEval import VQAEval 24 | 25 | from write_to_log import write_log 26 | 27 | def visualize_failures(stat_list,mode): 28 | 29 | def save_qtype(qtype_list, save_filename, mode): 30 | 31 | if mode == 'val': 32 | savepath = os.path.join('./eval', save_filename) 33 | # TODO 34 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014' 35 | elif mode == 'test-dev': 36 | savepath = os.path.join('./test-dev', save_filename) 37 | # TODO 38 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 39 | elif mode == 'test': 40 | savepath = os.path.join('./test', save_filename) 41 | # TODO 42 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 43 | else: 44 | raise Exception('Unsupported mode') 45 | if os.path.exists(savepath): shutil.rmtree(savepath) 46 | if not os.path.exists(savepath): os.makedirs(savepath) 47 | 48 | for qt in qtype_list: 49 | count = 0 50 | for t_question in stat_list: 51 | #print count, t_question 52 | if count < 40/len(qtype_list): 53 | t_question_list = t_question['q_list'] 54 | saveflag = False 55 | #print 'debug****************************' 56 | #print qt 57 | #print t_question_list 58 | #print t_question_list[0] == qt[0] 59 | #print t_question_list[1] == qt[1] 60 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 61 | saveflag = True 62 | else: 63 | saveflag = False 64 | 65 | if saveflag == True: 66 | t_iid = t_question['iid'] 67 | if mode == 'val': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 70 | elif mode == 'test-dev' or mode == 'test': 71 | t_img = Image.open(os.path.join(img_pre, \ 72 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 73 | 74 | # for caption 75 | #print t_iid 76 | #annIds = caps.getAnnIds(t_iid) 77 | #anns = caps.loadAnns(annIds) 78 | #cap_list = [ann['caption'] for ann in anns] 79 | ans_list = t_question['ans_list'] 80 | draw = ImageDraw.Draw(t_img) 81 | for i in range(len(ans_list)): 82 | try: 83 | draw.text((10,10*i), str(ans_list[i])) 84 | except: 85 | pass 86 | 87 | ans = t_question['answer'] 88 | pred = t_question['pred'] 89 | if ans == -1: 90 | pre = '' 91 | elif ans == pred: 92 | pre = 'correct ' 93 | else: 94 | pre = 'failure ' 95 | #print ' aaa ', ans, pred 96 | ans = re.sub( '/', ' ', str(ans)) 97 | pred = re.sub( '/', ' ', str(pred)) 98 | img_title = pre + str(' '.join(t_question_list)) + '. 
a_' + \ 99 | str(ans) + ' p_' + str(pred) + '.png' 100 | count += 1 101 | write_log(os.path.join(savepath,img_title), 'visualize_log.txt') 102 | t_img.save(os.path.join(savepath,img_title)) 103 | 104 | write_log('saving colors', 'visualize_log.txt') 105 | qt_color_list = [['what','color']] 106 | save_qtype(qt_color_list, 'colors', mode) 107 | 108 | write_log('saving what is', 'visualize_log.txt') 109 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 110 | save_qtype(qt_whatis_list, 'whatis', mode) 111 | 112 | write_log('saving is', 'visualize_log.txt') 113 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 114 | save_qtype(qt_is_list, 'is', mode) 115 | 116 | write_log('saving how many', 'visualize_log.txt') 117 | qt_howmany_list =[['how','many']] 118 | save_qtype(qt_howmany_list, 'howmany', mode) 119 | 120 | def exec_validation(device_id, mode, it='', visualize=False): 121 | 122 | caffe.set_device(device_id) 123 | caffe.set_mode_gpu() 124 | net = caffe.Net('./result/proto_test.prototxt',\ 125 | './result/tmp.caffemodel',\ 126 | caffe.TEST) 127 | 128 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE) 129 | total_questions = len(dp.getQuesIds()) 130 | epoch = 0 131 | 132 | pred_list = [] 133 | testloss_list = [] 134 | stat_list = [] 135 | 136 | while epoch == 0: 137 | # t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 138 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 139 | net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0)) 140 | net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0)) 141 | net.blobs['img_feature'].data[...] = t_img_feature 142 | net.blobs['label'].data[...] = t_answer 143 | # net.blobs['glove'].data[...] 
= t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 146 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 147 | testloss_list.append(net.blobs['loss'].data) 148 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 149 | pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 150 | if visualize: 151 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 152 | if mode == 'test-dev' or mode == 'test': 153 | ans_str = '' 154 | ans_list = ['']*10 155 | else: 156 | ans_str = dp.vec_to_answer(ans) 157 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 158 | stat_list.append({\ 159 | 'qid' : qid, 160 | 'q_list' : q_list, 161 | 'iid' : iid, 162 | 'answer': ans_str, 163 | 'ans_list': ans_list, 164 | 'pred' : pred }) 165 | # percent = 100 * float(len(pred_list)) / total_questions 166 | # sys.stdout.write('\r' + ('%.2f' % percent) + '%') 167 | # sys.stdout.flush() 168 | 169 | 170 | 171 | mean_testloss = np.array(testloss_list).mean() 172 | 173 | if mode == 'val': 174 | valFile = './result/val2014_resfile' 175 | with open(valFile, 'w') as f: 176 | json.dump(pred_list, f) 177 | if visualize: 178 | visualize_failures(stat_list,mode) 179 | annFile = config.DATA_PATHS['val']['ans_file'] 180 | quesFile = config.DATA_PATHS['val']['ques_file'] 181 | vqa = VQA(annFile, quesFile) 182 | vqaRes = vqa.loadRes(valFile, quesFile) 183 | vqaEval = VQAEval(vqa, vqaRes, n=2) 184 | vqaEval.evaluate() 185 | acc_overall = vqaEval.accuracy['overall'] 186 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 187 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 188 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 189 | elif mode == 'test-dev': 190 | filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results' 191 | with open(filename+'.json', 'w') as f: 192 | json.dump(pred_list, f) 193 | if visualize: 194 | visualize_failures(stat_list,mode) 195 | elif mode == 'test': 196 | filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(pred_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | 202 | def drawgraph(results, save_question_type_graphs=False): 203 | # 0:it 204 | # 1:trainloss 205 | # 2:testloss 206 | # 3:oa_acc 207 | # 4:qt_acc 208 | # 5:at_acc 209 | 210 | # training curve 211 | it = np.array([l[0] for l in results]) 212 | loss = np.array([l[1] for l in results]) 213 | valloss = np.array([l[2] for l in results]) 214 | valacc = np.array([l[3] for l in results]) 215 | 216 | fig = plt.figure() 217 | ax1 = fig.add_subplot(111) 218 | ax2 = ax1.twinx() 219 | 220 | ax1.plot(it,loss, color='blue', label='train loss') 221 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 222 | ax2.plot(it,valacc, color='red', label='acc on val') 223 | plt.legend(loc='lower left') 224 | 225 | ax1.set_xlabel('Iterations') 226 | ax1.set_ylabel('Loss Value') 227 | ax2.set_ylabel('Accuracy on Val [%]') 228 | 229 | plt.savefig('./learning_curve max_%2.2f.png'%valacc.max()) 230 | plt.clf() 231 | plt.close("all") 232 | 233 | # question type 234 | it = np.array([l[0] for l in results]) 235 | oa_acc = np.array([l[3] for l in results]) 236 | qt_dic_list = [l[4] for l in results] 237 | 238 | def draw_qt_acc(target_key_list, figname): 239 | fig = plt.figure() 240 | for k in target_key_list: 241 | 
write_log(str(k) + str(type(k)), 'visualize_log.txt') 242 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 243 | plt.plot(it,t_val,label=str(k)) 244 | plt.legend(fontsize='small') 245 | plt.ylim(0,100.) 246 | #plt.legend(prop={'size':6}) 247 | 248 | plt.xlabel('Iterations') 249 | plt.ylabel('Accuracy on Val [%]') 250 | 251 | plt.savefig(figname,dpi=200) 252 | plt.clf() 253 | plt.close("all") 254 | 255 | if save_question_type_graphs: 256 | s_keys = sorted(qt_dic_list[0].keys()) 257 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 258 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 259 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 260 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 261 | draw_qt_acc(['what color is the','what color are the','what color is',\ 262 | 'what color','what is the color of the'],'./qt_color.png') 263 | draw_qt_acc(['how many','how','how many people are',\ 264 | 'how many people are in'],'./qt_number.png') 265 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 266 | 'which'],'./qt_who_why_where_which.png') 267 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 268 | 'is the woman','is this person','what is the woman','is the person',\ 269 | 'what is the person'],'./qt_human.png') 270 | 271 | 272 | -------------------------------------------------------------------------------- /CNN Non-Inception/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /LSTM (baseline)/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 10 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop 6 | MAX_ITERATIONS = 1000000 7 | PRINT_INTERVAL = 1000 8 | VALIDATE_INTERVAL = 90000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy. 
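# NUM_OUTPUT_UNITS above is consumed by make_answer_vocab() in this folder's
# train_att_bc.py, which keeps only the 3000 most frequent training answers.
# A rough standalone sketch of the idea (a hedged illustration, not the exact
# implementation):
#
#   counts = {}                                   # answer string -> frequency
#   for answer_obj in adic.values():
#       for a in answer_obj:
#           counts[a['answer']] = counts.get(a['answer'], 0) + 1
#   top = sorted(counts, key=counts.get)[-NUM_OUTPUT_UNITS:]  # most frequent
#   adict = {ans: i for i, ans in enumerate(top)}  # answer -> class index
#
# Any rarer answer falls outside the 3000-way softmax in the prediction layer.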
9 | 10 | # what data to use for training 11 | TRAIN_DATA_SPLITS = 'train' 12 | 13 | # what data to use for the vocabulary 14 | QUESTION_VOCAB_SPACE = 'train' 15 | ANSWER_VOCAB_SPACE = 'train' 16 | 17 | # vqa tools - get from https://github.com/VT-vision-lab/VQA 18 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools' 19 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools' 20 | 21 | # location of the data 22 | VQA_PREFIX = '/tempspace/zwang6/VQA/' 23 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/' 24 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/' 25 | 26 | DATA_PATHS = { 27 | 'train': { 28 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json', 29 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json', 30 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_' 31 | }, 32 | 'val': { 33 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json', 34 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json', 35 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_' 36 | }, 37 | 'test-dev': { 38 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json', 39 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 40 | }, 41 | 'test': { 42 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json', 43 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 44 | }, 45 | # TODO it would be nice if genome also followed the same file format as vqa 46 | 'genome': { 47 | 'genome_file': GENOME_PREFIX + '/question_answers_prepro.json', 48 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/' 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /LSTM (baseline)/qlstm_solver.prototxt: -------------------------------------------------------------------------------- 1 | # The train/test net protocol buffer definition 2 | train_net: "./result/proto_train.prototxt" 3 | test_net: "./result/proto_test.prototxt" 4 | 5 | max_iter: 1000000 6 | display: 5000 7 | snapshot: 5000 8 | snapshot_prefix: "./result/" 9 | 10 | # The base learning rate, momentum and the weight decay of the network. 
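# Hedged note on the snapshot settings above: every 5000 iterations Caffe
# writes a weights file and a solver-state file under snapshot_prefix, e.g.
#   ./result/_iter_5000.caffemodel   (weights)
#   ./result/_iter_5000.solverstate  (optimizer state, usable for resuming
#                                     via pycaffe's solver.restore())
# so stopping training with Control + C, as the README suggests, still leaves
# the most recent snapshot on disk.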
11 | solver_type: ADAM 12 | base_lr: 0.0007 13 | momentum: 0.9 14 | momentum2: 0.999 15 | weight_decay: 0.000 16 | lr_policy: "fixed" 17 | test_iter: 1 18 | test_interval: 10000000 19 | 20 | # accumulate gradients 21 | iter_size: 2 22 | -------------------------------------------------------------------------------- /LSTM (baseline)/train_att_bc.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import os 4 | import sys 5 | import numpy as np 6 | import json 7 | import matplotlib.pyplot as plt 8 | from write_to_log import write_log 9 | 10 | import caffe 11 | from caffe import layers as L 12 | from caffe import params as P 13 | 14 | from vqa_data_provider_layer import VQADataProvider 15 | from visualize_tools import exec_validation, drawgraph 16 | import config 17 | 18 | 19 | def qlstm(mode, batchsize, T, question_vocab_size): 20 | n = caffe.NetSpec() 21 | mode_str = json.dumps({'mode':mode, 'batchsize':batchsize}) 22 | # n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\ 23 | # module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 ) 24 | n.data, n.cont, n.img_feature, n.label = L.Python(\ 25 | module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=4 ) 26 | 27 | n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \ 28 | weight_filler=dict(type='uniform',min=-0.08,max=0.08)) 29 | n.embed = L.TanH(n.embed_ba) 30 | # concat_word_embed = [n.embed, n.glove] 31 | # n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2}) # T x N x 600 32 | 33 | # LSTM1 34 | n.lstm1 = L.LSTM(\ 35 | n.embed, n.cont,\ 36 | recurrent_param=dict(\ 37 | num_output=1024,\ 38 | weight_filler=dict(type='uniform',min=-0.08,max=0.08),\ 39 | bias_filler=dict(type='constant',value=0))) 40 | tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis':0}) 41 | for i in xrange(T-1): 42 | n.__setattr__('slice_first'+str(i), tops1[int(i)]) 43 | n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0)) 44 | n.lstm1_out = tops1[T-1] 45 | n.lstm1_reshaped = L.Reshape(n.lstm1_out,\ 46 | reshape_param=dict(\ 47 | shape=dict(dim=[-1,1024]))) 48 | n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':0.3}) 49 | n.lstm1_droped = L.Dropout(n.lstm1,dropout_param={'dropout_ratio':0.3}) 50 | # LSTM2 51 | n.lstm2 = L.LSTM(\ 52 | n.lstm1_droped, n.cont,\ 53 | recurrent_param=dict(\ 54 | num_output=1024,\ 55 | weight_filler=dict(type='uniform',min=-0.08,max=0.08),\ 56 | bias_filler=dict(type='constant',value=0))) 57 | tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis':0}) 58 | for i in xrange(T-1): 59 | n.__setattr__('slice_second'+str(i), tops2[int(i)]) 60 | n.__setattr__('silence_data_second'+str(i), L.Silence(tops2[int(i)],ntop=0)) 61 | n.lstm2_out = tops2[T-1] 62 | n.lstm2_reshaped = L.Reshape(n.lstm2_out,\ 63 | reshape_param=dict(\ 64 | shape=dict(dim=[-1,1024]))) 65 | n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,dropout_param={'dropout_ratio':0.3}) 66 | concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped] 67 | n.lstm_12 = L.Concat(*concat_botom) 68 | 69 | n.q_emb_tanh_droped_resh = L.Reshape(n.lstm_12,reshape_param=dict(shape=dict(dim=[-1,2048,1,1]))) 70 | n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh, axis=2, tiles=14) 71 | n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14) 72 | n.i_emb_tanh_droped_resh = 
L.Reshape(n.img_feature,reshape_param=dict(shape=dict(dim=[-1,2048,14,14]))) 73 | n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh, compact_bilinear_param=dict(num_output=16000,sum_pool=False)) 74 | n.blcf_sign_sqrt = L.SignedSqrt(n.blcf) 75 | n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt) 76 | n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,dropout_param={'dropout_ratio':0.1}) 77 | 78 | # multi-channel attention 79 | n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier')) 80 | n.att_conv1_relu = L.ReLU(n.att_conv1) 81 | n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0, weight_filler=dict(type='xavier')) 82 | n.att_reshaped = L.Reshape(n.att_conv2,reshape_param=dict(shape=dict(dim=[-1,2,14*14]))) 83 | n.att_softmax = L.Softmax(n.att_reshaped, axis=2) 84 | n.att = L.Reshape(n.att_softmax,reshape_param=dict(shape=dict(dim=[-1,2,14,14]))) 85 | att_maps = L.Slice(n.att, ntop=2, slice_param={'axis':1}) 86 | n.att_map0 = att_maps[0] 87 | n.att_map1 = att_maps[1] 88 | dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1) 89 | n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy) 90 | n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy) 91 | n.att_feature0_resh = L.Reshape(n.att_feature0, reshape_param=dict(shape=dict(dim=[-1,2048]))) 92 | n.att_feature1_resh = L.Reshape(n.att_feature1, reshape_param=dict(shape=dict(dim=[-1,2048]))) 93 | n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh) 94 | 95 | # merge attention and lstm with compact bilinear pooling 96 | n.att_feature_resh = L.Reshape(n.att_feature, reshape_param=dict(shape=dict(dim=[-1,4096,1,1]))) 97 | n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1]))) 98 | n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.lstm_12_resh, 99 | compact_bilinear_param=dict(num_output=16000,sum_pool=False)) 100 | n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm) 101 | n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt) 102 | 103 | n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2, dropout_param={'dropout_ratio':0.1}) 104 | n.bc_dropped_resh = L.Reshape(n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000]))) 105 | 106 | n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000, weight_filler=dict(type='xavier')) 107 | n.loss = L.SoftmaxWithLoss(n.prediction, n.label) 108 | return n.to_proto() 109 | 110 | def make_answer_vocab(adic, vocab_size): 111 | """ 112 | Returns a dictionary that maps words to indices. 
113 | """ 114 | adict = {'':0} 115 | nadict = {'':1000000} 116 | vid = 1 117 | for qid in adic.keys(): 118 | answer_obj = adic[qid] 119 | answer_list = [ans['answer'] for ans in answer_obj] 120 | 121 | for q_ans in answer_list: 122 | # create dict 123 | if adict.has_key(q_ans): 124 | nadict[q_ans] += 1 125 | else: 126 | nadict[q_ans] = 1 127 | adict[q_ans] = vid 128 | vid +=1 129 | 130 | # debug 131 | nalist = [] 132 | for k,v in sorted(nadict.items(), key=lambda x:x[1]): 133 | nalist.append((k,v)) 134 | 135 | # remove words that appear less than once 136 | n_del_ans = 0 137 | n_valid_ans = 0 138 | adict_nid = {} 139 | for i, w in enumerate(nalist[:-vocab_size]): 140 | del adict[w[0]] 141 | n_del_ans += w[1] 142 | for i, w in enumerate(nalist[-vocab_size:]): 143 | n_valid_ans += w[1] 144 | adict_nid[w[0]] = i 145 | 146 | return adict_nid 147 | 148 | def make_question_vocab(qdic): 149 | """ 150 | Returns a dictionary that maps words to indices. 151 | """ 152 | vdict = {'':0} 153 | vid = 1 154 | for qid in qdic.keys(): 155 | # sequence to list 156 | q_str = qdic[qid]['qstr'] 157 | q_list = VQADataProvider.seq_to_list(q_str) 158 | 159 | # create dict 160 | for w in q_list: 161 | if not vdict.has_key(w): 162 | vdict[w] = vid 163 | vid +=1 164 | 165 | return vdict 166 | 167 | def make_vocab_files(): 168 | """ 169 | Produce the question and answer vocabulary files. 170 | """ 171 | write_log('making question vocab... ' + config.QUESTION_VOCAB_SPACE, 'log.txt') 172 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE) 173 | question_vocab = make_question_vocab(qdic) 174 | write_log('making answer vocab... ' + config.ANSWER_VOCAB_SPACE, 'log.txt') 175 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE) 176 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS) 177 | return question_vocab, answer_vocab 178 | 179 | def main(): 180 | if not os.path.exists('./result'): 181 | os.makedirs('./result') 182 | 183 | question_vocab, answer_vocab = {}, {} 184 | if os.path.exists('./result/vdict.json') and os.path.exists('./result/adict.json'): 185 | write_log('restoring vocab', 'log.txt') 186 | with open('./result/vdict.json','r') as f: 187 | question_vocab = json.load(f) 188 | with open('./result/adict.json','r') as f: 189 | answer_vocab = json.load(f) 190 | else: 191 | question_vocab, answer_vocab = make_vocab_files() 192 | with open('./result/vdict.json','w') as f: 193 | json.dump(question_vocab, f) 194 | with open('./result/adict.json','w') as f: 195 | json.dump(answer_vocab, f) 196 | 197 | write_log('question vocab size: '+ str(len(question_vocab)), 'log.txt') 198 | write_log('answer vocab size: '+ str(len(answer_vocab)), 'log.txt') 199 | 200 | with open('./result/proto_train.prototxt', 'w') as f: 201 | f.write(str(qlstm(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \ 202 | config.MAX_WORDS_IN_QUESTION, len(question_vocab)))) 203 | 204 | with open('./result/proto_test.prototxt', 'w') as f: 205 | f.write(str(qlstm('val', config.VAL_BATCH_SIZE, \ 206 | config.MAX_WORDS_IN_QUESTION, len(question_vocab)))) 207 | 208 | caffe.set_device(config.GPU_ID) 209 | caffe.set_mode_gpu() 210 | solver = caffe.get_solver('./qlstm_solver.prototxt') 211 | 212 | train_loss = np.zeros(config.MAX_ITERATIONS) 213 | # results = [] 214 | 215 | for it in range(config.MAX_ITERATIONS): 216 | solver.step(1) 217 | 218 | # store the train loss 219 | train_loss[it] = solver.net.blobs['loss'].data 220 | 221 | if it != 0 and it % config.PRINT_INTERVAL == 0: 222 | 
write_log('------------------------------------', 'log.txt') 223 | write_log('Iteration: ' + str(it), 'log.txt') 224 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean() 225 | write_log('Train loss: ' + str(c_mean_loss), 'log.txt') 226 | if it != 0 and it % config.VALIDATE_INTERVAL == 0: # acutually test 227 | solver.test_nets[0].save('./result/tmp.caffemodel') 228 | write_log('Validating...', 'log.txt') 229 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.GPU_ID, 'val', it=it) 230 | write_log('Iteration: ' + str(it), 'log.txt') 231 | write_log('Test loss: ' + str(test_loss), 'log.txt') 232 | write_log('Overall Accuracy: ' + str(acc_overall), 'log.txt') 233 | write_log('Per Question Type Accuracy is the following:', 'log.txt') 234 | for quesType in acc_per_ques: 235 | write_log("%s : %.02f" % (quesType, acc_per_ques[quesType]), 'log.txt') 236 | write_log('Per Answer Type Accuracy is the following:', 'log.txt') 237 | for ansType in acc_per_ans: 238 | write_log("%s : %.02f" % (ansType, acc_per_ans[ansType]), 'log.txt') 239 | # results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans]) 240 | # best_result_idx = np.array([x[3] for x in results]).argmax() 241 | # write_log('Best accuracy of ' + str(results[best_result_idx][3]) + ' was at iteration ' + str(results[best_result_idx][0]), 'log.txt') 242 | # drawgraph(results) 243 | 244 | if __name__ == '__main__': 245 | main() 246 | -------------------------------------------------------------------------------- /LSTM (baseline)/visualize_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_provider_layer import VQADataProvider 16 | from vqa_data_provider_layer import VQADataProviderLayer 17 | 18 | import config 19 | sys.path.append(config.VQA_TOOLS_PATH) 20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 21 | 22 | from vqaTools.vqa import VQA 23 | from vqaEvaluation.vqaEval import VQAEval 24 | 25 | from write_to_log import write_log 26 | 27 | def visualize_failures(stat_list,mode): 28 | 29 | def save_qtype(qtype_list, save_filename, mode): 30 | 31 | if mode == 'val': 32 | savepath = os.path.join('./eval', save_filename) 33 | # TODO 34 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014' 35 | elif mode == 'test-dev': 36 | savepath = os.path.join('./test-dev', save_filename) 37 | # TODO 38 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 39 | elif mode == 'test': 40 | savepath = os.path.join('./test', save_filename) 41 | # TODO 42 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 43 | else: 44 | raise Exception('Unsupported mode') 45 | if os.path.exists(savepath): shutil.rmtree(savepath) 46 | if not os.path.exists(savepath): os.makedirs(savepath) 47 | 48 | for qt in qtype_list: 49 | count = 0 50 | for t_question in stat_list: 51 | #print count, t_question 52 | if count < 40/len(qtype_list): 53 | t_question_list = t_question['q_list'] 54 | saveflag = False 55 | #print 'debug****************************' 56 | #print qt 57 | #print t_question_list 58 | #print t_question_list[0] == qt[0] 59 | #print t_question_list[1] == qt[1] 60 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 61 | saveflag = True 62 
| else: 63 | saveflag = False 64 | 65 | if saveflag == True: 66 | t_iid = t_question['iid'] 67 | if mode == 'val': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 70 | elif mode == 'test-dev' or mode == 'test': 71 | t_img = Image.open(os.path.join(img_pre, \ 72 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 73 | 74 | # for caption 75 | #print t_iid 76 | #annIds = caps.getAnnIds(t_iid) 77 | #anns = caps.loadAnns(annIds) 78 | #cap_list = [ann['caption'] for ann in anns] 79 | ans_list = t_question['ans_list'] 80 | draw = ImageDraw.Draw(t_img) 81 | for i in range(len(ans_list)): 82 | try: 83 | draw.text((10,10*i), str(ans_list[i])) 84 | except: 85 | pass 86 | 87 | ans = t_question['answer'] 88 | pred = t_question['pred'] 89 | if ans == -1: 90 | pre = '' 91 | elif ans == pred: 92 | pre = 'correct ' 93 | else: 94 | pre = 'failure ' 95 | #print ' aaa ', ans, pred 96 | ans = re.sub( '/', ' ', str(ans)) 97 | pred = re.sub( '/', ' ', str(pred)) 98 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \ 99 | str(ans) + ' p_' + str(pred) + '.png' 100 | count += 1 101 | write_log(os.path.join(savepath,img_title), 'visualize_log.txt') 102 | t_img.save(os.path.join(savepath,img_title)) 103 | 104 | write_log('saving colors', 'visualize_log.txt') 105 | qt_color_list = [['what','color']] 106 | save_qtype(qt_color_list, 'colors', mode) 107 | 108 | write_log('saving what is', 'visualize_log.txt') 109 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 110 | save_qtype(qt_whatis_list, 'whatis', mode) 111 | 112 | write_log('saving is', 'visualize_log.txt') 113 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 114 | save_qtype(qt_is_list, 'is', mode) 115 | 116 | write_log('saving how many', 'visualize_log.txt') 117 | qt_howmany_list =[['how','many']] 118 | save_qtype(qt_howmany_list, 'howmany', mode) 119 | 120 | def exec_validation(device_id, mode, it='', visualize=False): 121 | 122 | caffe.set_device(device_id) 123 | caffe.set_mode_gpu() 124 | net = caffe.Net('./result/proto_test.prototxt',\ 125 | './result/tmp.caffemodel',\ 126 | caffe.TEST) 127 | 128 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE) 129 | total_questions = len(dp.getQuesIds()) 130 | epoch = 0 131 | 132 | pred_list = [] 133 | testloss_list = [] 134 | stat_list = [] 135 | 136 | while epoch == 0: 137 | # t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 138 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 139 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0)) 140 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0)) 141 | net.blobs['img_feature'].data[...] = t_img_feature 142 | net.blobs['label'].data[...] = t_answer 143 | # net.blobs['glove'].data[...] 
= np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 146 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 147 | testloss_list.append(net.blobs['loss'].data) 148 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 149 | pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 150 | if visualize: 151 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 152 | if mode == 'test-dev' or mode == 'test': 153 | ans_str = '' 154 | ans_list = ['']*10 155 | else: 156 | ans_str = dp.vec_to_answer(ans) 157 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 158 | stat_list.append({\ 159 | 'qid' : qid, 160 | 'q_list' : q_list, 161 | 'iid' : iid, 162 | 'answer': ans_str, 163 | 'ans_list': ans_list, 164 | 'pred' : pred }) 165 | #percent = 100 * float(len(pred_list)) / total_questions 166 | #sys.stdout.write('\r' + ('%.2f' % percent) + '%') 167 | #sys.stdout.flush() 168 | 169 | 170 | 171 | mean_testloss = np.array(testloss_list).mean() 172 | 173 | if mode == 'val': 174 | valFile = './result/val2014_resfile' 175 | with open(valFile, 'w') as f: 176 | json.dump(pred_list, f) 177 | if visualize: 178 | visualize_failures(stat_list,mode) 179 | annFile = config.DATA_PATHS['val']['ans_file'] 180 | quesFile = config.DATA_PATHS['val']['ques_file'] 181 | vqa = VQA(annFile, quesFile) 182 | vqaRes = vqa.loadRes(valFile, quesFile) 183 | vqaEval = VQAEval(vqa, vqaRes, n=2) 184 | vqaEval.evaluate() 185 | acc_overall = vqaEval.accuracy['overall'] 186 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 187 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 188 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 189 | elif mode == 'test-dev': 190 | filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results' 191 | with open(filename+'.json', 'w') as f: 192 | json.dump(pred_list, f) 193 | if visualize: 194 | visualize_failures(stat_list,mode) 195 | elif mode == 'test': 196 | filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(pred_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | 202 | def drawgraph(results, save_question_type_graphs=False): 203 | # 0:it 204 | # 1:trainloss 205 | # 2:testloss 206 | # 3:oa_acc 207 | # 4:qt_acc 208 | # 5:at_acc 209 | 210 | # training curve 211 | it = np.array([l[0] for l in results]) 212 | loss = np.array([l[1] for l in results]) 213 | valloss = np.array([l[2] for l in results]) 214 | valacc = np.array([l[3] for l in results]) 215 | 216 | fig = plt.figure() 217 | ax1 = fig.add_subplot(111) 218 | ax2 = ax1.twinx() 219 | 220 | ax1.plot(it,loss, color='blue', label='train loss') 221 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 222 | ax2.plot(it,valacc, color='red', label='acc on val') 223 | plt.legend(loc='lower left') 224 | 225 | ax1.set_xlabel('Iterations') 226 | ax1.set_ylabel('Loss Value') 227 | ax2.set_ylabel('Accuracy on Val [%]') 228 | 229 | plt.savefig('./learning_curve max_%2.2f.png'%valacc.max()) 230 | plt.clf() 231 | plt.close("all") 232 | 233 | # question type 234 | it = np.array([l[0] for l in results]) 235 | oa_acc = np.array([l[3] for l in results]) 236 | qt_dic_list = [l[4] for l in results] 237 | 238 | def draw_qt_acc(target_key_list, figname): 239 | fig = plt.figure() 240 | for k in target_key_list: 241 | write_log(str(k) + 
str(type(k)), 'visualize_log.txt') 242 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 243 | plt.plot(it,t_val,label=str(k)) 244 | plt.legend(fontsize='small') 245 | plt.ylim(0,100.) 246 | #plt.legend(prop={'size':6}) 247 | 248 | plt.xlabel('Iterations') 249 | plt.ylabel('Accuracy on Val [%]') 250 | 251 | plt.savefig(figname,dpi=200) 252 | plt.clf() 253 | plt.close("all") 254 | 255 | if save_question_type_graphs: 256 | s_keys = sorted(qt_dic_list[0].keys()) 257 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 258 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 259 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 260 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 261 | draw_qt_acc(['what color is the','what color are the','what color is',\ 262 | 'what color','what is the color of the'],'./qt_color.png') 263 | draw_qt_acc(['how many','how','how many people are',\ 264 | 'how many people are in'],'./qt_number.png') 265 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 266 | 'which'],'./qt_who_why_where_which.png') 267 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 268 | 'is the woman','is this person','what is the woman','is the person',\ 269 | 'what is the person'],'./qt_human.png') 270 | 271 | 272 | -------------------------------------------------------------------------------- /LSTM (baseline)/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning Convolutional Text Representations for Visual Question Answering 2 | 3 | This is the code for our SDM18 paper [Learning Convolutional Text Representations for Visual Question Answering](https://epubs.siam.org/doi/abs/10.1137/1.9781611975321.67). We used it to explore different text representation methods in VQA. The reference code is [vqa-mcb](https://github.com/akirafukui/vqa-mcb). 4 | 5 | Created by [Zhengyang Wang](http://people.tamu.edu/~zhengyang.wang/) and [Shuiwang Ji](http://people.tamu.edu/~sji/index.html) at Texas A&M University. 6 | 7 | ## Citation 8 | If you wish to cite our work, you can use the following bib for now. 9 | 10 | ``` 11 | @inproceedings{wang2018learning, 12 | title={Learning Convolutional Text Representations for Visual Question Answering}, 13 | author={Wang, Zhengyang and Ji, Shuiwang}, 14 | booktitle={Proceedings of the 2018 SIAM International Conference on Data Mining}, 15 | pages={594--602}, 16 | year={2018}, 17 | organization={SIAM} 18 | } 19 | ``` 20 | 21 | ## Instructions 22 | 23 | To replicate our results, do the following prerequisites as in [vqa-mcb](https://github.com/akirafukui/vqa-mcb): 24 | 25 | - Compile the `feature/20160617_cb_softattention` branch of [this fork of Caffe](https://github.com/akirafukui/caffe/). This branch contains Yang Gao’s Compact Bilinear layers ([dedicated repo](https://github.com/gy20073/compact_bilinear_pooling), [paper](https://arxiv.org/abs/1511.06062)) released under the [BDD license](https://github.com/gy20073/compact_bilinear_pooling/blob/master/caffe-20160312/LICENSE_BDD), and Ronghang Hu’s Soft Attention layers ([paper](https://arxiv.org/abs/1511.03745)) released under BSD 2-clause. 
26 | - Download the [pre-trained ResNet-152 model](https://github.com/KaimingHe/deep-residual-networks). 27 | - Download the [VQA tools](https://github.com/VT-vision-lab/VQA). 28 | - Download the [VQA real-image dataset](http://visualqa.org/download.html). 29 | - Do the [data preprocessing](https://github.com/akirafukui/vqa-mcb/tree/master/preprocess). 30 | 31 | **Note:** As explained in our paper, we did not use any additional data such as "GloVe" and "Visual Genome". 32 | 33 | To train and test a model, edit the corresponding `config.py` and `qlstm_solver.prototxt` files. 34 | 35 | **Note:** Unlike [vqa-mcb](https://github.com/akirafukui/vqa-mcb), in our experiments, different methods require different data provider layers. Use the `vqa_data_provider_layer.py` and `visualize_tools.py` in the same folder as the training script. 36 | 37 | In `config.py`, set `GPU_ID` and `VALIDATE_INTERVAL` (the number of training iterations) properly. 38 | 39 | **Note:** As stated in our paper, we trained only on the training set and tested on the validation set. The code has been modified to do training and testing automatically if you set `VALIDATE_INTERVAL` to the number of iterations for training. The pre-set number is the one we used in our results. In our experiments, we split the original training set into a new training set and a validation set, and used early stopping to determine this number. Then we used this code to train our model on all of the training data. 40 | 41 | In `qlstm_solver.prototxt`, set `snapshot` and `snapshot_prefix` correctly. 42 | 43 | Now just run `python train_att_bc.py` in the chosen model's folder. Training can take some time. Snapshots are saved according to the settings in `qlstm_solver.prototxt`. To stop training, just hit `Control + C`. 44 | -------------------------------------------------------------------------------- /fastText (char+word)/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 9 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 6 | LENGTH_OF_LONGEST_WORD = 17 7 | #MAX_CHARS_IN_QUESTION = 100 8 | MAX_ITERATIONS = 1000000 9 | PRINT_INTERVAL = 1000 10 | VALIDATE_INTERVAL = 30000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy. 
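# Hedged sketch of what the two question-shape constants above suggest for
# this char+word model; the exact blob shapes come from this folder's
# vqa_data_provider_layer.py and are illustrative assumptions here, not read
# from that file:
#
#   word ids : (BATCH_SIZE, MAX_WORDS_IN_QUESTION)             = (32, 22)
#   char ids : (BATCH_SIZE, MAX_WORDS_IN_QUESTION,
#               LENGTH_OF_LONGEST_WORD)                        = (32, 22, 17)
#
# i.e. each question is padded/cropped to 22 word slots, and each word to 17
# characters, before the embedding layers.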
11 | 
12 | # what data to use for training
13 | TRAIN_DATA_SPLITS = 'train'
14 | 
15 | # what data to use for the vocabulary
16 | QUESTION_VOCAB_SPACE = 'train'
17 | ANSWER_VOCAB_SPACE = 'train'
18 | 
19 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
20 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools'
21 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools'
22 | 
23 | # location of the data
24 | VQA_PREFIX = '/tempspace/zwang6/VQA/'
25 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/'
26 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/'
27 | 
28 | DATA_PATHS = {
29 |     'train': {
30 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
31 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
32 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_'
33 |     },
34 |     'val': {
35 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
36 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
37 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_'
38 |     },
39 |     'test-dev': {
40 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
41 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
42 |     },
43 |     'test': {
44 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
45 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
46 |     },
47 |     # TODO it would be nice if genome also followed the same file format as vqa
48 |     'genome': {
49 |         'genome_file': GENOME_PREFIX + '/question_answers_prepro.json',
50 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/'
51 |     }
52 | }
53 | 
--------------------------------------------------------------------------------
/fastText (char+word)/qlstm_solver.prototxt:
--------------------------------------------------------------------------------
1 | # The train/test net protocol buffer definition
2 | train_net: "./result/proto_train.prototxt"
3 | test_net: "./result/proto_test.prototxt"
4 | 
5 | max_iter: 1000000
6 | display: 5000
7 | snapshot: 5000
8 | snapshot_prefix: "./result/"
9 | 
10 | # The base learning rate, momentum and the weight decay of the network.
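# Note on the settings below (standard Caffe solver semantics; this comment is not in
# the original file): with solver_type ADAM, "momentum" and "momentum2" are Adam's
# beta1 and beta2, and "iter_size: 2" accumulates gradients over two mini-batches
# before each update, so the effective batch size is BATCH_SIZE (32 in config.py) x 2 = 64.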
11 | solver_type: ADAM
12 | base_lr: 0.0007
13 | momentum: 0.9
14 | momentum2: 0.999
15 | weight_decay: 0.000
16 | lr_policy: "fixed"
17 | test_iter: 1
18 | test_interval: 10000000
19 | 
20 | # accumulate gradients
21 | iter_size: 2
22 | 
--------------------------------------------------------------------------------
/fastText (char+word)/visualize_tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import os
4 | import sys
5 | import json
6 | import re
7 | import shutil
8 | from PIL import Image
9 | from PIL import ImageFont, ImageDraw
10 | 
11 | import caffe
12 | from caffe import layers as L
13 | from caffe import params as P
14 | 
15 | from vqa_data_provider_layer import VQADataProvider
16 | from vqa_data_provider_layer import VQADataProviderLayer
17 | 
18 | import config
19 | sys.path.append(config.VQA_TOOLS_PATH)
20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH)
21 | 
22 | from vqaTools.vqa import VQA
23 | from vqaEvaluation.vqaEval import VQAEval
24 | 
25 | from write_to_log import write_log
26 | 
27 | def visualize_failures(stat_list,mode):
28 | 
29 |     def save_qtype(qtype_list, save_filename, mode):
30 | 
31 |         if mode == 'val':
32 |             savepath = os.path.join('./eval', save_filename)
33 |             # TODO
34 |             img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014'
35 |         elif mode == 'test-dev':
36 |             savepath = os.path.join('./test-dev', save_filename)
37 |             # TODO
38 |             img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
39 |         elif mode == 'test':
40 |             savepath = os.path.join('./test', save_filename)
41 |             # TODO
42 |             img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
43 |         else:
44 |             raise Exception('Unsupported mode')
45 |         if os.path.exists(savepath): shutil.rmtree(savepath)
46 |         if not os.path.exists(savepath): os.makedirs(savepath)
47 | 
48 |         for qt in qtype_list:
49 |             count = 0
50 |             for t_question in stat_list:
51 |                 #print count, t_question
52 |                 if count < 40/len(qtype_list):
53 |                     t_question_list = t_question['q_list']
54 |                     saveflag = False
55 |                     #print 'debug****************************'
56 |                     #print qt
57 |                     #print t_question_list
58 |                     #print t_question_list[0] == qt[0]
59 |                     #print t_question_list[1] == qt[1]
60 |                     if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
61 |                         saveflag = True
62 |                     else:
63 |                         saveflag = False
64 | 
65 |                     if saveflag == True:
66 |                         t_iid = t_question['iid']
67 |                         if mode == 'val':
68 |                             t_img = Image.open(os.path.join(img_pre, \
69 |                                 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
70 |                         elif mode in ('test-dev', 'test'):
71 |                             t_img = Image.open(os.path.join(img_pre, \
72 |                                 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))
73 | 
74 |                         # for caption
75 |                         #print t_iid
76 |                         #annIds = caps.getAnnIds(t_iid)
77 |                         #anns = caps.loadAnns(annIds)
78 |                         #cap_list = [ann['caption'] for ann in anns]
79 |                         ans_list = t_question['ans_list']
80 |                         draw = ImageDraw.Draw(t_img)
81 |                         for i in range(len(ans_list)):
82 |                             try:
83 |                                 draw.text((10,10*i), str(ans_list[i]))
84 |                             except Exception:  # skip answers the default font cannot render
85 |                                 pass
86 | 
87 |                         ans = t_question['answer']
88 |                         pred = t_question['pred']
89 |                         if ans == -1:
90 |                             pre = ''
91 |                         elif ans == pred:
92 |                             pre = 'correct '
93 |                         else:
94 |                             pre = 'failure '
95 |                         #print ' aaa ', ans, pred
96 |                         ans = re.sub( '/', ' ', str(ans))
97 |                         pred = re.sub( '/', ' ', str(pred))
98 |                         img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
99 |                             str(ans) + ' p_' + str(pred) + '.png'
100 |                         count += 1
101 |                         write_log(os.path.join(savepath,img_title), 'visualize_log.txt')
102 |                         t_img.save(os.path.join(savepath,img_title))
103 | 
104 |     write_log('saving colors', 'visualize_log.txt')
105 |     qt_color_list = [['what','color']]
106 |     save_qtype(qt_color_list, 'colors', mode)
107 | 
108 |     write_log('saving what is', 'visualize_log.txt')
109 |     qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
110 |     save_qtype(qt_whatis_list, 'whatis', mode)
111 | 
112 |     write_log('saving is', 'visualize_log.txt')
113 |     qt_is_list = [['is','the'], ['is','this'],['is','there']]
114 |     save_qtype(qt_is_list, 'is', mode)
115 | 
116 |     write_log('saving how many', 'visualize_log.txt')
117 |     qt_howmany_list =[['how','many']]
118 |     save_qtype(qt_howmany_list, 'howmany', mode)
119 | 
120 | def exec_validation(device_id, mode, it='', visualize=False):
121 | 
122 |     caffe.set_device(device_id)
123 |     caffe.set_mode_gpu()
124 |     net = caffe.Net('./result/proto_test.prototxt',\
125 |                     './result/tmp.caffemodel',\
126 |                     caffe.TEST)
127 | 
128 |     dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE)
129 |     total_questions = len(dp.getQuesIds())
130 |     epoch = 0
131 | 
132 |     pred_list = []
133 |     testloss_list = []
134 |     stat_list = []
135 | 
136 |     while epoch == 0:
137 |         t_word, t_cont, t_word_c, t_cont_c, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
138 |         net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0))
139 |         net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0))
140 |         net.blobs['data1'].data[...] = t_word_c
141 |         net.blobs['cont1'].data[...] = t_cont_c
142 |         net.blobs['img_feature'].data[...] = t_img_feature
143 |         net.blobs['label'].data[...] = t_answer
144 |         #net.blobs['glove'].data[...] = t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2))
145 |         net.forward()
146 |         t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
147 |         t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
148 |         testloss_list.append(net.blobs['loss'].data)
149 |         for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
150 |             pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))})
151 |             if visualize:
152 |                 q_list = dp.seq_to_list(dp.getQuesStr(qid))
153 |                 if mode in ('test-dev', 'test'):
154 |                     ans_str = ''
155 |                     ans_list = ['']*10
156 |                 else:
157 |                     ans_str = dp.vec_to_answer(ans)
158 |                     ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
159 |                 stat_list.append({\
160 |                     'qid' : qid,
161 |                     'q_list' : q_list,
162 |                     'iid' : iid,
163 |                     'answer': ans_str,
164 |                     'ans_list': ans_list,
165 |                     'pred' : pred })
166 |         #percent = 100 * float(len(pred_list)) / total_questions
167 |         #sys.stdout.write('\r' + ('%.2f' % percent) + '%')
168 |         #sys.stdout.flush()
169 | 
170 | 
171 | 
172 |     mean_testloss = np.array(testloss_list).mean()
173 | 
174 |     if mode == 'val':
175 |         valFile = './result/val2014_resfile'
176 |         with open(valFile, 'w') as f:
177 |             json.dump(pred_list, f)
178 |         if visualize:
179 |             visualize_failures(stat_list,mode)
180 |         annFile = config.DATA_PATHS['val']['ans_file']
181 |         quesFile = config.DATA_PATHS['val']['ques_file']
182 |         vqa = VQA(annFile, quesFile)
183 |         vqaRes = vqa.loadRes(valFile, quesFile)
184 |         vqaEval = VQAEval(vqa, vqaRes, n=2)
185 |         vqaEval.evaluate()
186 |         acc_overall = vqaEval.accuracy['overall']
187 |         acc_perQuestionType = vqaEval.accuracy['perQuestionType']
188 |         acc_perAnswerType = vqaEval.accuracy['perAnswerType']
189 |         return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
190 |     elif mode == 'test-dev':
191 |         filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results'
192 |         with open(filename+'.json', 'w') as f:
193 |             json.dump(pred_list, f)
194 |         if visualize:
195 |             visualize_failures(stat_list,mode)
196 |     elif mode == 'test':
197 |         filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results'
198 |         with open(filename+'.json', 'w') as f:
199 |             json.dump(pred_list, f)
200 |         if visualize:
201 |             visualize_failures(stat_list,mode)
202 | 
203 | def drawgraph(results, save_question_type_graphs=False):
204 |     # 0:it
205 |     # 1:trainloss
206 |     # 2:testloss
207 |     # 3:oa_acc
208 |     # 4:qt_acc
209 |     # 5:at_acc
210 | 
211 |     # training curve
212 |     it = np.array([l[0] for l in results])
213 |     loss = np.array([l[1] for l in results])
214 |     valloss = np.array([l[2] for l in results])
215 |     valacc = np.array([l[3] for l in results])
216 | 
217 |     fig = plt.figure()
218 |     ax1 = fig.add_subplot(111)
219 |     ax2 = ax1.twinx()
220 | 
221 |     ax1.plot(it,loss, color='blue', label='train loss')
222 |     ax1.plot(it,valloss, '--', color='blue', label='test loss')
223 |     ax2.plot(it,valacc, color='red', label='acc on val')
224 |     plt.legend(loc='lower left')
225 | 
226 |     ax1.set_xlabel('Iterations')
227 |     ax1.set_ylabel('Loss Value')
228 |     ax2.set_ylabel('Accuracy on Val [%]')
229 | 
230 |     plt.savefig('./learning_curve max_%2.2f.png'%valacc.max())
231 |     plt.clf()
232 |     plt.close("all")
233 | 
234 |     # question type
235 |     it = np.array([l[0] for l in results])
236 |     oa_acc = np.array([l[3] for l in results])
237 |     qt_dic_list = [l[4] for l in results]
238 | 
239 |     def draw_qt_acc(target_key_list, figname):
240 |         fig = plt.figure()
241 |         for k in target_key_list:
242 |             write_log(str(k) + str(type(k)), 'visualize_log.txt')
243 |             t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list])
244 |             plt.plot(it,t_val,label=str(k))
245 |         plt.legend(fontsize='small')
246 |         plt.ylim(0,100.)
247 |         #plt.legend(prop={'size':6})
248 | 
249 |         plt.xlabel('Iterations')
250 |         plt.ylabel('Accuracy on Val [%]')
251 | 
252 |         plt.savefig(figname,dpi=200)
253 |         plt.clf()
254 |         plt.close("all")
255 | 
256 |     if save_question_type_graphs:
257 |         s_keys = sorted(qt_dic_list[0].keys())
258 |         draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png')
259 |         draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
260 |         draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
261 |         draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
262 |         draw_qt_acc(['what color is the','what color are the','what color is',\
263 |                 'what color','what is the color of the'],'./qt_color.png')
264 |         draw_qt_acc(['how many','how','how many people are',\
265 |                 'how many people are in'],'./qt_number.png')
266 |         draw_qt_acc(['who is','why','why is the','where is the','where are the',\
267 |                 'which'],'./qt_who_why_where_which.png')
268 |         draw_qt_acc(['what is the man','is the man','are they','is he',\
269 |                 'is the woman','is this person','what is the woman','is the person',\
270 |                 'what is the person'],'./qt_human.png')
271 | 
272 | 
273 | 
--------------------------------------------------------------------------------
/fastText (char+word)/write_to_log.py:
--------------------------------------------------------------------------------
1 | def write_log(msg, filename):
2 |     with open(filename, 'a') as f:
3 |         f.write(msg + "\n")
4 | 
--------------------------------------------------------------------------------
/fastText (word)/config.py:
--------------------------------------------------------------------------------
1 | GPU_ID = 9
2 | BATCH_SIZE = 32
3 | VAL_BATCH_SIZE = 32
4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop
6 | EMBEDDING_SIZE = 300
7 | MAX_ITERATIONS = 1000000
8 | PRINT_INTERVAL = 1000
9 | VALIDATE_INTERVAL = 40000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy.
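# Note (not part of the original file): each model folder pre-sets its own
# VALIDATE_INTERVAL (e.g. 30000 for fastText (char+word) above vs. 40000 here);
# per the README, these are the training-iteration counts the authors chose by
# early stopping on a held-out split of the training set.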
10 | 
11 | # what data to use for training
12 | TRAIN_DATA_SPLITS = 'train'
13 | 
14 | # what data to use for the vocabulary
15 | QUESTION_VOCAB_SPACE = 'train'
16 | ANSWER_VOCAB_SPACE = 'train'
17 | 
18 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
19 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools'
20 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools'
21 | 
22 | # location of the data
23 | VQA_PREFIX = '/tempspace/zwang6/VQA/'
24 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/'
25 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/'
26 | 
27 | DATA_PATHS = {
28 |     'train': {
29 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
30 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
31 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_'
32 |     },
33 |     'val': {
34 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
35 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
36 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_'
37 |     },
38 |     'test-dev': {
39 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
40 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
41 |     },
42 |     'test': {
43 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
44 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
45 |     },
46 |     # TODO it would be nice if genome also followed the same file format as vqa
47 |     'genome': {
48 |         'genome_file': GENOME_PREFIX + '/question_answers_prepro.json',
49 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/'
50 |     }
51 | }
52 | 
--------------------------------------------------------------------------------
/fastText (word)/qlstm_solver.prototxt:
--------------------------------------------------------------------------------
1 | # The train/test net protocol buffer definition
2 | train_net: "./result/proto_train.prototxt"
3 | test_net: "./result/proto_test.prototxt"
4 | 
5 | max_iter: 1000000
6 | display: 5000
7 | snapshot: 5000
8 | snapshot_prefix: "./result/"
9 | 
10 | # The base learning rate, momentum and the weight decay of the network.
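# Note on this solver (standard Caffe semantics; this comment is not in the original
# file): "average_loss: 64" at the end of the file makes the displayed loss a running
# average over the last 64 iterations, which smooths the logged training curve, and
# test_interval is set far beyond max_iter so Caffe's built-in testing effectively
# never runs -- validation is driven by VALIDATE_INTERVAL in config.py instead.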
11 | solver_type: ADAM
12 | base_lr: 0.0007
13 | momentum: 0.9
14 | momentum2: 0.999
15 | weight_decay: 0.000
16 | lr_policy: "fixed"
17 | test_iter: 1
18 | test_interval: 10000000
19 | 
20 | # accumulate gradients
21 | iter_size: 2
22 | 
23 | average_loss: 64
--------------------------------------------------------------------------------
/fastText (word)/train_att_bc.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import os
4 | import sys
5 | import numpy as np
6 | import json
7 | import matplotlib.pyplot as plt
8 | from write_to_log import write_log
9 | 
10 | import caffe
11 | from caffe import layers as L
12 | from caffe import params as P
13 | 
14 | from vqa_data_provider_layer import VQADataProvider
15 | from visualize_tools import exec_validation, drawgraph
16 | import config
17 | 
18 | 
19 | def qlstm(mode, batchsize, T, question_vocab_size, embed_size):
20 |     n = caffe.NetSpec()
21 |     mode_str = json.dumps({'mode':mode, 'batchsize':batchsize})
22 |     n.data, n.cont, n.img_feature, n.label = L.Python(\
23 |         module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=4 )
24 | 
25 |     # word embedding
26 |     n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=embed_size, \
27 |         weight_filler=dict(type='uniform',min=-0.08,max=0.08))
28 |     n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(axis=0)) # N x T x d_w; zeroes padded positions
29 |     n.embed_scale_resh = L.Reshape(n.embed_scale,reshape_param=dict(shape=dict(dim=[batchsize,T,embed_size,1])))
30 | 
31 |     # sum of word embeddings (bag-of-words question encoding): the frozen all-ones 1x1 conv sums over the T word channels
32 |     n.embed_avg = L.Convolution(n.embed_scale_resh, convolution_param={'kernel_size': 1, 'num_output': 1, 'bias_term': False, 'weight_filler': dict(type='constant', value=1)}, param=dict(lr_mult=0, decay_mult=0)) # N x 1 x d_w x 1
33 |     n.embed_avg_resh = L.Reshape(n.embed_avg,reshape_param=dict(shape=dict(dim=[batchsize,embed_size,1,1])))
34 | 
35 |     n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.embed_avg_resh, axis=2, tiles=14)
36 |     n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14)
37 |     n.i_emb_tanh_droped_resh = L.Reshape(n.img_feature,reshape_param=dict(shape=dict(dim=[-1,2048,14,14])))
38 |     n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh, compact_bilinear_param=dict(num_output=16000,sum_pool=False))
39 |     n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
40 |     n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
41 |     n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,dropout_param={'dropout_ratio':0.1})
42 | 
43 |     # multi-channel attention
44 |     n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier'))
45 |     n.att_conv1_relu = L.ReLU(n.att_conv1)
46 |     n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0, weight_filler=dict(type='xavier'))
47 |     n.att_reshaped = L.Reshape(n.att_conv2,reshape_param=dict(shape=dict(dim=[-1,2,14*14])))
48 |     n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
49 |     n.att = L.Reshape(n.att_softmax,reshape_param=dict(shape=dict(dim=[-1,2,14,14])))
50 |     att_maps = L.Slice(n.att, ntop=2, slice_param={'axis':1})
51 |     n.att_map0 = att_maps[0]
52 |     n.att_map1 = att_maps[1]
53 |     dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
54 |     n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy)
55 |     n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy)
56 |     n.att_feature0_resh = L.Reshape(n.att_feature0, reshape_param=dict(shape=dict(dim=[-1,2048])))
57 |     n.att_feature1_resh = L.Reshape(n.att_feature1, reshape_param=dict(shape=dict(dim=[-1,2048])))
58 |     n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)
59 | 
60 |     # merge attention features and the question encoding with compact bilinear pooling
61 |     n.att_feature_resh = L.Reshape(n.att_feature, reshape_param=dict(shape=dict(dim=[-1,4096,1,1])))
62 |     #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
63 |     n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.embed_avg_resh,
64 |                                       compact_bilinear_param=dict(num_output=16000,sum_pool=False))
65 |     n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
66 |     n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
67 | 
68 |     n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2, dropout_param={'dropout_ratio':0.1})
69 |     n.bc_dropped_resh = L.Reshape(n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))
70 | 
71 |     n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000, weight_filler=dict(type='xavier'))
72 |     n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
73 |     return n.to_proto()
74 | 
75 | def make_answer_vocab(adic, vocab_size):
76 |     """
77 |     Returns a dictionary that maps answers to indices.
78 |     """
79 |     adict = {'':0}
80 |     nadict = {'':1000000}
81 |     vid = 1
82 |     for qid in adic.keys():
83 |         answer_obj = adic[qid]
84 |         answer_list = [ans['answer'] for ans in answer_obj]
85 | 
86 |         for q_ans in answer_list:
87 |             # create dict
88 |             if adict.has_key(q_ans):
89 |                 nadict[q_ans] += 1
90 |             else:
91 |                 nadict[q_ans] = 1
92 |                 adict[q_ans] = vid
93 |                 vid +=1
94 | 
95 |     # debug
96 |     nalist = []
97 |     for k,v in sorted(nadict.items(), key=lambda x:x[1]):
98 |         nalist.append((k,v))
99 | 
100 |     # keep only the vocab_size most frequent answers and drop the rest
101 |     n_del_ans = 0
102 |     n_valid_ans = 0
103 |     adict_nid = {}
104 |     for i, w in enumerate(nalist[:-vocab_size]):
105 |         del adict[w[0]]
106 |         n_del_ans += w[1]
107 |     for i, w in enumerate(nalist[-vocab_size:]):
108 |         n_valid_ans += w[1]
109 |         adict_nid[w[0]] = i
110 | 
111 |     return adict_nid
112 | 
113 | def make_question_vocab(qdic):
114 |     """
115 |     Returns a dictionary that maps words to indices.
116 |     """
117 |     vdict = {'':0}
118 |     vid = 1
119 |     for qid in qdic.keys():
120 |         # sequence to list
121 |         q_str = qdic[qid]['qstr']
122 |         q_list = VQADataProvider.seq_to_list(q_str)
123 | 
124 |         # create dict
125 |         for w in q_list:
126 |             if not vdict.has_key(w):
127 |                 vdict[w] = vid
128 |                 vid +=1
129 | 
130 |     return vdict
131 | 
132 | def make_vocab_files():
133 |     """
134 |     Produce the question and answer vocabulary files.
135 |     """
136 |     write_log('making question vocab... ' + config.QUESTION_VOCAB_SPACE, 'log.txt')
137 |     qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE)
138 |     question_vocab = make_question_vocab(qdic)
139 |     write_log('making answer vocab... ' + config.ANSWER_VOCAB_SPACE, 'log.txt')
140 |     _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE)
141 |     answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS)
142 |     return question_vocab, answer_vocab
143 | 
144 | def main():
145 |     if not os.path.exists('./result'):
146 |         os.makedirs('./result')
147 | 
148 |     question_vocab, answer_vocab = {}, {}
149 |     if os.path.exists('./result/vdict.json') and os.path.exists('./result/adict.json'):
150 |         write_log('restoring vocab', 'log.txt')
151 |         with open('./result/vdict.json','r') as f:
152 |             question_vocab = json.load(f)
153 |         with open('./result/adict.json','r') as f:
154 |             answer_vocab = json.load(f)
155 |     else:
156 |         question_vocab, answer_vocab = make_vocab_files()
157 |         with open('./result/vdict.json','w') as f:
158 |             json.dump(question_vocab, f)
159 |         with open('./result/adict.json','w') as f:
160 |             json.dump(answer_vocab, f)
161 | 
162 |     write_log('question vocab size: '+ str(len(question_vocab)), 'log.txt')
163 |     write_log('answer vocab size: '+ str(len(answer_vocab)), 'log.txt')
164 | 
165 |     with open('./result/proto_train.prototxt', 'w') as f:
166 |         f.write(str(qlstm(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \
167 |             config.MAX_WORDS_IN_QUESTION, len(question_vocab), config.EMBEDDING_SIZE)))
168 | 
169 |     with open('./result/proto_test.prototxt', 'w') as f:
170 |         f.write(str(qlstm('val', config.VAL_BATCH_SIZE, \
171 |             config.MAX_WORDS_IN_QUESTION, len(question_vocab), config.EMBEDDING_SIZE)))
172 | 
173 |     caffe.set_device(config.GPU_ID)
174 |     caffe.set_mode_gpu()
175 |     solver = caffe.get_solver('./qlstm_solver.prototxt')
176 | 
177 |     train_loss = np.zeros(config.MAX_ITERATIONS)
178 |     # results = []
179 | 
180 |     for it in range(config.MAX_ITERATIONS):
181 |         solver.step(1)
182 | 
183 |         # store the train loss
184 |         train_loss[it] = solver.net.blobs['loss'].data
185 | 
186 |         if it != 0 and it % config.PRINT_INTERVAL == 0:
187 |             write_log('------------------------------------', 'log.txt')
188 |             write_log('Iteration: ' + str(it), 'log.txt')
189 |             c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean()
190 |             write_log('Train loss: ' + str(c_mean_loss), 'log.txt')
191 |         if it != 0 and it % config.VALIDATE_INTERVAL == 0: # actually test
192 |             solver.test_nets[0].save('./result/tmp.caffemodel')
193 |             write_log('Validating...', 'log.txt')
194 |             test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.GPU_ID, 'val', it=it)
195 |             write_log('Iteration: ' + str(it), 'log.txt')
196 |             write_log('Test loss: ' + str(test_loss), 'log.txt')
197 |             write_log('Overall Accuracy: ' + str(acc_overall), 'log.txt')
198 |             write_log('Per Question Type Accuracy is the following:', 'log.txt')
199 |             for quesType in acc_per_ques:
200 |                 write_log("%s : %.02f" % (quesType, acc_per_ques[quesType]), 'log.txt')
201 |             write_log('Per Answer Type Accuracy is the following:', 'log.txt')
202 |             for ansType in acc_per_ans:
203 |                 write_log("%s : %.02f" % (ansType, acc_per_ans[ansType]), 'log.txt')
204 |             # results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans])
205 |             # best_result_idx = np.array([x[3] for x in results]).argmax()
206 |             # write_log('Best accuracy of ' + str(results[best_result_idx][3]) + ' was at iteration ' + str(results[best_result_idx][0]), 'log.txt')
207 |             # drawgraph(results)
208 | 
209 | if __name__ == '__main__':
210 |     main()
211 | 
--------------------------------------------------------------------------------
/fastText (word)/visualize_tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import os
4 | import sys
5 | import json
6 | import re
7 | import shutil
8 | from PIL import Image
9 | from PIL import ImageFont, ImageDraw
10 | 
11 | import caffe
12 | from caffe import layers as L
13 | from caffe import params as P
14 | 
15 | from vqa_data_provider_layer import VQADataProvider
16 | from vqa_data_provider_layer import VQADataProviderLayer
17 | 
18 | import config
19 | sys.path.append(config.VQA_TOOLS_PATH)
20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH)
21 | 
22 | from vqaTools.vqa import VQA
23 | from vqaEvaluation.vqaEval import VQAEval
24 | 
25 | from write_to_log import write_log
26 | 
27 | def visualize_failures(stat_list,mode):
28 | 
29 |     def save_qtype(qtype_list, save_filename, mode):
30 | 
31 |         if mode == 'val':
32 |             savepath = os.path.join('./eval', save_filename)
33 |             # TODO
34 |             img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014'
35 |         elif mode == 'test-dev':
36 |             savepath = os.path.join('./test-dev', save_filename)
37 |             # TODO
38 |             img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
39 |         elif mode == 'test':
40 |             savepath = os.path.join('./test', save_filename)
41 |             # TODO
42 |             img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
43 |         else:
44 |             raise Exception('Unsupported mode')
45 |         if os.path.exists(savepath): shutil.rmtree(savepath)
46 |         if not os.path.exists(savepath): os.makedirs(savepath)
47 | 
48 |         for qt in qtype_list:
49 |             count = 0
50 |             for t_question in stat_list:
51 |                 #print count, t_question
52 |                 if count < 40/len(qtype_list):
53 |                     t_question_list = t_question['q_list']
54 |                     saveflag = False
55 |                     #print 'debug****************************'
56 |                     #print qt
57 |                     #print t_question_list
58 |                     #print t_question_list[0] == qt[0]
59 |                     #print t_question_list[1] == qt[1]
60 |                     if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
61 |                         saveflag = True
62 |                     else:
63 |                         saveflag = False
64 | 
65 |                     if saveflag == True:
66 |                         t_iid = t_question['iid']
67 |                         if mode == 'val':
68 |                             t_img = Image.open(os.path.join(img_pre, \
69 |                                 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
70 |                         elif mode in ('test-dev', 'test'):
71 |                             t_img = Image.open(os.path.join(img_pre, \
72 |                                 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))
73 | 
74 |                         # for caption
75 |                         #print t_iid
76 |                         #annIds = caps.getAnnIds(t_iid)
77 |                         #anns = caps.loadAnns(annIds)
78 |                         #cap_list = [ann['caption'] for ann in anns]
79 |                         ans_list = t_question['ans_list']
80 |                         draw = ImageDraw.Draw(t_img)
81 |                         for i in range(len(ans_list)):
82 |                             try:
83 |                                 draw.text((10,10*i), str(ans_list[i]))
84 |                             except Exception:  # skip answers the default font cannot render
85 |                                 pass
86 | 
87 |                         ans = t_question['answer']
88 |                         pred = t_question['pred']
89 |                         if ans == -1:
90 |                             pre = ''
91 |                         elif ans == pred:
92 |                             pre = 'correct '
93 |                         else:
94 |                             pre = 'failure '
95 |                         #print ' aaa ', ans, pred
96 |                         ans = re.sub( '/', ' ', str(ans))
97 |                         pred = re.sub( '/', ' ', str(pred))
98 |                         img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
99 |                             str(ans) + ' p_' + str(pred) + '.png'
100 |                         count += 1
101 |                         write_log(os.path.join(savepath,img_title), 'visualize_log.txt')
102 |                         t_img.save(os.path.join(savepath,img_title))
103 | 
104 |     write_log('saving colors', 'visualize_log.txt')
105 |     qt_color_list = [['what','color']]
106 |     save_qtype(qt_color_list, 'colors', mode)
107 | 
108 |     write_log('saving what is', 'visualize_log.txt')
109 |     qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
110 |     save_qtype(qt_whatis_list, 'whatis', mode)
111 | 
112 |     write_log('saving is', 'visualize_log.txt')
113 |     qt_is_list = [['is','the'], ['is','this'],['is','there']]
114 |     save_qtype(qt_is_list, 'is', mode)
115 | 
116 |     write_log('saving how many', 'visualize_log.txt')
117 |     qt_howmany_list =[['how','many']]
118 |     save_qtype(qt_howmany_list, 'howmany', mode)
119 | 
120 | def exec_validation(device_id, mode, it='', visualize=False):
121 | 
122 |     caffe.set_device(device_id)
123 |     caffe.set_mode_gpu()
124 |     net = caffe.Net('./result/proto_test.prototxt',\
125 |                     './result/tmp.caffemodel',\
126 |                     caffe.TEST)
127 | 
128 |     dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE)
129 |     total_questions = len(dp.getQuesIds())
130 |     epoch = 0
131 | 
132 |     pred_list = []
133 |     testloss_list = []
134 |     stat_list = []
135 | 
136 |     while epoch == 0:
137 |         t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
138 |         net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0))
139 |         net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0))
140 |         net.blobs['img_feature'].data[...] = t_img_feature
141 |         net.blobs['label'].data[...] = t_answer
142 |         #net.blobs['glove'].data[...] = t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2))
143 |         net.forward()
144 |         t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
145 |         t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
146 |         testloss_list.append(net.blobs['loss'].data)
147 |         for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
148 |             pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))})
149 |             if visualize:
150 |                 q_list = dp.seq_to_list(dp.getQuesStr(qid))
151 |                 if mode in ('test-dev', 'test'):
152 |                     ans_str = ''
153 |                     ans_list = ['']*10
154 |                 else:
155 |                     ans_str = dp.vec_to_answer(ans)
156 |                     ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
157 |                 stat_list.append({\
158 |                     'qid' : qid,
159 |                     'q_list' : q_list,
160 |                     'iid' : iid,
161 |                     'answer': ans_str,
162 |                     'ans_list': ans_list,
163 |                     'pred' : pred })
164 |         #percent = 100 * float(len(pred_list)) / total_questions
165 |         #sys.stdout.write('\r' + ('%.2f' % percent) + '%')
166 |         #sys.stdout.flush()
167 | 
168 | 
169 | 
170 |     mean_testloss = np.array(testloss_list).mean()
171 | 
172 |     if mode == 'val':
173 |         valFile = './result/val2014_resfile'
174 |         with open(valFile, 'w') as f:
175 |             json.dump(pred_list, f)
176 |         if visualize:
177 |             visualize_failures(stat_list,mode)
178 |         annFile = config.DATA_PATHS['val']['ans_file']
179 |         quesFile = config.DATA_PATHS['val']['ques_file']
180 |         vqa = VQA(annFile, quesFile)
181 |         vqaRes = vqa.loadRes(valFile, quesFile)
182 |         vqaEval = VQAEval(vqa, vqaRes, n=2)
183 |         vqaEval.evaluate()
184 |         acc_overall = vqaEval.accuracy['overall']
185 |         acc_perQuestionType = vqaEval.accuracy['perQuestionType']
186 |         acc_perAnswerType = vqaEval.accuracy['perAnswerType']
187 |         return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
188 |     elif mode == 'test-dev':
189 |         filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results'
190 |         with open(filename+'.json', 'w') as f:
191 |             json.dump(pred_list, f)
192 |         if visualize:
193 |             visualize_failures(stat_list,mode)
194 |     elif mode == 'test':
195 |         filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results'
196 |         with open(filename+'.json', 'w') as f:
197 |             json.dump(pred_list, f)
198 |         if visualize:
199 |             visualize_failures(stat_list,mode)
200 | 
201 | def drawgraph(results, save_question_type_graphs=False):
202 |     # 0:it
203 |     # 1:trainloss
204 |     # 2:testloss
205 |     # 3:oa_acc
206 |     # 4:qt_acc
207 |     # 5:at_acc
208 | 
209 |     # training curve
210 |     it = np.array([l[0] for l in results])
211 |     loss = np.array([l[1] for l in results])
212 |     valloss = np.array([l[2] for l in results])
213 |     valacc = np.array([l[3] for l in results])
214 | 
215 |     fig = plt.figure()
216 |     ax1 = fig.add_subplot(111)
217 |     ax2 = ax1.twinx()
218 | 
219 |     ax1.plot(it,loss, color='blue', label='train loss')
220 |     ax1.plot(it,valloss, '--', color='blue', label='test loss')
221 |     ax2.plot(it,valacc, color='red', label='acc on val')
222 |     plt.legend(loc='lower left')
223 | 
224 |     ax1.set_xlabel('Iterations')
225 |     ax1.set_ylabel('Loss Value')
226 |     ax2.set_ylabel('Accuracy on Val [%]')
227 | 
228 |     plt.savefig('./learning_curve max_%2.2f.png'%valacc.max())
229 |     plt.clf()
230 |     plt.close("all")
231 | 
232 |     # question type
233 |     it = np.array([l[0] for l in results])
234 |     oa_acc = np.array([l[3] for l in results])
235 |     qt_dic_list = [l[4] for l in results]
236 | 
237 |     def draw_qt_acc(target_key_list, figname):
238 |         fig = plt.figure()
239 |         for k in target_key_list:
240 |             write_log(str(k) + str(type(k)), 'visualize_log.txt')
241 |             t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list])
242 |             plt.plot(it,t_val,label=str(k))
243 |         plt.legend(fontsize='small')
244 |         plt.ylim(0,100.)
245 |         #plt.legend(prop={'size':6})
246 | 
247 |         plt.xlabel('Iterations')
248 |         plt.ylabel('Accuracy on Val [%]')
249 | 
250 |         plt.savefig(figname,dpi=200)
251 |         plt.clf()
252 |         plt.close("all")
253 | 
254 |     if save_question_type_graphs:
255 |         s_keys = sorted(qt_dic_list[0].keys())
256 |         draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png')
257 |         draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
258 |         draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
259 |         draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
260 |         draw_qt_acc(['what color is the','what color are the','what color is',\
261 |                 'what color','what is the color of the'],'./qt_color.png')
262 |         draw_qt_acc(['how many','how','how many people are',\
263 |                 'how many people are in'],'./qt_number.png')
264 |         draw_qt_acc(['who is','why','why is the','where is the','where are the',\
265 |                 'which'],'./qt_who_why_where_which.png')
266 |         draw_qt_acc(['what is the man','is the man','are they','is he',\
267 |                 'is the woman','is this person','what is the woman','is the person',\
268 |                 'what is the person'],'./qt_human.png')
269 | 
270 | 
271 | 
--------------------------------------------------------------------------------
/fastText (word)/write_to_log.py:
--------------------------------------------------------------------------------
1 | def write_log(msg, filename):
2 |     with open(filename, 'a') as f:
3 |         f.write(msg + "\n")
4 | 
--------------------------------------------------------------------------------
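**Note (not part of the original repository):** the `qlstm` network in `fastText (word)/train_att_bc.py` encodes a question by zeroing padded word slots (the `L.Scale` of the embeddings by the `cont` mask) and then summing the word vectors with a frozen all-ones 1x1 convolution (the `embed_avg` layer, which computes a sum rather than an average). A minimal NumPy sketch of the equivalent computation; the sizes come from `fastText (word)/config.py`, and the 5-word mask is a made-up example:

```
import numpy as np

# Hypothetical sizes from config.py: BATCH_SIZE, MAX_WORDS_IN_QUESTION, EMBEDDING_SIZE.
batchsize, T, embed_size = 32, 22, 300

embed = np.random.randn(batchsize, T, embed_size)  # output of L.Embed: one vector per word slot
cont = np.zeros((batchsize, T))                    # the 'cont' mask blob: 1 for real words, 0 for padding
cont[:, :5] = 1                                    # e.g. every question in this batch is 5 words long

masked = embed * cont[:, :, None]                  # L.Scale(..., axis=0): zero out padded positions
bow = masked.sum(axis=1)                           # the constant 1x1 Convolution: sum over the T word channels

assert bow.shape == (batchsize, embed_size)        # matches embed_avg_resh: [N, embed_size, 1, 1] after reshape
```

Because the all-ones filter is created with `lr_mult=0, decay_mult=0`, the encoder stays a fixed bag-of-words; only the embedding table itself is learned.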