├── .gitignore
├── CNN Deep Residual
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception (char)
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception (char+word)
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception (word)
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception + Bottleneck
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception + Gate (tanh)
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception + Gate
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Inception + Residual
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── CNN Non-Inception
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── LSTM (baseline)
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
├── README.md
├── fastText (char+word)
│   ├── config.py
│   ├── qlstm_solver.prototxt
│   ├── train_att_bc.py
│   ├── visualize_tools.py
│   ├── vqa_data_provider_layer.py
│   └── write_to_log.py
└── fastText (word)
    ├── config.py
    ├── qlstm_solver.prototxt
    ├── train_att_bc.py
    ├── visualize_tools.py
    ├── vqa_data_provider_layer.py
    └── write_to_log.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------

.DS_Store
--------------------------------------------------------------------------------
/CNN Deep Residual/config.py:
--------------------------------------------------------------------------------
GPU_ID = 9
BATCH_SIZE = 32
VAL_BATCH_SIZE = 32
NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
#MAX_WORDS_IN_QUESTION = 22
MAX_CHARS_IN_QUESTION = 100
MAX_ITERATIONS = 1000000
PRINT_INTERVAL = 1000
VALIDATE_INTERVAL = 110000 # We train on 'train' and test on 'val'. Set this to the total number of training iterations so validation runs once at the end; that validation accuracy is then the test accuracy.

# what data to use for training
TRAIN_DATA_SPLITS = 'train'

# what data to use for the vocabulary
QUESTION_VOCAB_SPACE = 'train'
ANSWER_VOCAB_SPACE = 'train'

# vqa tools - get from https://github.com/VT-vision-lab/VQA
VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools'
VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools'

# location of the data
VQA_PREFIX = '/tempspace/zwang6/VQA/'
GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/'
DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/'

DATA_PATHS = {
    'train': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_'
    },
    'val': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_'
    },
    'test-dev': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    'test': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    # TODO it would be nice if genome also followed the same file format as vqa
    'genome': {
        'genome_file': GENOME_PREFIX + '/question_answers_prepro.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/'
    }
}
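
A quick orientation on how this config is consumed (a minimal sketch; the '/tempspace' paths above are site-specific, and the 'questions' key follows the layout of the official VQA v1 question JSON files):

    import json
    import config

    with open(config.DATA_PATHS['val']['ques_file']) as f:
        ques = json.load(f)
    print len(ques['questions'])  # Python 2 print, matching the rest of this repo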
--------------------------------------------------------------------------------
/CNN Deep Residual/qlstm_solver.prototxt:
--------------------------------------------------------------------------------
# The train/test net protocol buffer definition
train_net: "./result/proto_train.prototxt"
test_net: "./result/proto_test.prototxt"

max_iter: 1000000
display: 5000
snapshot: 5000
snapshot_prefix: "./result/"

# The base learning rate, momentum and the weight decay of the network.
solver_type: ADAM
base_lr: 0.0007
momentum: 0.9
momentum2: 0.999
weight_decay: 0.000
lr_policy: "fixed"
test_iter: 1
test_interval: 10000000

# accumulate gradients
iter_size: 2
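
For reference, this solver file is consumed from pycaffe exactly as in train_att_bc.py; a minimal sketch, assuming the repo's Caffe build. Note that with BATCH_SIZE = 32 in config.py and iter_size: 2 here, gradients are accumulated over two mini-batches, so each Adam update sees an effective batch of 64.

    import caffe
    import config

    caffe.set_device(config.GPU_ID)
    caffe.set_mode_gpu()
    solver = caffe.get_solver('./qlstm_solver.prototxt')
    solver.step(1)  # one update = two accumulated forward/backward passes (iter_size: 2)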
--------------------------------------------------------------------------------
/CNN Deep Residual/visualize_tools.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import json
import re
import shutil
from PIL import Image
from PIL import ImageFont, ImageDraw

import caffe
from caffe import layers as L
from caffe import params as P

from vqa_data_provider_layer import VQADataProvider
from vqa_data_provider_layer import VQADataProviderLayer

import config
sys.path.append(config.VQA_TOOLS_PATH)
sys.path.append(config.VQA_EVAL_TOOLS_PATH)

from vqaTools.vqa import VQA
from vqaEvaluation.vqaEval import VQAEval

from write_to_log import write_log

def visualize_failures(stat_list, mode):

    def save_qtype(qtype_list, save_filename, mode):

        if mode == 'val':
            savepath = os.path.join('./eval', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014'
        elif mode == 'test-dev':
            savepath = os.path.join('./test-dev', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        elif mode == 'test':
            savepath = os.path.join('./test', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        else:
            raise Exception('Unsupported mode')
        if os.path.exists(savepath): shutil.rmtree(savepath)
        if not os.path.exists(savepath): os.makedirs(savepath)

        for qt in qtype_list:
            count = 0
            for t_question in stat_list:
                #print count, t_question
                if count < 40/len(qtype_list):
                    t_question_list = t_question['q_list']
                    # save only questions whose first two words match this question type
                    if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
                        saveflag = True
                    else:
                        saveflag = False

                    if saveflag:
                        t_iid = t_question['iid']
                        if mode == 'val':
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
                        elif mode in ('test-dev', 'test'):
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))

                        # for caption
                        #print t_iid
                        #annIds = caps.getAnnIds(t_iid)
                        #anns = caps.loadAnns(annIds)
                        #cap_list = [ann['caption'] for ann in anns]
                        ans_list = t_question['ans_list']
                        draw = ImageDraw.Draw(t_img)
                        for i in range(len(ans_list)):
                            try:
                                draw.text((10, 10*i), str(ans_list[i]))
                            except Exception:
                                pass

                        ans = t_question['answer']
                        pred = t_question['pred']
                        if ans == -1:
                            pre = ''
                        elif ans == pred:
                            pre = 'correct '
                        else:
                            pre = 'failure '
                        #print ' aaa ', ans, pred
                        ans = re.sub('/', ' ', str(ans))
                        pred = re.sub('/', ' ', str(pred))
                        img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
                            str(ans) + ' p_' + str(pred) + '.png'
                        count += 1
                        write_log(os.path.join(savepath, img_title), 'visualize_log.txt')
                        t_img.save(os.path.join(savepath, img_title))

    write_log('saving colors', 'visualize_log.txt')
    qt_color_list = [['what','color']]
    save_qtype(qt_color_list, 'colors', mode)

    write_log('saving what is', 'visualize_log.txt')
    qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
    save_qtype(qt_whatis_list, 'whatis', mode)

    write_log('saving is', 'visualize_log.txt')
    qt_is_list = [['is','the'], ['is','this'],['is','there']]
    save_qtype(qt_is_list, 'is', mode)

    write_log('saving how many', 'visualize_log.txt')
    qt_howmany_list = [['how','many']]
    save_qtype(qt_howmany_list, 'howmany', mode)

def exec_validation(device_id, mode, it='', visualize=False):

    caffe.set_device(device_id)
    caffe.set_mode_gpu()
    net = caffe.Net('./result/proto_test.prototxt',\
                    './result/tmp.caffemodel',\
                    caffe.TEST)

    dp = VQADataProvider(mode=mode, batchsize=config.VAL_BATCH_SIZE)
    total_questions = len(dp.getQuesIds())
    epoch = 0

    pred_list = []
    testloss_list = []
    stat_list = []

    while epoch == 0:
        t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
        net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0))
        net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0))
        net.blobs['img_feature'].data[...] = t_img_feature
        net.blobs['label'].data[...] = t_answer
        #net.blobs['glove'].data[...] = t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2))
        net.forward()
        t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
        t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
        testloss_list.append(net.blobs['loss'].data)
        for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
            pred_list.append({u'answer': pred, u'question_id': int(dp.getStrippedQuesId(qid))})
            if visualize:
                q_list = dp.seq_to_list(dp.getQuesStr(qid))
                if mode in ('test-dev', 'test'):
                    ans_str = ''
                    ans_list = ['']*10
                else:
                    ans_str = dp.vec_to_answer(ans)
                    ans_list = [dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
                stat_list.append({\
                    'qid': qid,
                    'q_list': q_list,
                    'iid': iid,
                    'answer': ans_str,
                    'ans_list': ans_list,
                    'pred': pred})
        #percent = 100 * float(len(pred_list)) / total_questions
        #sys.stdout.write('\r' + ('%.2f' % percent) + '%')
        #sys.stdout.flush()

    mean_testloss = np.array(testloss_list).mean()

    if mode == 'val':
        valFile = './result/val2014_resfile'
        with open(valFile, 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)
        annFile = config.DATA_PATHS['val']['ans_file']
        quesFile = config.DATA_PATHS['val']['ques_file']
        vqa = VQA(annFile, quesFile)
        vqaRes = vqa.loadRes(valFile, quesFile)
        vqaEval = VQAEval(vqa, vqaRes, n=2)
        vqaEval.evaluate()
        acc_overall = vqaEval.accuracy['overall']
        acc_perQuestionType = vqaEval.accuracy['perQuestionType']
        acc_perAnswerType = vqaEval.accuracy['perAnswerType']
        return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
    elif mode == 'test-dev':
        filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results'
        with open(filename+'.json', 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)
    elif mode == 'test':
        filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results'
        with open(filename+'.json', 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)

def drawgraph(results, save_question_type_graphs=False):
    # 0:it
    # 1:trainloss
    # 2:testloss
    # 3:oa_acc
    # 4:qt_acc
    # 5:at_acc

    # training curve
    it = np.array([l[0] for l in results])
    loss = np.array([l[1] for l in results])
    valloss = np.array([l[2] for l in results])
    valacc = np.array([l[3] for l in results])

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax2 = ax1.twinx()

    ax1.plot(it, loss, color='blue', label='train loss')
    ax1.plot(it, valloss, '--', color='blue', label='test loss')
    ax2.plot(it, valacc, color='red', label='acc on val')
    plt.legend(loc='lower left')

    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Loss Value')
    ax2.set_ylabel('Accuracy on Val [%]')

    plt.savefig('./learning_curve max_%2.2f.png' % valacc.max())
    plt.clf()
    plt.close("all")

    # question type
    it = np.array([l[0] for l in results])
    oa_acc = np.array([l[3] for l in results])
    qt_dic_list = [l[4] for l in results]

    def draw_qt_acc(target_key_list, figname):
        fig = plt.figure()
        for k in target_key_list:
            write_log(str(k) + str(type(k)), 'visualize_log.txt')
            t_val = np.array([qt_dic[k] for qt_dic in qt_dic_list])
            plt.plot(it, t_val, label=str(k))
        plt.legend(fontsize='small')
        plt.ylim(0, 100.)
        #plt.legend(prop={'size':6})

        plt.xlabel('Iterations')
        plt.ylabel('Accuracy on Val [%]')

        plt.savefig(figname, dpi=200)
        plt.clf()
        plt.close("all")

    if save_question_type_graphs:
        s_keys = sorted(qt_dic_list[0].keys())
        draw_qt_acc(s_keys[0:13]+[s_keys[31],], './ind_qt_are.png')
        draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
        draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
        draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
        draw_qt_acc(['what color is the','what color are the','what color is',\
            'what color','what is the color of the'], './qt_color.png')
        draw_qt_acc(['how many','how','how many people are',\
            'how many people are in'], './qt_number.png')
        draw_qt_acc(['who is','why','why is the','where is the','where are the',\
            'which'], './qt_who_why_where_which.png')
        draw_qt_acc(['what is the man','is the man','are they','is he',\
            'is the woman','is this person','what is the woman','is the person',\
            'what is the person'], './qt_human.png')
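
The 'val' branch of exec_validation above delegates scoring to the official VQA evaluation tools. A minimal standalone sketch of the same flow, assuming a result file in the format written above (a JSON list of {'answer', 'question_id'} dicts):

    import sys
    import config
    sys.path.append(config.VQA_TOOLS_PATH)
    sys.path.append(config.VQA_EVAL_TOOLS_PATH)
    from vqaTools.vqa import VQA
    from vqaEvaluation.vqaEval import VQAEval

    vqa = VQA(config.DATA_PATHS['val']['ans_file'], config.DATA_PATHS['val']['ques_file'])
    vqaRes = vqa.loadRes('./result/val2014_resfile', config.DATA_PATHS['val']['ques_file'])
    vqaEval = VQAEval(vqa, vqaRes, n=2)  # n: decimal precision of reported accuracies
    vqaEval.evaluate()
    print vqaEval.accuracy['overall']    # plus 'perQuestionType' / 'perAnswerType' breakdowns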
--------------------------------------------------------------------------------
/CNN Deep Residual/write_to_log.py:
--------------------------------------------------------------------------------
def write_log(text, filename):  # renamed from 'str' to avoid shadowing the builtin
    with open(filename, 'a') as f:
        f.write(text + "\n")
--------------------------------------------------------------------------------
/CNN Inception (char)/config.py:
--------------------------------------------------------------------------------
GPU_ID = 10
BATCH_SIZE = 32
VAL_BATCH_SIZE = 32
NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
# MAX_WORDS_IN_QUESTION = 22
MAX_CHARS_IN_QUESTION = 100
MAX_ITERATIONS = 1000000
PRINT_INTERVAL = 1000
VALIDATE_INTERVAL = 150000 # We train on 'train' and test on 'val'. Set this to the total number of training iterations so validation runs once at the end; that validation accuracy is then the test accuracy.

# what data to use for training
TRAIN_DATA_SPLITS = 'train'

# what data to use for the vocabulary
QUESTION_VOCAB_SPACE = 'train'
ANSWER_VOCAB_SPACE = 'train'

# vqa tools - get from https://github.com/VT-vision-lab/VQA
VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools'
VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools'

# location of the data
VQA_PREFIX = '/tempspace/zwang6/VQA/'
GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/'
DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/'

DATA_PATHS = {
    'train': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_'
    },
    'val': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_'
    },
    'test-dev': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    'test': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    # TODO it would be nice if genome also followed the same file format as vqa
    'genome': {
        'genome_file': GENOME_PREFIX + '/question_answers_prepro.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/'
    }
}
--------------------------------------------------------------------------------
/CNN Inception (char)/qlstm_solver.prototxt:
--------------------------------------------------------------------------------
# The train/test net protocol buffer definition
train_net: "./result/proto_train.prototxt"
test_net: "./result/proto_test.prototxt"

max_iter: 1000000
display: 5000
snapshot: 5000
snapshot_prefix: "./result/"

# The base learning rate, momentum and the weight decay of the network.
solver_type: ADAM
base_lr: 0.0007
momentum: 0.9
momentum2: 0.999
weight_decay: 0.000
lr_policy: "fixed"
test_iter: 1
test_interval: 10000000

# accumulate gradients
iter_size: 2
--------------------------------------------------------------------------------
/CNN Inception (char)/visualize_tools.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import json
import re
import shutil
from PIL import Image
from PIL import ImageFont, ImageDraw

import caffe
from caffe import layers as L
from caffe import params as P

from vqa_data_provider_layer import VQADataProvider
from vqa_data_provider_layer import VQADataProviderLayer

import config
sys.path.append(config.VQA_TOOLS_PATH)
sys.path.append(config.VQA_EVAL_TOOLS_PATH)

from vqaTools.vqa import VQA
from vqaEvaluation.vqaEval import VQAEval

from write_to_log import write_log

def visualize_failures(stat_list, mode):

    def save_qtype(qtype_list, save_filename, mode):

        if mode == 'val':
            savepath = os.path.join('./eval', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014'
        elif mode == 'test-dev':
            savepath = os.path.join('./test-dev', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        elif mode == 'test':
            savepath = os.path.join('./test', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        else:
            raise Exception('Unsupported mode')
        if os.path.exists(savepath): shutil.rmtree(savepath)
        if not os.path.exists(savepath): os.makedirs(savepath)

        for qt in qtype_list:
            count = 0
            for t_question in stat_list:
                #print count, t_question
                if count < 40/len(qtype_list):
                    t_question_list = t_question['q_list']
                    # save only questions whose first two words match this question type
                    if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
                        saveflag = True
                    else:
                        saveflag = False

                    if saveflag:
                        t_iid = t_question['iid']
                        if mode == 'val':
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
                        elif mode in ('test-dev', 'test'):
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))

                        # for caption
                        #print t_iid
                        #annIds = caps.getAnnIds(t_iid)
                        #anns = caps.loadAnns(annIds)
                        #cap_list = [ann['caption'] for ann in anns]
                        ans_list = t_question['ans_list']
                        draw = ImageDraw.Draw(t_img)
                        for i in range(len(ans_list)):
                            try:
                                draw.text((10, 10*i), str(ans_list[i]))
                            except Exception:
                                pass

                        ans = t_question['answer']
                        pred = t_question['pred']
                        if ans == -1:
                            pre = ''
                        elif ans == pred:
                            pre = 'correct '
                        else:
                            pre = 'failure '
                        #print ' aaa ', ans, pred
                        ans = re.sub('/', ' ', str(ans))
                        pred = re.sub('/', ' ', str(pred))
                        img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
                            str(ans) + ' p_' + str(pred) + '.png'
                        count += 1
                        write_log(os.path.join(savepath, img_title), 'visualize_log.txt')
                        t_img.save(os.path.join(savepath, img_title))

    write_log('saving colors', 'visualize_log.txt')
    qt_color_list = [['what','color']]
    save_qtype(qt_color_list, 'colors', mode)

    write_log('saving what is', 'visualize_log.txt')
    qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
    save_qtype(qt_whatis_list, 'whatis', mode)

    write_log('saving is', 'visualize_log.txt')
    qt_is_list = [['is','the'], ['is','this'],['is','there']]
    save_qtype(qt_is_list, 'is', mode)

    write_log('saving how many', 'visualize_log.txt')
    qt_howmany_list = [['how','many']]
    save_qtype(qt_howmany_list, 'howmany', mode)

def exec_validation(device_id, mode, it='', visualize=False):

    caffe.set_device(device_id)
    caffe.set_mode_gpu()
    net = caffe.Net('./result/proto_test.prototxt',\
                    './result/tmp.caffemodel',\
                    caffe.TEST)

    dp = VQADataProvider(mode=mode, batchsize=config.VAL_BATCH_SIZE)
    total_questions = len(dp.getQuesIds())
    epoch = 0

    pred_list = []
    testloss_list = []
    stat_list = []

    while epoch == 0:
        t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
        net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0))
        net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0))
        net.blobs['img_feature'].data[...] = t_img_feature
        net.blobs['label'].data[...] = t_answer
        #net.blobs['glove'].data[...] = t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2))
        net.forward()
        t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
        t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
        testloss_list.append(net.blobs['loss'].data)
        for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
            pred_list.append({u'answer': pred, u'question_id': int(dp.getStrippedQuesId(qid))})
            if visualize:
                q_list = dp.seq_to_list(dp.getQuesStr(qid))
                if mode in ('test-dev', 'test'):
                    ans_str = ''
                    ans_list = ['']*10
                else:
                    ans_str = dp.vec_to_answer(ans)
                    ans_list = [dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
                stat_list.append({\
                    'qid': qid,
                    'q_list': q_list,
                    'iid': iid,
                    'answer': ans_str,
                    'ans_list': ans_list,
                    'pred': pred})
        #percent = 100 * float(len(pred_list)) / total_questions
        #sys.stdout.write('\r' + ('%.2f' % percent) + '%')
        #sys.stdout.flush()

    mean_testloss = np.array(testloss_list).mean()

    if mode == 'val':
        valFile = './result/val2014_resfile'
        with open(valFile, 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)
        annFile = config.DATA_PATHS['val']['ans_file']
        quesFile = config.DATA_PATHS['val']['ques_file']
        vqa = VQA(annFile, quesFile)
        vqaRes = vqa.loadRes(valFile, quesFile)
        vqaEval = VQAEval(vqa, vqaRes, n=2)
        vqaEval.evaluate()
        acc_overall = vqaEval.accuracy['overall']
        acc_perQuestionType = vqaEval.accuracy['perQuestionType']
        acc_perAnswerType = vqaEval.accuracy['perAnswerType']
        return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
    elif mode == 'test-dev':
        filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results'
        with open(filename+'.json', 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)
    elif mode == 'test':
        filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results'
        with open(filename+'.json', 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)

def drawgraph(results, save_question_type_graphs=False):
    # 0:it
    # 1:trainloss
    # 2:testloss
    # 3:oa_acc
    # 4:qt_acc
    # 5:at_acc

    # training curve
    it = np.array([l[0] for l in results])
    loss = np.array([l[1] for l in results])
    valloss = np.array([l[2] for l in results])
    valacc = np.array([l[3] for l in results])

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax2 = ax1.twinx()

    ax1.plot(it, loss, color='blue', label='train loss')
    ax1.plot(it, valloss, '--', color='blue', label='test loss')
    ax2.plot(it, valacc, color='red', label='acc on val')
    plt.legend(loc='lower left')

    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Loss Value')
    ax2.set_ylabel('Accuracy on Val [%]')

    plt.savefig('./learning_curve max_%2.2f.png' % valacc.max())
    plt.clf()
    plt.close("all")

    # question type
    it = np.array([l[0] for l in results])
    oa_acc = np.array([l[3] for l in results])
    qt_dic_list = [l[4] for l in results]

    def draw_qt_acc(target_key_list, figname):
        fig = plt.figure()
        for k in target_key_list:
            write_log(str(k) + str(type(k)), 'visualize_log.txt')
            t_val = np.array([qt_dic[k] for qt_dic in qt_dic_list])
            plt.plot(it, t_val, label=str(k))
        plt.legend(fontsize='small')
        plt.ylim(0, 100.)
        #plt.legend(prop={'size':6})

        plt.xlabel('Iterations')
        plt.ylabel('Accuracy on Val [%]')

        plt.savefig(figname, dpi=200)
        plt.clf()
        plt.close("all")

    if save_question_type_graphs:
        s_keys = sorted(qt_dic_list[0].keys())
        draw_qt_acc(s_keys[0:13]+[s_keys[31],], './ind_qt_are.png')
        draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
        draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
        draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
        draw_qt_acc(['what color is the','what color are the','what color is',\
            'what color','what is the color of the'], './qt_color.png')
        draw_qt_acc(['how many','how','how many people are',\
            'how many people are in'], './qt_number.png')
        draw_qt_acc(['who is','why','why is the','where is the','where are the',\
            'which'], './qt_who_why_where_which.png')
        draw_qt_acc(['what is the man','is the man','are they','is he',\
            'is the woman','is this person','what is the woman','is the person',\
            'what is the person'], './qt_human.png')
--------------------------------------------------------------------------------
/CNN Inception (char)/write_to_log.py:
--------------------------------------------------------------------------------
def write_log(text, filename):  # renamed from 'str' to avoid shadowing the builtin
    with open(filename, 'a') as f:
        f.write(text + "\n")
--------------------------------------------------------------------------------
/CNN Inception (char+word)/config.py:
--------------------------------------------------------------------------------
GPU_ID = 7
BATCH_SIZE = 32
VAL_BATCH_SIZE = 32
NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
MAX_WORDS_IN_QUESTION = 22
LENGTH_OF_LONGEST_WORD = 17
#MAX_CHARS_IN_QUESTION = 100
MAX_ITERATIONS = 1000000
PRINT_INTERVAL = 1000
VALIDATE_INTERVAL = 160000 # We train on 'train' and test on 'val'. Set this to the total number of training iterations so validation runs once at the end; that validation accuracy is then the test accuracy.

# what data to use for training
TRAIN_DATA_SPLITS = 'train'

# what data to use for the vocabulary
QUESTION_VOCAB_SPACE = 'train'
ANSWER_VOCAB_SPACE = 'train'

# vqa tools - get from https://github.com/VT-vision-lab/VQA
VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools'
VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools'

# location of the data
VQA_PREFIX = '/tempspace/zwang6/VQA/'
GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/'
DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/'

DATA_PATHS = {
    'train': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_'
    },
    'val': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_'
    },
    'test-dev': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    'test': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    # TODO it would be nice if genome also followed the same file format as vqa
    'genome': {
        'genome_file': GENOME_PREFIX + '/question_answers_prepro.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/'
    }
}
--------------------------------------------------------------------------------
/CNN Inception (char+word)/qlstm_solver.prototxt:
--------------------------------------------------------------------------------
# The train/test net protocol buffer definition
train_net: "./result/proto_train.prototxt"
test_net: "./result/proto_test.prototxt"

max_iter: 1000000
display: 5000
snapshot: 5000
snapshot_prefix: "./result/"

# The base learning rate, momentum and the weight decay of the network.
solver_type: ADAM
base_lr: 0.0007
momentum: 0.9
momentum2: 0.999
weight_decay: 0.000
lr_policy: "fixed"
test_iter: 1
test_interval: 10000000

# accumulate gradients
iter_size: 2
--------------------------------------------------------------------------------
/CNN Inception (char+word)/visualize_tools.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import json
import re
import shutil
from PIL import Image
from PIL import ImageFont, ImageDraw

import caffe
from caffe import layers as L
from caffe import params as P

from vqa_data_provider_layer import VQADataProvider
from vqa_data_provider_layer import VQADataProviderLayer

import config
sys.path.append(config.VQA_TOOLS_PATH)
sys.path.append(config.VQA_EVAL_TOOLS_PATH)

from vqaTools.vqa import VQA
from vqaEvaluation.vqaEval import VQAEval

from write_to_log import write_log

def visualize_failures(stat_list, mode):

    def save_qtype(qtype_list, save_filename, mode):

        if mode == 'val':
            savepath = os.path.join('./eval', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014'
        elif mode == 'test-dev':
            savepath = os.path.join('./test-dev', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        elif mode == 'test':
            savepath = os.path.join('./test', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        else:
            raise Exception('Unsupported mode')
        if os.path.exists(savepath): shutil.rmtree(savepath)
        if not os.path.exists(savepath): os.makedirs(savepath)

        for qt in qtype_list:
            count = 0
            for t_question in stat_list:
                #print count, t_question
                if count < 40/len(qtype_list):
                    t_question_list = t_question['q_list']
                    # save only questions whose first two words match this question type
                    if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
                        saveflag = True
                    else:
                        saveflag = False

                    if saveflag:
                        t_iid = t_question['iid']
                        if mode == 'val':
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
                        elif mode in ('test-dev', 'test'):
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))

                        # for caption
                        #print t_iid
                        #annIds = caps.getAnnIds(t_iid)
                        #anns = caps.loadAnns(annIds)
                        #cap_list = [ann['caption'] for ann in anns]
                        ans_list = t_question['ans_list']
                        draw = ImageDraw.Draw(t_img)
                        for i in range(len(ans_list)):
                            try:
                                draw.text((10, 10*i), str(ans_list[i]))
                            except Exception:
                                pass

                        ans = t_question['answer']
                        pred = t_question['pred']
                        if ans == -1:
                            pre = ''
                        elif ans == pred:
                            pre = 'correct '
                        else:
                            pre = 'failure '
                        #print ' aaa ', ans, pred
                        ans = re.sub('/', ' ', str(ans))
                        pred = re.sub('/', ' ', str(pred))
                        img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
                            str(ans) + ' p_' + str(pred) + '.png'
                        count += 1
                        write_log(os.path.join(savepath, img_title), 'visualize_log.txt')
                        t_img.save(os.path.join(savepath, img_title))

    write_log('saving colors', 'visualize_log.txt')
    qt_color_list = [['what','color']]
    save_qtype(qt_color_list, 'colors', mode)

    write_log('saving what is', 'visualize_log.txt')
    qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
    save_qtype(qt_whatis_list, 'whatis', mode)

    write_log('saving is', 'visualize_log.txt')
    qt_is_list = [['is','the'], ['is','this'],['is','there']]
    save_qtype(qt_is_list, 'is', mode)

    write_log('saving how many', 'visualize_log.txt')
    qt_howmany_list = [['how','many']]
    save_qtype(qt_howmany_list, 'howmany', mode)

def exec_validation(device_id, mode, it='', visualize=False):

    caffe.set_device(device_id)
    caffe.set_mode_gpu()
    net = caffe.Net('./result/proto_test.prototxt',\
                    './result/tmp.caffemodel',\
                    caffe.TEST)

    dp = VQADataProvider(mode=mode, batchsize=config.VAL_BATCH_SIZE)
    total_questions = len(dp.getQuesIds())
    epoch = 0

    pred_list = []
    testloss_list = []
    stat_list = []

    while epoch == 0:
        t_word, t_cont, t_word_c, t_cont_c, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
        net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0))
        net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0))
        net.blobs['data1'].data[...] = t_word_c
        net.blobs['cont1'].data[...] = t_cont_c
        net.blobs['img_feature'].data[...] = t_img_feature
        net.blobs['label'].data[...] = t_answer
        #net.blobs['glove'].data[...] = t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2))
        net.forward()
        t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
        t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
        testloss_list.append(net.blobs['loss'].data)
        for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
            pred_list.append({u'answer': pred, u'question_id': int(dp.getStrippedQuesId(qid))})
            if visualize:
                q_list = dp.seq_to_list(dp.getQuesStr(qid))
                if mode in ('test-dev', 'test'):
                    ans_str = ''
                    ans_list = ['']*10
                else:
                    ans_str = dp.vec_to_answer(ans)
                    ans_list = [dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
                stat_list.append({\
                    'qid': qid,
                    'q_list': q_list,
                    'iid': iid,
                    'answer': ans_str,
                    'ans_list': ans_list,
                    'pred': pred})
        #percent = 100 * float(len(pred_list)) / total_questions
        #sys.stdout.write('\r' + ('%.2f' % percent) + '%')
        #sys.stdout.flush()

    mean_testloss = np.array(testloss_list).mean()

    if mode == 'val':
        valFile = './result/val2014_resfile'
        with open(valFile, 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)
        annFile = config.DATA_PATHS['val']['ans_file']
        quesFile = config.DATA_PATHS['val']['ques_file']
        vqa = VQA(annFile, quesFile)
        vqaRes = vqa.loadRes(valFile, quesFile)
        vqaEval = VQAEval(vqa, vqaRes, n=2)
        vqaEval.evaluate()
        acc_overall = vqaEval.accuracy['overall']
        acc_perQuestionType = vqaEval.accuracy['perQuestionType']
        acc_perAnswerType = vqaEval.accuracy['perAnswerType']
        return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
    elif mode == 'test-dev':
        filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results'
        with open(filename+'.json', 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)
    elif mode == 'test':
        filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results'
        with open(filename+'.json', 'w') as f:
            json.dump(pred_list, f)
        if visualize:
            visualize_failures(stat_list, mode)

def drawgraph(results, save_question_type_graphs=False):
    # 0:it
    # 1:trainloss
    # 2:testloss
    # 3:oa_acc
    # 4:qt_acc
    # 5:at_acc

    # training curve
    it = np.array([l[0] for l in results])
    loss = np.array([l[1] for l in results])
    valloss = np.array([l[2] for l in results])
    valacc = np.array([l[3] for l in results])

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    ax2 = ax1.twinx()

    ax1.plot(it, loss, color='blue', label='train loss')
    ax1.plot(it, valloss, '--', color='blue', label='test loss')
    ax2.plot(it, valacc, color='red', label='acc on val')
    plt.legend(loc='lower left')

    ax1.set_xlabel('Iterations')
    ax1.set_ylabel('Loss Value')
    ax2.set_ylabel('Accuracy on Val [%]')

    plt.savefig('./learning_curve max_%2.2f.png' % valacc.max())
    plt.clf()
    plt.close("all")

    # question type
    it = np.array([l[0] for l in results])
    oa_acc = np.array([l[3] for l in results])
    qt_dic_list = [l[4] for l in results]

    def draw_qt_acc(target_key_list, figname):
        fig = plt.figure()
        for k in target_key_list:
            write_log(str(k) + str(type(k)), 'visualize_log.txt')
            t_val = np.array([qt_dic[k] for qt_dic in qt_dic_list])
            plt.plot(it, t_val, label=str(k))
        plt.legend(fontsize='small')
        plt.ylim(0, 100.)
        #plt.legend(prop={'size':6})

        plt.xlabel('Iterations')
        plt.ylabel('Accuracy on Val [%]')

        plt.savefig(figname, dpi=200)
        plt.clf()
        plt.close("all")

    if save_question_type_graphs:
        s_keys = sorted(qt_dic_list[0].keys())
        draw_qt_acc(s_keys[0:13]+[s_keys[31],], './ind_qt_are.png')
        draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
        draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
        draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
        draw_qt_acc(['what color is the','what color are the','what color is',\
            'what color','what is the color of the'], './qt_color.png')
        draw_qt_acc(['how many','how','how many people are',\
            'how many people are in'], './qt_number.png')
        draw_qt_acc(['who is','why','why is the','where is the','where are the',\
            'which'], './qt_who_why_where_which.png')
        draw_qt_acc(['what is the man','is the man','are they','is he',\
            'is the woman','is this person','what is the woman','is the person',\
            'what is the person'], './qt_human.png')
--------------------------------------------------------------------------------
/CNN Inception (char+word)/write_to_log.py:
--------------------------------------------------------------------------------
def write_log(text, filename):  # renamed from 'str' to avoid shadowing the builtin
    with open(filename, 'a') as f:
        f.write(text + "\n")
--------------------------------------------------------------------------------
/CNN Inception (word)/config.py:
--------------------------------------------------------------------------------
GPU_ID = 8
BATCH_SIZE = 32
VAL_BATCH_SIZE = 32
NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
MAX_WORDS_IN_QUESTION = 22 # Do not crop
MAX_ITERATIONS = 1000000
PRINT_INTERVAL = 1000
VALIDATE_INTERVAL = 140000 # We train on 'train' and test on 'val'. Set this to the total number of training iterations so validation runs once at the end; that validation accuracy is then the test accuracy.

# what data to use for training
TRAIN_DATA_SPLITS = 'train'

# what data to use for the vocabulary
QUESTION_VOCAB_SPACE = 'train'
ANSWER_VOCAB_SPACE = 'train'

# vqa tools - get from https://github.com/VT-vision-lab/VQA
VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools'
VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools'

# location of the data
VQA_PREFIX = '/tempspace/zwang6/VQA/'
GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/'
DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/'

DATA_PATHS = {
    'train': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_'
    },
    'val': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
        'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_'
    },
    'test-dev': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    'test': {
        'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
    },
    # TODO it would be nice if genome also followed the same file format as vqa
    'genome': {
        'genome_file': GENOME_PREFIX + '/question_answers_prepro.json',
        'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/'
    }
}
--------------------------------------------------------------------------------
/CNN Inception (word)/qlstm_solver.prototxt:
--------------------------------------------------------------------------------
# The train/test net protocol buffer definition
train_net: "./result/proto_train.prototxt"
test_net: "./result/proto_test.prototxt"

max_iter: 1000000
display: 5000
snapshot: 5000
snapshot_prefix: "./result/"

# The base learning rate, momentum and the weight decay of the network.
solver_type: ADAM
base_lr: 0.0007
momentum: 0.9
momentum2: 0.999
weight_decay: 0.000
lr_policy: "fixed"
test_iter: 1
test_interval: 10000000

# accumulate gradients
iter_size: 2
--------------------------------------------------------------------------------
/CNN Inception (word)/train_att_bc.py:
--------------------------------------------------------------------------------
import matplotlib
matplotlib.use('Agg')
import os
import sys
import numpy as np
import json
import matplotlib.pyplot as plt
from write_to_log import write_log

import caffe
from caffe import layers as L
from caffe import params as P

from vqa_data_provider_layer import VQADataProvider
from visualize_tools import exec_validation, drawgraph
import config


def qlstm(mode, batchsize, T, question_vocab_size):
    n = caffe.NetSpec()
    mode_str = json.dumps({'mode':mode, 'batchsize':batchsize})
    # n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\
    #     module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 )
    n.data, n.cont, n.img_feature, n.label = L.Python(\
        module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=4 )

    # word embedding (static + dynamic)
    n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
        weight_filler=dict(type='uniform', min=-0.08, max=0.08))
    # n.embed = L.TanH(n.embed_ba)
    n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(dict(axis=0)))
    n.embed_scale_resh = L.Reshape(n.embed_scale,\
        reshape_param=dict(\
            shape=dict(dim=[batchsize, 1, T, 300])))

    # convolution
    n.word_feature_2 = L.Convolution(n.embed_scale_resh, kernel_h=2, kernel_w=300, stride=1, num_output=512, pad_h=1, pad_w=0, weight_filler=dict(type='xavier')) # N x 512 x (T+1) x 1
    n.word_feature_3 = L.Convolution(n.embed_scale_resh, kernel_h=3, kernel_w=300, stride=1, num_output=512, pad_h=2, pad_w=0, weight_filler=dict(type='xavier'))
    n.word_feature_4 = L.Convolution(n.embed_scale_resh, kernel_h=4, kernel_w=300, stride=1, num_output=512, pad_h=3, pad_w=0, weight_filler=dict(type='xavier'))
    n.word_feature_5 = L.Convolution(n.embed_scale_resh, kernel_h=5, kernel_w=300, stride=1, num_output=512, pad_h=4, pad_w=0, weight_filler=dict(type='xavier'))
    n.word_relu_2 = L.ReLU(n.word_feature_2)
    n.word_relu_3 = L.ReLU(n.word_feature_3)
    n.word_relu_4 = L.ReLU(n.word_feature_4)
    n.word_relu_5 = L.ReLU(n.word_feature_5)
    n.word_vec_2 = L.Pooling(n.word_relu_2, kernel_h=T+1, kernel_w=1, stride=T+1, pool=P.Pooling.MAX) # N x 512 x 1 x 1
    n.word_vec_3 = L.Pooling(n.word_relu_3, kernel_h=T+2, kernel_w=1, stride=T+2, pool=P.Pooling.MAX)
    n.word_vec_4 = L.Pooling(n.word_relu_4, kernel_h=T+3, kernel_w=1, stride=T+3, pool=P.Pooling.MAX)
    n.word_vec_5 = L.Pooling(n.word_relu_5, kernel_h=T+4, kernel_w=1, stride=T+4, pool=P.Pooling.MAX)
    word_vec = [n.word_vec_2, n.word_vec_3, n.word_vec_4, n.word_vec_5]
    n.concat_vec = L.Concat(*word_vec, concat_param={'axis': 1}) # N x 2048 x 1 x 1
    n.concat_vec_dropped = L.Dropout(n.concat_vec, dropout_param={'dropout_ratio': 0.5})
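
    # Shape note for the multi-width text CNN above: with kernel_h=k and
    # pad_h=k-1, the convolution output height is T + 2*(k-1) - k + 1 = T + k - 1,
    # which matches each branch's pooling kernel (T+1, ..., T+4), so every branch
    # max-pools to a single 512-d vector and the concat is N x 2048 x 1 x 1.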

    n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped, axis=2, tiles=14)
    n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14)
    n.i_emb_tanh_droped_resh = L.Reshape(n.img_feature, reshape_param=dict(shape=dict(dim=[-1, 2048, 14, 14])))
    n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh, compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
    n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
    n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2, dropout_param={'dropout_ratio': 0.1})

    # multi-channel attention
    n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier'))
    n.att_conv1_relu = L.ReLU(n.att_conv1)
    n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0, weight_filler=dict(type='xavier'))
    n.att_reshaped = L.Reshape(n.att_conv2, reshape_param=dict(shape=dict(dim=[-1, 2, 14*14])))
    n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
    n.att = L.Reshape(n.att_softmax, reshape_param=dict(shape=dict(dim=[-1, 2, 14, 14])))
    att_maps = L.Slice(n.att, ntop=2, slice_param={'axis': 1})
    n.att_map0 = att_maps[0]
    n.att_map1 = att_maps[1]
    dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
    n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy)
    n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy)
    n.att_feature0_resh = L.Reshape(n.att_feature0, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature1_resh = L.Reshape(n.att_feature1, reshape_param=dict(shape=dict(dim=[-1, 2048])))
    n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)

    # merge attention and question features with compact bilinear pooling
    n.att_feature_resh = L.Reshape(n.att_feature, reshape_param=dict(shape=dict(dim=[-1, 4096, 1, 1])))
    #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
    n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.concat_vec_dropped,
                                      compact_bilinear_param=dict(num_output=16000, sum_pool=False))
    n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
    n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)

    n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2, dropout_param={'dropout_ratio': 0.1})
    n.bc_dropped_resh = L.Reshape(n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))

    n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000, weight_filler=dict(type='xavier'))
    n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
    return n.to_proto()
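
# Data flow of qlstm above: the text CNN encodes the question into a 2048-d
# vector, which is tiled to 14 x 14 and fused with the 2048 x 14 x 14 image
# features by compact bilinear pooling; two soft attention maps computed from
# the fused features each pool the image map into a 2048-d vector, and the
# concatenated 4096-d attended feature is fused with the question vector by a
# second compact bilinear pooling before the 3000-way softmax classifier.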

def make_answer_vocab(adic, vocab_size):
    """
    Returns a dictionary that maps answers to indices.
    """
    adict = {'': 0}
    nadict = {'': 1000000}
    vid = 1
    for qid in adic.keys():
        answer_obj = adic[qid]
        answer_list = [ans['answer'] for ans in answer_obj]

        for q_ans in answer_list:
            # create dict
            if q_ans in adict:
                nadict[q_ans] += 1
            else:
                nadict[q_ans] = 1
                adict[q_ans] = vid
                vid += 1

    # debug
    nalist = []
    for k, v in sorted(nadict.items(), key=lambda x: x[1]):
        nalist.append((k, v))

    # keep only the vocab_size most frequent answers; everything rarer is dropped
    n_del_ans = 0
    n_valid_ans = 0
    adict_nid = {}
    for i, w in enumerate(nalist[:-vocab_size]):
        del adict[w[0]]
        n_del_ans += w[1]
    for i, w in enumerate(nalist[-vocab_size:]):
        n_valid_ans += w[1]
        adict_nid[w[0]] = i

    return adict_nid

def make_question_vocab(qdic):
    """
    Returns a dictionary that maps words to indices.
    """
    vdict = {'': 0}
    vid = 1
    for qid in qdic.keys():
        # sequence to list
        q_str = qdic[qid]['qstr']
        q_list = VQADataProvider.seq_to_list(q_str)

        # create dict
        for w in q_list:
            if w not in vdict:
                vdict[w] = vid
                vid += 1

    return vdict

def make_vocab_files():
    """
    Produce the question and answer vocabulary files.
    """
    write_log('making question vocab... ' + config.QUESTION_VOCAB_SPACE, 'log.txt')
    qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE)
    question_vocab = make_question_vocab(qdic)
    write_log('making answer vocab... ' + config.ANSWER_VOCAB_SPACE, 'log.txt')
    _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE)
    answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS)
    return question_vocab, answer_vocab

def main():
    if not os.path.exists('./result'):
        os.makedirs('./result')

    question_vocab, answer_vocab = {}, {}
    if os.path.exists('./result/vdict.json') and os.path.exists('./result/adict.json'):
        write_log('restoring vocab', 'log.txt')
        with open('./result/vdict.json', 'r') as f:
            question_vocab = json.load(f)
        with open('./result/adict.json', 'r') as f:
            answer_vocab = json.load(f)
    else:
        question_vocab, answer_vocab = make_vocab_files()
        with open('./result/vdict.json', 'w') as f:
            json.dump(question_vocab, f)
        with open('./result/adict.json', 'w') as f:
            json.dump(answer_vocab, f)

    write_log('question vocab size: ' + str(len(question_vocab)), 'log.txt')
    write_log('answer vocab size: ' + str(len(answer_vocab)), 'log.txt')

    with open('./result/proto_train.prototxt', 'w') as f:
        f.write(str(qlstm(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \
            config.MAX_WORDS_IN_QUESTION, len(question_vocab))))

    with open('./result/proto_test.prototxt', 'w') as f:
        f.write(str(qlstm('val', config.VAL_BATCH_SIZE, \
            config.MAX_WORDS_IN_QUESTION, len(question_vocab))))

    caffe.set_device(config.GPU_ID)
    caffe.set_mode_gpu()
    solver = caffe.get_solver('./qlstm_solver.prototxt')

    train_loss = np.zeros(config.MAX_ITERATIONS)
    # results = []

    for it in range(config.MAX_ITERATIONS):
        solver.step(1)

        # store the train loss
        train_loss[it] = solver.net.blobs['loss'].data

        if it != 0 and it % config.PRINT_INTERVAL == 0:
            write_log('------------------------------------', 'log.txt')
            write_log('Iteration: ' + str(it), 'log.txt')
            c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean()
            write_log('Train loss: ' + str(c_mean_loss), 'log.txt')
        if it != 0 and it % config.VALIDATE_INTERVAL == 0: # actually test
            solver.test_nets[0].save('./result/tmp.caffemodel')
            write_log('Validating...', 'log.txt')
            test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.GPU_ID, 'val', it=it)
            write_log('Iteration: ' + str(it), 'log.txt')
            write_log('Test loss: ' + str(test_loss), 'log.txt')
            write_log('Overall Accuracy: ' + str(acc_overall), 'log.txt')
            write_log('Per Question Type Accuracy is the following:', 'log.txt')
            for quesType in acc_per_ques:
                write_log("%s : %.02f" % (quesType, acc_per_ques[quesType]), 'log.txt')
            write_log('Per Answer Type Accuracy is the following:', 'log.txt')
            for ansType in acc_per_ans:
                write_log("%s : %.02f" % (ansType, acc_per_ans[ansType]), 'log.txt')
            # results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans])
            # best_result_idx = np.array([x[3] for x in results]).argmax()
            # write_log('Best accuracy of ' + str(results[best_result_idx][3]) + ' was at iteration ' + str(results[best_result_idx][0]), 'log.txt')
            # drawgraph(results)

if __name__ == '__main__':
    main()
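
The bookkeeping in make_answer_vocab above boils down to keeping the vocab_size most frequent answers; a self-contained toy illustration (hypothetical data, with ids assigned in a different order than the function's frequency sort):

    from collections import Counter

    answers = ['yes', 'no', 'yes', '2', '2', 'yes', 'no', 'red']  # toy data
    vocab_size = 3
    counts = Counter(answers)
    adict = {}
    for i, (ans, _) in enumerate(counts.most_common(vocab_size)):
        adict[ans] = i
    # keeps 'yes', 'no' and '2'; 'red' (count 1) falls outside the vocabulary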
--------------------------------------------------------------------------------
/CNN Inception (word)/visualize_tools.py:
--------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import json
import re
import shutil
from PIL import Image
from PIL import ImageFont, ImageDraw

import caffe
from caffe import layers as L
from caffe import params as P

from vqa_data_provider_layer import VQADataProvider
from vqa_data_provider_layer import VQADataProviderLayer

import config
sys.path.append(config.VQA_TOOLS_PATH)
sys.path.append(config.VQA_EVAL_TOOLS_PATH)

from vqaTools.vqa import VQA
from vqaEvaluation.vqaEval import VQAEval

from write_to_log import write_log

def visualize_failures(stat_list, mode):

    def save_qtype(qtype_list, save_filename, mode):

        if mode == 'val':
            savepath = os.path.join('./eval', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014'
        elif mode == 'test-dev':
            savepath = os.path.join('./test-dev', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        elif mode == 'test':
            savepath = os.path.join('./test', save_filename)
            # TODO
            img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
        else:
            raise Exception('Unsupported mode')
        if os.path.exists(savepath): shutil.rmtree(savepath)
        if not os.path.exists(savepath): os.makedirs(savepath)

        for qt in qtype_list:
            count = 0
            for t_question in stat_list:
                #print count, t_question
                if count < 40/len(qtype_list):
                    t_question_list = t_question['q_list']
                    # save only questions whose first two words match this question type
                    if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
                        saveflag = True
                    else:
                        saveflag = False

                    if saveflag:
                        t_iid = t_question['iid']
                        if mode == 'val':
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
                        elif mode in ('test-dev', 'test'):
                            t_img = Image.open(os.path.join(img_pre, \
                                'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))

                        # for caption
                        #print t_iid
                        #annIds = caps.getAnnIds(t_iid)
                        #anns = caps.loadAnns(annIds)
                        #cap_list = [ann['caption'] for ann in anns]
                        ans_list = t_question['ans_list']
                        draw = ImageDraw.Draw(t_img)
                        for i in range(len(ans_list)):
                            try:
                                draw.text((10, 10*i), str(ans_list[i]))
                            except Exception:
                                pass

                        ans = t_question['answer']
                        pred = t_question['pred']
                        if ans == -1:
                            pre = ''
                        elif ans == pred:
                            pre = 'correct '
                        else:
                            pre = 'failure '
                        #print ' aaa ', ans, pred
                        ans = re.sub('/', ' ', str(ans))
                        pred = re.sub('/', ' ', str(pred))
                        img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
                            str(ans) + ' p_' + str(pred) + '.png'
                        count += 1
                        write_log(os.path.join(savepath, img_title), 'visualize_log.txt')
                        t_img.save(os.path.join(savepath, img_title))

    write_log('saving colors', 'visualize_log.txt')
    qt_color_list = [['what','color']]
    save_qtype(qt_color_list, 'colors', mode)

    write_log('saving what is', 'visualize_log.txt')
    qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
    save_qtype(qt_whatis_list, 'whatis', mode)

    write_log('saving is', 'visualize_log.txt')
    qt_is_list = [['is','the'], ['is','this'],['is','there']]
    save_qtype(qt_is_list, 'is', mode)

    write_log('saving how many', 'visualize_log.txt')
    qt_howmany_list = [['how','many']]
    save_qtype(qt_howmany_list, 'howmany', mode)
a_' + \ 99 | str(ans) + ' p_' + str(pred) + '.png' 100 | count += 1 101 | write_log(os.path.join(savepath,img_title), 'visualize_log.txt') 102 | t_img.save(os.path.join(savepath,img_title)) 103 | 104 | write_log('saving colors', 'visualize_log.txt') 105 | qt_color_list = [['what','color']] 106 | save_qtype(qt_color_list, 'colors', mode) 107 | 108 | write_log('saving what is', 'visualize_log.txt') 109 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 110 | save_qtype(qt_whatis_list, 'whatis', mode) 111 | 112 | write_log('saving is', 'visualize_log.txt') 113 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 114 | save_qtype(qt_is_list, 'is', mode) 115 | 116 | write_log('saving how many', 'visualize_log.txt') 117 | qt_howmany_list =[['how','many']] 118 | save_qtype(qt_howmany_list, 'howmany', mode) 119 | 120 | def exec_validation(device_id, mode, it='', visualize=False): 121 | 122 | caffe.set_device(device_id) 123 | caffe.set_mode_gpu() 124 | net = caffe.Net('./result/proto_test.prototxt',\ 125 | './result/tmp.caffemodel',\ 126 | caffe.TEST) 127 | 128 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE) 129 | total_questions = len(dp.getQuesIds()) 130 | epoch = 0 131 | 132 | pred_list = [] 133 | testloss_list = [] 134 | stat_list = [] 135 | 136 | while epoch == 0: 137 | # t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 138 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 139 | net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0)) 140 | net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0)) 141 | net.blobs['img_feature'].data[...] = t_img_feature 142 | net.blobs['label'].data[...] = t_answer 143 | # net.blobs['glove'].data[...] 
= t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 146 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 147 | testloss_list.append(net.blobs['loss'].data) 148 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 149 | pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 150 | if visualize: 151 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 152 | if mode == 'test-dev' or 'test': 153 | ans_str = '' 154 | ans_list = ['']*10 155 | else: 156 | ans_str = dp.vec_to_answer(ans) 157 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 158 | stat_list.append({\ 159 | 'qid' : qid, 160 | 'q_list' : q_list, 161 | 'iid' : iid, 162 | 'answer': ans_str, 163 | 'ans_list': ans_list, 164 | 'pred' : pred }) 165 | # percent = 100 * float(len(pred_list)) / total_questions 166 | # sys.stdout.write('\r' + ('%.2f' % percent) + '%') 167 | # sys.stdout.flush() 168 | 169 | 170 | 171 | mean_testloss = np.array(testloss_list).mean() 172 | 173 | if mode == 'val': 174 | valFile = './result/val2014_resfile' 175 | with open(valFile, 'w') as f: 176 | json.dump(pred_list, f) 177 | if visualize: 178 | visualize_failures(stat_list,mode) 179 | annFile = config.DATA_PATHS['val']['ans_file'] 180 | quesFile = config.DATA_PATHS['val']['ques_file'] 181 | vqa = VQA(annFile, quesFile) 182 | vqaRes = vqa.loadRes(valFile, quesFile) 183 | vqaEval = VQAEval(vqa, vqaRes, n=2) 184 | vqaEval.evaluate() 185 | acc_overall = vqaEval.accuracy['overall'] 186 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 187 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 188 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 189 | elif mode == 'test-dev': 190 | filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results' 191 | with open(filename+'.json', 'w') as f: 192 | json.dump(pred_list, f) 193 | if visualize: 194 | visualize_failures(stat_list,mode) 195 | elif mode == 'test': 196 | filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(pred_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | 202 | def drawgraph(results, save_question_type_graphs=False): 203 | # 0:it 204 | # 1:trainloss 205 | # 2:testloss 206 | # 3:oa_acc 207 | # 4:qt_acc 208 | # 5:at_acc 209 | 210 | # training curve 211 | it = np.array([l[0] for l in results]) 212 | loss = np.array([l[1] for l in results]) 213 | valloss = np.array([l[2] for l in results]) 214 | valacc = np.array([l[3] for l in results]) 215 | 216 | fig = plt.figure() 217 | ax1 = fig.add_subplot(111) 218 | ax2 = ax1.twinx() 219 | 220 | ax1.plot(it,loss, color='blue', label='train loss') 221 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 222 | ax2.plot(it,valacc, color='red', label='acc on val') 223 | plt.legend(loc='lower left') 224 | 225 | ax1.set_xlabel('Iterations') 226 | ax1.set_ylabel('Loss Value') 227 | ax2.set_ylabel('Accuracy on Val [%]') 228 | 229 | plt.savefig('./learning_curve max_%2.2f.png'%valacc.max()) 230 | plt.clf() 231 | plt.close("all") 232 | 233 | # question type 234 | it = np.array([l[0] for l in results]) 235 | oa_acc = np.array([l[3] for l in results]) 236 | qt_dic_list = [l[4] for l in results] 237 | 238 | def draw_qt_acc(target_key_list, figname): 239 | fig = plt.figure() 240 | for k in target_key_list: 241 | 
write_log(str(k) + str(type(k)), 'visualize_log.txt') 242 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 243 | plt.plot(it,t_val,label=str(k)) 244 | plt.legend(fontsize='small') 245 | plt.ylim(0,100.) 246 | #plt.legend(prop={'size':6}) 247 | 248 | plt.xlabel('Iterations') 249 | plt.ylabel('Accuracy on Val [%]') 250 | 251 | plt.savefig(figname,dpi=200) 252 | plt.clf() 253 | plt.close("all") 254 | 255 | if save_question_type_graphs: 256 | s_keys = sorted(qt_dic_list[0].keys()) 257 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 258 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 259 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 260 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 261 | draw_qt_acc(['what color is the','what color are the','what color is',\ 262 | 'what color','what is the color of the'],'./qt_color.png') 263 | draw_qt_acc(['how many','how','how many people are',\ 264 | 'how many people are in'],'./qt_number.png') 265 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 266 | 'which'],'./qt_who_why_where_which.png') 267 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 268 | 'is the woman','is this person','what is the woman','is the person',\ 269 | 'what is the person'],'./qt_human.png') 270 | 271 | 272 | -------------------------------------------------------------------------------- /CNN Inception (word)/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /CNN Inception + Bottleneck/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 10 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop 6 | MAX_ITERATIONS = 1000000 7 | PRINT_INTERVAL = 1000 8 | VALIDATE_INTERVAL = 120000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy. 
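# For reference: a minimal sketch of how train_att_bc.py in this directory
# consumes the three interval constants above (log_mean_train_loss is a
# stand-in name for the inline logging code):
#
#     for it in range(MAX_ITERATIONS):                  # 1,000,000 solver steps
#         solver.step(1)
#         if it != 0 and it % PRINT_INTERVAL == 0:      # every 1,000 iters
#             log_mean_train_loss(train_loss, it)       # mean over the window
#         if it != 0 and it % VALIDATE_INTERVAL == 0:   # every 120,000 iters
#             exec_validation(GPU_ID, 'val', it=it)     # full pass over 'val'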
9 | 10 | # what data to use for training 11 | TRAIN_DATA_SPLITS = 'train' 12 | 13 | # what data to use for the vocabulary 14 | QUESTION_VOCAB_SPACE = 'train' 15 | ANSWER_VOCAB_SPACE = 'train' 16 | 17 | # vqa tools - get from https://github.com/VT-vision-lab/VQA 18 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools' 19 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools' 20 | 21 | # location of the data 22 | VQA_PREFIX = '/tempspace/zwang6/VQA/' 23 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/' 24 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/' 25 | 26 | DATA_PATHS = { 27 | 'train': { 28 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json', 29 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json', 30 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_' 31 | }, 32 | 'val': { 33 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json', 34 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json', 35 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_' 36 | }, 37 | 'test-dev': { 38 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json', 39 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 40 | }, 41 | 'test': { 42 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json', 43 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 44 | }, 45 | # TODO it would be nice if genome also followed the same file format as vqa 46 | 'genome': { 47 | 'genome_file': GENOME_PREFIX + '/question_answers_prepro.json', 48 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/' 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /CNN Inception + Bottleneck/qlstm_solver.prototxt: -------------------------------------------------------------------------------- 1 | # The train/test net protocol buffer definition 2 | train_net: "./result/proto_train.prototxt" 3 | test_net: "./result/proto_test.prototxt" 4 | 5 | max_iter: 1000000 6 | display: 5000 7 | snapshot: 5000 8 | snapshot_prefix: "./result/" 9 | 10 | # The base learning rate, momentum and the weight decay of the network. 
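# Worked example: with iter_size: 2 (bottom of this file), gradients from two
# forward/backward passes are accumulated before each update, so the effective
# batch size is iter_size * BATCH_SIZE = 2 * 32 = 64 questions per Adam step.
# The momentum and momentum2 fields below serve as Adam's beta1 and beta2.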
11 | solver_type: ADAM 12 | base_lr: 0.0007 13 | momentum: 0.9 14 | momentum2: 0.999 15 | weight_decay: 0.000 16 | lr_policy: "fixed" 17 | test_iter: 1 18 | test_interval: 10000000 19 | 20 | # accumulate gradients 21 | iter_size: 2 22 | -------------------------------------------------------------------------------- /CNN Inception + Bottleneck/train_att_bc.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import os 4 | import sys 5 | import numpy as np 6 | import json 7 | import matplotlib.pyplot as plt 8 | from write_to_log import write_log 9 | 10 | import caffe 11 | from caffe import layers as L 12 | from caffe import params as P 13 | 14 | from vqa_data_provider_layer import VQADataProvider 15 | from visualize_tools import exec_validation, drawgraph 16 | import config 17 | 18 | 19 | def qlstm(mode, batchsize, T, question_vocab_size): 20 | n = caffe.NetSpec() 21 | mode_str = json.dumps({'mode':mode, 'batchsize':batchsize}) 22 | # n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\ 23 | # module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 ) 24 | n.data, n.cont, n.img_feature, n.label = L.Python(\ 25 | module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=4 ) 26 | 27 | # word embedding 28 | n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \ 29 | weight_filler=dict(type='uniform',min=-0.08,max=0.08)) 30 | # n.embed = L.TanH(n.embed_ba) 31 | n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(dict(axis=0))) 32 | n.embed_scale_resh = L.Reshape(n.embed_scale,\ 33 | reshape_param=dict(\ 34 | shape=dict(dim=[batchsize,1,T,-1]))) 35 | 36 | # convolution 37 | n.word_feature_3_1 = L.Convolution(n.embed_scale_resh, kernel_h=1, kernel_w=300, stride=1, num_output=256, pad_h=0, pad_w=0, weight_filler=dict(type='xavier')) 38 | n.word_relu_3_1_r = L.ReLU(n.word_feature_3_1) 39 | n.word_feature_3_2 = L.Convolution(n.word_relu_3_1_r, kernel_h=3, kernel_w=1, stride=1, num_output=256, pad_h=1, pad_w=0, weight_filler=dict(type='xavier')) 40 | n.word_relu_3_2_r = L.ReLU(n.word_feature_3_2) 41 | n.word_feature_3 = L.Convolution(n.word_relu_3_2_r, kernel_h=1, kernel_w=1, stride=1, num_output=1024, pad_h=0, pad_w=0, weight_filler=dict(type='xavier')) 42 | 43 | n.word_feature_5_1 = L.Convolution(n.embed_scale_resh, kernel_h=1, kernel_w=300, stride=1, num_output=256, pad_h=0, pad_w=0, weight_filler=dict(type='xavier')) 44 | n.word_relu_5_1_r = L.ReLU(n.word_feature_5_1) 45 | n.word_feature_5_2 = L.Convolution(n.word_relu_5_1_r, kernel_h=5, kernel_w=1, stride=1, num_output=256, pad_h=2, pad_w=0, weight_filler=dict(type='xavier')) 46 | n.word_relu_5_2_r = L.ReLU(n.word_feature_5_2) 47 | n.word_feature_5 = L.Convolution(n.word_relu_5_2_r, kernel_h=1, kernel_w=1, stride=1, num_output=1024, pad_h=0, pad_w=0, weight_filler=dict(type='xavier')) 48 | 49 | n.word_relu_3 = L.ReLU(n.word_feature_3) 50 | n.word_relu_5 = L.ReLU(n.word_feature_5) 51 | 52 | n.word_vec_3 = L.Pooling(n.word_relu_3, kernel_h=T, kernel_w=1, stride=T, pool=P.Pooling.MAX) 53 | n.word_vec_5 = L.Pooling(n.word_relu_5, kernel_h=T, kernel_w=1, stride=T, pool=P.Pooling.MAX) 54 | 55 | word_vec = [n.word_vec_3, n.word_vec_5] 56 | n.concat_vec = L.Concat(*word_vec, concat_param={'axis': 1}) # N x 2*d_w x 1 x 1 57 | n.concat_vec_dropped = L.Dropout(n.concat_vec,dropout_param={'dropout_ratio':0.5}) 58 | 59 | n.q_emb_tanh_droped_resh_tiled_1 = 
L.Tile(n.concat_vec_dropped, axis=2, tiles=14) 60 | n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14) 61 | n.i_emb_tanh_droped_resh = L.Reshape(n.img_feature,reshape_param=dict(shape=dict(dim=[-1,2048,14,14]))) 62 | n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh, compact_bilinear_param=dict(num_output=16000,sum_pool=False)) 63 | n.blcf_sign_sqrt = L.SignedSqrt(n.blcf) 64 | n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt) 65 | n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,dropout_param={'dropout_ratio':0.1}) 66 | 67 | # multi-channel attention 68 | n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier')) 69 | n.att_conv1_relu = L.ReLU(n.att_conv1) 70 | n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0, weight_filler=dict(type='xavier')) 71 | n.att_reshaped = L.Reshape(n.att_conv2,reshape_param=dict(shape=dict(dim=[-1,2,14*14]))) 72 | n.att_softmax = L.Softmax(n.att_reshaped, axis=2) 73 | n.att = L.Reshape(n.att_softmax,reshape_param=dict(shape=dict(dim=[-1,2,14,14]))) 74 | att_maps = L.Slice(n.att, ntop=2, slice_param={'axis':1}) 75 | n.att_map0 = att_maps[0] 76 | n.att_map1 = att_maps[1] 77 | dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1) 78 | n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy) 79 | n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy) 80 | n.att_feature0_resh = L.Reshape(n.att_feature0, reshape_param=dict(shape=dict(dim=[-1,2048]))) 81 | n.att_feature1_resh = L.Reshape(n.att_feature1, reshape_param=dict(shape=dict(dim=[-1,2048]))) 82 | n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh) 83 | 84 | # merge attention and lstm with compact bilinear pooling 85 | n.att_feature_resh = L.Reshape(n.att_feature, reshape_param=dict(shape=dict(dim=[-1,4096,1,1]))) 86 | #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1]))) 87 | n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.concat_vec_dropped, 88 | compact_bilinear_param=dict(num_output=16000,sum_pool=False)) 89 | n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm) 90 | n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt) 91 | 92 | n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2, dropout_param={'dropout_ratio':0.1}) 93 | n.bc_dropped_resh = L.Reshape(n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000]))) 94 | 95 | n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000, weight_filler=dict(type='xavier')) 96 | n.loss = L.SoftmaxWithLoss(n.prediction, n.label) 97 | return n.to_proto() 98 | 99 | def make_answer_vocab(adic, vocab_size): 100 | """ 101 | Returns a dictionary that maps words to indices. 
102 | """ 103 | adict = {'':0} 104 | nadict = {'':1000000} 105 | vid = 1 106 | for qid in adic.keys(): 107 | answer_obj = adic[qid] 108 | answer_list = [ans['answer'] for ans in answer_obj] 109 | 110 | for q_ans in answer_list: 111 | # create dict 112 | if adict.has_key(q_ans): 113 | nadict[q_ans] += 1 114 | else: 115 | nadict[q_ans] = 1 116 | adict[q_ans] = vid 117 | vid +=1 118 | 119 | # debug 120 | nalist = [] 121 | for k,v in sorted(nadict.items(), key=lambda x:x[1]): 122 | nalist.append((k,v)) 123 | 124 | # remove words that appear less than once 125 | n_del_ans = 0 126 | n_valid_ans = 0 127 | adict_nid = {} 128 | for i, w in enumerate(nalist[:-vocab_size]): 129 | del adict[w[0]] 130 | n_del_ans += w[1] 131 | for i, w in enumerate(nalist[-vocab_size:]): 132 | n_valid_ans += w[1] 133 | adict_nid[w[0]] = i 134 | 135 | return adict_nid 136 | 137 | def make_question_vocab(qdic): 138 | """ 139 | Returns a dictionary that maps words to indices. 140 | """ 141 | vdict = {'':0} 142 | vid = 1 143 | for qid in qdic.keys(): 144 | # sequence to list 145 | q_str = qdic[qid]['qstr'] 146 | q_list = VQADataProvider.seq_to_list(q_str) 147 | 148 | # create dict 149 | for w in q_list: 150 | if not vdict.has_key(w): 151 | vdict[w] = vid 152 | vid +=1 153 | 154 | return vdict 155 | 156 | def make_vocab_files(): 157 | """ 158 | Produce the question and answer vocabulary files. 159 | """ 160 | write_log('making question vocab... ' + config.QUESTION_VOCAB_SPACE, 'log.txt') 161 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE) 162 | question_vocab = make_question_vocab(qdic) 163 | write_log('making answer vocab... ' + config.ANSWER_VOCAB_SPACE, 'log.txt') 164 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE) 165 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS) 166 | return question_vocab, answer_vocab 167 | 168 | def main(): 169 | if not os.path.exists('./result'): 170 | os.makedirs('./result') 171 | 172 | question_vocab, answer_vocab = {}, {} 173 | if os.path.exists('./result/vdict.json') and os.path.exists('./result/adict.json'): 174 | write_log('restoring vocab', 'log.txt') 175 | with open('./result/vdict.json','r') as f: 176 | question_vocab = json.load(f) 177 | with open('./result/adict.json','r') as f: 178 | answer_vocab = json.load(f) 179 | else: 180 | question_vocab, answer_vocab = make_vocab_files() 181 | with open('./result/vdict.json','w') as f: 182 | json.dump(question_vocab, f) 183 | with open('./result/adict.json','w') as f: 184 | json.dump(answer_vocab, f) 185 | 186 | write_log('question vocab size: '+ str(len(question_vocab)), 'log.txt') 187 | write_log('answer vocab size: '+ str(len(answer_vocab)), 'log.txt') 188 | 189 | with open('./result/proto_train.prototxt', 'w') as f: 190 | f.write(str(qlstm(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \ 191 | config.MAX_WORDS_IN_QUESTION, len(question_vocab)))) 192 | 193 | with open('./result/proto_test.prototxt', 'w') as f: 194 | f.write(str(qlstm('val', config.VAL_BATCH_SIZE, \ 195 | config.MAX_WORDS_IN_QUESTION, len(question_vocab)))) 196 | 197 | caffe.set_device(config.GPU_ID) 198 | caffe.set_mode_gpu() 199 | solver = caffe.get_solver('./qlstm_solver.prototxt') 200 | 201 | train_loss = np.zeros(config.MAX_ITERATIONS) 202 | # results = [] 203 | 204 | for it in range(config.MAX_ITERATIONS): 205 | solver.step(1) 206 | 207 | # store the train loss 208 | train_loss[it] = solver.net.blobs['loss'].data 209 | 210 | if it != 0 and it % config.PRINT_INTERVAL == 0: 211 | 
write_log('------------------------------------', 'log.txt') 212 | write_log('Iteration: ' + str(it), 'log.txt') 213 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean() 214 | write_log('Train loss: ' + str(c_mean_loss), 'log.txt') 215 | if it != 0 and it % config.VALIDATE_INTERVAL == 0: # actually test 216 | solver.test_nets[0].save('./result/tmp.caffemodel') 217 | write_log('Validating...', 'log.txt') 218 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.GPU_ID, 'val', it=it) 219 | write_log('Iteration: ' + str(it), 'log.txt') 220 | write_log('Test loss: ' + str(test_loss), 'log.txt') 221 | write_log('Overall Accuracy: ' + str(acc_overall), 'log.txt') 222 | write_log('Per Question Type Accuracy is the following:', 'log.txt') 223 | for quesType in acc_per_ques: 224 | write_log("%s : %.02f" % (quesType, acc_per_ques[quesType]), 'log.txt') 225 | write_log('Per Answer Type Accuracy is the following:', 'log.txt') 226 | for ansType in acc_per_ans: 227 | write_log("%s : %.02f" % (ansType, acc_per_ans[ansType]), 'log.txt') 228 | # results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans]) 229 | # best_result_idx = np.array([x[3] for x in results]).argmax() 230 | # write_log('Best accuracy of ' + str(results[best_result_idx][3]) + ' was at iteration ' + str(results[best_result_idx][0]), 'log.txt') 231 | # drawgraph(results) 232 | 233 | if __name__ == '__main__': 234 | main() 235 | -------------------------------------------------------------------------------- /CNN Inception + Bottleneck/visualize_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_provider_layer import VQADataProvider 16 | from vqa_data_provider_layer import VQADataProviderLayer 17 | 18 | import config 19 | sys.path.append(config.VQA_TOOLS_PATH) 20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 21 | 22 | from vqaTools.vqa import VQA 23 | from vqaEvaluation.vqaEval import VQAEval 24 | 25 | from write_to_log import write_log 26 | 27 | def visualize_failures(stat_list,mode): 28 | 29 | def save_qtype(qtype_list, save_filename, mode): 30 | 31 | if mode == 'val': 32 | savepath = os.path.join('./eval', save_filename) 33 | # TODO 34 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014' 35 | elif mode == 'test-dev': 36 | savepath = os.path.join('./test-dev', save_filename) 37 | # TODO 38 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 39 | elif mode == 'test': 40 | savepath = os.path.join('./test', save_filename) 41 | # TODO 42 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 43 | else: 44 | raise Exception('Unsupported mode') 45 | if os.path.exists(savepath): shutil.rmtree(savepath) 46 | if not os.path.exists(savepath): os.makedirs(savepath) 47 | 48 | for qt in qtype_list: 49 | count = 0 50 | for t_question in stat_list: 51 | #print count, t_question 52 | if count < 40/len(qtype_list): 53 | t_question_list = t_question['q_list'] 54 | saveflag = False 55 | #print 'debug****************************' 56 | #print qt 57 | #print t_question_list 58 | #print t_question_list[0] == qt[0] 59 | #print t_question_list[1] == qt[1] 60 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 61 | 
saveflag = True 62 | else: 63 | saveflag = False 64 | 65 | if saveflag == True: 66 | t_iid = t_question['iid'] 67 | if mode == 'val': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 70 | elif mode == 'test-dev' or 'test': 71 | t_img = Image.open(os.path.join(img_pre, \ 72 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 73 | 74 | # for caption 75 | #print t_iid 76 | #annIds = caps.getAnnIds(t_iid) 77 | #anns = caps.loadAnns(annIds) 78 | #cap_list = [ann['caption'] for ann in anns] 79 | ans_list = t_question['ans_list'] 80 | draw = ImageDraw.Draw(t_img) 81 | for i in range(len(ans_list)): 82 | try: 83 | draw.text((10,10*i), str(ans_list[i])) 84 | except: 85 | pass 86 | 87 | ans = t_question['answer'] 88 | pred = t_question['pred'] 89 | if ans == -1: 90 | pre = '' 91 | elif ans == pred: 92 | pre = 'correct ' 93 | else: 94 | pre = 'failure ' 95 | #print ' aaa ', ans, pred 96 | ans = re.sub( '/', ' ', str(ans)) 97 | pred = re.sub( '/', ' ', str(pred)) 98 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \ 99 | str(ans) + ' p_' + str(pred) + '.png' 100 | count += 1 101 | write_log(os.path.join(savepath,img_title), 'visualize_log.txt') 102 | t_img.save(os.path.join(savepath,img_title)) 103 | 104 | write_log('saving colors', 'visualize_log.txt') 105 | qt_color_list = [['what','color']] 106 | save_qtype(qt_color_list, 'colors', mode) 107 | 108 | write_log('saving what is', 'visualize_log.txt') 109 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 110 | save_qtype(qt_whatis_list, 'whatis', mode) 111 | 112 | write_log('saving is', 'visualize_log.txt') 113 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 114 | save_qtype(qt_is_list, 'is', mode) 115 | 116 | write_log('saving how many', 'visualize_log.txt') 117 | qt_howmany_list =[['how','many']] 118 | save_qtype(qt_howmany_list, 'howmany', mode) 119 | 120 | def exec_validation(device_id, mode, it='', visualize=False): 121 | 122 | caffe.set_device(device_id) 123 | caffe.set_mode_gpu() 124 | net = caffe.Net('./result/proto_test.prototxt',\ 125 | './result/tmp.caffemodel',\ 126 | caffe.TEST) 127 | 128 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE) 129 | total_questions = len(dp.getQuesIds()) 130 | epoch = 0 131 | 132 | pred_list = [] 133 | testloss_list = [] 134 | stat_list = [] 135 | 136 | while epoch == 0: 137 | # t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 138 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 139 | net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0)) 140 | net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0)) 141 | net.blobs['img_feature'].data[...] = t_img_feature 142 | net.blobs['label'].data[...] = t_answer 143 | # net.blobs['glove'].data[...] 
= t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 146 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 147 | testloss_list.append(net.blobs['loss'].data) 148 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 149 | pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 150 | if visualize: 151 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 152 | if mode == 'test-dev' or 'test': 153 | ans_str = '' 154 | ans_list = ['']*10 155 | else: 156 | ans_str = dp.vec_to_answer(ans) 157 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 158 | stat_list.append({\ 159 | 'qid' : qid, 160 | 'q_list' : q_list, 161 | 'iid' : iid, 162 | 'answer': ans_str, 163 | 'ans_list': ans_list, 164 | 'pred' : pred }) 165 | # percent = 100 * float(len(pred_list)) / total_questions 166 | # sys.stdout.write('\r' + ('%.2f' % percent) + '%') 167 | # sys.stdout.flush() 168 | 169 | 170 | 171 | mean_testloss = np.array(testloss_list).mean() 172 | 173 | if mode == 'val': 174 | valFile = './result/val2014_resfile' 175 | with open(valFile, 'w') as f: 176 | json.dump(pred_list, f) 177 | if visualize: 178 | visualize_failures(stat_list,mode) 179 | annFile = config.DATA_PATHS['val']['ans_file'] 180 | quesFile = config.DATA_PATHS['val']['ques_file'] 181 | vqa = VQA(annFile, quesFile) 182 | vqaRes = vqa.loadRes(valFile, quesFile) 183 | vqaEval = VQAEval(vqa, vqaRes, n=2) 184 | vqaEval.evaluate() 185 | acc_overall = vqaEval.accuracy['overall'] 186 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 187 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 188 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 189 | elif mode == 'test-dev': 190 | filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results' 191 | with open(filename+'.json', 'w') as f: 192 | json.dump(pred_list, f) 193 | if visualize: 194 | visualize_failures(stat_list,mode) 195 | elif mode == 'test': 196 | filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(pred_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | 202 | def drawgraph(results, save_question_type_graphs=False): 203 | # 0:it 204 | # 1:trainloss 205 | # 2:testloss 206 | # 3:oa_acc 207 | # 4:qt_acc 208 | # 5:at_acc 209 | 210 | # training curve 211 | it = np.array([l[0] for l in results]) 212 | loss = np.array([l[1] for l in results]) 213 | valloss = np.array([l[2] for l in results]) 214 | valacc = np.array([l[3] for l in results]) 215 | 216 | fig = plt.figure() 217 | ax1 = fig.add_subplot(111) 218 | ax2 = ax1.twinx() 219 | 220 | ax1.plot(it,loss, color='blue', label='train loss') 221 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 222 | ax2.plot(it,valacc, color='red', label='acc on val') 223 | plt.legend(loc='lower left') 224 | 225 | ax1.set_xlabel('Iterations') 226 | ax1.set_ylabel('Loss Value') 227 | ax2.set_ylabel('Accuracy on Val [%]') 228 | 229 | plt.savefig('./learning_curve max_%2.2f.png'%valacc.max()) 230 | plt.clf() 231 | plt.close("all") 232 | 233 | # question type 234 | it = np.array([l[0] for l in results]) 235 | oa_acc = np.array([l[3] for l in results]) 236 | qt_dic_list = [l[4] for l in results] 237 | 238 | def draw_qt_acc(target_key_list, figname): 239 | fig = plt.figure() 240 | for k in target_key_list: 241 | 
write_log(str(k) + str(type(k)), 'visualize_log.txt') 242 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 243 | plt.plot(it,t_val,label=str(k)) 244 | plt.legend(fontsize='small') 245 | plt.ylim(0,100.) 246 | #plt.legend(prop={'size':6}) 247 | 248 | plt.xlabel('Iterations') 249 | plt.ylabel('Accuracy on Val [%]') 250 | 251 | plt.savefig(figname,dpi=200) 252 | plt.clf() 253 | plt.close("all") 254 | 255 | if save_question_type_graphs: 256 | s_keys = sorted(qt_dic_list[0].keys()) 257 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 258 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 259 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 260 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 261 | draw_qt_acc(['what color is the','what color are the','what color is',\ 262 | 'what color','what is the color of the'],'./qt_color.png') 263 | draw_qt_acc(['how many','how','how many people are',\ 264 | 'how many people are in'],'./qt_number.png') 265 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 266 | 'which'],'./qt_who_why_where_which.png') 267 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 268 | 'is the woman','is this person','what is the woman','is the person',\ 269 | 'what is the person'],'./qt_human.png') 270 | 271 | 272 | -------------------------------------------------------------------------------- /CNN Inception + Bottleneck/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /CNN Inception + Gate (tanh)/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 10 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop 6 | MAX_ITERATIONS = 1000000 7 | PRINT_INTERVAL = 1000 8 | VALIDATE_INTERVAL = 110000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy. 
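# For reference, a sketch of what NUM_OUTPUT_UNITS controls: make_answer_vocab
# in train_att_bc.py counts answer frequencies over the training annotations
# and keeps only the 3,000 most frequent answers as output classes, roughly:
#
#     nalist = sorted(nadict.items(), key=lambda x: x[1])   # rare -> frequent
#     kept = nalist[-NUM_OUTPUT_UNITS:]                     # top 3,000 answers
#     adict_nid = {ans: i for i, (ans, cnt) in enumerate(kept)}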
9 | 10 | # what data to use for training 11 | TRAIN_DATA_SPLITS = 'train' 12 | 13 | # what data to use for the vocabulary 14 | QUESTION_VOCAB_SPACE = 'train' 15 | ANSWER_VOCAB_SPACE = 'train' 16 | 17 | # vqa tools - get from https://github.com/VT-vision-lab/VQA 18 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools' 19 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools' 20 | 21 | # location of the data 22 | VQA_PREFIX = '/tempspace/zwang6/VQA/' 23 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/' 24 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/' 25 | 26 | DATA_PATHS = { 27 | 'train': { 28 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json', 29 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json', 30 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_' 31 | }, 32 | 'val': { 33 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json', 34 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json', 35 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_' 36 | }, 37 | 'test-dev': { 38 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json', 39 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 40 | }, 41 | 'test': { 42 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json', 43 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 44 | }, 45 | # TODO it would be nice if genome also followed the same file format as vqa 46 | 'genome': { 47 | 'genome_file': GENOME_PREFIX + '/question_answers_prepro.json', 48 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/' 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /CNN Inception + Gate (tanh)/qlstm_solver.prototxt: -------------------------------------------------------------------------------- 1 | # The train/test net protocol buffer definition 2 | train_net: "./result/proto_train.prototxt" 3 | test_net: "./result/proto_test.prototxt" 4 | 5 | max_iter: 1000000 6 | display: 5000 7 | snapshot: 5000 8 | snapshot_prefix: "./result/" 9 | 10 | # The base learning rate, momentum and the weight decay of the network. 
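# Note: lr_policy "fixed" keeps base_lr constant at 0.0007 for the entire run,
# and test_iter / test_interval are set so Caffe's built-in test phase is
# effectively never reached; validation is instead driven manually from
# train_att_bc.py via exec_validation at VALIDATE_INTERVAL.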
11 | solver_type: ADAM 12 | base_lr: 0.0007 13 | momentum: 0.9 14 | momentum2: 0.999 15 | weight_decay: 0.000 16 | lr_policy: "fixed" 17 | test_iter: 1 18 | test_interval: 10000000 19 | 20 | # accumulate gradients 21 | iter_size: 2 22 | -------------------------------------------------------------------------------- /CNN Inception + Gate (tanh)/visualize_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_provider_layer import VQADataProvider 16 | from vqa_data_provider_layer import VQADataProviderLayer 17 | 18 | import config 19 | sys.path.append(config.VQA_TOOLS_PATH) 20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 21 | 22 | from vqaTools.vqa import VQA 23 | from vqaEvaluation.vqaEval import VQAEval 24 | 25 | from write_to_log import write_log 26 | 27 | def visualize_failures(stat_list,mode): 28 | 29 | def save_qtype(qtype_list, save_filename, mode): 30 | 31 | if mode == 'val': 32 | savepath = os.path.join('./eval', save_filename) 33 | # TODO 34 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014' 35 | elif mode == 'test-dev': 36 | savepath = os.path.join('./test-dev', save_filename) 37 | # TODO 38 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 39 | elif mode == 'test': 40 | savepath = os.path.join('./test', save_filename) 41 | # TODO 42 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 43 | else: 44 | raise Exception('Unsupported mode') 45 | if os.path.exists(savepath): shutil.rmtree(savepath) 46 | if not os.path.exists(savepath): os.makedirs(savepath) 47 | 48 | for qt in qtype_list: 49 | count = 0 50 | for t_question in stat_list: 51 | #print count, t_question 52 | if count < 40/len(qtype_list): 53 | t_question_list = t_question['q_list'] 54 | saveflag = False 55 | #print 'debug****************************' 56 | #print qt 57 | #print t_question_list 58 | #print t_question_list[0] == qt[0] 59 | #print t_question_list[1] == qt[1] 60 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 61 | saveflag = True 62 | else: 63 | saveflag = False 64 | 65 | if saveflag == True: 66 | t_iid = t_question['iid'] 67 | if mode == 'val': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 70 | elif mode == 'test-dev' or 'test': 71 | t_img = Image.open(os.path.join(img_pre, \ 72 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 73 | 74 | # for caption 75 | #print t_iid 76 | #annIds = caps.getAnnIds(t_iid) 77 | #anns = caps.loadAnns(annIds) 78 | #cap_list = [ann['caption'] for ann in anns] 79 | ans_list = t_question['ans_list'] 80 | draw = ImageDraw.Draw(t_img) 81 | for i in range(len(ans_list)): 82 | try: 83 | draw.text((10,10*i), str(ans_list[i])) 84 | except: 85 | pass 86 | 87 | ans = t_question['answer'] 88 | pred = t_question['pred'] 89 | if ans == -1: 90 | pre = '' 91 | elif ans == pred: 92 | pre = 'correct ' 93 | else: 94 | pre = 'failure ' 95 | #print ' aaa ', ans, pred 96 | ans = re.sub( '/', ' ', str(ans)) 97 | pred = re.sub( '/', ' ', str(pred)) 98 | img_title = pre + str(' '.join(t_question_list)) + '. 
a_' + \ 99 | str(ans) + ' p_' + str(pred) + '.png' 100 | count += 1 101 | write_log(os.path.join(savepath,img_title), 'visualize_log.txt') 102 | t_img.save(os.path.join(savepath,img_title)) 103 | 104 | write_log('saving colors', 'visualize_log.txt') 105 | qt_color_list = [['what','color']] 106 | save_qtype(qt_color_list, 'colors', mode) 107 | 108 | write_log('saving what is', 'visualize_log.txt') 109 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 110 | save_qtype(qt_whatis_list, 'whatis', mode) 111 | 112 | write_log('saving is', 'visualize_log.txt') 113 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 114 | save_qtype(qt_is_list, 'is', mode) 115 | 116 | write_log('saving how many', 'visualize_log.txt') 117 | qt_howmany_list =[['how','many']] 118 | save_qtype(qt_howmany_list, 'howmany', mode) 119 | 120 | def exec_validation(device_id, mode, it='', visualize=False): 121 | 122 | caffe.set_device(device_id) 123 | caffe.set_mode_gpu() 124 | net = caffe.Net('./result/proto_test.prototxt',\ 125 | './result/tmp.caffemodel',\ 126 | caffe.TEST) 127 | 128 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE) 129 | total_questions = len(dp.getQuesIds()) 130 | epoch = 0 131 | 132 | pred_list = [] 133 | testloss_list = [] 134 | stat_list = [] 135 | 136 | while epoch == 0: 137 | # t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 138 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 139 | net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0)) 140 | net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0)) 141 | net.blobs['img_feature'].data[...] = t_img_feature 142 | net.blobs['label'].data[...] = t_answer 143 | # net.blobs['glove'].data[...] 
= t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 146 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 147 | testloss_list.append(net.blobs['loss'].data) 148 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 149 | pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 150 | if visualize: 151 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 152 | if mode == 'test-dev' or 'test': 153 | ans_str = '' 154 | ans_list = ['']*10 155 | else: 156 | ans_str = dp.vec_to_answer(ans) 157 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 158 | stat_list.append({\ 159 | 'qid' : qid, 160 | 'q_list' : q_list, 161 | 'iid' : iid, 162 | 'answer': ans_str, 163 | 'ans_list': ans_list, 164 | 'pred' : pred }) 165 | # percent = 100 * float(len(pred_list)) / total_questions 166 | # sys.stdout.write('\r' + ('%.2f' % percent) + '%') 167 | # sys.stdout.flush() 168 | 169 | 170 | 171 | mean_testloss = np.array(testloss_list).mean() 172 | 173 | if mode == 'val': 174 | valFile = './result/val2014_resfile' 175 | with open(valFile, 'w') as f: 176 | json.dump(pred_list, f) 177 | if visualize: 178 | visualize_failures(stat_list,mode) 179 | annFile = config.DATA_PATHS['val']['ans_file'] 180 | quesFile = config.DATA_PATHS['val']['ques_file'] 181 | vqa = VQA(annFile, quesFile) 182 | vqaRes = vqa.loadRes(valFile, quesFile) 183 | vqaEval = VQAEval(vqa, vqaRes, n=2) 184 | vqaEval.evaluate() 185 | acc_overall = vqaEval.accuracy['overall'] 186 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 187 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 188 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 189 | elif mode == 'test-dev': 190 | filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results' 191 | with open(filename+'.json', 'w') as f: 192 | json.dump(pred_list, f) 193 | if visualize: 194 | visualize_failures(stat_list,mode) 195 | elif mode == 'test': 196 | filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(pred_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | 202 | def drawgraph(results, save_question_type_graphs=False): 203 | # 0:it 204 | # 1:trainloss 205 | # 2:testloss 206 | # 3:oa_acc 207 | # 4:qt_acc 208 | # 5:at_acc 209 | 210 | # training curve 211 | it = np.array([l[0] for l in results]) 212 | loss = np.array([l[1] for l in results]) 213 | valloss = np.array([l[2] for l in results]) 214 | valacc = np.array([l[3] for l in results]) 215 | 216 | fig = plt.figure() 217 | ax1 = fig.add_subplot(111) 218 | ax2 = ax1.twinx() 219 | 220 | ax1.plot(it,loss, color='blue', label='train loss') 221 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 222 | ax2.plot(it,valacc, color='red', label='acc on val') 223 | plt.legend(loc='lower left') 224 | 225 | ax1.set_xlabel('Iterations') 226 | ax1.set_ylabel('Loss Value') 227 | ax2.set_ylabel('Accuracy on Val [%]') 228 | 229 | plt.savefig('./learning_curve max_%2.2f.png'%valacc.max()) 230 | plt.clf() 231 | plt.close("all") 232 | 233 | # question type 234 | it = np.array([l[0] for l in results]) 235 | oa_acc = np.array([l[3] for l in results]) 236 | qt_dic_list = [l[4] for l in results] 237 | 238 | def draw_qt_acc(target_key_list, figname): 239 | fig = plt.figure() 240 | for k in target_key_list: 241 | 
write_log(str(k) + str(type(k)), 'visualize_log.txt') 242 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 243 | plt.plot(it,t_val,label=str(k)) 244 | plt.legend(fontsize='small') 245 | plt.ylim(0,100.) 246 | #plt.legend(prop={'size':6}) 247 | 248 | plt.xlabel('Iterations') 249 | plt.ylabel('Accuracy on Val [%]') 250 | 251 | plt.savefig(figname,dpi=200) 252 | plt.clf() 253 | plt.close("all") 254 | 255 | if save_question_type_graphs: 256 | s_keys = sorted(qt_dic_list[0].keys()) 257 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 258 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 259 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 260 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 261 | draw_qt_acc(['what color is the','what color are the','what color is',\ 262 | 'what color','what is the color of the'],'./qt_color.png') 263 | draw_qt_acc(['how many','how','how many people are',\ 264 | 'how many people are in'],'./qt_number.png') 265 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 266 | 'which'],'./qt_who_why_where_which.png') 267 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 268 | 'is the woman','is this person','what is the woman','is the person',\ 269 | 'what is the person'],'./qt_human.png') 270 | 271 | 272 | -------------------------------------------------------------------------------- /CNN Inception + Gate (tanh)/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /CNN Inception + Gate/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 7 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop 6 | MAX_ITERATIONS = 1000000 7 | PRINT_INTERVAL = 1000 8 | VALIDATE_INTERVAL = 150000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy. 
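# For reference, the assumed shape flow for one batch under these settings,
# following the qlstm() network definition used by this repo's train scripts
# (T = MAX_WORDS_IN_QUESTION; padding to fixed length T is assumed):
#
#     data     (32, 22)           word indices, one row per question
#     Embed    (32, 22, 300)      300-d word embeddings
#     Reshape  (32, 1, 22, 300)   one-channel input for the text CNN
#     Conv     kernel 1x300       collapses the embedding axis per word
#     Pool     kernel Tx1, max    one fixed-length vector per question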
9 | 10 | # what data to use for training 11 | TRAIN_DATA_SPLITS = 'train' 12 | 13 | # what data to use for the vocabulary 14 | QUESTION_VOCAB_SPACE = 'train' 15 | ANSWER_VOCAB_SPACE = 'train' 16 | 17 | # vqa tools - get from https://github.com/VT-vision-lab/VQA 18 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools' 19 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools' 20 | 21 | # location of the data 22 | VQA_PREFIX = '/tempspace/zwang6/VQA/' 23 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/' 24 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/' 25 | 26 | DATA_PATHS = { 27 | 'train': { 28 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json', 29 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json', 30 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_' 31 | }, 32 | 'val': { 33 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json', 34 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json', 35 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_' 36 | }, 37 | 'test-dev': { 38 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json', 39 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 40 | }, 41 | 'test': { 42 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json', 43 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 44 | }, 45 | # TODO it would be nice if genome also followed the same file format as vqa 46 | 'genome': { 47 | 'genome_file': GENOME_PREFIX + '/question_answers_prepro.json', 48 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/' 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /CNN Inception + Gate/qlstm_solver.prototxt: -------------------------------------------------------------------------------- 1 | # The train/test net protocol buffer definition 2 | train_net: "./result/proto_train.prototxt" 3 | test_net: "./result/proto_test.prototxt" 4 | 5 | max_iter: 1000000 6 | display: 5000 7 | snapshot: 5000 8 | snapshot_prefix: "./result/" 9 | 10 | # The base learning rate, momentum and the weight decay of the network. 
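# For reference, the standard Adam update that solver_type: ADAM implements
# (lr = base_lr, beta1 = momentum, beta2 = momentum2, eps ~ 1e-8 by default):
#
#     m <- beta1 * m + (1 - beta1) * grad
#     v <- beta2 * v + (1 - beta2) * grad^2
#     w <- w - lr * m_hat / (sqrt(v_hat) + eps)   # m_hat, v_hat bias-corrected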
11 | solver_type: ADAM 12 | base_lr: 0.0007 13 | momentum: 0.9 14 | momentum2: 0.999 15 | weight_decay: 0.000 16 | lr_policy: "fixed" 17 | test_iter: 1 18 | test_interval: 10000000 19 | 20 | # accumulate gradients 21 | iter_size: 2 22 | -------------------------------------------------------------------------------- /CNN Inception + Gate/visualize_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_provider_layer import VQADataProvider 16 | from vqa_data_provider_layer import VQADataProviderLayer 17 | 18 | import config 19 | sys.path.append(config.VQA_TOOLS_PATH) 20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 21 | 22 | from vqaTools.vqa import VQA 23 | from vqaEvaluation.vqaEval import VQAEval 24 | 25 | from write_to_log import write_log 26 | 27 | def visualize_failures(stat_list,mode): 28 | 29 | def save_qtype(qtype_list, save_filename, mode): 30 | 31 | if mode == 'val': 32 | savepath = os.path.join('./eval', save_filename) 33 | # TODO 34 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014' 35 | elif mode == 'test-dev': 36 | savepath = os.path.join('./test-dev', save_filename) 37 | # TODO 38 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 39 | elif mode == 'test': 40 | savepath = os.path.join('./test', save_filename) 41 | # TODO 42 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 43 | else: 44 | raise Exception('Unsupported mode') 45 | if os.path.exists(savepath): shutil.rmtree(savepath) 46 | if not os.path.exists(savepath): os.makedirs(savepath) 47 | 48 | for qt in qtype_list: 49 | count = 0 50 | for t_question in stat_list: 51 | #print count, t_question 52 | if count < 40/len(qtype_list): 53 | t_question_list = t_question['q_list'] 54 | saveflag = False 55 | #print 'debug****************************' 56 | #print qt 57 | #print t_question_list 58 | #print t_question_list[0] == qt[0] 59 | #print t_question_list[1] == qt[1] 60 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 61 | saveflag = True 62 | else: 63 | saveflag = False 64 | 65 | if saveflag == True: 66 | t_iid = t_question['iid'] 67 | if mode == 'val': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 70 | elif mode == 'test-dev' or 'test': 71 | t_img = Image.open(os.path.join(img_pre, \ 72 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 73 | 74 | # for caption 75 | #print t_iid 76 | #annIds = caps.getAnnIds(t_iid) 77 | #anns = caps.loadAnns(annIds) 78 | #cap_list = [ann['caption'] for ann in anns] 79 | ans_list = t_question['ans_list'] 80 | draw = ImageDraw.Draw(t_img) 81 | for i in range(len(ans_list)): 82 | try: 83 | draw.text((10,10*i), str(ans_list[i])) 84 | except: 85 | pass 86 | 87 | ans = t_question['answer'] 88 | pred = t_question['pred'] 89 | if ans == -1: 90 | pre = '' 91 | elif ans == pred: 92 | pre = 'correct ' 93 | else: 94 | pre = 'failure ' 95 | #print ' aaa ', ans, pred 96 | ans = re.sub( '/', ' ', str(ans)) 97 | pred = re.sub( '/', ' ', str(pred)) 98 | img_title = pre + str(' '.join(t_question_list)) + '. 
a_' + \ 99 | str(ans) + ' p_' + str(pred) + '.png' 100 | count += 1 101 | write_log(os.path.join(savepath,img_title), 'visualize_log.txt') 102 | t_img.save(os.path.join(savepath,img_title)) 103 | 104 | write_log('saving colors', 'visualize_log.txt') 105 | qt_color_list = [['what','color']] 106 | save_qtype(qt_color_list, 'colors', mode) 107 | 108 | write_log('saving what is', 'visualize_log.txt') 109 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 110 | save_qtype(qt_whatis_list, 'whatis', mode) 111 | 112 | write_log('saving is', 'visualize_log.txt') 113 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 114 | save_qtype(qt_is_list, 'is', mode) 115 | 116 | write_log('saving how many', 'visualize_log.txt') 117 | qt_howmany_list =[['how','many']] 118 | save_qtype(qt_howmany_list, 'howmany', mode) 119 | 120 | def exec_validation(device_id, mode, it='', visualize=False): 121 | 122 | caffe.set_device(device_id) 123 | caffe.set_mode_gpu() 124 | net = caffe.Net('./result/proto_test.prototxt',\ 125 | './result/tmp.caffemodel',\ 126 | caffe.TEST) 127 | 128 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE) 129 | total_questions = len(dp.getQuesIds()) 130 | epoch = 0 131 | 132 | pred_list = [] 133 | testloss_list = [] 134 | stat_list = [] 135 | 136 | while epoch == 0: 137 | # t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 138 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 139 | net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0)) 140 | net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0)) 141 | net.blobs['img_feature'].data[...] = t_img_feature 142 | net.blobs['label'].data[...] = t_answer 143 | # net.blobs['glove'].data[...] 
= t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 146 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 147 | testloss_list.append(net.blobs['loss'].data) 148 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 149 | pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 150 | if visualize: 151 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 152 | if mode == 'test-dev' or 'test': 153 | ans_str = '' 154 | ans_list = ['']*10 155 | else: 156 | ans_str = dp.vec_to_answer(ans) 157 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 158 | stat_list.append({\ 159 | 'qid' : qid, 160 | 'q_list' : q_list, 161 | 'iid' : iid, 162 | 'answer': ans_str, 163 | 'ans_list': ans_list, 164 | 'pred' : pred }) 165 | # percent = 100 * float(len(pred_list)) / total_questions 166 | # sys.stdout.write('\r' + ('%.2f' % percent) + '%') 167 | # sys.stdout.flush() 168 | 169 | 170 | 171 | mean_testloss = np.array(testloss_list).mean() 172 | 173 | if mode == 'val': 174 | valFile = './result/val2014_resfile' 175 | with open(valFile, 'w') as f: 176 | json.dump(pred_list, f) 177 | if visualize: 178 | visualize_failures(stat_list,mode) 179 | annFile = config.DATA_PATHS['val']['ans_file'] 180 | quesFile = config.DATA_PATHS['val']['ques_file'] 181 | vqa = VQA(annFile, quesFile) 182 | vqaRes = vqa.loadRes(valFile, quesFile) 183 | vqaEval = VQAEval(vqa, vqaRes, n=2) 184 | vqaEval.evaluate() 185 | acc_overall = vqaEval.accuracy['overall'] 186 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 187 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 188 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 189 | elif mode == 'test-dev': 190 | filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results' 191 | with open(filename+'.json', 'w') as f: 192 | json.dump(pred_list, f) 193 | if visualize: 194 | visualize_failures(stat_list,mode) 195 | elif mode == 'test': 196 | filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(pred_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | 202 | def drawgraph(results, save_question_type_graphs=False): 203 | # 0:it 204 | # 1:trainloss 205 | # 2:testloss 206 | # 3:oa_acc 207 | # 4:qt_acc 208 | # 5:at_acc 209 | 210 | # training curve 211 | it = np.array([l[0] for l in results]) 212 | loss = np.array([l[1] for l in results]) 213 | valloss = np.array([l[2] for l in results]) 214 | valacc = np.array([l[3] for l in results]) 215 | 216 | fig = plt.figure() 217 | ax1 = fig.add_subplot(111) 218 | ax2 = ax1.twinx() 219 | 220 | ax1.plot(it,loss, color='blue', label='train loss') 221 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 222 | ax2.plot(it,valacc, color='red', label='acc on val') 223 | plt.legend(loc='lower left') 224 | 225 | ax1.set_xlabel('Iterations') 226 | ax1.set_ylabel('Loss Value') 227 | ax2.set_ylabel('Accuracy on Val [%]') 228 | 229 | plt.savefig('./learning_curve max_%2.2f.png'%valacc.max()) 230 | plt.clf() 231 | plt.close("all") 232 | 233 | # question type 234 | it = np.array([l[0] for l in results]) 235 | oa_acc = np.array([l[3] for l in results]) 236 | qt_dic_list = [l[4] for l in results] 237 | 238 | def draw_qt_acc(target_key_list, figname): 239 | fig = plt.figure() 240 | for k in target_key_list: 241 | 
write_log(str(k) + str(type(k)), 'visualize_log.txt') 242 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 243 | plt.plot(it,t_val,label=str(k)) 244 | plt.legend(fontsize='small') 245 | plt.ylim(0,100.) 246 | #plt.legend(prop={'size':6}) 247 | 248 | plt.xlabel('Iterations') 249 | plt.ylabel('Accuracy on Val [%]') 250 | 251 | plt.savefig(figname,dpi=200) 252 | plt.clf() 253 | plt.close("all") 254 | 255 | if save_question_type_graphs: 256 | s_keys = sorted(qt_dic_list[0].keys()) 257 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 258 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 259 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 260 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 261 | draw_qt_acc(['what color is the','what color are the','what color is',\ 262 | 'what color','what is the color of the'],'./qt_color.png') 263 | draw_qt_acc(['how many','how','how many people are',\ 264 | 'how many people are in'],'./qt_number.png') 265 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 266 | 'which'],'./qt_who_why_where_which.png') 267 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 268 | 'is the woman','is this person','what is the woman','is the person',\ 269 | 'what is the person'],'./qt_human.png') 270 | 271 | 272 | -------------------------------------------------------------------------------- /CNN Inception + Gate/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /CNN Inception + Residual/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 6 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop 6 | MAX_ITERATIONS = 1000000 7 | PRINT_INTERVAL = 1000 8 | VALIDATE_INTERVAL = 130000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy. 
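# Hedged sketch of how the two intervals above are consumed (mirroring the
# training loop in this folder's train_att_bc.py; names come from this file):
#
#   for it in range(MAX_ITERATIONS):
#       solver.step(1)
#       if it != 0 and it % PRINT_INTERVAL == 0:       # log mean train loss
#           ...
#       if it != 0 and it % VALIDATE_INTERVAL == 0:    # run exec_validation
#           ...
#
# With VALIDATE_INTERVAL = 130000, the first validation fires at iteration
# 130000; per the README, training is stopped there with Control + C, so that
# validation accuracy is what gets reported as the test accuracy.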
9 | 10 | # what data to use for training 11 | TRAIN_DATA_SPLITS = 'train' 12 | 13 | # what data to use for the vocabulary 14 | QUESTION_VOCAB_SPACE = 'train' 15 | ANSWER_VOCAB_SPACE = 'train' 16 | 17 | # vqa tools - get from https://github.com/VT-vision-lab/VQA 18 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools' 19 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools' 20 | 21 | # location of the data 22 | VQA_PREFIX = '/tempspace/zwang6/VQA/' 23 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/' 24 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/' 25 | 26 | DATA_PATHS = { 27 | 'train': { 28 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json', 29 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json', 30 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_' 31 | }, 32 | 'val': { 33 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json', 34 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json', 35 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_' 36 | }, 37 | 'test-dev': { 38 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json', 39 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 40 | }, 41 | 'test': { 42 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json', 43 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 44 | }, 45 | # TODO it would be nice if genome also followed the same file format as vqa 46 | 'genome': { 47 | 'genome_file': GENOME_PREFIX + '/question_answers_prepro.json', 48 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/' 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /CNN Inception + Residual/qlstm_solver.prototxt: -------------------------------------------------------------------------------- 1 | # The train/test net protocol buffer definition 2 | train_net: "./result/proto_train.prototxt" 3 | test_net: "./result/proto_test.prototxt" 4 | 5 | max_iter: 1000000 6 | display: 5000 7 | snapshot: 5000 8 | snapshot_prefix: "./result/" 9 | 10 | # The base learning rate, momentum and the weight decay of the network. 11 | solver_type: ADAM 12 | base_lr: 0.0007 13 | momentum: 0.9 14 | momentum2: 0.999 15 | weight_decay: 0.000 16 | lr_policy: "fixed" 17 | test_iter: 1 18 | test_interval: 10000000 19 | 20 | # accumulate gradients 21 | iter_size: 2 22 | -------------------------------------------------------------------------------- /CNN Inception + Residual/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /CNN Non-Inception/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 9 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop 6 | MAX_ITERATIONS = 1000000 7 | PRINT_INTERVAL = 1000 8 | VALIDATE_INTERVAL = 160000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. 
Then the validation accuracy is the test accuracy. 9 | 10 | # what data to use for training 11 | TRAIN_DATA_SPLITS = 'train' 12 | 13 | # what data to use for the vocabulary 14 | QUESTION_VOCAB_SPACE = 'train' 15 | ANSWER_VOCAB_SPACE = 'train' 16 | 17 | # vqa tools - get from https://github.com/VT-vision-lab/VQA 18 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools' 19 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools' 20 | 21 | # location of the data 22 | VQA_PREFIX = '/tempspace/zwang6/VQA/' 23 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/' 24 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/' 25 | 26 | DATA_PATHS = { 27 | 'train': { 28 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json', 29 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json', 30 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_' 31 | }, 32 | 'val': { 33 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json', 34 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json', 35 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_' 36 | }, 37 | 'test-dev': { 38 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json', 39 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 40 | }, 41 | 'test': { 42 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json', 43 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 44 | }, 45 | # TODO it would be nice if genome also followed the same file format as vqa 46 | 'genome': { 47 | 'genome_file': GENOME_PREFIX + '/question_answers_prepro.json', 48 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/' 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /CNN Non-Inception/qlstm_solver.prototxt: -------------------------------------------------------------------------------- 1 | # The train/test net protocol buffer definition 2 | train_net: "./result/proto_train.prototxt" 3 | test_net: "./result/proto_test.prototxt" 4 | 5 | max_iter: 1000000 6 | display: 5000 7 | snapshot: 5000 8 | snapshot_prefix: "./result/" 9 | 10 | # The base learning rate, momentum and the weight decay of the network. 
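# A hedged reading of the settings below: Adam with a fixed base_lr of 0.0007
# (momentum and momentum2 map to Adam's beta1 = 0.9 and beta2 = 0.999) and no
# weight decay. Because iter_size is 2, gradients from two forward/backward
# passes are accumulated before each weight update, so with BATCH_SIZE = 32
# from this folder's config.py the effective batch size is 32 * 2 = 64.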
11 | solver_type: ADAM 12 | base_lr: 0.0007 13 | momentum: 0.9 14 | momentum2: 0.999 15 | weight_decay: 0.000 16 | lr_policy: "fixed" 17 | test_iter: 1 18 | test_interval: 10000000 19 | 20 | # accumulate gradients 21 | iter_size: 2 22 | -------------------------------------------------------------------------------- /CNN Non-Inception/train_att_bc.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import os 4 | import sys 5 | import numpy as np 6 | import json 7 | import matplotlib.pyplot as plt 8 | from write_to_log import write_log 9 | 10 | import caffe 11 | from caffe import layers as L 12 | from caffe import params as P 13 | 14 | from vqa_data_provider_layer import VQADataProvider 15 | from visualize_tools import exec_validation, drawgraph 16 | import config 17 | 18 | 19 | def qlstm(mode, batchsize, T, question_vocab_size): 20 | n = caffe.NetSpec() 21 | mode_str = json.dumps({'mode':mode, 'batchsize':batchsize}) 22 | # n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\ 23 | # module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 ) 24 | n.data, n.cont, n.img_feature, n.label = L.Python(\ 25 | module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=4 ) 26 | 27 | # word embedding 28 | n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \ 29 | weight_filler=dict(type='uniform',min=-0.08,max=0.08)) 30 | # n.embed = L.TanH(n.embed_ba) 31 | n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(dict(axis=0))) 32 | n.embed_scale_resh = L.Reshape(n.embed_scale,\ 33 | reshape_param=dict(\ 34 | shape=dict(dim=[batchsize,1,T,-1]))) 35 | 36 | # convolution 37 | n.word_feature_3 = L.Convolution(n.embed_scale_resh, kernel_h=3, kernel_w=300, stride=1, num_output=2048, pad_h=1, pad_w=0, weight_filler=dict(type='xavier')) 38 | n.word_relu_3 = L.ReLU(n.word_feature_3) 39 | n.word_vec_3 = L.Pooling(n.word_relu_3, kernel_h=T, kernel_w=1, stride=T, pool=P.Pooling.MAX) 40 | n.concat_vec_dropped = L.Dropout(n.word_vec_3,dropout_param={'dropout_ratio':0.5}) 41 | 42 | n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.concat_vec_dropped, axis=2, tiles=14) 43 | n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14) 44 | n.i_emb_tanh_droped_resh = L.Reshape(n.img_feature,reshape_param=dict(shape=dict(dim=[-1,2048,14,14]))) 45 | n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh, compact_bilinear_param=dict(num_output=16000,sum_pool=False)) 46 | n.blcf_sign_sqrt = L.SignedSqrt(n.blcf) 47 | n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt) 48 | n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,dropout_param={'dropout_ratio':0.1}) 49 | 50 | # multi-channel attention 51 | n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier')) 52 | n.att_conv1_relu = L.ReLU(n.att_conv1) 53 | n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0, weight_filler=dict(type='xavier')) 54 | n.att_reshaped = L.Reshape(n.att_conv2,reshape_param=dict(shape=dict(dim=[-1,2,14*14]))) 55 | n.att_softmax = L.Softmax(n.att_reshaped, axis=2) 56 | n.att = L.Reshape(n.att_softmax,reshape_param=dict(shape=dict(dim=[-1,2,14,14]))) 57 | att_maps = L.Slice(n.att, ntop=2, slice_param={'axis':1}) 58 | n.att_map0 = att_maps[0] 59 | n.att_map1 = att_maps[1] 60 | dummy = 
L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1) 61 | n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy) 62 | n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy) 63 | n.att_feature0_resh = L.Reshape(n.att_feature0, reshape_param=dict(shape=dict(dim=[-1,2048]))) 64 | n.att_feature1_resh = L.Reshape(n.att_feature1, reshape_param=dict(shape=dict(dim=[-1,2048]))) 65 | n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh) 66 | 67 | # merge attention and lstm with compact bilinear pooling 68 | n.att_feature_resh = L.Reshape(n.att_feature, reshape_param=dict(shape=dict(dim=[-1,4096,1,1]))) 69 | #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1]))) 70 | n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.concat_vec_dropped, 71 | compact_bilinear_param=dict(num_output=16000,sum_pool=False)) 72 | n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm) 73 | n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt) 74 | 75 | n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2, dropout_param={'dropout_ratio':0.1}) 76 | n.bc_dropped_resh = L.Reshape(n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000]))) 77 | 78 | n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000, weight_filler=dict(type='xavier')) 79 | n.loss = L.SoftmaxWithLoss(n.prediction, n.label) 80 | return n.to_proto() 81 | 82 | def make_answer_vocab(adic, vocab_size): 83 | """ 84 | Returns a dictionary that maps words to indices. 85 | """ 86 | adict = {'':0} 87 | nadict = {'':1000000} 88 | vid = 1 89 | for qid in adic.keys(): 90 | answer_obj = adic[qid] 91 | answer_list = [ans['answer'] for ans in answer_obj] 92 | 93 | for q_ans in answer_list: 94 | # create dict 95 | if adict.has_key(q_ans): 96 | nadict[q_ans] += 1 97 | else: 98 | nadict[q_ans] = 1 99 | adict[q_ans] = vid 100 | vid +=1 101 | 102 | # debug 103 | nalist = [] 104 | for k,v in sorted(nadict.items(), key=lambda x:x[1]): 105 | nalist.append((k,v)) 106 | 107 | # remove words that appear less than once 108 | n_del_ans = 0 109 | n_valid_ans = 0 110 | adict_nid = {} 111 | for i, w in enumerate(nalist[:-vocab_size]): 112 | del adict[w[0]] 113 | n_del_ans += w[1] 114 | for i, w in enumerate(nalist[-vocab_size:]): 115 | n_valid_ans += w[1] 116 | adict_nid[w[0]] = i 117 | 118 | return adict_nid 119 | 120 | def make_question_vocab(qdic): 121 | """ 122 | Returns a dictionary that maps words to indices. 123 | """ 124 | vdict = {'':0} 125 | vid = 1 126 | for qid in qdic.keys(): 127 | # sequence to list 128 | q_str = qdic[qid]['qstr'] 129 | q_list = VQADataProvider.seq_to_list(q_str) 130 | 131 | # create dict 132 | for w in q_list: 133 | if not vdict.has_key(w): 134 | vdict[w] = vid 135 | vid +=1 136 | 137 | return vdict 138 | 139 | def make_vocab_files(): 140 | """ 141 | Produce the question and answer vocabulary files. 142 | """ 143 | write_log('making question vocab... ' + config.QUESTION_VOCAB_SPACE, 'log.txt') 144 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE) 145 | question_vocab = make_question_vocab(qdic) 146 | write_log('making answer vocab... 
' + config.ANSWER_VOCAB_SPACE, 'log.txt') 147 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE) 148 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS) 149 | return question_vocab, answer_vocab 150 | 151 | def main(): 152 | if not os.path.exists('./result'): 153 | os.makedirs('./result') 154 | 155 | question_vocab, answer_vocab = {}, {} 156 | if os.path.exists('./result/vdict.json') and os.path.exists('./result/adict.json'): 157 | write_log('restoring vocab', 'log.txt') 158 | with open('./result/vdict.json','r') as f: 159 | question_vocab = json.load(f) 160 | with open('./result/adict.json','r') as f: 161 | answer_vocab = json.load(f) 162 | else: 163 | question_vocab, answer_vocab = make_vocab_files() 164 | with open('./result/vdict.json','w') as f: 165 | json.dump(question_vocab, f) 166 | with open('./result/adict.json','w') as f: 167 | json.dump(answer_vocab, f) 168 | 169 | write_log('question vocab size: '+ str(len(question_vocab)), 'log.txt') 170 | write_log('answer vocab size: '+ str(len(answer_vocab)), 'log.txt') 171 | 172 | with open('./result/proto_train.prototxt', 'w') as f: 173 | f.write(str(qlstm(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \ 174 | config.MAX_WORDS_IN_QUESTION, len(question_vocab)))) 175 | 176 | with open('./result/proto_test.prototxt', 'w') as f: 177 | f.write(str(qlstm('val', config.VAL_BATCH_SIZE, \ 178 | config.MAX_WORDS_IN_QUESTION, len(question_vocab)))) 179 | 180 | caffe.set_device(config.GPU_ID) 181 | caffe.set_mode_gpu() 182 | solver = caffe.get_solver('./qlstm_solver.prototxt') 183 | 184 | train_loss = np.zeros(config.MAX_ITERATIONS) 185 | # results = [] 186 | 187 | for it in range(config.MAX_ITERATIONS): 188 | solver.step(1) 189 | 190 | # store the train loss 191 | train_loss[it] = solver.net.blobs['loss'].data 192 | 193 | if it != 0 and it % config.PRINT_INTERVAL == 0: 194 | write_log('------------------------------------', 'log.txt') 195 | write_log('Iteration: ' + str(it), 'log.txt') 196 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean() 197 | write_log('Train loss: ' + str(c_mean_loss), 'log.txt') 198 | if it != 0 and it % config.VALIDATE_INTERVAL == 0: # acutually test 199 | solver.test_nets[0].save('./result/tmp.caffemodel') 200 | write_log('Validating...', 'log.txt') 201 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.GPU_ID, 'val', it=it) 202 | write_log('Iteration: ' + str(it), 'log.txt') 203 | write_log('Test loss: ' + str(test_loss), 'log.txt') 204 | write_log('Overall Accuracy: ' + str(acc_overall), 'log.txt') 205 | write_log('Per Question Type Accuracy is the following:', 'log.txt') 206 | for quesType in acc_per_ques: 207 | write_log("%s : %.02f" % (quesType, acc_per_ques[quesType]), 'log.txt') 208 | write_log('Per Answer Type Accuracy is the following:', 'log.txt') 209 | for ansType in acc_per_ans: 210 | write_log("%s : %.02f" % (ansType, acc_per_ans[ansType]), 'log.txt') 211 | # results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans]) 212 | # best_result_idx = np.array([x[3] for x in results]).argmax() 213 | # write_log('Best accuracy of ' + str(results[best_result_idx][3]) + ' was at iteration ' + str(results[best_result_idx][0]), 'log.txt') 214 | # drawgraph(results) 215 | 216 | if __name__ == '__main__': 217 | main() 218 | -------------------------------------------------------------------------------- /CNN Non-Inception/visualize_tools.py: -------------------------------------------------------------------------------- 1 | import numpy 
as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_provider_layer import VQADataProvider 16 | from vqa_data_provider_layer import VQADataProviderLayer 17 | 18 | import config 19 | sys.path.append(config.VQA_TOOLS_PATH) 20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 21 | 22 | from vqaTools.vqa import VQA 23 | from vqaEvaluation.vqaEval import VQAEval 24 | 25 | from write_to_log import write_log 26 | 27 | def visualize_failures(stat_list,mode): 28 | 29 | def save_qtype(qtype_list, save_filename, mode): 30 | 31 | if mode == 'val': 32 | savepath = os.path.join('./eval', save_filename) 33 | # TODO 34 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014' 35 | elif mode == 'test-dev': 36 | savepath = os.path.join('./test-dev', save_filename) 37 | # TODO 38 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 39 | elif mode == 'test': 40 | savepath = os.path.join('./test', save_filename) 41 | # TODO 42 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 43 | else: 44 | raise Exception('Unsupported mode') 45 | if os.path.exists(savepath): shutil.rmtree(savepath) 46 | if not os.path.exists(savepath): os.makedirs(savepath) 47 | 48 | for qt in qtype_list: 49 | count = 0 50 | for t_question in stat_list: 51 | #print count, t_question 52 | if count < 40/len(qtype_list): 53 | t_question_list = t_question['q_list'] 54 | saveflag = False 55 | #print 'debug****************************' 56 | #print qt 57 | #print t_question_list 58 | #print t_question_list[0] == qt[0] 59 | #print t_question_list[1] == qt[1] 60 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 61 | saveflag = True 62 | else: 63 | saveflag = False 64 | 65 | if saveflag == True: 66 | t_iid = t_question['iid'] 67 | if mode == 'val': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 70 | elif mode == 'test-dev' or mode == 'test': 71 | t_img = Image.open(os.path.join(img_pre, \ 72 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 73 | 74 | # for caption 75 | #print t_iid 76 | #annIds = caps.getAnnIds(t_iid) 77 | #anns = caps.loadAnns(annIds) 78 | #cap_list = [ann['caption'] for ann in anns] 79 | ans_list = t_question['ans_list'] 80 | draw = ImageDraw.Draw(t_img) 81 | for i in range(len(ans_list)): 82 | try: 83 | draw.text((10,10*i), str(ans_list[i])) 84 | except: 85 | pass 86 | 87 | ans = t_question['answer'] 88 | pred = t_question['pred'] 89 | if ans == -1: 90 | pre = '' 91 | elif ans == pred: 92 | pre = 'correct ' 93 | else: 94 | pre = 'failure ' 95 | #print ' aaa ', ans, pred 96 | ans = re.sub( '/', ' ', str(ans)) 97 | pred = re.sub( '/', ' ', str(pred)) 98 | img_title = pre + str(' '.join(t_question_list)) + '. 
a_' + \ 99 | str(ans) + ' p_' + str(pred) + '.png' 100 | count += 1 101 | write_log(os.path.join(savepath,img_title), 'visualize_log.txt') 102 | t_img.save(os.path.join(savepath,img_title)) 103 | 104 | write_log('saving colors', 'visualize_log.txt') 105 | qt_color_list = [['what','color']] 106 | save_qtype(qt_color_list, 'colors', mode) 107 | 108 | write_log('saving what is', 'visualize_log.txt') 109 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 110 | save_qtype(qt_whatis_list, 'whatis', mode) 111 | 112 | write_log('saving is', 'visualize_log.txt') 113 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 114 | save_qtype(qt_is_list, 'is', mode) 115 | 116 | write_log('saving how many', 'visualize_log.txt') 117 | qt_howmany_list =[['how','many']] 118 | save_qtype(qt_howmany_list, 'howmany', mode) 119 | 120 | def exec_validation(device_id, mode, it='', visualize=False): 121 | 122 | caffe.set_device(device_id) 123 | caffe.set_mode_gpu() 124 | net = caffe.Net('./result/proto_test.prototxt',\ 125 | './result/tmp.caffemodel',\ 126 | caffe.TEST) 127 | 128 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE) 129 | total_questions = len(dp.getQuesIds()) 130 | epoch = 0 131 | 132 | pred_list = [] 133 | testloss_list = [] 134 | stat_list = [] 135 | 136 | while epoch == 0: 137 | # t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 138 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 139 | net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0)) 140 | net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0)) 141 | net.blobs['img_feature'].data[...] = t_img_feature 142 | net.blobs['label'].data[...] = t_answer 143 | # net.blobs['glove'].data[...] 
= t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 146 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 147 | testloss_list.append(net.blobs['loss'].data) 148 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 149 | pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 150 | if visualize: 151 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 152 | if mode == 'test-dev' or mode == 'test': 153 | ans_str = '' 154 | ans_list = ['']*10 155 | else: 156 | ans_str = dp.vec_to_answer(ans) 157 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 158 | stat_list.append({\ 159 | 'qid' : qid, 160 | 'q_list' : q_list, 161 | 'iid' : iid, 162 | 'answer': ans_str, 163 | 'ans_list': ans_list, 164 | 'pred' : pred }) 165 | # percent = 100 * float(len(pred_list)) / total_questions 166 | # sys.stdout.write('\r' + ('%.2f' % percent) + '%') 167 | # sys.stdout.flush() 168 | 169 | 170 | 171 | mean_testloss = np.array(testloss_list).mean() 172 | 173 | if mode == 'val': 174 | valFile = './result/val2014_resfile' 175 | with open(valFile, 'w') as f: 176 | json.dump(pred_list, f) 177 | if visualize: 178 | visualize_failures(stat_list,mode) 179 | annFile = config.DATA_PATHS['val']['ans_file'] 180 | quesFile = config.DATA_PATHS['val']['ques_file'] 181 | vqa = VQA(annFile, quesFile) 182 | vqaRes = vqa.loadRes(valFile, quesFile) 183 | vqaEval = VQAEval(vqa, vqaRes, n=2) 184 | vqaEval.evaluate() 185 | acc_overall = vqaEval.accuracy['overall'] 186 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 187 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 188 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 189 | elif mode == 'test-dev': 190 | filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results' 191 | with open(filename+'.json', 'w') as f: 192 | json.dump(pred_list, f) 193 | if visualize: 194 | visualize_failures(stat_list,mode) 195 | elif mode == 'test': 196 | filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(pred_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | 202 | def drawgraph(results, save_question_type_graphs=False): 203 | # 0:it 204 | # 1:trainloss 205 | # 2:testloss 206 | # 3:oa_acc 207 | # 4:qt_acc 208 | # 5:at_acc 209 | 210 | # training curve 211 | it = np.array([l[0] for l in results]) 212 | loss = np.array([l[1] for l in results]) 213 | valloss = np.array([l[2] for l in results]) 214 | valacc = np.array([l[3] for l in results]) 215 | 216 | fig = plt.figure() 217 | ax1 = fig.add_subplot(111) 218 | ax2 = ax1.twinx() 219 | 220 | ax1.plot(it,loss, color='blue', label='train loss') 221 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 222 | ax2.plot(it,valacc, color='red', label='acc on val') 223 | plt.legend(loc='lower left') 224 | 225 | ax1.set_xlabel('Iterations') 226 | ax1.set_ylabel('Loss Value') 227 | ax2.set_ylabel('Accuracy on Val [%]') 228 | 229 | plt.savefig('./learning_curve max_%2.2f.png'%valacc.max()) 230 | plt.clf() 231 | plt.close("all") 232 | 233 | # question type 234 | it = np.array([l[0] for l in results]) 235 | oa_acc = np.array([l[3] for l in results]) 236 | qt_dic_list = [l[4] for l in results] 237 | 238 | def draw_qt_acc(target_key_list, figname): 239 | fig = plt.figure() 240 | for k in target_key_list: 241 | 
write_log(str(k) + str(type(k)), 'visualize_log.txt') 242 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 243 | plt.plot(it,t_val,label=str(k)) 244 | plt.legend(fontsize='small') 245 | plt.ylim(0,100.) 246 | #plt.legend(prop={'size':6}) 247 | 248 | plt.xlabel('Iterations') 249 | plt.ylabel('Accuracy on Val [%]') 250 | 251 | plt.savefig(figname,dpi=200) 252 | plt.clf() 253 | plt.close("all") 254 | 255 | if save_question_type_graphs: 256 | s_keys = sorted(qt_dic_list[0].keys()) 257 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 258 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 259 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 260 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 261 | draw_qt_acc(['what color is the','what color are the','what color is',\ 262 | 'what color','what is the color of the'],'./qt_color.png') 263 | draw_qt_acc(['how many','how','how many people are',\ 264 | 'how many people are in'],'./qt_number.png') 265 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 266 | 'which'],'./qt_who_why_where_which.png') 267 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 268 | 'is the woman','is this person','what is the woman','is the person',\ 269 | 'what is the person'],'./qt_human.png') 270 | 271 | 272 | -------------------------------------------------------------------------------- /CNN Non-Inception/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /LSTM (baseline)/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 10 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop 6 | MAX_ITERATIONS = 1000000 7 | PRINT_INTERVAL = 1000 8 | VALIDATE_INTERVAL = 90000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy. 
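# NUM_OUTPUT_UNITS above is consumed by make_answer_vocab() in this folder's
# train_att_bc.py, which keeps only the 3000 most frequent training answers.
# A rough standalone sketch of the idea (a hedged illustration, not the exact
# implementation):
#
#   counts = {}                                   # answer string -> frequency
#   for answer_obj in adic.values():
#       for a in answer_obj:
#           counts[a['answer']] = counts.get(a['answer'], 0) + 1
#   top = sorted(counts, key=counts.get)[-NUM_OUTPUT_UNITS:]  # most frequent
#   adict = {ans: i for i, ans in enumerate(top)}  # answer -> class index
#
# Any rarer answer falls outside the 3000-way softmax in the prediction layer.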
9 | 10 | # what data to use for training 11 | TRAIN_DATA_SPLITS = 'train' 12 | 13 | # what data to use for the vocabulary 14 | QUESTION_VOCAB_SPACE = 'train' 15 | ANSWER_VOCAB_SPACE = 'train' 16 | 17 | # vqa tools - get from https://github.com/VT-vision-lab/VQA 18 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools' 19 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools' 20 | 21 | # location of the data 22 | VQA_PREFIX = '/tempspace/zwang6/VQA/' 23 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/' 24 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/' 25 | 26 | DATA_PATHS = { 27 | 'train': { 28 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json', 29 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json', 30 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_' 31 | }, 32 | 'val': { 33 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json', 34 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json', 35 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_' 36 | }, 37 | 'test-dev': { 38 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json', 39 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 40 | }, 41 | 'test': { 42 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json', 43 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_' 44 | }, 45 | # TODO it would be nice if genome also followed the same file format as vqa 46 | 'genome': { 47 | 'genome_file': GENOME_PREFIX + '/question_answers_prepro.json', 48 | 'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/' 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /LSTM (baseline)/qlstm_solver.prototxt: -------------------------------------------------------------------------------- 1 | # The train/test net protocol buffer definition 2 | train_net: "./result/proto_train.prototxt" 3 | test_net: "./result/proto_test.prototxt" 4 | 5 | max_iter: 1000000 6 | display: 5000 7 | snapshot: 5000 8 | snapshot_prefix: "./result/" 9 | 10 | # The base learning rate, momentum and the weight decay of the network. 
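# Hedged note on the snapshot settings above: every 5000 iterations Caffe
# writes a weights file and a solver-state file under snapshot_prefix, e.g.
#   ./result/_iter_5000.caffemodel   (weights)
#   ./result/_iter_5000.solverstate  (optimizer state, usable for resuming
#                                     via pycaffe's solver.restore())
# so stopping training with Control + C, as the README suggests, still leaves
# the most recent snapshot on disk.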
11 | solver_type: ADAM 12 | base_lr: 0.0007 13 | momentum: 0.9 14 | momentum2: 0.999 15 | weight_decay: 0.000 16 | lr_policy: "fixed" 17 | test_iter: 1 18 | test_interval: 10000000 19 | 20 | # accumulate gradients 21 | iter_size: 2 22 | -------------------------------------------------------------------------------- /LSTM (baseline)/train_att_bc.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import os 4 | import sys 5 | import numpy as np 6 | import json 7 | import matplotlib.pyplot as plt 8 | from write_to_log import write_log 9 | 10 | import caffe 11 | from caffe import layers as L 12 | from caffe import params as P 13 | 14 | from vqa_data_provider_layer import VQADataProvider 15 | from visualize_tools import exec_validation, drawgraph 16 | import config 17 | 18 | 19 | def qlstm(mode, batchsize, T, question_vocab_size): 20 | n = caffe.NetSpec() 21 | mode_str = json.dumps({'mode':mode, 'batchsize':batchsize}) 22 | # n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\ 23 | # module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=5 ) 24 | n.data, n.cont, n.img_feature, n.label = L.Python(\ 25 | module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=4 ) 26 | 27 | n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \ 28 | weight_filler=dict(type='uniform',min=-0.08,max=0.08)) 29 | n.embed = L.TanH(n.embed_ba) 30 | # concat_word_embed = [n.embed, n.glove] 31 | # n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2}) # T x N x 600 32 | 33 | # LSTM1 34 | n.lstm1 = L.LSTM(\ 35 | n.embed, n.cont,\ 36 | recurrent_param=dict(\ 37 | num_output=1024,\ 38 | weight_filler=dict(type='uniform',min=-0.08,max=0.08),\ 39 | bias_filler=dict(type='constant',value=0))) 40 | tops1 = L.Slice(n.lstm1, ntop=T, slice_param={'axis':0}) 41 | for i in xrange(T-1): 42 | n.__setattr__('slice_first'+str(i), tops1[int(i)]) 43 | n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0)) 44 | n.lstm1_out = tops1[T-1] 45 | n.lstm1_reshaped = L.Reshape(n.lstm1_out,\ 46 | reshape_param=dict(\ 47 | shape=dict(dim=[-1,1024]))) 48 | n.lstm1_reshaped_droped = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':0.3}) 49 | n.lstm1_droped = L.Dropout(n.lstm1,dropout_param={'dropout_ratio':0.3}) 50 | # LSTM2 51 | n.lstm2 = L.LSTM(\ 52 | n.lstm1_droped, n.cont,\ 53 | recurrent_param=dict(\ 54 | num_output=1024,\ 55 | weight_filler=dict(type='uniform',min=-0.08,max=0.08),\ 56 | bias_filler=dict(type='constant',value=0))) 57 | tops2 = L.Slice(n.lstm2, ntop=T, slice_param={'axis':0}) 58 | for i in xrange(T-1): 59 | n.__setattr__('slice_second'+str(i), tops2[int(i)]) 60 | n.__setattr__('silence_data_second'+str(i), L.Silence(tops2[int(i)],ntop=0)) 61 | n.lstm2_out = tops2[T-1] 62 | n.lstm2_reshaped = L.Reshape(n.lstm2_out,\ 63 | reshape_param=dict(\ 64 | shape=dict(dim=[-1,1024]))) 65 | n.lstm2_reshaped_droped = L.Dropout(n.lstm2_reshaped,dropout_param={'dropout_ratio':0.3}) 66 | concat_botom = [n.lstm1_reshaped_droped, n.lstm2_reshaped_droped] 67 | n.lstm_12 = L.Concat(*concat_botom) 68 | 69 | n.q_emb_tanh_droped_resh = L.Reshape(n.lstm_12,reshape_param=dict(shape=dict(dim=[-1,2048,1,1]))) 70 | n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.q_emb_tanh_droped_resh, axis=2, tiles=14) 71 | n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14) 72 | n.i_emb_tanh_droped_resh = 
L.Reshape(n.img_feature,reshape_param=dict(shape=dict(dim=[-1,2048,14,14]))) 73 | n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh, compact_bilinear_param=dict(num_output=16000,sum_pool=False)) 74 | n.blcf_sign_sqrt = L.SignedSqrt(n.blcf) 75 | n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt) 76 | n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,dropout_param={'dropout_ratio':0.1}) 77 | 78 | # multi-channel attention 79 | n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier')) 80 | n.att_conv1_relu = L.ReLU(n.att_conv1) 81 | n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0, weight_filler=dict(type='xavier')) 82 | n.att_reshaped = L.Reshape(n.att_conv2,reshape_param=dict(shape=dict(dim=[-1,2,14*14]))) 83 | n.att_softmax = L.Softmax(n.att_reshaped, axis=2) 84 | n.att = L.Reshape(n.att_softmax,reshape_param=dict(shape=dict(dim=[-1,2,14,14]))) 85 | att_maps = L.Slice(n.att, ntop=2, slice_param={'axis':1}) 86 | n.att_map0 = att_maps[0] 87 | n.att_map1 = att_maps[1] 88 | dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1) 89 | n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy) 90 | n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy) 91 | n.att_feature0_resh = L.Reshape(n.att_feature0, reshape_param=dict(shape=dict(dim=[-1,2048]))) 92 | n.att_feature1_resh = L.Reshape(n.att_feature1, reshape_param=dict(shape=dict(dim=[-1,2048]))) 93 | n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh) 94 | 95 | # merge attention and lstm with compact bilinear pooling 96 | n.att_feature_resh = L.Reshape(n.att_feature, reshape_param=dict(shape=dict(dim=[-1,4096,1,1]))) 97 | n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1]))) 98 | n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.lstm_12_resh, 99 | compact_bilinear_param=dict(num_output=16000,sum_pool=False)) 100 | n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm) 101 | n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt) 102 | 103 | n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2, dropout_param={'dropout_ratio':0.1}) 104 | n.bc_dropped_resh = L.Reshape(n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000]))) 105 | 106 | n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000, weight_filler=dict(type='xavier')) 107 | n.loss = L.SoftmaxWithLoss(n.prediction, n.label) 108 | return n.to_proto() 109 | 110 | def make_answer_vocab(adic, vocab_size): 111 | """ 112 | Returns a dictionary that maps words to indices. 
113 | """ 114 | adict = {'':0} 115 | nadict = {'':1000000} 116 | vid = 1 117 | for qid in adic.keys(): 118 | answer_obj = adic[qid] 119 | answer_list = [ans['answer'] for ans in answer_obj] 120 | 121 | for q_ans in answer_list: 122 | # create dict 123 | if adict.has_key(q_ans): 124 | nadict[q_ans] += 1 125 | else: 126 | nadict[q_ans] = 1 127 | adict[q_ans] = vid 128 | vid +=1 129 | 130 | # debug 131 | nalist = [] 132 | for k,v in sorted(nadict.items(), key=lambda x:x[1]): 133 | nalist.append((k,v)) 134 | 135 | # remove words that appear less than once 136 | n_del_ans = 0 137 | n_valid_ans = 0 138 | adict_nid = {} 139 | for i, w in enumerate(nalist[:-vocab_size]): 140 | del adict[w[0]] 141 | n_del_ans += w[1] 142 | for i, w in enumerate(nalist[-vocab_size:]): 143 | n_valid_ans += w[1] 144 | adict_nid[w[0]] = i 145 | 146 | return adict_nid 147 | 148 | def make_question_vocab(qdic): 149 | """ 150 | Returns a dictionary that maps words to indices. 151 | """ 152 | vdict = {'':0} 153 | vid = 1 154 | for qid in qdic.keys(): 155 | # sequence to list 156 | q_str = qdic[qid]['qstr'] 157 | q_list = VQADataProvider.seq_to_list(q_str) 158 | 159 | # create dict 160 | for w in q_list: 161 | if not vdict.has_key(w): 162 | vdict[w] = vid 163 | vid +=1 164 | 165 | return vdict 166 | 167 | def make_vocab_files(): 168 | """ 169 | Produce the question and answer vocabulary files. 170 | """ 171 | write_log('making question vocab... ' + config.QUESTION_VOCAB_SPACE, 'log.txt') 172 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE) 173 | question_vocab = make_question_vocab(qdic) 174 | write_log('making answer vocab... ' + config.ANSWER_VOCAB_SPACE, 'log.txt') 175 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE) 176 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS) 177 | return question_vocab, answer_vocab 178 | 179 | def main(): 180 | if not os.path.exists('./result'): 181 | os.makedirs('./result') 182 | 183 | question_vocab, answer_vocab = {}, {} 184 | if os.path.exists('./result/vdict.json') and os.path.exists('./result/adict.json'): 185 | write_log('restoring vocab', 'log.txt') 186 | with open('./result/vdict.json','r') as f: 187 | question_vocab = json.load(f) 188 | with open('./result/adict.json','r') as f: 189 | answer_vocab = json.load(f) 190 | else: 191 | question_vocab, answer_vocab = make_vocab_files() 192 | with open('./result/vdict.json','w') as f: 193 | json.dump(question_vocab, f) 194 | with open('./result/adict.json','w') as f: 195 | json.dump(answer_vocab, f) 196 | 197 | write_log('question vocab size: '+ str(len(question_vocab)), 'log.txt') 198 | write_log('answer vocab size: '+ str(len(answer_vocab)), 'log.txt') 199 | 200 | with open('./result/proto_train.prototxt', 'w') as f: 201 | f.write(str(qlstm(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \ 202 | config.MAX_WORDS_IN_QUESTION, len(question_vocab)))) 203 | 204 | with open('./result/proto_test.prototxt', 'w') as f: 205 | f.write(str(qlstm('val', config.VAL_BATCH_SIZE, \ 206 | config.MAX_WORDS_IN_QUESTION, len(question_vocab)))) 207 | 208 | caffe.set_device(config.GPU_ID) 209 | caffe.set_mode_gpu() 210 | solver = caffe.get_solver('./qlstm_solver.prototxt') 211 | 212 | train_loss = np.zeros(config.MAX_ITERATIONS) 213 | # results = [] 214 | 215 | for it in range(config.MAX_ITERATIONS): 216 | solver.step(1) 217 | 218 | # store the train loss 219 | train_loss[it] = solver.net.blobs['loss'].data 220 | 221 | if it != 0 and it % config.PRINT_INTERVAL == 0: 222 | 
write_log('------------------------------------', 'log.txt') 223 | write_log('Iteration: ' + str(it), 'log.txt') 224 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean() 225 | write_log('Train loss: ' + str(c_mean_loss), 'log.txt') 226 | if it != 0 and it % config.VALIDATE_INTERVAL == 0: # acutually test 227 | solver.test_nets[0].save('./result/tmp.caffemodel') 228 | write_log('Validating...', 'log.txt') 229 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.GPU_ID, 'val', it=it) 230 | write_log('Iteration: ' + str(it), 'log.txt') 231 | write_log('Test loss: ' + str(test_loss), 'log.txt') 232 | write_log('Overall Accuracy: ' + str(acc_overall), 'log.txt') 233 | write_log('Per Question Type Accuracy is the following:', 'log.txt') 234 | for quesType in acc_per_ques: 235 | write_log("%s : %.02f" % (quesType, acc_per_ques[quesType]), 'log.txt') 236 | write_log('Per Answer Type Accuracy is the following:', 'log.txt') 237 | for ansType in acc_per_ans: 238 | write_log("%s : %.02f" % (ansType, acc_per_ans[ansType]), 'log.txt') 239 | # results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans]) 240 | # best_result_idx = np.array([x[3] for x in results]).argmax() 241 | # write_log('Best accuracy of ' + str(results[best_result_idx][3]) + ' was at iteration ' + str(results[best_result_idx][0]), 'log.txt') 242 | # drawgraph(results) 243 | 244 | if __name__ == '__main__': 245 | main() 246 | -------------------------------------------------------------------------------- /LSTM (baseline)/visualize_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_provider_layer import VQADataProvider 16 | from vqa_data_provider_layer import VQADataProviderLayer 17 | 18 | import config 19 | sys.path.append(config.VQA_TOOLS_PATH) 20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 21 | 22 | from vqaTools.vqa import VQA 23 | from vqaEvaluation.vqaEval import VQAEval 24 | 25 | from write_to_log import write_log 26 | 27 | def visualize_failures(stat_list,mode): 28 | 29 | def save_qtype(qtype_list, save_filename, mode): 30 | 31 | if mode == 'val': 32 | savepath = os.path.join('./eval', save_filename) 33 | # TODO 34 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014' 35 | elif mode == 'test-dev': 36 | savepath = os.path.join('./test-dev', save_filename) 37 | # TODO 38 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 39 | elif mode == 'test': 40 | savepath = os.path.join('./test', save_filename) 41 | # TODO 42 | img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015' 43 | else: 44 | raise Exception('Unsupported mode') 45 | if os.path.exists(savepath): shutil.rmtree(savepath) 46 | if not os.path.exists(savepath): os.makedirs(savepath) 47 | 48 | for qt in qtype_list: 49 | count = 0 50 | for t_question in stat_list: 51 | #print count, t_question 52 | if count < 40/len(qtype_list): 53 | t_question_list = t_question['q_list'] 54 | saveflag = False 55 | #print 'debug****************************' 56 | #print qt 57 | #print t_question_list 58 | #print t_question_list[0] == qt[0] 59 | #print t_question_list[1] == qt[1] 60 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 61 | saveflag = True 62 
| else: 63 | saveflag = False 64 | 65 | if saveflag == True: 66 | t_iid = t_question['iid'] 67 | if mode == 'val': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 70 | elif mode == 'test-dev' or mode == 'test': 71 | t_img = Image.open(os.path.join(img_pre, \ 72 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 73 | 74 | # for caption 75 | #print t_iid 76 | #annIds = caps.getAnnIds(t_iid) 77 | #anns = caps.loadAnns(annIds) 78 | #cap_list = [ann['caption'] for ann in anns] 79 | ans_list = t_question['ans_list'] 80 | draw = ImageDraw.Draw(t_img) 81 | for i in range(len(ans_list)): 82 | try: 83 | draw.text((10,10*i), str(ans_list[i])) 84 | except: 85 | pass 86 | 87 | ans = t_question['answer'] 88 | pred = t_question['pred'] 89 | if ans == -1: 90 | pre = '' 91 | elif ans == pred: 92 | pre = 'correct ' 93 | else: 94 | pre = 'failure ' 95 | #print ' aaa ', ans, pred 96 | ans = re.sub( '/', ' ', str(ans)) 97 | pred = re.sub( '/', ' ', str(pred)) 98 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \ 99 | str(ans) + ' p_' + str(pred) + '.png' 100 | count += 1 101 | write_log(os.path.join(savepath,img_title), 'visualize_log.txt') 102 | t_img.save(os.path.join(savepath,img_title)) 103 | 104 | write_log('saving colors', 'visualize_log.txt') 105 | qt_color_list = [['what','color']] 106 | save_qtype(qt_color_list, 'colors', mode) 107 | 108 | write_log('saving what is', 'visualize_log.txt') 109 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 110 | save_qtype(qt_whatis_list, 'whatis', mode) 111 | 112 | write_log('saving is', 'visualize_log.txt') 113 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 114 | save_qtype(qt_is_list, 'is', mode) 115 | 116 | write_log('saving how many', 'visualize_log.txt') 117 | qt_howmany_list =[['how','many']] 118 | save_qtype(qt_howmany_list, 'howmany', mode) 119 | 120 | def exec_validation(device_id, mode, it='', visualize=False): 121 | 122 | caffe.set_device(device_id) 123 | caffe.set_mode_gpu() 124 | net = caffe.Net('./result/proto_test.prototxt',\ 125 | './result/tmp.caffemodel',\ 126 | caffe.TEST) 127 | 128 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE) 129 | total_questions = len(dp.getQuesIds()) 130 | epoch = 0 131 | 132 | pred_list = [] 133 | testloss_list = [] 134 | stat_list = [] 135 | 136 | while epoch == 0: 137 | # t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 138 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 139 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0)) 140 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0)) 141 | net.blobs['img_feature'].data[...] = t_img_feature 142 | net.blobs['label'].data[...] = t_answer 143 | # net.blobs['glove'].data[...] 
= np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 146 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 147 | testloss_list.append(net.blobs['loss'].data) 148 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 149 | pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 150 | if visualize: 151 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 152 | if mode == 'test-dev' or mode == 'test': 153 | ans_str = '' 154 | ans_list = ['']*10 155 | else: 156 | ans_str = dp.vec_to_answer(ans) 157 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 158 | stat_list.append({\ 159 | 'qid' : qid, 160 | 'q_list' : q_list, 161 | 'iid' : iid, 162 | 'answer': ans_str, 163 | 'ans_list': ans_list, 164 | 'pred' : pred }) 165 | #percent = 100 * float(len(pred_list)) / total_questions 166 | #sys.stdout.write('\r' + ('%.2f' % percent) + '%') 167 | #sys.stdout.flush() 168 | 169 | 170 | 171 | mean_testloss = np.array(testloss_list).mean() 172 | 173 | if mode == 'val': 174 | valFile = './result/val2014_resfile' 175 | with open(valFile, 'w') as f: 176 | json.dump(pred_list, f) 177 | if visualize: 178 | visualize_failures(stat_list,mode) 179 | annFile = config.DATA_PATHS['val']['ans_file'] 180 | quesFile = config.DATA_PATHS['val']['ques_file'] 181 | vqa = VQA(annFile, quesFile) 182 | vqaRes = vqa.loadRes(valFile, quesFile) 183 | vqaEval = VQAEval(vqa, vqaRes, n=2) 184 | vqaEval.evaluate() 185 | acc_overall = vqaEval.accuracy['overall'] 186 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 187 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 188 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 189 | elif mode == 'test-dev': 190 | filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results' 191 | with open(filename+'.json', 'w') as f: 192 | json.dump(pred_list, f) 193 | if visualize: 194 | visualize_failures(stat_list,mode) 195 | elif mode == 'test': 196 | filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(pred_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | 202 | def drawgraph(results, save_question_type_graphs=False): 203 | # 0:it 204 | # 1:trainloss 205 | # 2:testloss 206 | # 3:oa_acc 207 | # 4:qt_acc 208 | # 5:at_acc 209 | 210 | # training curve 211 | it = np.array([l[0] for l in results]) 212 | loss = np.array([l[1] for l in results]) 213 | valloss = np.array([l[2] for l in results]) 214 | valacc = np.array([l[3] for l in results]) 215 | 216 | fig = plt.figure() 217 | ax1 = fig.add_subplot(111) 218 | ax2 = ax1.twinx() 219 | 220 | ax1.plot(it,loss, color='blue', label='train loss') 221 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 222 | ax2.plot(it,valacc, color='red', label='acc on val') 223 | plt.legend(loc='lower left') 224 | 225 | ax1.set_xlabel('Iterations') 226 | ax1.set_ylabel('Loss Value') 227 | ax2.set_ylabel('Accuracy on Val [%]') 228 | 229 | plt.savefig('./learning_curve max_%2.2f.png'%valacc.max()) 230 | plt.clf() 231 | plt.close("all") 232 | 233 | # question type 234 | it = np.array([l[0] for l in results]) 235 | oa_acc = np.array([l[3] for l in results]) 236 | qt_dic_list = [l[4] for l in results] 237 | 238 | def draw_qt_acc(target_key_list, figname): 239 | fig = plt.figure() 240 | for k in target_key_list: 241 | write_log(str(k) + 
str(type(k)), 'visualize_log.txt') 242 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 243 | plt.plot(it,t_val,label=str(k)) 244 | plt.legend(fontsize='small') 245 | plt.ylim(0,100.) 246 | #plt.legend(prop={'size':6}) 247 | 248 | plt.xlabel('Iterations') 249 | plt.ylabel('Accuracy on Val [%]') 250 | 251 | plt.savefig(figname,dpi=200) 252 | plt.clf() 253 | plt.close("all") 254 | 255 | if save_question_type_graphs: 256 | s_keys = sorted(qt_dic_list[0].keys()) 257 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 258 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 259 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 260 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 261 | draw_qt_acc(['what color is the','what color are the','what color is',\ 262 | 'what color','what is the color of the'],'./qt_color.png') 263 | draw_qt_acc(['how many','how','how many people are',\ 264 | 'how many people are in'],'./qt_number.png') 265 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 266 | 'which'],'./qt_who_why_where_which.png') 267 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 268 | 'is the woman','is this person','what is the woman','is the person',\ 269 | 'what is the person'],'./qt_human.png') 270 | 271 | 272 | -------------------------------------------------------------------------------- /LSTM (baseline)/write_to_log.py: -------------------------------------------------------------------------------- 1 | def write_log(str, filename): 2 | with open(filename, 'a') as f: 3 | f.write(str + "\n") 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning Convolutional Text Representations for Visual Question Answering 2 | 3 | This is the code for our SDM18 paper [Learning Convolutional Text Representations for Visual Question Answering](https://epubs.siam.org/doi/abs/10.1137/1.9781611975321.67). We used it to explore different text representation methods in VQA. The reference code is [vqa-mcb](https://github.com/akirafukui/vqa-mcb). 4 | 5 | Created by [Zhengyang Wang](http://people.tamu.edu/~zhengyang.wang/) and [Shuiwang Ji](http://people.tamu.edu/~sji/index.html) at Texas A&M University. 6 | 7 | ## Citation 8 | If you wish to cite our work, you can use the following bib for now. 9 | 10 | ``` 11 | @inproceedings{wang2018learning, 12 | title={Learning Convolutional Text Representations for Visual Question Answering}, 13 | author={Wang, Zhengyang and Ji, Shuiwang}, 14 | booktitle={Proceedings of the 2018 SIAM International Conference on Data Mining}, 15 | pages={594--602}, 16 | year={2018}, 17 | organization={SIAM} 18 | } 19 | ``` 20 | 21 | ## Instructions 22 | 23 | To replicate our results, do the following prerequisites as in [vqa-mcb](https://github.com/akirafukui/vqa-mcb): 24 | 25 | - Compile the `feature/20160617_cb_softattention` branch of [this fork of Caffe](https://github.com/akirafukui/caffe/). This branch contains Yang Gao’s Compact Bilinear layers ([dedicated repo](https://github.com/gy20073/compact_bilinear_pooling), [paper](https://arxiv.org/abs/1511.06062)) released under the [BDD license](https://github.com/gy20073/compact_bilinear_pooling/blob/master/caffe-20160312/LICENSE_BDD), and Ronghang Hu’s Soft Attention layers ([paper](https://arxiv.org/abs/1511.03745)) released under BSD 2-clause. 
26 | - Download the [pre-trained ResNet-152 model](https://github.com/KaimingHe/deep-residual-networks). 27 | - Download the [VQA tools](https://github.com/VT-vision-lab/VQA). 28 | - Download the [VQA real-image dataset](http://visualqa.org/download.html). 29 | - Do the [data preprocessing](https://github.com/akirafukui/vqa-mcb/tree/master/preprocess). 30 | 31 | **Note:** As explained in our paper, we did not use any additional data such as "GloVe" and "Visual Genome". 32 | 33 | To train and test a model, edit the corresponding `config.py` and `qlstm_solver.prototxt` files. 34 | 35 | **Note:** Unlike [vqa-mcb](https://github.com/akirafukui/vqa-mcb), in our experiments, different methods require different data provider layers. Use the `vqa_data_provider_layer.py` and `visualize_tools.py` in the same folder as the training script. 36 | 37 | In `config.py`, set `GPU_ID` and `VALIDATE_INTERVAL` (the number of training iterations) properly. 38 | 39 | **Note:** As stated in our paper, we trained only on the training set and tested on the validation set. The code has been modified to do training and testing automatically if you set `VALIDATE_INTERVAL` to the number of iterations for training. The pre-set number is the one we used in our results. In our experiments, we split the original training set into a new training set and a validation set, and used early stopping to determine this number. Then we used this code to train our model on all of the training data. 40 | 41 | In `qlstm_solver.prototxt`, set `snapshot` and `snapshot_prefix` correctly. 42 | 43 | Now just run `python train_att_bc.py` in the chosen model's folder. Training can take some time. Snapshots are saved according to the settings in `qlstm_solver.prototxt`. To stop training, just hit `Control + C`. 44 | -------------------------------------------------------------------------------- /fastText (char+word)/config.py: -------------------------------------------------------------------------------- 1 | GPU_ID = 9 2 | BATCH_SIZE = 32 3 | VAL_BATCH_SIZE = 32 4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size 5 | MAX_WORDS_IN_QUESTION = 22 6 | LENGTH_OF_LONGEST_WORD = 17 7 | #MAX_CHARS_IN_QUESTION = 100 8 | MAX_ITERATIONS = 1000000 9 | PRINT_INTERVAL = 1000 10 | VALIDATE_INTERVAL = 30000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy. 
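# Hedged sketch of what the two question-shape constants above suggest for
# this char+word model; the exact blob shapes come from this folder's
# vqa_data_provider_layer.py and are illustrative assumptions here, not read
# from that file:
#
#   word ids : (BATCH_SIZE, MAX_WORDS_IN_QUESTION)             = (32, 22)
#   char ids : (BATCH_SIZE, MAX_WORDS_IN_QUESTION,
#               LENGTH_OF_LONGEST_WORD)                        = (32, 22, 17)
#
# i.e. each question is padded/cropped to 22 word slots, and each word to 17
# characters, before the embedding layers.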
11 | 
12 | # what data to use for training
13 | TRAIN_DATA_SPLITS = 'train'
14 | 
15 | # what data to use for the vocabulary
16 | QUESTION_VOCAB_SPACE = 'train'
17 | ANSWER_VOCAB_SPACE = 'train'
18 | 
19 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
20 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools'
21 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools'
22 | 
23 | # location of the data
24 | VQA_PREFIX = '/tempspace/zwang6/VQA/'
25 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/'
26 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/'
27 | 
28 | DATA_PATHS = {
29 |     'train': {
30 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
31 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
32 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_'
33 |     },
34 |     'val': {
35 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
36 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
37 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_'
38 |     },
39 |     'test-dev': {
40 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
41 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
42 |     },
43 |     'test': {
44 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
45 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
46 |     },
47 |     # TODO it would be nice if genome also followed the same file format as vqa
48 |     'genome': {
49 |         'genome_file': GENOME_PREFIX + '/question_answers_prepro.json',
50 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/'
51 |     }
52 | }
53 | 
--------------------------------------------------------------------------------
/fastText (char+word)/qlstm_solver.prototxt:
--------------------------------------------------------------------------------
1 | # The train/test net protocol buffer definition
2 | train_net: "./result/proto_train.prototxt"
3 | test_net: "./result/proto_test.prototxt"
4 | 
5 | max_iter: 1000000
6 | display: 5000
7 | snapshot: 5000
8 | snapshot_prefix: "./result/"
9 | 
10 | # The base learning rate, momentum and the weight decay of the network.
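# Note on the settings below (standard Caffe solver semantics; this comment is not in
# the original file): with solver_type ADAM, "momentum" and "momentum2" are Adam's
# beta1 and beta2, and "iter_size: 2" accumulates gradients over two mini-batches
# before each update, so the effective batch size is BATCH_SIZE (32 in config.py) x 2 = 64.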
11 | solver_type: ADAM
12 | base_lr: 0.0007
13 | momentum: 0.9
14 | momentum2: 0.999
15 | weight_decay: 0.000
16 | lr_policy: "fixed"
17 | test_iter: 1
18 | test_interval: 10000000
19 | 
20 | # accumulate gradients
21 | iter_size: 2
22 | 
--------------------------------------------------------------------------------
/fastText (char+word)/visualize_tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import os
4 | import sys
5 | import json
6 | import re
7 | import shutil
8 | from PIL import Image
9 | from PIL import ImageFont, ImageDraw
10 | 
11 | import caffe
12 | from caffe import layers as L
13 | from caffe import params as P
14 | 
15 | from vqa_data_provider_layer import VQADataProvider
16 | from vqa_data_provider_layer import VQADataProviderLayer
17 | 
18 | import config
19 | sys.path.append(config.VQA_TOOLS_PATH)
20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH)
21 | 
22 | from vqaTools.vqa import VQA
23 | from vqaEvaluation.vqaEval import VQAEval
24 | 
25 | from write_to_log import write_log
26 | 
27 | def visualize_failures(stat_list,mode):
28 | 
29 |     def save_qtype(qtype_list, save_filename, mode):
30 | 
31 |         if mode == 'val':
32 |             savepath = os.path.join('./eval', save_filename)
33 |             # TODO
34 |             img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014'
35 |         elif mode == 'test-dev':
36 |             savepath = os.path.join('./test-dev', save_filename)
37 |             # TODO
38 |             img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
39 |         elif mode == 'test':
40 |             savepath = os.path.join('./test', save_filename)
41 |             # TODO
42 |             img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
43 |         else:
44 |             raise Exception('Unsupported mode')
45 |         if os.path.exists(savepath): shutil.rmtree(savepath)
46 |         if not os.path.exists(savepath): os.makedirs(savepath)
47 | 
48 |         for qt in qtype_list:
49 |             count = 0
50 |             for t_question in stat_list:
51 |                 #print count, t_question
52 |                 if count < 40/len(qtype_list):
53 |                     t_question_list = t_question['q_list']
54 |                     saveflag = False
55 |                     #print 'debug****************************'
56 |                     #print qt
57 |                     #print t_question_list
58 |                     #print t_question_list[0] == qt[0]
59 |                     #print t_question_list[1] == qt[1]
60 |                     if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
61 |                         saveflag = True
62 |                     else:
63 |                         saveflag = False
64 | 
65 |                     if saveflag == True:
66 |                         t_iid = t_question['iid']
67 |                         if mode == 'val':
68 |                             t_img = Image.open(os.path.join(img_pre, \
69 |                                 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
70 |                         elif mode in ('test-dev', 'test'):
71 |                             t_img = Image.open(os.path.join(img_pre, \
72 |                                 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))
73 | 
74 |                         # for caption
75 |                         #print t_iid
76 |                         #annIds = caps.getAnnIds(t_iid)
77 |                         #anns = caps.loadAnns(annIds)
78 |                         #cap_list = [ann['caption'] for ann in anns]
79 |                         ans_list = t_question['ans_list']
80 |                         draw = ImageDraw.Draw(t_img)
81 |                         for i in range(len(ans_list)):
82 |                             try:
83 |                                 draw.text((10,10*i), str(ans_list[i]))
84 |                             except Exception:  # skip answers the default font cannot render
85 |                                 pass
86 | 
87 |                         ans = t_question['answer']
88 |                         pred = t_question['pred']
89 |                         if ans == -1:
90 |                             pre = ''
91 |                         elif ans == pred:
92 |                             pre = 'correct '
93 |                         else:
94 |                             pre = 'failure '
95 |                         #print ' aaa ', ans, pred
96 |                         ans = re.sub( '/', ' ', str(ans))
97 |                         pred = re.sub( '/', ' ', str(pred))
98 |                         img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
99 |                             str(ans) + ' p_' + str(pred) + '.png'
100 |                         count += 1
101 |                         write_log(os.path.join(savepath,img_title), 'visualize_log.txt')
102 |                         t_img.save(os.path.join(savepath,img_title))
103 | 
104 |     write_log('saving colors', 'visualize_log.txt')
105 |     qt_color_list = [['what','color']]
106 |     save_qtype(qt_color_list, 'colors', mode)
107 | 
108 |     write_log('saving what is', 'visualize_log.txt')
109 |     qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
110 |     save_qtype(qt_whatis_list, 'whatis', mode)
111 | 
112 |     write_log('saving is', 'visualize_log.txt')
113 |     qt_is_list = [['is','the'], ['is','this'],['is','there']]
114 |     save_qtype(qt_is_list, 'is', mode)
115 | 
116 |     write_log('saving how many', 'visualize_log.txt')
117 |     qt_howmany_list =[['how','many']]
118 |     save_qtype(qt_howmany_list, 'howmany', mode)
119 | 
120 | def exec_validation(device_id, mode, it='', visualize=False):
121 | 
122 |     caffe.set_device(device_id)
123 |     caffe.set_mode_gpu()
124 |     net = caffe.Net('./result/proto_test.prototxt',\
125 |                     './result/tmp.caffemodel',\
126 |                     caffe.TEST)
127 | 
128 |     dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE)
129 |     total_questions = len(dp.getQuesIds())
130 |     epoch = 0
131 | 
132 |     pred_list = []
133 |     testloss_list = []
134 |     stat_list = []
135 | 
136 |     while epoch == 0:
137 |         t_word, t_cont, t_word_c, t_cont_c, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
138 |         net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0))
139 |         net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0))
140 |         net.blobs['data1'].data[...] = t_word_c
141 |         net.blobs['cont1'].data[...] = t_cont_c
142 |         net.blobs['img_feature'].data[...] = t_img_feature
143 |         net.blobs['label'].data[...] = t_answer
144 |         #net.blobs['glove'].data[...] = t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2))
145 |         net.forward()
146 |         t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
147 |         t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
148 |         testloss_list.append(net.blobs['loss'].data)
149 |         for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
150 |             pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))})
151 |             if visualize:
152 |                 q_list = dp.seq_to_list(dp.getQuesStr(qid))
153 |                 if mode in ('test-dev', 'test'):
154 |                     ans_str = ''
155 |                     ans_list = ['']*10
156 |                 else:
157 |                     ans_str = dp.vec_to_answer(ans)
158 |                     ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
159 |                 stat_list.append({\
160 |                     'qid' : qid,
161 |                     'q_list' : q_list,
162 |                     'iid' : iid,
163 |                     'answer': ans_str,
164 |                     'ans_list': ans_list,
165 |                     'pred' : pred })
166 |         #percent = 100 * float(len(pred_list)) / total_questions
167 |         #sys.stdout.write('\r' + ('%.2f' % percent) + '%')
168 |         #sys.stdout.flush()
169 | 
170 | 
171 | 
172 |     mean_testloss = np.array(testloss_list).mean()
173 | 
174 |     if mode == 'val':
175 |         valFile = './result/val2014_resfile'
176 |         with open(valFile, 'w') as f:
177 |             json.dump(pred_list, f)
178 |         if visualize:
179 |             visualize_failures(stat_list,mode)
180 |         annFile = config.DATA_PATHS['val']['ans_file']
181 |         quesFile = config.DATA_PATHS['val']['ques_file']
182 |         vqa = VQA(annFile, quesFile)
183 |         vqaRes = vqa.loadRes(valFile, quesFile)
184 |         vqaEval = VQAEval(vqa, vqaRes, n=2)
185 |         vqaEval.evaluate()
186 |         acc_overall = vqaEval.accuracy['overall']
187 |         acc_perQuestionType = vqaEval.accuracy['perQuestionType']
188 |         acc_perAnswerType = vqaEval.accuracy['perAnswerType']
189 |         return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
190 |     elif mode == 'test-dev':
191 |         filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results'
192 |         with open(filename+'.json', 'w') as f:
193 |             json.dump(pred_list, f)
194 |         if visualize:
195 |             visualize_failures(stat_list,mode)
196 |     elif mode == 'test':
197 |         filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results'
198 |         with open(filename+'.json', 'w') as f:
199 |             json.dump(pred_list, f)
200 |         if visualize:
201 |             visualize_failures(stat_list,mode)
202 | 
203 | def drawgraph(results, save_question_type_graphs=False):
204 |     # 0:it
205 |     # 1:trainloss
206 |     # 2:testloss
207 |     # 3:oa_acc
208 |     # 4:qt_acc
209 |     # 5:at_acc
210 | 
211 |     # training curve
212 |     it = np.array([l[0] for l in results])
213 |     loss = np.array([l[1] for l in results])
214 |     valloss = np.array([l[2] for l in results])
215 |     valacc = np.array([l[3] for l in results])
216 | 
217 |     fig = plt.figure()
218 |     ax1 = fig.add_subplot(111)
219 |     ax2 = ax1.twinx()
220 | 
221 |     ax1.plot(it,loss, color='blue', label='train loss')
222 |     ax1.plot(it,valloss, '--', color='blue', label='test loss')
223 |     ax2.plot(it,valacc, color='red', label='acc on val')
224 |     plt.legend(loc='lower left')
225 | 
226 |     ax1.set_xlabel('Iterations')
227 |     ax1.set_ylabel('Loss Value')
228 |     ax2.set_ylabel('Accuracy on Val [%]')
229 | 
230 |     plt.savefig('./learning_curve max_%2.2f.png'%valacc.max())
231 |     plt.clf()
232 |     plt.close("all")
233 | 
234 |     # question type
235 |     it = np.array([l[0] for l in results])
236 |     oa_acc = np.array([l[3] for l in results])
237 |     qt_dic_list = [l[4] for l in results]
238 | 
239 |     def draw_qt_acc(target_key_list, figname):
240 |         fig = plt.figure()
241 |         for k in target_key_list:
242 |             write_log(str(k) + str(type(k)), 'visualize_log.txt')
243 |             t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list])
244 |             plt.plot(it,t_val,label=str(k))
245 |         plt.legend(fontsize='small')
246 |         plt.ylim(0,100.)
247 |         #plt.legend(prop={'size':6})
248 | 
249 |         plt.xlabel('Iterations')
250 |         plt.ylabel('Accuracy on Val [%]')
251 | 
252 |         plt.savefig(figname,dpi=200)
253 |         plt.clf()
254 |         plt.close("all")
255 | 
256 |     if save_question_type_graphs:
257 |         s_keys = sorted(qt_dic_list[0].keys())
258 |         draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png')
259 |         draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
260 |         draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
261 |         draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
262 |         draw_qt_acc(['what color is the','what color are the','what color is',\
263 |                 'what color','what is the color of the'],'./qt_color.png')
264 |         draw_qt_acc(['how many','how','how many people are',\
265 |                 'how many people are in'],'./qt_number.png')
266 |         draw_qt_acc(['who is','why','why is the','where is the','where are the',\
267 |                 'which'],'./qt_who_why_where_which.png')
268 |         draw_qt_acc(['what is the man','is the man','are they','is he',\
269 |                 'is the woman','is this person','what is the woman','is the person',\
270 |                 'what is the person'],'./qt_human.png')
271 | 
272 | 
273 | 
--------------------------------------------------------------------------------
/fastText (char+word)/write_to_log.py:
--------------------------------------------------------------------------------
1 | def write_log(msg, filename):
2 |     with open(filename, 'a') as f:
3 |         f.write(msg + "\n")
4 | 
--------------------------------------------------------------------------------
/fastText (word)/config.py:
--------------------------------------------------------------------------------
1 | GPU_ID = 9
2 | BATCH_SIZE = 32
3 | VAL_BATCH_SIZE = 32
4 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
5 | MAX_WORDS_IN_QUESTION = 22 # Do not crop
6 | EMBEDDING_SIZE = 300
7 | MAX_ITERATIONS = 1000000
8 | PRINT_INTERVAL = 1000
9 | VALIDATE_INTERVAL = 40000 # We train on 'train' and test on 'val'. Set it to the number of iterations for training. Then the validation accuracy is the test accuracy.
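# Note (not part of the original file): each model folder pre-sets its own
# VALIDATE_INTERVAL (e.g. 30000 for fastText (char+word) above vs. 40000 here);
# per the README, these are the training-iteration counts the authors chose by
# early stopping on a held-out split of the training set.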
10 | 
11 | # what data to use for training
12 | TRAIN_DATA_SPLITS = 'train'
13 | 
14 | # what data to use for the vocabulary
15 | QUESTION_VOCAB_SPACE = 'train'
16 | ANSWER_VOCAB_SPACE = 'train'
17 | 
18 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
19 | VQA_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonHelperTools'
20 | VQA_EVAL_TOOLS_PATH = '/tempspace/zwang6/VQA/PythonEvaluationTools'
21 | 
22 | # location of the data
23 | VQA_PREFIX = '/tempspace/zwang6/VQA/'
24 | GENOME_PREFIX = '/tempspace/zwang6/vqa_mcb/genome/'
25 | DATA_PREFIX = '/tempspace/zwang6/vqa_mcb/vqa-mcb/preprocess/'
26 | 
27 | DATA_PATHS = {
28 |     'train': {
29 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
30 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
31 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/train2014/COCO_train2014_'
32 |     },
33 |     'val': {
34 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
35 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
36 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/val2014/COCO_val2014_'
37 |     },
38 |     'test-dev': {
39 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
40 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
41 |     },
42 |     'test': {
43 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
44 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/test2015/COCO_test2015_'
45 |     },
46 |     # TODO it would be nice if genome also followed the same file format as vqa
47 |     'genome': {
48 |         'genome_file': GENOME_PREFIX + '/question_answers_prepro.json',
49 |         'features_prefix': DATA_PREFIX + '/image_features/resnet_res5c_bgrms_large/selected/'
50 |     }
51 | }
52 | 
--------------------------------------------------------------------------------
/fastText (word)/qlstm_solver.prototxt:
--------------------------------------------------------------------------------
1 | # The train/test net protocol buffer definition
2 | train_net: "./result/proto_train.prototxt"
3 | test_net: "./result/proto_test.prototxt"
4 | 
5 | max_iter: 1000000
6 | display: 5000
7 | snapshot: 5000
8 | snapshot_prefix: "./result/"
9 | 
10 | # The base learning rate, momentum and the weight decay of the network.
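# Note on this solver (standard Caffe semantics; this comment is not in the original
# file): "average_loss: 64" at the end of the file makes the displayed loss a running
# average over the last 64 iterations, which smooths the logged training curve, and
# test_interval is set far beyond max_iter so Caffe's built-in testing effectively
# never runs -- validation is driven by VALIDATE_INTERVAL in config.py instead.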
11 | solver_type: ADAM
12 | base_lr: 0.0007
13 | momentum: 0.9
14 | momentum2: 0.999
15 | weight_decay: 0.000
16 | lr_policy: "fixed"
17 | test_iter: 1
18 | test_interval: 10000000
19 | 
20 | # accumulate gradients
21 | iter_size: 2
22 | 
23 | average_loss: 64
--------------------------------------------------------------------------------
/fastText (word)/train_att_bc.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import os
4 | import sys
5 | import numpy as np
6 | import json
7 | import matplotlib.pyplot as plt
8 | from write_to_log import write_log
9 | 
10 | import caffe
11 | from caffe import layers as L
12 | from caffe import params as P
13 | 
14 | from vqa_data_provider_layer import VQADataProvider
15 | from visualize_tools import exec_validation, drawgraph
16 | import config
17 | 
18 | 
19 | def qlstm(mode, batchsize, T, question_vocab_size, embed_size):
20 |     n = caffe.NetSpec()
21 |     mode_str = json.dumps({'mode':mode, 'batchsize':batchsize})
22 |     n.data, n.cont, n.img_feature, n.label = L.Python(\
23 |         module='vqa_data_provider_layer', layer='VQADataProviderLayer', param_str=mode_str, ntop=4 )
24 | 
25 |     # word embedding
26 |     n.embed_ba = L.Embed(n.data, input_dim=question_vocab_size, num_output=embed_size, \
27 |         weight_filler=dict(type='uniform',min=-0.08,max=0.08))
28 |     n.embed_scale = L.Scale(n.embed_ba, n.cont, scale_param=dict(axis=0)) # N x T x d_w; zeroes padded positions
29 |     n.embed_scale_resh = L.Reshape(n.embed_scale,reshape_param=dict(shape=dict(dim=[batchsize,T,embed_size,1])))
30 | 
31 |     # sum of word embeddings (bag-of-words question encoding): the frozen all-ones 1x1 conv sums over the T word channels
32 |     n.embed_avg = L.Convolution(n.embed_scale_resh, convolution_param={'kernel_size': 1, 'num_output': 1, 'bias_term': False, 'weight_filler': dict(type='constant', value=1)}, param=dict(lr_mult=0, decay_mult=0)) # N x 1 x d_w x 1
33 |     n.embed_avg_resh = L.Reshape(n.embed_avg,reshape_param=dict(shape=dict(dim=[batchsize,embed_size,1,1])))
34 | 
35 |     n.q_emb_tanh_droped_resh_tiled_1 = L.Tile(n.embed_avg_resh, axis=2, tiles=14)
36 |     n.q_emb_tanh_droped_resh_tiled = L.Tile(n.q_emb_tanh_droped_resh_tiled_1, axis=3, tiles=14)
37 |     n.i_emb_tanh_droped_resh = L.Reshape(n.img_feature,reshape_param=dict(shape=dict(dim=[-1,2048,14,14])))
38 |     n.blcf = L.CompactBilinear(n.q_emb_tanh_droped_resh_tiled, n.i_emb_tanh_droped_resh, compact_bilinear_param=dict(num_output=16000,sum_pool=False))
39 |     n.blcf_sign_sqrt = L.SignedSqrt(n.blcf)
40 |     n.blcf_sign_sqrt_l2 = L.L2Normalize(n.blcf_sign_sqrt)
41 |     n.blcf_droped = L.Dropout(n.blcf_sign_sqrt_l2,dropout_param={'dropout_ratio':0.1})
42 | 
43 |     # multi-channel attention
44 |     n.att_conv1 = L.Convolution(n.blcf_droped, kernel_size=1, stride=1, num_output=512, pad=0, weight_filler=dict(type='xavier'))
45 |     n.att_conv1_relu = L.ReLU(n.att_conv1)
46 |     n.att_conv2 = L.Convolution(n.att_conv1_relu, kernel_size=1, stride=1, num_output=2, pad=0, weight_filler=dict(type='xavier'))
47 |     n.att_reshaped = L.Reshape(n.att_conv2,reshape_param=dict(shape=dict(dim=[-1,2,14*14])))
48 |     n.att_softmax = L.Softmax(n.att_reshaped, axis=2)
49 |     n.att = L.Reshape(n.att_softmax,reshape_param=dict(shape=dict(dim=[-1,2,14,14])))
50 |     att_maps = L.Slice(n.att, ntop=2, slice_param={'axis':1})
51 |     n.att_map0 = att_maps[0]
52 |     n.att_map1 = att_maps[1]
53 |     dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
54 |     n.att_feature0 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map0, dummy)
55 |     n.att_feature1 = L.SoftAttention(n.i_emb_tanh_droped_resh, n.att_map1, dummy)
56 |     n.att_feature0_resh = L.Reshape(n.att_feature0, reshape_param=dict(shape=dict(dim=[-1,2048])))
57 |     n.att_feature1_resh = L.Reshape(n.att_feature1, reshape_param=dict(shape=dict(dim=[-1,2048])))
58 |     n.att_feature = L.Concat(n.att_feature0_resh, n.att_feature1_resh)
59 | 
60 |     # merge attention features and the question encoding with compact bilinear pooling
61 |     n.att_feature_resh = L.Reshape(n.att_feature, reshape_param=dict(shape=dict(dim=[-1,4096,1,1])))
62 |     #n.lstm_12_resh = L.Reshape(n.lstm_12, reshape_param=dict(shape=dict(dim=[-1,2048,1,1])))
63 |     n.bc_att_lstm = L.CompactBilinear(n.att_feature_resh, n.embed_avg_resh,
64 |                                       compact_bilinear_param=dict(num_output=16000,sum_pool=False))
65 |     n.bc_sign_sqrt = L.SignedSqrt(n.bc_att_lstm)
66 |     n.bc_sign_sqrt_l2 = L.L2Normalize(n.bc_sign_sqrt)
67 | 
68 |     n.bc_dropped = L.Dropout(n.bc_sign_sqrt_l2, dropout_param={'dropout_ratio':0.1})
69 |     n.bc_dropped_resh = L.Reshape(n.bc_dropped, reshape_param=dict(shape=dict(dim=[-1, 16000])))
70 | 
71 |     n.prediction = L.InnerProduct(n.bc_dropped_resh, num_output=3000, weight_filler=dict(type='xavier'))
72 |     n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
73 |     return n.to_proto()
74 | 
75 | def make_answer_vocab(adic, vocab_size):
76 |     """
77 |     Returns a dictionary that maps answers to indices.
78 |     """
79 |     adict = {'':0}
80 |     nadict = {'':1000000}
81 |     vid = 1
82 |     for qid in adic.keys():
83 |         answer_obj = adic[qid]
84 |         answer_list = [ans['answer'] for ans in answer_obj]
85 | 
86 |         for q_ans in answer_list:
87 |             # create dict
88 |             if adict.has_key(q_ans):
89 |                 nadict[q_ans] += 1
90 |             else:
91 |                 nadict[q_ans] = 1
92 |                 adict[q_ans] = vid
93 |                 vid +=1
94 | 
95 |     # debug
96 |     nalist = []
97 |     for k,v in sorted(nadict.items(), key=lambda x:x[1]):
98 |         nalist.append((k,v))
99 | 
100 |     # keep only the vocab_size most frequent answers and drop the rest
101 |     n_del_ans = 0
102 |     n_valid_ans = 0
103 |     adict_nid = {}
104 |     for i, w in enumerate(nalist[:-vocab_size]):
105 |         del adict[w[0]]
106 |         n_del_ans += w[1]
107 |     for i, w in enumerate(nalist[-vocab_size:]):
108 |         n_valid_ans += w[1]
109 |         adict_nid[w[0]] = i
110 | 
111 |     return adict_nid
112 | 
113 | def make_question_vocab(qdic):
114 |     """
115 |     Returns a dictionary that maps words to indices.
116 |     """
117 |     vdict = {'':0}
118 |     vid = 1
119 |     for qid in qdic.keys():
120 |         # sequence to list
121 |         q_str = qdic[qid]['qstr']
122 |         q_list = VQADataProvider.seq_to_list(q_str)
123 | 
124 |         # create dict
125 |         for w in q_list:
126 |             if not vdict.has_key(w):
127 |                 vdict[w] = vid
128 |                 vid +=1
129 | 
130 |     return vdict
131 | 
132 | def make_vocab_files():
133 |     """
134 |     Produce the question and answer vocabulary files.
135 |     """
136 |     write_log('making question vocab... ' + config.QUESTION_VOCAB_SPACE, 'log.txt')
137 |     qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE)
138 |     question_vocab = make_question_vocab(qdic)
139 |     write_log('making answer vocab... ' + config.ANSWER_VOCAB_SPACE, 'log.txt')
140 |     _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE)
141 |     answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS)
142 |     return question_vocab, answer_vocab
143 | 
144 | def main():
145 |     if not os.path.exists('./result'):
146 |         os.makedirs('./result')
147 | 
148 |     question_vocab, answer_vocab = {}, {}
149 |     if os.path.exists('./result/vdict.json') and os.path.exists('./result/adict.json'):
150 |         write_log('restoring vocab', 'log.txt')
151 |         with open('./result/vdict.json','r') as f:
152 |             question_vocab = json.load(f)
153 |         with open('./result/adict.json','r') as f:
154 |             answer_vocab = json.load(f)
155 |     else:
156 |         question_vocab, answer_vocab = make_vocab_files()
157 |         with open('./result/vdict.json','w') as f:
158 |             json.dump(question_vocab, f)
159 |         with open('./result/adict.json','w') as f:
160 |             json.dump(answer_vocab, f)
161 | 
162 |     write_log('question vocab size: '+ str(len(question_vocab)), 'log.txt')
163 |     write_log('answer vocab size: '+ str(len(answer_vocab)), 'log.txt')
164 | 
165 |     with open('./result/proto_train.prototxt', 'w') as f:
166 |         f.write(str(qlstm(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \
167 |             config.MAX_WORDS_IN_QUESTION, len(question_vocab), config.EMBEDDING_SIZE)))
168 | 
169 |     with open('./result/proto_test.prototxt', 'w') as f:
170 |         f.write(str(qlstm('val', config.VAL_BATCH_SIZE, \
171 |             config.MAX_WORDS_IN_QUESTION, len(question_vocab), config.EMBEDDING_SIZE)))
172 | 
173 |     caffe.set_device(config.GPU_ID)
174 |     caffe.set_mode_gpu()
175 |     solver = caffe.get_solver('./qlstm_solver.prototxt')
176 | 
177 |     train_loss = np.zeros(config.MAX_ITERATIONS)
178 |     # results = []
179 | 
180 |     for it in range(config.MAX_ITERATIONS):
181 |         solver.step(1)
182 | 
183 |         # store the train loss
184 |         train_loss[it] = solver.net.blobs['loss'].data
185 | 
186 |         if it != 0 and it % config.PRINT_INTERVAL == 0:
187 |             write_log('------------------------------------', 'log.txt')
188 |             write_log('Iteration: ' + str(it), 'log.txt')
189 |             c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean()
190 |             write_log('Train loss: ' + str(c_mean_loss), 'log.txt')
191 |         if it != 0 and it % config.VALIDATE_INTERVAL == 0: # actually test
192 |             solver.test_nets[0].save('./result/tmp.caffemodel')
193 |             write_log('Validating...', 'log.txt')
194 |             test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.GPU_ID, 'val', it=it)
195 |             write_log('Iteration: ' + str(it), 'log.txt')
196 |             write_log('Test loss: ' + str(test_loss), 'log.txt')
197 |             write_log('Overall Accuracy: ' + str(acc_overall), 'log.txt')
198 |             write_log('Per Question Type Accuracy is the following:', 'log.txt')
199 |             for quesType in acc_per_ques:
200 |                 write_log("%s : %.02f" % (quesType, acc_per_ques[quesType]), 'log.txt')
201 |             write_log('Per Answer Type Accuracy is the following:', 'log.txt')
202 |             for ansType in acc_per_ans:
203 |                 write_log("%s : %.02f" % (ansType, acc_per_ans[ansType]), 'log.txt')
204 |             # results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans])
205 |             # best_result_idx = np.array([x[3] for x in results]).argmax()
206 |             # write_log('Best accuracy of ' + str(results[best_result_idx][3]) + ' was at iteration ' + str(results[best_result_idx][0]), 'log.txt')
207 |             # drawgraph(results)
208 | 
209 | if __name__ == '__main__':
210 |     main()
211 | 
--------------------------------------------------------------------------------
/fastText (word)/visualize_tools.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import os
4 | import sys
5 | import json
6 | import re
7 | import shutil
8 | from PIL import Image
9 | from PIL import ImageFont, ImageDraw
10 | 
11 | import caffe
12 | from caffe import layers as L
13 | from caffe import params as P
14 | 
15 | from vqa_data_provider_layer import VQADataProvider
16 | from vqa_data_provider_layer import VQADataProviderLayer
17 | 
18 | import config
19 | sys.path.append(config.VQA_TOOLS_PATH)
20 | sys.path.append(config.VQA_EVAL_TOOLS_PATH)
21 | 
22 | from vqaTools.vqa import VQA
23 | from vqaEvaluation.vqaEval import VQAEval
24 | 
25 | from write_to_log import write_log
26 | 
27 | def visualize_failures(stat_list,mode):
28 | 
29 |     def save_qtype(qtype_list, save_filename, mode):
30 | 
31 |         if mode == 'val':
32 |             savepath = os.path.join('./eval', save_filename)
33 |             # TODO
34 |             img_pre = '/tempspace/zwang6/VQA/Images/mscoco/val2014'
35 |         elif mode == 'test-dev':
36 |             savepath = os.path.join('./test-dev', save_filename)
37 |             # TODO
38 |             img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
39 |         elif mode == 'test':
40 |             savepath = os.path.join('./test', save_filename)
41 |             # TODO
42 |             img_pre = '/tempspace/zwang6/VQA/Images/mscoco/test2015'
43 |         else:
44 |             raise Exception('Unsupported mode')
45 |         if os.path.exists(savepath): shutil.rmtree(savepath)
46 |         if not os.path.exists(savepath): os.makedirs(savepath)
47 | 
48 |         for qt in qtype_list:
49 |             count = 0
50 |             for t_question in stat_list:
51 |                 #print count, t_question
52 |                 if count < 40/len(qtype_list):
53 |                     t_question_list = t_question['q_list']
54 |                     saveflag = False
55 |                     #print 'debug****************************'
56 |                     #print qt
57 |                     #print t_question_list
58 |                     #print t_question_list[0] == qt[0]
59 |                     #print t_question_list[1] == qt[1]
60 |                     if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
61 |                         saveflag = True
62 |                     else:
63 |                         saveflag = False
64 | 
65 |                     if saveflag == True:
66 |                         t_iid = t_question['iid']
67 |                         if mode == 'val':
68 |                             t_img = Image.open(os.path.join(img_pre, \
69 |                                 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
70 |                         elif mode in ('test-dev', 'test'):
71 |                             t_img = Image.open(os.path.join(img_pre, \
72 |                                 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))
73 | 
74 |                         # for caption
75 |                         #print t_iid
76 |                         #annIds = caps.getAnnIds(t_iid)
77 |                         #anns = caps.loadAnns(annIds)
78 |                         #cap_list = [ann['caption'] for ann in anns]
79 |                         ans_list = t_question['ans_list']
80 |                         draw = ImageDraw.Draw(t_img)
81 |                         for i in range(len(ans_list)):
82 |                             try:
83 |                                 draw.text((10,10*i), str(ans_list[i]))
84 |                             except Exception:  # skip answers the default font cannot render
85 |                                 pass
86 | 
87 |                         ans = t_question['answer']
88 |                         pred = t_question['pred']
89 |                         if ans == -1:
90 |                             pre = ''
91 |                         elif ans == pred:
92 |                             pre = 'correct '
93 |                         else:
94 |                             pre = 'failure '
95 |                         #print ' aaa ', ans, pred
96 |                         ans = re.sub( '/', ' ', str(ans))
97 |                         pred = re.sub( '/', ' ', str(pred))
98 |                         img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
99 |                             str(ans) + ' p_' + str(pred) + '.png'
100 |                         count += 1
101 |                         write_log(os.path.join(savepath,img_title), 'visualize_log.txt')
102 |                         t_img.save(os.path.join(savepath,img_title))
103 | 
104 |     write_log('saving colors', 'visualize_log.txt')
105 |     qt_color_list = [['what','color']]
106 |     save_qtype(qt_color_list, 'colors', mode)
107 | 
108 |     write_log('saving what is', 'visualize_log.txt')
109 |     qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
110 |     save_qtype(qt_whatis_list, 'whatis', mode)
111 | 
112 |     write_log('saving is', 'visualize_log.txt')
113 |     qt_is_list = [['is','the'], ['is','this'],['is','there']]
114 |     save_qtype(qt_is_list, 'is', mode)
115 | 
116 |     write_log('saving how many', 'visualize_log.txt')
117 |     qt_howmany_list =[['how','many']]
118 |     save_qtype(qt_howmany_list, 'howmany', mode)
119 | 
120 | def exec_validation(device_id, mode, it='', visualize=False):
121 | 
122 |     caffe.set_device(device_id)
123 |     caffe.set_mode_gpu()
124 |     net = caffe.Net('./result/proto_test.prototxt',\
125 |                     './result/tmp.caffemodel',\
126 |                     caffe.TEST)
127 | 
128 |     dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE)
129 |     total_questions = len(dp.getQuesIds())
130 |     epoch = 0
131 | 
132 |     pred_list = []
133 |     testloss_list = []
134 |     stat_list = []
135 | 
136 |     while epoch == 0:
137 |         t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
138 |         net.blobs['data'].data[...] = t_word # np.transpose(t_word,(1,0))
139 |         net.blobs['cont'].data[...] = t_cont # np.transpose(t_cont,(1,0))
140 |         net.blobs['img_feature'].data[...] = t_img_feature
141 |         net.blobs['label'].data[...] = t_answer
142 |         #net.blobs['glove'].data[...] = t_glove_matrix # np.transpose(t_glove_matrix, (1,0,2))
143 |         net.forward()
144 |         t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
145 |         t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
146 |         testloss_list.append(net.blobs['loss'].data)
147 |         for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
148 |             pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))})
149 |             if visualize:
150 |                 q_list = dp.seq_to_list(dp.getQuesStr(qid))
151 |                 if mode in ('test-dev', 'test'):
152 |                     ans_str = ''
153 |                     ans_list = ['']*10
154 |                 else:
155 |                     ans_str = dp.vec_to_answer(ans)
156 |                     ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
157 |                 stat_list.append({\
158 |                     'qid' : qid,
159 |                     'q_list' : q_list,
160 |                     'iid' : iid,
161 |                     'answer': ans_str,
162 |                     'ans_list': ans_list,
163 |                     'pred' : pred })
164 |         #percent = 100 * float(len(pred_list)) / total_questions
165 |         #sys.stdout.write('\r' + ('%.2f' % percent) + '%')
166 |         #sys.stdout.flush()
167 | 
168 | 
169 | 
170 |     mean_testloss = np.array(testloss_list).mean()
171 | 
172 |     if mode == 'val':
173 |         valFile = './result/val2014_resfile'
174 |         with open(valFile, 'w') as f:
175 |             json.dump(pred_list, f)
176 |         if visualize:
177 |             visualize_failures(stat_list,mode)
178 |         annFile = config.DATA_PATHS['val']['ans_file']
179 |         quesFile = config.DATA_PATHS['val']['ques_file']
180 |         vqa = VQA(annFile, quesFile)
181 |         vqaRes = vqa.loadRes(valFile, quesFile)
182 |         vqaEval = VQAEval(vqa, vqaRes, n=2)
183 |         vqaEval.evaluate()
184 |         acc_overall = vqaEval.accuracy['overall']
185 |         acc_perQuestionType = vqaEval.accuracy['perQuestionType']
186 |         acc_perAnswerType = vqaEval.accuracy['perAnswerType']
187 |         return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
188 |     elif mode == 'test-dev':
189 |         filename = './result/vqa_OpenEnded_mscoco_test-dev2015_v3t'+str(it).zfill(8)+'_results'
190 |         with open(filename+'.json', 'w') as f:
191 |             json.dump(pred_list, f)
192 |         if visualize:
193 |             visualize_failures(stat_list,mode)
194 |     elif mode == 'test':
195 |         filename = './result/vqa_OpenEnded_mscoco_test2015_v3c'+str(it).zfill(8)+'_results'
196 |         with open(filename+'.json', 'w') as f:
197 |             json.dump(pred_list, f)
198 |         if visualize:
199 |             visualize_failures(stat_list,mode)
200 | 
201 | def drawgraph(results, save_question_type_graphs=False):
202 |     # 0:it
203 |     # 1:trainloss
204 |     # 2:testloss
205 |     # 3:oa_acc
206 |     # 4:qt_acc
207 |     # 5:at_acc
208 | 
209 |     # training curve
210 |     it = np.array([l[0] for l in results])
211 |     loss = np.array([l[1] for l in results])
212 |     valloss = np.array([l[2] for l in results])
213 |     valacc = np.array([l[3] for l in results])
214 | 
215 |     fig = plt.figure()
216 |     ax1 = fig.add_subplot(111)
217 |     ax2 = ax1.twinx()
218 | 
219 |     ax1.plot(it,loss, color='blue', label='train loss')
220 |     ax1.plot(it,valloss, '--', color='blue', label='test loss')
221 |     ax2.plot(it,valacc, color='red', label='acc on val')
222 |     plt.legend(loc='lower left')
223 | 
224 |     ax1.set_xlabel('Iterations')
225 |     ax1.set_ylabel('Loss Value')
226 |     ax2.set_ylabel('Accuracy on Val [%]')
227 | 
228 |     plt.savefig('./learning_curve max_%2.2f.png'%valacc.max())
229 |     plt.clf()
230 |     plt.close("all")
231 | 
232 |     # question type
233 |     it = np.array([l[0] for l in results])
234 |     oa_acc = np.array([l[3] for l in results])
235 |     qt_dic_list = [l[4] for l in results]
236 | 
237 |     def draw_qt_acc(target_key_list, figname):
238 |         fig = plt.figure()
239 |         for k in target_key_list:
240 |             write_log(str(k) + str(type(k)), 'visualize_log.txt')
241 |             t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list])
242 |             plt.plot(it,t_val,label=str(k))
243 |         plt.legend(fontsize='small')
244 |         plt.ylim(0,100.)
245 |         #plt.legend(prop={'size':6})
246 | 
247 |         plt.xlabel('Iterations')
248 |         plt.ylabel('Accuracy on Val [%]')
249 | 
250 |         plt.savefig(figname,dpi=200)
251 |         plt.clf()
252 |         plt.close("all")
253 | 
254 |     if save_question_type_graphs:
255 |         s_keys = sorted(qt_dic_list[0].keys())
256 |         draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png')
257 |         draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
258 |         draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
259 |         draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
260 |         draw_qt_acc(['what color is the','what color are the','what color is',\
261 |                 'what color','what is the color of the'],'./qt_color.png')
262 |         draw_qt_acc(['how many','how','how many people are',\
263 |                 'how many people are in'],'./qt_number.png')
264 |         draw_qt_acc(['who is','why','why is the','where is the','where are the',\
265 |                 'which'],'./qt_who_why_where_which.png')
266 |         draw_qt_acc(['what is the man','is the man','are they','is he',\
267 |                 'is the woman','is this person','what is the woman','is the person',\
268 |                 'what is the person'],'./qt_human.png')
269 | 
270 | 
271 | 
--------------------------------------------------------------------------------
/fastText (word)/write_to_log.py:
--------------------------------------------------------------------------------
1 | def write_log(msg, filename):
2 |     with open(filename, 'a') as f:
3 |         f.write(msg + "\n")
4 | 
--------------------------------------------------------------------------------
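**Note (not part of the original repository):** the `qlstm` network in `fastText (word)/train_att_bc.py` encodes a question by zeroing padded word slots (the `L.Scale` of the embeddings by the `cont` mask) and then summing the word vectors with a frozen all-ones 1x1 convolution (the `embed_avg` layer, which computes a sum rather than an average). A minimal NumPy sketch of the equivalent computation; the sizes come from `fastText (word)/config.py`, and the 5-word mask is a made-up example:

```
import numpy as np

# Hypothetical sizes from config.py: BATCH_SIZE, MAX_WORDS_IN_QUESTION, EMBEDDING_SIZE.
batchsize, T, embed_size = 32, 22, 300

embed = np.random.randn(batchsize, T, embed_size)  # output of L.Embed: one vector per word slot
cont = np.zeros((batchsize, T))                    # the 'cont' mask blob: 1 for real words, 0 for padding
cont[:, :5] = 1                                    # e.g. every question in this batch is 5 words long

masked = embed * cont[:, :, None]                  # L.Scale(..., axis=0): zero out padded positions
bow = masked.sum(axis=1)                           # the constant 1x1 Convolution: sum over the T word channels

assert bow.shape == (batchsize, embed_size)        # matches embed_avg_resh: [N, embed_size, 1, 1] after reshape
```

Because the all-ones filter is created with `lr_mult=0, decay_mult=0`, the encoder stays a fixed bag-of-words; only the embedding table itself is learned.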