├── .gitignore ├── README.md ├── Utils ├── __init__.py ├── dataSet.py ├── data_reader.py ├── text_processing.py └── utils.py ├── config ├── clevr_from_scratch.yaml ├── clevr_gt_layout.yaml ├── clevr_gt_rl.yaml ├── vqa_from_scratch.yaml └── vqa_gt_layout.yaml ├── data └── vqa │ ├── answers_vqa.txt │ ├── gt_layout_train2014_new_parse.npy │ ├── gt_layout_val2014_new_parse.npy │ ├── vocabulary_layout.txt │ ├── vocabulary_vqa.txt │ └── vocabulary_vqa_glove.npy ├── eval_model ├── eval_example.py ├── eval_layout_accuracy.py ├── eval_layout_learning.py └── layout_evaluator.py ├── global_variables ├── __init__.py └── global_variables.py ├── loadn2nmn_pytorch_env.sh ├── models ├── Attention2.py ├── __init__.py ├── custom_loss.py ├── end2endModuleNet.py ├── function2Module.py ├── layout_assembler.py ├── module_net.py └── modules.py ├── tools ├── build_clevr_imdb.py ├── build_vqa_imdb.py ├── extract_visual_features_vgg_pool5.py └── get_ground_truth_layout.py └── train_model ├── __init__.py ├── from_scratch_hyperparameters.py ├── gt_hyperparameters.py ├── gt_rl_hyperparameters.py ├── input_parameters.py ├── main.py ├── main_copy.py └── train_clevr_gt_layout.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *.err 3 | *.pyc 4 | test_helper/* 5 | exp_clevr/* 6 | .idea/* 7 | */__pycache__/* 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning to Reason: End-to-End Module Networks for Visual Question Answering 2 | 3 | This repository re-implements https://github.com/ronghanghu/n2nmn in PyTorch: 4 | 5 | * R. Hu, J. Andreas, M. Rohrbach, T. Darrell, K. Saenko, *Learning to Reason: End-to-End Module Networks for Visual Question Answering*. In ICCV, 2017. ([PDF](https://arxiv.org/pdf/1704.05526.pdf)) 6 | 7 | ## Getting Started 8 | 9 | These instructions will get you a copy of the project up and running on your local machine for testing. 10 | (Note: this codebase is still under development; to run it, you still need to use part of the original code for data preprocessing.) 11 | 12 | ### Installing 13 | 14 | 1. Install Python 3 (Anaconda recommended: https://www.continuum.io/downloads). 15 | 2. Install PyTorch (http://pytorch.org/). 16 | 3. 
Load cudnn/v7.0-cuda.9.0 (optional), then clone this repository: 17 | ``` 18 | git clone git@github.com:YuJiang01/n2nmn_pytorch.git 19 | ``` 20 | 21 | 22 | ### Get preprocessed data 23 | * Follow the CLEVR download and preprocessing steps at https://github.com/ronghanghu/n2nmn#download-and-preprocess-the-data 24 | 25 | After preprocessing the data, you can train a model as in the example below. 26 | 27 | 28 | 29 | ### Training 30 | 31 | Example: 32 | ``` 33 | python train_model/train_clevr_gt_layout.py --exp_name gt_test --model_type gt_layout --data_dir /private/home/tinayujiang/n2nmn/exp_clevr/data --image_feat_dir /private/home/tinayujiang/n2nmn/exp_clevr/data/vgg_pool5/train --out_dir /private/home/tinayujiang/temp_out 34 | ``` 35 | 36 | 37 | -------------------------------------------------------------------------------- /Utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/Utils/__init__.py -------------------------------------------------------------------------------- /Utils/dataSet.py: -------------------------------------------------------------------------------- 1 | import os 2 | from torch.utils.data import Dataset 3 | from Utils import text_processing 4 | import numpy as np 5 | 6 | 7 | 8 | class vqa_dataset(Dataset): 9 | def __init__(self,imdb_file, image_feat_directory, **data_params): 10 | super(vqa_dataset,self).__init__() 11 | if imdb_file.endswith('.npy'): 12 | imdb = np.load(imdb_file) 13 | else: 14 | raise TypeError('unknown imdb format.') 15 | 16 | self.imdb = imdb 17 | self.image_feat_directory = image_feat_directory 18 | self.data_params = data_params 19 | self.image_depth_first = data_params['image_depth_first'] 20 | 21 | self.vocab_dict = text_processing.VocabDict(data_params['vocab_question_file']) 22 | self.T_encoder = data_params['T_encoder'] 23 | 24 | # peek one example to see whether answer and gt_layout are in the data 25 | self.load_answer = (('answer' in self.imdb[0]) and (self.imdb[0]['answer'] is not None)) \ 26 | or (('valid_answers' in self.imdb[0]) and (self.imdb[0]['valid_answers'] is not None)) 27 | self.load_gt_layout = ('gt_layout_tokens' in self.imdb[0]) and (self.imdb[0]['gt_layout_tokens'] is not None) 28 | if 'load_gt_layout' in data_params: 29 | self.load_gt_layout = data_params['load_gt_layout'] 30 | # the answer dict is always loaded, regardless of self.load_answer 31 | self.answer_dict = text_processing.VocabDict(data_params['vocab_answer_file']) 32 | if not self.load_answer: 33 | print('imdb does not contain answers') 34 | if self.load_gt_layout: 35 | self.T_decoder = data_params['T_decoder'] 36 | self.assembler = data_params['assembler'] 37 | self.prune_filter_module = (data_params['prune_filter_module'] 38 | if 'prune_filter_module' in data_params 39 | else False) 40 | else: 41 | print('imdb does not contain ground-truth layout') 42 | 43 | # load one feature map to peek its size 44 | image_file_name = os.path.basename(self.imdb[0]['feature_path']) 45 | image_feat_path = os.path.join(self.image_feat_directory,image_file_name) 46 | feats = np.load(image_feat_path) 47 | #self.feat_H, self.feat_W, self.feat_D = feats.shape[1:] 48 | 49 | def __len__(self): 50 | return len(self.imdb) 51 | 52 | def __getitem__(self, idx): 53 | input_seq = np.zeros((self.T_encoder),np.int32) 54 | iminfo = self.imdb[idx] 55 | question_inds = [self.vocab_dict.word2idx(w) for w in iminfo['question_tokens']] 56 | seq_length = len(question_inds) 57 | input_seq[:seq_length] = question_inds 58 | image_file_name = 
os.path.basename(self.imdb[idx]['feature_path']) 59 | image_feat_path = os.path.join(self.image_feat_directory, image_file_name) 60 | image_feat =np.squeeze(np.load(image_feat_path), axis=0) 61 | if not self.image_depth_first: 62 | image_feat = np.transpose(image_feat, axes=(2, 0, 1)) 63 | answer = None 64 | if self.load_answer: 65 | if 'answer' in iminfo: 66 | answer = iminfo['answer'] 67 | elif 'valid_answers' in iminfo: 68 | valid_answers = iminfo['valid_answers'] 69 | answer = np.random.choice(valid_answers) 70 | answer_idx = self.answer_dict.word2idx(answer) 71 | 72 | if self.load_gt_layout: 73 | gt_layout_tokens = iminfo['gt_layout_tokens'] 74 | if self.prune_filter_module: 75 | # remove duplicated consequtive modules (only keeping one _Filter) 76 | for n_t in range(len(gt_layout_tokens) - 1, 0, -1): 77 | if (gt_layout_tokens[n_t - 1] in {'_Filter', '_Find'} 78 | and gt_layout_tokens[n_t] == '_Filter'): 79 | gt_layout_tokens[n_t] = None 80 | gt_layout_tokens = [t for t in gt_layout_tokens if t] 81 | gt_layout =np.array(self.assembler.module_list2tokens( 82 | gt_layout_tokens, self.T_decoder)) 83 | 84 | sample = dict(input_seq_batch=input_seq, 85 | seq_length_batch=seq_length, 86 | image_feat_batch=image_feat) 87 | if self.load_answer: 88 | sample['answer_label_batch'] = answer_idx 89 | if self.load_gt_layout: 90 | sample['gt_layout_batch'] = gt_layout 91 | 92 | return sample -------------------------------------------------------------------------------- /Utils/data_reader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import sys 5 | import threading 6 | import queue 7 | import numpy as np 8 | 9 | from Utils import text_processing 10 | 11 | class BatchLoaderClevr: 12 | def __init__(self, imdb,image_feat_dir, data_params): 13 | self.imdb = imdb 14 | self.image_feat_dir = image_feat_dir 15 | self.data_params = data_params 16 | 17 | self.vocab_dict = text_processing.VocabDict(data_params['vocab_question_file']) 18 | self.T_encoder = data_params['T_encoder'] 19 | 20 | # peek one example to see whether answer and gt_layout are in the data 21 | self.load_answer = ('answer' in self.imdb[0]) and (self.imdb[0]['answer'] is not None) 22 | self.load_gt_layout = ('gt_layout_tokens' in self.imdb[0]) and (self.imdb[0]['gt_layout_tokens'] is not None) 23 | if 'load_gt_layout' in data_params: 24 | self.load_gt_layout = data_params['load_gt_layout'] 25 | # the answer dict is always loaded, regardless of self.load_answer 26 | self.answer_dict = text_processing.VocabDict(data_params['vocab_answer_file']) 27 | if not self.load_answer: 28 | print('imdb does not contain answers') 29 | if self.load_gt_layout: 30 | self.T_decoder = data_params['T_decoder'] 31 | self.assembler = data_params['assembler'] 32 | self.prune_filter_module = (data_params['prune_filter_module'] 33 | if 'prune_filter_module' in data_params 34 | else False) 35 | else: 36 | print('imdb does not contain ground-truth layout') 37 | 38 | # load one feature map to peek its size 39 | image_feat_basename = os.path.basename(self.imdb[0]['feature_path']) 40 | image_feat_name = os.path.join(self.image_feat_dir, image_feat_basename) 41 | feats = np.load(image_feat_name) 42 | #self.feat_H, self.feat_W, self.feat_D = feats.shape[1:] 43 | self.feat_D, self.feat_H, self.feat_W = feats.shape[1:] 44 | 45 | def load_one_batch(self, sample_ids): 46 | actual_batch_size = len(sample_ids) 47 | input_seq_batch = 
np.zeros((self.T_encoder, actual_batch_size), np.int32) 48 | seq_length_batch = np.zeros(actual_batch_size, np.int32) 49 | #image_feat_batch = np.zeros((actual_batch_size, self.feat_H, self.feat_W, self.feat_D), np.float32) 50 | image_feat_batch = np.zeros((actual_batch_size, self.feat_D, self.feat_H, self.feat_W), np.float32) 51 | image_path_list = [None]*actual_batch_size 52 | if self.load_answer: 53 | answer_label_batch = np.zeros(actual_batch_size, np.int32) 54 | if self.load_gt_layout: 55 | gt_layout_batch = np.zeros((self.T_decoder, actual_batch_size), np.int32) 56 | 57 | for n in range(len(sample_ids)): 58 | iminfo = self.imdb[sample_ids[n]] 59 | question_inds = [self.vocab_dict.word2idx(w) for w in iminfo['question_tokens']] 60 | seq_length = len(question_inds) 61 | input_seq_batch[:seq_length, n] = question_inds 62 | seq_length_batch[n] = seq_length 63 | image_feat_basename = os.path.basename(iminfo['feature_path']) 64 | image_feat_name = os.path.join(self.image_feat_dir, image_feat_basename) 65 | image_feat_batch[n:n+1] = np.load(image_feat_name) 66 | if self.load_answer: 67 | answer_idx = self.answer_dict.word2idx(iminfo['answer']) 68 | answer_label_batch[n] = answer_idx 69 | if self.load_gt_layout: 70 | gt_layout_tokens = iminfo['gt_layout_tokens'] 71 | if self.prune_filter_module: 72 | # remove duplicated consequtive modules (only keeping one _Filter) 73 | for n_t in range(len(gt_layout_tokens)-1, 0, -1): 74 | if (gt_layout_tokens[n_t-1] in {'_Filter', '_Find'} 75 | and gt_layout_tokens[n_t] == '_Filter'): 76 | gt_layout_tokens[n_t] = None 77 | gt_layout_tokens = [t for t in gt_layout_tokens if t] 78 | gt_layout_batch[:, n] = self.assembler.module_list2tokens( 79 | gt_layout_tokens, self.T_decoder) 80 | batch = dict(input_seq_batch=input_seq_batch, 81 | seq_length_batch=seq_length_batch, 82 | image_feat_batch=image_feat_batch, 83 | image_path_list=image_path_list) 84 | if self.load_answer: 85 | batch['answer_label_batch'] = answer_label_batch 86 | if self.load_gt_layout: 87 | batch['gt_layout_batch'] = gt_layout_batch 88 | return batch 89 | 90 | class DataReader: 91 | def __init__(self, imdb_file, image_feat_dir, shuffle=True, one_pass=False, prefetch_num=8, **kwargs): 92 | print('Loading imdb from file...', end=''); sys.stdout.flush() 93 | if imdb_file.endswith('.npy'): 94 | imdb = np.load(imdb_file) 95 | else: 96 | raise TypeError('unknown imdb format.') 97 | print('Done') 98 | self.imdb = imdb 99 | self.image_feat_dir = image_feat_dir 100 | self.shuffle = shuffle 101 | self.one_pass = one_pass 102 | self.prefetch_num = prefetch_num 103 | self.data_params = kwargs 104 | 105 | # Clevr data loader 106 | self.batch_loader = BatchLoaderClevr(self.imdb,self.image_feat_dir, self.data_params) 107 | 108 | # Start prefetching thread 109 | self.prefetch_queue = queue.Queue(maxsize=self.prefetch_num) 110 | self.prefetch_thread = threading.Thread(target=_run_prefetch, 111 | args=(self.prefetch_queue, self.batch_loader, self.imdb, 112 | self.shuffle, self.one_pass, self.data_params)) 113 | self.prefetch_thread.daemon = True 114 | self.prefetch_thread.start() 115 | 116 | def batches(self): 117 | while True: 118 | # Get a batch from the prefetching queue 119 | #if self.prefetch_queue.empty(): 120 | # print('data reader: waiting for data loading (IO is slow)...') 121 | batch = self.prefetch_queue.get(block=True) 122 | if batch is None: 123 | assert(self.one_pass) 124 | print('data reader: one pass finished') 125 | raise StopIteration() 126 | yield batch 127 | 128 | def 
_run_prefetch(prefetch_queue, batch_loader, imdb, shuffle, one_pass, data_params): 129 | num_samples = len(imdb) 130 | batch_size = data_params['batch_size'] 131 | 132 | n_sample = 0 133 | fetch_order = np.arange(num_samples) 134 | while True: 135 | # Shuffle the sample order for every epoch 136 | if n_sample == 0 and shuffle: 137 | fetch_order = np.random.permutation(num_samples) 138 | 139 | # Load batch from file 140 | # note that len(sample_ids) <= batch_size, not necessarily equal 141 | sample_ids = fetch_order[n_sample:n_sample+batch_size] 142 | batch = batch_loader.load_one_batch(sample_ids) 143 | prefetch_queue.put(batch, block=True) 144 | 145 | n_sample += len(sample_ids) 146 | if n_sample >= num_samples: 147 | # Put in a None batch to indicate a whole pass is over 148 | if one_pass: 149 | prefetch_queue.put(None, block=True) 150 | n_sample = 0 151 | -------------------------------------------------------------------------------- /Utils/text_processing.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') 4 | def tokenize(sentence): 5 | tokens = SENTENCE_SPLIT_REGEX.split(sentence.lower()) 6 | tokens = [t.strip() for t in tokens if len(t.strip()) > 0] 7 | return tokens 8 | 9 | def load_str_list(fname): 10 | with open(fname) as f: 11 | lines = f.readlines() 12 | lines = [l.strip() for l in lines] 13 | return lines 14 | 15 | class VocabDict: 16 | def __init__(self, vocab_file): 17 | self.word_list = load_str_list(vocab_file) 18 | self.word2idx_dict = {w:n_w for n_w, w in enumerate(self.word_list)} 19 | self.num_vocab = len(self.word_list) 20 | self.UNK_idx = self.word2idx_dict['<unk>'] if '<unk>' in self.word2idx_dict else None 21 | 22 | def idx2word(self, n_w): 23 | return self.word_list[n_w] 24 | 25 | def word2idx(self, w): 26 | if w in self.word2idx_dict: 27 | return self.word2idx_dict[w] 28 | elif self.UNK_idx is not None: 29 | return self.UNK_idx 30 | else: 31 | raise ValueError('word %s not in dictionary (while dictionary does not contain <unk>)' % w) 32 | 33 | def tokenize_and_index(self, sentence): 34 | inds = [self.word2idx(w) for w in tokenize(sentence)] 35 | return inds 36 | -------------------------------------------------------------------------------- /Utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def unique_columns(data): 5 | dt = np.dtype((np.void, data.dtype.itemsize * data.shape[0])) 6 | dataf = np.asfortranarray(data).view(dt) 7 | u,uind = np.unique(dataf, return_inverse=True) 8 | m = u.view(data.dtype).reshape(-1,data.shape[0]).T 9 | res = [np.where(uind==x)[0] for x in range(m.shape[1])] 10 | return res -------------------------------------------------------------------------------- /config/clevr_from_scratch.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: clevr 3 | data_root_dir : /private/home/tinayujiang/data/n2nmn_CLEVR 4 | preprocess_model : vgg_pool5 5 | vocab_question_file : vocabulary_clevr.txt 6 | vocab_layout_file : vocabulary_layout.txt 7 | vocab_answer_file : answers_clevr.txt 8 | imdb_file_trn : imdb_trn.npy 9 | image_depth_first: True 10 | 11 | output: 12 | root_dir: ~/temp/temp_out 13 | exp_name: clevr_scratch 14 | model: 15 | model_type: scratch 16 | image_height : 10 17 | image_width : 15 18 | in_image_dim : 512 19 | embed_dim_txt : 300 20 | embed_dim_nmn : 300 21 | hidden_size : 512 22 | num_layers : 2 23 | 
encoder_dropout : 0 24 | decoder_dropout : 0 25 | decoder_sampling : True 26 | T_encoder : 45 27 | T_decoder : 10 28 | N : 64 29 | lambda_entropy : 0.01 30 | prune_filter_module : True 31 | use_qpn : True 32 | qpn_dropout : True 33 | reduce_visfeat_dim : False 34 | 35 | training_parameters: 36 | weight_decay : 0 37 | baseline_decay : 0.99 38 | max_iter : 120000 39 | snapshot_interval : 10000 40 | max_grad_l2_norm: 10 41 | learning_rate : 0.001 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /config/clevr_gt_layout.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: clevr 3 | data_root_dir : /private/home/tinayujiang/data/n2nmn_CLEVR 4 | preprocess_model : vgg_pool5 5 | vocab_question_file : vocabulary_clevr.txt 6 | vocab_layout_file : vocabulary_layout.txt 7 | vocab_answer_file : answers_clevr.txt 8 | imdb_file_trn : imdb_trn.npy 9 | image_depth_first: True 10 | 11 | output: 12 | root_dir: ~/temp/temp_out 13 | exp_name: clevr_gt_layout 14 | model: 15 | model_type: gt_layout 16 | image_height : 10 17 | image_width : 15 18 | in_image_dim : 512 19 | embed_dim_txt : 300 20 | embed_dim_nmn : 300 21 | hidden_size : 512 22 | num_layers : 2 23 | encoder_dropout : 0 24 | decoder_dropout : 0 25 | decoder_sampling : True 26 | T_encoder : 45 27 | T_decoder : 10 28 | N : 64 29 | lambda_entropy : 0 30 | prune_filter_module : True 31 | use_qpn : True 32 | qpn_dropout : True 33 | reduce_visfeat_dim : False 34 | 35 | training_parameters: 36 | weight_decay : 5.0e-6 37 | baseline_decay : 0.99 38 | max_iter : 80000 39 | snapshot_interval : 10000 40 | max_grad_l2_norm: 10 41 | learning_rate : 0.001 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /config/clevr_gt_rl.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: clevr 3 | data_root_dir : /private/home/tinayujiang/data/n2nmn_CLEVR 4 | preprocess_model : vgg_pool5 5 | vocab_question_file : vocabulary_clevr.txt 6 | vocab_layout_file : vocabulary_layout.txt 7 | vocab_answer_file : answers_clevr.txt 8 | imdb_file_trn : imdb_trn.npy 9 | image_depth_first: True 10 | 11 | output: 12 | root_dir: ~/temp/temp_out 13 | exp_name: clevr_gt_rl 14 | model: 15 | model_type: gt+rl 16 | model_path: 17 | image_height : 10 18 | image_width : 15 19 | in_image_dim : 512 20 | embed_dim_txt : 300 21 | embed_dim_nmn : 300 22 | hidden_size : 512 23 | num_layers : 2 24 | encoder_dropout : 0 25 | decoder_dropout : 0 26 | decoder_sampling : True 27 | T_encoder : 45 28 | T_decoder : 10 29 | N : 64 30 | lambda_entropy : 0.005 31 | prune_filter_module : True 32 | use_qpn : True 33 | qpn_dropout : True 34 | reduce_visfeat_dim : False 35 | 36 | training_parameters: 37 | weight_decay : 5.0e-6 38 | baseline_decay : 0.99 39 | max_iter : 80000 40 | snapshot_interval : 10000 41 | max_grad_l2_norm: 10 42 | learning_rate : 0.0001 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /config/vqa_from_scratch.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: vqa 3 | data_root_dir : /private/home/tinayujiang/data/n2nmn_vqa 4 | preprocess_model : resnet_res5c 5 | vocab_question_file : vocabulary_vqa.txt 6 | vocab_layout_file : vocabulary_layout.txt 7 | vocab_answer_file : answers_vqa.txt 8 | imdb_file_trn : imdb_trn.npy 9 | image_depth_first: False 
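# image_depth_first indicates whether the saved .npy feature maps are already
# channel-first (D x H x W). Utils/dataSet.py reads this flag and, when it is
# False (as for these resnet_res5c features), transposes the loaded H x W x D
# array to channel-first before returning it.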
10 | output: 11 | root_dir: ~/temp/temp_out 12 | exp_name: vqa_scratch 13 | model: 14 | model_type: scratch 15 | image_height : 14 16 | image_width : 14 17 | in_image_dim : 2048 18 | embed_dim_txt : 300 19 | embed_dim_nmn : 300 20 | hidden_size : 1000 21 | num_layers : 2 22 | encoder_dropout : 0.1 23 | decoder_dropout : 0.1 24 | decoder_sampling : True 25 | T_encoder : 26 26 | T_decoder : 13 27 | N : 64 28 | lambda_entropy : 0.01 29 | prune_filter_module : True 30 | use_qpn : True 31 | qpn_dropout : True 32 | reduce_visfeat_dim : False 33 | 34 | training_parameters: 35 | weight_decay : 0 36 | baseline_decay : 0.99 37 | max_iter : 120000 38 | snapshot_interval : 10000 39 | max_grad_l2_norm: 10 40 | learning_rate : 0.001 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /config/vqa_gt_layout.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: vqa 3 | data_root_dir : /private/home/tinayujiang/data/n2nmn_vqa 4 | preprocess_model : resnet_res5c 5 | vocab_question_file : vocabulary_vqa.txt 6 | vocab_layout_file : vocabulary_layout.txt 7 | vocab_answer_file : answers_vqa.txt 8 | imdb_file_trn : imdb_trn.npy 9 | image_depth_first: False 10 | output: 11 | root_dir: ~/temp/temp_out 12 | exp_name: vga_gt_layout 13 | model: 14 | model_type: gt_layout 15 | image_height : 14 16 | image_width : 14 17 | in_image_dim : 2048 18 | embed_dim_txt : 300 19 | embed_dim_nmn : 300 20 | hidden_size : 1000 21 | num_layers : 2 22 | encoder_dropout : 0.1 23 | decoder_dropout : 0.1 24 | decoder_sampling : False 25 | T_encoder : 26 26 | T_decoder : 13 27 | N : 64 28 | lambda_entropy : 0 29 | prune_filter_module : True 30 | use_qpn : True 31 | qpn_dropout : True 32 | reduce_visfeat_dim : False 33 | 34 | training_parameters: 35 | weight_decay : 5.0e-6 36 | baseline_decay : 0.99 37 | max_iter : 80000 38 | snapshot_interval : 10000 39 | max_grad_l2_norm: 10 40 | learning_rate : 0.001 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /data/vqa/answers_vqa.txt: -------------------------------------------------------------------------------- 1 | 2 | yes 3 | no 4 | 2 5 | 1 6 | white 7 | 3 8 | red 9 | blue 10 | 4 11 | green 12 | black 13 | yellow 14 | brown 15 | 5 16 | tennis 17 | 6 18 | baseball 19 | orange 20 | 0 21 | bathroom 22 | right 23 | left 24 | wood 25 | gray 26 | frisbee 27 | pink 28 | pizza 29 | 7 30 | kitchen 31 | 8 32 | cat 33 | skiing 34 | black and white 35 | dog 36 | skateboarding 37 | snow 38 | skateboard 39 | surfing 40 | water 41 | grass 42 | giraffe 43 | surfboard 44 | 10 45 | wii 46 | kite 47 | man 48 | broccoli 49 | purple 50 | winter 51 | elephant 52 | stop 53 | train 54 | horse 55 | 9 56 | umbrella 57 | apple 58 | silver 59 | banana 60 | sheep 61 | eating 62 | phone 63 | bear 64 | motorcycle 65 | 12 66 | cake 67 | soccer 68 | beach 69 | tan 70 | wine 71 | zebra 72 | brick 73 | sunny 74 | table 75 | woman 76 | laptop 77 | bench 78 | bananas 79 | female 80 | food 81 | hat 82 | bus 83 | male 84 | flowers 85 | living room 86 | cow 87 | maybe 88 | outside 89 | cell phone 90 | hot dog 91 | bird 92 | helmet 93 | kites 94 | night 95 | snowboarding 96 | 11 97 | down 98 | trees 99 | camera 100 | red and white 101 | bed 102 | bedroom 103 | nothing 104 | unknown 105 | christmas 106 | fork 107 | tennis racket 108 | metal 109 | tree 110 | blonde 111 | up 112 | glasses 113 | 20 114 | fence 115 | 15 116 | beer 117 | tile 118 | nike 119 | boat 
120 | bat 121 | 13 122 | airport 123 | cloudy 124 | blue and white 125 | glass 126 | day 127 | sitting 128 | open 129 | teddy bear 130 | plane 131 | car 132 | clear 133 | standing 134 | suitcase 135 | chocolate 136 | many 137 | donut 138 | sandwich 139 | bike 140 | zoo 141 | sand 142 | cows 143 | beige 144 | girl 145 | ball 146 | birthday 147 | palm 148 | ocean 149 | airplane 150 | stripes 151 | chair 152 | horses 153 | toilet 154 | carrots 155 | old 156 | knife 157 | coffee 158 | fall 159 | cheese 160 | chinese 161 | round 162 | 14 163 | tie 164 | lot 165 | skis 166 | snowboard 167 | scissors 168 | donuts 169 | boy 170 | mountains 171 | walking 172 | fruit 173 | sleeping 174 | truck 175 | wall 176 | paper 177 | breakfast 178 | wedding 179 | not sure 180 | clock 181 | sunglasses 182 | mirror 183 | fire hydrant 184 | cold 185 | asian 186 | square 187 | street 188 | bicycle 189 | toothbrush 190 | on table 191 | elephants 192 | wetsuit 193 | dirt 194 | plastic 195 | plaid 196 | plate 197 | 16 198 | none 199 | usa 200 | spoon 201 | 25 202 | ground 203 | tv 204 | gold 205 | daytime 206 | luggage 207 | cooking 208 | happy 209 | backpack 210 | carrot 211 | summer 212 | africa 213 | window 214 | chicken 215 | racket 216 | people 217 | computer 218 | stone 219 | on 220 | mountain 221 | no one 222 | zebras 223 | sun 224 | circle 225 | afternoon 226 | graffiti 227 | person 228 | 30 229 | hay 230 | brown and white 231 | leaves 232 | evening 233 | park 234 | fish 235 | city 236 | 50 237 | overcast 238 | oranges 239 | bread 240 | jeans 241 | restaurant 242 | playing 243 | sidewalk 244 | closed 245 | couch 246 | giraffes 247 | birds 248 | ski poles 249 | watch 250 | refrigerator 251 | field 252 | motorcycles 253 | child 254 | flying kite 255 | morning 256 | microwave 257 | grazing 258 | remote 259 | towel 260 | striped 261 | flower 262 | 18 263 | rocks 264 | church 265 | building 266 | bricks 267 | inside 268 | milk 269 | floor 270 | books 271 | light 272 | bag 273 | umbrellas 274 | background 275 | pepperoni 276 | catcher 277 | apples 278 | london 279 | concrete 280 | small 281 | rock 282 | vase 283 | road 284 | playing wii 285 | american 286 | 100 287 | surfboards 288 | brushing teeth 289 | bridge 290 | desk 291 | rainy 292 | leather 293 | police 294 | clouds 295 | white and black 296 | very 297 | gas 298 | picture 299 | keyboard 300 | sink 301 | adidas 302 | new york 303 | bowl 304 | 17 305 | coca cola 306 | dell 307 | pine 308 | hot dogs 309 | tomato 310 | heart 311 | middle 312 | spring 313 | baby 314 | vegetables 315 | sandals 316 | pitcher 317 | floral 318 | ketchup 319 | umpire 320 | rain 321 | book 322 | outdoors 323 | box 324 | mouse 325 | lights 326 | drinking 327 | sky 328 | china 329 | shadow 330 | england 331 | hydrant 332 | skating 333 | oven 334 | reading 335 | 24 336 | both 337 | ski 338 | home 339 | jumping 340 | in air 341 | front 342 | playing tennis 343 | rose 344 | wii remote 345 | checkered 346 | baseball bat 347 | on wall 348 | hotel 349 | tennis ball 350 | house 351 | river 352 | double decker 353 | rectangle 354 | bears 355 | dessert 356 | white and blue 357 | 40 358 | talking on phone 359 | wilson 360 | in water 361 | red and black 362 | noon 363 | roses 364 | office 365 | carpet 366 | salad 367 | blanket 368 | canada 369 | triangle 370 | guitar 371 | pole 372 | electric 373 | 23 374 | resting 375 | real 376 | no idea 377 | train station 378 | top 379 | safety 380 | long 381 | sunset 382 | lettuce 383 | taking picture 384 | on plate 385 | red and yellow 386 | young 387 | stop 
sign 388 | counter 389 | hot 390 | star 391 | green and white 392 | lemon 393 | tracks 394 | tennis court 395 | on ground 396 | parking meter 397 | rope 398 | white and red 399 | purse 400 | on right 401 | on left 402 | fridge 403 | photographer 404 | sign 405 | lake 406 | 19 407 | 21 408 | scarf 409 | rug 410 | soup 411 | dinner 412 | air 413 | goggles 414 | english 415 | shoes 416 | short 417 | windows 418 | seagull 419 | pepsi 420 | toy 421 | wire 422 | cars 423 | ponytail 424 | 22 425 | running 426 | back 427 | olives 428 | poles 429 | lots 430 | plant 431 | indoors 432 | away 433 | delta 434 | gloves 435 | talking 436 | doughnut 437 | clothes 438 | collar 439 | basket 440 | off 441 | helmets 442 | glove 443 | dusk 444 | flying 445 | tomatoes 446 | raining 447 | steel 448 | cup 449 | unsure 450 | flag 451 | branch 452 | blue and yellow 453 | taking off 454 | tennis racquet 455 | cement 456 | hand 457 | painting 458 | oval 459 | rainbow 460 | japanese 461 | wii controller 462 | stove 463 | shorts 464 | forward 465 | bottle 466 | parking 467 | octagon 468 | beef 469 | spinach 470 | duck 471 | laying down 472 | stainless steel 473 | one way 474 | boots 475 | coke 476 | large 477 | smoke 478 | cream 479 | lunch 480 | passenger 481 | mustard 482 | 28 483 | steam 484 | bikes 485 | head 486 | skate park 487 | blender 488 | fan 489 | pillow 490 | yellow and black 491 | strawberry 492 | turkey 493 | dogs 494 | calm 495 | united states 496 | cross 497 | shirt 498 | japan 499 | landing 500 | strawberries 501 | texting 502 | straw 503 | surf 504 | orange and white 505 | meat 506 | toilet paper 507 | suit 508 | red and blue 509 | orange juice 510 | clean 511 | tea 512 | wind 513 | urban 514 | boats 515 | fake 516 | doughnuts 517 | warm 518 | on bed 519 | polar bear 520 | headphones 521 | fire 522 | polar 523 | w 524 | tower 525 | trash can 526 | america 527 | maroon 528 | shade 529 | samsung 530 | school 531 | flip flops 532 | blinds 533 | lamp 534 | flying kites 535 | decoration 536 | bottom 537 | nighttime 538 | dirty 539 | wooden 540 | cutting board 541 | to right 542 | ford 543 | big 544 | clay 545 | 35 546 | electricity 547 | sneakers 548 | honda 549 | wild 550 | swimming 551 | pictures 552 | p 553 | east 554 | to left 555 | marble 556 | one on right 557 | nobody 558 | teal 559 | football 560 | jet 561 | asia 562 | ice cream 563 | tabby 564 | rice 565 | yellow and blue 566 | television 567 | kia 568 | trash 569 | ducks 570 | ham 571 | oak 572 | jacket 573 | subway 574 | behind 575 | india 576 | go 577 | necklace 578 | peppers 579 | living 580 | good 581 | stuffed animal 582 | posing 583 | riding 584 | nowhere 585 | skateboards 586 | bicycles 587 | soda 588 | wet 589 | parrot 590 | on desk 591 | watching 592 | shelf 593 | desert 594 | newspaper 595 | south 596 | downhill 597 | plants 598 | ring 599 | goat 600 | board 601 | polo 602 | yellow and red 603 | french 604 | headband 605 | men 606 | candles 607 | italian 608 | north 609 | water skiing 610 | granite 611 | door 612 | playing frisbee 613 | bull 614 | bacon 615 | fries 616 | fishing 617 | bow 618 | children 619 | pen 620 | chain link 621 | cutting 622 | 60 623 | cap 624 | bmw 625 | german 626 | diamond 627 | lab 628 | above 629 | grapes 630 | several 631 | possibly 632 | one on left 633 | gray and white 634 | hardwood 635 | dress 636 | tulips 637 | store 638 | market 639 | onions 640 | stick 641 | white and brown 642 | candle 643 | moving 644 | indian 645 | racquet 646 | spanish 647 | cowboy 648 | west 649 | van 650 | tall 651 | military 
652 | chopsticks 653 | paint 654 | 27 655 | protection 656 | smiling 657 | starbucks 658 | parking lot 659 | net 660 | waves 661 | lighthouse 662 | african 663 | cart 664 | on floor 665 | women 666 | cutting cake 667 | carriage 668 | b 669 | face 670 | reflection 671 | color 672 | on sidewalk 673 | toyota 674 | public 675 | black white 676 | 32 677 | bracelet 678 | fire truck 679 | sony 680 | on street 681 | cigarette 682 | fork and knife 683 | forest 684 | no parking 685 | wave 686 | seagulls 687 | palm trees 688 | center 689 | dining room 690 | straight 691 | scooter 692 | 26 693 | napkin 694 | green and yellow 695 | terrier 696 | pug 697 | sugar 698 | blue and red 699 | cats 700 | statue 701 | batter 702 | ceramic 703 | skateboarder 704 | towels 705 | fedex 706 | united 707 | full 708 | half 709 | italy 710 | t 711 | 38 712 | 55 713 | basketball 714 | roman 715 | c 716 | navy 717 | wool 718 | low 719 | california 720 | adult 721 | hair 722 | parade 723 | phones 724 | motorbike 725 | big ben 726 | catching 727 | juice 728 | ramp 729 | on counter 730 | lady 731 | bandana 732 | black and red 733 | sausage 734 | hit ball 735 | corn 736 | suitcases 737 | chef 738 | at camera 739 | pineapple 740 | calico 741 | 2 feet 742 | bun 743 | squares 744 | siamese 745 | potatoes 746 | chain 747 | 34 748 | collie 749 | outdoor 750 | army 751 | monkey 752 | owl 753 | light blue 754 | sunlight 755 | teddy bears 756 | dock 757 | playing game 758 | chicago 759 | wine glass 760 | cone 761 | out 762 | station 763 | bar 764 | bakery 765 | on beach 766 | french fries 767 | yellow and white 768 | pigeon 769 | baseball field 770 | bamboo 771 | soccer ball 772 | tag 773 | in background 774 | game 775 | right side 776 | visor 777 | on grass 778 | blue and green 779 | pie 780 | video game 781 | balance 782 | pickle 783 | high 784 | remote control 785 | pepper 786 | us 787 | pelican 788 | shoe 789 | bell 790 | daisy 791 | eagle 792 | a 793 | looking 794 | commercial 795 | waiting 796 | batting 797 | hello kitty 798 | tiles 799 | parsley 800 | 33 801 | serving 802 | 45 803 | nintendo wii 804 | laying 805 | bulldog 806 | empty 807 | n 808 | surfer 809 | dark 810 | parasailing 811 | nokia 812 | dunkin donuts 813 | tray 814 | 3 feet 815 | harley 816 | pasta 817 | red sox 818 | brush 819 | ski lift 820 | drink 821 | cloth 822 | blue and black 823 | red and green 824 | butterfly 825 | vegetable 826 | buildings 827 | smile 828 | poodle 829 | pan 830 | hats 831 | playing baseball 832 | night time 833 | log 834 | stairs 835 | 2012 836 | france 837 | rail 838 | steak 839 | library 840 | krispy kreme 841 | sailboat 842 | peach 843 | basil 844 | cotton 845 | black and yellow 846 | fast 847 | horns 848 | pink and white 849 | sweater 850 | giants 851 | yellow and green 852 | mac 853 | pot 854 | eggs 855 | sprinkles 856 | mercedes 857 | peace 858 | controller 859 | hospital 860 | kid 861 | left side 862 | jump 863 | shower 864 | tiger 865 | art 866 | snowy 867 | work 868 | halloween 869 | softball 870 | fireplace 871 | modern 872 | pork 873 | cookies 874 | sauce 875 | kite flying 876 | cell phones 877 | twin 878 | vertical 879 | farm 880 | crane 881 | microphone 882 | rural 883 | slow 884 | german shepherd 885 | turquoise 886 | white and green 887 | 200 888 | kids 889 | vanilla 890 | palm tree 891 | mask 892 | clock tower 893 | broken 894 | bucket 895 | 29 896 | swan 897 | sad 898 | cherry 899 | tennis shoes 900 | sofa 901 | pig 902 | left one 903 | bus stop 904 | in front 905 | ribbon 906 | little 907 | ice 908 | stars 909 | in 
field 910 | stripe 911 | garbage 912 | on tracks 913 | piano 914 | onion 915 | cleaning 916 | maple 917 | butter 918 | fruits 919 | woods 920 | volleyball 921 | watermelon 922 | magnets 923 | right one 924 | lion 925 | crosswalk 926 | tattoo 927 | bowling 928 | daisies 929 | backwards 930 | 2013 931 | cook 932 | working 933 | curtain 934 | cross country 935 | harley davidson 936 | hair dryer 937 | windsurfing 938 | on road 939 | washington 940 | camouflage 941 | candy 942 | deer 943 | roman numerals 944 | tennis rackets 945 | 2010 946 | yamaha 947 | museum 948 | egg 949 | trunk 950 | mushrooms 951 | mozzarella 952 | map 953 | vases 954 | barn 955 | cardboard 956 | serve 957 | white and gray 958 | string 959 | glazed 960 | laptops 961 | chandelier 962 | country 963 | 75 964 | 44 965 | beard 966 | cargo 967 | vest 968 | wicker 969 | parked 970 | soap 971 | pigeons 972 | 10 feet 973 | school bus 974 | m 975 | burgundy 976 | driving 977 | golden retriever 978 | bikini 979 | ladder 980 | germany 981 | iron 982 | apron 983 | typing 984 | relaxing 985 | san francisco 986 | bags 987 | on building 988 | stuffed animals 989 | party 990 | goats 991 | selfie 992 | caution 993 | shrimp 994 | 2011 995 | dry 996 | pastry 997 | ceiling 998 | human 999 | panda 1000 | on bench 1001 | foreground 1002 | 2009 1003 | cupcake 1004 | pots 1005 | runway 1006 | neither 1007 | colgate 1008 | europe 1009 | circles 1010 | gravel 1011 | cheesecake 1012 | red white and blue 1013 | paddle 1014 | foil 1015 | obama 1016 | skier 1017 | meter 1018 | kitten 1019 | railing 1020 | business 1021 | sandwiches 1022 | toys 1023 | 31 1024 | pier 1025 | in bowl 1026 | tail 1027 | racing 1028 | asphalt 1029 | not very 1030 | light brown 1031 | walk 1032 | traffic 1033 | restroom 1034 | man on right 1035 | 42 1036 | taxi 1037 | air force 1038 | first 1039 | skull 1040 | grill 1041 | medium 1042 | toothbrushes 1043 | garage 1044 | in sky 1045 | dodgers 1046 | 36 1047 | pickles 1048 | texas 1049 | family 1050 | playing video game 1051 | nintendo 1052 | british airways 1053 | camo 1054 | suv 1055 | throwing 1056 | beans 1057 | sleep 1058 | tape 1059 | hp 1060 | cucumber 1061 | clothing 1062 | coat 1063 | noodles 1064 | swinging 1065 | toothpaste 1066 | wheat 1067 | watching tv 1068 | bowtie 1069 | leash 1070 | harness 1071 | iphone 1072 | dragon 1073 | yankees 1074 | tired 1075 | water bottle 1076 | boxes 1077 | na 1078 | wagon 1079 | x 1080 | fun 1081 | man on left 1082 | can 1083 | australia 1084 | 70 1085 | pillows 1086 | pitbull 1087 | hill 1088 | dalmatian 1089 | sweatband 1090 | wristband 1091 | polka dots 1092 | orioles 1093 | britain 1094 | autumn 1095 | leaf 1096 | new 1097 | beagle 1098 | geese 1099 | polka dot 1100 | bow tie 1101 | boston 1102 | green and blue 1103 | bank of america 1104 | bush 1105 | day time 1106 | cleats 1107 | sparrow 1108 | lamb 1109 | brown and black 1110 | ship 1111 | aluminum 1112 | 500 1113 | barbed wire 1114 | rack 1115 | uk 1116 | catching frisbee 1117 | do not enter 1118 | chairs 1119 | animals 1120 | track 1121 | y 1122 | cupcakes 1123 | dirt bike 1124 | cumulus 1125 | pants 1126 | ski pole 1127 | seat 1128 | partly cloudy 1129 | hearts 1130 | nice 1131 | plates 1132 | pilot 1133 | apartment 1134 | girls 1135 | all 1136 | uphill 1137 | moss 1138 | fedora 1139 | on shelf 1140 | volkswagen 1141 | 1000 1142 | messy 1143 | s 1144 | rust 1145 | bagel 1146 | chips 1147 | going 1148 | caucasian 1149 | in grass 1150 | blurry 1151 | in car 1152 | pavement 1153 | ram 1154 | circular 1155 | leaving 1156 | 
sail 1157 | owner 1158 | hills 1159 | in back 1160 | below 1161 | jeep 1162 | mustache 1163 | rectangular 1164 | flags 1165 | cereal 1166 | mutt 1167 | evergreen 1168 | shadows 1169 | cabinet 1170 | green and red 1171 | setting 1172 | foggy 1173 | labrador 1174 | mexico 1175 | southwest 1176 | sunflower 1177 | brushing 1178 | pastries 1179 | desktop 1180 | normal 1181 | smoking 1182 | stand 1183 | love 1184 | hands 1185 | wires 1186 | tent 1187 | wine glasses 1188 | curly 1189 | post 1190 | black and gray 1191 | cones 1192 | logs 1193 | behind fence 1194 | tusks 1195 | american flag 1196 | produce 1197 | salmon 1198 | chase 1199 | e 1200 | horizontal 1201 | animal 1202 | eat 1203 | bathtub 1204 | corner 1205 | multi 1206 | british 1207 | tank top 1208 | engine 1209 | screen 1210 | orange and black 1211 | flip 1212 | kickstand 1213 | idk 1214 | natural 1215 | right hand 1216 | frog 1217 | trailer 1218 | grizzly 1219 | on tower 1220 | 37 1221 | game controller 1222 | finch 1223 | bathing 1224 | 5 feet 1225 | coming 1226 | white and orange 1227 | stones 1228 | ties 1229 | weeds 1230 | parachute 1231 | asparagus 1232 | traveling 1233 | bushes 1234 | tablecloth 1235 | 41 1236 | transportation 1237 | peacock 1238 | older 1239 | sweet 1240 | midday 1241 | canoe 1242 | probably 1243 | chevrolet 1244 | thin 1245 | windy 1246 | platform 1247 | lying down 1248 | wetsuits 1249 | 43 1250 | knife and fork 1251 | surfboarding 1252 | cattle 1253 | cubs 1254 | nose 1255 | traffic light 1256 | green and black 1257 | pool 1258 | downtown 1259 | 1 foot 1260 | model 1261 | falling 1262 | mom 1263 | passengers 1264 | brushing his teeth 1265 | dining 1266 | porcelain 1267 | propeller 1268 | santa 1269 | stuffed bear 1270 | pizza cutter 1271 | happiness 1272 | lg 1273 | yield 1274 | dishwasher 1275 | rabbit 1276 | exit 1277 | pitching 1278 | snowboards 1279 | cut 1280 | cauliflower 1281 | emirates 1282 | queen 1283 | curtains 1284 | guy 1285 | ipod 1286 | 6 feet 1287 | 4 feet 1288 | sunflowers 1289 | white and yellow 1290 | telephone 1291 | ostrich 1292 | power lines 1293 | shallow 1294 | mud 1295 | private 1296 | champagne 1297 | hamburger 1298 | cactus 1299 | baseball cap 1300 | cookie 1301 | icing 1302 | crow 1303 | planes 1304 | budweiser 1305 | construction 1306 | stool 1307 | time 1308 | potato 1309 | oil 1310 | hungry 1311 | tissue 1312 | arabic 1313 | smoothie 1314 | squash 1315 | 150 1316 | moped 1317 | lavender 1318 | eyes 1319 | pancakes 1320 | cane 1321 | backhand 1322 | tennis player 1323 | grapefruit 1324 | lime 1325 | warning 1326 | computers 1327 | taking photo 1328 | bib 1329 | tractor 1330 | soft 1331 | checkerboard 1332 | dancing 1333 | skatepark 1334 | boogie board 1335 | top left 1336 | upside down 1337 | trains 1338 | waving 1339 | happy birthday 1340 | steps 1341 | 80 1342 | in middle 1343 | mexican 1344 | black and blue 1345 | spatula 1346 | on couch 1347 | red bull 1348 | hundreds 1349 | aqua 1350 | flip phone 1351 | penguin 1352 | black bear 1353 | sushi 1354 | buses 1355 | fair 1356 | amtrak 1357 | mug 1358 | arm 1359 | parasail 1360 | khaki 1361 | fur 1362 | ana 1363 | towards 1364 | drinking water 1365 | 47 1366 | roof 1367 | frosting 1368 | crown 1369 | friends 1370 | baseball glove 1371 | travel 1372 | feet 1373 | doubles 1374 | skate 1375 | stopped 1376 | lufthansa 1377 | virgin 1378 | florida 1379 | fighting 1380 | wheelchair 1381 | vans 1382 | toaster oven 1383 | fell 1384 | very tall 1385 | teddy 1386 | r 1387 | alaska 1388 | chihuahua 1389 | hawk 1390 | 51 1391 | dark brown 
1392 | tags 1393 | blue and gray 1394 | home plate 1395 | to eat 1396 | gothic 1397 | foot 1398 | broadway 1399 | husky 1400 | motorola 1401 | land 1402 | court 1403 | 48 1404 | hummingbird 1405 | black and brown 1406 | smartphone 1407 | neck 1408 | taking pictures 1409 | indoor 1410 | on pole 1411 | hazy 1412 | skirt 1413 | o 1414 | double 1415 | lily 1416 | suzuki 1417 | red white 1418 | cabbage 1419 | talking on cell phone 1420 | korean 1421 | microsoft 1422 | goose 1423 | photo 1424 | wiimote 1425 | curb 1426 | denim 1427 | top right 1428 | beanie 1429 | rowing 1430 | nuts 1431 | on top 1432 | free 1433 | paris 1434 | styrofoam 1435 | converse 1436 | storage 1437 | 46 1438 | g 1439 | 2007 1440 | hotel room 1441 | dots 1442 | cross country skiing 1443 | pm 1444 | 52 1445 | kiwi 1446 | 350 1447 | l 1448 | burger 1449 | monitor 1450 | picnic 1451 | handle 1452 | speaker 1453 | goalie 1454 | bottles 1455 | power 1456 | referee 1457 | veggie 1458 | on chair 1459 | berries 1460 | friend 1461 | tire 1462 | washington dc 1463 | 300 1464 | side 1465 | safari 1466 | yarn 1467 | snowing 1468 | sub 1469 | words 1470 | mat 1471 | dove 1472 | feeding 1473 | bath 1474 | quilt 1475 | lanyard 1476 | doll 1477 | red white blue 1478 | coal 1479 | picnic table 1480 | shark 1481 | swing 1482 | dead 1483 | mushroom 1484 | pond 1485 | cardinals 1486 | fern 1487 | olive 1488 | antique 1489 | tropical 1490 | houses 1491 | fresh 1492 | 2000 1493 | african american 1494 | in 1495 | red and gray 1496 | 39 1497 | on track 1498 | salt 1499 | tub 1500 | toaster 1501 | ivory 1502 | pipe 1503 | overalls 1504 | music 1505 | lime green 1506 | veggies 1507 | avocado 1508 | 66 1509 | at beach 1510 | for fun 1511 | stickers 1512 | tissues 1513 | celery 1514 | canopy 1515 | baseball game 1516 | end 1517 | bud light 1518 | street sign 1519 | christmas tree 1520 | cooked 1521 | radiator 1522 | gate 1523 | golf 1524 | bad 1525 | curved 1526 | lexus 1527 | nissan 1528 | fried 1529 | window sill 1530 | mickey mouse 1531 | helicopter 1532 | on water 1533 | mohawk 1534 | google 1535 | balloon 1536 | signs 1537 | gray and black 1538 | clydesdale 1539 | balloons 1540 | octopus 1541 | dachshund 1542 | rooster 1543 | stroller 1544 | orange and yellow 1545 | tongue 1546 | gun 1547 | smiley face 1548 | fabric 1549 | knee pads 1550 | 20 feet 1551 | top hat 1552 | pumpkin 1553 | fly kite 1554 | f 1555 | captivity 1556 | brace 1557 | belt 1558 | multicolored 1559 | mitt 1560 | dark blue 1561 | not clear 1562 | spaghetti 1563 | formal 1564 | boys 1565 | band 1566 | cobblestone 1567 | rough 1568 | dresser 1569 | socks 1570 | frisbees 1571 | linoleum 1572 | leopard 1573 | toward 1574 | tulip 1575 | foam 1576 | cool 1577 | peas 1578 | tarmac 1579 | puppy 1580 | necktie 1581 | cowboy hat 1582 | thumbs up 1583 | alcohol 1584 | ollie 1585 | hitting ball 1586 | pasture 1587 | deep 1588 | drawing 1589 | utensils 1590 | tablet 1591 | lilies 1592 | heineken 1593 | knives 1594 | kayak 1595 | race 1596 | sliced 1597 | hit 1598 | mets 1599 | pacifier 1600 | grilled 1601 | 49 1602 | cups 1603 | copper 1604 | train tracks 1605 | in street 1606 | hugging 1607 | lemons 1608 | police officer 1609 | writing 1610 | motion 1611 | off white 1612 | blood 1613 | toothpick 1614 | coach 1615 | briefcase 1616 | cooler 1617 | toilets 1618 | legos 1619 | boxing 1620 | dr pepper 1621 | play 1622 | honey 1623 | toshiba 1624 | chalk 1625 | jungle 1626 | brown and tan 1627 | hoodie 1628 | fluorescent 1629 | black and silver 1630 | shelves 1631 | rugby 1632 | crossing 1633 
| mother 1634 | ride 1635 | pointing 1636 | bookshelf 1637 | powdered sugar 1638 | arriving 1639 | wheels 1640 | 54 1641 | blankets 1642 | ultimate frisbee 1643 | sea 1644 | mattress 1645 | arrow 1646 | for sale 1647 | pirate 1648 | kawasaki 1649 | urinal 1650 | papers 1651 | wallpaper 1652 | cell 1653 | bronze 1654 | all of them 1655 | canon 1656 | over 1657 | nursing 1658 | cord 1659 | regular 1660 | rackets 1661 | pony 1662 | cherries 1663 | yellow and orange 1664 | klm 1665 | benches 1666 | saddle 1667 | arch 1668 | sepia 1669 | shell 1670 | pajamas 1671 | money 1672 | some 1673 | circus 1674 | swans 1675 | checkers 1676 | parking meters 1677 | pacific 1678 | los angeles 1679 | dishes 1680 | green and brown 1681 | air france 1682 | awake 1683 | sports 1684 | 2 hours 1685 | thanksgiving 1686 | kettle 1687 | suits 1688 | hanging 1689 | display 1690 | very high 1691 | coconut 1692 | in vase 1693 | angels 1694 | legs 1695 | wakeboarding 1696 | united kingdom 1697 | herding 1698 | jets 1699 | checker 1700 | buffalo 1701 | phillies 1702 | staring 1703 | teeth 1704 | me 1705 | pedestal 1706 | earring 1707 | robe 1708 | sweatshirt 1709 | identification 1710 | flickr 1711 | boxer 1712 | looking out window 1713 | gatorade 1714 | crosstown 1715 | birthday cake 1716 | cameraman 1717 | on toilet 1718 | dinosaur 1719 | muffin 1720 | garden 1721 | wine tasting 1722 | snowboarder 1723 | cafe 1724 | lifeguard 1725 | mixer 1726 | barrel 1727 | on man 1728 | computer screen 1729 | right handed 1730 | railroad crossing 1731 | on stove 1732 | bunk 1733 | stir fry 1734 | dugout 1735 | tarp 1736 | kitesurfing 1737 | checked 1738 | someone 1739 | menu 1740 | relish 1741 | russian 1742 | shepherd 1743 | donkey 1744 | heels 1745 | bleachers 1746 | healthy 1747 | rolex 1748 | green and orange 1749 | throwing frisbee 1750 | on sink 1751 | ivy 1752 | air conditioner 1753 | lines 1754 | raw 1755 | in park 1756 | poor 1757 | blackberry 1758 | new york city 1759 | diesel 1760 | radio 1761 | baskets 1762 | tree branch 1763 | ferry 1764 | scale 1765 | laptop computer 1766 | mall 1767 | robin 1768 | seeds 1769 | chiquita 1770 | dozens 1771 | olympics 1772 | purple and white 1773 | diamonds 1774 | king 1775 | tiled 1776 | green beans 1777 | sheet 1778 | dole 1779 | cage 1780 | far right 1781 | hitting 1782 | bunny 1783 | toast 1784 | horse racing 1785 | harbor 1786 | uniform 1787 | pedestrians 1788 | pitch 1789 | gravy 1790 | jockey 1791 | 53 1792 | urinals 1793 | accident 1794 | es 1795 | flat 1796 | blue green 1797 | navy blue 1798 | logo 1799 | hard 1800 | blueberries 1801 | design 1802 | against wall 1803 | photography 1804 | crib 1805 | castle 1806 | medal 1807 | farmer 1808 | in zoo 1809 | vent 1810 | sled 1811 | corona 1812 | catch 1813 | sedan 1814 | hood 1815 | raspberry 1816 | vehicle 1817 | looking at camera 1818 | shower curtain 1819 | camel 1820 | highway 1821 | kissing 1822 | vegetarian 1823 | onion rings 1824 | strips 1825 | street light 1826 | antelope 1827 | 65 1828 | dawn 1829 | rear 1830 | dad 1831 | driver 1832 | tow truck 1833 | retriever 1834 | eating grass 1835 | residential 1836 | moon 1837 | skull and crossbones 1838 | flour 1839 | eiffel tower 1840 | bridle 1841 | side of road 1842 | ibm 1843 | backward 1844 | handicap 1845 | toilet brush 1846 | tunnel 1847 | little girl 1848 | chains 1849 | badminton 1850 | too many to count 1851 | seattle 1852 | above sink 1853 | t shirt 1854 | cover 1855 | on his head 1856 | chevy 1857 | main 1858 | island 1859 | balls 1860 | light green 1861 | roast beef 
1862 | singing 1863 | blue white 1864 | detroit 1865 | 1 year 1866 | stretching 1867 | drinks 1868 | boarding 1869 | tongs 1870 | lace 1871 | hose 1872 | spiderman 1873 | ketchup and mustard 1874 | nikon 1875 | sniffing 1876 | tank 1877 | ski slope 1878 | bulls 1879 | audi 1880 | shearing 1881 | far 1882 | oriental 1883 | fox 1884 | himself 1885 | 1st 1886 | pocket 1887 | chrome 1888 | line 1889 | decorative 1890 | thumb 1891 | apartments 1892 | sliding 1893 | d 1894 | my best guess is yes 1895 | us open 1896 | baking 1897 | bottom left 1898 | silver and black 1899 | freight 1900 | raspberries 1901 | policeman 1902 | tourist 1903 | trick 1904 | frisbee golf 1905 | spain 1906 | fire extinguisher 1907 | in sink 1908 | unclear 1909 | continental 1910 | in suitcase 1911 | pink and yellow 1912 | far left 1913 | shirts 1914 | broom 1915 | news 1916 | under 1917 | skateboard trick 1918 | magazine 1919 | mouth 1920 | packing 1921 | gone 1922 | kangaroo 1923 | paisley 1924 | airplanes 1925 | yard 1926 | golden 1927 | wavy 1928 | trolley 1929 | cardinal 1930 | bottom right 1931 | playing video games 1932 | single 1933 | tires 1934 | printer 1935 | uniforms 1936 | deck 1937 | furniture 1938 | jelly 1939 | riding horse 1940 | croissant 1941 | xbox 1942 | shore 1943 | jesus 1944 | slippers 1945 | earphones 1946 | coffee cup 1947 | rubber 1948 | letters 1949 | rodeo 1950 | brushing her teeth 1951 | pc 1952 | petting 1953 | star wars 1954 | throw 1955 | braves 1956 | sunrise 1957 | windmill 1958 | bars 1959 | crest 1960 | in snow 1961 | taking selfie 1962 | video 1963 | peanut butter 1964 | coins 1965 | parakeet 1966 | black and orange 1967 | on laptop 1968 | fountain 1969 | directions 1970 | wii controllers 1971 | desserts 1972 | shut 1973 | trucks 1974 | fancy 1975 | wax paper 1976 | hiking 1977 | drain 1978 | baseball player 1979 | earrings 1980 | wii remotes 1981 | lid 1982 | clocks 1983 | 2008 1984 | heat 1985 | raft 1986 | forks 1987 | wheel 1988 | ox 1989 | mercedes benz 1990 | spoons 1991 | sesame 1992 | 68 1993 | married 1994 | smooth 1995 | club 1996 | haircut 1997 | mixed 1998 | bone 1999 | smaller 2000 | overhead 2001 | motor 2002 | coffee table 2003 | on field 2004 | climbing 2005 | buggy 2006 | bright 2007 | aluminum foil 2008 | mint 2009 | leg 2010 | triangles 2011 | k 2012 | on skateboard 2013 | frisby 2014 | 56 2015 | globe 2016 | savannah 2017 | cinnamon 2018 | ferris wheel 2019 | stopping 2020 | sailboats 2021 | us air force 2022 | eyeglasses 2023 | pedestrian crossing 2024 | 120 2025 | shopping 2026 | domestic 2027 | pottery 2028 | dairy 2029 | yogurt 2030 | metro 2031 | base 2032 | pigtails 2033 | ireland 2034 | alligator 2035 | lighter 2036 | maple leaf 2037 | gas station 2038 | on train 2039 | movie 2040 | flamingo 2041 | 101 2042 | cucumbers 2043 | cloud 2044 | plain 2045 | across street 2046 | stadium 2047 | above stove 2048 | liquid 2049 | sticker 2050 | spectators 2051 | decorations 2052 | skies 2053 | ascending 2054 | goatee 2055 | blue and orange 2056 | vinyl 2057 | pink and blue 2058 | ducati 2059 | speakers 2060 | scooters 2061 | heater 2062 | electronics 2063 | stew 2064 | 400 2065 | 1950 2066 | 72 2067 | security 2068 | intersection 2069 | magazines 2070 | 6 inches 2071 | meow 2072 | chili 2073 | speed 2074 | squatting 2075 | robot 2076 | pirates 2077 | remotes 2078 | seafood 2079 | 8 feet 2080 | pencil 2081 | graduation 2082 | lift 2083 | left hand 2084 | stands 2085 | squirrel 2086 | orange and green 2087 | placemat 2088 | space 2089 | wedding cake 2090 | den 2091 | 
heinz 2092 | on boat 2093 | jetblue 2094 | in kitchen 2095 | whale 2096 | bidet 2097 | branches 2098 | mountain dew 2099 | stuffed 2100 | rocky 2101 | braid 2102 | my best guess is no 2103 | vines 2104 | painted 2105 | pads 2106 | red and silver 2107 | cabinets 2108 | second 2109 | persian 2110 | h 2111 | scrambled 2112 | garlic 2113 | colorado 2114 | spray paint 2115 | main street 2116 | orchid 2117 | oar 2118 | swimsuit 2119 | washington monument 2120 | stormy 2121 | faucet 2122 | antenna 2123 | pizza hut 2124 | ping pong 2125 | lego 2126 | concert 2127 | abstract 2128 | dump truck 2129 | bats 2130 | walkway 2131 | cakes 2132 | 90 2133 | commuter 2134 | disc 2135 | paddle boarding 2136 | 12 feet 2137 | volvo 2138 | victorian 2139 | weather vane 2140 | wireless 2141 | multi colored 2142 | spots 2143 | tattoos 2144 | 1 way 2145 | in ocean 2146 | coaster 2147 | player 2148 | couple 2149 | never 2150 | very old 2151 | camper 2152 | turning 2153 | conductor 2154 | town 2155 | video games 2156 | daffodils 2157 | washing 2158 | skateboard park 2159 | costume 2160 | broke 2161 | surprise 2162 | photos 2163 | mack 2164 | plow 2165 | turtle 2166 | thomas 2167 | button 2168 | tour 2169 | blue jeans 2170 | advertisement 2171 | one in front 2172 | spider 2173 | peeing 2174 | 2nd 2175 | still 2176 | silk 2177 | for safety 2178 | surfers 2179 | target 2180 | macbook 2181 | jar 2182 | michigan 2183 | swiss 2184 | catch ball 2185 | state farm 2186 | classroom 2187 | great britain 2188 | dandelions 2189 | border collie 2190 | biking 2191 | fisheye 2192 | stained glass 2193 | war 2194 | motocross 2195 | reins 2196 | windowsill 2197 | semi 2198 | biplane 2199 | 100 feet 2200 | sideways 2201 | ladybug 2202 | in box 2203 | parmesan 2204 | napkins 2205 | wreath 2206 | case 2207 | harry potter 2208 | fog 2209 | 3 inches 2210 | forehand 2211 | plunger 2212 | above toilet 2213 | diet coke 2214 | winnie pooh 2215 | life jacket 2216 | chevron 2217 | hawaiian 2218 | suspenders 2219 | white and pink 2220 | united states of america 2221 | grocery 2222 | riding bike 2223 | brazil 2224 | polar bears 2225 | blue jay 2226 | carnation 2227 | herd 2228 | lobster 2229 | hit tennis ball 2230 | notebook 2231 | cartoon 2232 | soon 2233 | black and green 2234 | tusk 2235 | santa hat 2236 | baked 2237 | trunks 2238 | hockey 2239 | movement 2240 | western 2241 | thai 2242 | in corner 2243 | batman 2244 | dish 2245 | feeding giraffe 2246 | caramel 2247 | cirrus 2248 | ledge 2249 | behind man 2250 | 74 2251 | silverware 2252 | bunch 2253 | soldier 2254 | monster 2255 | verizon 2256 | brass 2257 | pelicans 2258 | patio 2259 | rv 2260 | at airport 2261 | 99 2262 | hsbc 2263 | footprints 2264 | cutting hair 2265 | shih tzu 2266 | on phone 2267 | horseback riding 2268 | sewing 2269 | air canada 2270 | buoy 2271 | calendar 2272 | paddling 2273 | 5 years 2274 | 10 years 2275 | sony ericsson 2276 | straight ahead 2277 | grape 2278 | orange and blue 2279 | joy 2280 | clown 2281 | lamps 2282 | family room 2283 | earbuds 2284 | farmers market 2285 | hammer time 2286 | shaking hands 2287 | teapot 2288 | finger 2289 | patterned 2290 | high heels 2291 | steeple 2292 | carnations 2293 | sticks 2294 | ski resort 2295 | hexagon 2296 | 64 2297 | butt 2298 | pine trees 2299 | almonds 2300 | tennis balls 2301 | 61 2302 | reebok 2303 | towards camera 2304 | huge 2305 | on bike 2306 | burton 2307 | wings 2308 | on bus 2309 | in basket 2310 | bank 2311 | dodge 2312 | bagels 2313 | cluttered 2314 | play tennis 2315 | middle one 2316 | marina 2317 | 
feathers 2318 | toddler 2319 | russia 2320 | thick 2321 | pear 2322 | ny 2323 | victoria 2324 | skiers 2325 | brown and yellow 2326 | yellow and brown 2327 | mound 2328 | licking 2329 | peaches 2330 | anniversary 2331 | cheetah 2332 | aa 2333 | paper towel 2334 | keys 2335 | no left turn 2336 | pens 2337 | bbq 2338 | transparent 2339 | twins 2340 | in tree 2341 | bus station 2342 | american airlines 2343 | protest 2344 | hound 2345 | elmo 2346 | fans 2347 | white one 2348 | leaning 2349 | coleslaw 2350 | gazebo 2351 | sheepdog 2352 | stripped 2353 | mango 2354 | fanta 2355 | fluffy 2356 | calf 2357 | under armour 2358 | throw frisbee 2359 | geico 2360 | paintings 2361 | lacoste 2362 | very big 2363 | more than 10 2364 | strap 2365 | index 2366 | camping 2367 | poop 2368 | peanuts 2369 | macaroni 2370 | ear 2371 | cop 2372 | run 2373 | kfc 2374 | lizard 2375 | sauerkraut 2376 | hiding 2377 | rye 2378 | tea kettle 2379 | hammock 2380 | grandfather 2381 | shopping cart 2382 | shells 2383 | skyscrapers 2384 | controllers 2385 | scratching 2386 | bunt 2387 | back left 2388 | ceiling fan 2389 | 3 ft 2390 | moose 2391 | size 2392 | 30 feet 2393 | yellow and gray 2394 | mother and child 2395 | 15 feet 2396 | tube 2397 | yacht 2398 | turn 2399 | fir 2400 | on sign 2401 | banana peel 2402 | on tree 2403 | muffins 2404 | very fast 2405 | singles 2406 | mouse pad 2407 | atv 2408 | no smoking 2409 | 50 years 2410 | vacation 2411 | competition 2412 | not at all 2413 | radish 2414 | birthday party 2415 | one in back 2416 | playing soccer 2417 | dough 2418 | coats 2419 | container 2420 | woman on left 2421 | toothpicks 2422 | coffee maker 2423 | sidecar 2424 | puma 2425 | logitech 2426 | crocs 2427 | 1 hour 2428 | poster 2429 | gazelle 2430 | junk 2431 | roll 2432 | lilac 2433 | mayo 2434 | tourists 2435 | for balance 2436 | rails 2437 | taller 2438 | cannot tell 2439 | no shirt 2440 | emergency 2441 | jean 2442 | skater 2443 | in bed 2444 | hawaii 2445 | one world 2446 | cigarettes 2447 | chest 2448 | sandy 2449 | ups 2450 | in motion 2451 | mario 2452 | fly 2453 | lasagna 2454 | life vest 2455 | 700 2456 | pearl 2457 | salt and pepper 2458 | stork 2459 | red light 2460 | blow dryer 2461 | somewhat 2462 | laminate 2463 | ears 2464 | factory 2465 | la 2466 | wing 2467 | males 2468 | plantains 2469 | nasa 2470 | acer 2471 | multiple 2472 | lighting 2473 | opened 2474 | bunk bed 2475 | posing for picture 2476 | mountainous 2477 | on rack 2478 | yorkie 2479 | 67 2480 | on mountain 2481 | snake 2482 | braids 2483 | easton 2484 | adults 2485 | show 2486 | tokyo 2487 | few 2488 | card 2489 | privacy 2490 | for protection 2491 | gmc 2492 | snowflakes 2493 | holding 2494 | tan and white 2495 | surprised 2496 | tropicana 2497 | blocks 2498 | rectangles 2499 | mosaic 2500 | gray and red 2501 | tin 2502 | wheelie 2503 | red wine 2504 | tools 2505 | classic 2506 | nature 2507 | greyhound 2508 | mayonnaise 2509 | floating 2510 | team 2511 | biker 2512 | welcome 2513 | 88 2514 | tennis match 2515 | food truck 2516 | room 2517 | best buy 2518 | city street 2519 | canadian 2520 | jet ski 2521 | in bathroom 2522 | balancing 2523 | landscape 2524 | ski sticks 2525 | yellow and pink 2526 | snow skiing 2527 | peace sign 2528 | enclosure 2529 | hallway 2530 | on tray 2531 | kayaking 2532 | broccoli and carrots 2533 | all way 2534 | railroad 2535 | 2 years 2536 | dark green 2537 | theater 2538 | descending 2539 | cleaner 2540 | collage 2541 | pickup 2542 | dark gray 2543 | prom 2544 | closet 2545 | feta 2546 | cameras 2547 | 
college 2548 | listening 2549 | frisbie 2550 | hanger 2551 | practice 2552 | on sand 2553 | sleeveless 2554 | person on left 2555 | 63 2556 | red and orange 2557 | younger 2558 | philadelphia 2559 | behind clouds 2560 | sightseeing 2561 | motorbikes 2562 | parasails 2563 | freezer 2564 | plastic wrap 2565 | stop light 2566 | wakeboard 2567 | zucchini 2568 | jp morgan 2569 | dog and cat 2570 | easter 2571 | pink and black 2572 | grocery store 2573 | hyundai 2574 | lava lamp 2575 | towing 2576 | 250 2577 | rome 2578 | homemade 2579 | oars 2580 | v 2581 | cola 2582 | great 2583 | whipped cream 2584 | chickens 2585 | 50 feet 2586 | safe 2587 | lemonade 2588 | selling 2589 | ginger 2590 | house cat 2591 | blue team 2592 | cat and dog 2593 | toward camera 2594 | riding motorcycle 2595 | pet 2596 | shaving 2597 | ahead 2598 | burrito 2599 | comfort 2600 | garbage can 2601 | shoulder 2602 | in wild 2603 | cathedral 2604 | cd 2605 | double decker bus 2606 | cruise ship 2607 | in oven 2608 | glaze 2609 | traffic lights 2610 | first base 2611 | qantas 2612 | website 2613 | scared 2614 | marines 2615 | tripod 2616 | 1950s 2617 | neon 2618 | sword 2619 | facebook 2620 | vw 2621 | handicapped 2622 | isuzu 2623 | tortilla 2624 | curious 2625 | violet 2626 | on his face 2627 | rams 2628 | 103 2629 | 20 mph 2630 | choppy 2631 | in stands 2632 | 4 ft 2633 | thailand 2634 | ticket 2635 | dome 2636 | syrup 2637 | bob 2638 | reds 2639 | laughing 2640 | tying tie 2641 | mo 2642 | man made 2643 | wood and metal 2644 | high chair 2645 | transport 2646 | 125 2647 | pedestrian 2648 | wrench 2649 | parrots 2650 | wisconsin 2651 | hilly 2652 | pita 2653 | grain 2654 | posts 2655 | baggage claim 2656 | baltimore 2657 | on snow 2658 | porch 2659 | fighter 2660 | dolphin 2661 | pink and purple 2662 | chimney 2663 | windsor 2664 | on runway 2665 | on hill 2666 | name 2667 | digital 2668 | busy 2669 | elm 2670 | planter 2671 | eat it 2672 | beets 2673 | under sink 2674 | brown bear 2675 | neon green 2676 | vintage 2677 | union station 2678 | lap 2679 | fires 2680 | crab 2681 | spiral 2682 | toilet seat 2683 | pans 2684 | backyard 2685 | greek 2686 | casserole 2687 | firefighter 2688 | print 2689 | fighter jet 2690 | balcony 2691 | grooming 2692 | white and tan 2693 | information 2694 | heavy 2695 | beads 2696 | professional 2697 | playground 2698 | oregon 2699 | half full 2700 | dashboard 2701 | kite string 2702 | buttons 2703 | tell time 2704 | tuna 2705 | only 2706 | turban 2707 | take off 2708 | nightstand 2709 | fireman 2710 | mail 2711 | name tag 2712 | sale 2713 | bookcase 2714 | close 2715 | j 2716 | ambulance 2717 | htc 2718 | red yellow 2719 | butterflies 2720 | melon 2721 | philips 2722 | slide 2723 | eye 2724 | upper left 2725 | blue and pink 2726 | omelet 2727 | sculpture 2728 | baggage 2729 | sprite 2730 | under bed 2731 | tiara 2732 | wine bottle 2733 | san diego 2734 | 6 ft 2735 | behind him 2736 | frame 2737 | train car 2738 | 85 2739 | flash 2740 | away from camera 2741 | rider 2742 | left handed 2743 | numerous 2744 | block sun 2745 | statue of liberty 2746 | downward 2747 | looking at phone 2748 | backpacks 2749 | colorful 2750 | sandal 2751 | looking for food 2752 | little boy 2753 | artwork 2754 | after 2755 | used 2756 | golden gate 2757 | babies 2758 | blt 2759 | wax 2760 | waffle 2761 | to hit ball 2762 | very long 2763 | man in middle 2764 | 98 2765 | wii bowling 2766 | the 2767 | yak 2768 | clip 2769 | partly 2770 | vehicles 2771 | disney 2772 | shepard 2773 | miami 2774 | mac and cheese 2775 | 
liquor 2776 | hilton 2777 | catholic 2778 | loading 2779 | countryside 2780 | for shade 2781 | steer 2782 | paper towels 2783 | casual 2784 | computer mouse 2785 | milking 2786 | tomato sauce 2787 | knee 2788 | lambs 2789 | pears 2790 | 747 2791 | festival 2792 | on plane 2793 | cards 2794 | tennessee 2795 | on rock 2796 | baker 2797 | slow down 2798 | hoagie 2799 | supreme 2800 | fashion 2801 | cans 2802 | ranch 2803 | photograph 2804 | ge 2805 | student 2806 | upper right 2807 | earth 2808 | white black 2809 | south africa 2810 | stainless 2811 | pretty 2812 | suit and tie 2813 | to see 2814 | boating 2815 | students 2816 | blue and silver 2817 | red velvet 2818 | riding horses 2819 | dreadlocks 2820 | riding elephant 2821 | blinders 2822 | burgers 2823 | goal 2824 | jackson 2825 | bored 2826 | water ski 2827 | holding it 2828 | amazon 2829 | foreign 2830 | stuffed toy 2831 | herself 2832 | deli 2833 | lays 2834 | dog bed 2835 | on roof 2836 | stoplight 2837 | 140 2838 | 5 ft 2839 | jal 2840 | iris 2841 | practicing 2842 | skillet 2843 | laundry 2844 | gym 2845 | down street 2846 | pickup truck 2847 | buns 2848 | rottweiler 2849 | flat screen 2850 | motel 2851 | by window 2852 | pipes 2853 | 89 2854 | photographing 2855 | wii sports 2856 | by water 2857 | paw 2858 | worms 2859 | 3rd 2860 | gaming 2861 | deep dish 2862 | multicolor 2863 | digging 2864 | kingfisher 2865 | blueberry 2866 | union pacific 2867 | bell pepper 2868 | flower pot 2869 | 76 2870 | plastic bag 2871 | block 2872 | crows 2873 | googles 2874 | on computer 2875 | sure 2876 | under tree 2877 | path 2878 | lion king 2879 | cilantro 2880 | orange and red 2881 | in woods 2882 | on side 2883 | paper plate 2884 | turf 2885 | moo 2886 | parking garage 2887 | light pole 2888 | skeleton 2889 | towel rack 2890 | new york yankees 2891 | under table 2892 | switzerland 2893 | spices 2894 | feather 2895 | bus driver 2896 | u haul 2897 | mustard and ketchup 2898 | countertop 2899 | meeting 2900 | pomeranian 2901 | flats 2902 | drying 2903 | very deep 2904 | telling time 2905 | calculator 2906 | heron 2907 | weather 2908 | egypt 2909 | pee 2910 | rhino 2911 | artificial 2912 | dead end 2913 | thousands 2914 | throw ball 2915 | cushion 2916 | gull 2917 | prince 2918 | melbourne 2919 | to catch ball 2920 | deciduous 2921 | around neck 2922 | fast food 2923 | analog 2924 | playing games 2925 | laptop screen 2926 | palms 2927 | check 2928 | venice 2929 | baseball mitt 2930 | scotland 2931 | grinding 2932 | grind 2933 | father 2934 | brown and green 2935 | late afternoon 2936 | limes 2937 | violin 2938 | visibility 2939 | urinating 2940 | shovel 2941 | mural 2942 | equestrian 2943 | 2005 2944 | milking cow 2945 | sweat 2946 | more than 20 2947 | solid 2948 | 1900 2949 | us airways 2950 | kneeling 2951 | his left 2952 | 57 2953 | scarves 2954 | sailing 2955 | boeing 2956 | stem 2957 | lower 2958 | soccer field 2959 | storm 2960 | celebrating 2961 | asleep 2962 | panasonic 2963 | man and woman 2964 | parent 2965 | north face 2966 | on tennis court 2967 | sas 2968 | street lights 2969 | wait 2970 | 69 2971 | bird feeder 2972 | comforter 2973 | in window 2974 | feeder 2975 | tricks 2976 | using laptop 2977 | kleenex 2978 | jackets 2979 | style 2980 | taco 2981 | bowls 2982 | long time 2983 | sweden 2984 | zig zag 2985 | december 2986 | boardwalk 2987 | toronto 2988 | stuff 2989 | using computer 2990 | skinny 2991 | mesh 2992 | buffet 2993 | burnt 2994 | walmart 2995 | tigers 2996 | no entry 2997 | bending 2998 | bay 2999 | angry birds 3000 | 
cleveland 3001 | dc 3002 | -------------------------------------------------------------------------------- /data/vqa/gt_layout_train2014_new_parse.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/data/vqa/gt_layout_train2014_new_parse.npy -------------------------------------------------------------------------------- /data/vqa/gt_layout_val2014_new_parse.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/data/vqa/gt_layout_val2014_new_parse.npy -------------------------------------------------------------------------------- /data/vqa/vocabulary_layout.txt: -------------------------------------------------------------------------------- 1 | _Find 2 | _Transform 3 | _And 4 | _Describe 5 | 6 | -------------------------------------------------------------------------------- /data/vqa/vocabulary_vqa_glove.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/data/vqa/vocabulary_vqa_glove.npy -------------------------------------------------------------------------------- /eval_model/eval_example.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | from torch.autograd import Variable 3 | from models.layout_assembler import Assembler 4 | from Utils.data_reader import DataReader 5 | import sys 6 | 7 | import os 8 | import torch 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import matplotlib.ticker as ticker 12 | 13 | 14 | from Utils.text_processing import * 15 | 16 | from eval_model.layout_evaluator import run_eval 17 | 18 | 19 | # Data files 20 | vocab_question_file = './exp_clevr/data/vocabulary_clevr.txt' 21 | vocab_layout_file = './exp_clevr/data/vocabulary_layout.txt' 22 | vocab_answer_file = './exp_clevr/data/answers_clevr.txt' 23 | 24 | 25 | exp_name = 'clevr_gt_layout' 26 | tst_image_set = 'val' 27 | out_file = "layout_learning_on_eval_dataset.txt" 28 | 29 | snapshot_name='%08d' % 500 30 | 31 | T_encoder = 45 32 | T_decoder = 20 33 | N = 64 34 | prune_filter_module = True 35 | 36 | 37 | 38 | imdb_file_tst = './exp_clevr/data/imdb/imdb_%s.npy' % tst_image_set 39 | snapshot_file = './exp_clevr/tfmodel/%s/%s' % (exp_name, snapshot_name) 40 | 41 | save_file = './exp_clevr/results/%s/%s.%s.txt' % (exp_name, snapshot_name, tst_image_set) 42 | os.makedirs(os.path.dirname(save_file), exist_ok=True) 43 | 44 | eval_output_file = './exp_clevr/eval_outputs/%s/%s.%s.txt' % (exp_name, snapshot_name, tst_image_set) 45 | os.makedirs(os.path.dirname(eval_output_file), exist_ok=True) 46 | 47 | assembler = Assembler(vocab_layout_file) 48 | data_reader_tst = DataReader(imdb_file_tst, shuffle=False, one_pass=True, 49 | batch_size=N, 50 | T_encoder=T_encoder, 51 | T_decoder=T_decoder, 52 | assembler=assembler, 53 | vocab_question_file=vocab_question_file, 54 | vocab_answer_file=vocab_answer_file, 55 | prune_filter_module=prune_filter_module) 56 | 57 | print('Running test ...') 58 | answer_correct_total = 0 59 | layout_correct_total = 0 60 | layout_valid_total = 0 61 | num_questions_total = 0 62 | answer_word_list = data_reader_tst.batch_loader.answer_dict.word_list 63 | output_answers = [] 64 | 
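# The block below loads a saved seq2seq model snapshot with torch.load, pulls a
# single prefetched batch from the test reader, and decodes it with the
# ground-truth layouts supplied as decoder targets. plot_sample() then renders
# the decoder attention matrix for one question: question words on the x-axis,
# predicted module tokens on the y-axis.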
65 | ##load my model 66 | model = torch.load(snapshot_file) 67 | 68 | n_total = 0 69 | 70 | batch = data_reader_tst.prefetch_queue.get(block=True) 71 | _, batch_size = batch['input_seq_batch'].shape 72 | 73 | input_text_seq_lens = batch['seq_length_batch'] 74 | input_text_seqs = batch['input_seq_batch'] 75 | input_layouts = batch['gt_layout_batch'] 76 | 77 | num_questions_total += batch_size 78 | 79 | n_correct_layout = 0 80 | 81 | input_variable = Variable(torch.LongTensor(input_text_seqs)) 82 | 83 | target_variable = Variable(torch.LongTensor(input_layouts)) 84 | 85 | myLayouts, myAttentions = model(input_variable, input_text_seq_lens, target_variable) 86 | predicted_layouts = torch.topk(myLayouts, 1)[1].cpu().data.numpy()[:, :, 0] 87 | 88 | 89 | sample_idx = 17 90 | 91 | 92 | 93 | def plot_sample(sample_idx): 94 | example_text = input_text_seqs[:,sample_idx] 95 | example_att = myAttentions.data.cpu().numpy()[sample_idx,:,:] 96 | example_layout = predicted_layouts[:,sample_idx] 97 | word_list = load_str_list(vocab_question_file) 98 | layout_list=load_str_list(vocab_layout_file) 99 | sentence = list(map(lambda x:word_list[x],example_text)) 100 | layout = list(map(lambda x: layout_list[x],example_layout)) 101 | sentence_len = sum(1 for i in sentence if i != ';') 102 | layout_len = sum(1 for i in layout if i != '') 103 | fig = plt.figure() 104 | ax = fig.add_subplot(111) 105 | cax = ax.matshow(example_att[0:layout_len,0:sentence_len],cmap='bone') 106 | fig.colorbar(cax) 107 | ax.set_xticklabels(['']+sentence[0:sentence_len],rotation=90) 108 | ax.set_yticklabels(['']+layout[0:layout_len]) 109 | 110 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) 111 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) 112 | plt.show() 113 | 114 | 115 | plot_sample(17) -------------------------------------------------------------------------------- /eval_model/eval_layout_accuracy.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | from eval_model.layout_evaluator import run_eval 3 | import argparse 4 | 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--gpu_id', type=int, default=0) 8 | 9 | parser.add_argument('--exp_name', required=True) 10 | parser.add_argument('--snapshot_name', required=True) 11 | parser.add_argument('--test_split', required=True) 12 | parser.add_argument("--data_dir",type=str, required=True) 13 | parser.add_argument("--image_dir",type=str, required=True) 14 | parser.add_argument("--model_dir",type=str, required=True) 15 | 16 | args = parser.parse_args() 17 | 18 | 19 | exp_name = args.exp_name 20 | snapshot_name = args.snapshot_name 21 | tst_image_set = args.test_split 22 | data_dir = args.data_dir 23 | image_dir = args.image_dir 24 | model_dir = args.model_dir 25 | 26 | layout_accuracy, layout_correct_total, num_questions_total,answer_accuracy =\ 27 | run_eval(exp_name,snapshot_name,tst_image_set,data_dir, image_dir, model_dir, print_log=True) 28 | 29 | 30 | print('On split: %s' % tst_image_set) 31 | print('\t layout accuracy = %f (%d / %d) answer_accuracy= %f' % 32 | (layout_accuracy, layout_correct_total, num_questions_total,answer_accuracy)) -------------------------------------------------------------------------------- /eval_model/eval_layout_learning.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from eval_model.layout_evaluator import run_eval 5 | 6 | exp_name = 
'clevr_gt_layout' 7 | dataSplitSet = 'val' 8 | out_file = "layout_learning_on_eval_dataset.txt" 9 | 10 | eval_results = [] 11 | 12 | 13 | with open(out_file, 'w') as f: 14 | for i_iter in range(500): 15 | snapshot_name = '%08d' % i_iter 16 | snapshot_file = './exp_clevr/tfmodel/%s/%s' % (exp_name, snapshot_name) 17 | if os.path.exists(snapshot_file): 18 | accuracy,_, total = run_eval(exp_name, snapshot_name, dataSplitSet) 19 | eval_results.append((i_iter,accuracy,total)) 20 | print("iter:", i_iter,"\taccuracy:", accuracy, "\ttotal:", total) 21 | sys.stdout.flush() 22 | print("iter:", i_iter, "\taccuracy:", accuracy, "\ttotal:", total,file=f) 23 | 24 | -------------------------------------------------------------------------------- /eval_model/layout_evaluator.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | from torch.autograd import Variable 3 | from models.layout_assembler import Assembler 4 | from Utils.data_reader import DataReader 5 | 6 | import os 7 | import torch 8 | import numpy as np 9 | 10 | from global_variables.global_variables import use_cuda 11 | 12 | 13 | T_encoder = 45 14 | T_decoder = 10 15 | N = 64 16 | prune_filter_module = True 17 | 18 | 19 | 20 | 21 | def run_eval(exp_name, snapshot_name, tst_image_set, data_dir, image_feat_dir, tf_model_dir,print_log = False): 22 | vocab_question_file = os.path.join(data_dir,"vocabulary_clevr.txt") 23 | vocab_layout_file = os.path.join(data_dir,"vocabulary_layout.txt") 24 | vocab_answer_file = os.path.join(data_dir,"answers_clevr.txt") 25 | 26 | imdb_file_tst_base_name = 'imdb_%s.npy' % tst_image_set 27 | imdb_file_tst = os.path.join(data_dir,"imdb",imdb_file_tst_base_name) 28 | 29 | image_feat_dir_tst = os.path.join(image_feat_dir,tst_image_set) 30 | 31 | #module_snapshot_file = './exp_clevr/tfmodel/%s/%s' % (exp_name, "model_"+snapshot_name) 32 | 33 | module_snapshot_file = os.path.join(tf_model_dir, exp_name, "model_"+snapshot_name) 34 | assembler = Assembler(vocab_layout_file) 35 | 36 | data_reader_tst = DataReader(imdb_file_tst,image_feat_dir_tst, shuffle=False, one_pass=True, 37 | batch_size=N, 38 | T_encoder=T_encoder, 39 | T_decoder=T_decoder, 40 | assembler=assembler, 41 | vocab_question_file=vocab_question_file, 42 | vocab_answer_file=vocab_answer_file, 43 | prune_filter_module=prune_filter_module) 44 | 45 | 46 | if data_reader_tst is not None: 47 | print('Running test ...') 48 | 49 | 50 | answer_correct_total = 0 51 | layout_correct_total = 0 52 | layout_valid_total = 0 53 | num_questions_total = 0 54 | 55 | 56 | ##load my model 57 | myModel = torch.load(module_snapshot_file) 58 | 59 | 60 | for i, batch in enumerate(data_reader_tst.batches()): 61 | 62 | _, batch_size = batch['input_seq_batch'].shape 63 | 64 | input_text_seq_lens = batch['seq_length_batch'] 65 | input_text_seqs = batch['input_seq_batch'] 66 | input_layouts = batch['gt_layout_batch'] 67 | input_images = batch['image_feat_batch'] 68 | input_answers = batch['answer_label_batch'] 69 | 70 | num_questions_total += batch_size 71 | 72 | 73 | input_txt_variable = Variable(torch.LongTensor(input_text_seqs)) 74 | input_txt_variable = input_txt_variable.cuda() if use_cuda else input_txt_variable 75 | 76 | input_layout_variable = None 77 | 78 | _, _, myAnswer, predicted_layouts, expr_validity_array,_ = myModel( 79 | input_txt_variable=input_txt_variable, input_text_seq_lens=input_text_seq_lens, 80 | input_layout_variable=input_layout_variable, 81 | 
input_answers=None, input_images=input_images,sample_token=False) 82 | 83 | 84 | layout_correct_total += np.sum(np.all(predicted_layouts == input_layouts, axis=0)) 85 | 86 | 87 | answer_correct_total += np.sum(np.logical_and(expr_validity_array, myAnswer == input_answers)) 88 | 89 | layout_valid_total += np.sum(expr_validity_array) 90 | 91 | ##current accuracy 92 | layout_accuracy = layout_correct_total / num_questions_total 93 | answer_accuracy = answer_correct_total / num_questions_total 94 | layout_validity = layout_valid_total / num_questions_total 95 | 96 | if (i+1)%100 ==0 and print_log: 97 | print("iter:", i + 1, " layout_accuracy=%.4f"% layout_accuracy, 98 | " answer_accuracy=%.4f"% answer_accuracy, 99 | " layout_validity=%.4f"% layout_validity,) 100 | 101 | 102 | 103 | return layout_accuracy, layout_correct_total ,num_questions_total, answer_accuracy -------------------------------------------------------------------------------- /global_variables/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/global_variables/__init__.py -------------------------------------------------------------------------------- /global_variables/global_variables.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | use_cuda = torch.cuda.is_available() 5 | 6 | model_type_gt = "gt_layout" 7 | model_type_scratch = "scratch" 8 | model_type_gt_rl = "gt+rl" 9 | -------------------------------------------------------------------------------- /loadn2nmn_pytorch_env.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="/private/home/tinayujiang/n2nmn_pytorch:$PYTHONPATH" 2 | 3 | module load cudnn/v7.0-cuda.9.0 4 | 5 | -------------------------------------------------------------------------------- /models/Attention2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import numpy as np 6 | from global_variables.global_variables import use_cuda 7 | 8 | 9 | class EncoderRNN(nn.Module): 10 | def __init__(self, input_size, hidden_size, input_encoding_size, num_layers=1): 11 | super(EncoderRNN, self).__init__() 12 | self.hidden_size = hidden_size 13 | self.num_layers = num_layers 14 | 15 | self.embedding = nn.Embedding(input_size, input_encoding_size) 16 | self.lstm = nn.LSTM(input_encoding_size, hidden_size) 17 | 18 | def forward(self, input_seqs, input_seq_lens, hidden): 19 | embedded = self.embedding(input_seqs) 20 | outputs, hidden = self.lstm(embedded) 21 | return outputs, hidden, embedded 22 | 23 | def initHidden(self,batch_size): 24 | result = Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)) 25 | if use_cuda: 26 | return result.cuda() 27 | else: 28 | return result 29 | 30 | 31 | 32 | class AttnDecoderRNN(nn.Module): 33 | def __init__(self, hidden_size, output_size, output_encoding_size, 34 | max_decoder_len=0, dropout_p=0.1,num_layers = 1, 35 | assembler_w=None, assembler_b=None, assembler_p = None,EOStoken=-1): 36 | super(AttnDecoderRNN, self).__init__() 37 | self.hidden_size = hidden_size 38 | self.output_size = output_size 39 | self.output_encoding_size = output_encoding_size 40 | self.dropout_p = dropout_p 41 | self.num_layers = num_layers 42 | self.max_decoder_len = max_decoder_len 
43 | 44 | self.go_embeding = nn.Embedding(1, self.output_encoding_size) 45 | self.embedding = nn.Embedding(self.output_size, self.output_encoding_size) 46 | self.dropout = nn.Dropout(self.dropout_p) 47 | self.lstm = nn.LSTM(self.output_encoding_size, self.hidden_size) 48 | self.out = nn.Linear(self.hidden_size * 2, self.output_size) 49 | self.encoderLinear = nn.Linear(self.hidden_size, self.hidden_size) 50 | self.decoderLinear = nn.Linear(self.hidden_size, self.hidden_size) 51 | self.attnLinear = nn.Linear(self.hidden_size, 1) 52 | self.assembler_w = torch.FloatTensor(assembler_w).cuda() if use_cuda else torch.FloatTensor(assembler_w) 53 | self.assembler_b = torch.FloatTensor(assembler_b).cuda() if use_cuda else torch.FloatTensor(assembler_b) 54 | self.assembler_p = torch.FloatTensor(assembler_p).cuda() if use_cuda else torch.FloatTensor(assembler_p) 55 | self.batch_size = 0 56 | self.EOS_token = EOStoken 57 | self._init_par() 58 | 59 | def _init_par(self): 60 | torch.nn.init.xavier_uniform(self.decoderLinear.weight) 61 | torch.nn.init.xavier_uniform(self.attnLinear.weight) 62 | torch.nn.init.constant(self.decoderLinear.bias,0) 63 | torch.nn.init.constant(self.attnLinear.bias,0) 64 | ''' 65 | compute if a token is valid at current sequence 66 | decoding_state [N,3] 67 | assembler_w [3,output_size, 4 ] 68 | assembler_b [output_size, 4] 69 | output [N, output_size] 70 | ''' 71 | def _get_valid_tokens(self,decoding_state, assembler_W, assembler_b): 72 | 73 | batch_size = decoding_state.size(0) 74 | expanded_state = decoding_state.view(batch_size,3,1,1).expand(batch_size, 3, self.output_size, 4) 75 | 76 | expanded_w= assembler_W.view(1,3, self.output_size,4).expand(batch_size, 3, self.output_size, 4) 77 | 78 | tmp1 = torch.sum(expanded_state * expanded_w, dim=1) 79 | expanded_b = assembler_b.view(1,-1,4).expand(batch_size,-1,4) 80 | tmp2= tmp1 - expanded_b 81 | tmp3 = torch.min(tmp2,dim=2)[0] 82 | token_invalidity = torch.lt(tmp3, 0) 83 | token_invalidity = token_invalidity.cuda() if use_cuda else token_invalidity 84 | return token_invalidity 85 | 86 | 87 | ''' 88 | update the decoding state, which is used to determine if a token is valid 89 | decoding_state [N,3] 90 | assembler_p [output_size, 3] 91 | predicted_token [N,output_size] 92 | output [N, output_size] 93 | ''' 94 | def _update_decoding_state(self, decoding_state, predicted_token, assembler_P): 95 | decoding_state = decoding_state + torch.mm(predicted_token , assembler_P) 96 | return decoding_state 97 | 98 | 99 | ''' 100 | for a give state compute the lstm hidden layer, attention and predicted layers 101 | can handle the situation where seq_len is 1 or >1 (i.e., s=using groudtruth layout) 102 | 103 | input parameters : 104 | time: int, time step of decoder 105 | previous_token: [decoder_len, batch], decoder_len=1 for step-by-step decoder 106 | previous_hidden_state: (h_n, c_n), dimmension:both are (num_layers * num_directions, batch, hidden_size) 107 | encoder_outputs : outputs from LSTM in encoder[seq_len, batch, hidden_size * num_directions] 108 | encoder_lens: list of input sequence lengths 109 | decoding_state: the state used to decide valid tokens 110 | 111 | output parameters : 112 | predicted_token: [decoder_len, batch] 113 | Att_weighted_text: batch,out_len,txt_embed_dim 114 | log_seq_prob: [batch] 115 | neg_entropy: [batch] 116 | ''' 117 | def _step_by_step_attention_decoder(self, time, embedded, previous_hidden_state, 118 | encoder_outputs, encoder_lens, decoding_state,target_variable,sample_token): 119 | 120 | ##step1 run 
LSTM to get decoder hidden state 121 | seq_len = encoder_outputs.size(0) 122 | batch_size = encoder_outputs.size(1) 123 | hidden_size = encoder_outputs.size(2) 124 | 125 | out_len = embedded.size(0) 126 | 127 | output, hidden = self.lstm(embedded, previous_hidden_state) 128 | ##step2: use function in Eq(2) of the paper to compute attention 129 | ##size encoder_outputs (seq_len,batch_size,hidden_size)==>(out_len,seq_len,batch_size,hidden_size) 130 | encoder_outputs_expand = encoder_outputs.view(1, seq_len, batch_size, hidden_size).expand(out_len, seq_len, 131 | batch_size, 132 | hidden_size) 133 | encoder_transform = self.encoderLinear(encoder_outputs_expand) 134 | 135 | ##size output (out_len,batch_size,hidden_size) 136 | output_expand = output.view(out_len, 1, batch_size, hidden_size).expand(out_len, seq_len, batch_size, 137 | hidden_size) 138 | output_transfrom = self.decoderLinear(output_expand) 139 | 140 | ##raw_attention size (out_len,seq_len,batch_size,1) 141 | raw_attention = self.attnLinear(F.tanh(encoder_transform + output_transfrom)).view(out_len, seq_len, 142 | batch_size) ## Eq2 143 | 144 | # (out_len, seq_len, batch_size)==>(batch_size,out_len,seq_len) 145 | raw_attention = raw_attention.permute(2, 0, 1) 146 | 147 | ##mask the end of the question 148 | if encoder_lens is not None: 149 | mask = np.ones((batch_size, out_len, seq_len)) 150 | for i, v in enumerate(encoder_lens): 151 | mask[i, :, 0:v] = 0 152 | mask_tensor = torch.ByteTensor(mask) 153 | mask_tensor = mask_tensor.cuda() if use_cuda else mask_tensor 154 | raw_attention.data.masked_fill_(mask_tensor, -float('inf')) 155 | 156 | attention = F.softmax(raw_attention, dim=2) ##(batch,out_len,seq_len) 157 | 158 | 159 | ##c_t = \sum_{i=1}^I att_{ti}h_i t: decoder time t, and encoder time i 160 | ## (seq_len,batch_size,hidden_size) ==>(batch_size,seq_len,hidden_size) 161 | encoder_batch_first = encoder_outputs.permute(1, 0, 2) 162 | context = torch.bmm(attention, encoder_batch_first) 163 | 164 | ##(out_len,batch,hidden_size) --> (batch,out_len,hidden_size) 165 | output_batch_first = output.permute(1, 0, 2) 166 | 167 | ##(batch,out_len,hidden_size*2) 168 | combined = torch.cat((context, output_batch_first), dim=2).permute(1, 0, 2) 169 | 170 | ## [out_len,batch,out_size] 171 | output_prob = F.softmax(self.out(combined), dim=2) 172 | 173 | 174 | 175 | ##get the valid token for current position based on previous token to perform a mask for next prediction 176 | ## token_validity [N, output_size] 177 | token_invalidity = self._get_valid_tokens(decoding_state=decoding_state, 178 | assembler_W=self.assembler_w, 179 | assembler_b=self.assembler_b) 180 | 181 | ## probs 182 | probs = output_prob.view(-1,self.output_size) 183 | probs.data.masked_fill_(token_invalidity,0.0) 184 | probs_sum = torch.sum(probs, dim=1, keepdim=True) 185 | probs = probs/probs_sum 186 | 187 | 188 | if target_variable is not None: 189 | predicted_token = target_variable[time, :].view(-1,1) 190 | elif sample_token: 191 | predicted_token = probs.multinomial() 192 | else: 193 | predicted_token = torch.max(probs, dim=1)[1].view(-1, 1) 194 | 195 | 196 | ##[batch_size, self.output_size] 197 | tmp = torch.zeros(batch_size, self.output_size) 198 | tmp = tmp.cuda() if use_cuda else tmp 199 | predicted_token_encoded = tmp.scatter_(1, predicted_token.data, 1.0) 200 | predicted_token_encoded = predicted_token_encoded.cuda() if use_cuda else predicted_token_encoded 201 | 202 | updated_decoding_state = self._update_decoding_state(decoding_state=decoding_state, 203 | 
predicted_token=predicted_token_encoded, 204 | assembler_P=self.assembler_p) 205 | 206 | ## compute the negative entropy 207 | token_invalidity_float = Variable(token_invalidity.type(torch.FloatTensor)).detach() 208 | token_invalidity_float = token_invalidity_float.cuda() if use_cuda else token_invalidity_float 209 | token_neg_entropy = torch.sum(probs.detach() * torch.log(probs + 0.000001), dim=1) 210 | 211 | ## compute log_seq_prob 212 | selected_token_log_prob =torch.log(torch.sum(probs * Variable(predicted_token_encoded), dim=1)+ 0.000001) 213 | 214 | 215 | return predicted_token.permute(1, 0), hidden, attention, updated_decoding_state,token_neg_entropy, selected_token_log_prob 216 | 217 | 218 | 219 | 220 | 221 | def forward(self,encoder_hidden,encoder_outputs,encoder_lens,target_variable,sample_token): 222 | self.batch_size = encoder_outputs.size(1) 223 | total_neg_entropy = 0 224 | total_seq_prob = 0 225 | 226 | ## set initiate step: 227 | time = 0 228 | start_token = Variable(torch.LongTensor(np.zeros((1, self.batch_size))), requires_grad=False) 229 | start_token = start_token.cuda() if use_cuda else start_token 230 | next_input = self.go_embeding(start_token) 231 | next_decoding_state = torch.FloatTensor([[0, 0, self.max_decoder_len]]).expand(self.batch_size, 3).contiguous() 232 | next_decoding_state = next_decoding_state.cuda() if use_cuda else next_decoding_state 233 | loop_state = True 234 | previous_hidden = encoder_hidden 235 | 236 | while time < self.max_decoder_len : 237 | predicted_token, previous_hidden, context, next_decoding_state, neg_entropy, log_seq_prob = \ 238 | self._step_by_step_attention_decoder(time=time, 239 | embedded= next_input, 240 | previous_hidden_state=previous_hidden, encoder_outputs=encoder_outputs, 241 | encoder_lens=encoder_lens, decoding_state=next_decoding_state,target_variable= target_variable,sample_token=sample_token) 242 | 243 | if time == 0: 244 | predicted_tokens = predicted_token 245 | total_neg_entropy = neg_entropy 246 | total_seq_prob = log_seq_prob 247 | context_total = context 248 | else: 249 | predicted_tokens = torch.cat((predicted_tokens, predicted_token)) 250 | total_neg_entropy += neg_entropy 251 | total_seq_prob += log_seq_prob 252 | context_total = torch.cat((context_total, context), dim=1) 253 | 254 | time +=1 255 | next_input =self.embedding(predicted_token) 256 | loop_state = torch.ne(predicted_token, self.EOS_token).any() 257 | 258 | return predicted_tokens, context_total, total_neg_entropy, total_seq_prob 259 | 260 | 261 | 262 | 263 | class attention_seq2seq(nn.Module): 264 | def __init__(self, encoder, decoder): 265 | super(attention_seq2seq, self).__init__() 266 | self.encoder = encoder 267 | self.decoder = decoder 268 | 269 | def forward(self, input_seqs,input_seq_lens,target_variable,sample_token): 270 | encoder_hidden = self.encoder.initHidden(len(input_seq_lens)) 271 | encoder_outputs, encoder_hidden, txt_embedded = self.encoder(input_seqs,input_seq_lens, encoder_hidden) 272 | decoder_results, attention, neg_entropy, log_seq_prob = self.decoder(target_variable=target_variable, 273 | encoder_hidden= encoder_hidden, 274 | encoder_outputs= encoder_outputs, 275 | encoder_lens=input_seq_lens, sample_token=sample_token 276 | ) 277 | ##using attention from decoder and txt_embedded from the encoder to get the attention weighted text 278 | ## txt_embedded [seq_len,batch,input_encoding_size] 279 | ## attention [batch, out_len,seq_len] 280 | txt_embedded_perm = txt_embedded.permute(1,0,2) 281 | att_weighted_text = 
torch.bmm(attention, txt_embedded_perm) 282 | 283 | 284 | return decoder_results, att_weighted_text, neg_entropy, log_seq_prob 285 | #return decoder_results, attention, neg_entropy, log_seq_prob 286 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/models/__init__.py -------------------------------------------------------------------------------- /models/custom_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class custom_loss(nn.Module): 5 | def __init__(self,lambda_entropy): 6 | super(custom_loss, self).__init__() 7 | self.lambda_entropy = lambda_entropy 8 | 9 | def forward(self, neg_entropy, answer_loss, policy_gradient_losses=None,layout_loss =None): 10 | answer = torch.mean(answer_loss) 11 | #entropy = torch.mean(neg_entropy) 12 | #policy_gradient = torch.mean(policy_gradient_losses) 13 | #print(" answer= %f, entropy = %f, policy_gradient = %f" % 14 | # (answer,entropy,policy_gradient)) 15 | 16 | if layout_loss is None: 17 | return torch.mean(neg_entropy) * self.lambda_entropy +\ 18 | torch.mean(answer_loss)+torch.mean(policy_gradient_losses), answer 19 | else: 20 | return answer + layout_loss, answer 21 | -------------------------------------------------------------------------------- /models/end2endModuleNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import sys 4 | from models.Attention2 import * 5 | from models.module_net import * 6 | from Utils.utils import unique_columns 7 | 8 | 9 | 10 | 11 | class end2endModuleNet(nn.Module): 12 | def __init__(self, num_vocab_txt, num_vocab_nmn, out_num_choices, 13 | embed_dim_nmn, embed_dim_txt, image_height, image_width, in_image_dim, 14 | hidden_size, assembler, layout_criterion, answer_criterion,max_layout_len, num_layers=1, decoder_dropout=0,**kwarg): 15 | 16 | super(end2endModuleNet, self).__init__() 17 | 18 | self.assembler = assembler 19 | self.layout_criterion = layout_criterion 20 | self.answer_criterion = answer_criterion 21 | 22 | 23 | ##initiate encoder and decoder 24 | myEncoder = EncoderRNN(num_vocab_txt, hidden_size, embed_dim_txt, num_layers) 25 | myDecoder = AttnDecoderRNN(hidden_size, num_vocab_nmn, embed_dim_nmn, 26 | max_decoder_len = max_layout_len, 27 | dropout_p=decoder_dropout, num_layers= num_layers, 28 | assembler_w=self.assembler.W, assembler_b=self.assembler.b, 29 | assembler_p=self.assembler.P, EOStoken=self.assembler.EOS_idx) 30 | 31 | if use_cuda: 32 | myEncoder = myEncoder.cuda() 33 | myDecoder = myDecoder.cuda() 34 | 35 | 36 | ##initatiate attentionSeq2seq 37 | mySeq2seq = attention_seq2seq(myEncoder, myDecoder) 38 | self.mySeq2seq = mySeq2seq.cuda() if use_cuda else mySeq2seq 39 | 40 | 41 | ##initiate moduleNet 42 | myModuleNet = module_net(image_height=image_height, image_width=image_width, in_image_dim=in_image_dim, 43 | in_text_dim=embed_dim_txt, out_num_choices=out_num_choices, map_dim=hidden_size) 44 | 45 | self.myModuleNet = myModuleNet.cuda() if use_cuda else myModuleNet 46 | 47 | def forward(self, input_txt_variable, input_text_seq_lens, 48 | input_images, input_answers, 49 | input_layout_variable,sample_token, policy_gradient_baseline=None, 50 | baseline_decay=None): 51 | 52 | batch_size = 
len(input_text_seq_lens) 53 | 54 | ##run attentionSeq2Seq 55 | myLayouts, myAttentions, neg_entropy, log_seq_prob = \ 56 | self.mySeq2seq(input_txt_variable, input_text_seq_lens, input_layout_variable,sample_token) 57 | 58 | 59 | layout_loss = None 60 | if input_layout_variable is not None: 61 | layout_loss = torch.mean(-log_seq_prob) 62 | 63 | predicted_layouts = np.asarray(myLayouts.cpu().data.numpy()) 64 | expr_list, expr_validity_array = self.assembler.assemble(predicted_layouts) 65 | 66 | ## group samples based on layout 67 | sample_groups_by_layout = unique_columns(predicted_layouts) 68 | 69 | ##run moduleNet 70 | answer_losses = None 71 | policy_gradient_losses = None 72 | avg_answer_loss =None 73 | total_loss = None 74 | updated_baseline = policy_gradient_baseline 75 | current_answer = np.zeros(batch_size) 76 | 77 | for sample_group in sample_groups_by_layout: 78 | if sample_group.shape == 0: 79 | continue 80 | 81 | first_in_group = sample_group[0] 82 | if expr_validity_array[first_in_group]: 83 | layout_exp = expr_list[first_in_group] 84 | 85 | if input_answers is None: 86 | ith_answer_variable = None 87 | else: 88 | ith_answer = input_answers[sample_group] 89 | ith_answer_variable = Variable(torch.LongTensor(ith_answer)) 90 | ith_answer_variable = ith_answer_variable.cuda() if use_cuda else ith_answer_variable 91 | 92 | textAttention = myAttentions[sample_group, :] 93 | 94 | ith_image = input_images[sample_group, :, :, :] 95 | ith_images_variable = Variable(torch.FloatTensor(ith_image)) 96 | ith_images_variable = ith_images_variable.cuda() if use_cuda else ith_images_variable 97 | 98 | ##image[batch_size, H_feat, W_feat, D_feat] ==> [batch_size, D_feat, W_feat, H_feat] for conv2d 99 | #ith_images_variable = ith_images_variable.permute(0, 3, 1, 2) 100 | 101 | ith_images_variable = ith_images_variable.contiguous() 102 | 103 | myAnswers = self.myModuleNet(input_image_variable=ith_images_variable, 104 | input_text_attention_variable=textAttention, 105 | target_answer_variable=ith_answer_variable, 106 | expr_list=layout_exp) 107 | current_answer[sample_group] = torch.topk(myAnswers, 1)[1].cpu().data.numpy()[:, 0] 108 | 109 | 110 | ##compute loss function only when answer is provided 111 | if ith_answer_variable is not None: 112 | current_answer_loss = self.answer_criterion(myAnswers, ith_answer_variable) 113 | sample_group_tensor = torch.cuda.LongTensor(sample_group) if use_cuda else torch.LongTensor(sample_group) 114 | 115 | current_log_seq_prob = log_seq_prob[sample_group_tensor] 116 | current_answer_loss_val = Variable(current_answer_loss.data,requires_grad=False) 117 | tmp1 = current_answer_loss_val - policy_gradient_baseline 118 | current_policy_gradient_loss = tmp1 * current_log_seq_prob 119 | 120 | if answer_losses is None: 121 | answer_losses = current_answer_loss 122 | policy_gradient_losses = current_policy_gradient_loss 123 | else: 124 | answer_losses = torch.cat((answer_losses, current_answer_loss)) 125 | policy_gradient_losses = torch.cat((policy_gradient_losses, current_policy_gradient_loss)) 126 | 127 | try: 128 | if input_answers is not None: 129 | total_loss, avg_answer_loss = self.layout_criterion(neg_entropy=neg_entropy, 130 | answer_loss=answer_losses, 131 | policy_gradient_losses=policy_gradient_losses, 132 | layout_loss=layout_loss) 133 | ##update layout policy baseline 134 | avg_sample_loss = torch.mean(answer_losses) 135 | avg_sample_loss_value = avg_sample_loss.cpu().data.numpy()[0] 136 | updated_baseline = policy_gradient_baseline + (1 - baseline_decay) * ( 
137 | avg_sample_loss_value - policy_gradient_baseline) 138 | 139 | except: 140 | print("sample_group = ", sample_group) 141 | print("neg_entropy=", neg_entropy) 142 | print("answer_losses=", answer_losses) 143 | print("policy_gradient_losses=", policy_gradient_losses) 144 | print("layout_loss=", layout_loss) 145 | sys.stdout.flush() 146 | sys.exit("Exception Occur") 147 | 148 | 149 | 150 | 151 | 152 | return total_loss, avg_answer_loss, current_answer, predicted_layouts, expr_validity_array, updated_baseline 153 | 154 | 155 | 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /models/function2Module.py: -------------------------------------------------------------------------------- 1 | 2 | from models.modules import * 3 | 4 | function2module = { 5 | 'filter_color': FilterModule, 6 | 'filter_material': FilterModule, 7 | 'filter_shape': FilterModule, 8 | 'filter_size': FilterModule, 9 | 10 | 'same_color': FindSamePropertyModule, 11 | 'same_material': FindSamePropertyModule, 12 | 'same_shape': FindSamePropertyModule, 13 | 'same_size': FindSamePropertyModule, 14 | 15 | 'relate': TransformModule, 16 | 'intersect': AndModule, 17 | 'union': OrModule, 18 | 19 | 'count': CountModule, 20 | 'exist': ExistModule, 21 | 'equal_integer': EqualNumModule, 22 | 'greater_than': MoreNumModule, 23 | 'less_than': LessNumModule, 24 | 25 | 'equal_color': SamePropertyModule, 26 | 'equal_material': SamePropertyModule, 27 | 'equal_shape': SamePropertyModule, 28 | 'equal_size': SamePropertyModule, 29 | 30 | 'query_color': DescribeModule, 31 | 'query_material': DescribeModule, 32 | 'query_shape': DescribeModule, 33 | 'query_size': DescribeModule, 34 | 35 | 'scene': SceneModule, 36 | 'unique': None 37 | } -------------------------------------------------------------------------------- /models/layout_assembler.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import numpy as np 4 | 5 | # the number of attention input to each module 6 | _module_input_num = { 7 | '_Scene': 0, 8 | '_Find': 0, 9 | '_Filter': 1, 10 | '_FindSameProperty': 1, 11 | '_Transform': 1, 12 | '_And': 2, 13 | '_Or': 2, 14 | '_Count': 1, 15 | '_Exist': 1, 16 | '_EqualNum': 2, 17 | '_MoreNum': 2, 18 | '_LessNum': 2, 19 | '_SameProperty': 2, 20 | '_Describe': 1} 21 | 22 | # output type of each module 23 | _module_output_type = { 24 | '_Scene': 'att', 25 | '_Find': 'att', 26 | '_Filter': 'att', 27 | '_FindSameProperty': 'att', 28 | '_Transform': 'att', 29 | '_And': 'att', 30 | '_Or': 'att', 31 | '_Count': 'ans', 32 | '_Exist': 'ans', 33 | '_EqualNum': 'ans', 34 | '_MoreNum': 'ans', 35 | '_LessNum': 'ans', 36 | '_SameProperty': 'ans', 37 | '_Describe': 'ans'} 38 | 39 | INVALID_EXPR = 'INVALID_EXPR' 40 | # decoding validity: maintaining a state x of [#att, #ans, T_remain] 41 | # when T_remain is T_decoder when decoding the first module token 42 | # a token s can be predicted iff all( - b_s >= 0) 43 | # the validity token list is 44 | # XW - b >= 0 45 | # the state transition matrix is P, so the state update is X += S P, 46 | # where S is the predicted tokens (one-hot vectors) 47 | def _build_validity_mats(module_names): 48 | state_size = 3 49 | num_vocab_nmn = len(module_names) 50 | num_constraints = 4 51 | P = np.zeros((num_vocab_nmn, state_size), np.int32) 52 | W = np.zeros((state_size, num_vocab_nmn, num_constraints), np.int32) 53 | b = np.zeros((num_vocab_nmn, num_constraints), np.int32) 
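# Illustrative sketch (comments only, not part of the construction below): with
# the decoding state x = [#att, #ans, T_remain], a candidate token s is valid iff
#     all(x @ W[:, s, :] - b[s, :] >= 0)
# and, after the one-hot prediction S is emitted, the state advances as x += S @ P.
# For example, at the first decoding step x = [0, 0, T_decoder]; constraint 0
# (enough attentions on the stack) rules out every module that consumes an
# attention input, so a layout can only start with a zero-input module such as
# _Find or _Scene, never with _Describe or _And.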
54 | 55 | # collect the input and output numbers of each module 56 | att_in_nums = np.zeros(num_vocab_nmn) 57 | att_out_nums = np.zeros(num_vocab_nmn) 58 | ans_out_nums = np.zeros(num_vocab_nmn) 59 | for n_s, s in enumerate(module_names): 60 | if s != '': 61 | att_in_nums[n_s] = _module_input_num[s] 62 | att_out_nums[n_s] = _module_output_type[s] == 'att' 63 | ans_out_nums[n_s] = _module_output_type[s] == 'ans' 64 | # construct the trasition matrix P 65 | for n_s, s in enumerate(module_names): 66 | P[n_s, 0] = att_out_nums[n_s] - att_in_nums[n_s] 67 | P[n_s, 1] = ans_out_nums[n_s] 68 | P[n_s, 2] = -1 69 | # construct the validity W and b 70 | att_absorb_nums = (att_in_nums - att_out_nums) 71 | max_att_absorb_nonans = np.max(att_absorb_nums * (ans_out_nums == 0)) 72 | max_att_absorb_ans = np.max(att_absorb_nums * (ans_out_nums != 0)) 73 | for n_s, s in enumerate(module_names): 74 | if s != '': 75 | # constraint: a non- module can be outputted iff all the following holds 76 | # * 0) there's enough att in the stack 77 | # #att >= att_in_nums[n_s] 78 | W[0, n_s, 0] = 1 79 | b[n_s, 0] = att_in_nums[n_s] 80 | # * 1) for answer modules, there's no extra att in the stack 81 | # #att <= att_in_nums[n_s] 82 | # -#att >= -att_in_nums[n_s] 83 | # for non-answer modules, T_remain >= 3 84 | # (the last two has to be AnswerType and ) 85 | if ans_out_nums[n_s] != 0: 86 | W[0, n_s, 1] = -1 87 | b[n_s, 1] = -att_in_nums[n_s] 88 | else: 89 | W[2, n_s, 1] = 1 90 | b[n_s, 1] = 3 91 | # * 2) there's no answer in the stack (otherwise only) 92 | # #ans <= 0 93 | # -#ans >= 0 94 | W[1, n_s, 2] = -1 95 | # * 3) there's enough time to consume the all attentions, output answer plus 96 | # 3.1) for non-answer modules, we already have T_remain>= 3 from constraint 2 97 | # In maximum (T_remain-3) further steps 98 | # (plus 3 steps for this, ans, ) to consume atts 99 | # (T_remain-3) * max_att_absorb_nonans + max_att_absorb_ans + att_absorb_nums[n_s] >= #att 100 | # T_remain*MANA - #att >= 3*MANA - MAA - A[s] 101 | # - #att + MANA * T_remain >= 3*MANA - MAA - A[s] 102 | # 3.2) for answer modules, if it can be decoded then constraint 0&1 ensures 103 | # that there'll be no att left in stack after decoding this answer, 104 | # hence no further constraints here 105 | if ans_out_nums[n_s] == 0: 106 | W[0, n_s, 3] = -1 107 | W[2, n_s, 3] = max_att_absorb_nonans 108 | b[n_s, 3] = 3*max_att_absorb_nonans - max_att_absorb_ans - att_absorb_nums[n_s] 109 | else: # -case 110 | # constraint: a token can be outputted iff all the following holds 111 | # * 0) there's ans in the stack 112 | # #ans >= 1 113 | W[1, n_s, 0] = 1 114 | b[n_s, 0] = 1 115 | 116 | return P, W, b 117 | 118 | class Assembler: 119 | def __init__(self, module_vocab_file): 120 | # read the module list, and record the index of each module and 121 | with open(module_vocab_file) as f: 122 | self.module_names = [s.strip() for s in f.readlines()] 123 | # find the index of 124 | for n_s in range(len(self.module_names)): 125 | if self.module_names[n_s] == '': 126 | self.EOS_idx = n_s 127 | break 128 | # build a dictionary from module name to token index 129 | self.name2idx_dict = {name: n_s for n_s, name in enumerate(self.module_names)} 130 | self.num_vocab_nmn = len(self.module_names) 131 | 132 | self.P, self.W, self.b = _build_validity_mats(self.module_names) 133 | 134 | def module_list2tokens(self, module_list, T=None): 135 | layout_tokens = [self.name2idx_dict[name] for name in module_list] 136 | if T is not None: 137 | if len(module_list) >= T: 138 | raise 
ValueError('Not enough time steps to add ') 139 | layout_tokens += [self.EOS_idx]*(T-len(module_list)) 140 | return layout_tokens 141 | 142 | def _layout_tokens2str(self, layout_tokens): 143 | return ' '.join([self.module_names[idx] for idx in layout_tokens]) 144 | 145 | def _invalid_expr(self, layout_tokens, error_str): 146 | return {'module': INVALID_EXPR, 147 | 'expr_str': self._layout_tokens2str(layout_tokens), 148 | 'error': error_str} 149 | 150 | def _assemble_layout_tokens(self, layout_tokens, batch_idx): 151 | # All modules takes a time_idx as the index from LSTM hidden states 152 | # (even if it doesn't need it, like _And), and different arity of 153 | # attention inputs. The output type can be either attention or answer 154 | # 155 | # The final assembled expression for each instance is as follows: 156 | # expr_type := 157 | # {'module': '_Find', 'output_type': 'att', 'time_idx': idx} 158 | # | {'module': '_Transform', 'output_type': 'att', 'time_idx': idx, 159 | # 'inputs_0': } 160 | # | {'module': '_And', 'output_type': 'att', 'time_idx': idx, 161 | # 'inputs_0': , 'inputs_1': )} 162 | # | {'module': '_Answer', 'output_type': 'ans', 'time_idx': idx, 163 | # 'inputs_0': } 164 | # | {'module': INVALID_EXPR, 'expr_str': '...', 'error': '...', 165 | # 'assembly_loss': } (for invalid expressions) 166 | # 167 | 168 | # A valid layout must contain . Assembly fails if it doesn't. 169 | if not np.any(layout_tokens == self.EOS_idx): 170 | return self._invalid_expr(layout_tokens, 'cannot find ') 171 | 172 | # Decoding Reverse Polish Notation with a stack 173 | decoding_stack = [] 174 | for t in range(len(layout_tokens)): 175 | # decode a module/operation 176 | module_idx = layout_tokens[t] 177 | if module_idx == self.EOS_idx: 178 | break 179 | module_name = self.module_names[module_idx] 180 | expr = {'module': module_name, 181 | 'output_type': _module_output_type[module_name], 182 | 'time_idx': t, 'batch_idx': batch_idx} 183 | 184 | input_num = _module_input_num[module_name] 185 | # Check if there are enough input in the stack 186 | if len(decoding_stack) < input_num: 187 | # Invalid expression. Not enough input. 188 | return self._invalid_expr(layout_tokens, 'not enough input for ' + module_name) 189 | 190 | # Get the input from stack 191 | for n_input in range(input_num-1, -1, -1): 192 | stack_top = decoding_stack.pop() 193 | if stack_top['output_type'] != 'att': 194 | # Invalid expression. Input must be attention 195 | return self._invalid_expr(layout_tokens, 'input incompatible for ' + module_name) 196 | expr['input_%d' % n_input] = stack_top 197 | 198 | decoding_stack.append(expr) 199 | 200 | # After decoding the reverse polish expression, there should be exactly 201 | # one expression in the stack 202 | if len(decoding_stack) != 1: 203 | return self._invalid_expr(layout_tokens, 'final stack size not equal to 1 (%d remains)' % len(decoding_stack)) 204 | 205 | result = decoding_stack[0] 206 | # The result type should be answer, not attention 207 | if result['output_type'] != 'ans': 208 | return self._invalid_expr(layout_tokens, 'result type must be ans, not att') 209 | return result 210 | 211 | def assemble(self, layout_tokens_batch): 212 | # layout_tokens_batch is a numpy array with shape [T, N], 213 | # containing module tokens and , in Reverse Polish Notation. 
214 | _, N = layout_tokens_batch.shape 215 | expr_list = [self._assemble_layout_tokens(layout_tokens_batch[:, n], n) 216 | for n in range(N)] 217 | expr_validity = np.array([expr['module'] != INVALID_EXPR 218 | for expr in expr_list], np.bool) 219 | return expr_list, expr_validity 220 | -------------------------------------------------------------------------------- /models/module_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from models.modules import * 5 | from torch.autograd import Variable 6 | 7 | 8 | use_cuda = torch.cuda.is_available() 9 | 10 | 11 | 12 | class module_net(nn.Module): 13 | 14 | ##initiate all small modules which will be used here 15 | def __init__(self, image_height, image_width, in_image_dim, in_text_dim, out_num_choices, map_dim): 16 | super(module_net,self).__init__() 17 | self.image_height = image_height 18 | self.image_width = image_width 19 | self.in_image_dim = in_image_dim 20 | self.in_text_dim = in_text_dim 21 | self.out_num_choices = out_num_choices 22 | self.map_dim = map_dim 23 | self.SceneModule = SceneModule() 24 | self.FindModule = FindModule(image_dim=in_image_dim, text_dim=in_text_dim, map_dim= map_dim) 25 | self.TransformModule = TransformModule(image_dim=in_image_dim, text_dim=in_text_dim, map_dim = map_dim) 26 | self.AndModule = AndModule() 27 | self.OrModule = OrModule() 28 | self.FilterModule = FilterModule(findModule=self.FindModule, andModule=self.AndModule) 29 | self.FindSamePropertyModule = FindSamePropertyModule( 30 | output_num_choice=out_num_choices,image_dim=in_image_dim, text_dim=in_text_dim, map_dim = map_dim) 31 | 32 | self.CountModule = CountModule(output_num_choice=out_num_choices, 33 | image_height=image_height, image_width= image_width) 34 | 35 | self.ExistModule = ExistModule(output_num_choice=out_num_choices, 36 | image_height=image_height, image_width= image_width) 37 | 38 | self.EqualNumModule = EqualNumModule(output_num_choice=out_num_choices, 39 | image_height=image_height, image_width= image_width) 40 | 41 | self.MoreNumModule = MoreNumModule(output_num_choice=out_num_choices, 42 | image_height=image_height, image_width= image_width) 43 | 44 | self.LessNumModule = LessNumModule(output_num_choice=out_num_choices, 45 | image_height=image_height, image_width= image_width) 46 | 47 | self.SamePropertyModule = SamePropertyModule( 48 | output_num_choice=out_num_choices,image_dim=in_image_dim,text_dim=in_text_dim, map_dim=map_dim) 49 | 50 | self.DescribeModule = DescribeModule( 51 | output_num_choice=out_num_choices,image_dim=in_image_dim, text_dim=in_text_dim, map_dim = map_dim) 52 | 53 | self.layout2module = { 54 | '_Filter': self.FilterModule, 55 | '_FindSameProperty': self.FindSamePropertyModule, 56 | '_Transform': self.TransformModule, 57 | '_And': self.AndModule, 58 | '_Or': self.OrModule, 59 | '_Count': self.CountModule, 60 | '_Exist': self.ExistModule, 61 | '_EqualNum': self.EqualNumModule, 62 | '_MoreNum': self.MoreNumModule, 63 | '_LessNum': self.LessNumModule, 64 | '_SameProperty': self.SamePropertyModule, 65 | '_Describe': self.DescribeModule, 66 | '_Find': self.FindModule, 67 | '_Scene': self.SceneModule 68 | } 69 | 70 | #text[N, D_text] 71 | 72 | def recursively_assemble_network(self,input_image_variable, input_text_attention_variable,expr_list): 73 | current_module = self.layout2module[expr_list['module']] 74 | time_idx = expr_list['time_idx'] 75 | text_index = 
Variable(torch.LongTensor([time_idx])) 76 | text_index = text_index.cuda() if use_cuda else text_index 77 | text_at_time = torch.index_select(input_text_attention_variable, dim=1, 78 | index=text_index).view(-1, self.in_text_dim) 79 | 80 | input_0 = None 81 | input_1 = None 82 | 83 | if 'input_0' in expr_list: 84 | input_0 = self.recursively_assemble_network(input_image_variable, 85 | input_text_attention_variable, expr_list['input_0']) 86 | if 'input_1' in expr_list: 87 | input_1 = self.recursively_assemble_network(input_image_variable, input_text_attention_variable, 88 | expr_list['input_1']) 89 | 90 | res = current_module(input_image_variable, text_at_time, input_0, input_1) 91 | return res 92 | 93 | 94 | def forward(self, input_image_variable, input_text_attention_variable, target_answer_variable, expr_list): 95 | 96 | 97 | ##for now assume batch_size = 1 98 | result = self.recursively_assemble_network(input_image_variable,input_text_attention_variable,expr_list) 99 | 100 | return result 101 | 102 | 103 | -------------------------------------------------------------------------------- /models/modules.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | use_cuda = torch.cuda.is_available() 8 | 9 | 10 | ''' 11 | NOTE: in all modules, 12 | image_feat [N,D_image,H,W] 13 | text [N,D_text] 14 | attention [N,1,H,W] 15 | ''' 16 | 17 | 18 | 19 | class SceneModule(nn.Module): 20 | def __init__(self): 21 | super(SceneModule,self).__init__() 22 | 23 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 24 | N, _, H, W = input_image_feat.shape 25 | res = torch.ones((N, 1, H, W)) 26 | att_grid = Variable(res) 27 | att_grid = att_grid.cuda() if use_cuda else att_grid 28 | return att_grid 29 | 30 | 31 | class FindModule(nn.Module): 32 | ''' 33 | Mapping image_feat_grid X text_param ->att.grid 34 | (N,D_image,H,W) X (N,1,D_text) --> [N,1,H,W] 35 | ''' 36 | def __init__(self, image_dim, text_dim, map_dim): 37 | super(FindModule,self).__init__() 38 | self.map_dim = map_dim 39 | self.conv1 = nn.Conv2d(image_dim,map_dim,kernel_size=1) 40 | self.conv2 = nn.Conv2d(map_dim, 1, kernel_size=1) 41 | self.textfc = nn.Linear(text_dim,map_dim) 42 | 43 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 44 | image_mapped = self.conv1(input_image_feat) #(N, map_dim, H, W) 45 | text_mapped = self.textfc(input_text).view(-1, self.map_dim,1,1).expand_as(image_mapped) 46 | elmtwize_mult = image_mapped * text_mapped 47 | elmtwize_mult = F.normalize(elmtwize_mult, p=2, dim=1) #(N, map_dim, H, W) 48 | att_grid = self.conv2(elmtwize_mult) #(N, 1, H, W) 49 | return att_grid 50 | 51 | 52 | 53 | class FilterModule(nn.Module): 54 | def __init__(self, findModule, andModule): 55 | super(FilterModule,self).__init__() 56 | self.andModule = andModule 57 | self.findModule = findModule 58 | 59 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 60 | find_result = self.findModule(input_image_feat,input_text,input_image_attention1,input_image_attention2) 61 | att_grid = self.andModule(input_image_feat,input_text,input_image_attention1,find_result) 62 | return att_grid 63 | 64 | 65 | class FindSamePropertyModule(nn.Module): 66 | def __init__(self,output_num_choice, image_dim, text_dim, map_dim): 67 | 
super(FindSamePropertyModule,self).__init__() 68 | self.out_num_choice = output_num_choice 69 | self.image_dim = image_dim 70 | self.map_dim = map_dim 71 | self.text_fc = nn.Linear(text_dim, map_dim) 72 | self.att_fc_1 = nn.Linear(image_dim, map_dim) 73 | self.conv1 = nn.Conv2d(image_dim, map_dim, kernel_size=1) 74 | self.conv2 = nn.Conv2d(map_dim, 1, kernel_size=1) 75 | 76 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 77 | H, W = input_image_attention1.shape[2:4] 78 | att_softmax_1 = F.softmax(input_image_attention1.view(-1, H * W),dim=1).view(-1, 1, H*W) 79 | image_reshape = input_image_feat.view(-1,self.image_dim,H * W) 80 | att_feat_1 = torch.sum(att_softmax_1 * image_reshape, dim=2) #[N, image_dim] 81 | att_feat_1_mapped = self.att_fc_1(att_feat_1).view(-1, self.map_dim,1,1) #[N, map_dim,1,1] 82 | 83 | text_mapped = self.text_fc(input_text).view(-1,self.map_dim,1,1) 84 | 85 | image_mapped = self.conv1(input_image_feat) # (N, map_dim, H, W) 86 | 87 | elmtwize_mult = image_mapped * text_mapped * att_feat_1_mapped #[N, map_dim, H, W] 88 | elmtwize_mult = F.normalize(elmtwize_mult, p=2, dim=1) 89 | 90 | att_grid = self.conv2(elmtwize_mult) 91 | 92 | return att_grid 93 | 94 | 95 | class TransformModule(nn.Module): 96 | def __init__(self, image_dim, text_dim, map_dim,kernel_size=5, padding=2): 97 | super(TransformModule,self).__init__() 98 | self.map_dim = map_dim 99 | self.conv1 = nn.Conv2d(1, map_dim, kernel_size=kernel_size, padding=padding) 100 | self.conv2 = nn.Conv2d(map_dim, 1, kernel_size=1) 101 | self.textfc = nn.Linear(text_dim,map_dim) 102 | 103 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 104 | image_att_mapped = self.conv1(input_image_attention1) #(N, map_dim, H, W) 105 | text_mapped = self.textfc(input_text).view(-1, self.map_dim,1,1).expand_as(image_att_mapped) 106 | elmtwize_mult = image_att_mapped * text_mapped 107 | elmtwize_mult = F.normalize(elmtwize_mult, p=2, dim=1) #(N, map_dim, H, W) 108 | att_grid = self.conv2(elmtwize_mult) #(N, 1, H, W) 109 | return att_grid 110 | 111 | 112 | class AndModule(nn.Module): 113 | def __init__(self): 114 | super(AndModule,self).__init__() 115 | 116 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 117 | return torch.max(input_image_attention1, input_image_attention2) 118 | 119 | 120 | class OrModule(nn.Module): 121 | def __init__(self): 122 | super(OrModule,self).__init__() 123 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 124 | return torch.min(input_image_attention1, input_image_attention2) 125 | 126 | 127 | 128 | class CountModule(nn.Module): 129 | def __init__(self,output_num_choice, image_height, image_width): 130 | super(CountModule,self).__init__() 131 | self.out_num_choice = output_num_choice 132 | self.lc_out = nn.Linear(image_height*image_width + 3, self.out_num_choice) 133 | 134 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 135 | H, W = input_image_attention1.shape[2:4] 136 | att_all = input_image_attention1.view(-1, H*W) ##flatten attention to [N, H*W] 137 | att_avg = torch.mean(att_all, 1, keepdim=True) 138 | att_min = torch.min(att_all, 1, keepdim=True)[0] 139 | att_max = torch.max(att_all,1, keepdim=True)[0] 140 | att_concat = torch.cat((att_all, att_avg, att_min, att_max), 1) 141 | scores = self.lc_out(att_concat) 
142 | return scores 143 | 144 | 145 | 146 | 147 | class ExistModule(nn.Module): 148 | def __init__(self,output_num_choice, image_height, image_width): 149 | super(ExistModule,self).__init__() 150 | self.out_num_choice = output_num_choice 151 | self.lc_out = nn.Linear(image_height*image_width + 3, self.out_num_choice) 152 | 153 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 154 | H, W = input_image_attention1.shape[2:4] 155 | att_all = input_image_attention1.view(-1, H*W) ##flatten attention to [N, H*W] 156 | att_avg = torch.mean(att_all, 1, keepdim=True) 157 | att_min = torch.min(att_all, 1, keepdim=True)[0] 158 | att_max = torch.max(att_all, 1, keepdim=True)[0] 159 | att_concat = torch.cat((att_all, att_avg, att_min, att_max), 1) 160 | scores = self.lc_out(att_concat) 161 | return scores 162 | 163 | 164 | class EqualNumModule(nn.Module): 165 | def __init__(self,output_num_choice, image_height, image_width): 166 | super(EqualNumModule,self).__init__() 167 | self.out_num_choice = output_num_choice 168 | self.lc_out = nn.Linear(image_height*image_width *2 + 6, self.out_num_choice) 169 | 170 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 171 | H, W = input_image_attention1.shape[2:4] 172 | att1_all = input_image_attention1.view(-1, H * W) ##flatten attention to [N, H*W] 173 | att1_avg = torch.mean(att1_all, 1, keepdim=True) 174 | att1_min = torch.min(att1_all, 1, keepdim=True)[0] 175 | att1_max = torch.max(att1_all, 1, keepdim=True)[0] 176 | 177 | att2_all = input_image_attention2.view(-1, H * W) ##flatten attention to [N, H*W] 178 | att2_avg = torch.mean(att2_all, 1, keepdim=True) 179 | att2_min = torch.min(att2_all, 1, keepdim=True)[0] 180 | att2_max = torch.max(att2_all, 1, keepdim=True)[0] 181 | 182 | att_concat = torch.cat((att1_all, att1_avg, att1_min, att1_max,att2_all, att2_avg, att2_min, att2_max), 1) 183 | scores = self.lc_out(att_concat) 184 | return scores 185 | 186 | class MoreNumModule(nn.Module): 187 | def __init__(self, output_num_choice, image_height, image_width): 188 | super(MoreNumModule, self).__init__() 189 | self.out_num_choice = output_num_choice 190 | self.lc_out = nn.Linear(image_height * image_width * 2 + 6, self.out_num_choice) 191 | 192 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 193 | H, W = input_image_attention1.shape[2:4] 194 | att1_all = input_image_attention1.view(-1, H * W) ##flatten attention to [N, H*W] 195 | att1_avg = torch.mean(att1_all, 1, keepdim=True) 196 | att1_min = torch.min(att1_all, 1, keepdim=True)[0] 197 | att1_max = torch.max(att1_all, 1, keepdim=True)[0] 198 | 199 | att2_all = input_image_attention2.view(-1, H * W) ##flatten attention to [N, H*W] 200 | att2_avg = torch.mean(att2_all, 1, keepdim=True) 201 | att2_min = torch.min(att2_all, 1, keepdim=True)[0] 202 | att2_max = torch.max(att2_all, 1, keepdim=True)[0] 203 | 204 | att_concat = torch.cat((att1_all, att1_avg, att1_min, att1_max, att2_all, att2_avg, att2_min, att2_max), 1) 205 | scores = self.lc_out(att_concat) 206 | return scores 207 | 208 | class LessNumModule(nn.Module): 209 | def __init__(self, output_num_choice, image_height, image_width): 210 | super(LessNumModule, self).__init__() 211 | self.out_num_choice = output_num_choice 212 | self.lc_out = nn.Linear(image_height * image_width * 2 + 6, self.out_num_choice) 213 | 214 | def forward(self, input_image_feat, input_text, 
input_image_attention1=None, input_image_attention2=None): 215 | H, W = input_image_attention1.shape[2:4] 216 | att1_all = input_image_attention1.view(-1, H * W) ##flatten attention to [N, H*W] 217 | att1_avg = torch.mean(att1_all, 1, keepdim=True) 218 | att1_min = torch.min(att1_all, 1, keepdim=True)[0] 219 | att1_max = torch.max(att1_all, 1, keepdim=True)[0] 220 | 221 | att2_all = input_image_attention2.view(-1, H * W) ##flatten attention to [N, H*W] 222 | att2_avg = torch.mean(att2_all, 1, keepdim=True) 223 | att2_min = torch.min(att2_all, 1, keepdim=True)[0] 224 | att2_max = torch.max(att2_all, 1, keepdim=True)[0] 225 | 226 | att_concat = torch.cat((att1_all, att1_avg, att1_min, att1_max, att2_all, att2_avg, att2_min, att2_max), 1) 227 | scores = self.lc_out(att_concat) 228 | return scores 229 | 230 | class SamePropertyModule(nn.Module): 231 | def __init__(self,output_num_choice, image_dim, text_dim, map_dim): 232 | super(SamePropertyModule,self).__init__() 233 | self.out_num_choice = output_num_choice 234 | self.image_dim = image_dim 235 | self.text_fc = nn.Linear(text_dim, map_dim) 236 | self.att_fc_1 = nn.Linear(image_dim, map_dim) 237 | self.att_fc_2 = nn.Linear(image_dim, map_dim) 238 | self.lc_out = nn.Linear(map_dim, self.out_num_choice) 239 | 240 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 241 | H, W = input_image_attention1.shape[2:4] 242 | att_softmax_1 = F.softmax(input_image_attention1.view(-1, H * W),dim=1).view(-1, 1, H*W) 243 | att_softmax_2 = F.softmax(input_image_attention2.view(-1, H * W), dim=1).view(-1, 1, H*W) 244 | image_reshape = input_image_feat.view(-1,self.image_dim,H * W) 245 | att_feat_1 = torch.sum(att_softmax_1 * image_reshape, dim=2) #[N, image_dim] 246 | att_feat_2 = torch.sum(att_softmax_2 * image_reshape, dim=2) 247 | att_feat_1_mapped = self.att_fc_1(att_feat_1) #[N, map_dim] 248 | att_feat_2_mapped = self.att_fc_2(att_feat_2) 249 | 250 | text_mapped = self.text_fc(input_text) 251 | elmtwize_mult = att_feat_1_mapped * text_mapped * att_feat_2_mapped #[N, map_dim] 252 | elmtwize_mult = F.normalize(elmtwize_mult, p=2, dim=1) 253 | scores = self.lc_out(elmtwize_mult) 254 | 255 | return scores 256 | 257 | class DescribeModule(nn.Module): 258 | def __init__(self,output_num_choice, image_dim, text_dim, map_dim): 259 | super(DescribeModule,self).__init__() 260 | self.out_num_choice = output_num_choice 261 | self.image_dim = image_dim 262 | self.text_fc = nn.Linear(text_dim, map_dim) 263 | self.att_fc_1 = nn.Linear(image_dim, map_dim) 264 | self.lc_out = nn.Linear(map_dim, self.out_num_choice) 265 | 266 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 267 | H, W = input_image_attention1.shape[2:4] 268 | att_softmax_1 = F.softmax(input_image_attention1.view(-1, H * W),dim=1).view(-1, 1, H*W) 269 | image_reshape = input_image_feat.view(-1,self.image_dim,H * W) #[N,image_dim,H*W] 270 | att_feat_1 = torch.sum(att_softmax_1 * image_reshape, dim=2) #[N, image_dim] 271 | att_feat_1_mapped = self.att_fc_1(att_feat_1) #[N, map_dim] 272 | 273 | text_mapped = self.text_fc(input_text) 274 | elmtwize_mult = att_feat_1_mapped * text_mapped #[N, map_dim] 275 | elmtwize_mult = F.normalize(elmtwize_mult, p=2, dim=1) 276 | scores = self.lc_out(elmtwize_mult) 277 | 278 | return scores -------------------------------------------------------------------------------- /tools/build_clevr_imdb.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import os 4 | 5 | import sys 6 | from Utils import text_processing 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser() 10 | 11 | parser.add_argument("--data_dir",type=str, required=True, help="directory for data ") 12 | parser.add_argument("--out_dir",type=str, required=True, help="output directory for json files") 13 | args = parser.parse_args() 14 | data_dir = args.data_dir 15 | out_dir = args.out_dir 16 | 17 | question_file = 'CLEVR_%s_questions_gt_layout.json' 18 | 19 | def build_imdb(image_set): 20 | print('building imdb %s' % image_set) 21 | question_file_name = (question_file % image_set) 22 | question_file_path = os.path.join(data_dir, question_file_name ) 23 | with open(question_file_path) as f: 24 | questions = json.load(f) 25 | imdb = [None]*len(questions) 26 | for n_q, q in enumerate(questions): 27 | if (n_q+1) % 10000 == 0: 28 | print('processing %d / %d' % (n_q+1, len(questions))) 29 | image_name = q['image_filename'].split('.')[0] 30 | feature_name = image_name + '.npy' 31 | question_str = q['question'] 32 | question_tokens = text_processing.tokenize(question_str) 33 | gt_layout_tokens = None 34 | if 'gt_layout' in q: 35 | gt_layout_tokens = q['gt_layout'] 36 | answer = None 37 | if 'answer' in q: 38 | answer = q['answer'] 39 | 40 | iminfo = dict(image_name=image_name, 41 | feature_path=feature_name, 42 | question_str=question_str, 43 | question_tokens=question_tokens, 44 | gt_layout_tokens=gt_layout_tokens, 45 | answer=answer) 46 | imdb[n_q] = iminfo 47 | return imdb 48 | 49 | 50 | imdb_trn = build_imdb('train') 51 | imdb_val = build_imdb('val') 52 | imdb_tst = build_imdb('test') 53 | 54 | os.makedirs('out_dir', exist_ok=True) 55 | 56 | out_trn = os.path.join(out_dir, 'imdb_trn.npy') 57 | out_val = os.path.join(out_dir, 'imdb_val.npy') 58 | out_tst = os.path.join(out_dir, 'imdb_tst.npy') 59 | 60 | np.save(out_trn, np.array(imdb_trn)) 61 | np.save(out_val, np.array(imdb_val)) 62 | np.save(out_tst, np.array(imdb_tst)) 63 | -------------------------------------------------------------------------------- /tools/build_vqa_imdb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import os 4 | from collections import defaultdict 5 | import sys 6 | from Utils import text_processing 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--data_dir", type=str, required=True, help="data directory") 11 | parser.add_argument("--out_dir", type=str, required=True, help="imdb output directory") 12 | args = parser.parse_args() 13 | 14 | data_dir = args.data_dir 15 | out_dir = args.out_dir 16 | 17 | ''' 18 | vocab_answer_file = './answers_vqa.txt' 19 | annotation_file = '../vqa-dataset/Annotations/mscoco_%s_annotations.json' 20 | question_file = '../vqa-dataset/Questions/OpenEnded_mscoco_%s_questions.json' 21 | gt_layout_file = './gt_layout_%s_new_parse.npy' 22 | 23 | image_dir = '../vqa-dataset/Images/%s/' 24 | feature_dir = './resnet_res5c/%s/' 25 | ''' 26 | 27 | vocab_answer_file = os.path.join(out_dir, 'answers_vqa.txt') 28 | gt_layout_file = os.path.join(out_dir, 'gt_layout_%s_new_parse.npy') 29 | 30 | annotation_file = os.path.join(data_dir, 'mscoco_%s_annotations.json') 31 | question_file = os.path.join(data_dir, 'OpenEnded_mscoco_%s_questions.json') 32 | 33 | 34 | #image_dir = '../vqa-dataset/Images/%s/' 35 | #feature_dir = 
'./resnet_res5c/%s/' 36 | 37 | 38 | 39 | answer_dict = text_processing.VocabDict(vocab_answer_file) 40 | valid_answer_set = set(answer_dict.word_list) 41 | 42 | def extract_answers(q_answers): 43 | all_answers = [answer["answer"] for answer in q_answers] 44 | valid_answers = [a for a in all_answers if a in valid_answer_set] 45 | return all_answers, valid_answers 46 | 47 | def build_imdb(image_set): 48 | print('building imdb %s' % image_set) 49 | if image_set in ['train2014', 'val2014']: 50 | load_answer = True 51 | load_gt_layout = True 52 | with open(annotation_file % image_set) as f: 53 | annotations = json.load(f)["annotations"] 54 | qid2ann_dict = {ann['question_id']: ann for ann in annotations} 55 | qid2layout_dict = np.load(gt_layout_file % image_set)[()] 56 | else: 57 | load_answer = False 58 | load_gt_layout = False 59 | with open(question_file % image_set) as f: 60 | questions = json.load(f)['questions'] 61 | coco_set_name = image_set.replace('-dev', '') 62 | #abs_image_dir = os.path.abspath(image_dir % coco_set_name) 63 | #abs_feature_dir = os.path.abspath(feature_dir % coco_set_name) 64 | image_name_template = 'COCO_' + coco_set_name + '_%012d' 65 | imdb = [None]*len(questions) 66 | 67 | unk_ans_count = 0 68 | for n_q, q in enumerate(questions): 69 | if (n_q+1) % 10000 == 0: 70 | print('processing %d / %d' % (n_q+1, len(questions))) 71 | image_id = q['image_id'] 72 | question_id = q['question_id'] 73 | image_name = image_name_template % image_id 74 | #image_path = os.path.join(abs_image_dir, image_name + '.jpg') 75 | feature_path = image_name + '.npy' 76 | #feature_path = os.path.join(abs_feature_dir, image_name + '.npy') 77 | question_str = q['question'] 78 | question_tokens = text_processing.tokenize(question_str) 79 | 80 | iminfo = dict(image_name=image_name, 81 | image_id=image_id, 82 | question_id=question_id, 83 | feature_path=feature_path, 84 | question_str=question_str, 85 | question_tokens=question_tokens) 86 | 87 | # load answers 88 | if load_answer: 89 | ann = qid2ann_dict[question_id] 90 | all_answers, valid_answers = extract_answers(ann['answers']) 91 | if len(valid_answers) == 0: 92 | valid_answers = [''] 93 | unk_ans_count += 1 94 | iminfo['all_answers'] = all_answers 95 | iminfo['valid_answers'] = valid_answers 96 | 97 | if load_gt_layout: 98 | gt_layout_tokens = qid2layout_dict[question_id] 99 | iminfo['gt_layout_tokens'] = gt_layout_tokens 100 | 101 | imdb[n_q] = iminfo 102 | print('total %d out of %d answers are ' % (unk_ans_count, len(questions))) 103 | return imdb 104 | 105 | imdb_train2014 = build_imdb('train2014') 106 | imdb_val2014 = build_imdb('val2014') 107 | imdb_test2015 = build_imdb('test2015') 108 | imdb_test_dev2015 = build_imdb('test-dev2015') 109 | 110 | 111 | imdb_dir = os.path.join(out_dir,'imdb') 112 | os.makedirs(imdb_dir, exist_ok=True) 113 | np.save(os.path.join(imdb_dir, 'imdb_train2014.npy'), np.array(imdb_train2014)) 114 | np.save(os.path.join(imdb_dir, 'imdb_val2014.npy'), np.array(imdb_val2014)) 115 | np.save(os.path.join(imdb_dir, 'imdb_trainval2014.npy'), np.array(imdb_train2014 + imdb_val2014)) 116 | np.save(os.path.join(imdb_dir, 'imdb_test2015.npy'), np.array(imdb_test2015)) 117 | np.save(os.path.join(imdb_dir, 'imdb_test-dev2015.npy'), np.array(imdb_test_dev2015)) 118 | -------------------------------------------------------------------------------- /tools/extract_visual_features_vgg_pool5.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | parser = 
argparse.ArgumentParser() 3 | parser.add_argument('--gpu_id', type=int, default=0) 4 | parser.add_argument("--data_dir",type=str, required=True) 5 | parser.add_argument("--out_dir",type=str, required=True) 6 | 7 | args = parser.parse_args() 8 | gpu_id = args.gpu_id # set GPU id to use 9 | import os; os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) 10 | import sys 11 | sys.path.append('../../') 12 | from glob import glob 13 | 14 | import skimage.io 15 | import skimage.color 16 | import numpy as np 17 | 18 | import torch 19 | import torchvision.models as models 20 | import torch.nn as nn 21 | from torch.autograd import Variable 22 | from global_variables.global_variables import use_cuda 23 | 24 | image_basedir = args.data_dir 25 | save_basedir = args.out_dir 26 | 27 | #H = 320 28 | #W = 480 29 | 30 | channel_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32) 31 | 32 | 33 | 34 | 35 | 36 | class vgg16_feature_module(nn.Module): 37 | def __init__(self, vgg16_model): 38 | super(vgg16_feature_module, self).__init__() 39 | self.feature_module = nn.Sequential(*list(list(vgg16_model.children())[0])) 40 | 41 | def forward(self, x): 42 | return self.feature_module(x) 43 | 44 | vgg16 = models.vgg16(pretrained=True) 45 | 46 | vgg16_feature = vgg16_feature_module(vgg16) 47 | vgg16_feature = vgg16_feature.cuda() if use_cuda else vgg16_feature 48 | 49 | def extract_image_pool5(impath): 50 | im = skimage.io.imread(impath)[..., :3] 51 | im_val = (im[np.newaxis, ...]- channel_mean) 52 | im_val = np.transpose(im_val,axes=(0,3,1,2)) 53 | im_val_tensor = torch.FloatTensor(im_val) 54 | im_val_variable = Variable(im_val_tensor) 55 | im_val_variable = im_val_variable.cuda() if use_cuda else im_val_variable 56 | pool5_val = vgg16_feature(im_val_variable) 57 | return pool5_val.data.cpu().numpy() 58 | 59 | def extract_dataset_pool5(image_dir, save_dir, ext_filter='*.png'): 60 | image_list = glob(image_dir + '/' + ext_filter) 61 | os.makedirs(save_dir, exist_ok=True) 62 | 63 | for n_im, impath in enumerate(image_list): 64 | if (n_im+1) % 100 == 0: 65 | print('processing %d / %d' % (n_im+1, len(image_list))) 66 | image_name = os.path.basename(impath).split('.')[0] 67 | save_path = os.path.join(save_dir, image_name + '.npy') 68 | if not os.path.exists(save_path): 69 | pool5_val = extract_image_pool5(impath) 70 | np.save(save_path, pool5_val) 71 | 72 | for image_set in ['train', 'val', 'test']: 73 | print('Extracting image set ' + image_set) 74 | extract_dataset_pool5(os.path.join(image_basedir, image_set), 75 | os.path.join(save_basedir, image_set)) 76 | print('Done.') -------------------------------------------------------------------------------- /tools/get_ground_truth_layout.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--question_dir', type=str, required=True, help="directory for questions") 8 | parser.add_argument('--out_dir', type=str, required=True) 9 | 10 | args = parser.parse_args() 11 | question_dir = args.question_dir 12 | out_dir = args.out_dir 13 | 14 | function2module = { 15 | 'filter_color': '_Filter', 16 | 'filter_material': '_Filter', 17 | 'filter_shape': '_Filter', 18 | 'filter_size': '_Filter', 19 | 20 | 'same_color': '_FindSameProperty', 21 | 'same_material': '_FindSameProperty', 22 | 'same_shape': '_FindSameProperty', 23 | 'same_size': '_FindSameProperty', 24 | 25 | 'relate': '_Transform', 26 | 'intersect': 
'_And', 27 | 'union': '_Or', 28 | 29 | 'count': '_Count', 30 | 'exist': '_Exist', 31 | 'equal_integer': '_EqualNum', 32 | 'greater_than': '_MoreNum', 33 | 'less_than': '_LessNum', 34 | 35 | 'equal_color': '_SameProperty', 36 | 'equal_material': '_SameProperty', 37 | 'equal_shape': '_SameProperty', 38 | 'equal_size': '_SameProperty', 39 | 40 | 'query_color': '_Describe', 41 | 'query_material': '_Describe', 42 | 'query_shape': '_Describe', 43 | 'query_size': '_Describe', 44 | 45 | 'scene': '_Scene', 46 | 'unique': None 47 | } 48 | 49 | 50 | def _traversal(program, i): 51 | funcs = [] 52 | for j in program[i]['inputs']: 53 | funcs += _traversal(program, j) 54 | funcs.append(program[i]['function']) 55 | return funcs 56 | 57 | 58 | prune_set = { 59 | 'equal_integer', 'greater_than', 'less_than', 'equal_color', 60 | 'equal_material', 'equal_shape', 'equal_size'} 61 | rm_set = { 62 | 'count', 'query_color', 'query_material', 'query_shape', 'query_size'} 63 | 64 | 65 | def _prune_program(program): 66 | for f in program: 67 | if f and f['function'] in prune_set: 68 | assert (len(f['inputs']) == 2) 69 | input_f_0 = program[f['inputs'][0]] 70 | input_f_1 = program[f['inputs'][1]] 71 | if input_f_0['function'] in rm_set: 72 | assert (len(input_f_0['inputs']) == 1) 73 | program[f['inputs'][0]] = None 74 | f['inputs'][0] = input_f_0['inputs'][0] 75 | if input_f_1['function'] in rm_set: 76 | assert (len(input_f_1['inputs']) == 1) 77 | program[f['inputs'][1]] = None 78 | f['inputs'][1] = input_f_1['inputs'][0] 79 | 80 | return program 81 | 82 | 83 | def linearize_program(q): 84 | program = _prune_program(q['program']) 85 | # 1. Find root: the root module has no parent 86 | is_root = np.array([f is not None for f in program]) 87 | for f in program: 88 | if f is not None: 89 | is_root[f['inputs']] = False 90 | if np.sum(is_root) != 1: 91 | assert (np.sum(is_root) >= 1) 92 | # remove the roots that are 'scene' 93 | is_not_scene = np.array([not (f and f['function'] == 'scene') for f in program]) 94 | is_root = np.logical_and(is_root, is_not_scene) 95 | assert (np.sum(is_root) == 1) 96 | 97 | root = np.argmax(is_root) 98 | 99 | # 2. Post-order traversal to obtain RPN 100 | funcs = _traversal(program, root) 101 | 102 | # 3. 
Map modules and fix exps 103 | q_modules = [function2module[f] for f in funcs] 104 | q_modules_new = q_modules[:] 105 | for n_f in range(1, len(q_modules)): 106 | # replace _Scene + _Filter with _Find 107 | if q_modules[n_f - 1] == '_Scene' and q_modules[n_f] == '_Filter': 108 | q_modules_new[n_f - 1] = None 109 | q_modules_new[n_f] = '_Find' 110 | 111 | q_modules_new = [m for m in q_modules_new if m is not None] 112 | return q_modules_new 113 | 114 | 115 | def add_gt_layout(question_file, save_file): 116 | with open(question_file) as f: 117 | questions = json.load(f)['questions'] 118 | 119 | for n_q, q in enumerate(questions): 120 | if (n_q + 1) % 1000 == 0: 121 | print('processing %d / %d' % (n_q + 1, len(questions))) 122 | if 'program' in q: 123 | q['gt_layout'] = linearize_program(q) 124 | 125 | with open(save_file, 'w') as f: 126 | json.dump(questions, f) 127 | 128 | 129 | # question_file_trn = '../clevr-dataset/questions/CLEVR_train_questions.json' 130 | # save_file_trn = './CLEVR_train_questions_gt_layout.json' 131 | 132 | question_file_trn = os.path.join(question_dir, 'CLEVR_train_questions.json') 133 | save_file_trn = os.path.join(out_dir, 'CLEVR_train_questions_gt_layout.json') 134 | 135 | add_gt_layout(question_file_trn, save_file_trn) 136 | 137 | #question_file_val = '../clevr-dataset/questions/CLEVR_val_questions.json' 138 | #save_file_val = './CLEVR_val_questions_gt_layout.json' 139 | 140 | question_file_val = os.path.join(question_dir, 'CLEVR_val_questions.json') 141 | save_file_val = os.path.join(out_dir, 'CLEVR_val_questions_gt_layout.json') 142 | 143 | add_gt_layout(question_file_val, save_file_val) 144 | 145 | #question_file_tst = '../clevr-dataset/questions/CLEVR_test_questions.json' 146 | #save_file_tst = './CLEVR_test_questions_gt_layout.json' 147 | 148 | question_file_tst = os.path.join(question_dir, 'CLEVR_test_questions.json') 149 | save_file_tst = os.path.join(out_dir,'CLEVR_test_questions_gt_layout.json') 150 | 151 | add_gt_layout(question_file_tst, save_file_tst) 152 | -------------------------------------------------------------------------------- /train_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/train_model/__init__.py -------------------------------------------------------------------------------- /train_model/from_scratch_hyperparameters.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # Module parameters 4 | H_feat = 10 5 | W_feat = 15 6 | D_feat = 512 7 | embed_dim_txt = 300 8 | embed_dim_nmn = 300 9 | lstm_dim = 512 10 | num_layers = 2 11 | encoder_dropout = False 12 | decoder_dropout = False 13 | decoder_sampling = True 14 | T_encoder = 45 15 | T_decoder = 10 16 | N = 64 17 | prune_filter_module = True 18 | 19 | # Training parameters 20 | invalid_expr_loss = np.log(28) # loss value when the layout is invalid 21 | lambda_entropy = 0.01 22 | weight_decay = 0 23 | baseline_decay = 0.99 24 | max_grad_l2_norm = 10 25 | max_iter = 120000 26 | snapshot_interval = 10000 27 | learning_rate = 0.001 28 | 29 | -------------------------------------------------------------------------------- /train_model/gt_hyperparameters.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Module parameters 4 | H_feat = 10 5 | W_feat = 15 6 | D_feat = 512 7 | embed_dim_txt = 300 8 | #embed_dim_txt = 512 9 | 
embed_dim_nmn = 300 10 | lstm_dim = 512 11 | num_layers = 2 12 | encoder_dropout = False 13 | decoder_dropout = False 14 | decoder_sampling = True 15 | T_encoder = 45 16 | T_decoder = 10 17 | N = 64 18 | prune_filter_module = True 19 | 20 | # Training parameters 21 | weight_decay = 5e-6 22 | baseline_decay = 0.99 23 | max_grad_l2_norm = 10 24 | max_iter = 80000 25 | snapshot_interval = 10000 26 | 27 | lambda_entropy = 0 28 | 29 | learning_rate = 0.001 30 | -------------------------------------------------------------------------------- /train_model/gt_rl_hyperparameters.py: -------------------------------------------------------------------------------- 1 | # Module parameters 2 | H_feat = 10 3 | W_feat = 15 4 | D_feat = 512 5 | embed_dim_txt = 300 6 | #embed_dim_txt = 512 7 | embed_dim_nmn = 300 8 | lstm_dim = 512 9 | num_layers = 2 10 | encoder_dropout = False 11 | decoder_dropout = False 12 | decoder_sampling = True 13 | T_encoder = 45 14 | T_decoder = 10 15 | N = 64 16 | prune_filter_module = True 17 | 18 | # Training parameters 19 | invalid_expr_loss = 0.5 # loss value when the layout is invalid 20 | lambda_entropy = 0.005 21 | weight_decay = 5e-6 22 | baseline_decay = 0.99 23 | max_grad_l2_norm = 10 24 | max_iter = 80000 25 | snapshot_interval = 10000 26 | 27 | learning_rate = 0.0001 -------------------------------------------------------------------------------- /train_model/input_parameters.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | import argparse 3 | import os 4 | import sys 5 | from global_variables.global_variables import * 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--gpu_id', type=int, default=0) 9 | parser.add_argument("--exp_name",type=str, default="clevr_gt_layout") 10 | parser.add_argument("--model_type",type=str, choices=[model_type_scratch, model_type_gt, model_type_gt_rl], 11 | required=True, help='models:'+ model_type_scratch + ',' +model_type_gt+', '+model_type_gt_rl) 12 | parser.add_argument("--model_path",type=str, required=False) 13 | parser.add_argument("--data_dir",type=str,default="./exp_clevr/data") 14 | parser.add_argument("--image_feat_dir",type=str,default="/Users/tinayujiang/work/clevr_dataset/data/vgg_pool5/train") 15 | parser.add_argument("--out_dir",type=str,default="./exp_clevr") 16 | args = parser.parse_args() 17 | 18 | gpu_id = args.gpu_id # set GPU id to use 19 | exp_name = args.exp_name 20 | model_type = args.model_type 21 | 22 | 23 | 24 | os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) 25 | 26 | 27 | out_dir = args.out_dir 28 | data_dir = args.data_dir 29 | image_feat_dir = args.image_feat_dir 30 | 31 | if model_type == model_type_scratch: 32 | from train_model.from_scratch_hyperparameters import * 33 | elif model_type == model_type_gt: 34 | from train_model.gt_hyperparameters import * 35 | elif model_type == model_type_gt_rl: 36 | from train_model.gt_rl_hyperparameters import * 37 | if(args.model_path == None): 38 | exit("model ",model_type_gt_rl," require a pretrained model using --model_path") 39 | model_path = args.model_path 40 | else: 41 | sys.exit("unknown model type", model_type) 42 | 43 | 44 | 45 | # Log params 46 | #log_dir = './exp_clevr/tb/%s/' % exp_name 47 | log_dir = os.path.join(out_dir, 'tb', exp_name) 48 | 49 | # Data files 50 | #vocab_question_file = './exp_clevr/data/vocabulary_clevr.txt' 51 | #vocab_layout_file = './exp_clevr/data/vocabulary_layout.txt' 52 | #vocab_answer_file = 
'./exp_clevr/data/answers_clevr.txt' 53 | 54 | vocab_question_file = os.path.join(data_dir, 'vocabulary_clevr.txt') 55 | vocab_layout_file = os.path.join(data_dir, 'vocabulary_layout.txt') 56 | vocab_answer_file = os.path.join(data_dir, 'answers_clevr.txt') 57 | 58 | 59 | #imdb_file_trn = './exp_clevr/data/imdb/imdb_trn.npy' 60 | #imdb_file_tst = './exp_clevr/data/imdb/imdb_val.npy' 61 | imdb_file_trn = os.path.join(data_dir, 'imdb/imdb_trn.npy') 62 | imdb_file_tst = os.path.join(data_dir, 'imdb/imdb_val.npy') 63 | image_feat_dir = image_feat_dir 64 | 65 | ##snapshot directory name 66 | #snapshot_dir = './exp_clevr/tfmodel/%s/' % exp_name 67 | 68 | snapshot_dir = os.path.join(out_dir,"tfmodel",exp_name) -------------------------------------------------------------------------------- /train_model/main.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import argparse 3 | import os 4 | from models.layout_assembler import Assembler 5 | from models.end2endModuleNet import * 6 | from models.custom_loss import custom_loss 7 | from global_variables.global_variables import * 8 | from Utils.data_reader import DataReader 9 | from torch import optim 10 | 11 | 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--config", type=str, required=True, help="config yaml file") 15 | parser.add_argument("--out_dir",type=str, required=True, help="output directory") 16 | args = parser.parse_args() 17 | 18 | config_file= args.config 19 | out_dir = args.out_dir 20 | 21 | with open(config_file, 'r') as f: 22 | config = yaml.load(f) 23 | 24 | torch.manual_seed(1) 25 | ##update config file with commandline arguments 26 | 27 | 28 | def prepare_train_data_set(**data_cofig): 29 | data_root_dir = data_cofig['data_root_dir'] 30 | vocab_layout_file = os.path.join(data_root_dir, data_cofig['vocab_layout_file']) 31 | assembler = Assembler(vocab_layout_file) 32 | imdb_file_trn = os.path.join(data_root_dir, 'imdb',data_cofig['imdb_file_trn']) 33 | image_feat_dir = os.path.join(data_root_dir,data_cofig['preprocess_model'],'train') 34 | vocab_question_file = os.path.join(data_root_dir,data_cofig['vocab_question_file']) 35 | vocab_answer_file = os.path.join(data_root_dir,data_cofig['vocab_answer_file']) 36 | prune_filter_module = data_cofig['prune_filter_module'] 37 | N = data_cofig['N'] 38 | T_encoder = data_cofig['T_encoder'] 39 | T_decoder = data_cofig['T_decoder'] 40 | 41 | data_reader_trn = DataReader(imdb_file_trn, image_feat_dir, shuffle=False, one_pass=True, 42 | batch_size=N, 43 | T_encoder=T_encoder, 44 | T_decoder=T_decoder, 45 | assembler=assembler, 46 | vocab_question_file=vocab_question_file, 47 | vocab_answer_file=vocab_answer_file, 48 | prune_filter_module=prune_filter_module) 49 | 50 | num_vocab_txt = data_reader_trn.batch_loader.vocab_dict.num_vocab 51 | num_vocab_nmn = len(assembler.module_names) 52 | num_choices = data_reader_trn.batch_loader.answer_dict.num_vocab 53 | 54 | return data_reader_trn, num_vocab_txt, num_choices,num_vocab_nmn, assembler 55 | 56 | 57 | def prepare_model(num_vocab_txt, num_choices, num_vocab_nmn,assembler, **model_config): 58 | if model_config['model_type'] == model_type_gt_rl: 59 | myModel = torch.load(model_config['model_path']) 60 | else: 61 | '''myModel = end2endModuleNet(num_vocab_txt=num_vocab_txt, num_vocab_nmn=num_vocab_nmn, 62 | out_num_choices=num_choices, 63 | embed_dim_nmn=embed_dim_nmn, embed_dim_txt=embed_dim_txt, 64 | image_height=H_feat, image_width=W_feat, in_image_dim=D_feat, 65 | 
hidden_size=lstm_dim, assembler=assembler, layout_criterion=criterion_layout, 66 | max_layout_len=T_decoder, 67 | answer_criterion=criterion_answer, num_layers=num_layers, decoder_dropout=0)''' 68 | 69 | criterion_layout = custom_loss(lambda_entropy= model_config['lambda_entropy']) 70 | criterion_answer = nn.CrossEntropyLoss(size_average=False, reduce=False) 71 | 72 | myModel = end2endModuleNet(num_vocab_txt=num_vocab_txt, num_vocab_nmn=num_vocab_nmn, 73 | out_num_choices=num_choices, assembler= assembler, 74 | layout_criterion=criterion_layout, answer_criterion=criterion_answer, 75 | max_layout_len=model_config['T_decoder'], **model_config) 76 | myModel = myModel.cuda() if use_cuda else myModel 77 | 78 | return myModel 79 | 80 | 81 | data_reader_trn, num_vocab_txt, num_choices, num_vocab_nmn, assembler = prepare_train_data_set(**config['data'], **config['model']) 82 | myModel = prepare_model(num_vocab_txt, num_choices, num_vocab_nmn, assembler, **config['model']) 83 | 84 | training_parameters = config['training_parameters'] 85 | myOptimizer = optim.Adam(myModel.parameters(), 86 | weight_decay=training_parameters['weight_decay'], 87 | lr=training_parameters['learning_rate']) 88 | 89 | model_type = config['model']['model_type'] 90 | avg_accuracy = 0 91 | accuracy_decay = 0.99 92 | avg_layout_accuracy = 0 93 | updated_baseline = np.log(28) 94 | max_iter = training_parameters['max_iter'] 95 | baseline_decay = training_parameters['baseline_decay'] 96 | max_grad_l2_norm = training_parameters['max_grad_l2_norm'] 97 | snapshot_interval = training_parameters['snapshot_interval'] 98 | snapshot_dir = os.path.join(config['output']['root_dir'],"tfmodel",config['output']['exp_name']) 99 | 100 | for i_iter, batch in enumerate(data_reader_trn.batches()): 101 | if i_iter >= max_iter: 102 | break 103 | 104 | _, n_sample = batch['input_seq_batch'].shape 105 | input_text_seq_lens = batch['seq_length_batch'] 106 | input_text_seqs = batch['input_seq_batch'] 107 | input_layouts = batch['gt_layout_batch'] 108 | input_images = batch['image_feat_batch'] 109 | input_answers = batch['answer_label_batch'] 110 | 111 | np.savetxt("/private/home/tinayujiang/temp/temp_out/input_text_seqs.txt",input_text_seqs) 112 | np.savetxt("/private/home/tinayujiang/temp/temp_out/input_layouts.txt", input_layouts) 113 | #np.savetxt("/private/home/tinayujiang/temp/temp_out/input_images.txt", input_images[0,:,:]) 114 | np.savetxt("/private/home/tinayujiang/temp/temp_out/input_answers.txt", input_answers) 115 | 116 | 117 | 118 | 119 | n_correct_layout = 0 120 | n_correct_answer = 0 121 | 122 | input_txt_variable = Variable(torch.LongTensor(input_text_seqs)) 123 | input_txt_variable = input_txt_variable.cuda() if use_cuda else input_txt_variable 124 | 125 | input_layout_variable = None 126 | decoder_sampling = True 127 | 128 | if model_type == model_type_gt: 129 | decoder_sampling = False 130 | input_layout_variable = Variable(torch.LongTensor(input_layouts)) 131 | input_layout_variable = input_layout_variable.cuda() if use_cuda else input_layout_variable 132 | 133 | myOptimizer.zero_grad() 134 | 135 | total_loss, avg_answer_loss, myAnswer, predicted_layouts, expr_validity_array, updated_baseline \ 136 | = myModel(input_txt_variable=input_txt_variable, input_text_seq_lens=input_text_seq_lens, 137 | input_answers=input_answers, input_images=input_images, policy_gradient_baseline=updated_baseline, 138 | baseline_decay=baseline_decay, input_layout_variable=input_layout_variable, 139 | sample_token=decoder_sampling 140 | ) 141 | 142 | if 
total_loss is not None: 143 | total_loss.backward() 144 | torch.nn.utils.clip_grad_norm(myModel.parameters(), max_grad_l2_norm) 145 | myOptimizer.step() 146 | 147 | layout_accuracy = np.mean(np.all(predicted_layouts == input_layouts, axis=0)) 148 | avg_layout_accuracy += (1 - accuracy_decay) * (layout_accuracy - avg_layout_accuracy) 149 | 150 | accuracy = np.mean(np.logical_and(expr_validity_array, myAnswer == input_answers)) 151 | avg_accuracy += (1 - accuracy_decay) * (accuracy - avg_accuracy) 152 | validity = np.mean(expr_validity_array) 153 | 154 | if (i_iter + 1) % 100 == 0: 155 | print("iter:", i_iter + 1, 156 | " cur_layout_acc:%.3f" % layout_accuracy, " avg_layout_acc:%.3f" % avg_layout_accuracy, 157 | " cur_ans_acc:%.4f" % accuracy, " avg_answer_acc:%.4f" % avg_accuracy, 158 | "total loss:%.4f" % total_loss.data.cpu().numpy()[0], 159 | "avg_answer_loss:%.4f" % avg_answer_loss.data.cpu().numpy()[0]) 160 | 161 | sys.stdout.flush() 162 | 163 | # Save snapshot 164 | if (i_iter + 1) % snapshot_interval == 0 or (i_iter + 1) == max_iter: 165 | model_snapshot_file = os.path.join(snapshot_dir, "model_%08d" % (i_iter + 1)) 166 | torch.save(myModel, model_snapshot_file) 167 | print('snapshot saved to ' + model_snapshot_file) 168 | sys.stdout.flush() 169 | -------------------------------------------------------------------------------- /train_model/main_copy.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import argparse 3 | import os 4 | from models.layout_assembler import Assembler 5 | from models.end2endModuleNet import * 6 | from models.custom_loss import custom_loss 7 | from global_variables.global_variables import * 8 | from Utils.data_reader import DataReader 9 | from torch import optim 10 | from Utils.dataSet import vqa_dataset 11 | from torch.utils.data import DataLoader 12 | 13 | 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--config", type=str, required=True, help="config yaml file") 17 | parser.add_argument("--out_dir",type=str, required=True, help="output directory") 18 | args = parser.parse_args() 19 | 20 | config_file= args.config 21 | out_dir = args.out_dir 22 | 23 | with open(config_file, 'r') as f: 24 | config = yaml.load(f) 25 | 26 | torch.manual_seed(1) 27 | 28 | ##update config file with commandline arguments 29 | 30 | 31 | def prepare_train_data_set(**data_cofig): 32 | data_root_dir = data_cofig['data_root_dir'] 33 | vocab_layout_file = os.path.join(data_root_dir, data_cofig['vocab_layout_file']) 34 | assembler = Assembler(vocab_layout_file) 35 | imdb_file_trn = os.path.join(data_root_dir, 'imdb',data_cofig['imdb_file_trn']) 36 | image_feat_dir = os.path.join(data_root_dir,data_cofig['preprocess_model'],'train') 37 | vocab_question_file = os.path.join(data_root_dir,data_cofig['vocab_question_file']) 38 | vocab_answer_file = os.path.join(data_root_dir,data_cofig['vocab_answer_file']) 39 | prune_filter_module = data_cofig['prune_filter_module'] 40 | N = data_cofig['N'] 41 | T_encoder = data_cofig['T_encoder'] 42 | T_decoder = data_cofig['T_decoder'] 43 | image_depth_first = data_cofig['image_depth_first'] 44 | 45 | vqa_train_dataset = vqa_dataset(imdb_file=imdb_file_trn, image_feat_directory=image_feat_dir, T_encoder=T_encoder, 46 | T_decoder=T_decoder, 47 | assembler=assembler, 48 | vocab_question_file=vocab_question_file, 49 | vocab_answer_file=vocab_answer_file, 50 | prune_filter_module=prune_filter_module, 51 | image_depth_first=image_depth_first) 52 | 53 | data_reader_trn = 
DataLoader(dataset=vqa_train_dataset, batch_size=N, shuffle=True) 54 | 55 | num_vocab_txt = vqa_train_dataset.vocab_dict.num_vocab 56 | num_vocab_nmn = len(assembler.module_names) 57 | num_choices = vqa_train_dataset.answer_dict.num_vocab 58 | 59 | return data_reader_trn, num_vocab_txt, num_choices,num_vocab_nmn, assembler 60 | 61 | 62 | def prepare_model(num_vocab_txt, num_choices, num_vocab_nmn,assembler, **model_config): 63 | if model_config['model_type'] == model_type_gt_rl: 64 | myModel = torch.load(model_config['model_path']) 65 | else: 66 | criterion_layout = custom_loss(lambda_entropy= model_config['lambda_entropy']) 67 | criterion_answer = nn.CrossEntropyLoss(size_average=False, reduce=False) 68 | 69 | myModel = end2endModuleNet(num_vocab_txt=num_vocab_txt, num_vocab_nmn=num_vocab_nmn, 70 | out_num_choices=num_choices, assembler= assembler, 71 | layout_criterion=criterion_layout, answer_criterion=criterion_answer, 72 | max_layout_len=model_config['T_decoder'], **model_config) 73 | myModel = myModel.cuda() if use_cuda else myModel 74 | 75 | return myModel 76 | 77 | 78 | data_reader_trn, num_vocab_txt, num_choices, num_vocab_nmn, assembler = prepare_train_data_set(**config['data'], **config['model']) 79 | myModel = prepare_model(num_vocab_txt, num_choices, num_vocab_nmn, assembler, **config['model']) 80 | 81 | training_parameters = config['training_parameters'] 82 | myOptimizer = optim.Adam(myModel.parameters(), 83 | weight_decay=training_parameters['weight_decay'], 84 | lr=training_parameters['learning_rate']) 85 | 86 | model_type = config['model']['model_type'] 87 | avg_accuracy = 0 88 | accuracy_decay = 0.99 89 | avg_layout_accuracy = 0 90 | updated_baseline = np.log(28) 91 | max_iter = training_parameters['max_iter'] 92 | baseline_decay = training_parameters['baseline_decay'] 93 | max_grad_l2_norm = training_parameters['max_grad_l2_norm'] 94 | snapshot_interval = training_parameters['snapshot_interval'] 95 | snapshot_dir = os.path.join(config['output']['root_dir'],"tfmodel",config['output']['exp_name']) 96 | os.makedirs(snapshot_dir, exist_ok=True) 97 | 98 | i_iter = 0 99 | for iepoch in range(100): 100 | print("iepoch = ", iepoch) 101 | if i_iter >= max_iter: 102 | break 103 | for i, batch in enumerate(data_reader_trn): 104 | n_sample,_ = batch['input_seq_batch'].shape 105 | input_text_seq_lens = batch['seq_length_batch'].cpu().numpy() 106 | input_text_seqs = np.transpose(batch['input_seq_batch'].cpu().numpy()) 107 | input_layouts = np.transpose(batch['gt_layout_batch'].cpu().numpy()) 108 | input_images = batch['image_feat_batch'].cpu().numpy() 109 | input_answers = batch['answer_label_batch'].cpu().numpy() 110 | 111 | n_correct_layout = 0 112 | n_correct_answer = 0 113 | 114 | input_txt_variable = Variable(torch.LongTensor(input_text_seqs)) 115 | input_txt_variable = input_txt_variable.cuda() if use_cuda else input_txt_variable 116 | 117 | input_layout_variable = None 118 | decoder_sampling = True 119 | 120 | if model_type == model_type_gt: 121 | decoder_sampling = False 122 | input_layout_variable = Variable(torch.LongTensor(input_layouts)) 123 | input_layout_variable = input_layout_variable.cuda() if use_cuda else input_layout_variable 124 | 125 | myOptimizer.zero_grad() 126 | 127 | total_loss, avg_answer_loss, myAnswer, predicted_layouts, expr_validity_array, updated_baseline \ 128 | = myModel(input_txt_variable=input_txt_variable, input_text_seq_lens=input_text_seq_lens, 129 | input_answers=input_answers, input_images=input_images, 
policy_gradient_baseline=updated_baseline, 130 | baseline_decay=baseline_decay, input_layout_variable=input_layout_variable, 131 | sample_token=decoder_sampling 132 | ) 133 | 134 | if total_loss is not None: 135 | total_loss.backward() 136 | torch.nn.utils.clip_grad_norm(myModel.parameters(), max_grad_l2_norm) 137 | myOptimizer.step() 138 | 139 | layout_accuracy = np.mean(np.all(predicted_layouts == input_layouts, axis=0)) 140 | avg_layout_accuracy += (1 - accuracy_decay) * (layout_accuracy - avg_layout_accuracy) 141 | 142 | accuracy = np.mean(np.logical_and(expr_validity_array, myAnswer == input_answers)) 143 | avg_accuracy += (1 - accuracy_decay) * (accuracy - avg_accuracy) 144 | validity = np.mean(expr_validity_array) 145 | 146 | if (i_iter + 1) % 20 == 0: 147 | print("iter:", i_iter + 1, 148 | " cur_layout_acc:%.3f" % layout_accuracy, " avg_layout_acc:%.3f" % avg_layout_accuracy, 149 | " cur_ans_acc:%.4f" % accuracy, " avg_answer_acc:%.4f" % avg_accuracy, 150 | "total loss:%.4f" % total_loss.data.cpu().numpy()[0], 151 | "avg_answer_loss:%.4f" % avg_answer_loss.data.cpu().numpy()[0]) 152 | 153 | sys.stdout.flush() 154 | 155 | # Save snapshot 156 | if (i_iter + 1) % snapshot_interval == 0 or (i_iter + 1) == max_iter: 157 | model_snapshot_file = os.path.join(snapshot_dir, "model_%08d" % (i_iter + 1)) 158 | torch.save(myModel, model_snapshot_file) 159 | print('snapshot saved to ' + model_snapshot_file) 160 | sys.stdout.flush() 161 | i_iter += 1 162 | 163 | -------------------------------------------------------------------------------- /train_model/train_clevr_gt_layout.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | from Utils.data_reader import DataReader 3 | import sys 4 | from torch import optim 5 | 6 | from models.layout_assembler import Assembler 7 | from train_model.input_parameters import * 8 | from models.end2endModuleNet import * 9 | from models.custom_loss import custom_loss 10 | from global_variables.global_variables import use_cuda 11 | 12 | 13 | 14 | 15 | ##create directory for snapshot 16 | os.makedirs(snapshot_dir, exist_ok=True) 17 | 18 | 19 | assembler = Assembler(vocab_layout_file) 20 | 21 | data_reader_trn = DataReader(imdb_file_trn,image_feat_dir, shuffle=True, one_pass=False, 22 | batch_size=N, 23 | T_encoder=T_encoder, 24 | T_decoder=T_decoder, 25 | assembler=assembler, 26 | vocab_question_file=vocab_question_file, 27 | vocab_answer_file=vocab_answer_file, 28 | prune_filter_module=prune_filter_module) 29 | 30 | num_vocab_txt = data_reader_trn.batch_loader.vocab_dict.num_vocab 31 | num_vocab_nmn = len(assembler.module_names) 32 | num_choices = data_reader_trn.batch_loader.answer_dict.num_vocab 33 | 34 | 35 | 36 | criterion_layout = custom_loss(lambda_entropy = lambda_entropy) 37 | criterion_answer = nn.CrossEntropyLoss(size_average=False,reduce=False) 38 | 39 | 40 | if model_type == model_type_gt_rl: 41 | myModel = torch.load(model_path) 42 | else: 43 | myModel = end2endModuleNet(num_vocab_txt=num_vocab_txt, num_vocab_nmn=num_vocab_nmn, out_num_choices=num_choices, 44 | embed_dim_nmn=embed_dim_nmn, embed_dim_txt=embed_dim_txt, 45 | image_height=H_feat, image_width=W_feat, in_image_dim=D_feat, 46 | hidden_size=lstm_dim, assembler=assembler, layout_criterion=criterion_layout, 47 | max_layout_len=T_decoder, 48 | answer_criterion=criterion_answer, num_layers=num_layers, decoder_dropout=0) 49 | 50 | myOptimizer = optim.Adam(myModel.parameters(), 
weight_decay=weight_decay, lr=learning_rate) 51 | 52 | 53 | avg_accuracy = 0 54 | accuracy_decay = 0.99 55 | avg_layout_accuracy = 0 56 | updated_baseline = np.log(28) 57 | 58 | for i_iter, batch in enumerate(data_reader_trn.batches()): 59 | if i_iter >= max_iter: 60 | break 61 | 62 | _, n_sample = batch['input_seq_batch'].shape 63 | input_text_seq_lens = batch['seq_length_batch'] 64 | input_text_seqs = batch['input_seq_batch'] 65 | input_layouts = batch['gt_layout_batch'] 66 | input_images = batch['image_feat_batch'] 67 | input_answers = batch['answer_label_batch'] 68 | 69 | 70 | n_correct_layout = 0 71 | n_correct_answer = 0 72 | 73 | input_txt_variable = Variable(torch.LongTensor(input_text_seqs)) 74 | input_txt_variable = input_txt_variable.cuda() if use_cuda else input_txt_variable 75 | 76 | input_layout_variable = None 77 | 78 | if model_type == model_type_gt: 79 | input_layout_variable = Variable(torch.LongTensor(input_layouts)) 80 | input_layout_variable = input_layout_variable.cuda() if use_cuda else input_layout_variable 81 | 82 | 83 | 84 | 85 | 86 | myOptimizer.zero_grad() 87 | 88 | total_loss,avg_answer_loss ,myAnswer, predicted_layouts, expr_validity_array, updated_baseline \ 89 | = myModel(input_txt_variable=input_txt_variable, input_text_seq_lens=input_text_seq_lens, 90 | input_answers=input_answers, input_images=input_images,policy_gradient_baseline=updated_baseline, 91 | baseline_decay=baseline_decay, input_layout_variable=input_layout_variable, 92 | sample_token=decoder_sampling 93 | ) 94 | 95 | if total_loss is not None: 96 | total_loss.backward() 97 | torch.nn.utils.clip_grad_norm(myModel.parameters(), max_grad_l2_norm) 98 | myOptimizer.step() 99 | 100 | layout_accuracy = np.mean(np.all(predicted_layouts == input_layouts, axis=0)) 101 | avg_layout_accuracy += (1 - accuracy_decay) * (layout_accuracy - avg_layout_accuracy) 102 | 103 | accuracy = np.mean(np.logical_and(expr_validity_array, myAnswer == input_answers)) 104 | avg_accuracy += (1 - accuracy_decay) * (accuracy - avg_accuracy) 105 | validity = np.mean(expr_validity_array) 106 | 107 | if (i_iter + 1) % 20 == 0 : 108 | print("iter:", i_iter + 1, 109 | " cur_layout_acc:%.3f"% layout_accuracy, " avg_layout_acc:%.3f"% avg_layout_accuracy, 110 | " cur_ans_acc:%.4f"% accuracy, " avg_answer_acc:%.4f"% avg_accuracy, 111 | "total loss:%.4f"%total_loss.data.cpu().numpy()[0], 112 | "avg_answer_loss:%.4f"% avg_answer_loss.data.cpu().numpy()[0]) 113 | 114 | sys.stdout.flush() 115 | 116 | # Save snapshot 117 | if (i_iter + 1) % snapshot_interval == 0 or (i_iter + 1) == max_iter: 118 | model_snapshot_file = os.path.join(snapshot_dir, "model_%08d" % (i_iter + 1)) 119 | torch.save(myModel, model_snapshot_file) 120 | print('snapshot saved to ' + model_snapshot_file ) 121 | sys.stdout.flush() 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | --------------------------------------------------------------------------------
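A minimal sketch for inspecting the imdb files, assuming they were produced by `tools/build_clevr_imdb.py` above; the load path is a placeholder for whatever `--out_dir` was used, and the record fields are simply the keys of the `iminfo` dict assembled in `build_imdb`:

```
import numpy as np

# Placeholder path: the directory passed as --out_dir to build_clevr_imdb.py.
# allow_pickle=True is required on newer NumPy releases because the imdb is an
# object array of Python dicts.
imdb = np.load('exp_clevr/data/imdb/imdb_trn.npy', allow_pickle=True)

record = imdb[0]
# Keys written by build_imdb(): image_name, feature_path, question_str,
# question_tokens, gt_layout_tokens (None when absent), answer (None when absent).
print(record['question_str'])
print(record['gt_layout_tokens'])

# Utils/dataSet.vqa_dataset peeks at imdb[0] in the same way to decide whether
# answers and ground-truth layouts are available for the split.
```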