├── .gitignore ├── README.md ├── Utils ├── __init__.py ├── dataSet.py ├── data_reader.py ├── text_processing.py └── utils.py ├── config ├── clevr_from_scratch.yaml ├── clevr_gt_layout.yaml ├── clevr_gt_rl.yaml ├── vqa_from_scratch.yaml └── vqa_gt_layout.yaml ├── data └── vqa │ ├── answers_vqa.txt │ ├── gt_layout_train2014_new_parse.npy │ ├── gt_layout_val2014_new_parse.npy │ ├── vocabulary_layout.txt │ ├── vocabulary_vqa.txt │ └── vocabulary_vqa_glove.npy ├── eval_model ├── eval_example.py ├── eval_layout_accuracy.py ├── eval_layout_learning.py └── layout_evaluator.py ├── global_variables ├── __init__.py └── global_variables.py ├── loadn2nmn_pytorch_env.sh ├── models ├── Attention2.py ├── __init__.py ├── custom_loss.py ├── end2endModuleNet.py ├── function2Module.py ├── layout_assembler.py ├── module_net.py └── modules.py ├── tools ├── build_clevr_imdb.py ├── build_vqa_imdb.py ├── extract_visual_features_vgg_pool5.py └── get_ground_truth_layout.py └── train_model ├── __init__.py ├── from_scratch_hyperparameters.py ├── gt_hyperparameters.py ├── gt_rl_hyperparameters.py ├── input_parameters.py ├── main.py ├── main_copy.py └── train_clevr_gt_layout.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | *.err 3 | *.pyc 4 | test_helper/* 5 | exp_clevr/* 6 | .idea/* 7 | */__pycache__/* 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning to Reason: End-to-End Module Networks for Visual Question Answering 2 | 3 | This repository re-implements https://github.com/ronghanghu/n2nmn in PyTorch: 4 | 5 | * R. Hu, J. Andreas, M. Rohrbach, T. Darrell, K. Saenko, *Learning to Reason: End-to-End Module Networks for Visual Question Answering*. In ICCV, 2017. ([PDF](https://arxiv.org/pdf/1704.05526.pdf)) 6 | 7 | ## Getting Started 8 | 9 | These instructions will get you a copy of the project up and running on your local machine for testing. 10 | (Note: this codebase is still under development; to run it, you still need to use part of the original code for data preprocessing.) 11 | 12 | ### Installing 13 | 14 | 1. Install Python 3 (Anaconda recommended: https://www.continuum.io/downloads). 15 | 2. Install PyTorch (http://pytorch.org/). 16 | 3. 
Load cudnn/v7.0-cuda.9.0 (optional), then clone this repository: 17 | ``` 18 | git clone git@github.com:YuJiang01/n2nmn_pytorch.git 19 | ``` 20 | 21 | 22 | ### Get preprocessed data 23 | * Follow the CLEVR download and preprocessing steps at https://github.com/ronghanghu/n2nmn#download-and-preprocess-the-data 24 | 25 | After preprocessing the data, you can train a model as in the example below. 26 | 27 | 28 | 29 | ### Training 30 | 31 | Example: 32 | ``` 33 | python train_model/train_clevr_gt_layout.py --exp_name gt_test --model_type gt_layout --data_dir /private/home/tinayujiang/n2nmn/exp_clevr/data --image_feat_dir /private/home/tinayujiang/n2nmn/exp_clevr/data/vgg_pool5/train --out_dir /private/home/tinayujiang/temp_out 34 | ``` 35 | 36 | 37 | -------------------------------------------------------------------------------- /Utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/Utils/__init__.py -------------------------------------------------------------------------------- /Utils/dataSet.py: -------------------------------------------------------------------------------- 1 | import os 2 | from torch.utils.data import Dataset 3 | from Utils import text_processing 4 | import numpy as np 5 | 6 | 7 | 8 | class vqa_dataset(Dataset): 9 | def __init__(self,imdb_file, image_feat_directory, **data_params): 10 | super(vqa_dataset,self).__init__() 11 | if imdb_file.endswith('.npy'): 12 | imdb = np.load(imdb_file) 13 | else: 14 | raise TypeError('unknown imdb format.') 15 | 16 | self.imdb = imdb 17 | self.image_feat_directory = image_feat_directory 18 | self.data_params = data_params 19 | self.image_depth_first = data_params['image_depth_first'] 20 | 21 | self.vocab_dict = text_processing.VocabDict(data_params['vocab_question_file']) 22 | self.T_encoder = data_params['T_encoder'] 23 | 24 | # peek one example to see whether answer and gt_layout are in the data 25 | self.load_answer = (('answer' in self.imdb[0]) and (self.imdb[0]['answer'] is not None)) \ 26 | or (('valid_answers' in self.imdb[0]) and (self.imdb[0]['valid_answers'] is not None)) 27 | self.load_gt_layout = ('gt_layout_tokens' in self.imdb[0]) and (self.imdb[0]['gt_layout_tokens'] is not None) 28 | if 'load_gt_layout' in data_params: 29 | self.load_gt_layout = data_params['load_gt_layout'] 30 | # the answer dict is always loaded, regardless of self.load_answer 31 | self.answer_dict = text_processing.VocabDict(data_params['vocab_answer_file']) 32 | if not self.load_answer: 33 | print('imdb does not contain answers') 34 | if self.load_gt_layout: 35 | self.T_decoder = data_params['T_decoder'] 36 | self.assembler = data_params['assembler'] 37 | self.prune_filter_module = (data_params['prune_filter_module'] 38 | if 'prune_filter_module' in data_params 39 | else False) 40 | else: 41 | print('imdb does not contain ground-truth layout') 42 | 43 | # load one feature map to peek its size 44 | image_file_name = os.path.basename(self.imdb[0]['feature_path']) 45 | image_feat_path = os.path.join(self.image_feat_directory,image_file_name) 46 | feats = np.load(image_feat_path) 47 | #self.feat_H, self.feat_W, self.feat_D = feats.shape[1:] 48 | 49 | def __len__(self): 50 | return len(self.imdb) 51 | 52 | def __getitem__(self, idx): 53 | input_seq = np.zeros((self.T_encoder),np.int32) 54 | iminfo = self.imdb[idx] 55 | question_inds = [self.vocab_dict.word2idx(w) for w in iminfo['question_tokens']] 56 | seq_length = len(question_inds) 57 | input_seq[:seq_length] = question_inds 58 | image_file_name = 
os.path.basename(self.imdb[idx]['feature_path']) 59 | image_feat_path = os.path.join(self.image_feat_directory, image_file_name) 60 | image_feat =np.squeeze(np.load(image_feat_path), axis=0) 61 | if not self.image_depth_first: 62 | image_feat = np.transpose(image_feat, axes=(2, 0, 1)) 63 | answer = None 64 | if self.load_answer: 65 | if 'answer' in iminfo: 66 | answer = iminfo['answer'] 67 | elif 'valid_answers' in iminfo: 68 | valid_answers = iminfo['valid_answers'] 69 | answer = np.random.choice(valid_answers) 70 | answer_idx = self.answer_dict.word2idx(answer) 71 | 72 | if self.load_gt_layout: 73 | gt_layout_tokens = iminfo['gt_layout_tokens'] 74 | if self.prune_filter_module: 75 | # remove duplicated consequtive modules (only keeping one _Filter) 76 | for n_t in range(len(gt_layout_tokens) - 1, 0, -1): 77 | if (gt_layout_tokens[n_t - 1] in {'_Filter', '_Find'} 78 | and gt_layout_tokens[n_t] == '_Filter'): 79 | gt_layout_tokens[n_t] = None 80 | gt_layout_tokens = [t for t in gt_layout_tokens if t] 81 | gt_layout =np.array(self.assembler.module_list2tokens( 82 | gt_layout_tokens, self.T_decoder)) 83 | 84 | sample = dict(input_seq_batch=input_seq, 85 | seq_length_batch=seq_length, 86 | image_feat_batch=image_feat) 87 | if self.load_answer: 88 | sample['answer_label_batch'] = answer_idx 89 | if self.load_gt_layout: 90 | sample['gt_layout_batch'] = gt_layout 91 | 92 | return sample -------------------------------------------------------------------------------- /Utils/data_reader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import os 4 | import sys 5 | import threading 6 | import queue 7 | import numpy as np 8 | 9 | from Utils import text_processing 10 | 11 | class BatchLoaderClevr: 12 | def __init__(self, imdb,image_feat_dir, data_params): 13 | self.imdb = imdb 14 | self.image_feat_dir = image_feat_dir 15 | self.data_params = data_params 16 | 17 | self.vocab_dict = text_processing.VocabDict(data_params['vocab_question_file']) 18 | self.T_encoder = data_params['T_encoder'] 19 | 20 | # peek one example to see whether answer and gt_layout are in the data 21 | self.load_answer = ('answer' in self.imdb[0]) and (self.imdb[0]['answer'] is not None) 22 | self.load_gt_layout = ('gt_layout_tokens' in self.imdb[0]) and (self.imdb[0]['gt_layout_tokens'] is not None) 23 | if 'load_gt_layout' in data_params: 24 | self.load_gt_layout = data_params['load_gt_layout'] 25 | # the answer dict is always loaded, regardless of self.load_answer 26 | self.answer_dict = text_processing.VocabDict(data_params['vocab_answer_file']) 27 | if not self.load_answer: 28 | print('imdb does not contain answers') 29 | if self.load_gt_layout: 30 | self.T_decoder = data_params['T_decoder'] 31 | self.assembler = data_params['assembler'] 32 | self.prune_filter_module = (data_params['prune_filter_module'] 33 | if 'prune_filter_module' in data_params 34 | else False) 35 | else: 36 | print('imdb does not contain ground-truth layout') 37 | 38 | # load one feature map to peek its size 39 | image_feat_basename = os.path.basename(self.imdb[0]['feature_path']) 40 | image_feat_name = os.path.join(self.image_feat_dir, image_feat_basename) 41 | feats = np.load(image_feat_name) 42 | #self.feat_H, self.feat_W, self.feat_D = feats.shape[1:] 43 | self.feat_D, self.feat_H, self.feat_W = feats.shape[1:] 44 | 45 | def load_one_batch(self, sample_ids): 46 | actual_batch_size = len(sample_ids) 47 | input_seq_batch = 
np.zeros((self.T_encoder, actual_batch_size), np.int32) 48 | seq_length_batch = np.zeros(actual_batch_size, np.int32) 49 | #image_feat_batch = np.zeros((actual_batch_size, self.feat_H, self.feat_W, self.feat_D), np.float32) 50 | image_feat_batch = np.zeros((actual_batch_size, self.feat_D, self.feat_H, self.feat_W), np.float32) 51 | image_path_list = [None]*actual_batch_size 52 | if self.load_answer: 53 | answer_label_batch = np.zeros(actual_batch_size, np.int32) 54 | if self.load_gt_layout: 55 | gt_layout_batch = np.zeros((self.T_decoder, actual_batch_size), np.int32) 56 | 57 | for n in range(len(sample_ids)): 58 | iminfo = self.imdb[sample_ids[n]] 59 | question_inds = [self.vocab_dict.word2idx(w) for w in iminfo['question_tokens']] 60 | seq_length = len(question_inds) 61 | input_seq_batch[:seq_length, n] = question_inds 62 | seq_length_batch[n] = seq_length 63 | image_feat_basename = os.path.basename(iminfo['feature_path']) 64 | image_feat_name = os.path.join(self.image_feat_dir, image_feat_basename) 65 | image_feat_batch[n:n+1] = np.load(image_feat_name) 66 | if self.load_answer: 67 | answer_idx = self.answer_dict.word2idx(iminfo['answer']) 68 | answer_label_batch[n] = answer_idx 69 | if self.load_gt_layout: 70 | gt_layout_tokens = iminfo['gt_layout_tokens'] 71 | if self.prune_filter_module: 72 | # remove duplicated consequtive modules (only keeping one _Filter) 73 | for n_t in range(len(gt_layout_tokens)-1, 0, -1): 74 | if (gt_layout_tokens[n_t-1] in {'_Filter', '_Find'} 75 | and gt_layout_tokens[n_t] == '_Filter'): 76 | gt_layout_tokens[n_t] = None 77 | gt_layout_tokens = [t for t in gt_layout_tokens if t] 78 | gt_layout_batch[:, n] = self.assembler.module_list2tokens( 79 | gt_layout_tokens, self.T_decoder) 80 | batch = dict(input_seq_batch=input_seq_batch, 81 | seq_length_batch=seq_length_batch, 82 | image_feat_batch=image_feat_batch, 83 | image_path_list=image_path_list) 84 | if self.load_answer: 85 | batch['answer_label_batch'] = answer_label_batch 86 | if self.load_gt_layout: 87 | batch['gt_layout_batch'] = gt_layout_batch 88 | return batch 89 | 90 | class DataReader: 91 | def __init__(self, imdb_file, image_feat_dir, shuffle=True, one_pass=False, prefetch_num=8, **kwargs): 92 | print('Loading imdb from file...', end=''); sys.stdout.flush() 93 | if imdb_file.endswith('.npy'): 94 | imdb = np.load(imdb_file) 95 | else: 96 | raise TypeError('unknown imdb format.') 97 | print('Done') 98 | self.imdb = imdb 99 | self.image_feat_dir = image_feat_dir 100 | self.shuffle = shuffle 101 | self.one_pass = one_pass 102 | self.prefetch_num = prefetch_num 103 | self.data_params = kwargs 104 | 105 | # Clevr data loader 106 | self.batch_loader = BatchLoaderClevr(self.imdb,self.image_feat_dir, self.data_params) 107 | 108 | # Start prefetching thread 109 | self.prefetch_queue = queue.Queue(maxsize=self.prefetch_num) 110 | self.prefetch_thread = threading.Thread(target=_run_prefetch, 111 | args=(self.prefetch_queue, self.batch_loader, self.imdb, 112 | self.shuffle, self.one_pass, self.data_params)) 113 | self.prefetch_thread.daemon = True 114 | self.prefetch_thread.start() 115 | 116 | def batches(self): 117 | while True: 118 | # Get a batch from the prefetching queue 119 | #if self.prefetch_queue.empty(): 120 | # print('data reader: waiting for data loading (IO is slow)...') 121 | batch = self.prefetch_queue.get(block=True) 122 | if batch is None: 123 | assert(self.one_pass) 124 | print('data reader: one pass finished') 125 | raise StopIteration() 126 | yield batch 127 | 128 | def 
_run_prefetch(prefetch_queue, batch_loader, imdb, shuffle, one_pass, data_params): 129 | num_samples = len(imdb) 130 | batch_size = data_params['batch_size'] 131 | 132 | n_sample = 0 133 | fetch_order = np.arange(num_samples) 134 | while True: 135 | # Shuffle the sample order for every epoch 136 | if n_sample == 0 and shuffle: 137 | fetch_order = np.random.permutation(num_samples) 138 | 139 | # Load batch from file 140 | # note that len(sample_ids) <= batch_size, not necessarily equal 141 | sample_ids = fetch_order[n_sample:n_sample+batch_size] 142 | batch = batch_loader.load_one_batch(sample_ids) 143 | prefetch_queue.put(batch, block=True) 144 | 145 | n_sample += len(sample_ids) 146 | if n_sample >= num_samples: 147 | # Put in a None batch to indicate a whole pass is over 148 | if one_pass: 149 | prefetch_queue.put(None, block=True) 150 | n_sample = 0 151 | -------------------------------------------------------------------------------- /Utils/text_processing.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') 4 | def tokenize(sentence): 5 | tokens = SENTENCE_SPLIT_REGEX.split(sentence.lower()) 6 | tokens = [t.strip() for t in tokens if len(t.strip()) > 0] 7 | return tokens 8 | 9 | def load_str_list(fname): 10 | with open(fname) as f: 11 | lines = f.readlines() 12 | lines = [l.strip() for l in lines] 13 | return lines 14 | 15 | class VocabDict: 16 | def __init__(self, vocab_file): 17 | self.word_list = load_str_list(vocab_file) 18 | self.word2idx_dict = {w:n_w for n_w, w in enumerate(self.word_list)} 19 | self.num_vocab = len(self.word_list) 20 | self.UNK_idx = self.word2idx_dict['<unk>'] if '<unk>' in self.word2idx_dict else None 21 | 22 | def idx2word(self, n_w): 23 | return self.word_list[n_w] 24 | 25 | def word2idx(self, w): 26 | if w in self.word2idx_dict: 27 | return self.word2idx_dict[w] 28 | elif self.UNK_idx is not None: 29 | return self.UNK_idx 30 | else: 31 | raise ValueError('word %s not in dictionary (while dictionary does not contain <unk>)' % w) 32 | 33 | def tokenize_and_index(self, sentence): 34 | inds = [self.word2idx(w) for w in tokenize(sentence)] 35 | return inds 36 | -------------------------------------------------------------------------------- /Utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def unique_columns(data): 5 | dt = np.dtype((np.void, data.dtype.itemsize * data.shape[0])) 6 | dataf = np.asfortranarray(data).view(dt) 7 | u,uind = np.unique(dataf, return_inverse=True) 8 | m = u.view(data.dtype).reshape(-1,data.shape[0]).T 9 | res = [np.where(uind==x)[0] for x in range(m.shape[1])] 10 | return res -------------------------------------------------------------------------------- /config/clevr_from_scratch.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: clevr 3 | data_root_dir : /private/home/tinayujiang/data/n2nmn_CLEVR 4 | preprocess_model : vgg_pool5 5 | vocab_question_file : vocabulary_clevr.txt 6 | vocab_layout_file : vocabulary_layout.txt 7 | vocab_answer_file : answers_clevr.txt 8 | imdb_file_trn : imdb_trn.npy 9 | image_depth_first: True 10 | 11 | output: 12 | root_dir: ~/temp/temp_out 13 | exp_name: clevr_scratch 14 | model: 15 | model_type: scratch 16 | image_height : 10 17 | image_width : 15 18 | in_image_dim : 512 19 | embed_dim_txt : 300 20 | embed_dim_nmn : 300 21 | hidden_size : 512 22 | num_layers : 2 23 | 
encoder_dropout : 0 24 | decoder_dropout : 0 25 | decoder_sampling : True 26 | T_encoder : 45 27 | T_decoder : 10 28 | N : 64 29 | lambda_entropy : 0.01 30 | prune_filter_module : True 31 | use_qpn : True 32 | qpn_dropout : True 33 | reduce_visfeat_dim : False 34 | 35 | training_parameters: 36 | weight_decay : 0 37 | baseline_decay : 0.99 38 | max_iter : 120000 39 | snapshot_interval : 10000 40 | max_grad_l2_norm: 10 41 | learning_rate : 0.001 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /config/clevr_gt_layout.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: clevr 3 | data_root_dir : /private/home/tinayujiang/data/n2nmn_CLEVR 4 | preprocess_model : vgg_pool5 5 | vocab_question_file : vocabulary_clevr.txt 6 | vocab_layout_file : vocabulary_layout.txt 7 | vocab_answer_file : answers_clevr.txt 8 | imdb_file_trn : imdb_trn.npy 9 | image_depth_first: True 10 | 11 | output: 12 | root_dir: ~/temp/temp_out 13 | exp_name: clevr_gt_layout 14 | model: 15 | model_type: gt_layout 16 | image_height : 10 17 | image_width : 15 18 | in_image_dim : 512 19 | embed_dim_txt : 300 20 | embed_dim_nmn : 300 21 | hidden_size : 512 22 | num_layers : 2 23 | encoder_dropout : 0 24 | decoder_dropout : 0 25 | decoder_sampling : True 26 | T_encoder : 45 27 | T_decoder : 10 28 | N : 64 29 | lambda_entropy : 0 30 | prune_filter_module : True 31 | use_qpn : True 32 | qpn_dropout : True 33 | reduce_visfeat_dim : False 34 | 35 | training_parameters: 36 | weight_decay : 5.0e-6 37 | baseline_decay : 0.99 38 | max_iter : 80000 39 | snapshot_interval : 10000 40 | max_grad_l2_norm: 10 41 | learning_rate : 0.001 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /config/clevr_gt_rl.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: clevr 3 | data_root_dir : /private/home/tinayujiang/data/n2nmn_CLEVR 4 | preprocess_model : vgg_pool5 5 | vocab_question_file : vocabulary_clevr.txt 6 | vocab_layout_file : vocabulary_layout.txt 7 | vocab_answer_file : answers_clevr.txt 8 | imdb_file_trn : imdb_trn.npy 9 | image_depth_first: True 10 | 11 | output: 12 | root_dir: ~/temp/temp_out 13 | exp_name: clevr_gt_rl 14 | model: 15 | model_type: gt+rl 16 | model_path: 17 | image_height : 10 18 | image_width : 15 19 | in_image_dim : 512 20 | embed_dim_txt : 300 21 | embed_dim_nmn : 300 22 | hidden_size : 512 23 | num_layers : 2 24 | encoder_dropout : 0 25 | decoder_dropout : 0 26 | decoder_sampling : True 27 | T_encoder : 45 28 | T_decoder : 10 29 | N : 64 30 | lambda_entropy : 0.005 31 | prune_filter_module : True 32 | use_qpn : True 33 | qpn_dropout : True 34 | reduce_visfeat_dim : False 35 | 36 | training_parameters: 37 | weight_decay : 5.0e-6 38 | baseline_decay : 0.99 39 | max_iter : 80000 40 | snapshot_interval : 10000 41 | max_grad_l2_norm: 10 42 | learning_rate : 0.0001 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /config/vqa_from_scratch.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: vqa 3 | data_root_dir : /private/home/tinayujiang/data/n2nmn_vqa 4 | preprocess_model : resnet_res5c 5 | vocab_question_file : vocabulary_vqa.txt 6 | vocab_layout_file : vocabulary_layout.txt 7 | vocab_answer_file : answers_vqa.txt 8 | imdb_file_trn : imdb_trn.npy 9 | image_depth_first: False 
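# image_depth_first indicates whether the saved .npy feature maps are already
# channel-first (D x H x W). Utils/dataSet.py reads this flag and, when it is
# False (as for these resnet_res5c features), transposes the loaded H x W x D
# array to channel-first before returning it.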
10 | output: 11 | root_dir: ~/temp/temp_out 12 | exp_name: vqa_scratch 13 | model: 14 | model_type: scratch 15 | image_height : 14 16 | image_width : 14 17 | in_image_dim : 2048 18 | embed_dim_txt : 300 19 | embed_dim_nmn : 300 20 | hidden_size : 1000 21 | num_layers : 2 22 | encoder_dropout : 0.1 23 | decoder_dropout : 0.1 24 | decoder_sampling : True 25 | T_encoder : 26 26 | T_decoder : 13 27 | N : 64 28 | lambda_entropy : 0.01 29 | prune_filter_module : True 30 | use_qpn : True 31 | qpn_dropout : True 32 | reduce_visfeat_dim : False 33 | 34 | training_parameters: 35 | weight_decay : 0 36 | baseline_decay : 0.99 37 | max_iter : 120000 38 | snapshot_interval : 10000 39 | max_grad_l2_norm: 10 40 | learning_rate : 0.001 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /config/vqa_gt_layout.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | dataset: vqa 3 | data_root_dir : /private/home/tinayujiang/data/n2nmn_vqa 4 | preprocess_model : resnet_res5c 5 | vocab_question_file : vocabulary_vqa.txt 6 | vocab_layout_file : vocabulary_layout.txt 7 | vocab_answer_file : answers_vqa.txt 8 | imdb_file_trn : imdb_trn.npy 9 | image_depth_first: False 10 | output: 11 | root_dir: ~/temp/temp_out 12 | exp_name: vga_gt_layout 13 | model: 14 | model_type: gt_layout 15 | image_height : 14 16 | image_width : 14 17 | in_image_dim : 2048 18 | embed_dim_txt : 300 19 | embed_dim_nmn : 300 20 | hidden_size : 1000 21 | num_layers : 2 22 | encoder_dropout : 0.1 23 | decoder_dropout : 0.1 24 | decoder_sampling : False 25 | T_encoder : 26 26 | T_decoder : 13 27 | N : 64 28 | lambda_entropy : 0 29 | prune_filter_module : True 30 | use_qpn : True 31 | qpn_dropout : True 32 | reduce_visfeat_dim : False 33 | 34 | training_parameters: 35 | weight_decay : 5.0e-6 36 | baseline_decay : 0.99 37 | max_iter : 80000 38 | snapshot_interval : 10000 39 | max_grad_l2_norm: 10 40 | learning_rate : 0.001 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /data/vqa/answers_vqa.txt: -------------------------------------------------------------------------------- 1 | 2 | yes 3 | no 4 | 2 5 | 1 6 | white 7 | 3 8 | red 9 | blue 10 | 4 11 | green 12 | black 13 | yellow 14 | brown 15 | 5 16 | tennis 17 | 6 18 | baseball 19 | orange 20 | 0 21 | bathroom 22 | right 23 | left 24 | wood 25 | gray 26 | frisbee 27 | pink 28 | pizza 29 | 7 30 | kitchen 31 | 8 32 | cat 33 | skiing 34 | black and white 35 | dog 36 | skateboarding 37 | snow 38 | skateboard 39 | surfing 40 | water 41 | grass 42 | giraffe 43 | surfboard 44 | 10 45 | wii 46 | kite 47 | man 48 | broccoli 49 | purple 50 | winter 51 | elephant 52 | stop 53 | train 54 | horse 55 | 9 56 | umbrella 57 | apple 58 | silver 59 | banana 60 | sheep 61 | eating 62 | phone 63 | bear 64 | motorcycle 65 | 12 66 | cake 67 | soccer 68 | beach 69 | tan 70 | wine 71 | zebra 72 | brick 73 | sunny 74 | table 75 | woman 76 | laptop 77 | bench 78 | bananas 79 | female 80 | food 81 | hat 82 | bus 83 | male 84 | flowers 85 | living room 86 | cow 87 | maybe 88 | outside 89 | cell phone 90 | hot dog 91 | bird 92 | helmet 93 | kites 94 | night 95 | snowboarding 96 | 11 97 | down 98 | trees 99 | camera 100 | red and white 101 | bed 102 | bedroom 103 | nothing 104 | unknown 105 | christmas 106 | fork 107 | tennis racket 108 | metal 109 | tree 110 | blonde 111 | up 112 | glasses 113 | 20 114 | fence 115 | 15 116 | beer 117 | tile 118 | nike 119 | boat 
120 | bat 121 | 13 122 | airport 123 | cloudy 124 | blue and white 125 | glass 126 | day 127 | sitting 128 | open 129 | teddy bear 130 | plane 131 | car 132 | clear 133 | standing 134 | suitcase 135 | chocolate 136 | many 137 | donut 138 | sandwich 139 | bike 140 | zoo 141 | sand 142 | cows 143 | beige 144 | girl 145 | ball 146 | birthday 147 | palm 148 | ocean 149 | airplane 150 | stripes 151 | chair 152 | horses 153 | toilet 154 | carrots 155 | old 156 | knife 157 | coffee 158 | fall 159 | cheese 160 | chinese 161 | round 162 | 14 163 | tie 164 | lot 165 | skis 166 | snowboard 167 | scissors 168 | donuts 169 | boy 170 | mountains 171 | walking 172 | fruit 173 | sleeping 174 | truck 175 | wall 176 | paper 177 | breakfast 178 | wedding 179 | not sure 180 | clock 181 | sunglasses 182 | mirror 183 | fire hydrant 184 | cold 185 | asian 186 | square 187 | street 188 | bicycle 189 | toothbrush 190 | on table 191 | elephants 192 | wetsuit 193 | dirt 194 | plastic 195 | plaid 196 | plate 197 | 16 198 | none 199 | usa 200 | spoon 201 | 25 202 | ground 203 | tv 204 | gold 205 | daytime 206 | luggage 207 | cooking 208 | happy 209 | backpack 210 | carrot 211 | summer 212 | africa 213 | window 214 | chicken 215 | racket 216 | people 217 | computer 218 | stone 219 | on 220 | mountain 221 | no one 222 | zebras 223 | sun 224 | circle 225 | afternoon 226 | graffiti 227 | person 228 | 30 229 | hay 230 | brown and white 231 | leaves 232 | evening 233 | park 234 | fish 235 | city 236 | 50 237 | overcast 238 | oranges 239 | bread 240 | jeans 241 | restaurant 242 | playing 243 | sidewalk 244 | closed 245 | couch 246 | giraffes 247 | birds 248 | ski poles 249 | watch 250 | refrigerator 251 | field 252 | motorcycles 253 | child 254 | flying kite 255 | morning 256 | microwave 257 | grazing 258 | remote 259 | towel 260 | striped 261 | flower 262 | 18 263 | rocks 264 | church 265 | building 266 | bricks 267 | inside 268 | milk 269 | floor 270 | books 271 | light 272 | bag 273 | umbrellas 274 | background 275 | pepperoni 276 | catcher 277 | apples 278 | london 279 | concrete 280 | small 281 | rock 282 | vase 283 | road 284 | playing wii 285 | american 286 | 100 287 | surfboards 288 | brushing teeth 289 | bridge 290 | desk 291 | rainy 292 | leather 293 | police 294 | clouds 295 | white and black 296 | very 297 | gas 298 | picture 299 | keyboard 300 | sink 301 | adidas 302 | new york 303 | bowl 304 | 17 305 | coca cola 306 | dell 307 | pine 308 | hot dogs 309 | tomato 310 | heart 311 | middle 312 | spring 313 | baby 314 | vegetables 315 | sandals 316 | pitcher 317 | floral 318 | ketchup 319 | umpire 320 | rain 321 | book 322 | outdoors 323 | box 324 | mouse 325 | lights 326 | drinking 327 | sky 328 | china 329 | shadow 330 | england 331 | hydrant 332 | skating 333 | oven 334 | reading 335 | 24 336 | both 337 | ski 338 | home 339 | jumping 340 | in air 341 | front 342 | playing tennis 343 | rose 344 | wii remote 345 | checkered 346 | baseball bat 347 | on wall 348 | hotel 349 | tennis ball 350 | house 351 | river 352 | double decker 353 | rectangle 354 | bears 355 | dessert 356 | white and blue 357 | 40 358 | talking on phone 359 | wilson 360 | in water 361 | red and black 362 | noon 363 | roses 364 | office 365 | carpet 366 | salad 367 | blanket 368 | canada 369 | triangle 370 | guitar 371 | pole 372 | electric 373 | 23 374 | resting 375 | real 376 | no idea 377 | train station 378 | top 379 | safety 380 | long 381 | sunset 382 | lettuce 383 | taking picture 384 | on plate 385 | red and yellow 386 | young 387 | stop 
sign 388 | counter 389 | hot 390 | star 391 | green and white 392 | lemon 393 | tracks 394 | tennis court 395 | on ground 396 | parking meter 397 | rope 398 | white and red 399 | purse 400 | on right 401 | on left 402 | fridge 403 | photographer 404 | sign 405 | lake 406 | 19 407 | 21 408 | scarf 409 | rug 410 | soup 411 | dinner 412 | air 413 | goggles 414 | english 415 | shoes 416 | short 417 | windows 418 | seagull 419 | pepsi 420 | toy 421 | wire 422 | cars 423 | ponytail 424 | 22 425 | running 426 | back 427 | olives 428 | poles 429 | lots 430 | plant 431 | indoors 432 | away 433 | delta 434 | gloves 435 | talking 436 | doughnut 437 | clothes 438 | collar 439 | basket 440 | off 441 | helmets 442 | glove 443 | dusk 444 | flying 445 | tomatoes 446 | raining 447 | steel 448 | cup 449 | unsure 450 | flag 451 | branch 452 | blue and yellow 453 | taking off 454 | tennis racquet 455 | cement 456 | hand 457 | painting 458 | oval 459 | rainbow 460 | japanese 461 | wii controller 462 | stove 463 | shorts 464 | forward 465 | bottle 466 | parking 467 | octagon 468 | beef 469 | spinach 470 | duck 471 | laying down 472 | stainless steel 473 | one way 474 | boots 475 | coke 476 | large 477 | smoke 478 | cream 479 | lunch 480 | passenger 481 | mustard 482 | 28 483 | steam 484 | bikes 485 | head 486 | skate park 487 | blender 488 | fan 489 | pillow 490 | yellow and black 491 | strawberry 492 | turkey 493 | dogs 494 | calm 495 | united states 496 | cross 497 | shirt 498 | japan 499 | landing 500 | strawberries 501 | texting 502 | straw 503 | surf 504 | orange and white 505 | meat 506 | toilet paper 507 | suit 508 | red and blue 509 | orange juice 510 | clean 511 | tea 512 | wind 513 | urban 514 | boats 515 | fake 516 | doughnuts 517 | warm 518 | on bed 519 | polar bear 520 | headphones 521 | fire 522 | polar 523 | w 524 | tower 525 | trash can 526 | america 527 | maroon 528 | shade 529 | samsung 530 | school 531 | flip flops 532 | blinds 533 | lamp 534 | flying kites 535 | decoration 536 | bottom 537 | nighttime 538 | dirty 539 | wooden 540 | cutting board 541 | to right 542 | ford 543 | big 544 | clay 545 | 35 546 | electricity 547 | sneakers 548 | honda 549 | wild 550 | swimming 551 | pictures 552 | p 553 | east 554 | to left 555 | marble 556 | one on right 557 | nobody 558 | teal 559 | football 560 | jet 561 | asia 562 | ice cream 563 | tabby 564 | rice 565 | yellow and blue 566 | television 567 | kia 568 | trash 569 | ducks 570 | ham 571 | oak 572 | jacket 573 | subway 574 | behind 575 | india 576 | go 577 | necklace 578 | peppers 579 | living 580 | good 581 | stuffed animal 582 | posing 583 | riding 584 | nowhere 585 | skateboards 586 | bicycles 587 | soda 588 | wet 589 | parrot 590 | on desk 591 | watching 592 | shelf 593 | desert 594 | newspaper 595 | south 596 | downhill 597 | plants 598 | ring 599 | goat 600 | board 601 | polo 602 | yellow and red 603 | french 604 | headband 605 | men 606 | candles 607 | italian 608 | north 609 | water skiing 610 | granite 611 | door 612 | playing frisbee 613 | bull 614 | bacon 615 | fries 616 | fishing 617 | bow 618 | children 619 | pen 620 | chain link 621 | cutting 622 | 60 623 | cap 624 | bmw 625 | german 626 | diamond 627 | lab 628 | above 629 | grapes 630 | several 631 | possibly 632 | one on left 633 | gray and white 634 | hardwood 635 | dress 636 | tulips 637 | store 638 | market 639 | onions 640 | stick 641 | white and brown 642 | candle 643 | moving 644 | indian 645 | racquet 646 | spanish 647 | cowboy 648 | west 649 | van 650 | tall 651 | military 
652 | chopsticks 653 | paint 654 | 27 655 | protection 656 | smiling 657 | starbucks 658 | parking lot 659 | net 660 | waves 661 | lighthouse 662 | african 663 | cart 664 | on floor 665 | women 666 | cutting cake 667 | carriage 668 | b 669 | face 670 | reflection 671 | color 672 | on sidewalk 673 | toyota 674 | public 675 | black white 676 | 32 677 | bracelet 678 | fire truck 679 | sony 680 | on street 681 | cigarette 682 | fork and knife 683 | forest 684 | no parking 685 | wave 686 | seagulls 687 | palm trees 688 | center 689 | dining room 690 | straight 691 | scooter 692 | 26 693 | napkin 694 | green and yellow 695 | terrier 696 | pug 697 | sugar 698 | blue and red 699 | cats 700 | statue 701 | batter 702 | ceramic 703 | skateboarder 704 | towels 705 | fedex 706 | united 707 | full 708 | half 709 | italy 710 | t 711 | 38 712 | 55 713 | basketball 714 | roman 715 | c 716 | navy 717 | wool 718 | low 719 | california 720 | adult 721 | hair 722 | parade 723 | phones 724 | motorbike 725 | big ben 726 | catching 727 | juice 728 | ramp 729 | on counter 730 | lady 731 | bandana 732 | black and red 733 | sausage 734 | hit ball 735 | corn 736 | suitcases 737 | chef 738 | at camera 739 | pineapple 740 | calico 741 | 2 feet 742 | bun 743 | squares 744 | siamese 745 | potatoes 746 | chain 747 | 34 748 | collie 749 | outdoor 750 | army 751 | monkey 752 | owl 753 | light blue 754 | sunlight 755 | teddy bears 756 | dock 757 | playing game 758 | chicago 759 | wine glass 760 | cone 761 | out 762 | station 763 | bar 764 | bakery 765 | on beach 766 | french fries 767 | yellow and white 768 | pigeon 769 | baseball field 770 | bamboo 771 | soccer ball 772 | tag 773 | in background 774 | game 775 | right side 776 | visor 777 | on grass 778 | blue and green 779 | pie 780 | video game 781 | balance 782 | pickle 783 | high 784 | remote control 785 | pepper 786 | us 787 | pelican 788 | shoe 789 | bell 790 | daisy 791 | eagle 792 | a 793 | looking 794 | commercial 795 | waiting 796 | batting 797 | hello kitty 798 | tiles 799 | parsley 800 | 33 801 | serving 802 | 45 803 | nintendo wii 804 | laying 805 | bulldog 806 | empty 807 | n 808 | surfer 809 | dark 810 | parasailing 811 | nokia 812 | dunkin donuts 813 | tray 814 | 3 feet 815 | harley 816 | pasta 817 | red sox 818 | brush 819 | ski lift 820 | drink 821 | cloth 822 | blue and black 823 | red and green 824 | butterfly 825 | vegetable 826 | buildings 827 | smile 828 | poodle 829 | pan 830 | hats 831 | playing baseball 832 | night time 833 | log 834 | stairs 835 | 2012 836 | france 837 | rail 838 | steak 839 | library 840 | krispy kreme 841 | sailboat 842 | peach 843 | basil 844 | cotton 845 | black and yellow 846 | fast 847 | horns 848 | pink and white 849 | sweater 850 | giants 851 | yellow and green 852 | mac 853 | pot 854 | eggs 855 | sprinkles 856 | mercedes 857 | peace 858 | controller 859 | hospital 860 | kid 861 | left side 862 | jump 863 | shower 864 | tiger 865 | art 866 | snowy 867 | work 868 | halloween 869 | softball 870 | fireplace 871 | modern 872 | pork 873 | cookies 874 | sauce 875 | kite flying 876 | cell phones 877 | twin 878 | vertical 879 | farm 880 | crane 881 | microphone 882 | rural 883 | slow 884 | german shepherd 885 | turquoise 886 | white and green 887 | 200 888 | kids 889 | vanilla 890 | palm tree 891 | mask 892 | clock tower 893 | broken 894 | bucket 895 | 29 896 | swan 897 | sad 898 | cherry 899 | tennis shoes 900 | sofa 901 | pig 902 | left one 903 | bus stop 904 | in front 905 | ribbon 906 | little 907 | ice 908 | stars 909 | in 
field 910 | stripe 911 | garbage 912 | on tracks 913 | piano 914 | onion 915 | cleaning 916 | maple 917 | butter 918 | fruits 919 | woods 920 | volleyball 921 | watermelon 922 | magnets 923 | right one 924 | lion 925 | crosswalk 926 | tattoo 927 | bowling 928 | daisies 929 | backwards 930 | 2013 931 | cook 932 | working 933 | curtain 934 | cross country 935 | harley davidson 936 | hair dryer 937 | windsurfing 938 | on road 939 | washington 940 | camouflage 941 | candy 942 | deer 943 | roman numerals 944 | tennis rackets 945 | 2010 946 | yamaha 947 | museum 948 | egg 949 | trunk 950 | mushrooms 951 | mozzarella 952 | map 953 | vases 954 | barn 955 | cardboard 956 | serve 957 | white and gray 958 | string 959 | glazed 960 | laptops 961 | chandelier 962 | country 963 | 75 964 | 44 965 | beard 966 | cargo 967 | vest 968 | wicker 969 | parked 970 | soap 971 | pigeons 972 | 10 feet 973 | school bus 974 | m 975 | burgundy 976 | driving 977 | golden retriever 978 | bikini 979 | ladder 980 | germany 981 | iron 982 | apron 983 | typing 984 | relaxing 985 | san francisco 986 | bags 987 | on building 988 | stuffed animals 989 | party 990 | goats 991 | selfie 992 | caution 993 | shrimp 994 | 2011 995 | dry 996 | pastry 997 | ceiling 998 | human 999 | panda 1000 | on bench 1001 | foreground 1002 | 2009 1003 | cupcake 1004 | pots 1005 | runway 1006 | neither 1007 | colgate 1008 | europe 1009 | circles 1010 | gravel 1011 | cheesecake 1012 | red white and blue 1013 | paddle 1014 | foil 1015 | obama 1016 | skier 1017 | meter 1018 | kitten 1019 | railing 1020 | business 1021 | sandwiches 1022 | toys 1023 | 31 1024 | pier 1025 | in bowl 1026 | tail 1027 | racing 1028 | asphalt 1029 | not very 1030 | light brown 1031 | walk 1032 | traffic 1033 | restroom 1034 | man on right 1035 | 42 1036 | taxi 1037 | air force 1038 | first 1039 | skull 1040 | grill 1041 | medium 1042 | toothbrushes 1043 | garage 1044 | in sky 1045 | dodgers 1046 | 36 1047 | pickles 1048 | texas 1049 | family 1050 | playing video game 1051 | nintendo 1052 | british airways 1053 | camo 1054 | suv 1055 | throwing 1056 | beans 1057 | sleep 1058 | tape 1059 | hp 1060 | cucumber 1061 | clothing 1062 | coat 1063 | noodles 1064 | swinging 1065 | toothpaste 1066 | wheat 1067 | watching tv 1068 | bowtie 1069 | leash 1070 | harness 1071 | iphone 1072 | dragon 1073 | yankees 1074 | tired 1075 | water bottle 1076 | boxes 1077 | na 1078 | wagon 1079 | x 1080 | fun 1081 | man on left 1082 | can 1083 | australia 1084 | 70 1085 | pillows 1086 | pitbull 1087 | hill 1088 | dalmatian 1089 | sweatband 1090 | wristband 1091 | polka dots 1092 | orioles 1093 | britain 1094 | autumn 1095 | leaf 1096 | new 1097 | beagle 1098 | geese 1099 | polka dot 1100 | bow tie 1101 | boston 1102 | green and blue 1103 | bank of america 1104 | bush 1105 | day time 1106 | cleats 1107 | sparrow 1108 | lamb 1109 | brown and black 1110 | ship 1111 | aluminum 1112 | 500 1113 | barbed wire 1114 | rack 1115 | uk 1116 | catching frisbee 1117 | do not enter 1118 | chairs 1119 | animals 1120 | track 1121 | y 1122 | cupcakes 1123 | dirt bike 1124 | cumulus 1125 | pants 1126 | ski pole 1127 | seat 1128 | partly cloudy 1129 | hearts 1130 | nice 1131 | plates 1132 | pilot 1133 | apartment 1134 | girls 1135 | all 1136 | uphill 1137 | moss 1138 | fedora 1139 | on shelf 1140 | volkswagen 1141 | 1000 1142 | messy 1143 | s 1144 | rust 1145 | bagel 1146 | chips 1147 | going 1148 | caucasian 1149 | in grass 1150 | blurry 1151 | in car 1152 | pavement 1153 | ram 1154 | circular 1155 | leaving 1156 | 
sail 1157 | owner 1158 | hills 1159 | in back 1160 | below 1161 | jeep 1162 | mustache 1163 | rectangular 1164 | flags 1165 | cereal 1166 | mutt 1167 | evergreen 1168 | shadows 1169 | cabinet 1170 | green and red 1171 | setting 1172 | foggy 1173 | labrador 1174 | mexico 1175 | southwest 1176 | sunflower 1177 | brushing 1178 | pastries 1179 | desktop 1180 | normal 1181 | smoking 1182 | stand 1183 | love 1184 | hands 1185 | wires 1186 | tent 1187 | wine glasses 1188 | curly 1189 | post 1190 | black and gray 1191 | cones 1192 | logs 1193 | behind fence 1194 | tusks 1195 | american flag 1196 | produce 1197 | salmon 1198 | chase 1199 | e 1200 | horizontal 1201 | animal 1202 | eat 1203 | bathtub 1204 | corner 1205 | multi 1206 | british 1207 | tank top 1208 | engine 1209 | screen 1210 | orange and black 1211 | flip 1212 | kickstand 1213 | idk 1214 | natural 1215 | right hand 1216 | frog 1217 | trailer 1218 | grizzly 1219 | on tower 1220 | 37 1221 | game controller 1222 | finch 1223 | bathing 1224 | 5 feet 1225 | coming 1226 | white and orange 1227 | stones 1228 | ties 1229 | weeds 1230 | parachute 1231 | asparagus 1232 | traveling 1233 | bushes 1234 | tablecloth 1235 | 41 1236 | transportation 1237 | peacock 1238 | older 1239 | sweet 1240 | midday 1241 | canoe 1242 | probably 1243 | chevrolet 1244 | thin 1245 | windy 1246 | platform 1247 | lying down 1248 | wetsuits 1249 | 43 1250 | knife and fork 1251 | surfboarding 1252 | cattle 1253 | cubs 1254 | nose 1255 | traffic light 1256 | green and black 1257 | pool 1258 | downtown 1259 | 1 foot 1260 | model 1261 | falling 1262 | mom 1263 | passengers 1264 | brushing his teeth 1265 | dining 1266 | porcelain 1267 | propeller 1268 | santa 1269 | stuffed bear 1270 | pizza cutter 1271 | happiness 1272 | lg 1273 | yield 1274 | dishwasher 1275 | rabbit 1276 | exit 1277 | pitching 1278 | snowboards 1279 | cut 1280 | cauliflower 1281 | emirates 1282 | queen 1283 | curtains 1284 | guy 1285 | ipod 1286 | 6 feet 1287 | 4 feet 1288 | sunflowers 1289 | white and yellow 1290 | telephone 1291 | ostrich 1292 | power lines 1293 | shallow 1294 | mud 1295 | private 1296 | champagne 1297 | hamburger 1298 | cactus 1299 | baseball cap 1300 | cookie 1301 | icing 1302 | crow 1303 | planes 1304 | budweiser 1305 | construction 1306 | stool 1307 | time 1308 | potato 1309 | oil 1310 | hungry 1311 | tissue 1312 | arabic 1313 | smoothie 1314 | squash 1315 | 150 1316 | moped 1317 | lavender 1318 | eyes 1319 | pancakes 1320 | cane 1321 | backhand 1322 | tennis player 1323 | grapefruit 1324 | lime 1325 | warning 1326 | computers 1327 | taking photo 1328 | bib 1329 | tractor 1330 | soft 1331 | checkerboard 1332 | dancing 1333 | skatepark 1334 | boogie board 1335 | top left 1336 | upside down 1337 | trains 1338 | waving 1339 | happy birthday 1340 | steps 1341 | 80 1342 | in middle 1343 | mexican 1344 | black and blue 1345 | spatula 1346 | on couch 1347 | red bull 1348 | hundreds 1349 | aqua 1350 | flip phone 1351 | penguin 1352 | black bear 1353 | sushi 1354 | buses 1355 | fair 1356 | amtrak 1357 | mug 1358 | arm 1359 | parasail 1360 | khaki 1361 | fur 1362 | ana 1363 | towards 1364 | drinking water 1365 | 47 1366 | roof 1367 | frosting 1368 | crown 1369 | friends 1370 | baseball glove 1371 | travel 1372 | feet 1373 | doubles 1374 | skate 1375 | stopped 1376 | lufthansa 1377 | virgin 1378 | florida 1379 | fighting 1380 | wheelchair 1381 | vans 1382 | toaster oven 1383 | fell 1384 | very tall 1385 | teddy 1386 | r 1387 | alaska 1388 | chihuahua 1389 | hawk 1390 | 51 1391 | dark brown 
1392 | tags 1393 | blue and gray 1394 | home plate 1395 | to eat 1396 | gothic 1397 | foot 1398 | broadway 1399 | husky 1400 | motorola 1401 | land 1402 | court 1403 | 48 1404 | hummingbird 1405 | black and brown 1406 | smartphone 1407 | neck 1408 | taking pictures 1409 | indoor 1410 | on pole 1411 | hazy 1412 | skirt 1413 | o 1414 | double 1415 | lily 1416 | suzuki 1417 | red white 1418 | cabbage 1419 | talking on cell phone 1420 | korean 1421 | microsoft 1422 | goose 1423 | photo 1424 | wiimote 1425 | curb 1426 | denim 1427 | top right 1428 | beanie 1429 | rowing 1430 | nuts 1431 | on top 1432 | free 1433 | paris 1434 | styrofoam 1435 | converse 1436 | storage 1437 | 46 1438 | g 1439 | 2007 1440 | hotel room 1441 | dots 1442 | cross country skiing 1443 | pm 1444 | 52 1445 | kiwi 1446 | 350 1447 | l 1448 | burger 1449 | monitor 1450 | picnic 1451 | handle 1452 | speaker 1453 | goalie 1454 | bottles 1455 | power 1456 | referee 1457 | veggie 1458 | on chair 1459 | berries 1460 | friend 1461 | tire 1462 | washington dc 1463 | 300 1464 | side 1465 | safari 1466 | yarn 1467 | snowing 1468 | sub 1469 | words 1470 | mat 1471 | dove 1472 | feeding 1473 | bath 1474 | quilt 1475 | lanyard 1476 | doll 1477 | red white blue 1478 | coal 1479 | picnic table 1480 | shark 1481 | swing 1482 | dead 1483 | mushroom 1484 | pond 1485 | cardinals 1486 | fern 1487 | olive 1488 | antique 1489 | tropical 1490 | houses 1491 | fresh 1492 | 2000 1493 | african american 1494 | in 1495 | red and gray 1496 | 39 1497 | on track 1498 | salt 1499 | tub 1500 | toaster 1501 | ivory 1502 | pipe 1503 | overalls 1504 | music 1505 | lime green 1506 | veggies 1507 | avocado 1508 | 66 1509 | at beach 1510 | for fun 1511 | stickers 1512 | tissues 1513 | celery 1514 | canopy 1515 | baseball game 1516 | end 1517 | bud light 1518 | street sign 1519 | christmas tree 1520 | cooked 1521 | radiator 1522 | gate 1523 | golf 1524 | bad 1525 | curved 1526 | lexus 1527 | nissan 1528 | fried 1529 | window sill 1530 | mickey mouse 1531 | helicopter 1532 | on water 1533 | mohawk 1534 | google 1535 | balloon 1536 | signs 1537 | gray and black 1538 | clydesdale 1539 | balloons 1540 | octopus 1541 | dachshund 1542 | rooster 1543 | stroller 1544 | orange and yellow 1545 | tongue 1546 | gun 1547 | smiley face 1548 | fabric 1549 | knee pads 1550 | 20 feet 1551 | top hat 1552 | pumpkin 1553 | fly kite 1554 | f 1555 | captivity 1556 | brace 1557 | belt 1558 | multicolored 1559 | mitt 1560 | dark blue 1561 | not clear 1562 | spaghetti 1563 | formal 1564 | boys 1565 | band 1566 | cobblestone 1567 | rough 1568 | dresser 1569 | socks 1570 | frisbees 1571 | linoleum 1572 | leopard 1573 | toward 1574 | tulip 1575 | foam 1576 | cool 1577 | peas 1578 | tarmac 1579 | puppy 1580 | necktie 1581 | cowboy hat 1582 | thumbs up 1583 | alcohol 1584 | ollie 1585 | hitting ball 1586 | pasture 1587 | deep 1588 | drawing 1589 | utensils 1590 | tablet 1591 | lilies 1592 | heineken 1593 | knives 1594 | kayak 1595 | race 1596 | sliced 1597 | hit 1598 | mets 1599 | pacifier 1600 | grilled 1601 | 49 1602 | cups 1603 | copper 1604 | train tracks 1605 | in street 1606 | hugging 1607 | lemons 1608 | police officer 1609 | writing 1610 | motion 1611 | off white 1612 | blood 1613 | toothpick 1614 | coach 1615 | briefcase 1616 | cooler 1617 | toilets 1618 | legos 1619 | boxing 1620 | dr pepper 1621 | play 1622 | honey 1623 | toshiba 1624 | chalk 1625 | jungle 1626 | brown and tan 1627 | hoodie 1628 | fluorescent 1629 | black and silver 1630 | shelves 1631 | rugby 1632 | crossing 1633 
| mother 1634 | ride 1635 | pointing 1636 | bookshelf 1637 | powdered sugar 1638 | arriving 1639 | wheels 1640 | 54 1641 | blankets 1642 | ultimate frisbee 1643 | sea 1644 | mattress 1645 | arrow 1646 | for sale 1647 | pirate 1648 | kawasaki 1649 | urinal 1650 | papers 1651 | wallpaper 1652 | cell 1653 | bronze 1654 | all of them 1655 | canon 1656 | over 1657 | nursing 1658 | cord 1659 | regular 1660 | rackets 1661 | pony 1662 | cherries 1663 | yellow and orange 1664 | klm 1665 | benches 1666 | saddle 1667 | arch 1668 | sepia 1669 | shell 1670 | pajamas 1671 | money 1672 | some 1673 | circus 1674 | swans 1675 | checkers 1676 | parking meters 1677 | pacific 1678 | los angeles 1679 | dishes 1680 | green and brown 1681 | air france 1682 | awake 1683 | sports 1684 | 2 hours 1685 | thanksgiving 1686 | kettle 1687 | suits 1688 | hanging 1689 | display 1690 | very high 1691 | coconut 1692 | in vase 1693 | angels 1694 | legs 1695 | wakeboarding 1696 | united kingdom 1697 | herding 1698 | jets 1699 | checker 1700 | buffalo 1701 | phillies 1702 | staring 1703 | teeth 1704 | me 1705 | pedestal 1706 | earring 1707 | robe 1708 | sweatshirt 1709 | identification 1710 | flickr 1711 | boxer 1712 | looking out window 1713 | gatorade 1714 | crosstown 1715 | birthday cake 1716 | cameraman 1717 | on toilet 1718 | dinosaur 1719 | muffin 1720 | garden 1721 | wine tasting 1722 | snowboarder 1723 | cafe 1724 | lifeguard 1725 | mixer 1726 | barrel 1727 | on man 1728 | computer screen 1729 | right handed 1730 | railroad crossing 1731 | on stove 1732 | bunk 1733 | stir fry 1734 | dugout 1735 | tarp 1736 | kitesurfing 1737 | checked 1738 | someone 1739 | menu 1740 | relish 1741 | russian 1742 | shepherd 1743 | donkey 1744 | heels 1745 | bleachers 1746 | healthy 1747 | rolex 1748 | green and orange 1749 | throwing frisbee 1750 | on sink 1751 | ivy 1752 | air conditioner 1753 | lines 1754 | raw 1755 | in park 1756 | poor 1757 | blackberry 1758 | new york city 1759 | diesel 1760 | radio 1761 | baskets 1762 | tree branch 1763 | ferry 1764 | scale 1765 | laptop computer 1766 | mall 1767 | robin 1768 | seeds 1769 | chiquita 1770 | dozens 1771 | olympics 1772 | purple and white 1773 | diamonds 1774 | king 1775 | tiled 1776 | green beans 1777 | sheet 1778 | dole 1779 | cage 1780 | far right 1781 | hitting 1782 | bunny 1783 | toast 1784 | horse racing 1785 | harbor 1786 | uniform 1787 | pedestrians 1788 | pitch 1789 | gravy 1790 | jockey 1791 | 53 1792 | urinals 1793 | accident 1794 | es 1795 | flat 1796 | blue green 1797 | navy blue 1798 | logo 1799 | hard 1800 | blueberries 1801 | design 1802 | against wall 1803 | photography 1804 | crib 1805 | castle 1806 | medal 1807 | farmer 1808 | in zoo 1809 | vent 1810 | sled 1811 | corona 1812 | catch 1813 | sedan 1814 | hood 1815 | raspberry 1816 | vehicle 1817 | looking at camera 1818 | shower curtain 1819 | camel 1820 | highway 1821 | kissing 1822 | vegetarian 1823 | onion rings 1824 | strips 1825 | street light 1826 | antelope 1827 | 65 1828 | dawn 1829 | rear 1830 | dad 1831 | driver 1832 | tow truck 1833 | retriever 1834 | eating grass 1835 | residential 1836 | moon 1837 | skull and crossbones 1838 | flour 1839 | eiffel tower 1840 | bridle 1841 | side of road 1842 | ibm 1843 | backward 1844 | handicap 1845 | toilet brush 1846 | tunnel 1847 | little girl 1848 | chains 1849 | badminton 1850 | too many to count 1851 | seattle 1852 | above sink 1853 | t shirt 1854 | cover 1855 | on his head 1856 | chevy 1857 | main 1858 | island 1859 | balls 1860 | light green 1861 | roast beef 
1862 | singing 1863 | blue white 1864 | detroit 1865 | 1 year 1866 | stretching 1867 | drinks 1868 | boarding 1869 | tongs 1870 | lace 1871 | hose 1872 | spiderman 1873 | ketchup and mustard 1874 | nikon 1875 | sniffing 1876 | tank 1877 | ski slope 1878 | bulls 1879 | audi 1880 | shearing 1881 | far 1882 | oriental 1883 | fox 1884 | himself 1885 | 1st 1886 | pocket 1887 | chrome 1888 | line 1889 | decorative 1890 | thumb 1891 | apartments 1892 | sliding 1893 | d 1894 | my best guess is yes 1895 | us open 1896 | baking 1897 | bottom left 1898 | silver and black 1899 | freight 1900 | raspberries 1901 | policeman 1902 | tourist 1903 | trick 1904 | frisbee golf 1905 | spain 1906 | fire extinguisher 1907 | in sink 1908 | unclear 1909 | continental 1910 | in suitcase 1911 | pink and yellow 1912 | far left 1913 | shirts 1914 | broom 1915 | news 1916 | under 1917 | skateboard trick 1918 | magazine 1919 | mouth 1920 | packing 1921 | gone 1922 | kangaroo 1923 | paisley 1924 | airplanes 1925 | yard 1926 | golden 1927 | wavy 1928 | trolley 1929 | cardinal 1930 | bottom right 1931 | playing video games 1932 | single 1933 | tires 1934 | printer 1935 | uniforms 1936 | deck 1937 | furniture 1938 | jelly 1939 | riding horse 1940 | croissant 1941 | xbox 1942 | shore 1943 | jesus 1944 | slippers 1945 | earphones 1946 | coffee cup 1947 | rubber 1948 | letters 1949 | rodeo 1950 | brushing her teeth 1951 | pc 1952 | petting 1953 | star wars 1954 | throw 1955 | braves 1956 | sunrise 1957 | windmill 1958 | bars 1959 | crest 1960 | in snow 1961 | taking selfie 1962 | video 1963 | peanut butter 1964 | coins 1965 | parakeet 1966 | black and orange 1967 | on laptop 1968 | fountain 1969 | directions 1970 | wii controllers 1971 | desserts 1972 | shut 1973 | trucks 1974 | fancy 1975 | wax paper 1976 | hiking 1977 | drain 1978 | baseball player 1979 | earrings 1980 | wii remotes 1981 | lid 1982 | clocks 1983 | 2008 1984 | heat 1985 | raft 1986 | forks 1987 | wheel 1988 | ox 1989 | mercedes benz 1990 | spoons 1991 | sesame 1992 | 68 1993 | married 1994 | smooth 1995 | club 1996 | haircut 1997 | mixed 1998 | bone 1999 | smaller 2000 | overhead 2001 | motor 2002 | coffee table 2003 | on field 2004 | climbing 2005 | buggy 2006 | bright 2007 | aluminum foil 2008 | mint 2009 | leg 2010 | triangles 2011 | k 2012 | on skateboard 2013 | frisby 2014 | 56 2015 | globe 2016 | savannah 2017 | cinnamon 2018 | ferris wheel 2019 | stopping 2020 | sailboats 2021 | us air force 2022 | eyeglasses 2023 | pedestrian crossing 2024 | 120 2025 | shopping 2026 | domestic 2027 | pottery 2028 | dairy 2029 | yogurt 2030 | metro 2031 | base 2032 | pigtails 2033 | ireland 2034 | alligator 2035 | lighter 2036 | maple leaf 2037 | gas station 2038 | on train 2039 | movie 2040 | flamingo 2041 | 101 2042 | cucumbers 2043 | cloud 2044 | plain 2045 | across street 2046 | stadium 2047 | above stove 2048 | liquid 2049 | sticker 2050 | spectators 2051 | decorations 2052 | skies 2053 | ascending 2054 | goatee 2055 | blue and orange 2056 | vinyl 2057 | pink and blue 2058 | ducati 2059 | speakers 2060 | scooters 2061 | heater 2062 | electronics 2063 | stew 2064 | 400 2065 | 1950 2066 | 72 2067 | security 2068 | intersection 2069 | magazines 2070 | 6 inches 2071 | meow 2072 | chili 2073 | speed 2074 | squatting 2075 | robot 2076 | pirates 2077 | remotes 2078 | seafood 2079 | 8 feet 2080 | pencil 2081 | graduation 2082 | lift 2083 | left hand 2084 | stands 2085 | squirrel 2086 | orange and green 2087 | placemat 2088 | space 2089 | wedding cake 2090 | den 2091 | 
heinz 2092 | on boat 2093 | jetblue 2094 | in kitchen 2095 | whale 2096 | bidet 2097 | branches 2098 | mountain dew 2099 | stuffed 2100 | rocky 2101 | braid 2102 | my best guess is no 2103 | vines 2104 | painted 2105 | pads 2106 | red and silver 2107 | cabinets 2108 | second 2109 | persian 2110 | h 2111 | scrambled 2112 | garlic 2113 | colorado 2114 | spray paint 2115 | main street 2116 | orchid 2117 | oar 2118 | swimsuit 2119 | washington monument 2120 | stormy 2121 | faucet 2122 | antenna 2123 | pizza hut 2124 | ping pong 2125 | lego 2126 | concert 2127 | abstract 2128 | dump truck 2129 | bats 2130 | walkway 2131 | cakes 2132 | 90 2133 | commuter 2134 | disc 2135 | paddle boarding 2136 | 12 feet 2137 | volvo 2138 | victorian 2139 | weather vane 2140 | wireless 2141 | multi colored 2142 | spots 2143 | tattoos 2144 | 1 way 2145 | in ocean 2146 | coaster 2147 | player 2148 | couple 2149 | never 2150 | very old 2151 | camper 2152 | turning 2153 | conductor 2154 | town 2155 | video games 2156 | daffodils 2157 | washing 2158 | skateboard park 2159 | costume 2160 | broke 2161 | surprise 2162 | photos 2163 | mack 2164 | plow 2165 | turtle 2166 | thomas 2167 | button 2168 | tour 2169 | blue jeans 2170 | advertisement 2171 | one in front 2172 | spider 2173 | peeing 2174 | 2nd 2175 | still 2176 | silk 2177 | for safety 2178 | surfers 2179 | target 2180 | macbook 2181 | jar 2182 | michigan 2183 | swiss 2184 | catch ball 2185 | state farm 2186 | classroom 2187 | great britain 2188 | dandelions 2189 | border collie 2190 | biking 2191 | fisheye 2192 | stained glass 2193 | war 2194 | motocross 2195 | reins 2196 | windowsill 2197 | semi 2198 | biplane 2199 | 100 feet 2200 | sideways 2201 | ladybug 2202 | in box 2203 | parmesan 2204 | napkins 2205 | wreath 2206 | case 2207 | harry potter 2208 | fog 2209 | 3 inches 2210 | forehand 2211 | plunger 2212 | above toilet 2213 | diet coke 2214 | winnie pooh 2215 | life jacket 2216 | chevron 2217 | hawaiian 2218 | suspenders 2219 | white and pink 2220 | united states of america 2221 | grocery 2222 | riding bike 2223 | brazil 2224 | polar bears 2225 | blue jay 2226 | carnation 2227 | herd 2228 | lobster 2229 | hit tennis ball 2230 | notebook 2231 | cartoon 2232 | soon 2233 | black and green 2234 | tusk 2235 | santa hat 2236 | baked 2237 | trunks 2238 | hockey 2239 | movement 2240 | western 2241 | thai 2242 | in corner 2243 | batman 2244 | dish 2245 | feeding giraffe 2246 | caramel 2247 | cirrus 2248 | ledge 2249 | behind man 2250 | 74 2251 | silverware 2252 | bunch 2253 | soldier 2254 | monster 2255 | verizon 2256 | brass 2257 | pelicans 2258 | patio 2259 | rv 2260 | at airport 2261 | 99 2262 | hsbc 2263 | footprints 2264 | cutting hair 2265 | shih tzu 2266 | on phone 2267 | horseback riding 2268 | sewing 2269 | air canada 2270 | buoy 2271 | calendar 2272 | paddling 2273 | 5 years 2274 | 10 years 2275 | sony ericsson 2276 | straight ahead 2277 | grape 2278 | orange and blue 2279 | joy 2280 | clown 2281 | lamps 2282 | family room 2283 | earbuds 2284 | farmers market 2285 | hammer time 2286 | shaking hands 2287 | teapot 2288 | finger 2289 | patterned 2290 | high heels 2291 | steeple 2292 | carnations 2293 | sticks 2294 | ski resort 2295 | hexagon 2296 | 64 2297 | butt 2298 | pine trees 2299 | almonds 2300 | tennis balls 2301 | 61 2302 | reebok 2303 | towards camera 2304 | huge 2305 | on bike 2306 | burton 2307 | wings 2308 | on bus 2309 | in basket 2310 | bank 2311 | dodge 2312 | bagels 2313 | cluttered 2314 | play tennis 2315 | middle one 2316 | marina 2317 | 
feathers 2318 | toddler 2319 | russia 2320 | thick 2321 | pear 2322 | ny 2323 | victoria 2324 | skiers 2325 | brown and yellow 2326 | yellow and brown 2327 | mound 2328 | licking 2329 | peaches 2330 | anniversary 2331 | cheetah 2332 | aa 2333 | paper towel 2334 | keys 2335 | no left turn 2336 | pens 2337 | bbq 2338 | transparent 2339 | twins 2340 | in tree 2341 | bus station 2342 | american airlines 2343 | protest 2344 | hound 2345 | elmo 2346 | fans 2347 | white one 2348 | leaning 2349 | coleslaw 2350 | gazebo 2351 | sheepdog 2352 | stripped 2353 | mango 2354 | fanta 2355 | fluffy 2356 | calf 2357 | under armour 2358 | throw frisbee 2359 | geico 2360 | paintings 2361 | lacoste 2362 | very big 2363 | more than 10 2364 | strap 2365 | index 2366 | camping 2367 | poop 2368 | peanuts 2369 | macaroni 2370 | ear 2371 | cop 2372 | run 2373 | kfc 2374 | lizard 2375 | sauerkraut 2376 | hiding 2377 | rye 2378 | tea kettle 2379 | hammock 2380 | grandfather 2381 | shopping cart 2382 | shells 2383 | skyscrapers 2384 | controllers 2385 | scratching 2386 | bunt 2387 | back left 2388 | ceiling fan 2389 | 3 ft 2390 | moose 2391 | size 2392 | 30 feet 2393 | yellow and gray 2394 | mother and child 2395 | 15 feet 2396 | tube 2397 | yacht 2398 | turn 2399 | fir 2400 | on sign 2401 | banana peel 2402 | on tree 2403 | muffins 2404 | very fast 2405 | singles 2406 | mouse pad 2407 | atv 2408 | no smoking 2409 | 50 years 2410 | vacation 2411 | competition 2412 | not at all 2413 | radish 2414 | birthday party 2415 | one in back 2416 | playing soccer 2417 | dough 2418 | coats 2419 | container 2420 | woman on left 2421 | toothpicks 2422 | coffee maker 2423 | sidecar 2424 | puma 2425 | logitech 2426 | crocs 2427 | 1 hour 2428 | poster 2429 | gazelle 2430 | junk 2431 | roll 2432 | lilac 2433 | mayo 2434 | tourists 2435 | for balance 2436 | rails 2437 | taller 2438 | cannot tell 2439 | no shirt 2440 | emergency 2441 | jean 2442 | skater 2443 | in bed 2444 | hawaii 2445 | one world 2446 | cigarettes 2447 | chest 2448 | sandy 2449 | ups 2450 | in motion 2451 | mario 2452 | fly 2453 | lasagna 2454 | life vest 2455 | 700 2456 | pearl 2457 | salt and pepper 2458 | stork 2459 | red light 2460 | blow dryer 2461 | somewhat 2462 | laminate 2463 | ears 2464 | factory 2465 | la 2466 | wing 2467 | males 2468 | plantains 2469 | nasa 2470 | acer 2471 | multiple 2472 | lighting 2473 | opened 2474 | bunk bed 2475 | posing for picture 2476 | mountainous 2477 | on rack 2478 | yorkie 2479 | 67 2480 | on mountain 2481 | snake 2482 | braids 2483 | easton 2484 | adults 2485 | show 2486 | tokyo 2487 | few 2488 | card 2489 | privacy 2490 | for protection 2491 | gmc 2492 | snowflakes 2493 | holding 2494 | tan and white 2495 | surprised 2496 | tropicana 2497 | blocks 2498 | rectangles 2499 | mosaic 2500 | gray and red 2501 | tin 2502 | wheelie 2503 | red wine 2504 | tools 2505 | classic 2506 | nature 2507 | greyhound 2508 | mayonnaise 2509 | floating 2510 | team 2511 | biker 2512 | welcome 2513 | 88 2514 | tennis match 2515 | food truck 2516 | room 2517 | best buy 2518 | city street 2519 | canadian 2520 | jet ski 2521 | in bathroom 2522 | balancing 2523 | landscape 2524 | ski sticks 2525 | yellow and pink 2526 | snow skiing 2527 | peace sign 2528 | enclosure 2529 | hallway 2530 | on tray 2531 | kayaking 2532 | broccoli and carrots 2533 | all way 2534 | railroad 2535 | 2 years 2536 | dark green 2537 | theater 2538 | descending 2539 | cleaner 2540 | collage 2541 | pickup 2542 | dark gray 2543 | prom 2544 | closet 2545 | feta 2546 | cameras 2547 | 
college 2548 | listening 2549 | frisbie 2550 | hanger 2551 | practice 2552 | on sand 2553 | sleeveless 2554 | person on left 2555 | 63 2556 | red and orange 2557 | younger 2558 | philadelphia 2559 | behind clouds 2560 | sightseeing 2561 | motorbikes 2562 | parasails 2563 | freezer 2564 | plastic wrap 2565 | stop light 2566 | wakeboard 2567 | zucchini 2568 | jp morgan 2569 | dog and cat 2570 | easter 2571 | pink and black 2572 | grocery store 2573 | hyundai 2574 | lava lamp 2575 | towing 2576 | 250 2577 | rome 2578 | homemade 2579 | oars 2580 | v 2581 | cola 2582 | great 2583 | whipped cream 2584 | chickens 2585 | 50 feet 2586 | safe 2587 | lemonade 2588 | selling 2589 | ginger 2590 | house cat 2591 | blue team 2592 | cat and dog 2593 | toward camera 2594 | riding motorcycle 2595 | pet 2596 | shaving 2597 | ahead 2598 | burrito 2599 | comfort 2600 | garbage can 2601 | shoulder 2602 | in wild 2603 | cathedral 2604 | cd 2605 | double decker bus 2606 | cruise ship 2607 | in oven 2608 | glaze 2609 | traffic lights 2610 | first base 2611 | qantas 2612 | website 2613 | scared 2614 | marines 2615 | tripod 2616 | 1950s 2617 | neon 2618 | sword 2619 | facebook 2620 | vw 2621 | handicapped 2622 | isuzu 2623 | tortilla 2624 | curious 2625 | violet 2626 | on his face 2627 | rams 2628 | 103 2629 | 20 mph 2630 | choppy 2631 | in stands 2632 | 4 ft 2633 | thailand 2634 | ticket 2635 | dome 2636 | syrup 2637 | bob 2638 | reds 2639 | laughing 2640 | tying tie 2641 | mo 2642 | man made 2643 | wood and metal 2644 | high chair 2645 | transport 2646 | 125 2647 | pedestrian 2648 | wrench 2649 | parrots 2650 | wisconsin 2651 | hilly 2652 | pita 2653 | grain 2654 | posts 2655 | baggage claim 2656 | baltimore 2657 | on snow 2658 | porch 2659 | fighter 2660 | dolphin 2661 | pink and purple 2662 | chimney 2663 | windsor 2664 | on runway 2665 | on hill 2666 | name 2667 | digital 2668 | busy 2669 | elm 2670 | planter 2671 | eat it 2672 | beets 2673 | under sink 2674 | brown bear 2675 | neon green 2676 | vintage 2677 | union station 2678 | lap 2679 | fires 2680 | crab 2681 | spiral 2682 | toilet seat 2683 | pans 2684 | backyard 2685 | greek 2686 | casserole 2687 | firefighter 2688 | print 2689 | fighter jet 2690 | balcony 2691 | grooming 2692 | white and tan 2693 | information 2694 | heavy 2695 | beads 2696 | professional 2697 | playground 2698 | oregon 2699 | half full 2700 | dashboard 2701 | kite string 2702 | buttons 2703 | tell time 2704 | tuna 2705 | only 2706 | turban 2707 | take off 2708 | nightstand 2709 | fireman 2710 | mail 2711 | name tag 2712 | sale 2713 | bookcase 2714 | close 2715 | j 2716 | ambulance 2717 | htc 2718 | red yellow 2719 | butterflies 2720 | melon 2721 | philips 2722 | slide 2723 | eye 2724 | upper left 2725 | blue and pink 2726 | omelet 2727 | sculpture 2728 | baggage 2729 | sprite 2730 | under bed 2731 | tiara 2732 | wine bottle 2733 | san diego 2734 | 6 ft 2735 | behind him 2736 | frame 2737 | train car 2738 | 85 2739 | flash 2740 | away from camera 2741 | rider 2742 | left handed 2743 | numerous 2744 | block sun 2745 | statue of liberty 2746 | downward 2747 | looking at phone 2748 | backpacks 2749 | colorful 2750 | sandal 2751 | looking for food 2752 | little boy 2753 | artwork 2754 | after 2755 | used 2756 | golden gate 2757 | babies 2758 | blt 2759 | wax 2760 | waffle 2761 | to hit ball 2762 | very long 2763 | man in middle 2764 | 98 2765 | wii bowling 2766 | the 2767 | yak 2768 | clip 2769 | partly 2770 | vehicles 2771 | disney 2772 | shepard 2773 | miami 2774 | mac and cheese 2775 | 
liquor 2776 | hilton 2777 | catholic 2778 | loading 2779 | countryside 2780 | for shade 2781 | steer 2782 | paper towels 2783 | casual 2784 | computer mouse 2785 | milking 2786 | tomato sauce 2787 | knee 2788 | lambs 2789 | pears 2790 | 747 2791 | festival 2792 | on plane 2793 | cards 2794 | tennessee 2795 | on rock 2796 | baker 2797 | slow down 2798 | hoagie 2799 | supreme 2800 | fashion 2801 | cans 2802 | ranch 2803 | photograph 2804 | ge 2805 | student 2806 | upper right 2807 | earth 2808 | white black 2809 | south africa 2810 | stainless 2811 | pretty 2812 | suit and tie 2813 | to see 2814 | boating 2815 | students 2816 | blue and silver 2817 | red velvet 2818 | riding horses 2819 | dreadlocks 2820 | riding elephant 2821 | blinders 2822 | burgers 2823 | goal 2824 | jackson 2825 | bored 2826 | water ski 2827 | holding it 2828 | amazon 2829 | foreign 2830 | stuffed toy 2831 | herself 2832 | deli 2833 | lays 2834 | dog bed 2835 | on roof 2836 | stoplight 2837 | 140 2838 | 5 ft 2839 | jal 2840 | iris 2841 | practicing 2842 | skillet 2843 | laundry 2844 | gym 2845 | down street 2846 | pickup truck 2847 | buns 2848 | rottweiler 2849 | flat screen 2850 | motel 2851 | by window 2852 | pipes 2853 | 89 2854 | photographing 2855 | wii sports 2856 | by water 2857 | paw 2858 | worms 2859 | 3rd 2860 | gaming 2861 | deep dish 2862 | multicolor 2863 | digging 2864 | kingfisher 2865 | blueberry 2866 | union pacific 2867 | bell pepper 2868 | flower pot 2869 | 76 2870 | plastic bag 2871 | block 2872 | crows 2873 | googles 2874 | on computer 2875 | sure 2876 | under tree 2877 | path 2878 | lion king 2879 | cilantro 2880 | orange and red 2881 | in woods 2882 | on side 2883 | paper plate 2884 | turf 2885 | moo 2886 | parking garage 2887 | light pole 2888 | skeleton 2889 | towel rack 2890 | new york yankees 2891 | under table 2892 | switzerland 2893 | spices 2894 | feather 2895 | bus driver 2896 | u haul 2897 | mustard and ketchup 2898 | countertop 2899 | meeting 2900 | pomeranian 2901 | flats 2902 | drying 2903 | very deep 2904 | telling time 2905 | calculator 2906 | heron 2907 | weather 2908 | egypt 2909 | pee 2910 | rhino 2911 | artificial 2912 | dead end 2913 | thousands 2914 | throw ball 2915 | cushion 2916 | gull 2917 | prince 2918 | melbourne 2919 | to catch ball 2920 | deciduous 2921 | around neck 2922 | fast food 2923 | analog 2924 | playing games 2925 | laptop screen 2926 | palms 2927 | check 2928 | venice 2929 | baseball mitt 2930 | scotland 2931 | grinding 2932 | grind 2933 | father 2934 | brown and green 2935 | late afternoon 2936 | limes 2937 | violin 2938 | visibility 2939 | urinating 2940 | shovel 2941 | mural 2942 | equestrian 2943 | 2005 2944 | milking cow 2945 | sweat 2946 | more than 20 2947 | solid 2948 | 1900 2949 | us airways 2950 | kneeling 2951 | his left 2952 | 57 2953 | scarves 2954 | sailing 2955 | boeing 2956 | stem 2957 | lower 2958 | soccer field 2959 | storm 2960 | celebrating 2961 | asleep 2962 | panasonic 2963 | man and woman 2964 | parent 2965 | north face 2966 | on tennis court 2967 | sas 2968 | street lights 2969 | wait 2970 | 69 2971 | bird feeder 2972 | comforter 2973 | in window 2974 | feeder 2975 | tricks 2976 | using laptop 2977 | kleenex 2978 | jackets 2979 | style 2980 | taco 2981 | bowls 2982 | long time 2983 | sweden 2984 | zig zag 2985 | december 2986 | boardwalk 2987 | toronto 2988 | stuff 2989 | using computer 2990 | skinny 2991 | mesh 2992 | buffet 2993 | burnt 2994 | walmart 2995 | tigers 2996 | no entry 2997 | bending 2998 | bay 2999 | angry birds 3000 | 
cleveland 3001 | dc 3002 | -------------------------------------------------------------------------------- /data/vqa/gt_layout_train2014_new_parse.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/data/vqa/gt_layout_train2014_new_parse.npy -------------------------------------------------------------------------------- /data/vqa/gt_layout_val2014_new_parse.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/data/vqa/gt_layout_val2014_new_parse.npy -------------------------------------------------------------------------------- /data/vqa/vocabulary_layout.txt: -------------------------------------------------------------------------------- 1 | _Find 2 | _Transform 3 | _And 4 | _Describe 5 | 6 | -------------------------------------------------------------------------------- /data/vqa/vocabulary_vqa_glove.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/data/vqa/vocabulary_vqa_glove.npy -------------------------------------------------------------------------------- /eval_model/eval_example.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | from torch.autograd import Variable 3 | from models.layout_assembler import Assembler 4 | from Utils.data_reader import DataReader 5 | import sys 6 | 7 | import os 8 | import torch 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | import matplotlib.ticker as ticker 12 | 13 | 14 | from Utils.text_processing import * 15 | 16 | from eval_model.layout_evaluator import run_eval 17 | 18 | 19 | # Data files 20 | vocab_question_file = './exp_clevr/data/vocabulary_clevr.txt' 21 | vocab_layout_file = './exp_clevr/data/vocabulary_layout.txt' 22 | vocab_answer_file = './exp_clevr/data/answers_clevr.txt' 23 | 24 | 25 | exp_name = 'clevr_gt_layout' 26 | tst_image_set = 'val' 27 | out_file = "layout_learning_on_eval_dataset.txt" 28 | 29 | snapshot_name='%08d' % 500 30 | 31 | T_encoder = 45 32 | T_decoder = 20 33 | N = 64 34 | prune_filter_module = True 35 | 36 | 37 | 38 | imdb_file_tst = './exp_clevr/data/imdb/imdb_%s.npy' % tst_image_set 39 | snapshot_file = './exp_clevr/tfmodel/%s/%s' % (exp_name, snapshot_name) 40 | 41 | save_file = './exp_clevr/results/%s/%s.%s.txt' % (exp_name, snapshot_name, tst_image_set) 42 | os.makedirs(os.path.dirname(save_file), exist_ok=True) 43 | 44 | eval_output_file = './exp_clevr/eval_outputs/%s/%s.%s.txt' % (exp_name, snapshot_name, tst_image_set) 45 | os.makedirs(os.path.dirname(eval_output_file), exist_ok=True) 46 | 47 | assembler = Assembler(vocab_layout_file) 48 | data_reader_tst = DataReader(imdb_file_tst, shuffle=False, one_pass=True, 49 | batch_size=N, 50 | T_encoder=T_encoder, 51 | T_decoder=T_decoder, 52 | assembler=assembler, 53 | vocab_question_file=vocab_question_file, 54 | vocab_answer_file=vocab_answer_file, 55 | prune_filter_module=prune_filter_module) 56 | 57 | print('Running test ...') 58 | answer_correct_total = 0 59 | layout_correct_total = 0 60 | layout_valid_total = 0 61 | num_questions_total = 0 62 | answer_word_list = data_reader_tst.batch_loader.answer_dict.word_list 63 | output_answers = [] 64 | 
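# The block below loads a saved seq2seq model snapshot with torch.load, pulls a
# single prefetched batch from the test reader, and decodes it with the
# ground-truth layouts supplied as decoder targets. plot_sample() then renders
# the decoder attention matrix for one question: question words on the x-axis,
# predicted module tokens on the y-axis.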
65 | ##load my model 66 | model = torch.load(snapshot_file) 67 | 68 | n_total = 0 69 | 70 | batch = data_reader_tst.prefetch_queue.get(block=True) 71 | _, batch_size = batch['input_seq_batch'].shape 72 | 73 | input_text_seq_lens = batch['seq_length_batch'] 74 | input_text_seqs = batch['input_seq_batch'] 75 | input_layouts = batch['gt_layout_batch'] 76 | 77 | num_questions_total += batch_size 78 | 79 | n_correct_layout = 0 80 | 81 | input_variable = Variable(torch.LongTensor(input_text_seqs)) 82 | 83 | target_variable = Variable(torch.LongTensor(input_layouts)) 84 | 85 | myLayouts, myAttentions = model(input_variable, input_text_seq_lens, target_variable) 86 | predicted_layouts = torch.topk(myLayouts, 1)[1].cpu().data.numpy()[:, :, 0] 87 | 88 | 89 | sample_idx = 17 90 | 91 | 92 | 93 | def plot_sample(sample_idx): 94 | example_text = input_text_seqs[:,sample_idx] 95 | example_att = myAttentions.data.cpu().numpy()[sample_idx,:,:] 96 | example_layout = predicted_layouts[:,sample_idx] 97 | word_list = load_str_list(vocab_question_file) 98 | layout_list=load_str_list(vocab_layout_file) 99 | sentence = list(map(lambda x:word_list[x],example_text)) 100 | layout = list(map(lambda x: layout_list[x],example_layout)) 101 | sentence_len = sum(1 for i in sentence if i != ';') 102 | layout_len = sum(1 for i in layout if i != '') 103 | fig = plt.figure() 104 | ax = fig.add_subplot(111) 105 | cax = ax.matshow(example_att[0:layout_len,0:sentence_len],cmap='bone') 106 | fig.colorbar(cax) 107 | ax.set_xticklabels(['']+sentence[0:sentence_len],rotation=90) 108 | ax.set_yticklabels(['']+layout[0:layout_len]) 109 | 110 | ax.xaxis.set_major_locator(ticker.MultipleLocator(1)) 111 | ax.yaxis.set_major_locator(ticker.MultipleLocator(1)) 112 | plt.show() 113 | 114 | 115 | plot_sample(17) -------------------------------------------------------------------------------- /eval_model/eval_layout_accuracy.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | from eval_model.layout_evaluator import run_eval 3 | import argparse 4 | 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--gpu_id', type=int, default=0) 8 | 9 | parser.add_argument('--exp_name', required=True) 10 | parser.add_argument('--snapshot_name', required=True) 11 | parser.add_argument('--test_split', required=True) 12 | parser.add_argument("--data_dir",type=str, required=True) 13 | parser.add_argument("--image_dir",type=str, required=True) 14 | parser.add_argument("--model_dir",type=str, required=True) 15 | 16 | args = parser.parse_args() 17 | 18 | 19 | exp_name = args.exp_name 20 | snapshot_name = args.snapshot_name 21 | tst_image_set = args.test_split 22 | data_dir = args.data_dir 23 | image_dir = args.image_dir 24 | model_dir = args.model_dir 25 | 26 | layout_accuracy, layout_correct_total, num_questions_total,answer_accuracy =\ 27 | run_eval(exp_name,snapshot_name,tst_image_set,data_dir, image_dir, model_dir, print_log=True) 28 | 29 | 30 | print('On split: %s' % tst_image_set) 31 | print('\t layout accuracy = %f (%d / %d) answer_accuracy= %f' % 32 | (layout_accuracy, layout_correct_total, num_questions_total,answer_accuracy)) -------------------------------------------------------------------------------- /eval_model/eval_layout_learning.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from eval_model.layout_evaluator import run_eval 5 | 6 | exp_name = 
'clevr_gt_layout' 7 | dataSplitSet = 'val' 8 | out_file = "layout_learning_on_eval_dataset.txt" 9 | 10 | eval_results = [] 11 | 12 | 13 | with open(out_file, 'w') as f: 14 | for i_iter in range(500): 15 | snapshot_name = '%08d' % i_iter 16 | snapshot_file = './exp_clevr/tfmodel/%s/%s' % (exp_name, snapshot_name) 17 | if os.path.exists(snapshot_file): 18 | accuracy,_, total = run_eval(exp_name, snapshot_name, dataSplitSet) 19 | eval_results.append((i_iter,accuracy,total)) 20 | print("iter:", i_iter,"\taccuracy:", accuracy, "\ttotal:", total) 21 | sys.stdout.flush() 22 | print("iter:", i_iter, "\taccuracy:", accuracy, "\ttotal:", total,file=f) 23 | 24 | -------------------------------------------------------------------------------- /eval_model/layout_evaluator.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | from torch.autograd import Variable 3 | from models.layout_assembler import Assembler 4 | from Utils.data_reader import DataReader 5 | 6 | import os 7 | import torch 8 | import numpy as np 9 | 10 | from global_variables.global_variables import use_cuda 11 | 12 | 13 | T_encoder = 45 14 | T_decoder = 10 15 | N = 64 16 | prune_filter_module = True 17 | 18 | 19 | 20 | 21 | def run_eval(exp_name, snapshot_name, tst_image_set, data_dir, image_feat_dir, tf_model_dir,print_log = False): 22 | vocab_question_file = os.path.join(data_dir,"vocabulary_clevr.txt") 23 | vocab_layout_file = os.path.join(data_dir,"vocabulary_layout.txt") 24 | vocab_answer_file = os.path.join(data_dir,"answers_clevr.txt") 25 | 26 | imdb_file_tst_base_name = 'imdb_%s.npy' % tst_image_set 27 | imdb_file_tst = os.path.join(data_dir,"imdb",imdb_file_tst_base_name) 28 | 29 | image_feat_dir_tst = os.path.join(image_feat_dir,tst_image_set) 30 | 31 | #module_snapshot_file = './exp_clevr/tfmodel/%s/%s' % (exp_name, "model_"+snapshot_name) 32 | 33 | module_snapshot_file = os.path.join(tf_model_dir, exp_name, "model_"+snapshot_name) 34 | assembler = Assembler(vocab_layout_file) 35 | 36 | data_reader_tst = DataReader(imdb_file_tst,image_feat_dir_tst, shuffle=False, one_pass=True, 37 | batch_size=N, 38 | T_encoder=T_encoder, 39 | T_decoder=T_decoder, 40 | assembler=assembler, 41 | vocab_question_file=vocab_question_file, 42 | vocab_answer_file=vocab_answer_file, 43 | prune_filter_module=prune_filter_module) 44 | 45 | 46 | if data_reader_tst is not None: 47 | print('Running test ...') 48 | 49 | 50 | answer_correct_total = 0 51 | layout_correct_total = 0 52 | layout_valid_total = 0 53 | num_questions_total = 0 54 | 55 | 56 | ##load my model 57 | myModel = torch.load(module_snapshot_file) 58 | 59 | 60 | for i, batch in enumerate(data_reader_tst.batches()): 61 | 62 | _, batch_size = batch['input_seq_batch'].shape 63 | 64 | input_text_seq_lens = batch['seq_length_batch'] 65 | input_text_seqs = batch['input_seq_batch'] 66 | input_layouts = batch['gt_layout_batch'] 67 | input_images = batch['image_feat_batch'] 68 | input_answers = batch['answer_label_batch'] 69 | 70 | num_questions_total += batch_size 71 | 72 | 73 | input_txt_variable = Variable(torch.LongTensor(input_text_seqs)) 74 | input_txt_variable = input_txt_variable.cuda() if use_cuda else input_txt_variable 75 | 76 | input_layout_variable = None 77 | 78 | _, _, myAnswer, predicted_layouts, expr_validity_array,_ = myModel( 79 | input_txt_variable=input_txt_variable, input_text_seq_lens=input_text_seq_lens, 80 | input_layout_variable=input_layout_variable, 81 | 
input_answers=None, input_images=input_images,sample_token=False) 82 | 83 | 84 | layout_correct_total += np.sum(np.all(predicted_layouts == input_layouts, axis=0)) 85 | 86 | 87 | answer_correct_total += np.sum(np.logical_and(expr_validity_array, myAnswer == input_answers)) 88 | 89 | layout_valid_total += np.sum(expr_validity_array) 90 | 91 | ##current accuracy 92 | layout_accuracy = layout_correct_total / num_questions_total 93 | answer_accuracy = answer_correct_total / num_questions_total 94 | layout_validity = layout_valid_total / num_questions_total 95 | 96 | if (i+1)%100 ==0 and print_log: 97 | print("iter:", i + 1, " layout_accuracy=%.4f"% layout_accuracy, 98 | " answer_accuracy=%.4f"% answer_accuracy, 99 | " layout_validity=%.4f"% layout_validity,) 100 | 101 | 102 | 103 | return layout_accuracy, layout_correct_total ,num_questions_total, answer_accuracy -------------------------------------------------------------------------------- /global_variables/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/global_variables/__init__.py -------------------------------------------------------------------------------- /global_variables/global_variables.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | use_cuda = torch.cuda.is_available() 5 | 6 | model_type_gt = "gt_layout" 7 | model_type_scratch = "scratch" 8 | model_type_gt_rl = "gt+rl" 9 | -------------------------------------------------------------------------------- /loadn2nmn_pytorch_env.sh: -------------------------------------------------------------------------------- 1 | export PYTHONPATH="/private/home/tinayujiang/n2nmn_pytorch:$PYTHONPATH" 2 | 3 | module load cudnn/v7.0-cuda.9.0 4 | 5 | -------------------------------------------------------------------------------- /models/Attention2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import numpy as np 6 | from global_variables.global_variables import use_cuda 7 | 8 | 9 | class EncoderRNN(nn.Module): 10 | def __init__(self, input_size, hidden_size, input_encoding_size, num_layers=1): 11 | super(EncoderRNN, self).__init__() 12 | self.hidden_size = hidden_size 13 | self.num_layers = num_layers 14 | 15 | self.embedding = nn.Embedding(input_size, input_encoding_size) 16 | self.lstm = nn.LSTM(input_encoding_size, hidden_size) 17 | 18 | def forward(self, input_seqs, input_seq_lens, hidden): 19 | embedded = self.embedding(input_seqs) 20 | outputs, hidden = self.lstm(embedded) 21 | return outputs, hidden, embedded 22 | 23 | def initHidden(self,batch_size): 24 | result = Variable(torch.zeros(self.num_layers, batch_size, self.hidden_size)) 25 | if use_cuda: 26 | return result.cuda() 27 | else: 28 | return result 29 | 30 | 31 | 32 | class AttnDecoderRNN(nn.Module): 33 | def __init__(self, hidden_size, output_size, output_encoding_size, 34 | max_decoder_len=0, dropout_p=0.1,num_layers = 1, 35 | assembler_w=None, assembler_b=None, assembler_p = None,EOStoken=-1): 36 | super(AttnDecoderRNN, self).__init__() 37 | self.hidden_size = hidden_size 38 | self.output_size = output_size 39 | self.output_encoding_size = output_encoding_size 40 | self.dropout_p = dropout_p 41 | self.num_layers = num_layers 42 | self.max_decoder_len = max_decoder_len 
43 | 44 | self.go_embeding = nn.Embedding(1, self.output_encoding_size) 45 | self.embedding = nn.Embedding(self.output_size, self.output_encoding_size) 46 | self.dropout = nn.Dropout(self.dropout_p) 47 | self.lstm = nn.LSTM(self.output_encoding_size, self.hidden_size) 48 | self.out = nn.Linear(self.hidden_size * 2, self.output_size) 49 | self.encoderLinear = nn.Linear(self.hidden_size, self.hidden_size) 50 | self.decoderLinear = nn.Linear(self.hidden_size, self.hidden_size) 51 | self.attnLinear = nn.Linear(self.hidden_size, 1) 52 | self.assembler_w = torch.FloatTensor(assembler_w).cuda() if use_cuda else torch.FloatTensor(assembler_w) 53 | self.assembler_b = torch.FloatTensor(assembler_b).cuda() if use_cuda else torch.FloatTensor(assembler_b) 54 | self.assembler_p = torch.FloatTensor(assembler_p).cuda() if use_cuda else torch.FloatTensor(assembler_p) 55 | self.batch_size = 0 56 | self.EOS_token = EOStoken 57 | self._init_par() 58 | 59 | def _init_par(self): 60 | torch.nn.init.xavier_uniform(self.decoderLinear.weight) 61 | torch.nn.init.xavier_uniform(self.attnLinear.weight) 62 | torch.nn.init.constant(self.decoderLinear.bias,0) 63 | torch.nn.init.constant(self.attnLinear.bias,0) 64 | ''' 65 | compute if a token is valid at current sequence 66 | decoding_state [N,3] 67 | assembler_w [3,output_size, 4 ] 68 | assembler_b [output_size, 4] 69 | output [N, output_size] 70 | ''' 71 | def _get_valid_tokens(self,decoding_state, assembler_W, assembler_b): 72 | 73 | batch_size = decoding_state.size(0) 74 | expanded_state = decoding_state.view(batch_size,3,1,1).expand(batch_size, 3, self.output_size, 4) 75 | 76 | expanded_w= assembler_W.view(1,3, self.output_size,4).expand(batch_size, 3, self.output_size, 4) 77 | 78 | tmp1 = torch.sum(expanded_state * expanded_w, dim=1) 79 | expanded_b = assembler_b.view(1,-1,4).expand(batch_size,-1,4) 80 | tmp2= tmp1 - expanded_b 81 | tmp3 = torch.min(tmp2,dim=2)[0] 82 | token_invalidity = torch.lt(tmp3, 0) 83 | token_invalidity = token_invalidity.cuda() if use_cuda else token_invalidity 84 | return token_invalidity 85 | 86 | 87 | ''' 88 | update the decoding state, which is used to determine if a token is valid 89 | decoding_state [N,3] 90 | assembler_p [output_size, 3] 91 | predicted_token [N,output_size] 92 | output [N, output_size] 93 | ''' 94 | def _update_decoding_state(self, decoding_state, predicted_token, assembler_P): 95 | decoding_state = decoding_state + torch.mm(predicted_token , assembler_P) 96 | return decoding_state 97 | 98 | 99 | ''' 100 | for a give state compute the lstm hidden layer, attention and predicted layers 101 | can handle the situation where seq_len is 1 or >1 (i.e., s=using groudtruth layout) 102 | 103 | input parameters : 104 | time: int, time step of decoder 105 | previous_token: [decoder_len, batch], decoder_len=1 for step-by-step decoder 106 | previous_hidden_state: (h_n, c_n), dimmension:both are (num_layers * num_directions, batch, hidden_size) 107 | encoder_outputs : outputs from LSTM in encoder[seq_len, batch, hidden_size * num_directions] 108 | encoder_lens: list of input sequence lengths 109 | decoding_state: the state used to decide valid tokens 110 | 111 | output parameters : 112 | predicted_token: [decoder_len, batch] 113 | Att_weighted_text: batch,out_len,txt_embed_dim 114 | log_seq_prob: [batch] 115 | neg_entropy: [batch] 116 | ''' 117 | def _step_by_step_attention_decoder(self, time, embedded, previous_hidden_state, 118 | encoder_outputs, encoder_lens, decoding_state,target_variable,sample_token): 119 | 120 | ##step1 run 
LSTM to get decoder hidden state 121 | seq_len = encoder_outputs.size(0) 122 | batch_size = encoder_outputs.size(1) 123 | hidden_size = encoder_outputs.size(2) 124 | 125 | out_len = embedded.size(0) 126 | 127 | output, hidden = self.lstm(embedded, previous_hidden_state) 128 | ##step2: use function in Eq(2) of the paper to compute attention 129 | ##size encoder_outputs (seq_len,batch_size,hidden_size)==>(out_len,seq_len,batch_size,hidden_size) 130 | encoder_outputs_expand = encoder_outputs.view(1, seq_len, batch_size, hidden_size).expand(out_len, seq_len, 131 | batch_size, 132 | hidden_size) 133 | encoder_transform = self.encoderLinear(encoder_outputs_expand) 134 | 135 | ##size output (out_len,batch_size,hidden_size) 136 | output_expand = output.view(out_len, 1, batch_size, hidden_size).expand(out_len, seq_len, batch_size, 137 | hidden_size) 138 | output_transfrom = self.decoderLinear(output_expand) 139 | 140 | ##raw_attention size (out_len,seq_len,batch_size,1) 141 | raw_attention = self.attnLinear(F.tanh(encoder_transform + output_transfrom)).view(out_len, seq_len, 142 | batch_size) ## Eq2 143 | 144 | # (out_len, seq_len, batch_size)==>(batch_size,out_len,seq_len) 145 | raw_attention = raw_attention.permute(2, 0, 1) 146 | 147 | ##mask the end of the question 148 | if encoder_lens is not None: 149 | mask = np.ones((batch_size, out_len, seq_len)) 150 | for i, v in enumerate(encoder_lens): 151 | mask[i, :, 0:v] = 0 152 | mask_tensor = torch.ByteTensor(mask) 153 | mask_tensor = mask_tensor.cuda() if use_cuda else mask_tensor 154 | raw_attention.data.masked_fill_(mask_tensor, -float('inf')) 155 | 156 | attention = F.softmax(raw_attention, dim=2) ##(batch,out_len,seq_len) 157 | 158 | 159 | ##c_t = \sum_{i=1}^I att_{ti}h_i t: decoder time t, and encoder time i 160 | ## (seq_len,batch_size,hidden_size) ==>(batch_size,seq_len,hidden_size) 161 | encoder_batch_first = encoder_outputs.permute(1, 0, 2) 162 | context = torch.bmm(attention, encoder_batch_first) 163 | 164 | ##(out_len,batch,hidden_size) --> (batch,out_len,hidden_size) 165 | output_batch_first = output.permute(1, 0, 2) 166 | 167 | ##(batch,out_len,hidden_size*2) 168 | combined = torch.cat((context, output_batch_first), dim=2).permute(1, 0, 2) 169 | 170 | ## [out_len,batch,out_size] 171 | output_prob = F.softmax(self.out(combined), dim=2) 172 | 173 | 174 | 175 | ##get the valid token for current position based on previous token to perform a mask for next prediction 176 | ## token_validity [N, output_size] 177 | token_invalidity = self._get_valid_tokens(decoding_state=decoding_state, 178 | assembler_W=self.assembler_w, 179 | assembler_b=self.assembler_b) 180 | 181 | ## probs 182 | probs = output_prob.view(-1,self.output_size) 183 | probs.data.masked_fill_(token_invalidity,0.0) 184 | probs_sum = torch.sum(probs, dim=1, keepdim=True) 185 | probs = probs/probs_sum 186 | 187 | 188 | if target_variable is not None: 189 | predicted_token = target_variable[time, :].view(-1,1) 190 | elif sample_token: 191 | predicted_token = probs.multinomial() 192 | else: 193 | predicted_token = torch.max(probs, dim=1)[1].view(-1, 1) 194 | 195 | 196 | ##[batch_size, self.output_size] 197 | tmp = torch.zeros(batch_size, self.output_size) 198 | tmp = tmp.cuda() if use_cuda else tmp 199 | predicted_token_encoded = tmp.scatter_(1, predicted_token.data, 1.0) 200 | predicted_token_encoded = predicted_token_encoded.cuda() if use_cuda else predicted_token_encoded 201 | 202 | updated_decoding_state = self._update_decoding_state(decoding_state=decoding_state, 203 | 
predicted_token=predicted_token_encoded, 204 | assembler_P=self.assembler_p) 205 | 206 | ## compute the negative entropy 207 | token_invalidity_float = Variable(token_invalidity.type(torch.FloatTensor)).detach() 208 | token_invalidity_float = token_invalidity_float.cuda() if use_cuda else token_invalidity_float 209 | token_neg_entropy = torch.sum(probs.detach() * torch.log(probs + 0.000001), dim=1) 210 | 211 | ## compute log_seq_prob 212 | selected_token_log_prob =torch.log(torch.sum(probs * Variable(predicted_token_encoded), dim=1)+ 0.000001) 213 | 214 | 215 | return predicted_token.permute(1, 0), hidden, attention, updated_decoding_state,token_neg_entropy, selected_token_log_prob 216 | 217 | 218 | 219 | 220 | 221 | def forward(self,encoder_hidden,encoder_outputs,encoder_lens,target_variable,sample_token): 222 | self.batch_size = encoder_outputs.size(1) 223 | total_neg_entropy = 0 224 | total_seq_prob = 0 225 | 226 | ## set initiate step: 227 | time = 0 228 | start_token = Variable(torch.LongTensor(np.zeros((1, self.batch_size))), requires_grad=False) 229 | start_token = start_token.cuda() if use_cuda else start_token 230 | next_input = self.go_embeding(start_token) 231 | next_decoding_state = torch.FloatTensor([[0, 0, self.max_decoder_len]]).expand(self.batch_size, 3).contiguous() 232 | next_decoding_state = next_decoding_state.cuda() if use_cuda else next_decoding_state 233 | loop_state = True 234 | previous_hidden = encoder_hidden 235 | 236 | while time < self.max_decoder_len : 237 | predicted_token, previous_hidden, context, next_decoding_state, neg_entropy, log_seq_prob = \ 238 | self._step_by_step_attention_decoder(time=time, 239 | embedded= next_input, 240 | previous_hidden_state=previous_hidden, encoder_outputs=encoder_outputs, 241 | encoder_lens=encoder_lens, decoding_state=next_decoding_state,target_variable= target_variable,sample_token=sample_token) 242 | 243 | if time == 0: 244 | predicted_tokens = predicted_token 245 | total_neg_entropy = neg_entropy 246 | total_seq_prob = log_seq_prob 247 | context_total = context 248 | else: 249 | predicted_tokens = torch.cat((predicted_tokens, predicted_token)) 250 | total_neg_entropy += neg_entropy 251 | total_seq_prob += log_seq_prob 252 | context_total = torch.cat((context_total, context), dim=1) 253 | 254 | time +=1 255 | next_input =self.embedding(predicted_token) 256 | loop_state = torch.ne(predicted_token, self.EOS_token).any() 257 | 258 | return predicted_tokens, context_total, total_neg_entropy, total_seq_prob 259 | 260 | 261 | 262 | 263 | class attention_seq2seq(nn.Module): 264 | def __init__(self, encoder, decoder): 265 | super(attention_seq2seq, self).__init__() 266 | self.encoder = encoder 267 | self.decoder = decoder 268 | 269 | def forward(self, input_seqs,input_seq_lens,target_variable,sample_token): 270 | encoder_hidden = self.encoder.initHidden(len(input_seq_lens)) 271 | encoder_outputs, encoder_hidden, txt_embedded = self.encoder(input_seqs,input_seq_lens, encoder_hidden) 272 | decoder_results, attention, neg_entropy, log_seq_prob = self.decoder(target_variable=target_variable, 273 | encoder_hidden= encoder_hidden, 274 | encoder_outputs= encoder_outputs, 275 | encoder_lens=input_seq_lens, sample_token=sample_token 276 | ) 277 | ##using attention from decoder and txt_embedded from the encoder to get the attention weighted text 278 | ## txt_embedded [seq_len,batch,input_encoding_size] 279 | ## attention [batch, out_len,seq_len] 280 | txt_embedded_perm = txt_embedded.permute(1,0,2) 281 | att_weighted_text = 
torch.bmm(attention, txt_embedded_perm) 282 | 283 | 284 | return decoder_results, att_weighted_text, neg_entropy, log_seq_prob 285 | #return decoder_results, attention, neg_entropy, log_seq_prob 286 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/models/__init__.py -------------------------------------------------------------------------------- /models/custom_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class custom_loss(nn.Module): 5 | def __init__(self,lambda_entropy): 6 | super(custom_loss, self).__init__() 7 | self.lambda_entropy = lambda_entropy 8 | 9 | def forward(self, neg_entropy, answer_loss, policy_gradient_losses=None,layout_loss =None): 10 | answer = torch.mean(answer_loss) 11 | #entropy = torch.mean(neg_entropy) 12 | #policy_gradient = torch.mean(policy_gradient_losses) 13 | #print(" answer= %f, entropy = %f, policy_gradient = %f" % 14 | # (answer,entropy,policy_gradient)) 15 | 16 | if layout_loss is None: 17 | return torch.mean(neg_entropy) * self.lambda_entropy +\ 18 | torch.mean(answer_loss)+torch.mean(policy_gradient_losses), answer 19 | else: 20 | return answer + layout_loss, answer 21 | -------------------------------------------------------------------------------- /models/end2endModuleNet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import sys 4 | from models.Attention2 import * 5 | from models.module_net import * 6 | from Utils.utils import unique_columns 7 | 8 | 9 | 10 | 11 | class end2endModuleNet(nn.Module): 12 | def __init__(self, num_vocab_txt, num_vocab_nmn, out_num_choices, 13 | embed_dim_nmn, embed_dim_txt, image_height, image_width, in_image_dim, 14 | hidden_size, assembler, layout_criterion, answer_criterion,max_layout_len, num_layers=1, decoder_dropout=0,**kwarg): 15 | 16 | super(end2endModuleNet, self).__init__() 17 | 18 | self.assembler = assembler 19 | self.layout_criterion = layout_criterion 20 | self.answer_criterion = answer_criterion 21 | 22 | 23 | ##initiate encoder and decoder 24 | myEncoder = EncoderRNN(num_vocab_txt, hidden_size, embed_dim_txt, num_layers) 25 | myDecoder = AttnDecoderRNN(hidden_size, num_vocab_nmn, embed_dim_nmn, 26 | max_decoder_len = max_layout_len, 27 | dropout_p=decoder_dropout, num_layers= num_layers, 28 | assembler_w=self.assembler.W, assembler_b=self.assembler.b, 29 | assembler_p=self.assembler.P, EOStoken=self.assembler.EOS_idx) 30 | 31 | if use_cuda: 32 | myEncoder = myEncoder.cuda() 33 | myDecoder = myDecoder.cuda() 34 | 35 | 36 | ##initatiate attentionSeq2seq 37 | mySeq2seq = attention_seq2seq(myEncoder, myDecoder) 38 | self.mySeq2seq = mySeq2seq.cuda() if use_cuda else mySeq2seq 39 | 40 | 41 | ##initiate moduleNet 42 | myModuleNet = module_net(image_height=image_height, image_width=image_width, in_image_dim=in_image_dim, 43 | in_text_dim=embed_dim_txt, out_num_choices=out_num_choices, map_dim=hidden_size) 44 | 45 | self.myModuleNet = myModuleNet.cuda() if use_cuda else myModuleNet 46 | 47 | def forward(self, input_txt_variable, input_text_seq_lens, 48 | input_images, input_answers, 49 | input_layout_variable,sample_token, policy_gradient_baseline=None, 50 | baseline_decay=None): 51 | 52 | batch_size = 
len(input_text_seq_lens) 53 | 54 | ##run attentionSeq2Seq 55 | myLayouts, myAttentions, neg_entropy, log_seq_prob = \ 56 | self.mySeq2seq(input_txt_variable, input_text_seq_lens, input_layout_variable,sample_token) 57 | 58 | 59 | layout_loss = None 60 | if input_layout_variable is not None: 61 | layout_loss = torch.mean(-log_seq_prob) 62 | 63 | predicted_layouts = np.asarray(myLayouts.cpu().data.numpy()) 64 | expr_list, expr_validity_array = self.assembler.assemble(predicted_layouts) 65 | 66 | ## group samples based on layout 67 | sample_groups_by_layout = unique_columns(predicted_layouts) 68 | 69 | ##run moduleNet 70 | answer_losses = None 71 | policy_gradient_losses = None 72 | avg_answer_loss =None 73 | total_loss = None 74 | updated_baseline = policy_gradient_baseline 75 | current_answer = np.zeros(batch_size) 76 | 77 | for sample_group in sample_groups_by_layout: 78 | if sample_group.shape == 0: 79 | continue 80 | 81 | first_in_group = sample_group[0] 82 | if expr_validity_array[first_in_group]: 83 | layout_exp = expr_list[first_in_group] 84 | 85 | if input_answers is None: 86 | ith_answer_variable = None 87 | else: 88 | ith_answer = input_answers[sample_group] 89 | ith_answer_variable = Variable(torch.LongTensor(ith_answer)) 90 | ith_answer_variable = ith_answer_variable.cuda() if use_cuda else ith_answer_variable 91 | 92 | textAttention = myAttentions[sample_group, :] 93 | 94 | ith_image = input_images[sample_group, :, :, :] 95 | ith_images_variable = Variable(torch.FloatTensor(ith_image)) 96 | ith_images_variable = ith_images_variable.cuda() if use_cuda else ith_images_variable 97 | 98 | ##image[batch_size, H_feat, W_feat, D_feat] ==> [batch_size, D_feat, W_feat, H_feat] for conv2d 99 | #ith_images_variable = ith_images_variable.permute(0, 3, 1, 2) 100 | 101 | ith_images_variable = ith_images_variable.contiguous() 102 | 103 | myAnswers = self.myModuleNet(input_image_variable=ith_images_variable, 104 | input_text_attention_variable=textAttention, 105 | target_answer_variable=ith_answer_variable, 106 | expr_list=layout_exp) 107 | current_answer[sample_group] = torch.topk(myAnswers, 1)[1].cpu().data.numpy()[:, 0] 108 | 109 | 110 | ##compute loss function only when answer is provided 111 | if ith_answer_variable is not None: 112 | current_answer_loss = self.answer_criterion(myAnswers, ith_answer_variable) 113 | sample_group_tensor = torch.cuda.LongTensor(sample_group) if use_cuda else torch.LongTensor(sample_group) 114 | 115 | current_log_seq_prob = log_seq_prob[sample_group_tensor] 116 | current_answer_loss_val = Variable(current_answer_loss.data,requires_grad=False) 117 | tmp1 = current_answer_loss_val - policy_gradient_baseline 118 | current_policy_gradient_loss = tmp1 * current_log_seq_prob 119 | 120 | if answer_losses is None: 121 | answer_losses = current_answer_loss 122 | policy_gradient_losses = current_policy_gradient_loss 123 | else: 124 | answer_losses = torch.cat((answer_losses, current_answer_loss)) 125 | policy_gradient_losses = torch.cat((policy_gradient_losses, current_policy_gradient_loss)) 126 | 127 | try: 128 | if input_answers is not None: 129 | total_loss, avg_answer_loss = self.layout_criterion(neg_entropy=neg_entropy, 130 | answer_loss=answer_losses, 131 | policy_gradient_losses=policy_gradient_losses, 132 | layout_loss=layout_loss) 133 | ##update layout policy baseline 134 | avg_sample_loss = torch.mean(answer_losses) 135 | avg_sample_loss_value = avg_sample_loss.cpu().data.numpy()[0] 136 | updated_baseline = policy_gradient_baseline + (1 - baseline_decay) * ( 
137 | avg_sample_loss_value - policy_gradient_baseline) 138 | 139 | except: 140 | print("sample_group = ", sample_group) 141 | print("neg_entropy=", neg_entropy) 142 | print("answer_losses=", answer_losses) 143 | print("policy_gradient_losses=", policy_gradient_losses) 144 | print("layout_loss=", layout_loss) 145 | sys.stdout.flush() 146 | sys.exit("Exception Occur") 147 | 148 | 149 | 150 | 151 | 152 | return total_loss, avg_answer_loss, current_answer, predicted_layouts, expr_validity_array, updated_baseline 153 | 154 | 155 | 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /models/function2Module.py: -------------------------------------------------------------------------------- 1 | 2 | from models.modules import * 3 | 4 | function2module = { 5 | 'filter_color': FilterModule, 6 | 'filter_material': FilterModule, 7 | 'filter_shape': FilterModule, 8 | 'filter_size': FilterModule, 9 | 10 | 'same_color': FindSamePropertyModule, 11 | 'same_material': FindSamePropertyModule, 12 | 'same_shape': FindSamePropertyModule, 13 | 'same_size': FindSamePropertyModule, 14 | 15 | 'relate': TransformModule, 16 | 'intersect': AndModule, 17 | 'union': OrModule, 18 | 19 | 'count': CountModule, 20 | 'exist': ExistModule, 21 | 'equal_integer': EqualNumModule, 22 | 'greater_than': MoreNumModule, 23 | 'less_than': LessNumModule, 24 | 25 | 'equal_color': SamePropertyModule, 26 | 'equal_material': SamePropertyModule, 27 | 'equal_shape': SamePropertyModule, 28 | 'equal_size': SamePropertyModule, 29 | 30 | 'query_color': DescribeModule, 31 | 'query_material': DescribeModule, 32 | 'query_shape': DescribeModule, 33 | 'query_size': DescribeModule, 34 | 35 | 'scene': SceneModule, 36 | 'unique': None 37 | } -------------------------------------------------------------------------------- /models/layout_assembler.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import numpy as np 4 | 5 | # the number of attention input to each module 6 | _module_input_num = { 7 | '_Scene': 0, 8 | '_Find': 0, 9 | '_Filter': 1, 10 | '_FindSameProperty': 1, 11 | '_Transform': 1, 12 | '_And': 2, 13 | '_Or': 2, 14 | '_Count': 1, 15 | '_Exist': 1, 16 | '_EqualNum': 2, 17 | '_MoreNum': 2, 18 | '_LessNum': 2, 19 | '_SameProperty': 2, 20 | '_Describe': 1} 21 | 22 | # output type of each module 23 | _module_output_type = { 24 | '_Scene': 'att', 25 | '_Find': 'att', 26 | '_Filter': 'att', 27 | '_FindSameProperty': 'att', 28 | '_Transform': 'att', 29 | '_And': 'att', 30 | '_Or': 'att', 31 | '_Count': 'ans', 32 | '_Exist': 'ans', 33 | '_EqualNum': 'ans', 34 | '_MoreNum': 'ans', 35 | '_LessNum': 'ans', 36 | '_SameProperty': 'ans', 37 | '_Describe': 'ans'} 38 | 39 | INVALID_EXPR = 'INVALID_EXPR' 40 | # decoding validity: maintaining a state x of [#att, #ans, T_remain] 41 | # when T_remain is T_decoder when decoding the first module token 42 | # a token s can be predicted iff all( - b_s >= 0) 43 | # the validity token list is 44 | # XW - b >= 0 45 | # the state transition matrix is P, so the state update is X += S P, 46 | # where S is the predicted tokens (one-hot vectors) 47 | def _build_validity_mats(module_names): 48 | state_size = 3 49 | num_vocab_nmn = len(module_names) 50 | num_constraints = 4 51 | P = np.zeros((num_vocab_nmn, state_size), np.int32) 52 | W = np.zeros((state_size, num_vocab_nmn, num_constraints), np.int32) 53 | b = np.zeros((num_vocab_nmn, num_constraints), np.int32) 
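# Illustrative sketch (comments only, not part of the construction below): with
# the decoding state x = [#att, #ans, T_remain], a candidate token s is valid iff
#     all(x @ W[:, s, :] - b[s, :] >= 0)
# and, after the one-hot prediction S is emitted, the state advances as x += S @ P.
# For example, at the first decoding step x = [0, 0, T_decoder]; constraint 0
# (enough attentions on the stack) rules out every module that consumes an
# attention input, so a layout can only start with a zero-input module such as
# _Find or _Scene, never with _Describe or _And.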
54 | 55 | # collect the input and output numbers of each module 56 | att_in_nums = np.zeros(num_vocab_nmn) 57 | att_out_nums = np.zeros(num_vocab_nmn) 58 | ans_out_nums = np.zeros(num_vocab_nmn) 59 | for n_s, s in enumerate(module_names): 60 | if s != '': 61 | att_in_nums[n_s] = _module_input_num[s] 62 | att_out_nums[n_s] = _module_output_type[s] == 'att' 63 | ans_out_nums[n_s] = _module_output_type[s] == 'ans' 64 | # construct the trasition matrix P 65 | for n_s, s in enumerate(module_names): 66 | P[n_s, 0] = att_out_nums[n_s] - att_in_nums[n_s] 67 | P[n_s, 1] = ans_out_nums[n_s] 68 | P[n_s, 2] = -1 69 | # construct the validity W and b 70 | att_absorb_nums = (att_in_nums - att_out_nums) 71 | max_att_absorb_nonans = np.max(att_absorb_nums * (ans_out_nums == 0)) 72 | max_att_absorb_ans = np.max(att_absorb_nums * (ans_out_nums != 0)) 73 | for n_s, s in enumerate(module_names): 74 | if s != '': 75 | # constraint: a non- module can be outputted iff all the following holds 76 | # * 0) there's enough att in the stack 77 | # #att >= att_in_nums[n_s] 78 | W[0, n_s, 0] = 1 79 | b[n_s, 0] = att_in_nums[n_s] 80 | # * 1) for answer modules, there's no extra att in the stack 81 | # #att <= att_in_nums[n_s] 82 | # -#att >= -att_in_nums[n_s] 83 | # for non-answer modules, T_remain >= 3 84 | # (the last two has to be AnswerType and ) 85 | if ans_out_nums[n_s] != 0: 86 | W[0, n_s, 1] = -1 87 | b[n_s, 1] = -att_in_nums[n_s] 88 | else: 89 | W[2, n_s, 1] = 1 90 | b[n_s, 1] = 3 91 | # * 2) there's no answer in the stack (otherwise only) 92 | # #ans <= 0 93 | # -#ans >= 0 94 | W[1, n_s, 2] = -1 95 | # * 3) there's enough time to consume the all attentions, output answer plus 96 | # 3.1) for non-answer modules, we already have T_remain>= 3 from constraint 2 97 | # In maximum (T_remain-3) further steps 98 | # (plus 3 steps for this, ans, ) to consume atts 99 | # (T_remain-3) * max_att_absorb_nonans + max_att_absorb_ans + att_absorb_nums[n_s] >= #att 100 | # T_remain*MANA - #att >= 3*MANA - MAA - A[s] 101 | # - #att + MANA * T_remain >= 3*MANA - MAA - A[s] 102 | # 3.2) for answer modules, if it can be decoded then constraint 0&1 ensures 103 | # that there'll be no att left in stack after decoding this answer, 104 | # hence no further constraints here 105 | if ans_out_nums[n_s] == 0: 106 | W[0, n_s, 3] = -1 107 | W[2, n_s, 3] = max_att_absorb_nonans 108 | b[n_s, 3] = 3*max_att_absorb_nonans - max_att_absorb_ans - att_absorb_nums[n_s] 109 | else: # -case 110 | # constraint: a token can be outputted iff all the following holds 111 | # * 0) there's ans in the stack 112 | # #ans >= 1 113 | W[1, n_s, 0] = 1 114 | b[n_s, 0] = 1 115 | 116 | return P, W, b 117 | 118 | class Assembler: 119 | def __init__(self, module_vocab_file): 120 | # read the module list, and record the index of each module and 121 | with open(module_vocab_file) as f: 122 | self.module_names = [s.strip() for s in f.readlines()] 123 | # find the index of 124 | for n_s in range(len(self.module_names)): 125 | if self.module_names[n_s] == '': 126 | self.EOS_idx = n_s 127 | break 128 | # build a dictionary from module name to token index 129 | self.name2idx_dict = {name: n_s for n_s, name in enumerate(self.module_names)} 130 | self.num_vocab_nmn = len(self.module_names) 131 | 132 | self.P, self.W, self.b = _build_validity_mats(self.module_names) 133 | 134 | def module_list2tokens(self, module_list, T=None): 135 | layout_tokens = [self.name2idx_dict[name] for name in module_list] 136 | if T is not None: 137 | if len(module_list) >= T: 138 | raise 
ValueError('Not enough time steps to add ') 139 | layout_tokens += [self.EOS_idx]*(T-len(module_list)) 140 | return layout_tokens 141 | 142 | def _layout_tokens2str(self, layout_tokens): 143 | return ' '.join([self.module_names[idx] for idx in layout_tokens]) 144 | 145 | def _invalid_expr(self, layout_tokens, error_str): 146 | return {'module': INVALID_EXPR, 147 | 'expr_str': self._layout_tokens2str(layout_tokens), 148 | 'error': error_str} 149 | 150 | def _assemble_layout_tokens(self, layout_tokens, batch_idx): 151 | # All modules takes a time_idx as the index from LSTM hidden states 152 | # (even if it doesn't need it, like _And), and different arity of 153 | # attention inputs. The output type can be either attention or answer 154 | # 155 | # The final assembled expression for each instance is as follows: 156 | # expr_type := 157 | # {'module': '_Find', 'output_type': 'att', 'time_idx': idx} 158 | # | {'module': '_Transform', 'output_type': 'att', 'time_idx': idx, 159 | # 'inputs_0': } 160 | # | {'module': '_And', 'output_type': 'att', 'time_idx': idx, 161 | # 'inputs_0': , 'inputs_1': )} 162 | # | {'module': '_Answer', 'output_type': 'ans', 'time_idx': idx, 163 | # 'inputs_0': } 164 | # | {'module': INVALID_EXPR, 'expr_str': '...', 'error': '...', 165 | # 'assembly_loss': } (for invalid expressions) 166 | # 167 | 168 | # A valid layout must contain . Assembly fails if it doesn't. 169 | if not np.any(layout_tokens == self.EOS_idx): 170 | return self._invalid_expr(layout_tokens, 'cannot find ') 171 | 172 | # Decoding Reverse Polish Notation with a stack 173 | decoding_stack = [] 174 | for t in range(len(layout_tokens)): 175 | # decode a module/operation 176 | module_idx = layout_tokens[t] 177 | if module_idx == self.EOS_idx: 178 | break 179 | module_name = self.module_names[module_idx] 180 | expr = {'module': module_name, 181 | 'output_type': _module_output_type[module_name], 182 | 'time_idx': t, 'batch_idx': batch_idx} 183 | 184 | input_num = _module_input_num[module_name] 185 | # Check if there are enough input in the stack 186 | if len(decoding_stack) < input_num: 187 | # Invalid expression. Not enough input. 188 | return self._invalid_expr(layout_tokens, 'not enough input for ' + module_name) 189 | 190 | # Get the input from stack 191 | for n_input in range(input_num-1, -1, -1): 192 | stack_top = decoding_stack.pop() 193 | if stack_top['output_type'] != 'att': 194 | # Invalid expression. Input must be attention 195 | return self._invalid_expr(layout_tokens, 'input incompatible for ' + module_name) 196 | expr['input_%d' % n_input] = stack_top 197 | 198 | decoding_stack.append(expr) 199 | 200 | # After decoding the reverse polish expression, there should be exactly 201 | # one expression in the stack 202 | if len(decoding_stack) != 1: 203 | return self._invalid_expr(layout_tokens, 'final stack size not equal to 1 (%d remains)' % len(decoding_stack)) 204 | 205 | result = decoding_stack[0] 206 | # The result type should be answer, not attention 207 | if result['output_type'] != 'ans': 208 | return self._invalid_expr(layout_tokens, 'result type must be ans, not att') 209 | return result 210 | 211 | def assemble(self, layout_tokens_batch): 212 | # layout_tokens_batch is a numpy array with shape [T, N], 213 | # containing module tokens and , in Reverse Polish Notation. 
214 | _, N = layout_tokens_batch.shape 215 | expr_list = [self._assemble_layout_tokens(layout_tokens_batch[:, n], n) 216 | for n in range(N)] 217 | expr_validity = np.array([expr['module'] != INVALID_EXPR 218 | for expr in expr_list], np.bool) 219 | return expr_list, expr_validity 220 | -------------------------------------------------------------------------------- /models/module_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from models.modules import * 5 | from torch.autograd import Variable 6 | 7 | 8 | use_cuda = torch.cuda.is_available() 9 | 10 | 11 | 12 | class module_net(nn.Module): 13 | 14 | ##initiate all small modules which will be used here 15 | def __init__(self, image_height, image_width, in_image_dim, in_text_dim, out_num_choices, map_dim): 16 | super(module_net,self).__init__() 17 | self.image_height = image_height 18 | self.image_width = image_width 19 | self.in_image_dim = in_image_dim 20 | self.in_text_dim = in_text_dim 21 | self.out_num_choices = out_num_choices 22 | self.map_dim = map_dim 23 | self.SceneModule = SceneModule() 24 | self.FindModule = FindModule(image_dim=in_image_dim, text_dim=in_text_dim, map_dim= map_dim) 25 | self.TransformModule = TransformModule(image_dim=in_image_dim, text_dim=in_text_dim, map_dim = map_dim) 26 | self.AndModule = AndModule() 27 | self.OrModule = OrModule() 28 | self.FilterModule = FilterModule(findModule=self.FindModule, andModule=self.AndModule) 29 | self.FindSamePropertyModule = FindSamePropertyModule( 30 | output_num_choice=out_num_choices,image_dim=in_image_dim, text_dim=in_text_dim, map_dim = map_dim) 31 | 32 | self.CountModule = CountModule(output_num_choice=out_num_choices, 33 | image_height=image_height, image_width= image_width) 34 | 35 | self.ExistModule = ExistModule(output_num_choice=out_num_choices, 36 | image_height=image_height, image_width= image_width) 37 | 38 | self.EqualNumModule = EqualNumModule(output_num_choice=out_num_choices, 39 | image_height=image_height, image_width= image_width) 40 | 41 | self.MoreNumModule = MoreNumModule(output_num_choice=out_num_choices, 42 | image_height=image_height, image_width= image_width) 43 | 44 | self.LessNumModule = LessNumModule(output_num_choice=out_num_choices, 45 | image_height=image_height, image_width= image_width) 46 | 47 | self.SamePropertyModule = SamePropertyModule( 48 | output_num_choice=out_num_choices,image_dim=in_image_dim,text_dim=in_text_dim, map_dim=map_dim) 49 | 50 | self.DescribeModule = DescribeModule( 51 | output_num_choice=out_num_choices,image_dim=in_image_dim, text_dim=in_text_dim, map_dim = map_dim) 52 | 53 | self.layout2module = { 54 | '_Filter': self.FilterModule, 55 | '_FindSameProperty': self.FindSamePropertyModule, 56 | '_Transform': self.TransformModule, 57 | '_And': self.AndModule, 58 | '_Or': self.OrModule, 59 | '_Count': self.CountModule, 60 | '_Exist': self.ExistModule, 61 | '_EqualNum': self.EqualNumModule, 62 | '_MoreNum': self.MoreNumModule, 63 | '_LessNum': self.LessNumModule, 64 | '_SameProperty': self.SamePropertyModule, 65 | '_Describe': self.DescribeModule, 66 | '_Find': self.FindModule, 67 | '_Scene': self.SceneModule 68 | } 69 | 70 | #text[N, D_text] 71 | 72 | def recursively_assemble_network(self,input_image_variable, input_text_attention_variable,expr_list): 73 | current_module = self.layout2module[expr_list['module']] 74 | time_idx = expr_list['time_idx'] 75 | text_index = 
Variable(torch.LongTensor([time_idx])) 76 | text_index = text_index.cuda() if use_cuda else text_index 77 | text_at_time = torch.index_select(input_text_attention_variable, dim=1, 78 | index=text_index).view(-1, self.in_text_dim) 79 | 80 | input_0 = None 81 | input_1 = None 82 | 83 | if 'input_0' in expr_list: 84 | input_0 = self.recursively_assemble_network(input_image_variable, 85 | input_text_attention_variable, expr_list['input_0']) 86 | if 'input_1' in expr_list: 87 | input_1 = self.recursively_assemble_network(input_image_variable, input_text_attention_variable, 88 | expr_list['input_1']) 89 | 90 | res = current_module(input_image_variable, text_at_time, input_0, input_1) 91 | return res 92 | 93 | 94 | def forward(self, input_image_variable, input_text_attention_variable, target_answer_variable, expr_list): 95 | 96 | 97 | ##for now assume batch_size = 1 98 | result = self.recursively_assemble_network(input_image_variable,input_text_attention_variable,expr_list) 99 | 100 | return result 101 | 102 | 103 | -------------------------------------------------------------------------------- /models/modules.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | 7 | use_cuda = torch.cuda.is_available() 8 | 9 | 10 | ''' 11 | NOTE: in all modules, 12 | image_feat [N,D_image,H,W] 13 | text [N,D_text] 14 | attention [N,1,H,W] 15 | ''' 16 | 17 | 18 | 19 | class SceneModule(nn.Module): 20 | def __init__(self): 21 | super(SceneModule,self).__init__() 22 | 23 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 24 | N, _, H, W = input_image_feat.shape 25 | res = torch.ones((N, 1, H, W)) 26 | att_grid = Variable(res) 27 | att_grid = att_grid.cuda() if use_cuda else att_grid 28 | return att_grid 29 | 30 | 31 | class FindModule(nn.Module): 32 | ''' 33 | Mapping image_feat_grid X text_param ->att.grid 34 | (N,D_image,H,W) X (N,1,D_text) --> [N,1,H,W] 35 | ''' 36 | def __init__(self, image_dim, text_dim, map_dim): 37 | super(FindModule,self).__init__() 38 | self.map_dim = map_dim 39 | self.conv1 = nn.Conv2d(image_dim,map_dim,kernel_size=1) 40 | self.conv2 = nn.Conv2d(map_dim, 1, kernel_size=1) 41 | self.textfc = nn.Linear(text_dim,map_dim) 42 | 43 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 44 | image_mapped = self.conv1(input_image_feat) #(N, map_dim, H, W) 45 | text_mapped = self.textfc(input_text).view(-1, self.map_dim,1,1).expand_as(image_mapped) 46 | elmtwize_mult = image_mapped * text_mapped 47 | elmtwize_mult = F.normalize(elmtwize_mult, p=2, dim=1) #(N, map_dim, H, W) 48 | att_grid = self.conv2(elmtwize_mult) #(N, 1, H, W) 49 | return att_grid 50 | 51 | 52 | 53 | class FilterModule(nn.Module): 54 | def __init__(self, findModule, andModule): 55 | super(FilterModule,self).__init__() 56 | self.andModule = andModule 57 | self.findModule = findModule 58 | 59 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 60 | find_result = self.findModule(input_image_feat,input_text,input_image_attention1,input_image_attention2) 61 | att_grid = self.andModule(input_image_feat,input_text,input_image_attention1,find_result) 62 | return att_grid 63 | 64 | 65 | class FindSamePropertyModule(nn.Module): 66 | def __init__(self,output_num_choice, image_dim, text_dim, map_dim): 67 | 
super(FindSamePropertyModule,self).__init__() 68 | self.out_num_choice = output_num_choice 69 | self.image_dim = image_dim 70 | self.map_dim = map_dim 71 | self.text_fc = nn.Linear(text_dim, map_dim) 72 | self.att_fc_1 = nn.Linear(image_dim, map_dim) 73 | self.conv1 = nn.Conv2d(image_dim, map_dim, kernel_size=1) 74 | self.conv2 = nn.Conv2d(map_dim, 1, kernel_size=1) 75 | 76 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 77 | H, W = input_image_attention1.shape[2:4] 78 | att_softmax_1 = F.softmax(input_image_attention1.view(-1, H * W),dim=1).view(-1, 1, H*W) 79 | image_reshape = input_image_feat.view(-1,self.image_dim,H * W) 80 | att_feat_1 = torch.sum(att_softmax_1 * image_reshape, dim=2) #[N, image_dim] 81 | att_feat_1_mapped = self.att_fc_1(att_feat_1).view(-1, self.map_dim,1,1) #[N, map_dim,1,1] 82 | 83 | text_mapped = self.text_fc(input_text).view(-1,self.map_dim,1,1) 84 | 85 | image_mapped = self.conv1(input_image_feat) # (N, map_dim, H, W) 86 | 87 | elmtwize_mult = image_mapped * text_mapped * att_feat_1_mapped #[N, map_dim, H, W] 88 | elmtwize_mult = F.normalize(elmtwize_mult, p=2, dim=1) 89 | 90 | att_grid = self.conv2(elmtwize_mult) 91 | 92 | return att_grid 93 | 94 | 95 | class TransformModule(nn.Module): 96 | def __init__(self, image_dim, text_dim, map_dim,kernel_size=5, padding=2): 97 | super(TransformModule,self).__init__() 98 | self.map_dim = map_dim 99 | self.conv1 = nn.Conv2d(1, map_dim, kernel_size=kernel_size, padding=padding) 100 | self.conv2 = nn.Conv2d(map_dim, 1, kernel_size=1) 101 | self.textfc = nn.Linear(text_dim,map_dim) 102 | 103 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 104 | image_att_mapped = self.conv1(input_image_attention1) #(N, map_dim, H, W) 105 | text_mapped = self.textfc(input_text).view(-1, self.map_dim,1,1).expand_as(image_att_mapped) 106 | elmtwize_mult = image_att_mapped * text_mapped 107 | elmtwize_mult = F.normalize(elmtwize_mult, p=2, dim=1) #(N, map_dim, H, W) 108 | att_grid = self.conv2(elmtwize_mult) #(N, 1, H, W) 109 | return att_grid 110 | 111 | 112 | class AndModule(nn.Module): 113 | def __init__(self): 114 | super(AndModule,self).__init__() 115 | 116 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 117 | return torch.max(input_image_attention1, input_image_attention2) 118 | 119 | 120 | class OrModule(nn.Module): 121 | def __init__(self): 122 | super(OrModule,self).__init__() 123 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 124 | return torch.min(input_image_attention1, input_image_attention2) 125 | 126 | 127 | 128 | class CountModule(nn.Module): 129 | def __init__(self,output_num_choice, image_height, image_width): 130 | super(CountModule,self).__init__() 131 | self.out_num_choice = output_num_choice 132 | self.lc_out = nn.Linear(image_height*image_width + 3, self.out_num_choice) 133 | 134 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 135 | H, W = input_image_attention1.shape[2:4] 136 | att_all = input_image_attention1.view(-1, H*W) ##flatten attention to [N, H*W] 137 | att_avg = torch.mean(att_all, 1, keepdim=True) 138 | att_min = torch.min(att_all, 1, keepdim=True)[0] 139 | att_max = torch.max(att_all,1, keepdim=True)[0] 140 | att_concat = torch.cat((att_all, att_avg, att_min, att_max), 1) 141 | scores = self.lc_out(att_concat) 
142 | return scores 143 | 144 | 145 | 146 | 147 | class ExistModule(nn.Module): 148 | def __init__(self,output_num_choice, image_height, image_width): 149 | super(ExistModule,self).__init__() 150 | self.out_num_choice = output_num_choice 151 | self.lc_out = nn.Linear(image_height*image_width + 3, self.out_num_choice) 152 | 153 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 154 | H, W = input_image_attention1.shape[2:4] 155 | att_all = input_image_attention1.view(-1, H*W) ##flatten attention to [N, H*W] 156 | att_avg = torch.mean(att_all, 1, keepdim=True) 157 | att_min = torch.min(att_all, 1, keepdim=True)[0] 158 | att_max = torch.max(att_all, 1, keepdim=True)[0] 159 | att_concat = torch.cat((att_all, att_avg, att_min, att_max), 1) 160 | scores = self.lc_out(att_concat) 161 | return scores 162 | 163 | 164 | class EqualNumModule(nn.Module): 165 | def __init__(self,output_num_choice, image_height, image_width): 166 | super(EqualNumModule,self).__init__() 167 | self.out_num_choice = output_num_choice 168 | self.lc_out = nn.Linear(image_height*image_width *2 + 6, self.out_num_choice) 169 | 170 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 171 | H, W = input_image_attention1.shape[2:4] 172 | att1_all = input_image_attention1.view(-1, H * W) ##flatten attention to [N, H*W] 173 | att1_avg = torch.mean(att1_all, 1, keepdim=True) 174 | att1_min = torch.min(att1_all, 1, keepdim=True)[0] 175 | att1_max = torch.max(att1_all, 1, keepdim=True)[0] 176 | 177 | att2_all = input_image_attention2.view(-1, H * W) ##flatten attention to [N, H*W] 178 | att2_avg = torch.mean(att2_all, 1, keepdim=True) 179 | att2_min = torch.min(att2_all, 1, keepdim=True)[0] 180 | att2_max = torch.max(att2_all, 1, keepdim=True)[0] 181 | 182 | att_concat = torch.cat((att1_all, att1_avg, att1_min, att1_max,att2_all, att2_avg, att2_min, att2_max), 1) 183 | scores = self.lc_out(att_concat) 184 | return scores 185 | 186 | class MoreNumModule(nn.Module): 187 | def __init__(self, output_num_choice, image_height, image_width): 188 | super(MoreNumModule, self).__init__() 189 | self.out_num_choice = output_num_choice 190 | self.lc_out = nn.Linear(image_height * image_width * 2 + 6, self.out_num_choice) 191 | 192 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 193 | H, W = input_image_attention1.shape[2:4] 194 | att1_all = input_image_attention1.view(-1, H * W) ##flatten attention to [N, H*W] 195 | att1_avg = torch.mean(att1_all, 1, keepdim=True) 196 | att1_min = torch.min(att1_all, 1, keepdim=True)[0] 197 | att1_max = torch.max(att1_all, 1, keepdim=True)[0] 198 | 199 | att2_all = input_image_attention2.view(-1, H * W) ##flatten attention to [N, H*W] 200 | att2_avg = torch.mean(att2_all, 1, keepdim=True) 201 | att2_min = torch.min(att2_all, 1, keepdim=True)[0] 202 | att2_max = torch.max(att2_all, 1, keepdim=True)[0] 203 | 204 | att_concat = torch.cat((att1_all, att1_avg, att1_min, att1_max, att2_all, att2_avg, att2_min, att2_max), 1) 205 | scores = self.lc_out(att_concat) 206 | return scores 207 | 208 | class LessNumModule(nn.Module): 209 | def __init__(self, output_num_choice, image_height, image_width): 210 | super(LessNumModule, self).__init__() 211 | self.out_num_choice = output_num_choice 212 | self.lc_out = nn.Linear(image_height * image_width * 2 + 6, self.out_num_choice) 213 | 214 | def forward(self, input_image_feat, input_text, 
input_image_attention1=None, input_image_attention2=None): 215 | H, W = input_image_attention1.shape[2:4] 216 | att1_all = input_image_attention1.view(-1, H * W) ##flatten attention to [N, H*W] 217 | att1_avg = torch.mean(att1_all, 1, keepdim=True) 218 | att1_min = torch.min(att1_all, 1, keepdim=True)[0] 219 | att1_max = torch.max(att1_all, 1, keepdim=True)[0] 220 | 221 | att2_all = input_image_attention2.view(-1, H * W) ##flatten attention to [N, H*W] 222 | att2_avg = torch.mean(att2_all, 1, keepdim=True) 223 | att2_min = torch.min(att2_all, 1, keepdim=True)[0] 224 | att2_max = torch.max(att2_all, 1, keepdim=True)[0] 225 | 226 | att_concat = torch.cat((att1_all, att1_avg, att1_min, att1_max, att2_all, att2_avg, att2_min, att2_max), 1) 227 | scores = self.lc_out(att_concat) 228 | return scores 229 | 230 | class SamePropertyModule(nn.Module): 231 | def __init__(self,output_num_choice, image_dim, text_dim, map_dim): 232 | super(SamePropertyModule,self).__init__() 233 | self.out_num_choice = output_num_choice 234 | self.image_dim = image_dim 235 | self.text_fc = nn.Linear(text_dim, map_dim) 236 | self.att_fc_1 = nn.Linear(image_dim, map_dim) 237 | self.att_fc_2 = nn.Linear(image_dim, map_dim) 238 | self.lc_out = nn.Linear(map_dim, self.out_num_choice) 239 | 240 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 241 | H, W = input_image_attention1.shape[2:4] 242 | att_softmax_1 = F.softmax(input_image_attention1.view(-1, H * W),dim=1).view(-1, 1, H*W) 243 | att_softmax_2 = F.softmax(input_image_attention2.view(-1, H * W), dim=1).view(-1, 1, H*W) 244 | image_reshape = input_image_feat.view(-1,self.image_dim,H * W) 245 | att_feat_1 = torch.sum(att_softmax_1 * image_reshape, dim=2) #[N, image_dim] 246 | att_feat_2 = torch.sum(att_softmax_2 * image_reshape, dim=2) 247 | att_feat_1_mapped = self.att_fc_1(att_feat_1) #[N, map_dim] 248 | att_feat_2_mapped = self.att_fc_2(att_feat_2) 249 | 250 | text_mapped = self.text_fc(input_text) 251 | elmtwize_mult = att_feat_1_mapped * text_mapped * att_feat_2_mapped #[N, map_dim] 252 | elmtwize_mult = F.normalize(elmtwize_mult, p=2, dim=1) 253 | scores = self.lc_out(elmtwize_mult) 254 | 255 | return scores 256 | 257 | class DescribeModule(nn.Module): 258 | def __init__(self,output_num_choice, image_dim, text_dim, map_dim): 259 | super(DescribeModule,self).__init__() 260 | self.out_num_choice = output_num_choice 261 | self.image_dim = image_dim 262 | self.text_fc = nn.Linear(text_dim, map_dim) 263 | self.att_fc_1 = nn.Linear(image_dim, map_dim) 264 | self.lc_out = nn.Linear(map_dim, self.out_num_choice) 265 | 266 | def forward(self, input_image_feat, input_text, input_image_attention1=None, input_image_attention2=None): 267 | H, W = input_image_attention1.shape[2:4] 268 | att_softmax_1 = F.softmax(input_image_attention1.view(-1, H * W),dim=1).view(-1, 1, H*W) 269 | image_reshape = input_image_feat.view(-1,self.image_dim,H * W) #[N,image_dim,H*W] 270 | att_feat_1 = torch.sum(att_softmax_1 * image_reshape, dim=2) #[N, image_dim] 271 | att_feat_1_mapped = self.att_fc_1(att_feat_1) #[N, map_dim] 272 | 273 | text_mapped = self.text_fc(input_text) 274 | elmtwize_mult = att_feat_1_mapped * text_mapped #[N, map_dim] 275 | elmtwize_mult = F.normalize(elmtwize_mult, p=2, dim=1) 276 | scores = self.lc_out(elmtwize_mult) 277 | 278 | return scores -------------------------------------------------------------------------------- /tools/build_clevr_imdb.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import os 4 | 5 | import sys 6 | from Utils import text_processing 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser() 10 | 11 | parser.add_argument("--data_dir",type=str, required=True, help="directory for data ") 12 | parser.add_argument("--out_dir",type=str, required=True, help="output directory for json files") 13 | args = parser.parse_args() 14 | data_dir = args.data_dir 15 | out_dir = args.out_dir 16 | 17 | question_file = 'CLEVR_%s_questions_gt_layout.json' 18 | 19 | def build_imdb(image_set): 20 | print('building imdb %s' % image_set) 21 | question_file_name = (question_file % image_set) 22 | question_file_path = os.path.join(data_dir, question_file_name ) 23 | with open(question_file_path) as f: 24 | questions = json.load(f) 25 | imdb = [None]*len(questions) 26 | for n_q, q in enumerate(questions): 27 | if (n_q+1) % 10000 == 0: 28 | print('processing %d / %d' % (n_q+1, len(questions))) 29 | image_name = q['image_filename'].split('.')[0] 30 | feature_name = image_name + '.npy' 31 | question_str = q['question'] 32 | question_tokens = text_processing.tokenize(question_str) 33 | gt_layout_tokens = None 34 | if 'gt_layout' in q: 35 | gt_layout_tokens = q['gt_layout'] 36 | answer = None 37 | if 'answer' in q: 38 | answer = q['answer'] 39 | 40 | iminfo = dict(image_name=image_name, 41 | feature_path=feature_name, 42 | question_str=question_str, 43 | question_tokens=question_tokens, 44 | gt_layout_tokens=gt_layout_tokens, 45 | answer=answer) 46 | imdb[n_q] = iminfo 47 | return imdb 48 | 49 | 50 | imdb_trn = build_imdb('train') 51 | imdb_val = build_imdb('val') 52 | imdb_tst = build_imdb('test') 53 | 54 | os.makedirs('out_dir', exist_ok=True) 55 | 56 | out_trn = os.path.join(out_dir, 'imdb_trn.npy') 57 | out_val = os.path.join(out_dir, 'imdb_val.npy') 58 | out_tst = os.path.join(out_dir, 'imdb_tst.npy') 59 | 60 | np.save(out_trn, np.array(imdb_trn)) 61 | np.save(out_val, np.array(imdb_val)) 62 | np.save(out_tst, np.array(imdb_tst)) 63 | -------------------------------------------------------------------------------- /tools/build_vqa_imdb.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import os 4 | from collections import defaultdict 5 | import sys 6 | from Utils import text_processing 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--data_dir", type=str, required=True, help="data directory") 11 | parser.add_argument("--out_dir", type=str, required=True, help="imdb output directory") 12 | args = parser.parse_args() 13 | 14 | data_dir = args.data_dir 15 | out_dir = args.out_dir 16 | 17 | ''' 18 | vocab_answer_file = './answers_vqa.txt' 19 | annotation_file = '../vqa-dataset/Annotations/mscoco_%s_annotations.json' 20 | question_file = '../vqa-dataset/Questions/OpenEnded_mscoco_%s_questions.json' 21 | gt_layout_file = './gt_layout_%s_new_parse.npy' 22 | 23 | image_dir = '../vqa-dataset/Images/%s/' 24 | feature_dir = './resnet_res5c/%s/' 25 | ''' 26 | 27 | vocab_answer_file = os.path.join(out_dir, 'answers_vqa.txt') 28 | gt_layout_file = os.path.join(out_dir, 'gt_layout_%s_new_parse.npy') 29 | 30 | annotation_file = os.path.join(data_dir, 'mscoco_%s_annotations.json') 31 | question_file = os.path.join(data_dir, 'OpenEnded_mscoco_%s_questions.json') 32 | 33 | 34 | #image_dir = '../vqa-dataset/Images/%s/' 35 | #feature_dir = 
'./resnet_res5c/%s/' 36 | 37 | 38 | 39 | answer_dict = text_processing.VocabDict(vocab_answer_file) 40 | valid_answer_set = set(answer_dict.word_list) 41 | 42 | def extract_answers(q_answers): 43 | all_answers = [answer["answer"] for answer in q_answers] 44 | valid_answers = [a for a in all_answers if a in valid_answer_set] 45 | return all_answers, valid_answers 46 | 47 | def build_imdb(image_set): 48 | print('building imdb %s' % image_set) 49 | if image_set in ['train2014', 'val2014']: 50 | load_answer = True 51 | load_gt_layout = True 52 | with open(annotation_file % image_set) as f: 53 | annotations = json.load(f)["annotations"] 54 | qid2ann_dict = {ann['question_id']: ann for ann in annotations} 55 | qid2layout_dict = np.load(gt_layout_file % image_set)[()] 56 | else: 57 | load_answer = False 58 | load_gt_layout = False 59 | with open(question_file % image_set) as f: 60 | questions = json.load(f)['questions'] 61 | coco_set_name = image_set.replace('-dev', '') 62 | #abs_image_dir = os.path.abspath(image_dir % coco_set_name) 63 | #abs_feature_dir = os.path.abspath(feature_dir % coco_set_name) 64 | image_name_template = 'COCO_' + coco_set_name + '_%012d' 65 | imdb = [None]*len(questions) 66 | 67 | unk_ans_count = 0 68 | for n_q, q in enumerate(questions): 69 | if (n_q+1) % 10000 == 0: 70 | print('processing %d / %d' % (n_q+1, len(questions))) 71 | image_id = q['image_id'] 72 | question_id = q['question_id'] 73 | image_name = image_name_template % image_id 74 | #image_path = os.path.join(abs_image_dir, image_name + '.jpg') 75 | feature_path = image_name + '.npy' 76 | #feature_path = os.path.join(abs_feature_dir, image_name + '.npy') 77 | question_str = q['question'] 78 | question_tokens = text_processing.tokenize(question_str) 79 | 80 | iminfo = dict(image_name=image_name, 81 | image_id=image_id, 82 | question_id=question_id, 83 | feature_path=feature_path, 84 | question_str=question_str, 85 | question_tokens=question_tokens) 86 | 87 | # load answers 88 | if load_answer: 89 | ann = qid2ann_dict[question_id] 90 | all_answers, valid_answers = extract_answers(ann['answers']) 91 | if len(valid_answers) == 0: 92 | valid_answers = [''] 93 | unk_ans_count += 1 94 | iminfo['all_answers'] = all_answers 95 | iminfo['valid_answers'] = valid_answers 96 | 97 | if load_gt_layout: 98 | gt_layout_tokens = qid2layout_dict[question_id] 99 | iminfo['gt_layout_tokens'] = gt_layout_tokens 100 | 101 | imdb[n_q] = iminfo 102 | print('total %d out of %d answers are ' % (unk_ans_count, len(questions))) 103 | return imdb 104 | 105 | imdb_train2014 = build_imdb('train2014') 106 | imdb_val2014 = build_imdb('val2014') 107 | imdb_test2015 = build_imdb('test2015') 108 | imdb_test_dev2015 = build_imdb('test-dev2015') 109 | 110 | 111 | imdb_dir = os.path.join(out_dir,'imdb') 112 | os.makedirs(imdb_dir, exist_ok=True) 113 | np.save(os.path.join(imdb_dir, 'imdb_train2014.npy'), np.array(imdb_train2014)) 114 | np.save(os.path.join(imdb_dir, 'imdb_val2014.npy'), np.array(imdb_val2014)) 115 | np.save(os.path.join(imdb_dir, 'imdb_trainval2014.npy'), np.array(imdb_train2014 + imdb_val2014)) 116 | np.save(os.path.join(imdb_dir, 'imdb_test2015.npy'), np.array(imdb_test2015)) 117 | np.save(os.path.join(imdb_dir, 'imdb_test-dev2015.npy'), np.array(imdb_test_dev2015)) 118 | -------------------------------------------------------------------------------- /tools/extract_visual_features_vgg_pool5.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | parser = 
argparse.ArgumentParser() 3 | parser.add_argument('--gpu_id', type=int, default=0) 4 | parser.add_argument("--data_dir",type=str, required=True) 5 | parser.add_argument("--out_dir",type=str, required=True) 6 | 7 | args = parser.parse_args() 8 | gpu_id = args.gpu_id # set GPU id to use 9 | import os; os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) 10 | import sys 11 | sys.path.append('../../') 12 | from glob import glob 13 | 14 | import skimage.io 15 | import skimage.color 16 | import numpy as np 17 | 18 | import torch 19 | import torchvision.models as models 20 | import torch.nn as nn 21 | from torch.autograd import Variable 22 | from global_variables.global_variables import use_cuda 23 | 24 | image_basedir = args.data_dir 25 | save_basedir = args.out_dir 26 | 27 | #H = 320 28 | #W = 480 29 | 30 | channel_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32) 31 | 32 | 33 | 34 | 35 | 36 | class vgg16_feature_module(nn.Module): 37 | def __init__(self, vgg16_model): 38 | super(vgg16_feature_module, self).__init__() 39 | self.feature_module = nn.Sequential(*list(list(vgg16_model.children())[0])) 40 | 41 | def forward(self, x): 42 | return self.feature_module(x) 43 | 44 | vgg16 = models.vgg16(pretrained=True) 45 | 46 | vgg16_feature = vgg16_feature_module(vgg16) 47 | vgg16_feature = vgg16_feature.cuda() if use_cuda else vgg16_feature 48 | 49 | def extract_image_pool5(impath): 50 | im = skimage.io.imread(impath)[..., :3] 51 | im_val = (im[np.newaxis, ...]- channel_mean) 52 | im_val = np.transpose(im_val,axes=(0,3,1,2)) 53 | im_val_tensor = torch.FloatTensor(im_val) 54 | im_val_variable = Variable(im_val_tensor) 55 | im_val_variable = im_val_variable.cuda() if use_cuda else im_val_variable 56 | pool5_val = vgg16_feature(im_val_variable) 57 | return pool5_val.data.cpu().numpy() 58 | 59 | def extract_dataset_pool5(image_dir, save_dir, ext_filter='*.png'): 60 | image_list = glob(image_dir + '/' + ext_filter) 61 | os.makedirs(save_dir, exist_ok=True) 62 | 63 | for n_im, impath in enumerate(image_list): 64 | if (n_im+1) % 100 == 0: 65 | print('processing %d / %d' % (n_im+1, len(image_list))) 66 | image_name = os.path.basename(impath).split('.')[0] 67 | save_path = os.path.join(save_dir, image_name + '.npy') 68 | if not os.path.exists(save_path): 69 | pool5_val = extract_image_pool5(impath) 70 | np.save(save_path, pool5_val) 71 | 72 | for image_set in ['train', 'val', 'test']: 73 | print('Extracting image set ' + image_set) 74 | extract_dataset_pool5(os.path.join(image_basedir, image_set), 75 | os.path.join(save_basedir, image_set)) 76 | print('Done.') -------------------------------------------------------------------------------- /tools/get_ground_truth_layout.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import os 4 | import argparse 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--question_dir', type=str, required=True, help="directory for questions") 8 | parser.add_argument('--out_dir', type=str, required=True) 9 | 10 | args = parser.parse_args() 11 | question_dir = args.question_dir 12 | out_dir = args.out_dir 13 | 14 | function2module = { 15 | 'filter_color': '_Filter', 16 | 'filter_material': '_Filter', 17 | 'filter_shape': '_Filter', 18 | 'filter_size': '_Filter', 19 | 20 | 'same_color': '_FindSameProperty', 21 | 'same_material': '_FindSameProperty', 22 | 'same_shape': '_FindSameProperty', 23 | 'same_size': '_FindSameProperty', 24 | 25 | 'relate': '_Transform', 26 | 'intersect': 
'_And', 27 | 'union': '_Or', 28 | 29 | 'count': '_Count', 30 | 'exist': '_Exist', 31 | 'equal_integer': '_EqualNum', 32 | 'greater_than': '_MoreNum', 33 | 'less_than': '_LessNum', 34 | 35 | 'equal_color': '_SameProperty', 36 | 'equal_material': '_SameProperty', 37 | 'equal_shape': '_SameProperty', 38 | 'equal_size': '_SameProperty', 39 | 40 | 'query_color': '_Describe', 41 | 'query_material': '_Describe', 42 | 'query_shape': '_Describe', 43 | 'query_size': '_Describe', 44 | 45 | 'scene': '_Scene', 46 | 'unique': None 47 | } 48 | 49 | 50 | def _traversal(program, i): 51 | funcs = [] 52 | for j in program[i]['inputs']: 53 | funcs += _traversal(program, j) 54 | funcs.append(program[i]['function']) 55 | return funcs 56 | 57 | 58 | prune_set = { 59 | 'equal_integer', 'greater_than', 'less_than', 'equal_color', 60 | 'equal_material', 'equal_shape', 'equal_size'} 61 | rm_set = { 62 | 'count', 'query_color', 'query_material', 'query_shape', 'query_size'} 63 | 64 | 65 | def _prune_program(program): 66 | for f in program: 67 | if f and f['function'] in prune_set: 68 | assert (len(f['inputs']) == 2) 69 | input_f_0 = program[f['inputs'][0]] 70 | input_f_1 = program[f['inputs'][1]] 71 | if input_f_0['function'] in rm_set: 72 | assert (len(input_f_0['inputs']) == 1) 73 | program[f['inputs'][0]] = None 74 | f['inputs'][0] = input_f_0['inputs'][0] 75 | if input_f_1['function'] in rm_set: 76 | assert (len(input_f_1['inputs']) == 1) 77 | program[f['inputs'][1]] = None 78 | f['inputs'][1] = input_f_1['inputs'][0] 79 | 80 | return program 81 | 82 | 83 | def linearize_program(q): 84 | program = _prune_program(q['program']) 85 | # 1. Find root: the root module has no parent 86 | is_root = np.array([f is not None for f in program]) 87 | for f in program: 88 | if f is not None: 89 | is_root[f['inputs']] = False 90 | if np.sum(is_root) != 1: 91 | assert (np.sum(is_root) >= 1) 92 | # remove the roots that are 'scene' 93 | is_not_scene = np.array([not (f and f['function'] == 'scene') for f in program]) 94 | is_root = np.logical_and(is_root, is_not_scene) 95 | assert (np.sum(is_root) == 1) 96 | 97 | root = np.argmax(is_root) 98 | 99 | # 2. Post-order traversal to obtain RPN 100 | funcs = _traversal(program, root) 101 | 102 | # 3. 
Map modules and fix exps 103 | q_modules = [function2module[f] for f in funcs] 104 | q_modules_new = q_modules[:] 105 | for n_f in range(1, len(q_modules)): 106 | # replace _Scene + _Filter with _Find 107 | if q_modules[n_f - 1] == '_Scene' and q_modules[n_f] == '_Filter': 108 | q_modules_new[n_f - 1] = None 109 | q_modules_new[n_f] = '_Find' 110 | 111 | q_modules_new = [m for m in q_modules_new if m is not None] 112 | return q_modules_new 113 | 114 | 115 | def add_gt_layout(question_file, save_file): 116 | with open(question_file) as f: 117 | questions = json.load(f)['questions'] 118 | 119 | for n_q, q in enumerate(questions): 120 | if (n_q + 1) % 1000 == 0: 121 | print('processing %d / %d' % (n_q + 1, len(questions))) 122 | if 'program' in q: 123 | q['gt_layout'] = linearize_program(q) 124 | 125 | with open(save_file, 'w') as f: 126 | json.dump(questions, f) 127 | 128 | 129 | # question_file_trn = '../clevr-dataset/questions/CLEVR_train_questions.json' 130 | # save_file_trn = './CLEVR_train_questions_gt_layout.json' 131 | 132 | question_file_trn = os.path.join(question_dir, 'CLEVR_train_questions.json') 133 | save_file_trn = os.path.join(out_dir, 'CLEVR_train_questions_gt_layout.json') 134 | 135 | add_gt_layout(question_file_trn, save_file_trn) 136 | 137 | #question_file_val = '../clevr-dataset/questions/CLEVR_val_questions.json' 138 | #save_file_val = './CLEVR_val_questions_gt_layout.json' 139 | 140 | question_file_val = os.path.join(question_dir, 'CLEVR_val_questions.json') 141 | save_file_val = os.path.join(out_dir, 'CLEVR_val_questions_gt_layout.json') 142 | 143 | add_gt_layout(question_file_val, save_file_val) 144 | 145 | #question_file_tst = '../clevr-dataset/questions/CLEVR_test_questions.json' 146 | #save_file_tst = './CLEVR_test_questions_gt_layout.json' 147 | 148 | question_file_tst = os.path.join(question_dir, 'CLEVR_test_questions.json') 149 | save_file_tst = os.path.join(out_dir,'CLEVR_test_questions_gt_layout.json') 150 | 151 | add_gt_layout(question_file_tst, save_file_tst) 152 | -------------------------------------------------------------------------------- /train_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YuJiang01/n2nmn_pytorch/4cc6eb51af2aff29a88bdce7d575a364d0e5e5cb/train_model/__init__.py -------------------------------------------------------------------------------- /train_model/from_scratch_hyperparameters.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # Module parameters 4 | H_feat = 10 5 | W_feat = 15 6 | D_feat = 512 7 | embed_dim_txt = 300 8 | embed_dim_nmn = 300 9 | lstm_dim = 512 10 | num_layers = 2 11 | encoder_dropout = False 12 | decoder_dropout = False 13 | decoder_sampling = True 14 | T_encoder = 45 15 | T_decoder = 10 16 | N = 64 17 | prune_filter_module = True 18 | 19 | # Training parameters 20 | invalid_expr_loss = np.log(28) # loss value when the layout is invalid 21 | lambda_entropy = 0.01 22 | weight_decay = 0 23 | baseline_decay = 0.99 24 | max_grad_l2_norm = 10 25 | max_iter = 120000 26 | snapshot_interval = 10000 27 | learning_rate = 0.001 28 | 29 | -------------------------------------------------------------------------------- /train_model/gt_hyperparameters.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Module parameters 4 | H_feat = 10 5 | W_feat = 15 6 | D_feat = 512 7 | embed_dim_txt = 300 8 | #embed_dim_txt = 512 9 | 
embed_dim_nmn = 300 10 | lstm_dim = 512 11 | num_layers = 2 12 | encoder_dropout = False 13 | decoder_dropout = False 14 | decoder_sampling = True 15 | T_encoder = 45 16 | T_decoder = 10 17 | N = 64 18 | prune_filter_module = True 19 | 20 | # Training parameters 21 | weight_decay = 5e-6 22 | baseline_decay = 0.99 23 | max_grad_l2_norm = 10 24 | max_iter = 80000 25 | snapshot_interval = 10000 26 | 27 | lambda_entropy = 0 28 | 29 | learning_rate = 0.001 30 | -------------------------------------------------------------------------------- /train_model/gt_rl_hyperparameters.py: -------------------------------------------------------------------------------- 1 | # Module parameters 2 | H_feat = 10 3 | W_feat = 15 4 | D_feat = 512 5 | embed_dim_txt = 300 6 | #embed_dim_txt = 512 7 | embed_dim_nmn = 300 8 | lstm_dim = 512 9 | num_layers = 2 10 | encoder_dropout = False 11 | decoder_dropout = False 12 | decoder_sampling = True 13 | T_encoder = 45 14 | T_decoder = 10 15 | N = 64 16 | prune_filter_module = True 17 | 18 | # Training parameters 19 | invalid_expr_loss = 0.5 # loss value when the layout is invalid 20 | lambda_entropy = 0.005 21 | weight_decay = 5e-6 22 | baseline_decay = 0.99 23 | max_grad_l2_norm = 10 24 | max_iter = 80000 25 | snapshot_interval = 10000 26 | 27 | learning_rate = 0.0001 -------------------------------------------------------------------------------- /train_model/input_parameters.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | import argparse 3 | import os 4 | import sys 5 | from global_variables.global_variables import * 6 | 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument('--gpu_id', type=int, default=0) 9 | parser.add_argument("--exp_name",type=str, default="clevr_gt_layout") 10 | parser.add_argument("--model_type",type=str, choices=[model_type_scratch, model_type_gt, model_type_gt_rl], 11 | required=True, help='models:'+ model_type_scratch + ',' +model_type_gt+', '+model_type_gt_rl) 12 | parser.add_argument("--model_path",type=str, required=False) 13 | parser.add_argument("--data_dir",type=str,default="./exp_clevr/data") 14 | parser.add_argument("--image_feat_dir",type=str,default="/Users/tinayujiang/work/clevr_dataset/data/vgg_pool5/train") 15 | parser.add_argument("--out_dir",type=str,default="./exp_clevr") 16 | args = parser.parse_args() 17 | 18 | gpu_id = args.gpu_id # set GPU id to use 19 | exp_name = args.exp_name 20 | model_type = args.model_type 21 | 22 | 23 | 24 | os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) 25 | 26 | 27 | out_dir = args.out_dir 28 | data_dir = args.data_dir 29 | image_feat_dir = args.image_feat_dir 30 | 31 | if model_type == model_type_scratch: 32 | from train_model.from_scratch_hyperparameters import * 33 | elif model_type == model_type_gt: 34 | from train_model.gt_hyperparameters import * 35 | elif model_type == model_type_gt_rl: 36 | from train_model.gt_rl_hyperparameters import * 37 | if(args.model_path == None): 38 | exit("model ",model_type_gt_rl," require a pretrained model using --model_path") 39 | model_path = args.model_path 40 | else: 41 | sys.exit("unknown model type", model_type) 42 | 43 | 44 | 45 | # Log params 46 | #log_dir = './exp_clevr/tb/%s/' % exp_name 47 | log_dir = os.path.join(out_dir, 'tb', exp_name) 48 | 49 | # Data files 50 | #vocab_question_file = './exp_clevr/data/vocabulary_clevr.txt' 51 | #vocab_layout_file = './exp_clevr/data/vocabulary_layout.txt' 52 | #vocab_answer_file = 
'./exp_clevr/data/answers_clevr.txt' 53 | 54 | vocab_question_file = os.path.join(data_dir, 'vocabulary_clevr.txt') 55 | vocab_layout_file = os.path.join(data_dir, 'vocabulary_layout.txt') 56 | vocab_answer_file = os.path.join(data_dir, 'answers_clevr.txt') 57 | 58 | 59 | #imdb_file_trn = './exp_clevr/data/imdb/imdb_trn.npy' 60 | #imdb_file_tst = './exp_clevr/data/imdb/imdb_val.npy' 61 | imdb_file_trn = os.path.join(data_dir, 'imdb/imdb_trn.npy') 62 | imdb_file_tst = os.path.join(data_dir, 'imdb/imdb_val.npy') 63 | image_feat_dir = image_feat_dir 64 | 65 | ##snapshot directory name 66 | #snapshot_dir = './exp_clevr/tfmodel/%s/' % exp_name 67 | 68 | snapshot_dir = os.path.join(out_dir,"tfmodel",exp_name) -------------------------------------------------------------------------------- /train_model/main.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import argparse 3 | import os 4 | from models.layout_assembler import Assembler 5 | from models.end2endModuleNet import * 6 | from models.custom_loss import custom_loss 7 | from global_variables.global_variables import * 8 | from Utils.data_reader import DataReader 9 | from torch import optim 10 | 11 | 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--config", type=str, required=True, help="config yaml file") 15 | parser.add_argument("--out_dir",type=str, required=True, help="output directory") 16 | args = parser.parse_args() 17 | 18 | config_file= args.config 19 | out_dir = args.out_dir 20 | 21 | with open(config_file, 'r') as f: 22 | config = yaml.load(f) 23 | 24 | torch.manual_seed(1) 25 | ##update config file with commandline arguments 26 | 27 | 28 | def prepare_train_data_set(**data_cofig): 29 | data_root_dir = data_cofig['data_root_dir'] 30 | vocab_layout_file = os.path.join(data_root_dir, data_cofig['vocab_layout_file']) 31 | assembler = Assembler(vocab_layout_file) 32 | imdb_file_trn = os.path.join(data_root_dir, 'imdb',data_cofig['imdb_file_trn']) 33 | image_feat_dir = os.path.join(data_root_dir,data_cofig['preprocess_model'],'train') 34 | vocab_question_file = os.path.join(data_root_dir,data_cofig['vocab_question_file']) 35 | vocab_answer_file = os.path.join(data_root_dir,data_cofig['vocab_answer_file']) 36 | prune_filter_module = data_cofig['prune_filter_module'] 37 | N = data_cofig['N'] 38 | T_encoder = data_cofig['T_encoder'] 39 | T_decoder = data_cofig['T_decoder'] 40 | 41 | data_reader_trn = DataReader(imdb_file_trn, image_feat_dir, shuffle=False, one_pass=True, 42 | batch_size=N, 43 | T_encoder=T_encoder, 44 | T_decoder=T_decoder, 45 | assembler=assembler, 46 | vocab_question_file=vocab_question_file, 47 | vocab_answer_file=vocab_answer_file, 48 | prune_filter_module=prune_filter_module) 49 | 50 | num_vocab_txt = data_reader_trn.batch_loader.vocab_dict.num_vocab 51 | num_vocab_nmn = len(assembler.module_names) 52 | num_choices = data_reader_trn.batch_loader.answer_dict.num_vocab 53 | 54 | return data_reader_trn, num_vocab_txt, num_choices,num_vocab_nmn, assembler 55 | 56 | 57 | def prepare_model(num_vocab_txt, num_choices, num_vocab_nmn,assembler, **model_config): 58 | if model_config['model_type'] == model_type_gt_rl: 59 | myModel = torch.load(model_config['model_path']) 60 | else: 61 | '''myModel = end2endModuleNet(num_vocab_txt=num_vocab_txt, num_vocab_nmn=num_vocab_nmn, 62 | out_num_choices=num_choices, 63 | embed_dim_nmn=embed_dim_nmn, embed_dim_txt=embed_dim_txt, 64 | image_height=H_feat, image_width=W_feat, in_image_dim=D_feat, 65 | 
hidden_size=lstm_dim, assembler=assembler, layout_criterion=criterion_layout, 66 | max_layout_len=T_decoder, 67 | answer_criterion=criterion_answer, num_layers=num_layers, decoder_dropout=0)''' 68 | 69 | criterion_layout = custom_loss(lambda_entropy= model_config['lambda_entropy']) 70 | criterion_answer = nn.CrossEntropyLoss(size_average=False, reduce=False) 71 | 72 | myModel = end2endModuleNet(num_vocab_txt=num_vocab_txt, num_vocab_nmn=num_vocab_nmn, 73 | out_num_choices=num_choices, assembler= assembler, 74 | layout_criterion=criterion_layout, answer_criterion=criterion_answer, 75 | max_layout_len=model_config['T_decoder'], **model_config) 76 | myModel = myModel.cuda() if use_cuda else myModel 77 | 78 | return myModel 79 | 80 | 81 | data_reader_trn, num_vocab_txt, num_choices, num_vocab_nmn, assembler = prepare_train_data_set(**config['data'], **config['model']) 82 | myModel = prepare_model(num_vocab_txt, num_choices, num_vocab_nmn, assembler, **config['model']) 83 | 84 | training_parameters = config['training_parameters'] 85 | myOptimizer = optim.Adam(myModel.parameters(), 86 | weight_decay=training_parameters['weight_decay'], 87 | lr=training_parameters['learning_rate']) 88 | 89 | model_type = config['model']['model_type'] 90 | avg_accuracy = 0 91 | accuracy_decay = 0.99 92 | avg_layout_accuracy = 0 93 | updated_baseline = np.log(28) 94 | max_iter = training_parameters['max_iter'] 95 | baseline_decay = training_parameters['baseline_decay'] 96 | max_grad_l2_norm = training_parameters['max_grad_l2_norm'] 97 | snapshot_interval = training_parameters['snapshot_interval'] 98 | snapshot_dir = os.path.join(config['output']['root_dir'],"tfmodel",config['output']['exp_name']) 99 | 100 | for i_iter, batch in enumerate(data_reader_trn.batches()): 101 | if i_iter >= max_iter: 102 | break 103 | 104 | _, n_sample = batch['input_seq_batch'].shape 105 | input_text_seq_lens = batch['seq_length_batch'] 106 | input_text_seqs = batch['input_seq_batch'] 107 | input_layouts = batch['gt_layout_batch'] 108 | input_images = batch['image_feat_batch'] 109 | input_answers = batch['answer_label_batch'] 110 | 111 | np.savetxt("/private/home/tinayujiang/temp/temp_out/input_text_seqs.txt",input_text_seqs) 112 | np.savetxt("/private/home/tinayujiang/temp/temp_out/input_layouts.txt", input_layouts) 113 | #np.savetxt("/private/home/tinayujiang/temp/temp_out/input_images.txt", input_images[0,:,:]) 114 | np.savetxt("/private/home/tinayujiang/temp/temp_out/input_answers.txt", input_answers) 115 | 116 | 117 | 118 | 119 | n_correct_layout = 0 120 | n_correct_answer = 0 121 | 122 | input_txt_variable = Variable(torch.LongTensor(input_text_seqs)) 123 | input_txt_variable = input_txt_variable.cuda() if use_cuda else input_txt_variable 124 | 125 | input_layout_variable = None 126 | decoder_sampling = True 127 | 128 | if model_type == model_type_gt: 129 | decoder_sampling = False 130 | input_layout_variable = Variable(torch.LongTensor(input_layouts)) 131 | input_layout_variable = input_layout_variable.cuda() if use_cuda else input_layout_variable 132 | 133 | myOptimizer.zero_grad() 134 | 135 | total_loss, avg_answer_loss, myAnswer, predicted_layouts, expr_validity_array, updated_baseline \ 136 | = myModel(input_txt_variable=input_txt_variable, input_text_seq_lens=input_text_seq_lens, 137 | input_answers=input_answers, input_images=input_images, policy_gradient_baseline=updated_baseline, 138 | baseline_decay=baseline_decay, input_layout_variable=input_layout_variable, 139 | sample_token=decoder_sampling 140 | ) 141 | 142 | if 
total_loss is not None: 143 | total_loss.backward() 144 | torch.nn.utils.clip_grad_norm(myModel.parameters(), max_grad_l2_norm) 145 | myOptimizer.step() 146 | 147 | layout_accuracy = np.mean(np.all(predicted_layouts == input_layouts, axis=0)) 148 | avg_layout_accuracy += (1 - accuracy_decay) * (layout_accuracy - avg_layout_accuracy) 149 | 150 | accuracy = np.mean(np.logical_and(expr_validity_array, myAnswer == input_answers)) 151 | avg_accuracy += (1 - accuracy_decay) * (accuracy - avg_accuracy) 152 | validity = np.mean(expr_validity_array) 153 | 154 | if (i_iter + 1) % 100 == 0: 155 | print("iter:", i_iter + 1, 156 | " cur_layout_acc:%.3f" % layout_accuracy, " avg_layout_acc:%.3f" % avg_layout_accuracy, 157 | " cur_ans_acc:%.4f" % accuracy, " avg_answer_acc:%.4f" % avg_accuracy, 158 | "total loss:%.4f" % total_loss.data.cpu().numpy()[0], 159 | "avg_answer_loss:%.4f" % avg_answer_loss.data.cpu().numpy()[0]) 160 | 161 | sys.stdout.flush() 162 | 163 | # Save snapshot 164 | if (i_iter + 1) % snapshot_interval == 0 or (i_iter + 1) == max_iter: 165 | model_snapshot_file = os.path.join(snapshot_dir, "model_%08d" % (i_iter + 1)) 166 | torch.save(myModel, model_snapshot_file) 167 | print('snapshot saved to ' + model_snapshot_file) 168 | sys.stdout.flush() 169 | -------------------------------------------------------------------------------- /train_model/main_copy.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import argparse 3 | import os 4 | from models.layout_assembler import Assembler 5 | from models.end2endModuleNet import * 6 | from models.custom_loss import custom_loss 7 | from global_variables.global_variables import * 8 | from Utils.data_reader import DataReader 9 | from torch import optim 10 | from Utils.dataSet import vqa_dataset 11 | from torch.utils.data import DataLoader 12 | 13 | 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--config", type=str, required=True, help="config yaml file") 17 | parser.add_argument("--out_dir",type=str, required=True, help="output directory") 18 | args = parser.parse_args() 19 | 20 | config_file= args.config 21 | out_dir = args.out_dir 22 | 23 | with open(config_file, 'r') as f: 24 | config = yaml.load(f) 25 | 26 | torch.manual_seed(1) 27 | 28 | ##update config file with commandline arguments 29 | 30 | 31 | def prepare_train_data_set(**data_cofig): 32 | data_root_dir = data_cofig['data_root_dir'] 33 | vocab_layout_file = os.path.join(data_root_dir, data_cofig['vocab_layout_file']) 34 | assembler = Assembler(vocab_layout_file) 35 | imdb_file_trn = os.path.join(data_root_dir, 'imdb',data_cofig['imdb_file_trn']) 36 | image_feat_dir = os.path.join(data_root_dir,data_cofig['preprocess_model'],'train') 37 | vocab_question_file = os.path.join(data_root_dir,data_cofig['vocab_question_file']) 38 | vocab_answer_file = os.path.join(data_root_dir,data_cofig['vocab_answer_file']) 39 | prune_filter_module = data_cofig['prune_filter_module'] 40 | N = data_cofig['N'] 41 | T_encoder = data_cofig['T_encoder'] 42 | T_decoder = data_cofig['T_decoder'] 43 | image_depth_first = data_cofig['image_depth_first'] 44 | 45 | vqa_train_dataset = vqa_dataset(imdb_file=imdb_file_trn, image_feat_directory=image_feat_dir, T_encoder=T_encoder, 46 | T_decoder=T_decoder, 47 | assembler=assembler, 48 | vocab_question_file=vocab_question_file, 49 | vocab_answer_file=vocab_answer_file, 50 | prune_filter_module=prune_filter_module, 51 | image_depth_first=image_depth_first) 52 | 53 | data_reader_trn = 
DataLoader(dataset=vqa_train_dataset, batch_size=N, shuffle=True) 54 | 55 | num_vocab_txt = vqa_train_dataset.vocab_dict.num_vocab 56 | num_vocab_nmn = len(assembler.module_names) 57 | num_choices = vqa_train_dataset.answer_dict.num_vocab 58 | 59 | return data_reader_trn, num_vocab_txt, num_choices,num_vocab_nmn, assembler 60 | 61 | 62 | def prepare_model(num_vocab_txt, num_choices, num_vocab_nmn,assembler, **model_config): 63 | if model_config['model_type'] == model_type_gt_rl: 64 | myModel = torch.load(model_config['model_path']) 65 | else: 66 | criterion_layout = custom_loss(lambda_entropy= model_config['lambda_entropy']) 67 | criterion_answer = nn.CrossEntropyLoss(size_average=False, reduce=False) 68 | 69 | myModel = end2endModuleNet(num_vocab_txt=num_vocab_txt, num_vocab_nmn=num_vocab_nmn, 70 | out_num_choices=num_choices, assembler= assembler, 71 | layout_criterion=criterion_layout, answer_criterion=criterion_answer, 72 | max_layout_len=model_config['T_decoder'], **model_config) 73 | myModel = myModel.cuda() if use_cuda else myModel 74 | 75 | return myModel 76 | 77 | 78 | data_reader_trn, num_vocab_txt, num_choices, num_vocab_nmn, assembler = prepare_train_data_set(**config['data'], **config['model']) 79 | myModel = prepare_model(num_vocab_txt, num_choices, num_vocab_nmn, assembler, **config['model']) 80 | 81 | training_parameters = config['training_parameters'] 82 | myOptimizer = optim.Adam(myModel.parameters(), 83 | weight_decay=training_parameters['weight_decay'], 84 | lr=training_parameters['learning_rate']) 85 | 86 | model_type = config['model']['model_type'] 87 | avg_accuracy = 0 88 | accuracy_decay = 0.99 89 | avg_layout_accuracy = 0 90 | updated_baseline = np.log(28) 91 | max_iter = training_parameters['max_iter'] 92 | baseline_decay = training_parameters['baseline_decay'] 93 | max_grad_l2_norm = training_parameters['max_grad_l2_norm'] 94 | snapshot_interval = training_parameters['snapshot_interval'] 95 | snapshot_dir = os.path.join(config['output']['root_dir'],"tfmodel",config['output']['exp_name']) 96 | os.makedirs(snapshot_dir, exist_ok=True) 97 | 98 | i_iter = 0 99 | for iepoch in range(100): 100 | print("iepoch = ", iepoch) 101 | if i_iter >= max_iter: 102 | break 103 | for i, batch in enumerate(data_reader_trn): 104 | n_sample,_ = batch['input_seq_batch'].shape 105 | input_text_seq_lens = batch['seq_length_batch'].cpu().numpy() 106 | input_text_seqs = np.transpose(batch['input_seq_batch'].cpu().numpy()) 107 | input_layouts = np.transpose(batch['gt_layout_batch'].cpu().numpy()) 108 | input_images = batch['image_feat_batch'].cpu().numpy() 109 | input_answers = batch['answer_label_batch'].cpu().numpy() 110 | 111 | n_correct_layout = 0 112 | n_correct_answer = 0 113 | 114 | input_txt_variable = Variable(torch.LongTensor(input_text_seqs)) 115 | input_txt_variable = input_txt_variable.cuda() if use_cuda else input_txt_variable 116 | 117 | input_layout_variable = None 118 | decoder_sampling = True 119 | 120 | if model_type == model_type_gt: 121 | decoder_sampling = False 122 | input_layout_variable = Variable(torch.LongTensor(input_layouts)) 123 | input_layout_variable = input_layout_variable.cuda() if use_cuda else input_layout_variable 124 | 125 | myOptimizer.zero_grad() 126 | 127 | total_loss, avg_answer_loss, myAnswer, predicted_layouts, expr_validity_array, updated_baseline \ 128 | = myModel(input_txt_variable=input_txt_variable, input_text_seq_lens=input_text_seq_lens, 129 | input_answers=input_answers, input_images=input_images, 
policy_gradient_baseline=updated_baseline, 130 | baseline_decay=baseline_decay, input_layout_variable=input_layout_variable, 131 | sample_token=decoder_sampling 132 | ) 133 | 134 | if total_loss is not None: 135 | total_loss.backward() 136 | torch.nn.utils.clip_grad_norm(myModel.parameters(), max_grad_l2_norm) 137 | myOptimizer.step() 138 | 139 | layout_accuracy = np.mean(np.all(predicted_layouts == input_layouts, axis=0)) 140 | avg_layout_accuracy += (1 - accuracy_decay) * (layout_accuracy - avg_layout_accuracy) 141 | 142 | accuracy = np.mean(np.logical_and(expr_validity_array, myAnswer == input_answers)) 143 | avg_accuracy += (1 - accuracy_decay) * (accuracy - avg_accuracy) 144 | validity = np.mean(expr_validity_array) 145 | 146 | if (i_iter + 1) % 20 == 0: 147 | print("iter:", i_iter + 1, 148 | " cur_layout_acc:%.3f" % layout_accuracy, " avg_layout_acc:%.3f" % avg_layout_accuracy, 149 | " cur_ans_acc:%.4f" % accuracy, " avg_answer_acc:%.4f" % avg_accuracy, 150 | "total loss:%.4f" % total_loss.data.cpu().numpy()[0], 151 | "avg_answer_loss:%.4f" % avg_answer_loss.data.cpu().numpy()[0]) 152 | 153 | sys.stdout.flush() 154 | 155 | # Save snapshot 156 | if (i_iter + 1) % snapshot_interval == 0 or (i_iter + 1) == max_iter: 157 | model_snapshot_file = os.path.join(snapshot_dir, "model_%08d" % (i_iter + 1)) 158 | torch.save(myModel, model_snapshot_file) 159 | print('snapshot saved to ' + model_snapshot_file) 160 | sys.stdout.flush() 161 | i_iter += 1 162 | 163 | -------------------------------------------------------------------------------- /train_model/train_clevr_gt_layout.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | from Utils.data_reader import DataReader 3 | import sys 4 | from torch import optim 5 | 6 | from models.layout_assembler import Assembler 7 | from train_model.input_parameters import * 8 | from models.end2endModuleNet import * 9 | from models.custom_loss import custom_loss 10 | from global_variables.global_variables import use_cuda 11 | 12 | 13 | 14 | 15 | ##create directory for snapshot 16 | os.makedirs(snapshot_dir, exist_ok=True) 17 | 18 | 19 | assembler = Assembler(vocab_layout_file) 20 | 21 | data_reader_trn = DataReader(imdb_file_trn,image_feat_dir, shuffle=True, one_pass=False, 22 | batch_size=N, 23 | T_encoder=T_encoder, 24 | T_decoder=T_decoder, 25 | assembler=assembler, 26 | vocab_question_file=vocab_question_file, 27 | vocab_answer_file=vocab_answer_file, 28 | prune_filter_module=prune_filter_module) 29 | 30 | num_vocab_txt = data_reader_trn.batch_loader.vocab_dict.num_vocab 31 | num_vocab_nmn = len(assembler.module_names) 32 | num_choices = data_reader_trn.batch_loader.answer_dict.num_vocab 33 | 34 | 35 | 36 | criterion_layout = custom_loss(lambda_entropy = lambda_entropy) 37 | criterion_answer = nn.CrossEntropyLoss(size_average=False,reduce=False) 38 | 39 | 40 | if model_type == model_type_gt_rl: 41 | myModel = torch.load(model_path) 42 | else: 43 | myModel = end2endModuleNet(num_vocab_txt=num_vocab_txt, num_vocab_nmn=num_vocab_nmn, out_num_choices=num_choices, 44 | embed_dim_nmn=embed_dim_nmn, embed_dim_txt=embed_dim_txt, 45 | image_height=H_feat, image_width=W_feat, in_image_dim=D_feat, 46 | hidden_size=lstm_dim, assembler=assembler, layout_criterion=criterion_layout, 47 | max_layout_len=T_decoder, 48 | answer_criterion=criterion_answer, num_layers=num_layers, decoder_dropout=0) 49 | 50 | myOptimizer = optim.Adam(myModel.parameters(), 
weight_decay=weight_decay, lr=learning_rate) 51 | 52 | 53 | avg_accuracy = 0 54 | accuracy_decay = 0.99 55 | avg_layout_accuracy = 0 56 | updated_baseline = np.log(28) 57 | 58 | for i_iter, batch in enumerate(data_reader_trn.batches()): 59 | if i_iter >= max_iter: 60 | break 61 | 62 | _, n_sample = batch['input_seq_batch'].shape 63 | input_text_seq_lens = batch['seq_length_batch'] 64 | input_text_seqs = batch['input_seq_batch'] 65 | input_layouts = batch['gt_layout_batch'] 66 | input_images = batch['image_feat_batch'] 67 | input_answers = batch['answer_label_batch'] 68 | 69 | 70 | n_correct_layout = 0 71 | n_correct_answer = 0 72 | 73 | input_txt_variable = Variable(torch.LongTensor(input_text_seqs)) 74 | input_txt_variable = input_txt_variable.cuda() if use_cuda else input_txt_variable 75 | 76 | input_layout_variable = None 77 | 78 | if model_type == model_type_gt: 79 | input_layout_variable = Variable(torch.LongTensor(input_layouts)) 80 | input_layout_variable = input_layout_variable.cuda() if use_cuda else input_layout_variable 81 | 82 | 83 | 84 | 85 | 86 | myOptimizer.zero_grad() 87 | 88 | total_loss,avg_answer_loss ,myAnswer, predicted_layouts, expr_validity_array, updated_baseline \ 89 | = myModel(input_txt_variable=input_txt_variable, input_text_seq_lens=input_text_seq_lens, 90 | input_answers=input_answers, input_images=input_images,policy_gradient_baseline=updated_baseline, 91 | baseline_decay=baseline_decay, input_layout_variable=input_layout_variable, 92 | sample_token=decoder_sampling 93 | ) 94 | 95 | if total_loss is not None: 96 | total_loss.backward() 97 | torch.nn.utils.clip_grad_norm(myModel.parameters(), max_grad_l2_norm) 98 | myOptimizer.step() 99 | 100 | layout_accuracy = np.mean(np.all(predicted_layouts == input_layouts, axis=0)) 101 | avg_layout_accuracy += (1 - accuracy_decay) * (layout_accuracy - avg_layout_accuracy) 102 | 103 | accuracy = np.mean(np.logical_and(expr_validity_array, myAnswer == input_answers)) 104 | avg_accuracy += (1 - accuracy_decay) * (accuracy - avg_accuracy) 105 | validity = np.mean(expr_validity_array) 106 | 107 | if (i_iter + 1) % 20 == 0 : 108 | print("iter:", i_iter + 1, 109 | " cur_layout_acc:%.3f"% layout_accuracy, " avg_layout_acc:%.3f"% avg_layout_accuracy, 110 | " cur_ans_acc:%.4f"% accuracy, " avg_answer_acc:%.4f"% avg_accuracy, 111 | "total loss:%.4f"%total_loss.data.cpu().numpy()[0], 112 | "avg_answer_loss:%.4f"% avg_answer_loss.data.cpu().numpy()[0]) 113 | 114 | sys.stdout.flush() 115 | 116 | # Save snapshot 117 | if (i_iter + 1) % snapshot_interval == 0 or (i_iter + 1) == max_iter: 118 | model_snapshot_file = os.path.join(snapshot_dir, "model_%08d" % (i_iter + 1)) 119 | torch.save(myModel, model_snapshot_file) 120 | print('snapshot saved to ' + model_snapshot_file ) 121 | sys.stdout.flush() 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | --------------------------------------------------------------------------------
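A minimal sketch for inspecting the imdb files, assuming they were produced by `tools/build_clevr_imdb.py` above; the load path is a placeholder for whatever `--out_dir` was used, and the record fields are simply the keys of the `iminfo` dict assembled in `build_imdb`:

```
import numpy as np

# Placeholder path: the directory passed as --out_dir to build_clevr_imdb.py.
# allow_pickle=True is required on newer NumPy releases because the imdb is an
# object array of Python dicts.
imdb = np.load('exp_clevr/data/imdb/imdb_trn.npy', allow_pickle=True)

record = imdb[0]
# Keys written by build_imdb(): image_name, feature_path, question_str,
# question_tokens, gt_layout_tokens (None when absent), answer (None when absent).
print(record['question_str'])
print(record['gt_layout_tokens'])

# Utils/dataSet.vqa_dataset peeks at imdb[0] in the same way to decide whether
# answers and ground-truth layouts are available for the split.
```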