├── .idea
│   ├── ARN.iml
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── README.md
├── lib
│   ├── crits
│   │   ├── __init__.py
│   │   └── max_margin_crit.py
│   ├── layers
│   │   ├── lang_decoder.py
│   │   ├── lang_encoder.py
│   │   ├── loss.py
│   │   ├── match.py
│   │   └── visual_encoder.py
│   ├── loaders
│   │   ├── dataloader.py
│   │   └── loader.py
│   ├── models
│   │   ├── eval.py
│   │   └── utils.py
│   └── mrcn
│       ├── __init__.py
│       ├── inference.py
│       └── inference_no_imdb.py
└── tools
    ├── _init_paths.py
    ├── eval.py
    ├── opt.py
    └── train.py

/README.md:
--------------------------------------------------------------------------------
1 | # PyTorch Implementation of ARN
2 | 
3 | ## Introduction
4 | 
5 | This repository is a PyTorch implementation of [Adaptive Reconstruction Network for Weakly Supervised Referring Expression Grounding](https://arxiv.org/pdf/1908.10568.pdf) (ICCV 2019).
6 | Check our [paper](https://arxiv.org/pdf/1908.10568.pdf) for more details.
7 | 
8 | ## Prerequisites
9 | 
10 | * Python 3.5
11 | * PyTorch 0.4.1
12 | * CUDA 8.0
13 | 
14 | ## Installation
15 | 
16 | Please refer to [MAttNet](https://github.com/lichengunc/MAttNet) to install [mask-faster-rcnn](https://github.com/lichengunc/mask-faster-rcnn), [REFER](https://github.com/lichengunc/refer) and [refer-parser2](https://github.com/lichengunc/refer-parser2).
17 | Follow Steps 1 & 2 of its Training section to prepare the data and features.
18 | 
19 | ## Training
20 | 
21 | Train ARN with ground-truth annotations:
22 | 
23 | ```bash
24 | CUDA_VISIBLE_DEVICES=${GPU_ID} python ./tools/train.py --dataset ${DATASET} --splitBy ${SPLITBY} --exp_id ${EXP_ID}
25 | ```
26 | 
27 | ## Evaluation
28 | 
29 | Evaluate ARN with ground-truth annotations:
30 | 
31 | ```bash
32 | CUDA_VISIBLE_DEVICES=${GPU_ID} python ./tools/eval.py --dataset ${DATASET} --splitBy ${SPLITBY} --split ${SPLIT} --id ${EXP_ID}
33 | ```
34 | 
35 | 
36 | ## Citation
37 | 
38 |     @inproceedings{lxj2019arn,
39 |       title={Adaptive Reconstruction Network for Weakly Supervised Referring Expression Grounding},
40 |       author={Xuejing Liu and Liang Li and Shuhui Wang and Zheng-Jun Zha and Dechao Meng and Qingming Huang},
41 |       booktitle={ICCV},
42 |       year={2019}
43 |     }
44 | 
45 | 
46 | ## Acknowledgement
47 | 
48 | Thanks to [Licheng Yu](http://cs.unc.edu/~licheng/) for his work. Our code is based on the implementation of [MAttNet](https://github.com/lichengunc/MAttNet).
49 | 
50 | ## Authorship
51 | 
52 | This project is maintained by [Xuejing Liu](https://gingl.github.io/).
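
As a usage note for the Training and Evaluation commands above, a concrete invocation might look like the sketch below. The dataset/split values are illustrative only (they follow the REFER/MAttNet naming conventions, e.g. RefCOCO with the `unc` split); substitute whatever you prepared during Installation.

```bash
# Example values only -- adjust dataset, splitBy, split and experiment id to your setup.
CUDA_VISIBLE_DEVICES=0 python ./tools/train.py --dataset refcoco --splitBy unc --exp_id exp1
CUDA_VISIBLE_DEVICES=0 python ./tools/eval.py --dataset refcoco --splitBy unc --split testA --id exp1
```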
53 | -------------------------------------------------------------------------------- /lib/crits/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GingL/ARN/b1001040d8ac41292b2ccf6a6ab1f41c1d77d0fa/lib/crits/__init__.py -------------------------------------------------------------------------------- /lib/crits/max_margin_crit.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import torch 6 | from torch.autograd import Variable 7 | import torch.nn as nn 8 | import pdb 9 | 10 | class MaxMarginCriterion(nn.Module): 11 | 12 | def __init__(self, visual_rank_weight, lang_rank_weight, margin): 13 | # visual_rank_weight=1, lang_rank_weight=1, margin=0.1 14 | super(MaxMarginCriterion, self).__init__() 15 | self.visual_rank = visual_rank_weight > 0 16 | self.lang_rank = lang_rank_weight > 0 17 | self.visual_rank_weight = visual_rank_weight 18 | self.lang_rank_weight = lang_rank_weight 19 | self.margin = margin 20 | 21 | def forward(self, cossim): 22 | # pdb.set_trace() 23 | N = cossim.size(0) 24 | batch_size = 0 25 | if self.visual_rank and not self.lang_rank: 26 | batch_size = N//2 27 | assert isinstance(batch_size, int) 28 | paired = cossim[:batch_size] 29 | unpaired = cossim[batch_size:] 30 | visual_rank_loss = self.visual_rank_weight * torch.clamp(self.margin + unpaired - paired, min=0) 31 | lang_rank_loss = 0. 32 | 33 | elif not self.visual_rank and self.lang_rank: 34 | batch_size = N//2 35 | assert isinstance(batch_size, int) 36 | paired = cossim[:batch_size] 37 | unpaired = cossim[batch_size:] 38 | lang_rank_loss = self.lang_rank_weight * torch.clamp(self.margin + unpaired - paired, min=0) 39 | visual_rank_loss = 0. 
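# When both the visual and the language ranking terms are enabled (branch below), the incoming
# cossim vector is expected to be stacked as [paired; visually-unpaired; language-unpaired],
# i.e. N = 3 * batch_size, and each unpaired third is ranked against the paired scores.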
40 | 41 | elif self.visual_rank and self.lang_rank: 42 | batch_size = N//3 43 | assert isinstance(batch_size, int) 44 | paired = cossim[:batch_size] 45 | visual_unpaired = cossim[batch_size: batch_size*2] 46 | lang_unpaired = cossim[batch_size*2:] 47 | visual_rank_loss = self.visual_rank_weight * torch.clamp(self.margin + visual_unpaired - paired, 0) 48 | lang_rank_loss = self.lang_rank_weight * torch.clamp(self.margin + lang_unpaired - paired, 0) 49 | 50 | else: 51 | raise NotImplementedError 52 | 53 | loss = (visual_rank_loss + lang_rank_loss).sum() / batch_size 54 | return loss 55 | 56 | -------------------------------------------------------------------------------- /lib/layers/lang_decoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import torch 7 | from torch.autograd import Variable 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | 12 | class LocationDecoder(nn.Module): 13 | def __init__(self, opt): 14 | super(LocationDecoder, self).__init__() 15 | self.mlp = nn.Sequential(nn.Linear(5 + 25, opt['jemb_dim'])) 16 | 17 | def forward(self, loc_feats, total_ann_score): 18 | total_ann_score = total_ann_score.unsqueeze(1) 19 | loc_feats_fuse = torch.bmm(total_ann_score, loc_feats) 20 | loc_feats_fuse = loc_feats_fuse.squeeze(1) 21 | loc_feats_fuse = self.mlp(loc_feats_fuse) 22 | return loc_feats_fuse 23 | 24 | 25 | class SubjectDecoder(nn.Module): 26 | def __init__(self, opt): 27 | super(SubjectDecoder, self).__init__() 28 | self.mlp = nn.Sequential(nn.Linear(opt['pool5_dim'] + opt['fc7_dim'], opt['jemb_dim'])) 29 | 30 | def forward(self, sub_feats, total_ann_score): 31 | total_ann_score = total_ann_score.unsqueeze(1) 32 | sub_feats_fuse = torch.bmm(total_ann_score, sub_feats) 33 | sub_feats_fuse = sub_feats_fuse.squeeze(1) 34 | sub_feats_fuse = self.mlp(sub_feats_fuse) 35 | return sub_feats_fuse 36 | 37 | 38 | class RelationDecoder(nn.Module): 39 | def __init__(self, opt): 40 | super(RelationDecoder, self).__init__() 41 | self.jemb_dim = opt['jemb_dim'] 42 | self.word_vec_size = opt['word_vec_size'] 43 | self.fc7_dim = opt['fc7_dim'] 44 | self.mlp = nn.Sequential(nn.Linear(self.fc7_dim + 5, self.jemb_dim)) 45 | 46 | def forward(self, rel_feats, total_ann_score, ixs): 47 | sent_num, ann_num = ixs.size(0), ixs.size(1) 48 | total_ann_score = total_ann_score.unsqueeze(1) 49 | ixs = ixs.view(sent_num, ann_num, 1).unsqueeze(3).expand(sent_num, ann_num, 1, 50 | self.fc7_dim + 5) 51 | rel_feats_max = torch.gather(rel_feats, 2, ixs) 52 | rel_feats_max = rel_feats_max.squeeze(2) 53 | rel_feats_fuse = torch.bmm(total_ann_score, rel_feats_max) 54 | rel_feats_fuse = rel_feats_fuse.squeeze(1) 55 | rel_feats_fuse = self.mlp(rel_feats_fuse) 56 | return rel_feats_fuse 57 | -------------------------------------------------------------------------------- /lib/layers/lang_encoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | class RNNEncoder(nn.Module): 11 | def __init__(self, vocab_size, word_embedding_size, word_vec_size, hidden_size, bidirectional=False, 12 | input_dropout_p=0, dropout_p=0, n_layers=1, rnn_type='lstm', 
variable_lengths=True): 13 | super(RNNEncoder, self).__init__() 14 | self.variable_lengths = variable_lengths 15 | self.embedding = nn.Embedding(vocab_size, word_embedding_size) 16 | self.input_dropout = nn.Dropout(input_dropout_p) 17 | self.mlp = nn.Sequential(nn.Linear(word_embedding_size, word_vec_size), 18 | nn.ReLU()) 19 | self.rnn_type = rnn_type 20 | self.rnn = getattr(nn, rnn_type.upper())(word_vec_size, hidden_size, n_layers, 21 | batch_first=True, bidirectional=bidirectional, dropout=dropout_p) 22 | self.num_dirs = 2 if bidirectional else 1 23 | 24 | def forward(self, input_labels): 25 | """ 26 | Inputs: 27 | - input_labels: Variable long (batch, seq_len) 28 | Outputs: 29 | - output : Variable float (batch, max_len, hidden_size * num_dirs) 30 | - hidden : Variable float (batch, num_layers * num_dirs * hidden_size) 31 | - embedded: Variable float (batch, max_len, word_vec_size) 32 | """ 33 | if self.variable_lengths: 34 | input_lengths = (input_labels != 0).sum(1) 35 | 36 | input_lengths_list = input_lengths.data.cpu().numpy().tolist() 37 | sorted_input_lengths_list = np.sort(input_lengths_list)[::-1].tolist() 38 | sort_ixs = np.argsort(input_lengths_list)[::-1].tolist() 39 | s2r = {s: r for r, s in enumerate(sort_ixs)} 40 | recover_ixs = [s2r[s] for s in range(len(input_lengths_list))] 41 | assert max(input_lengths_list) == input_labels.size(1) 42 | 43 | sort_ixs = input_labels.data.new(sort_ixs).long() 44 | recover_ixs = input_labels.data.new(recover_ixs).long() 45 | 46 | input_labels = input_labels[sort_ixs] 47 | 48 | # embed 49 | embedded = self.embedding(input_labels) # (n, seq_len, word_embedding_size) 50 | embedded = self.input_dropout(embedded) # (n, seq_len, word_embedding_size) 51 | embedded = self.mlp(embedded) # (n, seq_len, word_embedding_size) 52 | if self.variable_lengths: 53 | embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_input_lengths_list, batch_first=True) 54 | 55 | output, hidden = self.rnn(embedded) 56 | 57 | # recover 58 | if self.variable_lengths: 59 | embedded, _ = nn.utils.rnn.pad_packed_sequence(embedded, batch_first=True) 60 | embedded = embedded[recover_ixs] 61 | 62 | # recover rnn 63 | output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True) # (batch, max_len, hidden) 64 | output = output[recover_ixs] 65 | 66 | if self.rnn_type=='lstm': 67 | hidden = hidden[0] 68 | hidden = hidden[:, recover_ixs, :] 69 | hidden = hidden.transpose(0, 1).contiguous() 70 | hidden = hidden.view(hidden.size(0), -1) 71 | 72 | return output, hidden, embedded 73 | 74 | class PhraseAttention(nn.Module): 75 | def __init__(self, input_dim): 76 | super(PhraseAttention, self).__init__() 77 | self.fc = nn.Linear(input_dim, 1) 78 | 79 | def forward(self, context, embedded, input_labels): 80 | cxt_scores = self.fc(context).squeeze(2) 81 | 82 | attn = F.softmax(cxt_scores) 83 | 84 | is_not_zero = (input_labels != 0).float() 85 | attn = attn * is_not_zero 86 | attn = attn / attn.sum(1).view(attn.size(0), 1).expand(attn.size(0), attn.size(1)) # (batch, seq_len) 87 | 88 | attn3 = attn.unsqueeze(1) 89 | weighted_emb = torch.bmm(attn3, embedded) 90 | weighted_emb = weighted_emb.squeeze(1) 91 | 92 | return attn, weighted_emb -------------------------------------------------------------------------------- /lib/layers/loss.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import torch 6 | from 
torch.autograd import Variable 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | import numpy as np 10 | 11 | # Attribut Reconstruction Loss 12 | class AttributeReconstructLoss(nn.Module): 13 | def __init__(self, opt): 14 | super(AttributeReconstructLoss, self).__init__() 15 | self.att_dropout = nn.Dropout(opt['visual_drop_out']) 16 | self.att_fc = nn.Linear(opt['fc7_dim']+opt['pool5_dim'], opt['num_atts']) 17 | 18 | 19 | def forward(self, attribute_feats, total_ann_score, att_labels, select_ixs, att_weights): 20 | """attribute_feats.shape = (sent_num, ann_num, 512), total_ann_score.shape = (sent_num, ann_num)""" 21 | total_ann_score = total_ann_score.unsqueeze(1) 22 | att_feats_fuse = torch.bmm(total_ann_score, attribute_feats) 23 | att_feats_fuse = att_feats_fuse.squeeze(1) 24 | att_feats_fuse = self.att_dropout(att_feats_fuse) 25 | att_scores = self.att_fc(att_feats_fuse) 26 | if len(select_ixs) == 0: 27 | att_loss = 0 28 | else: 29 | att_loss = nn.BCEWithLogitsLoss(att_weights.cuda())(att_scores.index_select(0, select_ixs), 30 | att_labels.index_select(0, select_ixs)) 31 | return att_scores, att_loss 32 | 33 | # Language Reconstruction Loss 34 | class LangReconstructionLoss(nn.Module): 35 | def __init__(self, opt): 36 | super(LangReconstructionLoss, self).__init__() 37 | 38 | self.variable_lengths = opt['variable_lengths'] > 0 39 | self.vocab_size = opt['vocab_size'] 40 | self.word_embedding_size = opt['word_embedding_size'] 41 | self.word_vec_size = opt['word_vec_size'] 42 | self.hidden_size = opt['rnn_hidden_size'] 43 | self.bidirectional = opt['decode_bidirectional'] > 0 44 | self.input_dropout_p = opt['word_drop_out'] 45 | self.dropout_p = opt['rnn_drop_out'] 46 | self.n_layers = opt['rnn_num_layers'] 47 | self.rnn_type = opt['rnn_type'] 48 | self.variable_lengths = opt['variable_lengths'] > 0 49 | 50 | self.embedding = nn.Embedding(self.vocab_size, self.word_embedding_size) 51 | self.input_dropout = nn.Dropout(self.input_dropout_p) 52 | self.mlp = nn.Sequential(nn.Linear(self.word_embedding_size, self.word_vec_size), nn.ReLU()) 53 | self.rnn_type = self.rnn_type 54 | self.rnn = getattr(nn, self.rnn_type.upper())(self.word_vec_size*2, self.hidden_size, self.n_layers, 55 | batch_first=True, bidirectional=self.bidirectional, 56 | dropout=self.dropout_p) 57 | self.num_dirs = 2 if self.bidirectional else 1 58 | 59 | self.fc = nn.Linear(self.num_dirs * self.hidden_size, self.vocab_size) 60 | self.cross_entropy = nn.CrossEntropyLoss(reduce=False) 61 | 62 | def forward(self, vis_att_fuse, enc_labels, dec_labels): 63 | seq_len = enc_labels.size(1) 64 | sent_num = enc_labels.size(0) 65 | label_mask = (dec_labels != 0).float() 66 | 67 | if self.variable_lengths: 68 | input_lengths = (enc_labels != 0).sum(1) 69 | input_lengths_list = input_lengths.data.cpu().numpy().tolist() 70 | sorted_input_lengths_list = np.sort(input_lengths_list)[::-1].tolist() 71 | sort_ixs = np.argsort(input_lengths_list)[::-1].tolist() 72 | s2r = {s: r for r, s in enumerate(sort_ixs)} 73 | recover_ixs = [s2r[s] for s in range(len(input_lengths_list))] 74 | 75 | assert max(input_lengths_list) == enc_labels.size(1) 76 | 77 | sort_ixs = enc_labels.data.new(sort_ixs).long() 78 | recover_ixs = enc_labels.data.new(recover_ixs).long() 79 | 80 | input_labels = enc_labels[sort_ixs] 81 | 82 | vis_att_fuse = vis_att_fuse.unsqueeze(1) 83 | embedded = self.embedding(input_labels) 84 | embedded = self.input_dropout(embedded) 85 | embedded = self.mlp(embedded) 86 | 87 | embedded = torch.cat([embedded, 
torch.cat([vis_att_fuse, torch.zeros(sent_num, seq_len - 1, 88 | self.word_vec_size).cuda()], 1)], 2) 89 | 90 | if self.variable_lengths: 91 | embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_input_lengths_list, batch_first=True) 92 | 93 | output, hidden = self.rnn(embedded) 94 | 95 | # recover 96 | if self.variable_lengths: 97 | # recover rnn 98 | output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True) 99 | output = output[recover_ixs] 100 | 101 | output = output.view(sent_num * seq_len, -1) 102 | output = self.fc(output) 103 | 104 | dec_labels = dec_labels.view(-1) 105 | label_mask = label_mask.view(-1) 106 | 107 | rec_loss = self.cross_entropy(output, dec_labels) 108 | rec_loss = torch.sum(rec_loss * label_mask) / torch.sum(label_mask) 109 | 110 | return rec_loss 111 | 112 | # Language Reconstruction Loss in Adaptive Reconstruction Loss 113 | class AdapLangReconstructLoss(nn.Module): 114 | def __init__(self, opt): 115 | super(AdapLangReconstructLoss, self).__init__() 116 | 117 | self.pool5_dim, self.fc7_dim = opt['pool5_dim'], opt['fc7_dim'] 118 | 119 | self.variable_lengths = opt['variable_lengths'] > 0 120 | self.vocab_size = opt['vocab_size'] 121 | self.word_embedding_size = opt['word_embedding_size'] 122 | self.word_vec_size = opt['word_vec_size'] 123 | self.hidden_size = opt['rnn_hidden_size'] 124 | self.bidirectional = opt['decode_bidirectional'] > 0 125 | self.input_dropout_p = opt['word_drop_out'] 126 | self.dropout_p = opt['rnn_drop_out'] 127 | self.n_layers = opt['rnn_num_layers'] 128 | self.rnn_type = opt['rnn_type'] 129 | self.variable_lengths = opt['variable_lengths'] > 0 130 | 131 | self.embedding = nn.Embedding(self.vocab_size, self.word_embedding_size) 132 | self.input_dropout = nn.Dropout(self.input_dropout_p) 133 | self.mlp = nn.Sequential(nn.Linear(self.word_embedding_size, self.word_vec_size), nn.ReLU()) 134 | self.rnn_type = self.rnn_type 135 | self.rnn = getattr(nn, self.rnn_type.upper())(self.word_vec_size * 2, self.hidden_size, self.n_layers, 136 | batch_first=True, bidirectional=self.bidirectional, 137 | dropout=self.dropout_p) 138 | self.num_dirs = 2 if self.bidirectional else 1 139 | 140 | self.slr_mlp = nn.Sequential(nn.Linear(self.word_vec_size * 3, self.word_vec_size), 141 | nn.ReLU()) 142 | 143 | self.fc = nn.Linear(self.num_dirs * self.hidden_size, self.vocab_size) 144 | 145 | self.cross_entropy = nn.CrossEntropyLoss(reduce=False) 146 | 147 | def forward(self, sub_phrase_emb, loc_phrase_emb, rel_phrase_emb, enc_labels, dec_labels): 148 | """sub_phrase_emb, loc_phrase_emb, rel_phrase_emb.shape = (sent_num, 512), labels.shape = (sent_num, sent_length)""" 149 | slr_embeded = torch.cat([sub_phrase_emb, loc_phrase_emb, rel_phrase_emb], 1) 150 | slr_embeded = self.slr_mlp(slr_embeded) 151 | 152 | seq_len = enc_labels.size(1) 153 | label_mask = (dec_labels != 0).float() 154 | batchsize = enc_labels.size(0) 155 | 156 | if self.variable_lengths: 157 | input_lengths = (enc_labels != 0).sum(1) 158 | input_lengths_list = input_lengths.data.cpu().numpy().tolist() 159 | sorted_input_lengths_list = np.sort(input_lengths_list)[::-1].tolist() 160 | sort_ixs = np.argsort(input_lengths_list)[::-1].tolist() 161 | s2r = {s: r for r, s in enumerate(sort_ixs)} 162 | recover_ixs = [s2r[s] for s in range(len(input_lengths_list))] 163 | 164 | assert max(input_lengths_list) == enc_labels.size(1) 165 | 166 | sort_ixs = enc_labels.data.new(sort_ixs).long() 167 | recover_ixs = enc_labels.data.new(recover_ixs).long() 168 | 169 | input_labels = 
enc_labels[sort_ixs] 170 | 171 | slr_embeded = slr_embeded.view(batchsize, 1, -1) 172 | 173 | embedded = self.embedding(input_labels) 174 | embedded = self.input_dropout(embedded) 175 | embedded = self.mlp(embedded) 176 | 177 | slr_embedded = torch.cat([embedded, torch.cat([slr_embeded, torch.zeros(batchsize, seq_len - 1, 178 | self.word_embedding_size).cuda()], 1)], 2) 179 | 180 | if self.variable_lengths: 181 | slr_embedded = nn.utils.rnn.pack_padded_sequence(slr_embedded, sorted_input_lengths_list, batch_first=True) 182 | 183 | output, hidden = self.rnn(slr_embedded) 184 | 185 | # recover 186 | if self.variable_lengths: 187 | # recover rnn 188 | output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True) 189 | output = output[recover_ixs] 190 | 191 | output = output.view(batchsize * seq_len, -1) 192 | output = self.fc(output) 193 | 194 | dec_labels = dec_labels.view(-1) 195 | label_mask = label_mask.view(-1) 196 | 197 | lang_rec_loss = self.cross_entropy(output, dec_labels) 198 | lang_rec_loss = torch.sum(lang_rec_loss * label_mask) / torch.sum(label_mask) 199 | 200 | return lang_rec_loss 201 | 202 | # Visual Reconstruction Loss in Adaptive Reconstruction Loss 203 | class AdapVisualReconstructLoss(nn.Module): 204 | def __init__(self, opt): 205 | super(AdapVisualReconstructLoss, self).__init__() 206 | 207 | def forward(self, sub_phrase_emb, sub_phrase_recons, loc_phrase_emb, loc_phrase_recons, rel_phrase_emb, 208 | rel_phrase_recons, weights): 209 | """ 210 | (sub_phrase_emb, sub_phrase_recons, loc_phrase_emb, loc_phrase_recons, rel_phrase_emb, rel_phrase_recons).shape=(sent_num, 512) 211 | weights.shape = (sent_num, 3) 212 | """ 213 | sub_loss = self.mse_loss(sub_phrase_recons, sub_phrase_emb).sum(1).unsqueeze(1) 214 | loc_loss = self.mse_loss(loc_phrase_recons, loc_phrase_emb).sum(1).unsqueeze(1) 215 | rel_loss = self.mse_loss(rel_phrase_recons, rel_phrase_emb).sum(1).unsqueeze(1) 216 | 217 | total_loss = (weights * torch.cat([sub_loss, loc_loss, rel_loss], 1)).sum(1).mean(0) 218 | 219 | return total_loss 220 | 221 | def mse_loss(self, recons, emb): 222 | return (recons-emb)**2 223 | -------------------------------------------------------------------------------- /lib/layers/match.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from layers.lang_encoder import RNNEncoder, PhraseAttention 9 | from layers.visual_encoder import LocationEncoder, SubjectEncoder, RelationEncoder 10 | from layers.lang_decoder import LocationDecoder, SubjectDecoder, RelationDecoder 11 | from layers.loss import AttributeReconstructLoss, LangReconstructionLoss, AdapVisualReconstructLoss, AdapLangReconstructLoss 12 | 13 | class Score(nn.Module): 14 | def __init__(self, vis_dim, lang_dim, jemb_dim): 15 | super(Score, self).__init__() 16 | 17 | self.feat_fuse = nn.Sequential(nn.Linear(vis_dim+lang_dim, jemb_dim), 18 | nn.ReLU(), 19 | nn.Linear(jemb_dim, 1)) 20 | self.softmax = nn.Softmax(dim=1) 21 | self.lang_dim = lang_dim 22 | self.vis_dim = vis_dim 23 | 24 | def forward(self, visual_input, lang_input): 25 | 26 | sent_num, ann_num = visual_input.size(0), visual_input.size(1) 27 | 28 | lang_input = lang_input.unsqueeze(1).expand(sent_num, ann_num, self.lang_dim) 29 | lang_input = nn.functional.normalize(lang_input, p=2, dim=2) 30 | 31 | ann_attn = 
self.feat_fuse(torch.cat([visual_input, lang_input], 2)) 32 | 33 | ann_attn = self.softmax(ann_attn.view(sent_num, ann_num)) 34 | ann_attn = ann_attn.unsqueeze(2) 35 | 36 | return ann_attn 37 | 38 | 39 | class RelationScore(nn.Module): 40 | def __init__(self, vis_dim, lang_dim, jemb_dim): 41 | super(RelationScore, self).__init__() 42 | 43 | self.feat_fuse = nn.Sequential(nn.Linear(vis_dim+lang_dim, jemb_dim), 44 | nn.ReLU(), 45 | nn.Linear(jemb_dim, 1)) 46 | self.softmax = nn.Softmax(dim=1) 47 | self.lang_dim = lang_dim 48 | self.vis_dim = vis_dim 49 | 50 | def forward(self, visual_input, lang_input, masks): 51 | 52 | sent_num, ann_num, cxt_num = visual_input.size(0), visual_input.size(1), visual_input.size(2) 53 | 54 | visual_input = visual_input.view(sent_num, ann_num*cxt_num, -1) 55 | visual_emb_normalized = nn.functional.normalize(visual_input, p=2, dim=2) 56 | lang_input = lang_input.unsqueeze(1).expand(sent_num, ann_num, self.lang_dim).contiguous() 57 | lang_input = lang_input.unsqueeze(2).expand(sent_num, ann_num, cxt_num, self.lang_dim).contiguous() 58 | lang_input = lang_input.reshape(sent_num, ann_num*cxt_num, -1) 59 | lang_emb_normalized = nn.functional.normalize(lang_input, p=2, dim=2) 60 | 61 | 62 | ann_attn = self.feat_fuse(torch.cat([visual_emb_normalized, lang_emb_normalized], 2)) 63 | 64 | ann_attn = ann_attn.squeeze(2).contiguous() 65 | ann_attn = ann_attn.view(sent_num, ann_num, -1) 66 | 67 | ann_attn = masks * ann_attn 68 | ann_attn, ixs = torch.max(ann_attn, 2) 69 | ann_attn = self.softmax(ann_attn) 70 | ann_attn = ann_attn.unsqueeze(2) 71 | 72 | return ann_attn, ixs 73 | 74 | class AdaptiveReconstruct(nn.Module): 75 | def __init__(self, opt): 76 | super(AdaptiveReconstruct, self).__init__() 77 | num_layers = opt['rnn_num_layers'] 78 | hidden_size = opt['rnn_hidden_size'] 79 | num_dirs = 2 if opt['bidirectional'] > 0 else 1 80 | self.word_vec_size = opt['word_vec_size'] 81 | self.pool5_dim, self.fc7_dim = opt['pool5_dim'], opt['fc7_dim'] 82 | 83 | self.lang_res_weight = opt['lang_res_weight'] 84 | self.vis_res_weight = opt['vis_res_weight'] 85 | self.att_res_weight = opt['att_res_weight'] 86 | self.loss_combined = opt['loss_combined'] 87 | self.loss_divided = opt['loss_divided'] 88 | 89 | # language rnn encoder 90 | self.rnn_encoder = RNNEncoder(vocab_size=opt['vocab_size'], 91 | word_embedding_size=opt['word_embedding_size'], 92 | word_vec_size=opt['word_vec_size'], 93 | hidden_size=opt['rnn_hidden_size'], 94 | bidirectional=opt['bidirectional']>0, 95 | input_dropout_p=opt['word_drop_out'], 96 | dropout_p=opt['rnn_drop_out'], 97 | n_layers=opt['rnn_num_layers'], 98 | rnn_type=opt['rnn_type'], 99 | variable_lengths=opt['variable_lengths'] > 0) 100 | 101 | 102 | self.weight_fc = nn.Linear(num_layers * num_dirs *hidden_size, 3) 103 | 104 | self.sub_attn = PhraseAttention(hidden_size * num_dirs) 105 | self.loc_attn = PhraseAttention(hidden_size * num_dirs) 106 | self.rel_attn = PhraseAttention(hidden_size * num_dirs) 107 | 108 | self.sub_encoder = SubjectEncoder(opt) 109 | self.loc_encoder = LocationEncoder(opt) 110 | self.rel_encoder = RelationEncoder(opt) 111 | 112 | self.sub_score = Score(self.pool5_dim+self.fc7_dim, opt['word_vec_size'], 113 | opt['jemb_dim']) 114 | self.loc_score = Score(25+5, opt['word_vec_size'], 115 | opt['jemb_dim']) 116 | self.rel_score = RelationScore(self.fc7_dim+5, opt['word_vec_size'], 117 | opt['jemb_dim']) 118 | 119 | self.sub_decoder = SubjectDecoder(opt) 120 | self.loc_decoder = LocationDecoder(opt) 121 | self.rel_decoder = 
RelationDecoder(opt) 122 | 123 | self.att_res_loss = AttributeReconstructLoss(opt) 124 | self.vis_res_loss = AdapVisualReconstructLoss(opt) 125 | self.lang_res_loss = AdapLangReconstructLoss(opt) 126 | self.rec_loss = LangReconstructionLoss(opt) 127 | 128 | self.sub_mlp = nn.Sequential(nn.Linear(opt['jemb_dim'], self.pool5_dim+self.fc7_dim)) 129 | self.loc_mlp = nn.Sequential(nn.Linear(opt['jemb_dim'], 25+5)) 130 | self.rel_mlp = nn.Sequential(nn.Linear(opt['jemb_dim'], self.fc7_dim+5)) 131 | 132 | self.feat_fuse = nn.Sequential( 133 | nn.Linear(self.fc7_dim + self.pool5_dim + 25 + 5 + self.fc7_dim + 5, opt['jemb_dim']), 134 | nn.ReLU()) 135 | 136 | def forward(self, pool5, fc7, lfeats, dif_lfeats, cxt_fc7, cxt_lfeats, labels, enc_labels, dec_labels, att_labels, select_ixs, att_weights): 137 | 138 | context, hidden, embedded = self.rnn_encoder(labels) 139 | 140 | weights = F.softmax(self.weight_fc(hidden)) 141 | sub_attn, sub_phrase_emb = self.sub_attn(context, embedded, labels) 142 | loc_attn, loc_phrase_emb = self.loc_attn(context, embedded, labels) 143 | rel_attn, rel_phrase_emb = self.rel_attn(context, embedded, labels) 144 | 145 | sent_num = pool5.size(0) 146 | ann_num = pool5.size(1) 147 | 148 | # subject matching score 149 | sub_feats = self.sub_encoder(pool5, fc7, sub_phrase_emb) 150 | sub_ann_attn = self.sub_score(sub_feats, sub_phrase_emb) 151 | 152 | # location matching score 153 | loc_feats = self.loc_encoder(lfeats, dif_lfeats) 154 | loc_ann_attn = self.loc_score(loc_feats, loc_phrase_emb) 155 | 156 | # relation matching score 157 | rel_feats, masks = self.rel_encoder(cxt_fc7, cxt_lfeats) 158 | rel_ann_attn, rel_ixs = self.rel_score(rel_feats, rel_phrase_emb, masks) 159 | 160 | weights_expand = weights.unsqueeze(1).expand(sent_num, ann_num, 3) 161 | total_ann_score = (weights_expand * torch.cat([sub_ann_attn, loc_ann_attn, rel_ann_attn], 2)).sum(2) 162 | 163 | loss = 0 164 | att_res_loss = 0 165 | lang_res_loss = 0 166 | vis_res_loss = 0 167 | 168 | # divided_loss 169 | sub_phrase_recons = self.sub_decoder(sub_feats, total_ann_score) 170 | loc_phrase_recons = self.loc_decoder(loc_feats, total_ann_score) 171 | rel_phrase_recons = self.rel_decoder(rel_feats, total_ann_score, rel_ixs) 172 | 173 | if self.vis_res_weight > 0: 174 | vis_res_loss = self.vis_res_loss(sub_phrase_emb, sub_phrase_recons, loc_phrase_emb, 175 | loc_phrase_recons, rel_phrase_emb, rel_phrase_recons, weights) 176 | loss = self.vis_res_weight * vis_res_loss 177 | 178 | if self.lang_res_weight > 0: 179 | lang_res_loss = self.lang_res_loss(sub_phrase_emb, loc_phrase_emb, rel_phrase_emb, enc_labels, 180 | dec_labels) 181 | 182 | loss += self.lang_res_weight * lang_res_loss 183 | 184 | 185 | # combined_loss 186 | loss += self.loss_divided*loss 187 | 188 | ann_score = total_ann_score.unsqueeze(1) 189 | 190 | ixs = rel_ixs.view(sent_num, ann_num, 1).unsqueeze(3).expand(sent_num, ann_num, 1, self.fc7_dim + 5) 191 | rel_feats_max = torch.gather(rel_feats, 2, ixs) 192 | rel_feats_max = rel_feats_max.squeeze(2) 193 | 194 | fuse_feats = torch.cat([sub_feats, loc_feats, rel_feats_max], 2) 195 | fuse_feats = torch.bmm(ann_score, fuse_feats) 196 | fuse_feats = fuse_feats.squeeze(1) 197 | fuse_feats = self.feat_fuse(fuse_feats) 198 | rec_loss = self.rec_loss(fuse_feats, enc_labels, dec_labels) 199 | loss += self.loss_combined * rec_loss 200 | 201 | if self.att_res_weight > 0: 202 | att_scores, att_res_loss = self.att_res_loss(sub_feats, total_ann_score, att_labels, select_ixs, att_weights) 203 | loss += self.att_res_weight * 
att_res_loss 204 | 205 | return total_ann_score, loss, rel_ixs, sub_attn, loc_attn, rel_attn, weights, \ 206 | vis_res_loss, att_res_loss, lang_res_loss 207 | 208 | 209 | 210 | 211 | 212 | -------------------------------------------------------------------------------- /lib/layers/visual_encoder.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import torch 6 | from torch.autograd import Variable 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | 11 | class Normalize_Scale(nn.Module): 12 | def __init__(self, dim, init_norm=20): 13 | super(Normalize_Scale, self).__init__() 14 | self.init_norm = init_norm 15 | self.weight = nn.Parameter(torch.ones(1, dim) * init_norm) 16 | 17 | def forward(self, bottom): 18 | assert isinstance(bottom, Variable), 'bottom must be variable' 19 | 20 | bottom_normalized = nn.functional.normalize(bottom, p=2, dim=1) 21 | bottom_normalized_scaled = bottom_normalized * self.weight 22 | return bottom_normalized_scaled 23 | 24 | class LocationEncoder(nn.Module): 25 | def __init__(self, opt): 26 | super(LocationEncoder, self).__init__() 27 | init_norm = opt.get('visual_init_norm', 20) 28 | self.lfeats_normalizer = Normalize_Scale(5, init_norm) 29 | self.dif_lfeat_normalizer = Normalize_Scale(25, init_norm) 30 | 31 | def forward(self, lfeats, dif_lfeats): 32 | sent_num, ann_num = lfeats.size(0), lfeats.size(1) 33 | output = torch.cat([self.lfeats_normalizer(lfeats.contiguous().view(-1, 5)), 34 | self.dif_lfeat_normalizer(dif_lfeats.contiguous().view(-1, 25))], 1) 35 | output = output.view(sent_num, ann_num, 5+25) 36 | 37 | return output 38 | 39 | class SubjectEncoder(nn.Module): 40 | def __init__(self, opt): 41 | super(SubjectEncoder, self).__init__() 42 | self.word_vec_size = opt['word_vec_size'] 43 | self.jemb_dim = opt['jemb_dim'] 44 | self.pool5_dim, self.fc7_dim = opt['pool5_dim'], opt['fc7_dim'] 45 | 46 | self.pool5_normalizer = Normalize_Scale(opt['pool5_dim'], opt['visual_init_norm']) 47 | self.fc7_normalizer = Normalize_Scale(opt['fc7_dim'], opt['visual_init_norm']) 48 | 49 | def forward(self, pool5, fc7, phrase_emb): 50 | sent_num, ann_num, grids = pool5.size(0), pool5.size(1), pool5.size(3)*pool5.size(4) 51 | batch = sent_num * ann_num 52 | 53 | pool5 = pool5.contiguous().view(batch, self.pool5_dim, -1) 54 | pool5 = pool5.transpose(1,2).contiguous().view(-1, self.pool5_dim) 55 | pool5 = self.pool5_normalizer(pool5) 56 | pool5 = pool5.view(sent_num, ann_num, 49, -1).transpose(2, 3).contiguous().mean(3) 57 | 58 | fc7 = fc7.contiguous().view(batch, self.fc7_dim, -1) 59 | fc7 = fc7.transpose(1, 2).contiguous().view(-1, self.fc7_dim) 60 | fc7 = self.fc7_normalizer(fc7) 61 | fc7 = fc7.view(sent_num, ann_num, 49, -1).transpose(2, 3).contiguous().mean(3) 62 | 63 | avg_att_feats = torch.cat([pool5, fc7], 2) 64 | 65 | return avg_att_feats 66 | 67 | class RelationEncoder(nn.Module): 68 | def __init__(self, opt): 69 | super(RelationEncoder, self).__init__() 70 | self.vis_feat_normalizer = Normalize_Scale(opt['fc7_dim'], opt['visual_init_norm']) 71 | self.lfeat_normalizer = Normalize_Scale(5, opt['visual_init_norm']) 72 | 73 | def forward(self, cxt_feats, cxt_lfeats): 74 | masks = (cxt_lfeats.sum(3) != 0).float() 75 | 76 | sent_num, ann_num = cxt_feats.size(0), cxt_feats.size(1) 77 | batch, num_cxt = sent_num*ann_num, cxt_feats.size(2) 78 | cxt_feats = 
self.vis_feat_normalizer(cxt_feats.contiguous().view(batch * num_cxt, -1)) 79 | cxt_lfeats = self.lfeat_normalizer(cxt_lfeats.contiguous().view(batch * num_cxt, -1)) 80 | 81 | rel_feats = torch.cat([cxt_feats, cxt_lfeats], 1) 82 | 83 | rel_feats = rel_feats.view(sent_num, ann_num, num_cxt, -1) 84 | return rel_feats, masks -------------------------------------------------------------------------------- /lib/loaders/dataloader.py: -------------------------------------------------------------------------------- 1 | """ 2 | data_json has 3 | 0. refs: [{ref_id, ann_id, box, image_id, split, category_id, sent_ids, att_wds}] 4 | 1. images: [{image_id, ref_ids, file_name, width, height, h5_id}] 5 | 2. anns: [{ann_id, category_id, image_id, box, h5_id}] 6 | 3. sentences: [{sent_id, tokens, h5_id}] 7 | 4. word_to_ix: {word: ix} 8 | 5. att_to_ix : {att_wd: ix} 9 | 6. att_to_cnt: {att_wd: cnt} 10 | 7. label_length: L 11 | 12 | Note, box in [xywh] format 13 | label_h5 has 14 | /labels is (M, max_length) uint32 array of encoded labels, zeros padded 15 | """ 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os.path as osp 22 | import numpy as np 23 | import h5py 24 | import random 25 | from loaders.loader import Loader 26 | 27 | import torch 28 | from torch.autograd import Variable 29 | 30 | from mrcn import inference_no_imdb 31 | import functools 32 | 33 | # box functions 34 | def xywh_to_xyxy(boxes): 35 | """Convert [x y w h] box format to [x1 y1 x2 y2] format.""" 36 | return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1)) 37 | 38 | def xyxy_to_xywh(boxes): 39 | """Convert [x1 y1 x2 y2] box format to [x y w h] format.""" 40 | return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1)) 41 | 42 | class DataLoader(Loader): 43 | 44 | def __init__(self, data_json, data_h5): 45 | # parent loader instance 46 | Loader.__init__(self, data_json, data_h5) 47 | 48 | # prepare attributes 49 | self.att_to_ix = self.info['att_to_ix'] 50 | self.ix_to_att = {ix: wd for wd, ix in self.att_to_ix.items()} 51 | self.num_atts = len(self.att_to_ix) 52 | self.att_to_cnt = self.info['att_to_cnt'] 53 | 54 | # img_iterators for each split 55 | self.split_ix = {} 56 | self.iterators = {} 57 | for image_id, image in self.Images.items(): 58 | # we use its ref's split (there is assumption that each image only has one split) 59 | split = self.Refs[image['ref_ids'][0]]['split'] 60 | if split not in self.split_ix: 61 | self.split_ix[split] = [] 62 | self.iterators[split] = 0 63 | self.split_ix[split] += [image_id] 64 | for k, v in self.split_ix.items(): 65 | print('assigned %d images to split %s' %(len(v), k)) 66 | 67 | def prepare_mrcn(self, head_feats_dir, args): 68 | """ 69 | Arguments: 70 | head_feats_dir: cache/feats/dataset_splitBy/net_imdb_tag, containing all image conv_net feats 71 | args: imdb_name, net_name, iters, tag 72 | """ 73 | self.head_feats_dir = head_feats_dir 74 | self.mrcn = inference_no_imdb.Inference(args) 75 | assert args.net_name == 'res101' 76 | self.pool5_dim = 1024 77 | self.fc7_dim = 2048 78 | 79 | # load different kinds of feats 80 | def loadFeats(self, Feats): 81 | # Feats = {feats_name: feats_path} 82 | self.feats = {} 83 | self.feat_dim = None 84 | for feats_name, feats_path in Feats.items(): 85 | if osp.isfile(feats_path): 86 | self.feats[feats_name] = h5py.File(feats_path, 'r') 87 | self.feat_dim = self.feats[feats_name]['fc7'].shape[1] 88 | assert self.feat_dim == self.fc7_dim 89 | 
print('FeatLoader loading [%s] from %s [feat_dim %s]' %(feats_name, feats_path, self.feat_dim)) 90 | 91 | # shuffle split 92 | def shuffle(self, split): 93 | random.shuffle(self.split_ix[split]) 94 | 95 | # reset iterator 96 | def resetIterator(self, split): 97 | self.iterators[split]=0 98 | 99 | # expand list by seq per ref, i.e., [a,b], 3 -> [aaabbb] 100 | def expand_list(self, L, n): 101 | out = [] 102 | for l in L: 103 | out += [l] * n 104 | return out 105 | 106 | def image_to_head(self, image_id): 107 | """Returns 108 | head: float32 (1, 1024, H, W) 109 | im_info: float32 [[im_h, im_w, im_scale]] 110 | """ 111 | feats_h5 = osp.join(self.head_feats_dir, str(image_id)+'.h5') 112 | feats = h5py.File(feats_h5, 'r') 113 | head, im_info = feats['head'], feats['im_info'] 114 | return np.array(head), np.array(im_info) 115 | 116 | def fetch_sent_ids_by_ref_id(self, ref_id, num_sents): 117 | """ 118 | Sample #num_sents sents for each ref_id. 119 | """ 120 | sent_ids = list(self.Refs[ref_id]['sent_ids']) 121 | if len(sent_ids) < num_sents: 122 | append_sent_ids = [random.choice(sent_ids) for _ in range(num_sents-len(sent_ids))] 123 | sent_ids += append_sent_ids 124 | else: 125 | random.shuffle(sent_ids) 126 | sent_ids = sent_ids[:num_sents] 127 | assert len(sent_ids) == num_sents 128 | return sent_ids 129 | 130 | def fetch_neighbour_ids(self, ann_id): 131 | """ 132 | For a given ann_id, we return 133 | - st_ann_ids: same-type neighbouring ann_ids (not include itself) 134 | - dt_ann_ids: different-type neighbouring ann_ids 135 | Ordered by distance to the input ann_id 136 | """ 137 | ann = self.Anns[ann_id] 138 | x,y,w,h = ann['box'] 139 | rx, ry = x+w/2, y+h/2 140 | 141 | @functools.cmp_to_key 142 | def compare(ann_id0, ann_id1): 143 | x,y,w,h = self.Anns[ann_id0]['box'] 144 | ax0, ay0 = x+w/2, y+h/2 145 | x,y,w,h = self.Anns[ann_id1]['box'] 146 | ax1, ay1 = x+w/2, y+h/2 147 | # closer to farmer 148 | if (rx-ax0)**2+(ry-ay0)**2 <= (rx-ax1)**2+(ry-ay1)**2: 149 | return -1 150 | else: 151 | return 1 152 | 153 | image = self.Images[ann['image_id']] 154 | 155 | ann_ids = list(image['ann_ids']) 156 | ann_ids = sorted(ann_ids, key=compare) 157 | 158 | st_ann_ids, dt_ann_ids = [], [] 159 | for ann_id_else in ann_ids: 160 | if ann_id_else != ann_id: 161 | if self.Anns[ann_id_else]['category_id'] == ann['category_id']: 162 | st_ann_ids += [ann_id_else] 163 | else: 164 | dt_ann_ids +=[ann_id_else] 165 | return st_ann_ids, dt_ann_ids 166 | 167 | def fetch_grid_feats(self, boxes, net_conv, im_info): 168 | """returns -pool5 (n, 1024, 7, 7) -fc7 (n, 2048, 7, 7)""" 169 | pool5, fc7 = self.mrcn.box_to_spatial_fc7(net_conv, im_info, boxes) 170 | return pool5, fc7 171 | 172 | def compute_lfeats(self, ann_ids): 173 | # return ndarray float32 (#ann_ids, 5) 174 | lfeats = np.empty((len(ann_ids), 5), dtype=np.float32) 175 | for ix, ann_id in enumerate(ann_ids): 176 | ann = self.Anns[ann_id] 177 | image = self.Images[ann['image_id']] 178 | x, y ,w, h = ann['box'] 179 | ih, iw = image['height'], image['width'] 180 | lfeats[ix] = np.array([x/iw, y/ih, (x+w-1)/iw, (y+h-1)/ih, w*h/(iw*ih)],np.float32) 181 | return lfeats 182 | 183 | def compute_dif_lfeats(self, ann_ids, topK=5): 184 | # return ndarray float32 (#ann_ids, 5*topK) 185 | dif_lfeats = np.zeros((len(ann_ids), 5*topK), dtype=np.float32) 186 | for i, ann_id in enumerate(ann_ids): 187 | # reference box 188 | rbox = self.Anns[ann_id]['box'] 189 | rcx,rcy,rw,rh = rbox[0]+rbox[2]/2,rbox[1]+rbox[3]/2,rbox[2],rbox[3] 190 | st_ann_ids, _ 
=self.fetch_neighbour_ids(ann_id) 191 | # candidate box 192 | for j, cand_ann_id in enumerate(st_ann_ids[:topK]): 193 | cbox = self.Anns[cand_ann_id]['box'] 194 | cx1, cy1, cw, ch = cbox[0], cbox[1], cbox[2], cbox[3] 195 | dif_lfeats[i, j*5:(j+1)*5] = np.array([(cx1-rcx)/rw, (cy1-rcy)/rh, (cx1+cw-rcx)/rw, (cy1+ch-rcy)/rh, cw*ch/(rw*rh)]) 196 | return dif_lfeats 197 | 198 | def fetch_cxt_feats(self, ann_ids, opt): 199 | """ 200 | Return 201 | - cxt_feats : ndarray (#ann_ids, topK, fc7_dim) 202 | - cxt_lfeats: ndarray (#ann_ids, topK, 5) 203 | - cxt_ann_ids: [[ann_id]] of size (#ann_ids, topK), padded with -1 204 | Note we only use neighbouring "different" (+ "same") objects for computing context objects, zeros padded. 205 | """ 206 | topK = opt['num_cxt'] 207 | cxt_feats = np.zeros((len(ann_ids), topK, self.fc7_dim), dtype=np.float32) 208 | cxt_lfeats = np.zeros((len(ann_ids), topK, 5), dtype=np.float32) 209 | cxt_ann_ids = [[-1 for _ in range(topK)] for _ in range(len(ann_ids))] 210 | for i, ann_id in enumerate(ann_ids): 211 | # reference box 212 | rbox = self.Anns[ann_id]['box'] 213 | rcx, rcy, rw, rh = rbox[0]+rbox[2]/2, rbox[1]+rbox[3]/2, rbox[2], rbox[3] 214 | # candidate boxes 215 | st_ann_ids, dt_ann_ids = self.fetch_neighbour_ids(ann_id) 216 | if opt['with_st'] > 0: 217 | cand_ann_ids = dt_ann_ids+st_ann_ids 218 | else: 219 | cand_ann_ids = dt_ann_ids 220 | cand_ann_ids = cand_ann_ids[:topK] 221 | for j, cand_ann_id in enumerate(cand_ann_ids): 222 | cand_ann = self.Anns[cand_ann_id] 223 | cbox = cand_ann['box'] 224 | cx1, cy1, cw, ch = cbox[0], cbox[1], cbox[2], cbox[3] 225 | cxt_lfeats[i,j,:] = np.array([(cx1-rcx)/rw, (cy1-rcy)/rh, (cx1+cw-rcx)/rw, (cy1+ch-rcy)/rh, cw*ch/(rw*rh)]) 226 | cxt_feats[i,j,:] = self.feats['ann']['fc7'][cand_ann['h5_id'], :] 227 | cxt_ann_ids[i][j] = cand_ann_id 228 | return cxt_feats, cxt_lfeats, cxt_ann_ids 229 | 230 | def fetch_attribute_label(self, ref_ids): 231 | """Return 232 | - labels : Variable float (N, num_atts) 233 | - select_ixs: Variable long (n, ) 234 | """ 235 | labels = np.zeros((len(ref_ids), self.num_atts)) 236 | select_ixs = [] 237 | for i, ref_id in enumerate(ref_ids): 238 | ref = self.Refs[ref_id] 239 | if len(ref['att_wds']) > 0: 240 | select_ixs += [i] 241 | for wd in ref['att_wds']: 242 | labels[i, self.att_to_ix[wd]] = 1 243 | 244 | return Variable(torch.from_numpy(labels).float().cuda()), Variable(torch.LongTensor(select_ixs).cuda()) 245 | 246 | 247 | def extract_ann_features(self, image_id, opt): 248 | """Get features for all ann_ids in an image""" 249 | image = self.Images[image_id] 250 | ann_ids = image['ann_ids'] 251 | 252 | # fetch image features 253 | head, im_info = self.image_to_head(image_id) 254 | head = Variable(torch.from_numpy(head).cuda()) 255 | 256 | # fetch ann features 257 | ann_boxes = xywh_to_xyxy(np.vstack([self.Anns[ann_id]['box'] for ann_id in ann_ids])) 258 | ann_pool5, ann_fc7 = self.fetch_grid_feats(ann_boxes, head, im_info) 259 | 260 | # absolute location features 261 | lfeats = self.compute_lfeats(ann_ids) 262 | lfeats = Variable(torch.from_numpy(lfeats).cuda()) 263 | 264 | # relative location features 265 | dif_lfeats = self.compute_dif_lfeats(ann_ids) 266 | dif_lfeats = Variable(torch.from_numpy(dif_lfeats).cuda()) 267 | 268 | # fetch context_fc7 and context_lfeats 269 | cxt_fc7, cxt_lfeats, cxt_ann_ids = self.fetch_cxt_feats(ann_ids, opt) 270 | cxt_fc7 = Variable(torch.from_numpy(cxt_fc7).cuda()) 271 | cxt_lfeats = Variable(torch.from_numpy(cxt_lfeats).cuda()) 272 | 273 | return cxt_ann_ids, 
ann_fc7, ann_pool5, lfeats, dif_lfeats, cxt_fc7, cxt_lfeats 274 | 275 | 276 | # get batch of data 277 | def getBatch(self, split, opt): 278 | split_ix = self.split_ix[split] 279 | max_index = len(split_ix) - 1 280 | wrapped = False 281 | TopK = opt['num_cxt'] 282 | 283 | # each batch contains one image 284 | ri = self.iterators[split] 285 | ri_next = ri+1 286 | if ri_next > max_index: 287 | ri_next = 0 288 | wrapped = True 289 | self.iterators[split] = ri_next 290 | image_id = split_ix[ri] 291 | 292 | # fetch feats 293 | cxt_ann_ids, ann_fc7, ann_pool5, lfeats, dif_lfeats, cxt_fc7, cxt_lfeats = self.extract_ann_features(image_id, opt) 294 | ann_ids = self.Images[image_id]['ann_ids'] 295 | ann_num = len(ann_ids) 296 | ref_ids = self.Images[image_id]['ref_ids'] 297 | 298 | img_ref_ids = [] 299 | img_sent_ids = [] 300 | gd_ixs = [] 301 | gd_boxes = [] 302 | for ref_id in ref_ids: 303 | ref = self.Refs[ref_id] 304 | for sent_id in ref['sent_ids']: 305 | img_ref_ids += [ref_id] 306 | img_sent_ids += [sent_id] 307 | gd_ixs += [ann_ids.index(ref['ann_id'])] 308 | gd_boxes += [ref['box']] 309 | img_sent_num = len(img_sent_ids) 310 | 311 | pool5 = ann_pool5.unsqueeze(0).expand(img_sent_num, ann_num, self.pool5_dim, 7, 7) 312 | pool5.detach() 313 | fc7 = ann_fc7.unsqueeze(0).expand(img_sent_num, ann_num, self.fc7_dim, 7, 7) 314 | fc7.detach() 315 | lfeats = lfeats.unsqueeze(0).expand(img_sent_num, ann_num, 5) 316 | dif_lfeats = dif_lfeats.unsqueeze(0).expand(img_sent_num, ann_num, TopK*5) 317 | cxt_fc7 = cxt_fc7.unsqueeze(0).expand(img_sent_num, ann_num, TopK, self.fc7_dim) 318 | cxt_lfeats = cxt_lfeats.unsqueeze(0).expand(img_sent_num, ann_num, TopK, 5) 319 | 320 | 321 | att_labels, select_ixs = self.fetch_attribute_label(img_ref_ids) 322 | 323 | cxt_ann_ids = [cxt_ann_ids for j in range(img_sent_num)] 324 | 325 | 326 | labels = np.vstack([self.fetch_seq(sent_id) for sent_id in img_sent_ids]) 327 | labels = Variable(torch.from_numpy(labels).long().cuda()) 328 | max_len = (labels!=0).sum(1).max().data[0] 329 | labels = labels[:, :max_len] 330 | 331 | start_words = np.ones([labels.size(0), 1], dtype=int)*(self.word_to_ix['']) 332 | start_words = Variable(torch.from_numpy(start_words).long().cuda()) 333 | enc_labels = labels.clone() 334 | enc_labels = torch.cat([start_words, enc_labels], 1) 335 | 336 | zero_pad = np.zeros([labels.size(0), 1], dtype=int) 337 | zero_pad = Variable(torch.from_numpy(zero_pad).long().cuda()) 338 | dec_labels = labels.clone() 339 | dec_labels = torch.cat([dec_labels, zero_pad], 1) 340 | 341 | data = {} 342 | data['labels'] = labels 343 | data['enc_labels'] = enc_labels 344 | data['dec_labels'] = dec_labels 345 | data['ref_ids'] = ref_ids 346 | data['sent_ids'] = img_sent_ids 347 | data['gd_ixs'] = gd_ixs 348 | data['gd_boxes'] = gd_boxes 349 | data['cxt_ann_ids'] = cxt_ann_ids 350 | data['Feats'] = {'fc7': fc7, 'pool5': pool5, 'lfeats': lfeats, 'dif_lfeats': dif_lfeats, 351 | 'cxt_fc7': cxt_fc7, 'cxt_lfeats': cxt_lfeats} 352 | data['att_labels'] = att_labels 353 | data['select_ixs'] = select_ixs 354 | data['bounds'] = {'it_pos_now': self.iterators[split], 'it_max': max_index, 'wrapped': wrapped} 355 | return data 356 | 357 | def get_attribute_weights(self, scale = 10): 358 | # weights = \lamda * 1/sqrt(cnt) 359 | cnts = [self.att_to_cnt[self.ix_to_att[ix]] for ix in range(self.num_atts)] 360 | cnts = np.array(cnts) 361 | weights = 1 / cnts ** 0.5 362 | weights = (weights-np.min(weights))/(np.max(weights)-np.min(weights)) 363 | weights = weights * (scale - 1) + 1 364 | 
return torch.from_numpy(weights).float() 365 | 366 | def decode_attribute_label(self, scores): 367 | """- scores: Variable (cuda) (n, num_atts) after sigmoid range [0, 1] 368 | - labels:list of [[att, sc], [att, sc], ... 369 | """ 370 | scores = scores.data.cpu().numpy() 371 | N = scores.shape[0] 372 | labels = [] 373 | for i in range(N): 374 | label = [] 375 | score = scores[i] 376 | for j, sc in enumerate(list(score)): 377 | label += [(self.ix_to_att[j], sc)] 378 | labels.append(label) 379 | return labels 380 | 381 | def getTestBatch(self, split, opt): 382 | 383 | wrapped = False 384 | split_ix = self.split_ix[split] 385 | max_index = len(split_ix) - 1 386 | ri = self.iterators[split] 387 | ri_next = ri + 1 388 | if ri_next > max_index: 389 | ri_next = 0 390 | wrapped = True 391 | self.iterators[split] = ri_next 392 | image_id = split_ix[ri] 393 | image = self.Images[image_id] 394 | ann_ids = image['ann_ids'] 395 | cxt_ann_ids, ann_fc7, ann_pool5, lfeats, dif_lfeats, cxt_fc7, cxt_lfeats = self.extract_ann_features(image_id, opt) 396 | sent_ids = [] 397 | gd_ixs = [] 398 | gd_boxes = [] 399 | att_refs = [] 400 | for ref_id in image['ref_ids']: 401 | ref = self.Refs[ref_id] 402 | for sent_id in ref['sent_ids']: 403 | sent_ids += [sent_id] 404 | gd_ixs += [ann_ids.index(ref['ann_id'])] 405 | gd_boxes += [ref['box']] 406 | att_refs += [ref_id] 407 | 408 | labels = np.vstack([self.fetch_seq(sent_id) for sent_id in sent_ids]) 409 | labels = Variable(torch.from_numpy(labels).long().cuda()) 410 | max_len = (labels!=0).sum(1).max().data[0] 411 | labels = labels[:, :max_len] 412 | 413 | start_words = np.ones([labels.size(0), 1], dtype=int)*(self.word_to_ix['']) 414 | start_words = Variable(torch.from_numpy(start_words).long().cuda()) 415 | enc_labels = labels.clone() 416 | enc_labels = torch.cat([start_words, enc_labels], 1) 417 | 418 | zero_pad = np.zeros([labels.size(0), 1], dtype=int) 419 | zero_pad = Variable(torch.from_numpy(zero_pad).long().cuda()) 420 | dec_labels = labels.clone() 421 | dec_labels = torch.cat([dec_labels, zero_pad], 1) 422 | 423 | att_labels, select_ixs = self.fetch_attribute_label(att_refs) 424 | 425 | pool5 = ann_pool5.unsqueeze(0) 426 | pool5.detach() 427 | fc7 = ann_fc7.unsqueeze(0) 428 | fc7.detach() 429 | lfeats = lfeats.unsqueeze(0) 430 | dif_lfeats = dif_lfeats.unsqueeze(0) 431 | cxt_fc7 = cxt_fc7.unsqueeze(0) 432 | cxt_lfeats = cxt_lfeats.unsqueeze(0) 433 | 434 | data = {} 435 | data['image_id'] = image_id 436 | data['ann_ids'] = ann_ids 437 | data['cxt_ann_ids'] = cxt_ann_ids 438 | data['sent_ids'] = sent_ids 439 | data['gd_ixs'] = gd_ixs 440 | data['gd_boxes'] = gd_boxes 441 | data['Feats'] = {'fc7': fc7, 'pool5': pool5, 'lfeats': lfeats, 'dif_lfeats': dif_lfeats, 442 | 'cxt_fc7': cxt_fc7, 'cxt_lfeats': cxt_lfeats} 443 | 444 | data['labels'] = labels 445 | data['enc_labels'] = enc_labels 446 | data['dec_labels'] = dec_labels 447 | data['bounds'] = {'it_pos_now': self.iterators[split], 'it_max': max_index, 'wrapped': wrapped} 448 | data['att_labels'] = att_labels 449 | data['select_ixs'] = select_ixs 450 | return data 451 | 452 | -------------------------------------------------------------------------------- /lib/loaders/loader.py: -------------------------------------------------------------------------------- 1 | """ 2 | data_json has 3 | 0. refs : list of {ref_id, ann_id, box, image_id, split, category_id, sent_ids} 4 | 1. images : list of {image_id, ref_ids, ann_ids, file_name, width, height, h5_id} 5 | 2. 
anns : list of {ann_id, category_id, image_id, box, h5_id} 6 | 3. sentences : list of {sent_id, tokens, h5_id} 7 | 4: word_to_ix : word->ix 8 | 5: cat_to_ix : cat->ix 9 | 6: label_length: L 10 | Note, box in [xywh] format 11 | data_h5 has 12 | /labels is (M, max_length) uint32 array of encoded labels, zeros padded 13 | """ 14 | from __future__ import absolute_import 15 | from __future__ import division 16 | from __future__ import print_function 17 | 18 | import os.path as osp 19 | import numpy as np 20 | import h5py 21 | import json 22 | import random 23 | 24 | class Loader(object): 25 | 26 | def __init__(self, data_json, data_h5=None): 27 | # load the json file which contains info about the dataset 28 | print('Loader loading data.json: ', data_json) 29 | self.info = json.load(open(data_json)) 30 | self.word_to_ix = self.info['word_to_ix'] 31 | self.ix_to_word = {ix: wd for wd, ix in self.word_to_ix.items()} 32 | print('vocab size is ', self.vocab_size) 33 | self.cat_to_ix = self.info['cat_to_ix'] 34 | self.ix_to_cat = {ix: cat for cat, ix in self.cat_to_ix.items()} 35 | print('object cateogry size is ', len(self.ix_to_cat)) 36 | self.images = self.info['images'] 37 | self.anns = self.info['anns'] 38 | self.refs = self.info['refs'] 39 | self.sentences = self.info['sentences'] 40 | print('we have %s images.' % len(self.images)) 41 | print('we have %s anns.' % len(self.anns)) 42 | print('we have %s refs.' % len(self.refs)) 43 | print('we have %s sentences.' % len(self.sentences)) 44 | print('label_length is ', self.label_length) 45 | 46 | # construct mapping 47 | self.Refs = {ref['ref_id']: ref for ref in self.refs} 48 | self.Images = {image['image_id']: image for image in self.images} 49 | self.Anns = {ann['ann_id']: ann for ann in self.anns} 50 | self.Sentences = {sent['sent_id']: sent for sent in self.sentences} 51 | self.annToRef = {ref['ann_id']: ref for ref in self.refs} 52 | self.sentToRef = {sent_id: ref for ref in self.refs for sent_id in ref['sent_ids']} 53 | 54 | # read data_h5 if exists 55 | self.data_h5 = None 56 | if data_h5 is not None: 57 | print('Loader loading data.h5: ', data_h5) 58 | self.data_h5 = h5py.File(data_h5, 'r') 59 | assert self.data_h5['labels'].shape[0] == len(self.sentences), 'label.shape[0] not match sentences' 60 | assert self.data_h5['labels'].shape[1] == self.label_length, 'label.shape[1] not match label_length' 61 | 62 | @property 63 | def vocab_size(self): 64 | # len(self.word_to_ix) == 1999 65 | return len(self.word_to_ix) 66 | 67 | @property 68 | def label_length(self): 69 | return self.info['label_length'] 70 | 71 | @property 72 | def sent_to_Ref(self, sent_id): 73 | return self.sent_to_Ref(sent_id) 74 | 75 | def encode_labels(self, sent_str_list): 76 | """Input: 77 | sent_str_list: list of n sents in string format 78 | return int32 (n, label_length) zeros padded in end 79 | """ 80 | num_sents = len(sent_str_list) 81 | L = np.zeros((num_sents, self.label_length), dtype=np.int32) 82 | for i, sent_str in enumerate(sent_str_list): 83 | tokens = sent_str.split() 84 | for j, w in enumerate(tokens): 85 | if j < self.label_length: 86 | L[i, j] = self.word_to_ix[w] if w in self.word_to_ix else self.word_to_ix[''] 87 | return L 88 | 89 | def decode_labels(self, labels): 90 | """ 91 | labels: int32 (n, label_length) zeros padded in end 92 | return: list of sents in string format 93 | """ 94 | decoded_sent_strs = [] 95 | num_sents = labels.shape[0] 96 | for i in range(num_sents): 97 | label = labels[i].tolist() 98 | sent_str = ' '.join([self.ix_to_word[int(i)] 
for i in label if i != 0]) 99 | decoded_sent_strs.append(sent_str) 100 | return decoded_sent_strs 101 | 102 | 103 | def fetch_label(self, ref_id, num_sents): 104 | """ 105 | return: int32 (num_sents, label_length) and picked_sent_ids 106 | """ 107 | ref = self.Refs[ref_id] 108 | sent_ids = list(ref['sent_ids']) # copy in case the raw list is changed 109 | seq = [] 110 | 111 | if len(sent_ids) < num_sents: 112 | append_sent_ids = [random.choice(sent_ids) for _ in range(num_sents - len(sent_ids))] 113 | sent_ids += append_sent_ids 114 | else: 115 | sent_ids = sent_ids[:num_sents] 116 | assert len(sent_ids) == num_sents 117 | # fetch label 118 | for sent_id in sent_ids: 119 | sent_h5_id = self.Sentences[sent_id]['h5_id'] 120 | seq += [self.data_h5['labels'][sent_h5_id, :]] 121 | seq = np.vstack(seq) 122 | return seq, sent_ids 123 | 124 | def fetch_seq(self, sent_id): 125 | # return int32 (label_length, ) 126 | sent_h5_id = self.Sentences[sent_id]['h5_id'] 127 | seq = self.data_h5['labels'][sent_h5_id, :] 128 | return seq 129 | -------------------------------------------------------------------------------- /lib/models/eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | import os.path as osp 7 | import numpy as np 8 | import json 9 | import h5py 10 | import time 11 | from pprint import pprint 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | from torch.autograd import Variable 16 | import pdb 17 | 18 | 19 | # IoU function 20 | def computeIoU(box1, box2): 21 | # each box is of [x1, y1, w, h] 22 | inter_x1 = max(box1[0], box2[0]) 23 | inter_y1 = max(box1[1], box2[1]) 24 | inter_x2 = min(box1[0]+box1[2]-1, box2[0]+box2[2]-1) 25 | inter_y2 = min(box1[1]+box1[3]-1, box2[1]+box2[3]-1) 26 | 27 | if inter_x1 < inter_x2 and inter_y1 < inter_y2: 28 | inter = (inter_x2-inter_x1+1)*(inter_y2-inter_y1+1) 29 | else: 30 | inter = 0 31 | union = box1[2]*box1[3] + box2[2]*box2[3] - inter 32 | return float(inter)/union 33 | 34 | 35 | def eval_split(loader, model, split, opt): 36 | 37 | verbose = opt.get('verbose', True) 38 | num_sents = opt.get('num_sents', -1) 39 | assert split != 'train', 'Check the evaluation split.' 
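# Each test batch covers all sentences of one image; a prediction is counted as correct when
# IoU(pred_box, gd_box) >= 0.5 (if opt['use_IoU'] > 0), otherwise when pred_ix equals the ground-truth index.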
40 | 41 | model.eval() 42 | 43 | loader.resetIterator(split) 44 | loss_sum = 0 45 | loss_evals = 0 46 | acc = 0 47 | predictions = [] 48 | finish_flag = False 49 | model_time = 0 50 | vis_res_loss_sum = 0 51 | lang_res_loss_sum = 0 52 | 53 | 54 | while True: 55 | with torch.no_grad(): 56 | data = loader.getTestBatch(split, opt) 57 | att_weights = loader.get_attribute_weights() 58 | ann_ids = data['ann_ids'] 59 | sent_ids = data['sent_ids'] 60 | Feats = data['Feats'] 61 | labels = data['labels'] 62 | enc_labels = data['enc_labels'] 63 | dec_labels = data['dec_labels'] 64 | att_labels, select_ixs = data['att_labels'], data['select_ixs'] 65 | 66 | for i, sent_id in enumerate(sent_ids): 67 | enc_label = enc_labels[i:i + 1] 68 | max_len = (enc_label != 0).sum().data[0] 69 | enc_label = enc_label[:, :max_len] 70 | dec_label = dec_labels[i:i + 1] 71 | dec_label = dec_label[:, :max_len] 72 | 73 | label = labels[i:i + 1] 74 | max_len = (label != 0).sum().data[0] 75 | label = label[:, :max_len] 76 | 77 | att_label = att_labels[i:i + 1] 78 | if i in select_ixs: 79 | select_ix = torch.LongTensor([0]).cuda() 80 | else: 81 | select_ix = torch.LongTensor().cuda() 82 | 83 | tic = time.time() 84 | scores, loss, rel_ixs, sub_attn, loc_attn, rel_attn, weights, vis_res_loss, att_res_loss, lang_res_loss = \ 85 | model(Feats['pool5'], Feats['fc7'], Feats['lfeats'], Feats['dif_lfeats'], 86 | Feats['cxt_fc7'], Feats['cxt_lfeats'], label, enc_label, dec_label, att_label, select_ix, att_weights) 87 | 88 | scores = scores.squeeze(0).data.cpu().numpy() 89 | rel_ixs = rel_ixs.squeeze(0).data.cpu().numpy().tolist() 90 | 91 | loss = loss.data[0].item() 92 | 93 | if opt['loss_combined'] == 0: 94 | vis_res_loss=vis_res_loss.data[0].item() 95 | lang_res_loss = lang_res_loss.data[0].item() 96 | vis_res_loss_sum += vis_res_loss 97 | lang_res_loss_sum += lang_res_loss 98 | 99 | pred_ix = np.argmax(scores) 100 | gd_ix = data['gd_ixs'][i] 101 | loss_sum += loss 102 | loss_evals += 1 103 | 104 | pred_box = loader.Anns[ann_ids[pred_ix]]['box'] 105 | gd_box = data['gd_boxes'][i] 106 | 107 | if opt['use_IoU'] > 0: 108 | if computeIoU(pred_box, gd_box) >= 0.5: 109 | acc += 1 110 | else: 111 | if pred_ix == gd_ix: 112 | acc += 1 113 | 114 | rel_ix = rel_ixs[pred_ix] 115 | 116 | entry = {} 117 | entry['sent_id'] = sent_id 118 | entry['sent'] = loader.decode_labels(label.data.cpu().numpy())[0] 119 | entry['gd_ann_id'] = data['ann_ids'][gd_ix] 120 | entry['pred_ann_id'] = data['ann_ids'][pred_ix] 121 | entry['pred_score'] = scores.tolist()[pred_ix] 122 | 123 | entry['sub_attn'] = sub_attn.data.cpu().numpy().tolist() 124 | entry['loc_attn'] = loc_attn.data.cpu().numpy().tolist() 125 | entry['rel_attn'] = rel_attn.data.cpu().numpy().tolist() 126 | entry['rel_ann_id'] = data['cxt_ann_ids'][pred_ix][rel_ix] 127 | 128 | entry['weights'] = weights.data.cpu().numpy().tolist() 129 | 130 | predictions.append(entry) 131 | toc = time.time() 132 | model_time += (toc - tic) 133 | 134 | if num_sents > 0 and loss_evals >= num_sents: 135 | finish_flag = True 136 | break 137 | ix0 = data['bounds']['it_pos_now'] 138 | ix1 = data['bounds']['it_max'] 139 | if verbose: 140 | print('evaluating [%s] ... 
image[%d/%d]\'s sents, acc=%.2f%%, (%.4f), model time (per sent) is %.2fs' % \ 141 | (split, ix0, ix1, acc*100.0/loss_evals, loss, model_time/len(sent_ids))) 142 | model_time = 0 143 | 144 | if finish_flag or data['bounds']['wrapped']: 145 | break 146 | 147 | return loss_sum / loss_evals, acc / loss_evals, predictions, \ 148 | vis_res_loss_sum / loss_evals, lang_res_loss_sum / loss_evals 149 | 150 | 151 | -------------------------------------------------------------------------------- /lib/models/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import collections 6 | import torch 7 | import torch.nn as nn 8 | from torch.autograd import Variable 9 | import pdb 10 | 11 | 12 | # grad_clip=0.1 13 | def clip_gradient(optimizer, grad_clip): 14 | for group in optimizer.param_groups: 15 | for param in group['params']: 16 | # pdb.set_trace() 17 | # clamp each gradient to [-grad_clip, grad_clip] 18 | if hasattr(param.grad, 'data'): 19 | param.grad.data.clamp_(-grad_clip, grad_clip) 20 | 21 | 22 | def set_lr(optimizer, lr): 23 | for group in optimizer.param_groups: 24 | group['lr'] = lr 25 | -------------------------------------------------------------------------------- /lib/mrcn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GingL/ARN/b1001040d8ac41292b2ccf6a6ab1f41c1d77d0fa/lib/mrcn/__init__.py -------------------------------------------------------------------------------- /lib/mrcn/inference.py: -------------------------------------------------------------------------------- 1 | """ 2 | args: imdb_name, net, iters, tag 3 | """ 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import os 9 | import os.path as osp 10 | import sys 11 | import json 12 | import time 13 | import numpy as np 14 | import pprint 15 | from scipy.misc import imread, imresize 16 | import cv2 17 | 18 | import torch 19 | from torch.autograd import Variable 20 | 21 | # mrcn imports 22 | import _init_paths 23 | from datasets.factory import get_imdb 24 | from model.config import cfg, cfg_from_file, cfg_from_list 25 | from model.bbox_transform import clip_boxes, bbox_transform_inv 26 | from nets.vgg16 import vgg16 27 | from nets.resnet_v1 import resnetv1 28 | from utils.blob import im_list_to_blob 29 | from utils.mask_utils import recover_masks 30 | from pycocotools import mask as COCOmask 31 | 32 | 33 | import pdb 34 | # mrcn dir 35 | this_dir = osp.dirname(__file__) 36 | mrcn_dir = osp.join(this_dir, '..', '..', 'pyutils', 'mask-faster-rcnn') 37 | 38 | 39 | def get_imdb_name(imdb_name): 40 | if imdb_name in ['refcoco', 'refcocog']: 41 | return {'TRAIN_IMDB': '%s_train+%s_val' % (imdb_name, imdb_name), 42 | 'TEST_IMDB': '%s_test' % imdb_name} 43 | elif imdb_name == 'coco_minus_refer': 44 | return {'TRAIN_IMDB': "coco_2014_train_minus_refer_valtest+coco_2014_valminusminival", 45 | 'TEST_IMDB': "coco_2014_minival"} 46 | 47 | 48 | class Inference: 49 | def __init__(self, args): 50 | 51 | self.imdb_name = args.imdb_name 52 | self.net_name = args.net_name 53 | self.tag = args.tag 54 | self.iters = args.iters 55 | 56 | # Config 57 | cfg_file = osp.join(mrcn_dir, 'experiments/cfgs/%s.yml' % self.net_name) 58 | cfg_list = ['ANCHOR_SCALES', [4, 8, 16, 32], 'ANCHOR_RATIOS', [0.5, 1, 2]] 59 | if cfg_file is not None: cfg_from_file(cfg_file) 60 | if cfg_list
is not None: cfg_from_list(cfg_list) 61 | print('Using config:') 62 | pprint.pprint(cfg) 63 | 64 | # load imdb 65 | self.imdb = get_imdb(get_imdb_name(self.imdb_name)['TEST_IMDB']) 66 | 67 | # Load network 68 | self.net = self.load_net() 69 | 70 | def load_net(self): 71 | # Load network 72 | if self.net_name == 'vgg16': 73 | net = vgg16(batch_size=1) 74 | elif self.net_name == 'res101': 75 | net = resnetv1(batch_size=1, num_layers=101) 76 | else: 77 | raise NotImplementedError 78 | 79 | # NOTE: create_architecture has not been located here yet 80 | net.create_architecture(self.imdb.num_classes, tag='default', 81 | anchor_scales=cfg.ANCHOR_SCALES, 82 | anchor_ratios=cfg.ANCHOR_RATIOS) 83 | net.eval() 84 | net.cuda() 85 | 86 | # Load model 87 | model = osp.join(mrcn_dir, 'output/%s/%s/%s/%s_mask_rcnn_iter_%s.pth' % \ 88 | (self.net_name, get_imdb_name(self.imdb_name)['TRAIN_IMDB'], self.tag, self.net_name, 89 | self.iters)) 90 | assert osp.isfile(model) 91 | net.load_state_dict(torch.load(model)) 92 | print('pretrained-model loaded from [%s].' % model) 93 | net.eval() 94 | return net 95 | 96 | def predict(self, img_path): 97 | # return scores/probs (num_rois, 81), pred_boxes (num_rois, 81*4) 98 | # in numpy 99 | im = cv2.imread(img_path) 100 | blobs, im_scales = self._get_blobs(im) 101 | im_blob = blobs['data'] # (1, iH, iW, 3) 102 | blobs['im_info'] = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32) 103 | 104 | # test_image returns cls_score, cls_prob, bbox_pred, rois, net_conv 105 | # scores.shape=(300,81), bbox_pred.shape=(300,324) 106 | # rois.shape=(300,5), net_conv.shape=(1,1024,38,57) 107 | _, scores, bbox_pred, rois, net_conv = self.net.test_image(blobs['data'], blobs['im_info']) 108 | 109 | # boxes.shape=(300,4) 110 | boxes = rois[:, 1:5] / im_scales[0] 111 | scores = np.reshape(scores, [scores.shape[0], -1]) 112 | bbox_pred = np.reshape(bbox_pred, [bbox_pred.shape[0], -1]) 113 | if cfg.TEST.BBOX_REG: 114 | # Apply bounding-box regression deltas 115 | box_deltas = bbox_pred 116 | pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy() 117 | pred_boxes = self._clip_boxes(pred_boxes, im.shape) 118 | else: 119 | # Simply repeat the boxes, once for each class 120 | pred_boxes = np.tile(boxes, (1, scores.shape[1])) 121 | 122 | return scores, pred_boxes 123 | 124 | def boxes_to_masks(self, img_path, boxes, labels): 125 | """ 126 | Arguments: 127 | - img_path: img_file 128 | - boxes : ndarray [[xyxy]] (n, 4) in original image 129 | - labels : ndarray (n, ) 130 | Return: 131 | - masks : (n, ih, iw) uint8 [0,1] 132 | - rles : list of rle instance 133 | """ 134 | im = cv2.imread(img_path) 135 | blobs, im_scales = self._get_blobs(im) 136 | im_blob = blobs['data'] # (1, iH, iW, 3) 137 | blobs['im_info'] = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32) 138 | 139 | # forward 140 | self.net.test_image(blobs['data'], blobs['im_info']) 141 | 142 | # net_conv 143 | net_conv = self.net._predictions['net_conv'] 144 | 145 | # run 146 | mask_prob = self.net._predict_masks_from_boxes_and_labels(net_conv, boxes * im_scales[0], labels) 147 | mask_prob = mask_prob.data.cpu().numpy() 148 | masks = recover_masks(mask_prob, boxes, im.shape[0], im.shape[1]) # (N, ih, iw) uint8 [0-255] 149 | masks = (masks > 122.).astype(np.uint8) # (N, ih, iw) uint8 [0,1] 150 | 151 | # encode to rles 152 | rles = [] 153 | for m in masks: 154 | rle = COCOmask.encode(np.asfortranarray(m)) 155 | rles += [rle] 156 | 157 | return masks, rles 158 | 159 | def
extract_head(self, img_path): 160 | # extract head (1, 1024, im_height*scale/16.0, im_width*scale/16.0) in Variable cuda float 161 | # and im_info [[ih, iw, scale]] in float32 ndarray 162 | im = cv2.imread(img_path) 163 | blobs, im_scales = self._get_blobs(im) 164 | # equivalent to net._layers["head"](Variable(torch.from_numpy(image.transpose([0,3,1,2])).cuda(), volatile=True)) 165 | # head_feat.shape = (1, 1024, im_height*scale/16.0, im_width*scale/16.0) 166 | head_feat = self.net.extract_head(blobs['data']) 167 | im_info = np.array([[blobs['data'].shape[1], blobs['data'].shape[2], im_scales[0]]]) 168 | return head_feat, im_info.astype(np.float32) 169 | 170 | def head_to_prediction(self, net_conv, im_info): 171 | """ 172 | Arguments: 173 | net_conv (Variable): (1, 1024, H, W) 174 | im_info (float) : [[ih, iw, scale]] 175 | Returns: 176 | scores (ndarray): (num_rois, 81) 177 | pred_boxes (ndarray): (num_rois, 81*4) in original image size 178 | """ 179 | self.net.eval() 180 | self.net._mode = 'TEST' 181 | 182 | # predict rois, cls_prob and bbox_pred 183 | self.net._im_info = im_info 184 | self.net._anchor_component(net_conv.size(2), net_conv.size(3)) 185 | rois = self.net._region_proposal(net_conv) 186 | if cfg.POOLING_MODE == 'crop': 187 | pool5 = self.net._crop_pool_layer(net_conv, rois) 188 | else: 189 | pool5 = self.net._roi_pool_layer(net_conv, rois) 190 | fc7 = self.net._head_to_tail(pool5) 191 | cls_prob, bbox_pred = self.net._region_classification(fc7) 192 | 193 | # add mean and std to bbox_pred if any 194 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 195 | stds = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_STDS).repeat(self.imdb.num_classes).unsqueeze( 196 | 0).expand_as(bbox_pred) 197 | means = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_MEANS).repeat(self.imdb.num_classes).unsqueeze( 198 | 0).expand_as(bbox_pred) 199 | bbox_pred = bbox_pred.mul(Variable(stds)).add(Variable(means)) 200 | 201 | # convert to numpy 202 | scores = cls_prob.data.cpu().numpy() 203 | rois = rois.data.cpu().numpy() 204 | bbox_pred = bbox_pred.data.cpu().numpy() 205 | 206 | # regress boxes 207 | boxes = rois[:, 1:5] / im_info[0][2] 208 | if cfg.TEST.BBOX_REG: 209 | # Apply bounding-box regression deltas 210 | box_deltas = bbox_pred 211 | pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy() 212 | pred_boxes = self._clip_boxes(pred_boxes, im_info[0][:2]) 213 | else: 214 | # Simply repeat the boxes, once for each class 215 | pred_boxes = np.tile(boxes, (1, scores.shape[1])) 216 | 217 | return scores, pred_boxes 218 | 219 | def box_to_fc7(self, net_conv, im_info, ori_boxes): 220 | """ 221 | Arguments: 222 | net_conv (Variable) : (1, 1024, H, W) 223 | im_info (float32) : [[ih, iw, scale]] 224 | ori_boxes (float32) : (n, 4) [x1y1x2y2] 225 | Returns: 226 | fc7 (float) : (n, 2048) 227 | """ 228 | self.net.eval() 229 | self.net._mode = 'TEST' 230 | 231 | # make rois 232 | batch_inds = Variable(net_conv.data.new(ori_boxes.shape[0], 1).zero_()) 233 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32) 234 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda()) 235 | rois = torch.cat([batch_inds, scaled_boxes], 1) 236 | 237 | # pool fc7 238 | if cfg.POOLING_MODE == 'crop': 239 | pool5 = self.net._crop_pool_layer(net_conv, rois) 240 | else: 241 | pool5 = self.net._roi_pool_layer(net_conv, rois) 242 | 243 | fc7 = self.net._head_to_tail(pool5) 244 | fc7 = fc7.mean(3).mean(2) 245 | return fc7 246 | 247 | def box_to_spatial_fc7(self, net_conv, im_info, ori_boxes): 248
| """ 249 | Arguments: 250 | net_conv (Variable) : (1, 1024, H, W) 251 | im_info (float32) : [[ih, iw, scale]] 252 | ori_boxes (float32) : (n, 4) [x1y1x2y2] 253 | Returns: 254 | pool5 (float) : (n, 1024, 7, 7) 255 | spatial_fc7 (float) : (n, 2048, 7, 7) 256 | """ 257 | self.net.eval() 258 | self.net._mode = 'TEST' 259 | 260 | # make rois 261 | batch_inds = Variable(net_conv.data.new(ori_boxes.shape[0], 1).zero_()) 262 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32) 263 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda()) 264 | rois = torch.cat([batch_inds, scaled_boxes], 1) 265 | 266 | # pool fc7 267 | if cfg.POOLING_MODE == 'crop': 268 | pool5 = self.net._crop_pool_layer(net_conv, rois) 269 | else: 270 | pool5 = self.net._roi_pool_layer(net_conv, rois) # (n, 1024, 7, 7) 271 | 272 | spatial_fc7 = self.net.resnet.layer4(pool5) # (n, 2048, 7, 7) 273 | return pool5, spatial_fc7 274 | 275 | def spatial_fc7_to_prediction(self, spatial_fc7, im_info, ori_boxes): 276 | """Only used for testing. Testing the above box_to_fc7 [passed]""" 277 | cls_prob, bbox_pred = self.net._region_classification(spatial_fc7) 278 | 279 | # make rois 280 | batch_inds = Variable(spatial_fc7.data.new(ori_boxes.shape[0], 1).zero_()) 281 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32) 282 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda()) 283 | rois = torch.cat([batch_inds, scaled_boxes], 1) 284 | 285 | # add mean and std to bbox_pred if any 286 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 287 | stds = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_STDS).repeat(self.imdb.num_classes).unsqueeze( 288 | 0).expand_as(bbox_pred) 289 | means = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_MEANS).repeat(self.imdb.num_classes).unsqueeze( 290 | 0).expand_as(bbox_pred) 291 | bbox_pred = bbox_pred.mul(Variable(stds)).add(Variable(means)) 292 | 293 | # convert to numpy 294 | scores = cls_prob.data.cpu().numpy() 295 | rois = rois.data.cpu().numpy() 296 | bbox_pred = bbox_pred.data.cpu().numpy() 297 | 298 | # regress boxes 299 | boxes = rois[:, 1:5] / im_info[0][2] 300 | if cfg.TEST.BBOX_REG: 301 | # Apply bounding-box regression deltas 302 | box_deltas = bbox_pred 303 | pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy() 304 | pred_boxes = self._clip_boxes(pred_boxes, im_info[0][:2]) 305 | else: 306 | # Simply repeat the boxes, once for each class 307 | pred_boxes = np.tile(boxes, (1, scores.shape[1])) 308 | 309 | return scores, pred_boxes 310 | 311 | def _get_image_blob(self, im): 312 | """Converts an image into a network input. 
313 | Arguments: 314 | im (ndarray): a color image in BGR order 315 | Returns: 316 | blob (ndarray): a data blob holding an image pyramid 317 | im_scale_factors (list): list of image scales (relative to im) used 318 | in the image pyramid 319 | """ 320 | # pdb.set_trace() 321 | im_orig = im.astype(np.float32, copy=True) 322 | im_orig -= cfg.PIXEL_MEANS 323 | 324 | # if im_shape=(320, 480, 3) then im_shape[0:2]=(320,480) 325 | # then im_size_min =320, im_size_max = 480 326 | im_shape = im_orig.shape 327 | im_size_min = np.min(im_shape[0:2]) 328 | im_size_max = np.max(im_shape[0:2]) 329 | 330 | processed_ims = [] 331 | im_scale_factors = [] 332 | 333 | for target_size in cfg.TEST.SCALES: 334 | im_scale = float(target_size) / float(im_size_min) 335 | # Prevent the biggest axis from being more than MAX_SIZE 336 | if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: 337 | im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) 338 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, 339 | interpolation=cv2.INTER_LINEAR) 340 | im_scale_factors.append(im_scale) 341 | processed_ims.append(im) 342 | 343 | # Create a blob to hold the input images 344 | blob = im_list_to_blob(processed_ims) 345 | 346 | return blob, np.array(im_scale_factors) 347 | 348 | def _get_blobs(self, im): 349 | """Convert an image and RoIs within that image into network inputs.""" 350 | blobs = {} 351 | blobs['data'], im_scale_factors = self._get_image_blob(im) 352 | 353 | return blobs, im_scale_factors 354 | 355 | def _clip_boxes(self, boxes, im_shape): 356 | """Clip boxes to image boundaries.""" 357 | # x1 >= 0 358 | boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0) 359 | # y1 >= 0 360 | boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0) 361 | # x2 < im_shape[1] 362 | boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1) 363 | # y2 < im_shape[0] 364 | boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1) 365 | return boxes 366 | -------------------------------------------------------------------------------- /lib/mrcn/inference_no_imdb.py: -------------------------------------------------------------------------------- 1 | """ 2 | args: net, iters, tag 3 | """ 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import os 9 | import os.path as osp 10 | import sys 11 | import json 12 | import time 13 | import numpy as np 14 | import pprint 15 | from scipy.misc import imread, imresize 16 | import cv2 17 | 18 | import torch 19 | from torch.autograd import Variable 20 | 21 | # mrcn imports 22 | import _init_paths 23 | from datasets.factory import get_imdb 24 | from model.config import cfg, cfg_from_file, cfg_from_list 25 | from model.bbox_transform import clip_boxes, bbox_transform_inv 26 | from nets.vgg16 import vgg16 27 | from nets.resnet_v1 import resnetv1 28 | from utils.blob import im_list_to_blob 29 | from utils.mask_utils import recover_masks 30 | from pycocotools import mask as COCOmask 31 | 32 | # mrcn dir 33 | this_dir = osp.dirname(__file__) 34 | mrcn_dir = osp.join(this_dir, '..', '..', 'pyutils', 'mask-faster-rcnn') 35 | 36 | 37 | def get_imdb_name(imdb_name): 38 | if imdb_name in ['refcoco', 'refcocog']: 39 | return {'TRAIN_IMDB': '%s_train+%s_val' % (imdb_name, imdb_name), 40 | 'TEST_IMDB': '%s_test' % imdb_name} 41 | elif imdb_name == 'coco_minus_refer': 42 | return {'TRAIN_IMDB': "coco_2014_train_minus_refer_valtest+coco_2014_valminusminival", 43 | 'TEST_IMDB': "coco_2014_minival"} 44 | 45 | 46 | class Inference: 
47 | def __init__(self, args): 48 | 49 | self.imdb_name = args.imdb_name 50 | self.net_name = args.net_name 51 | self.tag = args.tag 52 | self.iters = args.iters 53 | 54 | # Config 55 | cfg_file = osp.join(mrcn_dir, 'experiments/cfgs/%s.yml' % self.net_name) 56 | cfg_list = ['ANCHOR_SCALES', [4, 8, 16, 32], 'ANCHOR_RATIOS', [0.5, 1, 2]] 57 | if cfg_file is not None: cfg_from_file(cfg_file) 58 | if cfg_list is not None: cfg_from_list(cfg_list) 59 | print('Using config:') 60 | pprint.pprint(cfg) 61 | 62 | # Load network 63 | self.num_classes = 81 # hard code this 64 | self.net = self.load_net() 65 | 66 | def load_net(self): 67 | # Load network 68 | if self.net_name == 'vgg16': 69 | net = vgg16(batch_size=1) 70 | elif self.net_name == 'res101': 71 | net = resnetv1(batch_size=1, num_layers=101) 72 | else: 73 | raise NotImplementedError 74 | 75 | net.create_architecture(self.num_classes, tag='default', 76 | anchor_scales=cfg.ANCHOR_SCALES, 77 | anchor_ratios=cfg.ANCHOR_RATIOS) 78 | net.eval() 79 | net.cuda() 80 | 81 | # Load model 82 | model = osp.join(mrcn_dir, 'output/%s/%s/%s/%s_mask_rcnn_iter_%s.pth' % \ 83 | (self.net_name, get_imdb_name(self.imdb_name)['TRAIN_IMDB'], self.tag, self.net_name, 84 | self.iters)) 85 | assert osp.isfile(model), model 86 | net.load_state_dict(torch.load(model)) 87 | print('pretrained-model loaded from [%s].' % model) 88 | 89 | return net 90 | 91 | def predict(self, img_path): 92 | # return scores/probs (num_rois, 81), pred_boxes (num_rois, 81*4) 93 | # in numpy 94 | im = cv2.imread(img_path) 95 | blobs, im_scales = self._get_blobs(im) 96 | im_blob = blobs['data'] # (1, iH, iW, 3) 97 | blobs['im_info'] = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32) 98 | 99 | # test_image returns cls_score, cls_prob, bbox_pred, rois, net_conv 100 | _, scores, bbox_pred, rois, _ = self.net.test_image(blobs['data'], blobs['im_info']) 101 | 102 | boxes = rois[:, 1:5] / im_scales[0] 103 | scores = np.reshape(scores, [scores.shape[0], -1]) 104 | bbox_pred = np.reshape(bbox_pred, [bbox_pred.shape[0], -1]) 105 | if cfg.TEST.BBOX_REG: 106 | # Apply bounding-box regression deltas 107 | box_deltas = bbox_pred 108 | pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy() 109 | pred_boxes = self._clip_boxes(pred_boxes, im.shape) 110 | else: 111 | # Simply repeat the boxes, once for each class 112 | pred_boxes = np.tile(boxes, (1, scores.shape[1])) 113 | 114 | return scores, pred_boxes 115 | 116 | def boxes_to_masks(self, img_path, boxes, labels): 117 | """ 118 | Arguments: 119 | - img_path: img_file 120 | - boxes : ndaray [[xyxy]] (n, 4) in original image 121 | - labels : ndarray (n, ) 122 | Return: 123 | - masks : (n, ih, iw) uint8 [0,1] 124 | - rles : list of rle instance 125 | """ 126 | im = cv2.imread(img_path) 127 | blobs, im_scales = self._get_blobs(im) 128 | im_blob = blobs['data'] # (1, iH, iW, 3) 129 | blobs['im_info'] = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32) 130 | 131 | # forward 132 | self.net.test_image(blobs['data'], blobs['im_info']) 133 | 134 | # net_conv 135 | net_conv = self.net._predictions['net_conv'] 136 | 137 | # run 138 | mask_prob = self.net._predict_masks_from_boxes_and_labels(net_conv, boxes * im_scales[0], labels) 139 | mask_prob = mask_prob.data.cpu().numpy() 140 | masks = recover_masks(mask_prob, boxes, im.shape[0], im.shape[1]) # (N, ih, iw) uint8 [0-255] 141 | masks = (masks > 122.).astype(np.uint8) # (N, ih, iw) uint8 [0,1] 142 | 143 | # encode 
to rles 144 | rles = [] 145 | for m in masks: 146 | rle = COCOmask.encode(np.asfortranarray(m)) 147 | rles += [rle] 148 | 149 | return masks, rles 150 | 151 | def extract_head(self, img_path): 152 | # extract head (1, 1024, im_height*scale/16.0, im_width*scale/16.0) in Variable cuda float 153 | # and im_info [[ih, iw, scale]] in float32 ndarray 154 | im = cv2.imread(img_path) 155 | blobs, im_scales = self._get_blobs(im) 156 | head_feat = self.net.extract_head(blobs['data']) 157 | im_info = np.array([[blobs['data'].shape[1], blobs['data'].shape[2], im_scales[0]]]) 158 | return head_feat, im_info.astype(np.float32) 159 | 160 | def head_to_prediction(self, net_conv, im_info): 161 | """ 162 | Arguments: 163 | net_conv (Variable): (1, 1024, H, W) 164 | im_info (float) : [[ih, iw, scale]] 165 | Returns: 166 | scores (ndarray): (num_rois, 81) 167 | pred_boxes (ndarray): (num_rois, 81*4) in original image size 168 | """ 169 | self.net.eval() 170 | self.net._mode = 'TEST' 171 | 172 | # predict rois, cls_prob and bbox_pred 173 | self.net._im_info = im_info 174 | self.net._anchor_component(net_conv.size(2), net_conv.size(3)) 175 | rois = self.net._region_proposal(net_conv) 176 | if cfg.POOLING_MODE == 'crop': 177 | pool5 = self.net._crop_pool_layer(net_conv, rois) 178 | else: 179 | pool5 = self.net._roi_pool_layer(net_conv, rois) 180 | fc7 = self.net._head_to_tail(pool5) 181 | cls_prob, bbox_pred = self.net._region_classification(fc7) 182 | 183 | # add mean and std to bbox_pred if any 184 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 185 | stds = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_STDS).repeat(self.num_classes).unsqueeze(0).expand_as( 186 | bbox_pred) 187 | means = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_MEANS).repeat(self.num_classes).unsqueeze(0).expand_as( 188 | bbox_pred) 189 | bbox_pred = bbox_pred.mul(Variable(stds)).add(Variable(means)) 190 | 191 | # convert to numpy 192 | scores = cls_prob.data.cpu().numpy() 193 | rois = rois.data.cpu().numpy() 194 | bbox_pred = bbox_pred.data.cpu().numpy() 195 | 196 | # regress boxes 197 | boxes = rois[:, 1:5] / im_info[0][2] 198 | if cfg.TEST.BBOX_REG: 199 | # Apply bounding-box regression deltas 200 | box_deltas = bbox_pred 201 | pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy() 202 | pred_boxes = self._clip_boxes(pred_boxes, im_info[0][:2]) 203 | else: 204 | # Simply repeat the boxes, once for each class 205 | pred_boxes = np.tile(boxes, (1, scores.shape[1])) 206 | 207 | return scores, pred_boxes 208 | 209 | def box_to_spatial_fc7(self, net_conv, im_info, ori_boxes): 210 | """ 211 | Arguments: 212 | net_conv (Variable) : (1, 1024, H, W) 213 | im_info (float32) : [[ih, iw, scale]] 214 | ori_boxes (float32) : (n, 4) [x1y1x2y2] 215 | Returns: 216 | pool5 (float) : (n, 1024, 7, 7) 217 | spatial_fc7 (float) : (n, 2048, 7, 7) 218 | """ 219 | self.net.eval() 220 | self.net._mode = 'TEST' 221 | 222 | # make rois 223 | batch_inds = Variable(net_conv.data.new(ori_boxes.shape[0], 1).zero_()) 224 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32) 225 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda()) 226 | rois = torch.cat([batch_inds, scaled_boxes], 1) 227 | 228 | # pool fc7 229 | if cfg.POOLING_MODE == 'crop': 230 | pool5 = self.net._crop_pool_layer(net_conv, rois) 231 | else: 232 | pool5 = self.net._roi_pool_layer(net_conv, rois) # (n, 1024, 7, 7) 233 | 234 | spatial_fc7 = self.net.resnet.layer4(pool5) # (n, 2048, 7, 7), equavalent to _head_to_tail 235 | return pool5, 
spatial_fc7 236 | 237 | def box_to_pool5_fc7(self, net_conv, im_info, ori_boxes): 238 | """ 239 | Arguments: 240 | net_conv (Variable) : (1, 1024, H, W) 241 | im_info (float32) : [[ih, iw, scale]] 242 | ori_boxes (float32) : (n, 4) [x1y1x2y2] 243 | Returns: 244 | pool5 (float): (n, 1024) 245 | fc7 (float) : (n, 2048) 246 | """ 247 | self.net.eval() 248 | self.net._mode = 'TEST' 249 | 250 | # make rois 251 | batch_inds = Variable(net_conv.data.new(ori_boxes.shape[0], 1).zero_()) 252 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32) 253 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda()) 254 | rois = torch.cat([batch_inds, scaled_boxes], 1) 255 | 256 | # pool fc7 257 | if cfg.POOLING_MODE == 'crop': 258 | pool5 = self.net._crop_pool_layer(net_conv, rois) 259 | else: 260 | pool5 = self.net._roi_pool_layer(net_conv, rois) # (n,1024,7,7) 261 | 262 | fc7 = self.net._head_to_tail(pool5) # (n, 2048, 7, 7) 263 | pool5 = pool5.mean(3).mean(2) # (n, 1024) 264 | fc7 = fc7.mean(3).mean(2) # (n, 2048) 265 | return pool5, fc7 266 | 267 | def box_to_fc7(self, net_conv, im_info, ori_boxes): 268 | """ 269 | Arguments: 270 | net_conv (Variable) : (1, 1024, H, W) 271 | im_info (float32) : [[ih, iw, scale]] 272 | ori_boxes (float32) : (n, 4) [x1y1x2y2] 273 | Returns: 274 | fc7 (float) : (n, 2048) 275 | """ 276 | self.net.eval() 277 | self.net._mode = 'TEST' 278 | 279 | # make rois 280 | batch_inds = Variable(net_conv.data.new(ori_boxes.shape[0], 1).zero_()) 281 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32) 282 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda()) 283 | rois = torch.cat([batch_inds, scaled_boxes], 1) 284 | 285 | # pool fc7 286 | if cfg.POOLING_MODE == 'crop': 287 | pool5 = self.net._crop_pool_layer(net_conv, rois) 288 | else: 289 | pool5 = self.net._roi_pool_layer(net_conv, rois) 290 | 291 | fc7 = self.net._head_to_tail(pool5) # (n, 2048, 7, 7) 292 | fc7 = fc7.mean(3).mean(2) # (n, 2048) 293 | return fc7 294 | 295 | def spatial_fc7_to_prediction(self, spatial_fc7, im_info, ori_boxes): 296 | """Only used for testing. 
Testing the above box_to_fc7 [passed]""" 297 | cls_prob, bbox_pred = self.net._region_classification(spatial_fc7) 298 | 299 | # make rois 300 | batch_inds = Variable(spatial_fc7.data.new(ori_boxes.shape[0], 1).zero_()) 301 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32) 302 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda()) 303 | rois = torch.cat([batch_inds, scaled_boxes], 1) 304 | 305 | # add mean and std to bbox_pred if any 306 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 307 | stds = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_STDS).repeat(self.num_classes).unsqueeze(0).expand_as( 308 | bbox_pred) 309 | means = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_MEANS).repeat(self.num_classes).unsqueeze(0).expand_as( 310 | bbox_pred) 311 | bbox_pred = bbox_pred.mul(Variable(stds)).add(Variable(means)) 312 | 313 | # convert to numpy 314 | scores = cls_prob.data.cpu().numpy() 315 | rois = rois.data.cpu().numpy() 316 | bbox_pred = bbox_pred.data.cpu().numpy() 317 | 318 | # regress boxes 319 | boxes = rois[:, 1:5] / im_info[0][2] 320 | if cfg.TEST.BBOX_REG: 321 | # Apply bounding-box regression deltas 322 | box_deltas = bbox_pred 323 | pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy() 324 | pred_boxes = self._clip_boxes(pred_boxes, im_info[0][:2]) 325 | else: 326 | # Simply repeat the boxes, once for each class 327 | pred_boxes = np.tile(boxes, (1, scores.shape[1])) 328 | 329 | return scores, pred_boxes 330 | 331 | def _get_image_blob(self, im): 332 | """Converts an image into a network input. 333 | Arguments: 334 | im (ndarray): a color image in BGR order 335 | Returns: 336 | blob (ndarray): a data blob holding an image pyramid 337 | im_scale_factors (list): list of image scales (relative to im) used 338 | in the image pyramid 339 | """ 340 | im_orig = im.astype(np.float32, copy=True) 341 | im_orig -= cfg.PIXEL_MEANS 342 | 343 | im_shape = im_orig.shape 344 | im_size_min = np.min(im_shape[0:2]) 345 | im_size_max = np.max(im_shape[0:2]) 346 | 347 | processed_ims = [] 348 | im_scale_factors = [] 349 | 350 | for target_size in cfg.TEST.SCALES: 351 | im_scale = float(target_size) / float(im_size_min) 352 | # Prevent the biggest axis from being more than MAX_SIZE 353 | if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE: 354 | im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max) 355 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, 356 | interpolation=cv2.INTER_LINEAR) 357 | im_scale_factors.append(im_scale) 358 | processed_ims.append(im) 359 | 360 | # Create a blob to hold the input images 361 | blob = im_list_to_blob(processed_ims) 362 | 363 | return blob, np.array(im_scale_factors) 364 | 365 | def _get_blobs(self, im): 366 | """Convert an image and RoIs within that image into network inputs.""" 367 | blobs = {} 368 | blobs['data'], im_scale_factors = self._get_image_blob(im) 369 | 370 | return blobs, im_scale_factors 371 | 372 | def _clip_boxes(self, boxes, im_shape): 373 | """Clip boxes to image boundaries.""" 374 | # x1 >= 0 375 | boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0) 376 | # y1 >= 0 377 | boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0) 378 | # x2 < im_shape[1] 379 | boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1) 380 | # y2 < im_shape[0] 381 | boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1) 382 | return boxes 383 | -------------------------------------------------------------------------------- /tools/_init_paths.py: 
-------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import sys 3 | 4 | # mrcn path 5 | this_dir = osp.dirname(__file__) 6 | mrcn_dir = osp.join(this_dir, '..', 'pyutils', 'mask-faster-rcnn') 7 | sys.path.insert(0, osp.join(mrcn_dir, 'lib')) 8 | sys.path.insert(0, osp.join(mrcn_dir, 'data', 'refer')) 9 | sys.path.insert(0, osp.join(mrcn_dir, 'data', 'coco', 'PythonAPI')) 10 | 11 | # refer path 12 | refer_dir = osp.join(this_dir, '..', 'pyutils', 'refer') 13 | sys.path.insert(0, refer_dir) 14 | 15 | # model path 16 | sys.path.insert(0, osp.join(this_dir, '..', 'lib')) -------------------------------------------------------------------------------- /tools/eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | import os.path as osp 7 | import sys 8 | import json 9 | import time 10 | import argparse 11 | import matplotlib.pyplot as plt 12 | from matplotlib.patches import Polygon, Rectangle 13 | import skimage.io as io 14 | 15 | # model 16 | import _init_paths 17 | from layers.match import AdaptiveReconstruct 18 | from loaders.dataloader import DataLoader 19 | import models.eval as eval 20 | 21 | # torch 22 | import torch 23 | import torch.nn as nn 24 | 25 | 26 | 27 | def load_model(checkpoint_path, opt): 28 | tic = time.time() 29 | model = AdaptiveReconstruct(opt) 30 | checkpoint = torch.load(checkpoint_path) 31 | model.load_state_dict(checkpoint['model'].state_dict()) 32 | model.eval() 33 | model.cuda() 34 | print('model loaded in %.2f seconds' % (time.time() - tic)) 35 | return model 36 | 37 | 38 | def evaluate(params): 39 | # set up loader 40 | data_json = osp.join('cache/prepro', params['dataset_splitBy'], 'data.json') 41 | data_h5 = osp.join('cache/prepro', params['dataset_splitBy'], 'data.h5') 42 | loader = DataLoader(data_h5=data_h5, data_json=data_json) 43 | 44 | # load model info 45 | model_prefix = osp.join('output', params['dataset_splitBy'], params['id'], 'mrcn_cmr_with_st') 46 | infos = json.load(open(model_prefix + '.json')) 47 | model_opt = infos['opt'] 48 | model_path = model_prefix + '.pth' 49 | model = load_model(model_path, model_opt) 50 | 51 | # loader's feats 52 | feats_dir = '%s_%s_%s' % (model_opt['net_name'], model_opt['imdb_name'], model_opt['tag']) 53 | args.imdb_name = model_opt['imdb_name'] 54 | args.net_name = model_opt['net_name'] 55 | args.tag = model_opt['tag'] 56 | args.iters = model_opt['iters'] 57 | loader.prepare_mrcn(head_feats_dir=osp.join('cache/feats/', model_opt['dataset_splitBy'], 'mrcn', feats_dir), 58 | args=args) 59 | ann_feats = osp.join('cache/feats', model_opt['dataset_splitBy'], 'mrcn', 60 | '%s_%s_%s_ann_feats.h5' % (model_opt['net_name'], model_opt['imdb_name'], model_opt['tag'])) 61 | # load ann features 62 | loader.loadFeats({'ann': ann_feats}) 63 | 64 | # check model_info and params 65 | assert model_opt['dataset'] == params['dataset'] 66 | assert model_opt['splitBy'] == params['splitBy'] 67 | 68 | # evaluate on the split 69 | split = params['split'] 70 | model_opt['num_sents'] = params['num_sents'] 71 | model_opt['verbose'] = params['verbose'] 72 | 73 | val_loss, acc, predictions, _, _ = eval.eval_split(loader, model, split, model_opt) 74 | 75 | 76 | print('Comprehension on %s\'s %s (%s sents) is %.2f%%' % \ 77 | (params['dataset_splitBy'], params['split'], len(predictions), acc * 100.)) 78 | 79 | # save 80
| out_dir = osp.join('results', params['dataset_splitBy'], 'easy') 81 | if not osp.isdir(out_dir): 82 | os.makedirs(out_dir) 83 | out_file = osp.join(out_dir, params['id'] + '_' + params['split'] + '.json') 84 | with open(out_file, 'w') as of: 85 | json.dump({'predictions': predictions, 'acc': acc}, of) 86 | 87 | # write to results.txt 88 | f = open('experiments/easy_results.txt', 'a') 89 | f.write('[%s]: [%s][%s], id[%s]\'s acc is %.2f%%\n' % \ 90 | (params['id'], params['dataset_splitBy'], params['split'], params['id'], acc * 100.0)) 91 | 92 | 93 | if __name__ == '__main__': 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument('--dataset', type=str, default='refcoco+', 96 | help='dataset name: refclef, refcoco, refcoco+, refcocog') 97 | parser.add_argument('--splitBy', type=str, default='unc', help='splitBy: unc, google, berkeley') 98 | parser.add_argument('--split', type=str, default='val', help='split: testA, testB or val, etc') 99 | parser.add_argument('--id', type=str, default='exp0', help='model id name') 100 | parser.add_argument('--num_sents', type=int, default=-1, 101 | help='how many sentences to use when periodically evaluating the loss? (-1=all)') 102 | parser.add_argument('--verbose', type=int, default=1, help='if we want to print the testing progress') 103 | args = parser.parse_args() 104 | params = vars(args) 105 | 106 | # make other options 107 | params['dataset_splitBy'] = params['dataset'] + '_' + params['splitBy'] 108 | evaluate(params) 109 | 110 | 111 | -------------------------------------------------------------------------------- /tools/opt.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | import argparse 3 | 4 | def parse_opt(): 5 | 6 | parser = argparse.ArgumentParser() 7 | # Data input settings 8 | parser.add_argument('--dataset', type=str, default='refcoco', help='name of dataset') 9 | parser.add_argument('--splitBy', type=str, default='unc', help='who splits this dataset') 10 | parser.add_argument('--start_from', type=str, default=None, help='continuing training from saved model') 11 | # FRCN setting 12 | parser.add_argument('--imdb_name', default='coco_minus_refer', help='image database trained on.') 13 | parser.add_argument('--net_name', default='res101', help='net_name: res101 or vgg16') 14 | parser.add_argument('--iters', default=1250000, type=int, help='iterations we trained for faster R-CNN') 15 | parser.add_argument('--tag', default='notime', help='on default tf, don\'t change this!') 16 | parser.add_argument('--vis_feats_type', type=str, default='res101', help='visual features type: vgg16 or res101') 17 | # Visual Encoder Setting 18 | parser.add_argument('--visual_sample_ratio', type=float, default=0.3, help='ratio of same-type objects over different-type objects') 19 | parser.add_argument('--visual_fuse_mode', type=str, default='concat', help='concat or mul') 20 | parser.add_argument('--visual_init_norm', type=float, default=20, help='norm of each visual representation') 21 | parser.add_argument('--visual_use_bn', type=int, default=-1, help='>0: use bn, -1: do not use bn in visual layer') 22 | parser.add_argument('--visual_use_cxt', type=int, default=1, help='if we use context') 23 | parser.add_argument('--visual_cxt_type', type=str, default='frcn', help='frcn or res101') 24 | parser.add_argument('--visual_drop_out', type=float, default=0.2, help='dropout on visual encoder') 25 | parser.add_argument('--window_scale', type=float, default=2.5, help='scale of the context window') 26 | #
Visual Feats Setting 27 | parser.add_argument('--with_st', type=int, default=1, help='if incorporating same-type objects as contexts') 28 | parser.add_argument('--num_cxt', type=int, default=5, help='how many surrounding objects do we use') 29 | # Language Encoder Setting 30 | parser.add_argument('--word_embedding_size', type=int, default=512, help='the encoding size of each token') 31 | parser.add_argument('--word_vec_size', type=int, default=512, help='further non-linear of word embedding') 32 | parser.add_argument('--word_drop_out', type=float, default=0.5, help='word drop out after embedding') 33 | parser.add_argument('--bidirectional', type=int, default=1, help='bi-rnn') 34 | parser.add_argument('--rnn_hidden_size', type=int, default=512, help='hidden size of LSTM') 35 | parser.add_argument('--rnn_type', type=str, default='lstm', help='rnn, gru or lstm') 36 | parser.add_argument('--rnn_drop_out', type=float, default=0.2, help='dropout between stacked rnn layers') 37 | parser.add_argument('--rnn_num_layers', type=int, default=1, help='number of layers in lang_encoder') 38 | parser.add_argument('--variable_lengths', type=int, default=1, help='use variable length to encode') 39 | # Joint Embedding setting 40 | parser.add_argument('--jemb_drop_out', type=float, default=0.1, help='dropout in the joint embedding') 41 | parser.add_argument('--jemb_dim', type=int, default=512, help='joint embedding layer dimension') 42 | # Reconstruct Settings 43 | parser.add_argument('--decode_bidirectional', type=int, default=0, help='whether to use a bidirectional LSTM in reconstruction') 44 | # Loss Setting 45 | parser.add_argument('--att_weight', type=float, default=1.0, help='weight on attribute prediction') 46 | parser.add_argument('--visual_rank_weight', type=float, default=1.0, help='weight on paired (ref, sent) over unpaired (neg_ref, sent)') 47 | parser.add_argument('--lang_rank_weight', type=float, default=1.0, help='weight on paired (ref, sent) over unpaired (ref, neg_sent)') 48 | parser.add_argument('--margin', type=float, default=0.1, help='margin for ranking loss') 49 | parser.add_argument('--lang_res_weight', type=float, default=1.0, help='weight on language reconstruction loss') 50 | parser.add_argument('--vis_res_weight', type=float, default=0.01, help='weight on visual reconstruction loss') 51 | parser.add_argument('--att_res_weight', type=float, default=1.0, help='weight on attribute reconstruction loss') 52 | parser.add_argument('--loss_combined', type=float, default=5.0, help='weight on loss_combined') 53 | parser.add_argument('--loss_divided', type=float, default=1.0, help='weight on loss_divided') 54 | # Optimization: General 55 | parser.add_argument('--max_iters', type=int, default=30000, help='max number of iterations to run') 56 | parser.add_argument('--sample_ratio', type=float, default=0.3, help='ratio of same-type objects over different-type objects') 57 | parser.add_argument('--batch_size', type=int, default=5, help='batch size in number of images per batch') 58 | parser.add_argument('--grad_clip', type=float, default=0.1, help='clip gradients at this value') 59 | parser.add_argument('--seq_per_ref', type=int, default=3, help='number of expressions per object during training') 60 | parser.add_argument('--learning_rate_decay_start', type=int, default=8000, help='at what iter to start decaying learning rate') 61 | parser.add_argument('--learning_rate_decay_every', type=int, default=8000, help='every how many iters thereafter to decay LR by a factor of 10') 62 |
parser.add_argument('--optim_epsilon', type=float, default=1e-8, help='epsilon that goes into denominator for smoothing') 63 | parser.add_argument('--learning_rate', type=float, default=4e-4, help='learning rate') 64 | parser.add_argument('--optim_alpha', type=float, default=0.8, help='alpha for adam') 65 | parser.add_argument('--optim_beta', type=float, default=0.999, help='beta used for adam') 66 | parser.add_argument('--weight_decay', type=float, default=0.0005, help='weight decay for adam') 67 | # Evaluation/Checkpointing 68 | parser.add_argument('--num_sents', type=int, default=-1, help='how many images to use when periodically evaluating the validation loss? (-1 = all)') 69 | parser.add_argument('--save_checkpoint_every', type=int, default=2000, help='how often to save a model checkpoint?') 70 | parser.add_argument('--checkpoint_path', type=str, default='output', help='directory to save models') 71 | parser.add_argument('--language_eval', type=int, default=0, help='Evaluate language as well (1 = yes, 0 = no)?') 72 | parser.add_argument('--losses_log_every', type=int, default=25, help='How often do we snapshot losses, for inclusion in the progress dump? (0 = disable)') 73 | parser.add_argument('--load_best_score', type=int, default=1, help='Do we load previous best score when resuming training.') 74 | parser.add_argument('--use_IoU', type=int, default=1, help='Whether to use IoU evaluation or not') 75 | # misc 76 | parser.add_argument('--id', type=str, default='mrcn_cmr_with_st', help='an id identifying this run/job.') 77 | parser.add_argument('--seed', type=int, default=24, help='random number generator seed to use') 78 | parser.add_argument('--gpuid', type=int, default=0, help='which gpu to use, -1 = use CPU') 79 | parser.add_argument('--exp_id', type=str, default='', help='experiment id') 80 | 81 | # parse 82 | args = parser.parse_args() 83 | opt = vars(args) 84 | pprint('parsed input parameters:') 85 | pprint(opt) 86 | return args 87 | 88 | if __name__ == '__main__': 89 | 90 | opt = parse_opt() 91 | print('opt[\'id\'] is ', opt['id']) 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /tools/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | import os.path as osp 7 | import json 8 | import time 9 | import random 10 | 11 | # model 12 | import _init_paths 13 | from loaders.dataloader import DataLoader 14 | from layers.match import AdaptiveReconstruct 15 | import models.utils as model_utils 16 | import models.eval as eval 17 | from opt import parse_opt 18 | 19 | import torch 20 | 21 | def main(args): 22 | opt = vars(args) 23 | # initialize 24 | opt['dataset_splitBy'] = opt['dataset'] + '_' + opt['splitBy'] 25 | checkpoint_dir = osp.join(opt['checkpoint_path'], opt['dataset_splitBy'], opt['exp_id']) 26 | if not osp.isdir(checkpoint_dir): 27 | os.makedirs(checkpoint_dir) 28 | 29 | # set random seed 30 | torch.manual_seed(opt['seed']) 31 | random.seed(opt['seed']) 32 | 33 | # set up loader 34 | data_json = osp.join('cache/prepro', opt['dataset_splitBy'], 'data.json') 35 | data_h5 = osp.join('cache/prepro', opt['dataset_splitBy'], 'data.h5') 36 | loader = DataLoader(data_h5=data_h5, data_json=data_json) 37 | 38 | # prepare feats 39 | feats_dir = '%s_%s_%s' % (args.net_name, args.imdb_name, args.tag) 40 | head_feats_dir = osp.join('cache/feats/', 
opt['dataset_splitBy'], 'mrcn', feats_dir) 41 | 42 | loader.prepare_mrcn(head_feats_dir, args) 43 | 44 | ann_feats = osp.join('cache/feats', opt['dataset_splitBy'], 'mrcn', 45 | '%s_%s_%s_ann_feats.h5' % (opt['net_name'], opt['imdb_name'], opt['tag'])) 46 | loader.loadFeats({'ann': ann_feats}) 47 | 48 | # set up model 49 | opt['vocab_size'] = loader.vocab_size 50 | opt['fc7_dim'] = loader.fc7_dim 51 | opt['pool5_dim'] = loader.pool5_dim 52 | opt['num_atts'] = loader.num_atts 53 | model = AdaptiveReconstruct(opt) 54 | 55 | infos = {} 56 | if opt['start_from'] is not None: 57 | pass 58 | iter = infos.get('iter', 0) 59 | epoch = infos.get('epoch', 0) 60 | val_accuracies = infos.get('val_accuracies', []) 61 | val_loss_history = infos.get('val_loss_history', {}) 62 | val_result_history = infos.get('val_result_history', {}) 63 | loss_history = infos.get('loss_history', {}) 64 | loader.iterators = infos.get('iterators', loader.iterators) 65 | if opt['load_best_score'] == 1: 66 | best_val_score = infos.get('best_val_score', None) 67 | 68 | 69 | att_weights = loader.get_attribute_weights() 70 | 71 | if opt['gpuid'] >= 0: 72 | model.cuda() 73 | 74 | 75 | # set up optimizer 76 | optimizer = torch.optim.Adam(model.parameters(), 77 | lr=opt['learning_rate'], 78 | betas=(opt['optim_alpha'], opt['optim_beta']), 79 | eps=opt['optim_epsilon']) 80 | 81 | data_time, model_time = 0, 0 82 | lr = opt['learning_rate'] 83 | best_predictions, best_overall = None, None 84 | while True: 85 | model.train() 86 | optimizer.zero_grad() 87 | 88 | T = {} 89 | 90 | tic = time.time() 91 | data = loader.getBatch('train', opt) 92 | 93 | labels = data['labels'] 94 | enc_labels = data['enc_labels'] 95 | dec_labels = data['dec_labels'] 96 | Feats = data['Feats'] 97 | att_labels, select_ixs = data['att_labels'], data['select_ixs'] 98 | 99 | T['data'] = time.time() - tic 100 | 101 | tic = time.time() 102 | scores, loss, _, _, _, _, _, vis_res_loss, att_res_loss, lang_res_loss = model(Feats['pool5'], Feats['fc7'], Feats['lfeats'], Feats['dif_lfeats'], 103 | Feats['cxt_fc7'], Feats['cxt_lfeats'], labels, enc_labels, dec_labels, att_labels, select_ixs, att_weights) 104 | 105 | loss.backward() 106 | model_utils.clip_gradient(optimizer, opt['grad_clip']) 107 | optimizer.step() 108 | T['model'] = time.time() - tic 109 | wrapped = data['bounds']['wrapped'] 110 | 111 | data_time += T['data'] 112 | model_time += T['model'] 113 | 114 | if iter % opt['losses_log_every'] == 0: 115 | loss_history[iter] = (loss.data[0]).item() 116 | print('iter[%s](epoch[%s]), train_loss=%.3f, lr=%.2E, data:%.2fs/iter, model:%.2fs/iter' \ 117 | % (iter, epoch, loss.data[0].item(), lr, data_time / opt['losses_log_every'], model_time/opt['losses_log_every'])) 118 | data_time, model_time = 0, 0 119 | 120 | if opt['learning_rate_decay_start'] > 0 and iter > opt['learning_rate_decay_start']: 121 | frac = (iter - opt['learning_rate_decay_start']) / opt['learning_rate_decay_every'] 122 | decay_factor = 0.1**frac 123 | lr = opt['learning_rate'] * decay_factor 124 | model_utils.set_lr(optimizer, lr) 125 | 126 | if (iter) % opt['save_checkpoint_every'] == 0 or iter == opt['max_iters']: 127 | val_loss, acc, predictions, val_vis_res_loss, val_lang_res_loss = eval.eval_split(loader, model, 'testB', opt) 128 | val_loss_history[iter] = val_loss 129 | val_result_history[iter] = {'loss': val_loss, 'accuracy': acc} 130 | val_accuracies += [(iter, acc)] 131 | print('validation loss: %.2f' % val_loss) 132 | print('validation acc : %.2f%%\n' % (acc * 100.0)) 133 | 134 | current_score = acc
135 | if best_val_score is None or current_score > best_val_score: 136 | best_val_score = current_score 137 | best_predictions = predictions 138 | checkpoint_path = osp.join(checkpoint_dir, opt['id'] + '.pth') 139 | checkpoint = {} 140 | checkpoint['model'] = model 141 | checkpoint['opt'] = opt 142 | torch.save(checkpoint, checkpoint_path) 143 | print('model saved to %s' % checkpoint_path) 144 | 145 | infos['iter'] = iter 146 | infos['epoch'] = epoch 147 | infos['iterators'] = loader.iterators 148 | infos['loss_history'] = loss_history 149 | infos['val_accuracies'] = val_accuracies 150 | infos['val_loss_history'] = val_loss_history 151 | infos['best_val_score'] = best_val_score 152 | infos['best_predictions'] = predictions if best_predictions is None else best_predictions 153 | 154 | infos['opt'] = opt 155 | infos['val_result_history'] = val_result_history 156 | infos['word_to_ix'] = loader.word_to_ix 157 | infos['att_to_ix'] = loader.att_to_ix 158 | with open(osp.join(checkpoint_dir, opt['id'] + '.json'), 'w', encoding="utf8") as io: 159 | json.dump(infos, io) 160 | 161 | iter += 1 162 | if wrapped: 163 | epoch += 1 164 | if iter >= opt['max_iters'] and opt['max_iters'] > 0: 165 | break 166 | 167 | if __name__ == '__main__': 168 | args = parse_opt() 169 | main(args) 170 | --------------------------------------------------------------------------------
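For reference, below is a minimal sketch of the IoU-based hit criterion used during evaluation in `lib/models/eval.py`: `computeIoU` is copied from that file, and the 0.5 threshold mirrors the `use_IoU` branch of `eval_split`. The example boxes are hypothetical values chosen only to illustrate the computation; they are not taken from any dataset.

```python
# Minimal sketch of the IoU check in lib/models/eval.py (eval_split).
# computeIoU is copied verbatim from that file; the boxes below are made up.

def computeIoU(box1, box2):
    # each box is [x1, y1, w, h]
    inter_x1 = max(box1[0], box2[0])
    inter_y1 = max(box1[1], box2[1])
    inter_x2 = min(box1[0] + box1[2] - 1, box2[0] + box2[2] - 1)
    inter_y2 = min(box1[1] + box1[3] - 1, box2[1] + box2[3] - 1)
    if inter_x1 < inter_x2 and inter_y1 < inter_y2:
        inter = (inter_x2 - inter_x1 + 1) * (inter_y2 - inter_y1 + 1)
    else:
        inter = 0
    union = box1[2] * box1[3] + box2[2] * box2[3] - inter
    return float(inter) / union

if __name__ == '__main__':
    gd_box = [0, 0, 10, 10]    # hypothetical ground-truth box (x, y, w, h)
    pred_box = [2, 0, 10, 10]  # hypothetical predicted box, shifted 2 px right
    iou = computeIoU(pred_box, gd_box)
    # eval_split counts a prediction as correct when IoU >= 0.5 (opt['use_IoU'] > 0)
    print('IoU = %.3f, hit = %s' % (iou, iou >= 0.5))  # IoU = 0.667, hit = True
```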