├── .idea
│   ├── ARN.iml
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── README.md
├── lib
│   ├── crits
│   │   ├── __init__.py
│   │   └── max_margin_crit.py
│   ├── layers
│   │   ├── lang_decoder.py
│   │   ├── lang_encoder.py
│   │   ├── loss.py
│   │   ├── match.py
│   │   └── visual_encoder.py
│   ├── loaders
│   │   ├── dataloader.py
│   │   └── loader.py
│   ├── models
│   │   ├── eval.py
│   │   └── utils.py
│   └── mrcn
│       ├── __init__.py
│       ├── inference.py
│       └── inference_no_imdb.py
└── tools
    ├── _init_paths.py
    ├── eval.py
    ├── opt.py
    └── train.py
/README.md:
--------------------------------------------------------------------------------
1 | # PyTorch Implementation of ARN
2 |
3 | ## Introduction
4 |
5 | This repository is the PyTorch implementation of [Adaptive Reconstruction Network for Weakly Supervised Referring Expression Grounding](https://arxiv.org/pdf/1908.10568.pdf), published in ICCV 2019.
6 | Please check our [paper](https://arxiv.org/pdf/1908.10568.pdf) for more details.
7 |
8 | ## Prerequisites
9 |
10 | * Python 3.5
11 | * PyTorch 0.4.1
12 | * CUDA 8.0
13 |
14 | ## Installation
15 |
16 | Please refer to [MAttNet](https://github.com/lichengunc/MAttNet) to install [mask-faster-rcnn](https://github.com/lichengunc/mask-faster-rcnn), [REFER](https://github.com/lichengunc/refer) and [refer-parser2](https://github.com/lichengunc/refer-parser2).
17 | Then follow Steps 1 & 2 of MAttNet's Training instructions to prepare the data and features.
18 |
19 | ## Training
20 |
21 | Train ARN with ground-truth annotation:
22 |
23 | ```bash
24 | CUDA_VISIBLE_DEVICES=${GPU_ID} python ./tools/train.py --dataset ${DATASET} --splitBy ${SPLITBY} --exp_id ${EXP_ID}
25 | ```
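
For example, a typical run might look like this (the dataset, split and experiment names are illustrative; substitute the ones produced by your own data preparation):

```bash
CUDA_VISIBLE_DEVICES=0 python ./tools/train.py --dataset refcoco --splitBy unc --exp_id exp0
```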
26 |
27 | ## Evaluation
28 |
29 | Evaluate ARN with ground-truth annotation:
30 |
31 | ```bash
32 | CUDA_VISIBLE_DEVICES=${GPU_ID} python ./tools/eval.py --dataset ${DATASET} --splitBy ${SPLITBY} --split ${SPLIT} --id ${EXP_ID}
33 | ```
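
For example, to evaluate the model trained above on the validation split (again with illustrative values):

```bash
CUDA_VISIBLE_DEVICES=0 python ./tools/eval.py --dataset refcoco --splitBy unc --split val --id exp0
```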
34 |
35 |
36 | ## Citation
37 |
38 |     @inproceedings{lxj2019arn,
39 |       title={Adaptive Reconstruction Network for Weakly Supervised Referring Expression Grounding},
40 |       author={Xuejing Liu and Liang Li and Shuhui Wang and Zheng-Jun Zha and Dechao Meng and Qingming Huang},
41 |       booktitle={ICCV},
42 |       year={2019}
43 |     }
44 |
45 |
46 | ## Acknowledgement
47 |
48 | Thanks to [Licheng Yu](http://cs.unc.edu/~licheng/) for his work. Our code is based on the implementation of [MAttNet](https://github.com/lichengunc/MAttNet).
49 |
50 | ## Authorship
51 |
52 | This project is maintained by [Xuejing Liu](https://gingl.github.io/).
53 |
--------------------------------------------------------------------------------
/lib/crits/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GingL/ARN/b1001040d8ac41292b2ccf6a6ab1f41c1d77d0fa/lib/crits/__init__.py
--------------------------------------------------------------------------------
/lib/crits/max_margin_crit.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import torch
6 | from torch.autograd import Variable
7 | import torch.nn as nn
8 | import pdb
9 |
10 | class MaxMarginCriterion(nn.Module):
11 |
12 | def __init__(self, visual_rank_weight, lang_rank_weight, margin):
13 | # visual_rank_weight=1, lang_rank_weight=1, margin=0.1
14 | super(MaxMarginCriterion, self).__init__()
15 | self.visual_rank = visual_rank_weight > 0
16 | self.lang_rank = lang_rank_weight > 0
17 | self.visual_rank_weight = visual_rank_weight
18 | self.lang_rank_weight = lang_rank_weight
19 | self.margin = margin
20 |
21 | def forward(self, cossim):
22 | # pdb.set_trace()
23 | N = cossim.size(0)
24 | batch_size = 0
25 | if self.visual_rank and not self.lang_rank:
26 | batch_size = N//2
27 | assert N % 2 == 0  # cossim stacks the paired scores followed by the unpaired ones
28 | paired = cossim[:batch_size]
29 | unpaired = cossim[batch_size:]
30 | visual_rank_loss = self.visual_rank_weight * torch.clamp(self.margin + unpaired - paired, min=0)
31 | lang_rank_loss = 0.
32 |
33 | elif not self.visual_rank and self.lang_rank:
34 | batch_size = N//2
35 | assert N % 2 == 0
36 | paired = cossim[:batch_size]
37 | unpaired = cossim[batch_size:]
38 | lang_rank_loss = self.lang_rank_weight * torch.clamp(self.margin + unpaired - paired, min=0)
39 | visual_rank_loss = 0.
40 |
41 | elif self.visual_rank and self.lang_rank:
42 | batch_size = N//3
43 | assert N % 3 == 0  # paired, visual-unpaired and language-unpaired thirds
44 | paired = cossim[:batch_size]
45 | visual_unpaired = cossim[batch_size: batch_size*2]
46 | lang_unpaired = cossim[batch_size*2:]
47 | visual_rank_loss = self.visual_rank_weight * torch.clamp(self.margin + visual_unpaired - paired, 0)
48 | lang_rank_loss = self.lang_rank_weight * torch.clamp(self.margin + lang_unpaired - paired, 0)
49 |
50 | else:
51 | raise NotImplementedError
52 |
53 | loss = (visual_rank_loss + lang_rank_loss).sum() / batch_size
54 | return loss
55 |
56 |
--------------------------------------------------------------------------------
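
A minimal usage sketch of the criterion above, with made-up cosine-similarity scores (the import assumes `lib/` is on the Python path, which `tools/_init_paths.py` arranges for the training script). With both ranking weights positive, the input is expected to stack the paired scores, the visually-unpaired scores and the language-unpaired scores along the batch dimension:

```python
import torch

from crits.max_margin_crit import MaxMarginCriterion

crit = MaxMarginCriterion(visual_rank_weight=1.0, lang_rank_weight=1.0, margin=0.1)

# two paired scores, then two visual-unpaired scores, then two language-unpaired scores
cossim = torch.tensor([0.9, 0.6,    # cos(expression, matching region)
                       0.3, 0.7,    # cos(expression, wrong region)
                       0.2, 0.6])   # cos(wrong expression, region)

loss = crit(cossim)  # hinge terms summed and averaged over the paired examples
print(loss.item())   # ~0.15 for the scores above
```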
/lib/layers/lang_decoder.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import numpy as np
6 | import torch
7 | from torch.autograd import Variable
8 | import torch.nn as nn
9 | import torch.nn.functional as F
10 |
11 |
12 | class LocationDecoder(nn.Module):
13 | def __init__(self, opt):
14 | super(LocationDecoder, self).__init__()
15 | self.mlp = nn.Sequential(nn.Linear(5 + 25, opt['jemb_dim']))
16 |
17 | def forward(self, loc_feats, total_ann_score):
18 | total_ann_score = total_ann_score.unsqueeze(1)
19 | loc_feats_fuse = torch.bmm(total_ann_score, loc_feats)
20 | loc_feats_fuse = loc_feats_fuse.squeeze(1)
21 | loc_feats_fuse = self.mlp(loc_feats_fuse)
22 | return loc_feats_fuse
23 |
24 |
25 | class SubjectDecoder(nn.Module):
26 | def __init__(self, opt):
27 | super(SubjectDecoder, self).__init__()
28 | self.mlp = nn.Sequential(nn.Linear(opt['pool5_dim'] + opt['fc7_dim'], opt['jemb_dim']))
29 |
30 | def forward(self, sub_feats, total_ann_score):
31 | total_ann_score = total_ann_score.unsqueeze(1)
32 | sub_feats_fuse = torch.bmm(total_ann_score, sub_feats)
33 | sub_feats_fuse = sub_feats_fuse.squeeze(1)
34 | sub_feats_fuse = self.mlp(sub_feats_fuse)
35 | return sub_feats_fuse
36 |
37 |
38 | class RelationDecoder(nn.Module):
39 | def __init__(self, opt):
40 | super(RelationDecoder, self).__init__()
41 | self.jemb_dim = opt['jemb_dim']
42 | self.word_vec_size = opt['word_vec_size']
43 | self.fc7_dim = opt['fc7_dim']
44 | self.mlp = nn.Sequential(nn.Linear(self.fc7_dim + 5, self.jemb_dim))
45 |
46 | def forward(self, rel_feats, total_ann_score, ixs):
47 | sent_num, ann_num = ixs.size(0), ixs.size(1)
48 | total_ann_score = total_ann_score.unsqueeze(1)
49 | ixs = ixs.view(sent_num, ann_num, 1).unsqueeze(3).expand(sent_num, ann_num, 1,
50 | self.fc7_dim + 5)
51 | rel_feats_max = torch.gather(rel_feats, 2, ixs)
52 | rel_feats_max = rel_feats_max.squeeze(2)
53 | rel_feats_fuse = torch.bmm(total_ann_score, rel_feats_max)
54 | rel_feats_fuse = rel_feats_fuse.squeeze(1)
55 | rel_feats_fuse = self.mlp(rel_feats_fuse)
56 | return rel_feats_fuse
57 |
--------------------------------------------------------------------------------
/lib/layers/lang_encoder.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import numpy as np
6 | import torch
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 |
10 | class RNNEncoder(nn.Module):
11 | def __init__(self, vocab_size, word_embedding_size, word_vec_size, hidden_size, bidirectional=False,
12 | input_dropout_p=0, dropout_p=0, n_layers=1, rnn_type='lstm', variable_lengths=True):
13 | super(RNNEncoder, self).__init__()
14 | self.variable_lengths = variable_lengths
15 | self.embedding = nn.Embedding(vocab_size, word_embedding_size)
16 | self.input_dropout = nn.Dropout(input_dropout_p)
17 | self.mlp = nn.Sequential(nn.Linear(word_embedding_size, word_vec_size),
18 | nn.ReLU())
19 | self.rnn_type = rnn_type
20 | self.rnn = getattr(nn, rnn_type.upper())(word_vec_size, hidden_size, n_layers,
21 | batch_first=True, bidirectional=bidirectional, dropout=dropout_p)
22 | self.num_dirs = 2 if bidirectional else 1
23 |
24 | def forward(self, input_labels):
25 | """
26 | Inputs:
27 | - input_labels: Variable long (batch, seq_len)
28 | Outputs:
29 | - output : Variable float (batch, max_len, hidden_size * num_dirs)
30 | - hidden : Variable float (batch, num_layers * num_dirs * hidden_size)
31 | - embedded: Variable float (batch, max_len, word_vec_size)
32 | """
33 | if self.variable_lengths:
34 | input_lengths = (input_labels != 0).sum(1)
35 |
36 | input_lengths_list = input_lengths.data.cpu().numpy().tolist()
37 | sorted_input_lengths_list = np.sort(input_lengths_list)[::-1].tolist()
38 | sort_ixs = np.argsort(input_lengths_list)[::-1].tolist()
39 | s2r = {s: r for r, s in enumerate(sort_ixs)}
40 | recover_ixs = [s2r[s] for s in range(len(input_lengths_list))]
41 | assert max(input_lengths_list) == input_labels.size(1)
42 |
43 | sort_ixs = input_labels.data.new(sort_ixs).long()
44 | recover_ixs = input_labels.data.new(recover_ixs).long()
45 |
46 | input_labels = input_labels[sort_ixs]
47 |
48 | # embed
49 | embedded = self.embedding(input_labels) # (n, seq_len, word_embedding_size)
50 | embedded = self.input_dropout(embedded) # (n, seq_len, word_embedding_size)
51 | embedded = self.mlp(embedded) # (n, seq_len, word_vec_size)
52 | if self.variable_lengths:
53 | embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_input_lengths_list, batch_first=True)
54 |
55 | output, hidden = self.rnn(embedded)
56 |
57 | # recover
58 | if self.variable_lengths:
59 | embedded, _ = nn.utils.rnn.pad_packed_sequence(embedded, batch_first=True)
60 | embedded = embedded[recover_ixs]
61 |
62 | # recover rnn
63 | output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True) # (batch, max_len, hidden)
64 | output = output[recover_ixs]
65 |
66 | if self.rnn_type=='lstm':
67 | hidden = hidden[0]
68 | hidden = hidden[:, recover_ixs, :]
69 | hidden = hidden.transpose(0, 1).contiguous()
70 | hidden = hidden.view(hidden.size(0), -1)
71 |
72 | return output, hidden, embedded
73 |
74 | class PhraseAttention(nn.Module):
75 | def __init__(self, input_dim):
76 | super(PhraseAttention, self).__init__()
77 | self.fc = nn.Linear(input_dim, 1)
78 |
79 | def forward(self, context, embedded, input_labels):
80 | cxt_scores = self.fc(context).squeeze(2)
81 |
82 | attn = F.softmax(cxt_scores, dim=1) # (batch, seq_len)
83 |
84 | is_not_zero = (input_labels != 0).float()
85 | attn = attn * is_not_zero
86 | attn = attn / attn.sum(1).view(attn.size(0), 1).expand(attn.size(0), attn.size(1)) # (batch, seq_len)
87 |
88 | attn3 = attn.unsqueeze(1)
89 | weighted_emb = torch.bmm(attn3, embedded)
90 | weighted_emb = weighted_emb.squeeze(1)
91 |
92 | return attn, weighted_emb
--------------------------------------------------------------------------------
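
A minimal sketch of how the encoder and phrase attention above fit together, using made-up word indices and hyper-parameters (ARN's actual settings come from `tools/opt.py`, and `lib/` is assumed to be on the Python path):

```python
import torch

from layers.lang_encoder import RNNEncoder, PhraseAttention

# hypothetical sizes
encoder = RNNEncoder(vocab_size=2000, word_embedding_size=512, word_vec_size=512,
                     hidden_size=512, bidirectional=True, rnn_type='lstm')
phrase_attn = PhraseAttention(input_dim=512 * 2)  # hidden_size * num_dirs

# two zero-padded expressions of lengths 4 and 2
labels = torch.LongTensor([[5, 23, 7, 9],
                           [12, 4, 0, 0]])

context, hidden, embedded = encoder(labels)                 # (2, 4, 1024), (2, 1024), (2, 4, 512)
attn, phrase_emb = phrase_attn(context, embedded, labels)   # (2, 4), (2, 512)
```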
/lib/layers/loss.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import torch
6 | from torch.autograd import Variable
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 | import numpy as np
10 |
11 | # Attribute Reconstruction Loss
12 | class AttributeReconstructLoss(nn.Module):
13 | def __init__(self, opt):
14 | super(AttributeReconstructLoss, self).__init__()
15 | self.att_dropout = nn.Dropout(opt['visual_drop_out'])
16 | self.att_fc = nn.Linear(opt['fc7_dim']+opt['pool5_dim'], opt['num_atts'])
17 |
18 |
19 | def forward(self, attribute_feats, total_ann_score, att_labels, select_ixs, att_weights):
20 | """attribute_feats.shape = (sent_num, ann_num, 512), total_ann_score.shape = (sent_num, ann_num)"""
21 | total_ann_score = total_ann_score.unsqueeze(1)
22 | att_feats_fuse = torch.bmm(total_ann_score, attribute_feats)
23 | att_feats_fuse = att_feats_fuse.squeeze(1)
24 | att_feats_fuse = self.att_dropout(att_feats_fuse)
25 | att_scores = self.att_fc(att_feats_fuse)
26 | if len(select_ixs) == 0:
27 | att_loss = 0
28 | else:
29 | att_loss = nn.BCEWithLogitsLoss(att_weights.cuda())(att_scores.index_select(0, select_ixs),
30 | att_labels.index_select(0, select_ixs))
31 | return att_scores, att_loss
32 |
33 | # Language Reconstruction Loss
34 | class LangReconstructionLoss(nn.Module):
35 | def __init__(self, opt):
36 | super(LangReconstructionLoss, self).__init__()
37 |
38 | self.variable_lengths = opt['variable_lengths'] > 0
39 | self.vocab_size = opt['vocab_size']
40 | self.word_embedding_size = opt['word_embedding_size']
41 | self.word_vec_size = opt['word_vec_size']
42 | self.hidden_size = opt['rnn_hidden_size']
43 | self.bidirectional = opt['decode_bidirectional'] > 0
44 | self.input_dropout_p = opt['word_drop_out']
45 | self.dropout_p = opt['rnn_drop_out']
46 | self.n_layers = opt['rnn_num_layers']
47 | self.rnn_type = opt['rnn_type']
48 | self.variable_lengths = opt['variable_lengths'] > 0
49 |
50 | self.embedding = nn.Embedding(self.vocab_size, self.word_embedding_size)
51 | self.input_dropout = nn.Dropout(self.input_dropout_p)
52 | self.mlp = nn.Sequential(nn.Linear(self.word_embedding_size, self.word_vec_size), nn.ReLU())
53 | self.rnn_type = self.rnn_type
54 | self.rnn = getattr(nn, self.rnn_type.upper())(self.word_vec_size*2, self.hidden_size, self.n_layers,
55 | batch_first=True, bidirectional=self.bidirectional,
56 | dropout=self.dropout_p)
57 | self.num_dirs = 2 if self.bidirectional else 1
58 |
59 | self.fc = nn.Linear(self.num_dirs * self.hidden_size, self.vocab_size)
60 | self.cross_entropy = nn.CrossEntropyLoss(reduce=False)
61 |
62 | def forward(self, vis_att_fuse, enc_labels, dec_labels):
63 | seq_len = enc_labels.size(1)
64 | sent_num = enc_labels.size(0)
65 | label_mask = (dec_labels != 0).float()
66 |
67 | if self.variable_lengths:
68 | input_lengths = (enc_labels != 0).sum(1)
69 | input_lengths_list = input_lengths.data.cpu().numpy().tolist()
70 | sorted_input_lengths_list = np.sort(input_lengths_list)[::-1].tolist()
71 | sort_ixs = np.argsort(input_lengths_list)[::-1].tolist()
72 | s2r = {s: r for r, s in enumerate(sort_ixs)}
73 | recover_ixs = [s2r[s] for s in range(len(input_lengths_list))]
74 |
75 | assert max(input_lengths_list) == enc_labels.size(1)
76 |
77 | sort_ixs = enc_labels.data.new(sort_ixs).long()
78 | recover_ixs = enc_labels.data.new(recover_ixs).long()
79 |
80 | input_labels = enc_labels[sort_ixs]
81 |
82 | vis_att_fuse = vis_att_fuse.unsqueeze(1)
83 | embedded = self.embedding(input_labels)
84 | embedded = self.input_dropout(embedded)
85 | embedded = self.mlp(embedded)
86 |
87 | embedded = torch.cat([embedded, torch.cat([vis_att_fuse, torch.zeros(sent_num, seq_len - 1,
88 | self.word_vec_size).cuda()], 1)], 2)
89 |
90 | if self.variable_lengths:
91 | embedded = nn.utils.rnn.pack_padded_sequence(embedded, sorted_input_lengths_list, batch_first=True)
92 |
93 | output, hidden = self.rnn(embedded)
94 |
95 | # recover
96 | if self.variable_lengths:
97 | # recover rnn
98 | output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
99 | output = output[recover_ixs]
100 |
101 | output = output.view(sent_num * seq_len, -1)
102 | output = self.fc(output)
103 |
104 | dec_labels = dec_labels.view(-1)
105 | label_mask = label_mask.view(-1)
106 |
107 | rec_loss = self.cross_entropy(output, dec_labels)
108 | rec_loss = torch.sum(rec_loss * label_mask) / torch.sum(label_mask)
109 |
110 | return rec_loss
111 |
112 | # Language Reconstruction Loss in Adaptive Reconstruction Loss
113 | class AdapLangReconstructLoss(nn.Module):
114 | def __init__(self, opt):
115 | super(AdapLangReconstructLoss, self).__init__()
116 |
117 | self.pool5_dim, self.fc7_dim = opt['pool5_dim'], opt['fc7_dim']
118 |
119 | self.variable_lengths = opt['variable_lengths'] > 0
120 | self.vocab_size = opt['vocab_size']
121 | self.word_embedding_size = opt['word_embedding_size']
122 | self.word_vec_size = opt['word_vec_size']
123 | self.hidden_size = opt['rnn_hidden_size']
124 | self.bidirectional = opt['decode_bidirectional'] > 0
125 | self.input_dropout_p = opt['word_drop_out']
126 | self.dropout_p = opt['rnn_drop_out']
127 | self.n_layers = opt['rnn_num_layers']
128 | self.rnn_type = opt['rnn_type']
129 | self.variable_lengths = opt['variable_lengths'] > 0
130 |
131 | self.embedding = nn.Embedding(self.vocab_size, self.word_embedding_size)
132 | self.input_dropout = nn.Dropout(self.input_dropout_p)
133 | self.mlp = nn.Sequential(nn.Linear(self.word_embedding_size, self.word_vec_size), nn.ReLU())
134 | self.rnn_type = self.rnn_type
135 | self.rnn = getattr(nn, self.rnn_type.upper())(self.word_vec_size * 2, self.hidden_size, self.n_layers,
136 | batch_first=True, bidirectional=self.bidirectional,
137 | dropout=self.dropout_p)
138 | self.num_dirs = 2 if self.bidirectional else 1
139 |
140 | self.slr_mlp = nn.Sequential(nn.Linear(self.word_vec_size * 3, self.word_vec_size),
141 | nn.ReLU())
142 |
143 | self.fc = nn.Linear(self.num_dirs * self.hidden_size, self.vocab_size)
144 |
145 | self.cross_entropy = nn.CrossEntropyLoss(reduce=False)
146 |
147 | def forward(self, sub_phrase_emb, loc_phrase_emb, rel_phrase_emb, enc_labels, dec_labels):
148 | """sub_phrase_emb, loc_phrase_emb, rel_phrase_emb.shape = (sent_num, 512), labels.shape = (sent_num, sent_length)"""
149 | slr_embeded = torch.cat([sub_phrase_emb, loc_phrase_emb, rel_phrase_emb], 1)
150 | slr_embeded = self.slr_mlp(slr_embeded)
151 |
152 | seq_len = enc_labels.size(1)
153 | label_mask = (dec_labels != 0).float()
154 | batchsize = enc_labels.size(0)
155 |
156 | if self.variable_lengths:
157 | input_lengths = (enc_labels != 0).sum(1)
158 | input_lengths_list = input_lengths.data.cpu().numpy().tolist()
159 | sorted_input_lengths_list = np.sort(input_lengths_list)[::-1].tolist()
160 | sort_ixs = np.argsort(input_lengths_list)[::-1].tolist()
161 | s2r = {s: r for r, s in enumerate(sort_ixs)}
162 | recover_ixs = [s2r[s] for s in range(len(input_lengths_list))]
163 |
164 | assert max(input_lengths_list) == enc_labels.size(1)
165 |
166 | sort_ixs = enc_labels.data.new(sort_ixs).long()
167 | recover_ixs = enc_labels.data.new(recover_ixs).long()
168 |
169 | input_labels = enc_labels[sort_ixs]
170 |
171 | slr_embeded = slr_embeded.view(batchsize, 1, -1)
172 |
173 | embedded = self.embedding(input_labels)
174 | embedded = self.input_dropout(embedded)
175 | embedded = self.mlp(embedded)
176 |
177 | slr_embedded = torch.cat([embedded, torch.cat([slr_embeded, torch.zeros(batchsize, seq_len - 1,
178 | self.word_embedding_size).cuda()], 1)], 2)
179 |
180 | if self.variable_lengths:
181 | slr_embedded = nn.utils.rnn.pack_padded_sequence(slr_embedded, sorted_input_lengths_list, batch_first=True)
182 |
183 | output, hidden = self.rnn(slr_embedded)
184 |
185 | # recover
186 | if self.variable_lengths:
187 | # recover rnn
188 | output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
189 | output = output[recover_ixs]
190 |
191 | output = output.view(batchsize * seq_len, -1)
192 | output = self.fc(output)
193 |
194 | dec_labels = dec_labels.view(-1)
195 | label_mask = label_mask.view(-1)
196 |
197 | lang_rec_loss = self.cross_entropy(output, dec_labels)
198 | lang_rec_loss = torch.sum(lang_rec_loss * label_mask) / torch.sum(label_mask)
199 |
200 | return lang_rec_loss
201 |
202 | # Visual Reconstruction Loss in Adaptive Reconstruction Loss
203 | class AdapVisualReconstructLoss(nn.Module):
204 | def __init__(self, opt):
205 | super(AdapVisualReconstructLoss, self).__init__()
206 |
207 | def forward(self, sub_phrase_emb, sub_phrase_recons, loc_phrase_emb, loc_phrase_recons, rel_phrase_emb,
208 | rel_phrase_recons, weights):
209 | """
210 | (sub_phrase_emb, sub_phrase_recons, loc_phrase_emb, loc_phrase_recons, rel_phrase_emb, rel_phrase_recons).shape=(sent_num, 512)
211 | weights.shape = (sent_num, 3)
212 | """
213 | sub_loss = self.mse_loss(sub_phrase_recons, sub_phrase_emb).sum(1).unsqueeze(1)
214 | loc_loss = self.mse_loss(loc_phrase_recons, loc_phrase_emb).sum(1).unsqueeze(1)
215 | rel_loss = self.mse_loss(rel_phrase_recons, rel_phrase_emb).sum(1).unsqueeze(1)
216 |
217 | total_loss = (weights * torch.cat([sub_loss, loc_loss, rel_loss], 1)).sum(1).mean(0)
218 |
219 | return total_loss
220 |
221 | def mse_loss(self, recons, emb):
222 | return (recons-emb)**2
223 |
--------------------------------------------------------------------------------
/lib/layers/match.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | from layers.lang_encoder import RNNEncoder, PhraseAttention
9 | from layers.visual_encoder import LocationEncoder, SubjectEncoder, RelationEncoder
10 | from layers.lang_decoder import LocationDecoder, SubjectDecoder, RelationDecoder
11 | from layers.loss import AttributeReconstructLoss, LangReconstructionLoss, AdapVisualReconstructLoss, AdapLangReconstructLoss
12 |
13 | class Score(nn.Module):
14 | def __init__(self, vis_dim, lang_dim, jemb_dim):
15 | super(Score, self).__init__()
16 |
17 | self.feat_fuse = nn.Sequential(nn.Linear(vis_dim+lang_dim, jemb_dim),
18 | nn.ReLU(),
19 | nn.Linear(jemb_dim, 1))
20 | self.softmax = nn.Softmax(dim=1)
21 | self.lang_dim = lang_dim
22 | self.vis_dim = vis_dim
23 |
24 | def forward(self, visual_input, lang_input):
25 |
26 | sent_num, ann_num = visual_input.size(0), visual_input.size(1)
27 |
28 | lang_input = lang_input.unsqueeze(1).expand(sent_num, ann_num, self.lang_dim)
29 | lang_input = nn.functional.normalize(lang_input, p=2, dim=2)
30 |
31 | ann_attn = self.feat_fuse(torch.cat([visual_input, lang_input], 2))
32 |
33 | ann_attn = self.softmax(ann_attn.view(sent_num, ann_num))
34 | ann_attn = ann_attn.unsqueeze(2)
35 |
36 | return ann_attn
37 |
38 |
39 | class RelationScore(nn.Module):
40 | def __init__(self, vis_dim, lang_dim, jemb_dim):
41 | super(RelationScore, self).__init__()
42 |
43 | self.feat_fuse = nn.Sequential(nn.Linear(vis_dim+lang_dim, jemb_dim),
44 | nn.ReLU(),
45 | nn.Linear(jemb_dim, 1))
46 | self.softmax = nn.Softmax(dim=1)
47 | self.lang_dim = lang_dim
48 | self.vis_dim = vis_dim
49 |
50 | def forward(self, visual_input, lang_input, masks):
51 |
52 | sent_num, ann_num, cxt_num = visual_input.size(0), visual_input.size(1), visual_input.size(2)
53 |
54 | visual_input = visual_input.view(sent_num, ann_num*cxt_num, -1)
55 | visual_emb_normalized = nn.functional.normalize(visual_input, p=2, dim=2)
56 | lang_input = lang_input.unsqueeze(1).expand(sent_num, ann_num, self.lang_dim).contiguous()
57 | lang_input = lang_input.unsqueeze(2).expand(sent_num, ann_num, cxt_num, self.lang_dim).contiguous()
58 | lang_input = lang_input.reshape(sent_num, ann_num*cxt_num, -1)
59 | lang_emb_normalized = nn.functional.normalize(lang_input, p=2, dim=2)
60 |
61 |
62 | ann_attn = self.feat_fuse(torch.cat([visual_emb_normalized, lang_emb_normalized], 2))
63 |
64 | ann_attn = ann_attn.squeeze(2).contiguous()
65 | ann_attn = ann_attn.view(sent_num, ann_num, -1)
66 |
67 | ann_attn = masks * ann_attn
68 | ann_attn, ixs = torch.max(ann_attn, 2)
69 | ann_attn = self.softmax(ann_attn)
70 | ann_attn = ann_attn.unsqueeze(2)
71 |
72 | return ann_attn, ixs
73 |
74 | class AdaptiveReconstruct(nn.Module):
75 | def __init__(self, opt):
76 | super(AdaptiveReconstruct, self).__init__()
77 | num_layers = opt['rnn_num_layers']
78 | hidden_size = opt['rnn_hidden_size']
79 | num_dirs = 2 if opt['bidirectional'] > 0 else 1
80 | self.word_vec_size = opt['word_vec_size']
81 | self.pool5_dim, self.fc7_dim = opt['pool5_dim'], opt['fc7_dim']
82 |
83 | self.lang_res_weight = opt['lang_res_weight']
84 | self.vis_res_weight = opt['vis_res_weight']
85 | self.att_res_weight = opt['att_res_weight']
86 | self.loss_combined = opt['loss_combined']
87 | self.loss_divided = opt['loss_divided']
88 |
89 | # language rnn encoder
90 | self.rnn_encoder = RNNEncoder(vocab_size=opt['vocab_size'],
91 | word_embedding_size=opt['word_embedding_size'],
92 | word_vec_size=opt['word_vec_size'],
93 | hidden_size=opt['rnn_hidden_size'],
94 | bidirectional=opt['bidirectional']>0,
95 | input_dropout_p=opt['word_drop_out'],
96 | dropout_p=opt['rnn_drop_out'],
97 | n_layers=opt['rnn_num_layers'],
98 | rnn_type=opt['rnn_type'],
99 | variable_lengths=opt['variable_lengths'] > 0)
100 |
101 |
102 | self.weight_fc = nn.Linear(num_layers * num_dirs *hidden_size, 3)
103 |
104 | self.sub_attn = PhraseAttention(hidden_size * num_dirs)
105 | self.loc_attn = PhraseAttention(hidden_size * num_dirs)
106 | self.rel_attn = PhraseAttention(hidden_size * num_dirs)
107 |
108 | self.sub_encoder = SubjectEncoder(opt)
109 | self.loc_encoder = LocationEncoder(opt)
110 | self.rel_encoder = RelationEncoder(opt)
111 |
112 | self.sub_score = Score(self.pool5_dim+self.fc7_dim, opt['word_vec_size'],
113 | opt['jemb_dim'])
114 | self.loc_score = Score(25+5, opt['word_vec_size'],
115 | opt['jemb_dim'])
116 | self.rel_score = RelationScore(self.fc7_dim+5, opt['word_vec_size'],
117 | opt['jemb_dim'])
118 |
119 | self.sub_decoder = SubjectDecoder(opt)
120 | self.loc_decoder = LocationDecoder(opt)
121 | self.rel_decoder = RelationDecoder(opt)
122 |
123 | self.att_res_loss = AttributeReconstructLoss(opt)
124 | self.vis_res_loss = AdapVisualReconstructLoss(opt)
125 | self.lang_res_loss = AdapLangReconstructLoss(opt)
126 | self.rec_loss = LangReconstructionLoss(opt)
127 |
128 | self.sub_mlp = nn.Sequential(nn.Linear(opt['jemb_dim'], self.pool5_dim+self.fc7_dim))
129 | self.loc_mlp = nn.Sequential(nn.Linear(opt['jemb_dim'], 25+5))
130 | self.rel_mlp = nn.Sequential(nn.Linear(opt['jemb_dim'], self.fc7_dim+5))
131 |
132 | self.feat_fuse = nn.Sequential(
133 | nn.Linear(self.fc7_dim + self.pool5_dim + 25 + 5 + self.fc7_dim + 5, opt['jemb_dim']),
134 | nn.ReLU())
135 |
136 | def forward(self, pool5, fc7, lfeats, dif_lfeats, cxt_fc7, cxt_lfeats, labels, enc_labels, dec_labels, att_labels, select_ixs, att_weights):
137 |
138 | context, hidden, embedded = self.rnn_encoder(labels)
139 |
140 | weights = F.softmax(self.weight_fc(hidden), dim=1)  # (sent_num, 3) module weights
141 | sub_attn, sub_phrase_emb = self.sub_attn(context, embedded, labels)
142 | loc_attn, loc_phrase_emb = self.loc_attn(context, embedded, labels)
143 | rel_attn, rel_phrase_emb = self.rel_attn(context, embedded, labels)
144 |
145 | sent_num = pool5.size(0)
146 | ann_num = pool5.size(1)
147 |
148 | # subject matching score
149 | sub_feats = self.sub_encoder(pool5, fc7, sub_phrase_emb)
150 | sub_ann_attn = self.sub_score(sub_feats, sub_phrase_emb)
151 |
152 | # location matching score
153 | loc_feats = self.loc_encoder(lfeats, dif_lfeats)
154 | loc_ann_attn = self.loc_score(loc_feats, loc_phrase_emb)
155 |
156 | # relation matching score
157 | rel_feats, masks = self.rel_encoder(cxt_fc7, cxt_lfeats)
158 | rel_ann_attn, rel_ixs = self.rel_score(rel_feats, rel_phrase_emb, masks)
159 |
160 | weights_expand = weights.unsqueeze(1).expand(sent_num, ann_num, 3)
161 | total_ann_score = (weights_expand * torch.cat([sub_ann_attn, loc_ann_attn, rel_ann_attn], 2)).sum(2)
162 |
163 | loss = 0
164 | att_res_loss = 0
165 | lang_res_loss = 0
166 | vis_res_loss = 0
167 |
168 | # divided_loss
169 | sub_phrase_recons = self.sub_decoder(sub_feats, total_ann_score)
170 | loc_phrase_recons = self.loc_decoder(loc_feats, total_ann_score)
171 | rel_phrase_recons = self.rel_decoder(rel_feats, total_ann_score, rel_ixs)
172 |
173 | if self.vis_res_weight > 0:
174 | vis_res_loss = self.vis_res_loss(sub_phrase_emb, sub_phrase_recons, loc_phrase_emb,
175 | loc_phrase_recons, rel_phrase_emb, rel_phrase_recons, weights)
176 | loss = self.vis_res_weight * vis_res_loss
177 |
178 | if self.lang_res_weight > 0:
179 | lang_res_loss = self.lang_res_loss(sub_phrase_emb, loc_phrase_emb, rel_phrase_emb, enc_labels,
180 | dec_labels)
181 |
182 | loss += self.lang_res_weight * lang_res_loss
183 |
184 |
185 | # weight the divided (phrase-level) losses, then add the combined reconstruction loss below
186 | loss = self.loss_divided * loss
187 |
188 | ann_score = total_ann_score.unsqueeze(1)
189 |
190 | ixs = rel_ixs.view(sent_num, ann_num, 1).unsqueeze(3).expand(sent_num, ann_num, 1, self.fc7_dim + 5)
191 | rel_feats_max = torch.gather(rel_feats, 2, ixs)
192 | rel_feats_max = rel_feats_max.squeeze(2)
193 |
194 | fuse_feats = torch.cat([sub_feats, loc_feats, rel_feats_max], 2)
195 | fuse_feats = torch.bmm(ann_score, fuse_feats)
196 | fuse_feats = fuse_feats.squeeze(1)
197 | fuse_feats = self.feat_fuse(fuse_feats)
198 | rec_loss = self.rec_loss(fuse_feats, enc_labels, dec_labels)
199 | loss += self.loss_combined * rec_loss
200 |
201 | if self.att_res_weight > 0:
202 | att_scores, att_res_loss = self.att_res_loss(sub_feats, total_ann_score, att_labels, select_ixs, att_weights)
203 | loss += self.att_res_weight * att_res_loss
204 |
205 | return total_ann_score, loss, rel_ixs, sub_attn, loc_attn, rel_attn, weights, \
206 | vis_res_loss, att_res_loss, lang_res_loss
207 |
208 |
209 |
210 |
211 |
212 |
--------------------------------------------------------------------------------
/lib/layers/visual_encoder.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import torch
6 | from torch.autograd import Variable
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 |
10 |
11 | class Normalize_Scale(nn.Module):
12 | def __init__(self, dim, init_norm=20):
13 | super(Normalize_Scale, self).__init__()
14 | self.init_norm = init_norm
15 | self.weight = nn.Parameter(torch.ones(1, dim) * init_norm)
16 |
17 | def forward(self, bottom):
18 | assert isinstance(bottom, Variable), 'bottom must be variable'
19 |
20 | bottom_normalized = nn.functional.normalize(bottom, p=2, dim=1)
21 | bottom_normalized_scaled = bottom_normalized * self.weight
22 | return bottom_normalized_scaled
23 |
24 | class LocationEncoder(nn.Module):
25 | def __init__(self, opt):
26 | super(LocationEncoder, self).__init__()
27 | init_norm = opt.get('visual_init_norm', 20)
28 | self.lfeats_normalizer = Normalize_Scale(5, init_norm)
29 | self.dif_lfeat_normalizer = Normalize_Scale(25, init_norm)
30 |
31 | def forward(self, lfeats, dif_lfeats):
32 | sent_num, ann_num = lfeats.size(0), lfeats.size(1)
33 | output = torch.cat([self.lfeats_normalizer(lfeats.contiguous().view(-1, 5)),
34 | self.dif_lfeat_normalizer(dif_lfeats.contiguous().view(-1, 25))], 1)
35 | output = output.view(sent_num, ann_num, 5+25)
36 |
37 | return output
38 |
39 | class SubjectEncoder(nn.Module):
40 | def __init__(self, opt):
41 | super(SubjectEncoder, self).__init__()
42 | self.word_vec_size = opt['word_vec_size']
43 | self.jemb_dim = opt['jemb_dim']
44 | self.pool5_dim, self.fc7_dim = opt['pool5_dim'], opt['fc7_dim']
45 |
46 | self.pool5_normalizer = Normalize_Scale(opt['pool5_dim'], opt['visual_init_norm'])
47 | self.fc7_normalizer = Normalize_Scale(opt['fc7_dim'], opt['visual_init_norm'])
48 |
49 | def forward(self, pool5, fc7, phrase_emb):
50 | sent_num, ann_num, grids = pool5.size(0), pool5.size(1), pool5.size(3)*pool5.size(4)
51 | batch = sent_num * ann_num
52 |
53 | pool5 = pool5.contiguous().view(batch, self.pool5_dim, -1)
54 | pool5 = pool5.transpose(1,2).contiguous().view(-1, self.pool5_dim)
55 | pool5 = self.pool5_normalizer(pool5)
56 | pool5 = pool5.view(sent_num, ann_num, 49, -1).transpose(2, 3).contiguous().mean(3)
57 |
58 | fc7 = fc7.contiguous().view(batch, self.fc7_dim, -1)
59 | fc7 = fc7.transpose(1, 2).contiguous().view(-1, self.fc7_dim)
60 | fc7 = self.fc7_normalizer(fc7)
61 | fc7 = fc7.view(sent_num, ann_num, 49, -1).transpose(2, 3).contiguous().mean(3)
62 |
63 | avg_att_feats = torch.cat([pool5, fc7], 2)
64 |
65 | return avg_att_feats
66 |
67 | class RelationEncoder(nn.Module):
68 | def __init__(self, opt):
69 | super(RelationEncoder, self).__init__()
70 | self.vis_feat_normalizer = Normalize_Scale(opt['fc7_dim'], opt['visual_init_norm'])
71 | self.lfeat_normalizer = Normalize_Scale(5, opt['visual_init_norm'])
72 |
73 | def forward(self, cxt_feats, cxt_lfeats):
74 | masks = (cxt_lfeats.sum(3) != 0).float()
75 |
76 | sent_num, ann_num = cxt_feats.size(0), cxt_feats.size(1)
77 | batch, num_cxt = sent_num*ann_num, cxt_feats.size(2)
78 | cxt_feats = self.vis_feat_normalizer(cxt_feats.contiguous().view(batch * num_cxt, -1))
79 | cxt_lfeats = self.lfeat_normalizer(cxt_lfeats.contiguous().view(batch * num_cxt, -1))
80 |
81 | rel_feats = torch.cat([cxt_feats, cxt_lfeats], 1)
82 |
83 | rel_feats = rel_feats.view(sent_num, ann_num, num_cxt, -1)
84 | return rel_feats, masks
--------------------------------------------------------------------------------
/lib/loaders/dataloader.py:
--------------------------------------------------------------------------------
1 | """
2 | data_json has
3 | 0. refs: [{ref_id, ann_id, box, image_id, split, category_id, sent_ids, att_wds}]
4 | 1. images: [{image_id, ref_ids, file_name, width, height, h5_id}]
5 | 2. anns: [{ann_id, category_id, image_id, box, h5_id}]
6 | 3. sentences: [{sent_id, tokens, h5_id}]
7 | 4. word_to_ix: {word: ix}
8 | 5. att_to_ix : {att_wd: ix}
9 | 6. att_to_cnt: {att_wd: cnt}
10 | 7. label_length: L
11 |
12 | Note, box in [xywh] format
13 | label_h5 has
14 | /labels is (M, max_length) uint32 array of encoded labels, zeros padded
15 | """
16 |
17 | from __future__ import absolute_import
18 | from __future__ import division
19 | from __future__ import print_function
20 |
21 | import os.path as osp
22 | import numpy as np
23 | import h5py
24 | import random
25 | from loaders.loader import Loader
26 |
27 | import torch
28 | from torch.autograd import Variable
29 |
30 | from mrcn import inference_no_imdb
31 | import functools
32 |
33 | # box functions
34 | def xywh_to_xyxy(boxes):
35 | """Convert [x y w h] box format to [x1 y1 x2 y2] format."""
36 | return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1))
37 |
38 | def xyxy_to_xywh(boxes):
39 | """Convert [x1 y1 x2 y2] box format to [x y w h] format."""
40 | return np.hstack((boxes[:, 0:2], boxes[:, 2:4] - boxes[:, 0:2] + 1))
41 |
42 | class DataLoader(Loader):
43 |
44 | def __init__(self, data_json, data_h5):
45 | # parent loader instance
46 | Loader.__init__(self, data_json, data_h5)
47 |
48 | # prepare attributes
49 | self.att_to_ix = self.info['att_to_ix']
50 | self.ix_to_att = {ix: wd for wd, ix in self.att_to_ix.items()}
51 | self.num_atts = len(self.att_to_ix)
52 | self.att_to_cnt = self.info['att_to_cnt']
53 |
54 | # img_iterators for each split
55 | self.split_ix = {}
56 | self.iterators = {}
57 | for image_id, image in self.Images.items():
58 | # use the split of the image's first ref (we assume all refs of an image share one split)
59 | split = self.Refs[image['ref_ids'][0]]['split']
60 | if split not in self.split_ix:
61 | self.split_ix[split] = []
62 | self.iterators[split] = 0
63 | self.split_ix[split] += [image_id]
64 | for k, v in self.split_ix.items():
65 | print('assigned %d images to split %s' %(len(v), k))
66 |
67 | def prepare_mrcn(self, head_feats_dir, args):
68 | """
69 | Arguments:
70 | head_feats_dir: cache/feats/dataset_splitBy/net_imdb_tag, containing all image conv_net feats
71 | args: imdb_name, net_name, iters, tag
72 | """
73 | self.head_feats_dir = head_feats_dir
74 | self.mrcn = inference_no_imdb.Inference(args)
75 | assert args.net_name == 'res101'
76 | self.pool5_dim = 1024
77 | self.fc7_dim = 2048
78 |
79 | # load different kinds of feats
80 | def loadFeats(self, Feats):
81 | # Feats = {feats_name: feats_path}
82 | self.feats = {}
83 | self.feat_dim = None
84 | for feats_name, feats_path in Feats.items():
85 | if osp.isfile(feats_path):
86 | self.feats[feats_name] = h5py.File(feats_path, 'r')
87 | self.feat_dim = self.feats[feats_name]['fc7'].shape[1]
88 | assert self.feat_dim == self.fc7_dim
89 | print('FeatLoader loading [%s] from %s [feat_dim %s]' %(feats_name, feats_path, self.feat_dim))
90 |
91 | # shuffle split
92 | def shuffle(self, split):
93 | random.shuffle(self.split_ix[split])
94 |
95 | # reset iterator
96 | def resetIterator(self, split):
97 | self.iterators[split]=0
98 |
99 | # expand list by seq per ref, i.e., [a,b], 3 -> [aaabbb]
100 | def expand_list(self, L, n):
101 | out = []
102 | for l in L:
103 | out += [l] * n
104 | return out
105 |
106 | def image_to_head(self, image_id):
107 | """Returns
108 | head: float32 (1, 1024, H, W)
109 | im_info: float32 [[im_h, im_w, im_scale]]
110 | """
111 | feats_h5 = osp.join(self.head_feats_dir, str(image_id)+'.h5')
112 | feats = h5py.File(feats_h5, 'r')
113 | head, im_info = feats['head'], feats['im_info']
114 | return np.array(head), np.array(im_info)
115 |
116 | def fetch_sent_ids_by_ref_id(self, ref_id, num_sents):
117 | """
118 | Sample #num_sents sents for each ref_id.
119 | """
120 | sent_ids = list(self.Refs[ref_id]['sent_ids'])
121 | if len(sent_ids) < num_sents:
122 | append_sent_ids = [random.choice(sent_ids) for _ in range(num_sents-len(sent_ids))]
123 | sent_ids += append_sent_ids
124 | else:
125 | random.shuffle(sent_ids)
126 | sent_ids = sent_ids[:num_sents]
127 | assert len(sent_ids) == num_sents
128 | return sent_ids
129 |
130 | def fetch_neighbour_ids(self, ann_id):
131 | """
132 | For a given ann_id, we return
133 | - st_ann_ids: same-type neighbouring ann_ids (not include itself)
134 | - dt_ann_ids: different-type neighbouring ann_ids
135 | Ordered by distance to the input ann_id
136 | """
137 | ann = self.Anns[ann_id]
138 | x,y,w,h = ann['box']
139 | rx, ry = x+w/2, y+h/2
140 |
141 | @functools.cmp_to_key
142 | def compare(ann_id0, ann_id1):
143 | x,y,w,h = self.Anns[ann_id0]['box']
144 | ax0, ay0 = x+w/2, y+h/2
145 | x,y,w,h = self.Anns[ann_id1]['box']
146 | ax1, ay1 = x+w/2, y+h/2
148 | # sort from closer to farther
148 | if (rx-ax0)**2+(ry-ay0)**2 <= (rx-ax1)**2+(ry-ay1)**2:
149 | return -1
150 | else:
151 | return 1
152 |
153 | image = self.Images[ann['image_id']]
154 |
155 | ann_ids = list(image['ann_ids'])
156 | ann_ids = sorted(ann_ids, key=compare)
157 |
158 | st_ann_ids, dt_ann_ids = [], []
159 | for ann_id_else in ann_ids:
160 | if ann_id_else != ann_id:
161 | if self.Anns[ann_id_else]['category_id'] == ann['category_id']:
162 | st_ann_ids += [ann_id_else]
163 | else:
164 | dt_ann_ids +=[ann_id_else]
165 | return st_ann_ids, dt_ann_ids
166 |
167 | def fetch_grid_feats(self, boxes, net_conv, im_info):
168 | """returns -pool5 (n, 1024, 7, 7) -fc7 (n, 2048, 7, 7)"""
169 | pool5, fc7 = self.mrcn.box_to_spatial_fc7(net_conv, im_info, boxes)
170 | return pool5, fc7
171 |
172 | def compute_lfeats(self, ann_ids):
173 | # return ndarray float32 (#ann_ids, 5)
174 | lfeats = np.empty((len(ann_ids), 5), dtype=np.float32)
175 | for ix, ann_id in enumerate(ann_ids):
176 | ann = self.Anns[ann_id]
177 | image = self.Images[ann['image_id']]
178 | x, y ,w, h = ann['box']
179 | ih, iw = image['height'], image['width']
180 | lfeats[ix] = np.array([x/iw, y/ih, (x+w-1)/iw, (y+h-1)/ih, w*h/(iw*ih)],np.float32)
181 | return lfeats
182 |
183 | def compute_dif_lfeats(self, ann_ids, topK=5):
184 | # return ndarray float32 (#ann_ids, 5*topK)
185 | dif_lfeats = np.zeros((len(ann_ids), 5*topK), dtype=np.float32)
186 | for i, ann_id in enumerate(ann_ids):
187 | # reference box
188 | rbox = self.Anns[ann_id]['box']
189 | rcx,rcy,rw,rh = rbox[0]+rbox[2]/2,rbox[1]+rbox[3]/2,rbox[2],rbox[3]
190 | st_ann_ids, _ =self.fetch_neighbour_ids(ann_id)
191 | # candidate box
192 | for j, cand_ann_id in enumerate(st_ann_ids[:topK]):
193 | cbox = self.Anns[cand_ann_id]['box']
194 | cx1, cy1, cw, ch = cbox[0], cbox[1], cbox[2], cbox[3]
195 | dif_lfeats[i, j*5:(j+1)*5] = np.array([(cx1-rcx)/rw, (cy1-rcy)/rh, (cx1+cw-rcx)/rw, (cy1+ch-rcy)/rh, cw*ch/(rw*rh)])
196 | return dif_lfeats
197 |
198 | def fetch_cxt_feats(self, ann_ids, opt):
199 | """
200 | Return
201 | - cxt_feats : ndarray (#ann_ids, topK, fc7_dim)
202 | - cxt_lfeats: ndarray (#ann_ids, topK, 5)
203 | - cxt_ann_ids: [[ann_id]] of size (#ann_ids, topK), padded with -1
204 | Note we only use neighbouring "different" (+ "same") objects for computing context objects, zeros padded.
205 | """
206 | topK = opt['num_cxt']
207 | cxt_feats = np.zeros((len(ann_ids), topK, self.fc7_dim), dtype=np.float32)
208 | cxt_lfeats = np.zeros((len(ann_ids), topK, 5), dtype=np.float32)
209 | cxt_ann_ids = [[-1 for _ in range(topK)] for _ in range(len(ann_ids))]
210 | for i, ann_id in enumerate(ann_ids):
211 | # reference box
212 | rbox = self.Anns[ann_id]['box']
213 | rcx, rcy, rw, rh = rbox[0]+rbox[2]/2, rbox[1]+rbox[3]/2, rbox[2], rbox[3]
214 | # candidate boxes
215 | st_ann_ids, dt_ann_ids = self.fetch_neighbour_ids(ann_id)
216 | if opt['with_st'] > 0:
217 | cand_ann_ids = dt_ann_ids+st_ann_ids
218 | else:
219 | cand_ann_ids = dt_ann_ids
220 | cand_ann_ids = cand_ann_ids[:topK]
221 | for j, cand_ann_id in enumerate(cand_ann_ids):
222 | cand_ann = self.Anns[cand_ann_id]
223 | cbox = cand_ann['box']
224 | cx1, cy1, cw, ch = cbox[0], cbox[1], cbox[2], cbox[3]
225 | cxt_lfeats[i,j,:] = np.array([(cx1-rcx)/rw, (cy1-rcy)/rh, (cx1+cw-rcx)/rw, (cy1+ch-rcy)/rh, cw*ch/(rw*rh)])
226 | cxt_feats[i,j,:] = self.feats['ann']['fc7'][cand_ann['h5_id'], :]
227 | cxt_ann_ids[i][j] = cand_ann_id
228 | return cxt_feats, cxt_lfeats, cxt_ann_ids
229 |
230 | def fetch_attribute_label(self, ref_ids):
231 | """Return
232 | - labels : Variable float (N, num_atts)
233 | - select_ixs: Variable long (n, )
234 | """
235 | labels = np.zeros((len(ref_ids), self.num_atts))
236 | select_ixs = []
237 | for i, ref_id in enumerate(ref_ids):
238 | ref = self.Refs[ref_id]
239 | if len(ref['att_wds']) > 0:
240 | select_ixs += [i]
241 | for wd in ref['att_wds']:
242 | labels[i, self.att_to_ix[wd]] = 1
243 |
244 | return Variable(torch.from_numpy(labels).float().cuda()), Variable(torch.LongTensor(select_ixs).cuda())
245 |
246 |
247 | def extract_ann_features(self, image_id, opt):
248 | """Get features for all ann_ids in an image"""
249 | image = self.Images[image_id]
250 | ann_ids = image['ann_ids']
251 |
252 | # fetch image features
253 | head, im_info = self.image_to_head(image_id)
254 | head = Variable(torch.from_numpy(head).cuda())
255 |
256 | # fetch ann features
257 | ann_boxes = xywh_to_xyxy(np.vstack([self.Anns[ann_id]['box'] for ann_id in ann_ids]))
258 | ann_pool5, ann_fc7 = self.fetch_grid_feats(ann_boxes, head, im_info)
259 |
260 | # absolute location features
261 | lfeats = self.compute_lfeats(ann_ids)
262 | lfeats = Variable(torch.from_numpy(lfeats).cuda())
263 |
264 | # relative location features
265 | dif_lfeats = self.compute_dif_lfeats(ann_ids)
266 | dif_lfeats = Variable(torch.from_numpy(dif_lfeats).cuda())
267 |
268 | # fetch context_fc7 and context_lfeats
269 | cxt_fc7, cxt_lfeats, cxt_ann_ids = self.fetch_cxt_feats(ann_ids, opt)
270 | cxt_fc7 = Variable(torch.from_numpy(cxt_fc7).cuda())
271 | cxt_lfeats = Variable(torch.from_numpy(cxt_lfeats).cuda())
272 |
273 | return cxt_ann_ids, ann_fc7, ann_pool5, lfeats, dif_lfeats, cxt_fc7, cxt_lfeats
274 |
275 |
276 | # get batch of data
277 | def getBatch(self, split, opt):
278 | split_ix = self.split_ix[split]
279 | max_index = len(split_ix) - 1
280 | wrapped = False
281 | TopK = opt['num_cxt']
282 |
283 | # each batch contains one image
284 | ri = self.iterators[split]
285 | ri_next = ri+1
286 | if ri_next > max_index:
287 | ri_next = 0
288 | wrapped = True
289 | self.iterators[split] = ri_next
290 | image_id = split_ix[ri]
291 |
292 | # fetch feats
293 | cxt_ann_ids, ann_fc7, ann_pool5, lfeats, dif_lfeats, cxt_fc7, cxt_lfeats = self.extract_ann_features(image_id, opt)
294 | ann_ids = self.Images[image_id]['ann_ids']
295 | ann_num = len(ann_ids)
296 | ref_ids = self.Images[image_id]['ref_ids']
297 |
298 | img_ref_ids = []
299 | img_sent_ids = []
300 | gd_ixs = []
301 | gd_boxes = []
302 | for ref_id in ref_ids:
303 | ref = self.Refs[ref_id]
304 | for sent_id in ref['sent_ids']:
305 | img_ref_ids += [ref_id]
306 | img_sent_ids += [sent_id]
307 | gd_ixs += [ann_ids.index(ref['ann_id'])]
308 | gd_boxes += [ref['box']]
309 | img_sent_num = len(img_sent_ids)
310 |
311 | pool5 = ann_pool5.unsqueeze(0).expand(img_sent_num, ann_num, self.pool5_dim, 7, 7)
312 | pool5 = pool5.detach()  # detach() is not in-place, keep the returned tensor
313 | fc7 = ann_fc7.unsqueeze(0).expand(img_sent_num, ann_num, self.fc7_dim, 7, 7)
314 | fc7 = fc7.detach()
315 | lfeats = lfeats.unsqueeze(0).expand(img_sent_num, ann_num, 5)
316 | dif_lfeats = dif_lfeats.unsqueeze(0).expand(img_sent_num, ann_num, TopK*5)
317 | cxt_fc7 = cxt_fc7.unsqueeze(0).expand(img_sent_num, ann_num, TopK, self.fc7_dim)
318 | cxt_lfeats = cxt_lfeats.unsqueeze(0).expand(img_sent_num, ann_num, TopK, 5)
319 |
320 |
321 | att_labels, select_ixs = self.fetch_attribute_label(img_ref_ids)
322 |
323 | cxt_ann_ids = [cxt_ann_ids for j in range(img_sent_num)]
324 |
325 |
326 | labels = np.vstack([self.fetch_seq(sent_id) for sent_id in img_sent_ids])
327 | labels = Variable(torch.from_numpy(labels).long().cuda())
328 | max_len = (labels!=0).sum(1).max().data[0]
329 | labels = labels[:, :max_len]
330 |
331 | start_words = np.ones([labels.size(0), 1], dtype=int)*(self.word_to_ix[''])
332 | start_words = Variable(torch.from_numpy(start_words).long().cuda())
333 | enc_labels = labels.clone()
334 | enc_labels = torch.cat([start_words, enc_labels], 1)
335 |
336 | zero_pad = np.zeros([labels.size(0), 1], dtype=int)
337 | zero_pad = Variable(torch.from_numpy(zero_pad).long().cuda())
338 | dec_labels = labels.clone()
339 | dec_labels = torch.cat([dec_labels, zero_pad], 1)
340 |
341 | data = {}
342 | data['labels'] = labels
343 | data['enc_labels'] = enc_labels
344 | data['dec_labels'] = dec_labels
345 | data['ref_ids'] = ref_ids
346 | data['sent_ids'] = img_sent_ids
347 | data['gd_ixs'] = gd_ixs
348 | data['gd_boxes'] = gd_boxes
349 | data['cxt_ann_ids'] = cxt_ann_ids
350 | data['Feats'] = {'fc7': fc7, 'pool5': pool5, 'lfeats': lfeats, 'dif_lfeats': dif_lfeats,
351 | 'cxt_fc7': cxt_fc7, 'cxt_lfeats': cxt_lfeats}
352 | data['att_labels'] = att_labels
353 | data['select_ixs'] = select_ixs
354 | data['bounds'] = {'it_pos_now': self.iterators[split], 'it_max': max_index, 'wrapped': wrapped}
355 | return data
356 |
357 | def get_attribute_weights(self, scale = 10):
358 | # weights = \lambda * 1/sqrt(cnt)
359 | cnts = [self.att_to_cnt[self.ix_to_att[ix]] for ix in range(self.num_atts)]
360 | cnts = np.array(cnts)
361 | weights = 1 / cnts ** 0.5
362 | weights = (weights-np.min(weights))/(np.max(weights)-np.min(weights))
363 | weights = weights * (scale - 1) + 1
364 | return torch.from_numpy(weights).float()
365 |
366 | def decode_attribute_label(self, scores):
367 | """- scores: Variable (cuda) (n, num_atts) after sigmoid range [0, 1]
368 | - labels:list of [[att, sc], [att, sc], ...
369 | """
370 | scores = scores.data.cpu().numpy()
371 | N = scores.shape[0]
372 | labels = []
373 | for i in range(N):
374 | label = []
375 | score = scores[i]
376 | for j, sc in enumerate(list(score)):
377 | label += [(self.ix_to_att[j], sc)]
378 | labels.append(label)
379 | return labels
380 |
381 | def getTestBatch(self, split, opt):
382 |
383 | wrapped = False
384 | split_ix = self.split_ix[split]
385 | max_index = len(split_ix) - 1
386 | ri = self.iterators[split]
387 | ri_next = ri + 1
388 | if ri_next > max_index:
389 | ri_next = 0
390 | wrapped = True
391 | self.iterators[split] = ri_next
392 | image_id = split_ix[ri]
393 | image = self.Images[image_id]
394 | ann_ids = image['ann_ids']
395 | cxt_ann_ids, ann_fc7, ann_pool5, lfeats, dif_lfeats, cxt_fc7, cxt_lfeats = self.extract_ann_features(image_id, opt)
396 | sent_ids = []
397 | gd_ixs = []
398 | gd_boxes = []
399 | att_refs = []
400 | for ref_id in image['ref_ids']:
401 | ref = self.Refs[ref_id]
402 | for sent_id in ref['sent_ids']:
403 | sent_ids += [sent_id]
404 | gd_ixs += [ann_ids.index(ref['ann_id'])]
405 | gd_boxes += [ref['box']]
406 | att_refs += [ref_id]
407 |
408 | labels = np.vstack([self.fetch_seq(sent_id) for sent_id in sent_ids])
409 | labels = Variable(torch.from_numpy(labels).long().cuda())
410 | max_len = (labels!=0).sum(1).max().data[0]
411 | labels = labels[:, :max_len]
412 |
413 | start_words = np.ones([labels.size(0), 1], dtype=int)*(self.word_to_ix[''])
414 | start_words = Variable(torch.from_numpy(start_words).long().cuda())
415 | enc_labels = labels.clone()
416 | enc_labels = torch.cat([start_words, enc_labels], 1)
417 |
418 | zero_pad = np.zeros([labels.size(0), 1], dtype=int)
419 | zero_pad = Variable(torch.from_numpy(zero_pad).long().cuda())
420 | dec_labels = labels.clone()
421 | dec_labels = torch.cat([dec_labels, zero_pad], 1)
422 |
423 | att_labels, select_ixs = self.fetch_attribute_label(att_refs)
424 |
425 | pool5 = ann_pool5.unsqueeze(0)
426 | pool5 = pool5.detach()  # detach() is not in-place, keep the returned tensor
427 | fc7 = ann_fc7.unsqueeze(0)
428 | fc7 = fc7.detach()
429 | lfeats = lfeats.unsqueeze(0)
430 | dif_lfeats = dif_lfeats.unsqueeze(0)
431 | cxt_fc7 = cxt_fc7.unsqueeze(0)
432 | cxt_lfeats = cxt_lfeats.unsqueeze(0)
433 |
434 | data = {}
435 | data['image_id'] = image_id
436 | data['ann_ids'] = ann_ids
437 | data['cxt_ann_ids'] = cxt_ann_ids
438 | data['sent_ids'] = sent_ids
439 | data['gd_ixs'] = gd_ixs
440 | data['gd_boxes'] = gd_boxes
441 | data['Feats'] = {'fc7': fc7, 'pool5': pool5, 'lfeats': lfeats, 'dif_lfeats': dif_lfeats,
442 | 'cxt_fc7': cxt_fc7, 'cxt_lfeats': cxt_lfeats}
443 |
444 | data['labels'] = labels
445 | data['enc_labels'] = enc_labels
446 | data['dec_labels'] = dec_labels
447 | data['bounds'] = {'it_pos_now': self.iterators[split], 'it_max': max_index, 'wrapped': wrapped}
448 | data['att_labels'] = att_labels
449 | data['select_ixs'] = select_ixs
450 | return data
451 |
452 |
--------------------------------------------------------------------------------
/lib/loaders/loader.py:
--------------------------------------------------------------------------------
1 | """
2 | data_json has
3 | 0. refs : list of {ref_id, ann_id, box, image_id, split, category_id, sent_ids}
4 | 1. images : list of {image_id, ref_ids, ann_ids, file_name, width, height, h5_id}
5 | 2. anns : list of {ann_id, category_id, image_id, box, h5_id}
6 | 3. sentences : list of {sent_id, tokens, h5_id}
7 | 4: word_to_ix : word->ix
8 | 5: cat_to_ix : cat->ix
9 | 6: label_length: L
10 | Note, box in [xywh] format
11 | data_h5 has
12 | /labels is (M, max_length) uint32 array of encoded labels, zeros padded
13 | """
14 | from __future__ import absolute_import
15 | from __future__ import division
16 | from __future__ import print_function
17 |
18 | import os.path as osp
19 | import numpy as np
20 | import h5py
21 | import json
22 | import random
23 |
24 | class Loader(object):
25 |
26 | def __init__(self, data_json, data_h5=None):
27 | # load the json file which contains info about the dataset
28 | print('Loader loading data.json: ', data_json)
29 | self.info = json.load(open(data_json))
30 | self.word_to_ix = self.info['word_to_ix']
31 | self.ix_to_word = {ix: wd for wd, ix in self.word_to_ix.items()}
32 | print('vocab size is ', self.vocab_size)
33 | self.cat_to_ix = self.info['cat_to_ix']
34 | self.ix_to_cat = {ix: cat for cat, ix in self.cat_to_ix.items()}
35 | print('object category size is ', len(self.ix_to_cat))
36 | self.images = self.info['images']
37 | self.anns = self.info['anns']
38 | self.refs = self.info['refs']
39 | self.sentences = self.info['sentences']
40 | print('we have %s images.' % len(self.images))
41 | print('we have %s anns.' % len(self.anns))
42 | print('we have %s refs.' % len(self.refs))
43 | print('we have %s sentences.' % len(self.sentences))
44 | print('label_length is ', self.label_length)
45 |
46 | # construct mapping
47 | self.Refs = {ref['ref_id']: ref for ref in self.refs}
48 | self.Images = {image['image_id']: image for image in self.images}
49 | self.Anns = {ann['ann_id']: ann for ann in self.anns}
50 | self.Sentences = {sent['sent_id']: sent for sent in self.sentences}
51 | self.annToRef = {ref['ann_id']: ref for ref in self.refs}
52 | self.sentToRef = {sent_id: ref for ref in self.refs for sent_id in ref['sent_ids']}
53 |
54 | # read data_h5 if exists
55 | self.data_h5 = None
56 | if data_h5 is not None:
57 | print('Loader loading data.h5: ', data_h5)
58 | self.data_h5 = h5py.File(data_h5, 'r')
59 | assert self.data_h5['labels'].shape[0] == len(self.sentences), 'label.shape[0] not match sentences'
60 | assert self.data_h5['labels'].shape[1] == self.label_length, 'label.shape[1] not match label_length'
61 |
62 | @property
63 | def vocab_size(self):
64 | # len(self.word_to_ix) == 1999
65 | return len(self.word_to_ix)
66 |
67 | @property
68 | def label_length(self):
69 | return self.info['label_length']
70 |
71 | def sent_to_Ref(self, sent_id):
72 | # return the ref that contains this sentence
73 | return self.sentToRef[sent_id]
74 |
75 | def encode_labels(self, sent_str_list):
76 | """Input:
77 | sent_str_list: list of n sents in string format
78 | return int32 (n, label_length) zeros padded in end
79 | """
80 | num_sents = len(sent_str_list)
81 | L = np.zeros((num_sents, self.label_length), dtype=np.int32)
82 | for i, sent_str in enumerate(sent_str_list):
83 | tokens = sent_str.split()
84 | for j, w in enumerate(tokens):
85 | if j < self.label_length:
86 | L[i, j] = self.word_to_ix[w] if w in self.word_to_ix else self.word_to_ix['<UNK>']
87 | return L
88 |
89 | def decode_labels(self, labels):
90 | """
91 | labels: int32 (n, label_length) zeros padded in end
92 | return: list of sents in string format
93 | """
94 | decoded_sent_strs = []
95 | num_sents = labels.shape[0]
96 | for i in range(num_sents):
97 | label = labels[i].tolist()
98 | sent_str = ' '.join([self.ix_to_word[int(i)] for i in label if i != 0])
99 | decoded_sent_strs.append(sent_str)
100 | return decoded_sent_strs
101 |
102 |
103 | def fetch_label(self, ref_id, num_sents):
104 | """
105 | return: int32 (num_sents, label_length) and picked_sent_ids
106 | """
107 | ref = self.Refs[ref_id]
108 | sent_ids = list(ref['sent_ids']) # copy in case the raw list is changed
109 | seq = []
110 |
111 | if len(sent_ids) < num_sents:
112 | append_sent_ids = [random.choice(sent_ids) for _ in range(num_sents - len(sent_ids))]
113 | sent_ids += append_sent_ids
114 | else:
115 | sent_ids = sent_ids[:num_sents]
116 | assert len(sent_ids) == num_sents
117 | # fetch label
118 | for sent_id in sent_ids:
119 | sent_h5_id = self.Sentences[sent_id]['h5_id']
120 | seq += [self.data_h5['labels'][sent_h5_id, :]]
121 | seq = np.vstack(seq)
122 | return seq, sent_ids
123 |
124 | def fetch_seq(self, sent_id):
125 | # return int32 (label_length, )
126 | sent_h5_id = self.Sentences[sent_id]['h5_id']
127 | seq = self.data_h5['labels'][sent_h5_id, :]
128 | return seq
129 |
--------------------------------------------------------------------------------
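
A short, self-contained sketch of using the loader above (the cache paths are hypothetical; the actual data.json/data.h5 files are produced by the MAttNet-style preprocessing referenced in the README):

```python
from loaders.loader import Loader

# hypothetical preprocessed files for one dataset/splitBy combination
loader = Loader('cache/prepro/refcoco_unc/data.json', 'cache/prepro/refcoco_unc/data.h5')
print(loader.vocab_size, loader.label_length)

# encode two expressions into zero-padded label rows, then decode them back
labels = loader.encode_labels(['man in red shirt', 'left zebra'])
print(loader.decode_labels(labels))
```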
/lib/models/eval.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import os
6 | import os.path as osp
7 | import numpy as np
8 | import json
9 | import h5py
10 | import time
11 | from pprint import pprint
12 |
13 | import torch
14 | import torch.nn.functional as F
15 | from torch.autograd import Variable
16 | import pdb
17 |
18 |
19 | # IoU function
20 | def computeIoU(box1, box2):
21 | # each box is of [x1, y1, w, h]
22 | inter_x1 = max(box1[0], box2[0])
23 | inter_y1 = max(box1[1], box2[1])
24 | inter_x2 = min(box1[0]+box1[2]-1, box2[0]+box2[2]-1)
25 | inter_y2 = min(box1[1]+box1[3]-1, box2[1]+box2[3]-1)
26 |
27 | if inter_x1 < inter_x2 and inter_y1 < inter_y2:
28 | inter = (inter_x2-inter_x1+1)*(inter_y2-inter_y1+1)
29 | else:
30 | inter = 0
31 | union = box1[2]*box1[3] + box2[2]*box2[3] - inter
32 | return float(inter)/union
33 |
34 |
35 | def eval_split(loader, model, split, opt):
36 |
37 | verbose = opt.get('verbose', True)
38 | num_sents = opt.get('num_sents', -1)
39 | assert split != 'train', 'Check the evaluation split.'
40 |
41 | model.eval()
42 |
43 | loader.resetIterator(split)
44 | loss_sum = 0
45 | loss_evals = 0
46 | acc = 0
47 | predictions = []
48 | finish_flag = False
49 | model_time = 0
50 | vis_res_loss_sum = 0
51 | lang_res_loss_sum = 0
52 |
53 |
54 | while True:
55 | with torch.no_grad():
56 | data = loader.getTestBatch(split, opt)
57 | att_weights = loader.get_attribute_weights()
58 | ann_ids = data['ann_ids']
59 | sent_ids = data['sent_ids']
60 | Feats = data['Feats']
61 | labels = data['labels']
62 | enc_labels = data['enc_labels']
63 | dec_labels = data['dec_labels']
64 | att_labels, select_ixs = data['att_labels'], data['select_ixs']
65 |
66 | for i, sent_id in enumerate(sent_ids):
67 | enc_label = enc_labels[i:i + 1]
68 | max_len = (enc_label != 0).sum().data[0]
69 | enc_label = enc_label[:, :max_len]
70 | dec_label = dec_labels[i:i + 1]
71 | dec_label = dec_label[:, :max_len]
72 |
73 | label = labels[i:i + 1]
74 | max_len = (label != 0).sum().data[0]
75 | label = label[:, :max_len]
76 |
77 | att_label = att_labels[i:i + 1]
78 | if i in select_ixs:
79 | select_ix = torch.LongTensor([0]).cuda()
80 | else:
81 | select_ix = torch.LongTensor().cuda()
82 |
83 | tic = time.time()
84 | scores, loss, rel_ixs, sub_attn, loc_attn, rel_attn, weights, vis_res_loss, att_res_loss, lang_res_loss = \
85 | model(Feats['pool5'], Feats['fc7'], Feats['lfeats'], Feats['dif_lfeats'],
86 | Feats['cxt_fc7'], Feats['cxt_lfeats'], label, enc_label, dec_label, att_label, select_ix, att_weights)
87 |
88 | scores = scores.squeeze(0).data.cpu().numpy()
89 | rel_ixs = rel_ixs.squeeze(0).data.cpu().numpy().tolist()
90 |
91 | loss = loss.data[0].item()
92 |
93 | if opt['loss_combined'] == 0:
94 |                     vis_res_loss = vis_res_loss.data[0].item()
95 | lang_res_loss = lang_res_loss.data[0].item()
96 | vis_res_loss_sum += vis_res_loss
97 | lang_res_loss_sum += lang_res_loss
98 |
99 | pred_ix = np.argmax(scores)
100 | gd_ix = data['gd_ixs'][i]
101 | loss_sum += loss
102 | loss_evals += 1
103 |
104 | pred_box = loader.Anns[ann_ids[pred_ix]]['box']
105 | gd_box = data['gd_boxes'][i]
106 |
107 | if opt['use_IoU'] > 0:
108 | if computeIoU(pred_box, gd_box) >= 0.5:
109 | acc += 1
110 | else:
111 | if pred_ix == gd_ix:
112 | acc += 1
113 |
114 | rel_ix = rel_ixs[pred_ix]
115 |
116 | entry = {}
117 | entry['sent_id'] = sent_id
118 | entry['sent'] = loader.decode_labels(label.data.cpu().numpy())[0]
119 | entry['gd_ann_id'] = data['ann_ids'][gd_ix]
120 | entry['pred_ann_id'] = data['ann_ids'][pred_ix]
121 | entry['pred_score'] = scores.tolist()[pred_ix]
122 |
123 | entry['sub_attn'] = sub_attn.data.cpu().numpy().tolist()
124 | entry['loc_attn'] = loc_attn.data.cpu().numpy().tolist()
125 | entry['rel_attn'] = rel_attn.data.cpu().numpy().tolist()
126 | entry['rel_ann_id'] = data['cxt_ann_ids'][pred_ix][rel_ix]
127 |
128 | entry['weights'] = weights.data.cpu().numpy().tolist()
129 |
130 | predictions.append(entry)
131 | toc = time.time()
132 | model_time += (toc - tic)
133 |
134 | if num_sents > 0 and loss_evals >= num_sents:
135 | finish_flag = True
136 | break
137 | ix0 = data['bounds']['it_pos_now']
138 | ix1 = data['bounds']['it_max']
139 | if verbose:
140 | print('evaluating [%s] ... image[%d/%d]\'s sents, acc=%.2f%%, (%.4f), model time (per sent) is %.2fs' % \
141 | (split, ix0, ix1, acc*100.0/loss_evals, loss, model_time/len(sent_ids)))
142 | model_time = 0
143 |
144 | if finish_flag or data['bounds']['wrapped']:
145 | break
146 |
147 | return loss_sum / loss_evals, acc / loss_evals, predictions, \
148 | vis_res_loss_sum / loss_evals, lang_res_loss_sum / loss_evals
149 |
150 |
151 |
--------------------------------------------------------------------------------
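A quick sanity check of `computeIoU` above: boxes are `[x1, y1, w, h]`, and with `use_IoU > 0` a prediction counts as a hit when its IoU with the ground-truth box reaches 0.5. The boxes below are made-up values, not dataset annotations.

```python
def computeIoU(box1, box2):
    # each box is [x1, y1, w, h]; same logic as in lib/models/eval.py
    inter_x1 = max(box1[0], box2[0])
    inter_y1 = max(box1[1], box2[1])
    inter_x2 = min(box1[0] + box1[2] - 1, box2[0] + box2[2] - 1)
    inter_y2 = min(box1[1] + box1[3] - 1, box2[1] + box2[3] - 1)
    if inter_x1 < inter_x2 and inter_y1 < inter_y2:
        inter = (inter_x2 - inter_x1 + 1) * (inter_y2 - inter_y1 + 1)
    else:
        inter = 0
    union = box1[2] * box1[3] + box2[2] * box2[3] - inter
    return float(inter) / union

pred_box = [10, 10, 100, 100]   # hypothetical predicted box
gd_box = [20, 20, 100, 100]     # hypothetical ground-truth box
iou = computeIoU(pred_box, gd_box)
print('IoU = %.3f, hit = %s' % (iou, iou >= 0.5))   # IoU = 0.681, hit = True
```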
/lib/models/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import collections
6 | import torch
7 | import torch.nn as nn
8 | from torch.autograd import Variable
9 | import pdb
10 |
11 |
12 | # grad_clip=0.1
13 | def clip_gradient(optimizer, grad_clip):
14 | for group in optimizer.param_groups:
15 | for param in group['params']:
16 | # pdb.set_trace()
17 |             # clamp the gradient in place
18 | if hasattr(param.grad, 'data'):
19 | param.grad.data.clamp_(-grad_clip, grad_clip)
20 |
21 |
22 | def set_lr(optimizer, lr):
23 | for group in optimizer.param_groups:
24 | group['lr'] = lr
25 |
--------------------------------------------------------------------------------
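The two helpers above are meant to wrap `optimizer.step()` during training (see `tools/train.py`). A minimal sketch of where they fit; the helpers are re-stated so the snippet is self-contained, and the tiny linear model and random batch are placeholders, not part of ARN.

```python
import torch
import torch.nn as nn

def clip_gradient(optimizer, grad_clip):
    # clamp every parameter gradient into [-grad_clip, grad_clip]
    for group in optimizer.param_groups:
        for param in group['params']:
            if param.grad is not None:
                param.grad.data.clamp_(-grad_clip, grad_clip)

def set_lr(optimizer, lr):
    for group in optimizer.param_groups:
        group['lr'] = lr

model = nn.Linear(8, 1)                     # placeholder model
optimizer = torch.optim.Adam(model.parameters(), lr=4e-4)

x, y = torch.randn(5, 8), torch.randn(5, 1)
loss = nn.functional.mse_loss(model(x), y)

optimizer.zero_grad()
loss.backward()
clip_gradient(optimizer, grad_clip=0.1)     # same default as --grad_clip
optimizer.step()
set_lr(optimizer, 2e-4)                     # e.g. after a learning-rate decay step
```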
/lib/mrcn/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GingL/ARN/b1001040d8ac41292b2ccf6a6ab1f41c1d77d0fa/lib/mrcn/__init__.py
--------------------------------------------------------------------------------
/lib/mrcn/inference.py:
--------------------------------------------------------------------------------
1 | """
2 | args: imdb_name, net, iters, tag
3 | """
4 | from __future__ import absolute_import
5 | from __future__ import division
6 | from __future__ import print_function
7 |
8 | import os
9 | import os.path as osp
10 | import sys
11 | import json
12 | import time
13 | import numpy as np
14 | import pprint
15 | from scipy.misc import imread, imresize
16 | import cv2
17 |
18 | import torch
19 | from torch.autograd import Variable
20 |
21 | # mrcn imports
22 | import _init_paths
23 | from datasets.factory import get_imdb
24 | from model.config import cfg, cfg_from_file, cfg_from_list
25 | from model.bbox_transform import clip_boxes, bbox_transform_inv
26 | from nets.vgg16 import vgg16
27 | from nets.resnet_v1 import resnetv1
28 | from utils.blob import im_list_to_blob
29 | from utils.mask_utils import recover_masks
30 | from pycocotools import mask as COCOmask
31 |
32 |
33 | import pdb
34 | # mrcn dir
35 | this_dir = osp.dirname(__file__)
36 | mrcn_dir = osp.join(this_dir, '..', '..', 'pyutils', 'mask-faster-rcnn')
37 |
38 |
39 | def get_imdb_name(imdb_name):
40 | if imdb_name in ['refcoco', 'refcocog']:
41 | return {'TRAIN_IMDB': '%s_train+%s_val' % (imdb_name, imdb_name),
42 | 'TEST_IMDB': '%s_test' % imdb_name}
43 | elif imdb_name == 'coco_minus_refer':
44 | return {'TRAIN_IMDB': "coco_2014_train_minus_refer_valtest+coco_2014_valminusminival",
45 | 'TEST_IMDB': "coco_2014_minival"}
46 |
47 |
48 | class Inference:
49 | def __init__(self, args):
50 |
51 | self.imdb_name = args.imdb_name
52 | self.net_name = args.net_name
53 | self.tag = args.tag
54 | self.iters = args.iters
55 |
56 | # Config
57 | cfg_file = osp.join(mrcn_dir, 'experiments/cfgs/%s.yml' % self.net_name)
58 | cfg_list = ['ANCHOR_SCALES', [4, 8, 16, 32], 'ANCHOR_RATIOS', [0.5, 1, 2]]
59 | if cfg_file is not None: cfg_from_file(cfg_file)
60 | if cfg_list is not None: cfg_from_list(cfg_list)
61 | print('Using config:')
62 | pprint.pprint(cfg)
63 |
64 | # load imdb
65 | self.imdb = get_imdb(get_imdb_name(self.imdb_name)['TEST_IMDB'])
66 |
67 | # Load network
68 | self.net = self.load_net()
69 |
70 | def load_net(self):
71 | # Load network
72 | if self.net_name == 'vgg16':
73 | net = vgg16(batch_size=1)
74 | elif self.net_name == 'res101':
75 | net = resnetv1(batch_size=1, num_layers=101)
76 | else:
77 | raise NotImplementedError
78 |
79 |         # note: create_architecture has not been located yet
80 | net.create_architecture(self.imdb.num_classes, tag='default',
81 | anchor_scales=cfg.ANCHOR_SCALES,
82 | anchor_ratios=cfg.ANCHOR_RATIOS)
83 | net.eval()
84 | net.cuda()
85 |
86 | # Load model
87 | model = osp.join(mrcn_dir, 'output/%s/%s/%s/%s_mask_rcnn_iter_%s.pth' % \
88 | (self.net_name, get_imdb_name(self.imdb_name)['TRAIN_IMDB'], self.tag, self.net_name,
89 | self.iters))
90 | assert osp.isfile(model)
91 | net.load_state_dict(torch.load(model))
92 | print('pretrained-model loaded from [%s].' % model)
93 | net.eval()
94 | return net
95 |
96 | def predict(self, img_path):
97 | # return scores/probs (num_rois, 81), pred_boxes (num_rois, 81*4)
98 | # in numpy
99 | im = cv2.imread(img_path)
100 | blobs, im_scales = self._get_blobs(im)
101 | im_blob = blobs['data'] # (1, iH, iW, 3)
102 | blobs['im_info'] = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32)
103 |
104 | # test_image returns cls_score, cls_prob, bbox_pred, rois, net_conv
105 | # scores.shape=(300,81), bbox_pred.shape=(300,324)
106 | # rois.shape=(300,5), net_conv.shape=(1,1024,38,57)
107 | _, scores, bbox_pred, rois, net_conv = self.net.test_image(blobs['data'], blobs['im_info'])
108 |
109 |         # boxes.shape=(300, 4)
110 | boxes = rois[:, 1:5] / im_scales[0]
111 | scores = np.reshape(scores, [scores.shape[0], -1])
112 | bbox_pred = np.reshape(bbox_pred, [bbox_pred.shape[0], -1])
113 | if cfg.TEST.BBOX_REG:
114 | # Apply bounding-box regression deltas
115 | box_deltas = bbox_pred
116 | pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy()
117 | pred_boxes = self._clip_boxes(pred_boxes, im.shape)
118 | else:
119 | # Simply repeat the boxes, once for each class
120 | pred_boxes = np.tile(boxes, (1, scores.shape[1]))
121 |
122 | return scores, pred_boxes
123 |
124 | def boxes_to_masks(self, img_path, boxes, labels):
125 | """
126 | Arguments:
127 | - img_path: img_file
128 |         - boxes : ndarray [[xyxy]] (n, 4) in original image
129 | - labels : ndarray (n, )
130 | Return:
131 | - masks : (n, ih, iw) uint8 [0,1]
132 | - rles : list of rle instance
133 | """
134 | im = cv2.imread(img_path)
135 | blobs, im_scales = self._get_blobs(im)
136 | im_blob = blobs['data'] # (1, iH, iW, 3)
137 | blobs['im_info'] = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32)
138 |
139 | # forward
140 | self.net.test_image(blobs['data'], blobs['im_info'])
141 |
142 | # net_conv
143 | net_conv = self.net._predictions['net_conv']
144 |
145 | # run
146 | mask_prob = self.net._predict_masks_from_boxes_and_labels(net_conv, boxes * im_scales[0], labels)
147 | mask_prob = mask_prob.data.cpu().numpy()
148 | masks = recover_masks(mask_prob, boxes, im.shape[0], im.shape[1]) # (N, ih, iw) uint8 [0-255]
149 | masks = (masks > 122.).astype(np.uint8) # (N, ih, iw) uint8 [0,1]
150 |
151 | # encode to rles
152 | rles = []
153 | for m in masks:
154 | rle = COCOmask.encode(np.asfortranarray(m))
155 | rles += [rle]
156 |
157 | return masks, rles
158 |
159 | def extract_head(self, img_path):
160 | # extract head (1, 1024, im_height*scale/16.0, im_width*scale/16.0) in Variable cuda float
161 | # and im_info [[ih, iw, scale]] in float32 ndarray
162 | im = cv2.imread(img_path)
163 | blobs, im_scales = self._get_blobs(im)
164 |         # equivalent to net._layers["head"](Variable(torch.from_numpy(image.transpose([0,3,1,2])).cuda(), volatile=True))
165 | # head_feat.shape = (1, 1024, im_height*scale/16.0, im_width*scale/16.0)
166 | head_feat = self.net.extract_head(blobs['data'])
167 | im_info = np.array([[blobs['data'].shape[1], blobs['data'].shape[2], im_scales[0]]])
168 | return head_feat, im_info.astype(np.float32)
169 |
170 | def head_to_prediction(self, net_conv, im_info):
171 | """
172 | Arguments:
173 | net_conv (Variable): (1, 1024, H, W)
174 | im_info (float) : [[ih, iw, scale]]
175 | Returns:
176 | scores (ndarray): (num_rois, 81)
177 | pred_boxes (ndarray): (num_rois, 81*4) in original image size
178 | """
179 | self.net.eval()
180 | self.net._mode = 'TEST'
181 |
182 | # predict rois, cls_prob and bbox_pred
183 | self.net._im_info = im_info
184 | self.net._anchor_component(net_conv.size(2), net_conv.size(3))
185 | rois = self.net._region_proposal(net_conv)
186 | if cfg.POOLING_MODE == 'crop':
187 | pool5 = self.net._crop_pool_layer(net_conv, rois)
188 | else:
189 | pool5 = self.net._roi_pool_layer(net_conv, rois)
190 | fc7 = self.net._head_to_tail(pool5)
191 | cls_prob, bbox_pred = self.net._region_classification(fc7)
192 |
193 | # add mean and std to bbox_pred if any
194 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
195 | stds = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_STDS).repeat(self.imdb.num_classes).unsqueeze(
196 | 0).expand_as(bbox_pred)
197 | means = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_MEANS).repeat(self.imdb.num_classes).unsqueeze(
198 | 0).expand_as(bbox_pred)
199 | bbox_pred = bbox_pred.mul(Variable(stds)).add(Variable(means))
200 |
201 | # convert to numpy
202 | scores = cls_prob.data.cpu().numpy()
203 | rois = rois.data.cpu().numpy()
204 | bbox_pred = bbox_pred.data.cpu().numpy()
205 |
206 | # regress boxes
207 | boxes = rois[:, 1:5] / im_info[0][2]
208 | if cfg.TEST.BBOX_REG:
209 | # Apply bounding-box regression deltas
210 | box_deltas = bbox_pred
211 | pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy()
212 | pred_boxes = self._clip_boxes(pred_boxes, im_info[0][:2])
213 | else:
214 | # Simply repeat the boxes, once for each class
215 | pred_boxes = np.tile(boxes, (1, scores.shape[1]))
216 |
217 | return scores, pred_boxes
218 |
219 | def box_to_fc7(self, net_conv, im_info, ori_boxes):
220 | """
221 | Arguments:
222 | net_conv (Variable) : (1, 1024, H, W)
223 | im_info (float32) : [[ih, iw, scale]]
224 | ori_boxes (float32) : (n, 4) [x1y1x2y2]
225 | Returns:
226 | fc7 (float) : (n, 2048)
227 | """
228 | self.net.eval()
229 | self.net._mode = 'TEST'
230 |
231 | # make rois
232 | batch_inds = Variable(net_conv.data.new(ori_boxes.shape[0], 1).zero_())
233 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32)
234 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda())
235 | rois = torch.cat([batch_inds, scaled_boxes], 1)
236 |
237 | # pool fc7
238 | if cfg.POOLING_MODE == 'crop':
239 | pool5 = self.net._crop_pool_layer(net_conv, rois)
240 | else:
241 | pool5 = self.net._roi_pool_layer(net_conv, rois)
242 |
243 | fc7 = self.net._head_to_tail(pool5)
244 | fc7 = fc7.mean(3).mean(2)
245 | return fc7
246 |
247 | def box_to_spatial_fc7(self, net_conv, im_info, ori_boxes):
248 | """
249 | Arguments:
250 | net_conv (Variable) : (1, 1024, H, W)
251 | im_info (float32) : [[ih, iw, scale]]
252 | ori_boxes (float32) : (n, 4) [x1y1x2y2]
253 | Returns:
254 | pool5 (float) : (n, 1024, 7, 7)
255 | spatial_fc7 (float) : (n, 2048, 7, 7)
256 | """
257 | self.net.eval()
258 | self.net._mode = 'TEST'
259 |
260 | # make rois
261 | batch_inds = Variable(net_conv.data.new(ori_boxes.shape[0], 1).zero_())
262 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32)
263 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda())
264 | rois = torch.cat([batch_inds, scaled_boxes], 1)
265 |
266 | # pool fc7
267 | if cfg.POOLING_MODE == 'crop':
268 | pool5 = self.net._crop_pool_layer(net_conv, rois)
269 | else:
270 | pool5 = self.net._roi_pool_layer(net_conv, rois) # (n, 1024, 7, 7)
271 |
272 | spatial_fc7 = self.net.resnet.layer4(pool5) # (n, 2048, 7, 7)
273 | return pool5, spatial_fc7
274 |
275 | def spatial_fc7_to_prediction(self, spatial_fc7, im_info, ori_boxes):
276 | """Only used for testing. Testing the above box_to_fc7 [passed]"""
277 | cls_prob, bbox_pred = self.net._region_classification(spatial_fc7)
278 |
279 | # make rois
280 | batch_inds = Variable(spatial_fc7.data.new(ori_boxes.shape[0], 1).zero_())
281 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32)
282 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda())
283 | rois = torch.cat([batch_inds, scaled_boxes], 1)
284 |
285 | # add mean and std to bbox_pred if any
286 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
287 | stds = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_STDS).repeat(self.imdb.num_classes).unsqueeze(
288 | 0).expand_as(bbox_pred)
289 | means = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_MEANS).repeat(self.imdb.num_classes).unsqueeze(
290 | 0).expand_as(bbox_pred)
291 | bbox_pred = bbox_pred.mul(Variable(stds)).add(Variable(means))
292 |
293 | # convert to numpy
294 | scores = cls_prob.data.cpu().numpy()
295 | rois = rois.data.cpu().numpy()
296 | bbox_pred = bbox_pred.data.cpu().numpy()
297 |
298 | # regress boxes
299 | boxes = rois[:, 1:5] / im_info[0][2]
300 | if cfg.TEST.BBOX_REG:
301 | # Apply bounding-box regression deltas
302 | box_deltas = bbox_pred
303 | pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy()
304 | pred_boxes = self._clip_boxes(pred_boxes, im_info[0][:2])
305 | else:
306 | # Simply repeat the boxes, once for each class
307 | pred_boxes = np.tile(boxes, (1, scores.shape[1]))
308 |
309 | return scores, pred_boxes
310 |
311 | def _get_image_blob(self, im):
312 | """Converts an image into a network input.
313 | Arguments:
314 | im (ndarray): a color image in BGR order
315 | Returns:
316 | blob (ndarray): a data blob holding an image pyramid
317 | im_scale_factors (list): list of image scales (relative to im) used
318 | in the image pyramid
319 | """
320 | # pdb.set_trace()
321 | im_orig = im.astype(np.float32, copy=True)
322 | im_orig -= cfg.PIXEL_MEANS
323 |
324 | # if im_shape=(320, 480, 3) then im_shape[0:2]=(320,480)
325 | # then im_size_min =320, im_size_max = 480
326 | im_shape = im_orig.shape
327 | im_size_min = np.min(im_shape[0:2])
328 | im_size_max = np.max(im_shape[0:2])
329 |
330 | processed_ims = []
331 | im_scale_factors = []
332 |
333 | for target_size in cfg.TEST.SCALES:
334 | im_scale = float(target_size) / float(im_size_min)
335 | # Prevent the biggest axis from being more than MAX_SIZE
336 | if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
337 | im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
338 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
339 | interpolation=cv2.INTER_LINEAR)
340 | im_scale_factors.append(im_scale)
341 | processed_ims.append(im)
342 |
343 | # Create a blob to hold the input images
344 | blob = im_list_to_blob(processed_ims)
345 |
346 | return blob, np.array(im_scale_factors)
347 |
348 | def _get_blobs(self, im):
349 | """Convert an image and RoIs within that image into network inputs."""
350 | blobs = {}
351 | blobs['data'], im_scale_factors = self._get_image_blob(im)
352 |
353 | return blobs, im_scale_factors
354 |
355 | def _clip_boxes(self, boxes, im_shape):
356 | """Clip boxes to image boundaries."""
357 | # x1 >= 0
358 | boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0)
359 | # y1 >= 0
360 | boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0)
361 | # x2 < im_shape[1]
362 | boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1)
363 | # y2 < im_shape[0]
364 | boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1)
365 | return boxes
366 |
--------------------------------------------------------------------------------
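The only non-obvious arithmetic in `_get_image_blob` above is the scale selection: each test scale targets the shorter image side, then is clamped so the longer side never exceeds `cfg.TEST.MAX_SIZE`. A standalone sketch of that computation; the 600 / 1000 values mirror common Faster R-CNN defaults and are assumptions here, the actual values come from the loaded config.

```python
import numpy as np

def compute_im_scale(im_h, im_w, target_size=600, max_size=1000):
    # scale the shorter side to target_size, but cap the longer side at max_size
    im_size_min = min(im_h, im_w)
    im_size_max = max(im_h, im_w)
    im_scale = float(target_size) / float(im_size_min)
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    return im_scale

print(compute_im_scale(320, 480))    # 1.875   -> resized to 600 x 900
print(compute_im_scale(320, 1280))   # 0.78125 -> longer side capped at 1000
```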
/lib/mrcn/inference_no_imdb.py:
--------------------------------------------------------------------------------
1 | """
2 | args: net, iters, tag
3 | """
4 | from __future__ import absolute_import
5 | from __future__ import division
6 | from __future__ import print_function
7 |
8 | import os
9 | import os.path as osp
10 | import sys
11 | import json
12 | import time
13 | import numpy as np
14 | import pprint
15 | from scipy.misc import imread, imresize
16 | import cv2
17 |
18 | import torch
19 | from torch.autograd import Variable
20 |
21 | # mrcn imports
22 | import _init_paths
23 | from datasets.factory import get_imdb
24 | from model.config import cfg, cfg_from_file, cfg_from_list
25 | from model.bbox_transform import clip_boxes, bbox_transform_inv
26 | from nets.vgg16 import vgg16
27 | from nets.resnet_v1 import resnetv1
28 | from utils.blob import im_list_to_blob
29 | from utils.mask_utils import recover_masks
30 | from pycocotools import mask as COCOmask
31 |
32 | # mrcn dir
33 | this_dir = osp.dirname(__file__)
34 | mrcn_dir = osp.join(this_dir, '..', '..', 'pyutils', 'mask-faster-rcnn')
35 |
36 |
37 | def get_imdb_name(imdb_name):
38 | if imdb_name in ['refcoco', 'refcocog']:
39 | return {'TRAIN_IMDB': '%s_train+%s_val' % (imdb_name, imdb_name),
40 | 'TEST_IMDB': '%s_test' % imdb_name}
41 | elif imdb_name == 'coco_minus_refer':
42 | return {'TRAIN_IMDB': "coco_2014_train_minus_refer_valtest+coco_2014_valminusminival",
43 | 'TEST_IMDB': "coco_2014_minival"}
44 |
45 |
46 | class Inference:
47 | def __init__(self, args):
48 |
49 | self.imdb_name = args.imdb_name
50 | self.net_name = args.net_name
51 | self.tag = args.tag
52 | self.iters = args.iters
53 |
54 | # Config
55 | cfg_file = osp.join(mrcn_dir, 'experiments/cfgs/%s.yml' % self.net_name)
56 | cfg_list = ['ANCHOR_SCALES', [4, 8, 16, 32], 'ANCHOR_RATIOS', [0.5, 1, 2]]
57 | if cfg_file is not None: cfg_from_file(cfg_file)
58 | if cfg_list is not None: cfg_from_list(cfg_list)
59 | print('Using config:')
60 | pprint.pprint(cfg)
61 |
62 | # Load network
63 | self.num_classes = 81 # hard code this
64 | self.net = self.load_net()
65 |
66 | def load_net(self):
67 | # Load network
68 | if self.net_name == 'vgg16':
69 | net = vgg16(batch_size=1)
70 | elif self.net_name == 'res101':
71 | net = resnetv1(batch_size=1, num_layers=101)
72 | else:
73 | raise NotImplementedError
74 |
75 | net.create_architecture(self.num_classes, tag='default',
76 | anchor_scales=cfg.ANCHOR_SCALES,
77 | anchor_ratios=cfg.ANCHOR_RATIOS)
78 | net.eval()
79 | net.cuda()
80 |
81 | # Load model
82 | model = osp.join(mrcn_dir, 'output/%s/%s/%s/%s_mask_rcnn_iter_%s.pth' % \
83 | (self.net_name, get_imdb_name(self.imdb_name)['TRAIN_IMDB'], self.tag, self.net_name,
84 | self.iters))
85 | assert osp.isfile(model), model
86 | net.load_state_dict(torch.load(model))
87 | print('pretrained-model loaded from [%s].' % model)
88 |
89 | return net
90 |
91 | def predict(self, img_path):
92 | # return scores/probs (num_rois, 81), pred_boxes (num_rois, 81*4)
93 | # in numpy
94 | im = cv2.imread(img_path)
95 | blobs, im_scales = self._get_blobs(im)
96 | im_blob = blobs['data'] # (1, iH, iW, 3)
97 | blobs['im_info'] = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32)
98 |
99 | # test_image returns cls_score, cls_prob, bbox_pred, rois, net_conv
100 | _, scores, bbox_pred, rois, _ = self.net.test_image(blobs['data'], blobs['im_info'])
101 |
102 | boxes = rois[:, 1:5] / im_scales[0]
103 | scores = np.reshape(scores, [scores.shape[0], -1])
104 | bbox_pred = np.reshape(bbox_pred, [bbox_pred.shape[0], -1])
105 | if cfg.TEST.BBOX_REG:
106 | # Apply bounding-box regression deltas
107 | box_deltas = bbox_pred
108 | pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy()
109 | pred_boxes = self._clip_boxes(pred_boxes, im.shape)
110 | else:
111 | # Simply repeat the boxes, once for each class
112 | pred_boxes = np.tile(boxes, (1, scores.shape[1]))
113 |
114 | return scores, pred_boxes
115 |
116 | def boxes_to_masks(self, img_path, boxes, labels):
117 | """
118 | Arguments:
119 | - img_path: img_file
120 |         - boxes : ndarray [[xyxy]] (n, 4) in original image
121 | - labels : ndarray (n, )
122 | Return:
123 | - masks : (n, ih, iw) uint8 [0,1]
124 | - rles : list of rle instance
125 | """
126 | im = cv2.imread(img_path)
127 | blobs, im_scales = self._get_blobs(im)
128 | im_blob = blobs['data'] # (1, iH, iW, 3)
129 | blobs['im_info'] = np.array([[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], dtype=np.float32)
130 |
131 | # forward
132 | self.net.test_image(blobs['data'], blobs['im_info'])
133 |
134 | # net_conv
135 | net_conv = self.net._predictions['net_conv']
136 |
137 | # run
138 | mask_prob = self.net._predict_masks_from_boxes_and_labels(net_conv, boxes * im_scales[0], labels)
139 | mask_prob = mask_prob.data.cpu().numpy()
140 | masks = recover_masks(mask_prob, boxes, im.shape[0], im.shape[1]) # (N, ih, iw) uint8 [0-255]
141 | masks = (masks > 122.).astype(np.uint8) # (N, ih, iw) uint8 [0,1]
142 |
143 | # encode to rles
144 | rles = []
145 | for m in masks:
146 | rle = COCOmask.encode(np.asfortranarray(m))
147 | rles += [rle]
148 |
149 | return masks, rles
150 |
151 | def extract_head(self, img_path):
152 | # extract head (1, 1024, im_height*scale/16.0, im_width*scale/16.0) in Variable cuda float
153 | # and im_info [[ih, iw, scale]] in float32 ndarray
154 | im = cv2.imread(img_path)
155 | blobs, im_scales = self._get_blobs(im)
156 | head_feat = self.net.extract_head(blobs['data'])
157 | im_info = np.array([[blobs['data'].shape[1], blobs['data'].shape[2], im_scales[0]]])
158 | return head_feat, im_info.astype(np.float32)
159 |
160 | def head_to_prediction(self, net_conv, im_info):
161 | """
162 | Arguments:
163 | net_conv (Variable): (1, 1024, H, W)
164 | im_info (float) : [[ih, iw, scale]]
165 | Returns:
166 | scores (ndarray): (num_rois, 81)
167 | pred_boxes (ndarray): (num_rois, 81*4) in original image size
168 | """
169 | self.net.eval()
170 | self.net._mode = 'TEST'
171 |
172 | # predict rois, cls_prob and bbox_pred
173 | self.net._im_info = im_info
174 | self.net._anchor_component(net_conv.size(2), net_conv.size(3))
175 | rois = self.net._region_proposal(net_conv)
176 | if cfg.POOLING_MODE == 'crop':
177 | pool5 = self.net._crop_pool_layer(net_conv, rois)
178 | else:
179 | pool5 = self.net._roi_pool_layer(net_conv, rois)
180 | fc7 = self.net._head_to_tail(pool5)
181 | cls_prob, bbox_pred = self.net._region_classification(fc7)
182 |
183 | # add mean and std to bbox_pred if any
184 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
185 | stds = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_STDS).repeat(self.num_classes).unsqueeze(0).expand_as(
186 | bbox_pred)
187 | means = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_MEANS).repeat(self.num_classes).unsqueeze(0).expand_as(
188 | bbox_pred)
189 | bbox_pred = bbox_pred.mul(Variable(stds)).add(Variable(means))
190 |
191 | # convert to numpy
192 | scores = cls_prob.data.cpu().numpy()
193 | rois = rois.data.cpu().numpy()
194 | bbox_pred = bbox_pred.data.cpu().numpy()
195 |
196 | # regress boxes
197 | boxes = rois[:, 1:5] / im_info[0][2]
198 | if cfg.TEST.BBOX_REG:
199 | # Apply bounding-box regression deltas
200 | box_deltas = bbox_pred
201 | pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy()
202 | pred_boxes = self._clip_boxes(pred_boxes, im_info[0][:2])
203 | else:
204 | # Simply repeat the boxes, once for each class
205 | pred_boxes = np.tile(boxes, (1, scores.shape[1]))
206 |
207 | return scores, pred_boxes
208 |
209 | def box_to_spatial_fc7(self, net_conv, im_info, ori_boxes):
210 | """
211 | Arguments:
212 | net_conv (Variable) : (1, 1024, H, W)
213 | im_info (float32) : [[ih, iw, scale]]
214 | ori_boxes (float32) : (n, 4) [x1y1x2y2]
215 | Returns:
216 | pool5 (float) : (n, 1024, 7, 7)
217 | spatial_fc7 (float) : (n, 2048, 7, 7)
218 | """
219 | self.net.eval()
220 | self.net._mode = 'TEST'
221 |
222 | # make rois
223 | batch_inds = Variable(net_conv.data.new(ori_boxes.shape[0], 1).zero_())
224 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32)
225 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda())
226 | rois = torch.cat([batch_inds, scaled_boxes], 1)
227 |
228 | # pool fc7
229 | if cfg.POOLING_MODE == 'crop':
230 | pool5 = self.net._crop_pool_layer(net_conv, rois)
231 | else:
232 | pool5 = self.net._roi_pool_layer(net_conv, rois) # (n, 1024, 7, 7)
233 |
234 |         spatial_fc7 = self.net.resnet.layer4(pool5) # (n, 2048, 7, 7), equivalent to _head_to_tail
235 | return pool5, spatial_fc7
236 |
237 | def box_to_pool5_fc7(self, net_conv, im_info, ori_boxes):
238 | """
239 | Arguments:
240 | net_conv (Variable) : (1, 1024, H, W)
241 | im_info (float32) : [[ih, iw, scale]]
242 | ori_boxes (float32) : (n, 4) [x1y1x2y2]
243 | Returns:
244 | pool5 (float): (n, 1024)
245 | fc7 (float) : (n, 2048)
246 | """
247 | self.net.eval()
248 | self.net._mode = 'TEST'
249 |
250 | # make rois
251 | batch_inds = Variable(net_conv.data.new(ori_boxes.shape[0], 1).zero_())
252 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32)
253 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda())
254 | rois = torch.cat([batch_inds, scaled_boxes], 1)
255 |
256 | # pool fc7
257 | if cfg.POOLING_MODE == 'crop':
258 | pool5 = self.net._crop_pool_layer(net_conv, rois)
259 | else:
260 | pool5 = self.net._roi_pool_layer(net_conv, rois) # (n,1024,7,7)
261 |
262 | fc7 = self.net._head_to_tail(pool5) # (n, 2048, 7, 7)
263 | pool5 = pool5.mean(3).mean(2) # (n, 1024)
264 | fc7 = fc7.mean(3).mean(2) # (n, 2048)
265 | return pool5, fc7
266 |
267 | def box_to_fc7(self, net_conv, im_info, ori_boxes):
268 | """
269 | Arguments:
270 | net_conv (Variable) : (1, 1024, H, W)
271 | im_info (float32) : [[ih, iw, scale]]
272 | ori_boxes (float32) : (n, 4) [x1y1x2y2]
273 | Returns:
274 | fc7 (float) : (n, 2048)
275 | """
276 | self.net.eval()
277 | self.net._mode = 'TEST'
278 |
279 | # make rois
280 | batch_inds = Variable(net_conv.data.new(ori_boxes.shape[0], 1).zero_())
281 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32)
282 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda())
283 | rois = torch.cat([batch_inds, scaled_boxes], 1)
284 |
285 | # pool fc7
286 | if cfg.POOLING_MODE == 'crop':
287 | pool5 = self.net._crop_pool_layer(net_conv, rois)
288 | else:
289 | pool5 = self.net._roi_pool_layer(net_conv, rois)
290 |
291 | fc7 = self.net._head_to_tail(pool5) # (n, 2048, 7, 7)
292 | fc7 = fc7.mean(3).mean(2) # (n, 2048)
293 | return fc7
294 |
295 | def spatial_fc7_to_prediction(self, spatial_fc7, im_info, ori_boxes):
296 | """Only used for testing. Testing the above box_to_fc7 [passed]"""
297 | cls_prob, bbox_pred = self.net._region_classification(spatial_fc7)
298 |
299 | # make rois
300 | batch_inds = Variable(spatial_fc7.data.new(ori_boxes.shape[0], 1).zero_())
301 | scaled_boxes = (ori_boxes * im_info[0][2]).astype(np.float32)
302 | scaled_boxes = Variable(torch.from_numpy(scaled_boxes).cuda())
303 | rois = torch.cat([batch_inds, scaled_boxes], 1)
304 |
305 | # add mean and std to bbox_pred if any
306 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
307 | stds = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_STDS).repeat(self.num_classes).unsqueeze(0).expand_as(
308 | bbox_pred)
309 | means = bbox_pred.data.new(cfg.TRAIN.BBOX_NORMALIZE_MEANS).repeat(self.num_classes).unsqueeze(0).expand_as(
310 | bbox_pred)
311 | bbox_pred = bbox_pred.mul(Variable(stds)).add(Variable(means))
312 |
313 | # convert to numpy
314 | scores = cls_prob.data.cpu().numpy()
315 | rois = rois.data.cpu().numpy()
316 | bbox_pred = bbox_pred.data.cpu().numpy()
317 |
318 | # regress boxes
319 | boxes = rois[:, 1:5] / im_info[0][2]
320 | if cfg.TEST.BBOX_REG:
321 | # Apply bounding-box regression deltas
322 | box_deltas = bbox_pred
323 | pred_boxes = bbox_transform_inv(torch.from_numpy(boxes), torch.from_numpy(box_deltas)).numpy()
324 | pred_boxes = self._clip_boxes(pred_boxes, im_info[0][:2])
325 | else:
326 | # Simply repeat the boxes, once for each class
327 | pred_boxes = np.tile(boxes, (1, scores.shape[1]))
328 |
329 | return scores, pred_boxes
330 |
331 | def _get_image_blob(self, im):
332 | """Converts an image into a network input.
333 | Arguments:
334 | im (ndarray): a color image in BGR order
335 | Returns:
336 | blob (ndarray): a data blob holding an image pyramid
337 | im_scale_factors (list): list of image scales (relative to im) used
338 | in the image pyramid
339 | """
340 | im_orig = im.astype(np.float32, copy=True)
341 | im_orig -= cfg.PIXEL_MEANS
342 |
343 | im_shape = im_orig.shape
344 | im_size_min = np.min(im_shape[0:2])
345 | im_size_max = np.max(im_shape[0:2])
346 |
347 | processed_ims = []
348 | im_scale_factors = []
349 |
350 | for target_size in cfg.TEST.SCALES:
351 | im_scale = float(target_size) / float(im_size_min)
352 | # Prevent the biggest axis from being more than MAX_SIZE
353 | if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
354 | im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
355 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
356 | interpolation=cv2.INTER_LINEAR)
357 | im_scale_factors.append(im_scale)
358 | processed_ims.append(im)
359 |
360 | # Create a blob to hold the input images
361 | blob = im_list_to_blob(processed_ims)
362 |
363 | return blob, np.array(im_scale_factors)
364 |
365 | def _get_blobs(self, im):
366 | """Convert an image and RoIs within that image into network inputs."""
367 | blobs = {}
368 | blobs['data'], im_scale_factors = self._get_image_blob(im)
369 |
370 | return blobs, im_scale_factors
371 |
372 | def _clip_boxes(self, boxes, im_shape):
373 | """Clip boxes to image boundaries."""
374 | # x1 >= 0
375 | boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0)
376 | # y1 >= 0
377 | boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0)
378 | # x2 < im_shape[1]
379 | boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1)
380 | # y2 < im_shape[0]
381 | boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1)
382 | return boxes
383 |
--------------------------------------------------------------------------------
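`_clip_boxes` above operates on the flattened `(n, 81*4)` prediction array, which is why it indexes with a stride of 4: every class's box is clipped to the image boundary. A small self-contained check of that behaviour with made-up boxes:

```python
import numpy as np

def clip_boxes(boxes, im_shape):
    # boxes: (n, 4*k) [x1, y1, x2, y2, ...]; clip each class's box to the image
    boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0)                 # x1 >= 0
    boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0)                 # y1 >= 0
    boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1)   # x2 <= width - 1
    boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1)   # y2 <= height - 1
    return boxes

boxes = np.array([[-5.0, 10.0, 700.0, 300.0,     # class 0 box runs off both edges
                   20.0, 30.0, 100.0, 200.0]])   # class 1 box already inside
print(clip_boxes(boxes, im_shape=(480, 640)))
# [[  0.  10. 639. 300.  20.  30. 100. 200.]]
```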
/tools/_init_paths.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import sys
3 |
4 | # mrcn path
5 | this_dir = osp.dirname(__file__)
6 | mrcn_dir = osp.join(this_dir, '..', 'pyutils', 'mask-faster-rcnn')
7 | sys.path.insert(0, osp.join(mrcn_dir, 'lib'))
8 | sys.path.insert(0, osp.join(mrcn_dir, 'data', 'refer'))
9 | sys.path.insert(0, osp.join(mrcn_dir, 'data', 'coco', 'PythonAPI'))
10 |
11 | # refer path
12 | refer_dir = osp.join(this_dir, '..', 'pyutils', 'refer')
13 | sys.path.insert(0, refer_dir)
14 |
15 | # model path
16 | sys.path.insert(0, osp.join(this_dir, '..', 'lib'))
--------------------------------------------------------------------------------
/tools/eval.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import os
6 | import os.path as osp
7 | import sys
8 | import json
9 | import time
10 | import argparse
11 | import matplotlib.pyplot as plt
12 | from matplotlib.patches import Polygon, Rectangle
13 | import skimage.io as io
14 |
15 | # model
16 | import _init_paths
17 | from layers.match import AdaptiveReconstruct
18 | from loaders.dataloader import DataLoader
19 | import models.eval as eval
20 |
21 | # torch
22 | import torch
23 | import torch.nn as nn
24 |
25 |
26 |
27 | def load_model(checkpoint_path, opt):
28 | tic = time.time()
29 | model = AdaptiveReconstruct(opt)
30 | checkpoint = torch.load(checkpoint_path)
31 | model.load_state_dict(checkpoint['model'].state_dict())
32 | model.eval()
33 | model.cuda()
34 | print('model loaded in %.2f seconds' % (time.time() - tic))
35 | return model
36 |
37 |
38 | def evaluate(params):
39 | # set up loader
40 | data_json = osp.join('cache/prepro', params['dataset_splitBy'], 'data.json')
41 | data_h5 = osp.join('cache/prepro', params['dataset_splitBy'], 'data.h5')
42 | loader = DataLoader(data_h5=data_h5, data_json=data_json)
43 |
44 |     # load model info
45 | model_prefix = osp.join('output', params['dataset_splitBy'], params['id'], 'mrcn_cmr_with_st')
46 | infos = json.load(open(model_prefix + '.json'))
47 | model_opt = infos['opt']
48 | model_path = model_prefix + '.pth'
49 | model = load_model(model_path, model_opt)
50 |
51 | # loader's feats
52 | feats_dir = '%s_%s_%s' % (model_opt['net_name'], model_opt['imdb_name'], model_opt['tag'])
53 | args.imdb_name = model_opt['imdb_name']
54 | args.net_name = model_opt['net_name']
55 | args.tag = model_opt['tag']
56 | args.iters = model_opt['iters']
57 | loader.prepare_mrcn(head_feats_dir=osp.join('cache/feats/', model_opt['dataset_splitBy'], 'mrcn', feats_dir),
58 | args=args)
59 | ann_feats = osp.join('cache/feats', model_opt['dataset_splitBy'], 'mrcn',
60 | '%s_%s_%s_ann_feats.h5' % (model_opt['net_name'], model_opt['imdb_name'], model_opt['tag']))
61 | # load ann features
62 | loader.loadFeats({'ann': ann_feats})
63 |
64 | # check model_info and params
65 | assert model_opt['dataset'] == params['dataset']
66 | assert model_opt['splitBy'] == params['splitBy']
67 |
68 | # evaluate on the split,
69 | split = params['split']
70 | model_opt['num_sents'] = params['num_sents']
71 | model_opt['verbose'] = params['verbose']
72 |
73 | val_loss, acc, predictions, _, _ = eval.eval_split(loader, model, split, model_opt)
74 |
75 |
76 | print('Comprehension on %s\'s %s (%s sents) is %.2f%%' % \
77 | (params['dataset_splitBy'], params['split'], len(predictions), acc * 100.))
78 |
79 | # save
80 | out_dir = osp.join('results', params['dataset_splitBy'], 'easy')
81 | if not osp.isdir(out_dir):
82 | os.makedirs(out_dir)
83 | out_file = osp.join(out_dir, params['id'] + '_' + params['split'] + '.json')
84 | with open(out_file, 'w') as of:
85 | json.dump({'predictions': predictions, 'acc': acc}, of)
86 |
87 | # write to results.txt
88 |     with open('experiments/easy_results.txt', 'a') as f:
89 |         f.write('[%s]: [%s][%s], id[%s]\'s acc is %.2f%%\n' % \
90 |                 (params['id'], params['dataset_splitBy'], params['split'], params['id'], acc * 100.0))
91 |
92 |
93 | if __name__ == '__main__':
94 | parser = argparse.ArgumentParser()
95 | parser.add_argument('--dataset', type=str, default='refcoco+',
96 | help='dataset name: refclef, refcoco, refcoco+, refcocog')
97 | parser.add_argument('--splitBy', type=str, default='unc', help='splitBy: unc, google, berkeley')
98 |     parser.add_argument('--split', type=str, default='val', help='split: testA, testB or val, etc')
99 | parser.add_argument('--id', type=str, default='exp0', help='model id name')
100 | parser.add_argument('--num_sents', type=int, default=-1,
101 | help='how many sentences to use when periodically evaluating the loss? (-1=all)')
102 | parser.add_argument('--verbose', type=int, default=1, help='if we want to print the testing progress')
103 | args = parser.parse_args()
104 | params = vars(args)
105 |
106 | # make other options
107 | params['dataset_splitBy'] = params['dataset'] + '_' + params['splitBy']
108 | evaluate(params)
109 |
110 |
111 |
--------------------------------------------------------------------------------
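The script above dumps its results to `results/<dataset_splitBy>/easy/<id>_<split>.json` as a dict with `predictions` and `acc`, where each prediction entry carries the keys filled in by `eval_split`. A minimal sketch of inspecting such a file afterwards; the concrete path below is only an example built from the default arguments.

```python
import json
import os.path as osp

out_file = osp.join('results', 'refcoco+_unc', 'easy', 'exp0_val.json')  # example path
with open(out_file) as f:
    res = json.load(f)

print('accuracy: %.2f%%' % (res['acc'] * 100.0))
for entry in res['predictions'][:3]:
    # each entry carries sent_id, the expression, gd/pred ann ids and the prediction score
    print(entry['sent_id'], entry['sent'], entry['pred_ann_id'], entry['pred_score'])
```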
/tools/opt.py:
--------------------------------------------------------------------------------
1 | from pprint import pprint
2 | import argparse
3 |
4 | def parse_opt():
5 |
6 | parser = argparse.ArgumentParser()
7 | # Data input settings
8 | parser.add_argument('--dataset', type=str, default='refcoco', help='name of dataset')
9 | parser.add_argument('--splitBy', type=str, default='unc', help='who splits this dataset')
10 | parser.add_argument('--start_from', type=str, default=None, help='continuing training from saved model')
11 | # FRCN setting
12 |     parser.add_argument('--imdb_name', default='coco_minus_refer', help='image database trained on.')
13 | parser.add_argument('--net_name', default='res101', help='net_name: res101 or vgg16')
14 | parser.add_argument('--iters', default=1250000, type=int, help='iterations we trained for faster R-CNN')
15 | parser.add_argument('--tag', default='notime', help='on default tf, don\'t change this!')
16 | parser.add_argument('--vis_feats_type', type=str, default='res101', help='visual features type: vgg16 or res101')
17 | # Visual Encoder Setting
18 | parser.add_argument('--visual_sample_ratio', type=float, default=0.3, help='ratio of same-type objects over different-type objects')
19 | parser.add_argument('--visual_fuse_mode', type=str, default='concat', help='concat or mul')
20 | parser.add_argument('--visual_init_norm', type=float, default=20, help='norm of each visual representation')
21 | parser.add_argument('--visual_use_bn', type=int, default=-1, help='>0: use bn, -1: do not use bn in visual layer')
22 | parser.add_argument('--visual_use_cxt', type=int, default=1, help='if we use contxt')
23 | parser.add_argument('--visual_cxt_type', type=str, default='frcn', help='frcn or res101')
24 | parser.add_argument('--visual_drop_out', type=float, default=0.2, help='dropout on visual encoder')
25 |     parser.add_argument('--window_scale', type=float, default=2.5, help='scale factor of the surrounding context window')
26 | # Visual Feats Setting
27 | parser.add_argument('--with_st', type=int, default=1, help='if incorporating same-type objects as contexts')
28 | parser.add_argument('--num_cxt', type=int, default=5, help='how many surrounding objects do we use')
29 | # Language Encoder Setting
30 | parser.add_argument('--word_embedding_size', type=int, default=512, help='the encoding size of each token')
31 | parser.add_argument('--word_vec_size', type=int, default=512, help='further non-linear of word embedding')
32 | parser.add_argument('--word_drop_out', type=float, default=0.5, help='word drop out after embedding')
33 | parser.add_argument('--bidirectional', type=int, default=1, help='bi-rnn')
34 | parser.add_argument('--rnn_hidden_size', type=int, default=512, help='hidden size of LSTM')
35 | parser.add_argument('--rnn_type', type=str, default='lstm', help='rnn, gru or lstm')
36 | parser.add_argument('--rnn_drop_out', type=float, default=0.2, help='dropout between stacked rnn layers')
37 | parser.add_argument('--rnn_num_layers', type=int, default=1, help='number of layers in lang_encoder')
38 | parser.add_argument('--variable_lengths', type=int, default=1, help='use variable length to encode')
39 | # Joint Embedding setting
40 | parser.add_argument('--jemb_drop_out', type=float, default=0.1, help='dropout in the joint embedding')
41 | parser.add_argument('--jemb_dim', type=int, default=512, help='joint embedding layer dimension')
42 | # Reconstruct Settings
43 |     parser.add_argument('--decode_bidirectional', type=int, default=0, help='whether to use a bidirectional LSTM in reconstruction')
44 | # Loss Setting
45 | parser.add_argument('--att_weight', type=float, default=1.0, help='weight on attribute prediction')
46 | parser.add_argument('--visual_rank_weight', type=float, default=1.0, help='weight on paired (ref, sent) over unpaired (neg_ref, sent)')
47 | parser.add_argument('--lang_rank_weight', type=float, default=1.0, help='weight on paired (ref, sent) over unpaired (ref, neg_sent)')
48 | parser.add_argument('--margin', type=float, default=0.1, help='margin for ranking loss')
49 | parser.add_argument('--lang_res_weight', type=float, default=1.0, help='weight on language reconstruction loss')
50 | parser.add_argument('--vis_res_weight', type=float, default=0.01, help='weight on visual reconstruction loss')
51 | parser.add_argument('--att_res_weight', type=float, default=1.0, help='weight on attribute reconstruction loss')
52 | parser.add_argument('--loss_combined', type=float, default=5.0, help='weight on loss_combined')
53 | parser.add_argument('--loss_divided', type=float, default=1.0, help='weight on loss_divided' )
54 | # Optimization: General
55 | parser.add_argument('--max_iters', type=int, default=30000, help='max number of iterations to run')
56 | parser.add_argument('--sample_ratio', type=float, default=0.3, help='ratio of same-type objects over different-type objects')
57 | parser.add_argument('--batch_size', type=int, default=5, help='batch size in number of images per batch')
58 | parser.add_argument('--grad_clip', type=float, default=0.1, help='clip gradients at this value')
59 | parser.add_argument('--seq_per_ref', type=int, default=3, help='number of expressions per object during training')
60 | parser.add_argument('--learning_rate_decay_start', type=int, default=8000, help='at what iter to start decaying learning rate')
61 |     parser.add_argument('--learning_rate_decay_every', type=int, default=8000, help='every how many iters thereafter to decay the learning rate by another factor of 10')
62 | parser.add_argument('--optim_epsilon', type=float, default=1e-8, help='epsilon that goes into denominator for smoothing')
63 | parser.add_argument('--learning_rate', type=float, default=4e-4, help='learning rate')
64 | parser.add_argument('--optim_alpha', type=float, default=0.8, help='alpha for adam')
65 | parser.add_argument('--optim_beta', type=float, default=0.999, help='beta used for adam')
66 | parser.add_argument('--weight_decay', type=float, default=0.0005, help='weight decay for adam')
67 | # Evaluation/Checkpointing
68 |     parser.add_argument('--num_sents', type=int, default=-1, help='how many sentences to use when periodically evaluating the validation loss? (-1 = all)')
69 | parser.add_argument('--save_checkpoint_every', type=int, default=2000, help='how often to save a model checkpoint?')
70 | parser.add_argument('--checkpoint_path', type=str, default='output', help='directory to save models')
71 | parser.add_argument('--language_eval', type=int, default=0, help='Evaluate language as well (1 = yes, 0 = no)?')
72 | parser.add_argument('--losses_log_every', type=int, default=25, help='How often do we snapshot losses, for inclusion in the progress dump? (0 = disable)')
73 | parser.add_argument('--load_best_score', type=int, default=1, help='Do we load previous best score when resuming training.')
74 | parser.add_argument('--use_IoU', type=int, default=1, help='Whether to use IoU evaluation or not')
75 | # misc
76 | parser.add_argument('--id', type=str, default='mrcn_cmr_with_st', help='an id identifying this run/job.')
77 | parser.add_argument('--seed', type=int, default=24, help='random number generator seed to use')
78 | parser.add_argument('--gpuid', type=int, default=0, help='which gpu to use, -1 = use CPU')
79 | parser.add_argument('--exp_id', type=str, default='', help='experiment id')
80 |
81 | # parse
82 | args = parser.parse_args()
83 | opt = vars(args)
84 | pprint('parsed input parameters:')
85 | pprint(opt)
86 | return args
87 |
88 | if __name__ == '__main__':
89 |
90 |     args = parse_opt()
91 |     print('args.id is ', args.id)
92 |
93 |
94 |
95 |
96 |
--------------------------------------------------------------------------------
/tools/train.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import os
6 | import os.path as osp
7 | import json
8 | import time
9 | import random
10 |
11 | # model
12 | import _init_paths
13 | from loaders.dataloader import DataLoader
14 | from layers.match import AdaptiveReconstruct
15 | import models.utils as model_utils
16 | import models.eval as eval
17 | from opt import parse_opt
18 |
19 | import torch
20 |
21 | def main(args):
22 | opt = vars(args)
23 | # initialize
24 | opt['dataset_splitBy'] = opt['dataset'] + '_' + opt['splitBy']
25 | checkpoint_dir = osp.join(opt['checkpoint_path'], opt['dataset_splitBy'], opt['exp_id'])
26 | if not osp.isdir(checkpoint_dir):
27 | os.makedirs(checkpoint_dir)
28 |
29 | # set random seed
30 | torch.manual_seed(opt['seed'])
31 | random.seed(opt['seed'])
32 |
33 | # set up loader
34 | data_json = osp.join('cache/prepro', opt['dataset_splitBy'], 'data.json')
35 | data_h5 = osp.join('cache/prepro', opt['dataset_splitBy'], 'data.h5')
36 | loader = DataLoader(data_h5=data_h5, data_json=data_json)
37 |
38 | # prepare feats
39 | feats_dir = '%s_%s_%s' % (args.net_name, args.imdb_name, args.tag)
40 | head_feats_dir = osp.join('cache/feats/', opt['dataset_splitBy'], 'mrcn', feats_dir)
41 |
42 | loader.prepare_mrcn(head_feats_dir, args)
43 |
44 | ann_feats = osp.join('cache/feats', opt['dataset_splitBy'], 'mrcn',
45 | '%s_%s_%s_ann_feats.h5' % (opt['net_name'], opt['imdb_name'], opt['tag']))
46 | loader.loadFeats({'ann': ann_feats})
47 |
48 | # set up model
49 | opt['vocab_size'] = loader.vocab_size
50 | opt['fc7_dim'] = loader.fc7_dim
51 | opt['pool5_dim'] = loader.pool5_dim
52 | opt['num_atts'] = loader.num_atts
53 | model = AdaptiveReconstruct(opt)
54 |
55 | infos = {}
56 | if opt['start_from'] is not None:
57 | pass
58 | iter = infos.get('iter', 0)
59 | epoch = infos.get('epoch', 0)
60 | val_accuracies = infos.get('val_accuracies', [])
61 | val_loss_history = infos.get('val_loss_history', {})
62 | val_result_history = infos.get('val_result_history', {})
63 | loss_history = infos.get('loss_history', {})
64 | loader.iterators = infos.get('iterators', loader.iterators)
65 |     best_val_score = None  # ensure it is defined even when load_best_score != 1
66 |     if opt['load_best_score'] == 1:
67 |         best_val_score = infos.get('best_val_score', None)
68 |
69 | att_weights = loader.get_attribute_weights()
70 |
71 | if opt['gpuid'] >= 0:
72 | model.cuda()
73 |
74 |
75 | # set up optimizer
76 | optimizer = torch.optim.Adam(model.parameters(),
77 | lr=opt['learning_rate'],
78 | betas=(opt['optim_alpha'], opt['optim_beta']),
79 | eps=opt['optim_epsilon'])
80 |
81 | data_time, model_time = 0, 0
82 | lr = opt['learning_rate']
83 |     best_predictions, best_overall = None, None
84 | while True:
85 | model.train()
86 | optimizer.zero_grad()
87 |
88 | T = {}
89 |
90 | tic = time.time()
91 | data = loader.getBatch('train', opt)
92 |
93 | labels = data['labels']
94 | enc_labels = data['enc_labels']
95 | dec_labels = data['dec_labels']
96 | Feats = data['Feats']
97 | att_labels, select_ixs = data['att_labels'], data['select_ixs']
98 |
99 | T['data'] = time.time() - tic
100 |
101 | tic = time.time()
102 | scores, loss,_,_,_,_,_,vis_res_loss, att_res_loss, lang_res_loss = model(Feats['pool5'], Feats['fc7'], Feats['lfeats'], Feats['dif_lfeats'],
103 | Feats['cxt_fc7'], Feats['cxt_lfeats'], labels, enc_labels, dec_labels, att_labels, select_ixs, att_weights)
104 |
105 | loss.backward()
106 | model_utils.clip_gradient(optimizer, opt['grad_clip'])
107 | optimizer.step()
108 | T['model'] = time.time()-tic
109 | wrapped = data['bounds']['wrapped']
110 |
111 | data_time += T['data']
112 | model_time += T['model']
113 |
114 | if iter % opt['losses_log_every'] == 0:
115 |             loss_history[iter] = loss.data[0].item()
116 | print('iter[%s](epoch[%s]), train_loss=%.3f, lr=%.2E, data:%.2fs/iter, model:%.2fs/iter' \
117 | % (iter, epoch, loss.data[0].item(), lr, data_time / opt['losses_log_every'], model_time/opt['losses_log_every']))
118 | data_time, model_time = 0, 0
119 |
120 | if opt['learning_rate_decay_start'] > 0 and iter > opt['learning_rate_decay_start']:
121 | frac = (iter - opt['learning_rate_decay_start']) / opt['learning_rate_decay_every']
122 | decay_factor = 0.1**frac
123 | lr = opt['learning_rate'] * decay_factor
124 | model_utils.set_lr(optimizer, lr)
125 |
126 | if (iter) % opt['save_checkpoint_every'] == 0 or iter == opt['max_iters']:
127 | val_loss, acc, predictions, val_vis_res_loss, val_lang_res_loss = eval.eval_split(loader, model, 'testB', opt)
128 | val_loss_history[iter] = val_loss
129 | val_result_history[iter] = {'loss': val_loss, 'accuracy': acc}
130 | val_accuracies += [(iter, acc)]
131 | print('validation loss: %.2f' % val_loss)
132 | print('validation acc : %.2f%%\n' % (acc * 100.0))
133 |
134 | current_score = acc
135 | if best_val_score is None or current_score > best_val_score:
136 | best_val_score = current_score
137 | best_predictions = predictions
138 | checkpoint_path = osp.join(checkpoint_dir, opt['id'] + '.pth')
139 | checkpoint = {}
140 | checkpoint['model'] = model
141 | checkpoint['opt'] = opt
142 | torch.save(checkpoint, checkpoint_path)
143 | print('model saved to %s' % checkpoint_path)
144 |
145 | infos['iter'] = iter
146 | infos['epoch'] = epoch
147 | infos['iterators'] = loader.iterators
148 | infos['loss_history'] = loss_history
149 | infos['val_accuracies'] = val_accuracies
150 | infos['val_loss_history'] = val_loss_history
151 | infos['best_val_score'] = best_val_score
152 | infos['best_predictions'] = predictions if best_predictions is None else best_predictions
153 |
154 | infos['opt'] = opt
155 | infos['val_result_history'] = val_result_history
156 | infos['word_to_ix'] = loader.word_to_ix
157 | infos['att_to_ix'] = loader.att_to_ix
158 | with open(osp.join(checkpoint_dir, opt['id'] + '.json'), 'w', encoding="utf8") as io:
159 | json.dump(infos, io)
160 |
161 | iter += 1
162 | if wrapped:
163 | epoch += 1
164 | if iter >= opt['max_iters'] and opt['max_iters'] > 0:
165 | break
166 |
167 | if __name__ == '__main__':
168 | args = parse_opt()
169 | main(args)
170 |
--------------------------------------------------------------------------------
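For completeness, the learning-rate schedule implied by the training loop above: once `iter` passes `learning_rate_decay_start`, the rate is multiplied by `0.1 ** frac`, where `frac` grows linearly with the iterations past the start, i.e. one decade of decay per `learning_rate_decay_every` iterations. A small sketch using the defaults from `tools/opt.py`:

```python
def decayed_lr(it, base_lr=4e-4, decay_start=8000, decay_every=8000):
    # mirrors the schedule in tools/train.py
    if decay_start > 0 and it > decay_start:
        frac = (it - decay_start) / decay_every
        return base_lr * (0.1 ** frac)
    return base_lr

for it in (0, 8000, 12000, 16000, 24000, 30000):
    print(it, '%.2e' % decayed_lr(it))
# 8000 -> 4.00e-04, 16000 -> 4.00e-05, 24000 -> 4.00e-06 (one decade per 8k iters)
```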