├── Eval_model.py
├── LICENSE
├── MoreNet.md
├── MoreNet
│   ├── Eval_model.py
│   ├── caption_model
│   │   ├── __init__.py
│   │   ├── att.py
│   │   ├── fc.py
│   │   └── rnnlm.py
│   ├── train_fourgram.py
│   ├── train_rnnlm.py
│   ├── train_rnnlm_cider.py
│   ├── train_sc_cider.py
│   ├── train_trigram.py
│   └── train_warm.py
├── README.md
├── __init__.py
├── caption_model
│   ├── __init__.py
│   ├── att.py
│   ├── fc.py
│   └── rnnlm.py
├── cider
│   └── README.md
├── data
│   ├── README.md
│   └── karpathy
│       └── __init__.py
├── dataloader.py
├── get_ngram.py
├── images
│   ├── badending.png
│   ├── fourgram_att.png
│   ├── rnn_att.png
│   └── rnn_fc.png
├── misc
│   ├── __init__.pyc
│   ├── ngram_reward.py
│   ├── ngram_reward.pyc
│   ├── ngram_utils.py
│   ├── ngram_utils.pyc
│   ├── resnet.py
│   ├── resnet.pyc
│   ├── resnet_utils.py
│   ├── resnet_utils.pyc
│   ├── rewards.py
│   ├── rewards.pyc
│   ├── utils.py
│   └── utils.pyc
├── mycider.py
├── ngram_opts.py
├── scripts
│   ├── __init__.py
│   ├── prepro_feats.py
│   ├── prepro_labels.py
│   ├── prepro_ngrams.py
│   ├── resnet.py
│   ├── resnet_utils.py
│   └── utils.py
├── tools.py
├── train_fourgram.py
├── train_rnnlm.py
├── train_rnnlm_cider.py
├── train_sc_cider.py
├── train_trigram.py
├── train_warm.py
└── vis
    ├── index.html
    └── jquery-1.8.3.min.js
/Eval_model.py:
--------------------------------------------------------------------------------
1 | import time
2 | import json
3 | import logging
4 | import numpy as np
5 | import os.path as osp
6 | from pycoco.bleu.bleu import Bleu
7 | from pycoco.meteor.meteor import Meteor
8 | from pycoco.rouge.rouge import Rouge
9 | from pycoco.cider.cider import Cider
10 | bad_endings = ['a','an','the','in','for','at','of','with','before','after','on','upon','near','to','is','are','am']
11 |
12 | def count_bad(sen,max_step):
13 | sen = sen.split(' ')
14 | if len(sen) < max_step and sen[-1] in bad_endings:
15 | return 1
16 | else:
17 | return 0
18 |
19 |
20 | def evaluate(gt_file, re_file, logger=None):
21 | """
22 | This function is adapted from the MSCOCO evaluation code.
23 | The reference sentences are read from gt_file;
24 | the generated sentences to be evaluated are read from re_file.
25 |
26 | """
27 | gts = json.load(open(gt_file, 'r'))
28 | scorers = [
29 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
30 | (Meteor(), "METEOR"),
31 | (Rouge(), "ROUGE_L"),
32 | (Cider(), "CIDEr")
33 | ]
34 | metrics = []
35 | res = json.load(open(re_file, 'r'))
36 | res = {c['image_id']: [c['caption']] for c in res}
37 | gts = {k: v for k, v in zip(gts['image_ids'], gts['captions']) if k in res}
38 | for scorer, method in scorers:
39 | if logger is not None:
40 | logger.info('computing %s score...' % (scorer.method()))
41 | score, scores = scorer.compute_score(gts, res)
42 | if type(method) == list:
43 | for sc, scs, m in zip(score, scores, method):
44 | if logger is not None:
45 | logger.info("%s: %0.3f" % (m, sc))
46 | metrics.extend(score)
47 | else:
48 | if logger is not None:
49 | logger.info("%s: %0.3f" % (method, score))
50 | metrics.append(score)
51 | return metrics
52 |
53 |
54 |
55 | import sys
56 | import ngram_opts
57 | from dataloader import *
58 | opts = ngram_opts.parse_opt()
59 | lr = 0.0005
60 | opts.batch_size = 50
61 | loader = KKDataLoader(opts)
62 | vocabs = loader.get_vocab()
63 | vocab = ['#END#']
64 | for i in range(len(vocabs)):
65 | ids = str(i+1)
66 | vocab.append(vocabs[ids])
67 | save_dir = 'eval'
68 | model_type = opts.caption_model # fc or attention
69 | rl_type = opts.rl_type # 'fourgram', 'trigram', 'rnnlm' or 'sc'
70 | batch_size = opts.batch_size
71 | image_dim = 2048
72 | cell_size = 512
73 | if rl_type == 'fourgram':
74 | if model_type == 'att':
75 | from caption_model.att import *
76 | vocab_size = 9489
77 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,
78 | ngram=4, on_gpu=True)
79 | model.load('fourgram_cider_model/att_model/model.best')
80 | results = []
81 | for kkk in range(5000 / opts.batch_size):
82 | data = loader.get_batch('test')
83 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
84 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
85 | fc_feats, att_feats = tmp
86 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
87 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3])
88 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20)
89 | results += greedy_res
90 | else:
91 | from caption_model.fc import *
92 | vocab_size = 9489
93 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,
94 | ngram=4, on_gpu=True)
95 | model.load('fourgram_cider_model/fc_model/model.best')
96 | results = []
97 | for kkk in range(5000 / opts.batch_size):
98 | data = loader.get_batch('test')
99 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
100 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
101 | fc_feats, att_feats = tmp
102 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
103 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20)
104 | results += greedy_res
105 | elif rl_type =='trigram':
106 | if model_type == 'att':
107 | from caption_model.att import *
108 | vocab_size = 9489
109 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,
110 | ngram=3, on_gpu=True)
111 | model.load('trigram_cider_model/att_model/model.best')
112 | results = []
113 | for kkk in range(5000 / opts.batch_size):
114 | data = loader.get_batch('test')
115 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
116 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
117 | fc_feats, att_feats = tmp
118 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
119 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3])
120 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20)
121 | results += greedy_res
122 | else:
123 | from caption_model.fc import *
124 | vocab_size = 9489
125 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,
126 | ngram=3, on_gpu=True)
127 | model.load('trigram_cider_model/fc_model/model.best')
128 | results = []
129 | for kkk in range(5000 / opts.batch_size):
130 | data = loader.get_batch('test')
131 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
132 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
133 | fc_feats, att_feats = tmp
134 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
135 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20)
136 | results += greedy_res
137 | elif rl_type =='rnnlm':
138 | if model_type == 'att':
139 | from caption_model.att import *
140 | vocab_size = 9489
141 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True)
142 | model.load('rnnlm_cider_model/att_model/model.best')
143 | results = []
144 | for kkk in range(5000 / opts.batch_size):
145 | data = loader.get_batch('test')
146 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
147 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
148 | fc_feats, att_feats = tmp
149 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
150 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3])
151 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20)
152 | results += greedy_res
153 | else:
154 | from caption_model.fc import *
155 | vocab_size = 9489
156 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,on_gpu=True)
157 | model.load('rnnlm_cider_model/fc_model/model.best')
158 | results = []
159 | for kkk in range(5000 / opts.batch_size):
160 | data = loader.get_batch('test')
161 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
162 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
163 | fc_feats, att_feats = tmp
164 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
165 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20)
166 | results += greedy_res
167 | elif rl_type =='sc':
168 | if model_type == 'att':
169 | from caption_model.att import *
170 | vocab_size = 9489
171 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True)
172 | model.load('sc_cider_model/att_model/model.best')
173 | results = []
174 | for kkk in range(5000 / opts.batch_size):
175 | data = loader.get_batch('test')
176 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
177 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
178 | fc_feats, att_feats = tmp
179 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
180 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3])
181 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20)
182 | results += greedy_res
183 | else:
184 | from caption_model.fc import *
185 | vocab_size = 9489
186 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,on_gpu=True)
187 | model.load('sc_cider_model/fc_model/model.best')
188 | results = []
189 | for kkk in range(5000 / opts.batch_size):
190 | data = loader.get_batch('test')
191 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
192 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
193 | fc_feats, att_feats = tmp
194 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
195 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20)
196 | results += greedy_res
197 |
198 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w'))
199 | gt_file = osp.join('data/features', 'captions_test.json')
200 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1]
201 | bad_count = [count_bad(results[i]['caption'], 20) for i in range(5000)]  # max_step matches the max_length=20 used above
202 | total_bad_count = sum(bad_count)
203 | print score , total_bad_count
204 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Tszhang Guo
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MoreNet.md:
--------------------------------------------------------------------------------
1 | We use the same data preprocessing as in README.md. Copy all files from `MoreNet/` into the main directory, replacing the originals.
2 | ### Warm Start
3 | To help the CIDEr-based REINFORCE algorithm converge faster and more stably, we first warm-start the captioning model by running the script below:
4 |
5 | ```bash
6 | $ python train_warm.py --caption_model fc
7 | ```
8 | If you want to use attention, run
9 | ```bash
10 | $ python train_warm.py --caption_model att
11 | ```
12 | Download our pretrained warm start model from this [link](https://drive.google.com/open?id=1fj_Dgy9Gmxc9t6phzWKaH6DUZZXB-a6T).
13 |
14 | ### Train using Self-critical
15 | ```bash
16 | $ python train_sc_cider.py --caption_model att
17 | ```
18 | You will see a large boost in the CIDEr score, but with many bad endings; the reward being optimized is sketched below.
19 |
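For reference, the reward optimized by `train_sc_cider.py` is the CIDEr score of a sampled caption minus the CIDEr score of the greedy (baseline) caption for the same image. Below is a minimal sketch of that computation; `cider_score` is a hypothetical stand-in for the `cider_temp`/`CiderScorer` wrapper used in the script.

```python
import numpy as np

def self_critical_reward(sample_results, greedy_results, cider_score):
    """Self-critical reward: sampled CIDEr minus the greedy baseline.

    `cider_score` is assumed to map one result dict
    ({'image_id': ..., 'caption': ...}) to a scalar CIDEr score,
    like cider_temp in train_sc_cider.py.
    """
    sample_scores = np.array([cider_score(r) for r in sample_results])
    greedy_scores = np.array([cider_score(r) for r in greedy_results])
    # Captions that beat the greedy baseline get a positive reward,
    # worse captions get a negative one.
    return (sample_scores - greedy_scores).astype(np.float32)
```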
20 | ### Train using Ngram constraint
21 | ```bash
22 | $ python train_fourgram.py --caption_model fc
23 | ```
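During sampling, the four-gram constraint only allows words that have been observed to follow the previous three generated words in the training captions; when that trigram never occurs, only the END token is allowed. A simplified sketch of the logit mask built in `caption_model/fc.py` (the real code precomputes per-trigram masks from `data/fourgram.pkl`):

```python
import numpy as np

def fourgram_logit_mask(prev_three_words, fourgram_table, vocab_size):
    """Additive logit mask derived from the last three generated words.

    `fourgram_table` maps a 3-word tuple to the word ids that were seen
    to follow it in the training captions (as stored in data/fourgram.pkl).
    """
    allowed = np.zeros(vocab_size)
    key = tuple(prev_three_words)
    if key in fourgram_table:
        allowed[list(fourgram_table[key])] = 1
    else:
        allowed[0] = 1  # only the END token stays available
    # Allowed words keep their logit, everything else is pushed to -100000,
    # mirroring `Temp_mask * 100000 - 100000` in fc.py.
    return allowed * 100000 - 100000
```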
24 |
25 | ### Train using Neural Language model
26 |
27 | First, train a neural language model, or download our pretrained LSTM language model from this [link](https://drive.google.com/open?id=1fj_Dgy9Gmxc9t6phzWKaH6DUZZXB-a6T).
28 | ```bash
29 | $ python train_rnnlm.py
30 | ```
31 |
32 | Then train the RL model with the neural-language-model constraint, starting from the same warm-start model (a sketch of how the LM can act as a constraint follows the commands below).
33 | ```bash
34 | $ python train_rnnlm_cider.py --caption_model fc
35 | ```
36 | or
37 | ```bash
38 | $ python train_rnnlm_cider.py --caption_model att
39 | ```
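The exact way the language model constrains sampling is implemented in `tools.py` (`lm_caption` / `att_lm_caption`), which is not reproduced here. As an illustration only, the sketch below shows one plausible way to turn the LM's next-word distribution (via `single_step_prob` from `caption_model/rnnlm.py`) into an additive logit mask, analogous to the n-gram mask above; the probability threshold and the masking rule are assumptions, not the repository's actual code.

```python
import numpy as np

def lm_logit_mask(lm, prev_words, lm_state, prob_threshold=1e-4):
    """Illustrative only: mask out words the RNN language model deems unlikely.

    `lm.single_step_prob(prev_words, lm_state)` returns the LM's next-word
    distribution (batch_size x vocab_size) and the updated LM state. The
    threshold below is an assumed hyper-parameter, not taken from tools.py.
    """
    lm_prob, lm_state = lm.single_step_prob(prev_words, lm_state)
    allowed = (lm_prob.cpu().numpy() > prob_threshold).astype(np.float32)
    allowed[:, 0] = 1.0  # always keep the END token available
    # Add this to the caption model's logits before choosing the next word.
    return allowed * 100000 - 100000, lm_state
```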
40 |
41 |
--------------------------------------------------------------------------------
/MoreNet/Eval_model.py:
--------------------------------------------------------------------------------
1 | import time
2 | import json
3 | import logging
4 | import numpy as np
5 | import os.path as osp
6 | from pycoco.bleu.bleu import Bleu
7 | from pycoco.meteor.meteor import Meteor
8 | from pycoco.rouge.rouge import Rouge
9 | from pycoco.cider.cider import Cider
10 | bad_endings = ['a','an','the','in','for','at','of','with','before','after','on','upon','near','to','is','are','am']
11 |
12 | def count_bad(sen,max_step):
13 | sen = sen.split(' ')
14 | if len(sen) < max_step and sen[-1] in bad_endings:
15 | return 1
16 | else:
17 | return 0
18 |
19 |
20 | def evaluate(gt_file, re_file, logger=None):
21 | """
22 | This function is adapted from the MSCOCO evaluation code.
23 | The reference sentences are read from gt_file;
24 | the generated sentences to be evaluated are read from re_file.
25 |
26 | """
27 | gts = json.load(open(gt_file, 'r'))
28 | scorers = [
29 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
30 | (Meteor(), "METEOR"),
31 | (Rouge(), "ROUGE_L"),
32 | (Cider(), "CIDEr")
33 | ]
34 | metrics = []
35 | res = json.load(open(re_file, 'r'))
36 | res = {c['image_id']: [c['caption']] for c in res}
37 | gts = {k: v for k, v in zip(gts['image_ids'], gts['captions']) if k in res}
38 | for scorer, method in scorers:
39 | if logger is not None:
40 | logger.info('computing %s score...' % (scorer.method()))
41 | score, scores = scorer.compute_score(gts, res)
42 | if type(method) == list:
43 | for sc, scs, m in zip(score, scores, method):
44 | if logger is not None:
45 | logger.info("%s: %0.3f" % (m, sc))
46 | metrics.extend(score)
47 | else:
48 | if logger is not None:
49 | logger.info("%s: %0.3f" % (method, score))
50 | metrics.append(score)
51 | return metrics
52 |
53 |
54 |
55 | import sys
56 | import ngram_opts
57 | from dataloader import *
58 | opts = ngram_opts.parse_opt()
59 | lr = 0.0005
60 | opts.batch_size = 50
61 | loader = KKDataLoader(opts)
62 | vocabs = loader.get_vocab()
63 | vocab = ['#END#']
64 | for i in range(len(vocabs)):
65 | ids = str(i+1)
66 | vocab.append(vocabs[ids])
67 | save_dir = 'eval'
68 | model_type = opts.caption_model # fc or attention
69 | rl_type = opts.rl_type # 'fourgram', 'trigram', 'rnnlm' or 'sc'
70 | batch_size = opts.batch_size
71 | image_dim = 2048
72 | cell_size = 512
73 | if rl_type == 'fourgram':
74 | if model_type == 'att':
75 | from caption_model.att import *
76 | vocab_size = 9489
77 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,
78 | ngram=4, on_gpu=True)
79 | model.load('fourgram_cider_model/att_model/model.best')
80 | results = []
81 | for kkk in range(5000 / opts.batch_size):
82 | data = loader.get_batch('test')
83 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
84 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
85 | fc_feats, att_feats = tmp
86 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
87 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3])
88 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20)
89 | results += greedy_res
90 | else:
91 | from caption_model.fc import *
92 | vocab_size = 9489
93 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,
94 | ngram=4, on_gpu=True)
95 | model.load('fourgram_cider_model/fc_model/model.best')
96 | results = []
97 | for kkk in range(5000 / opts.batch_size):
98 | data = loader.get_batch('test')
99 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
100 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
101 | fc_feats, att_feats = tmp
102 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
103 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20)
104 | results += greedy_res
105 | elif rl_type =='trigram':
106 | if model_type == 'att':
107 | from caption_model.att import *
108 | vocab_size = 9489
109 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,
110 | ngram=3, on_gpu=True)
111 | model.load('trigram_cider_model/att_model/model.best')
112 | results = []
113 | for kkk in range(5000 / opts.batch_size):
114 | data = loader.get_batch('test')
115 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
116 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
117 | fc_feats, att_feats = tmp
118 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
119 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3])
120 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20)
121 | results += greedy_res
122 | else:
123 | from caption_model.fc import *
124 | vocab_size = 9489
125 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,
126 | ngram=3, on_gpu=True)
127 | model.load('trigram_cider_model/fc_model/model.best')
128 | results = []
129 | for kkk in range(5000 / opts.batch_size):
130 | data = loader.get_batch('test')
131 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
132 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
133 | fc_feats, att_feats = tmp
134 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
135 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20)
136 | results += greedy_res
137 | elif rl_type =='rnnlm':
138 | if model_type == 'att':
139 | from caption_model.att import *
140 | vocab_size = 9489
141 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True)
142 | model.load('rnnlm_cider_model/att_model/model.best')
143 | results = []
144 | for kkk in range(5000 / opts.batch_size):
145 | data = loader.get_batch('test')
146 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
147 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
148 | fc_feats, att_feats = tmp
149 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
150 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3])
151 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20)
152 | results += greedy_res
153 | else:
154 | from caption_model.fc import *
155 | vocab_size = 9489
156 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,on_gpu=True)
157 | model.load('rnnlm_cider_model/fc_model/model.best')
158 | results = []
159 | for kkk in range(5000 / opts.batch_size):
160 | data = loader.get_batch('test')
161 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
162 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
163 | fc_feats, att_feats = tmp
164 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
165 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20)
166 | results += greedy_res
167 | elif rl_type =='sc':
168 | if model_type == 'att':
169 | from caption_model.att import *
170 | vocab_size = 9489
171 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True)
172 | model.load('sc_cider_model/att_model/model.best')
173 | results = []
174 | for kkk in range(5000 / opts.batch_size):
175 | data = loader.get_batch('test')
176 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
177 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
178 | fc_feats, att_feats = tmp
179 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
180 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3])
181 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20)
182 | results += greedy_res
183 | else:
184 | from caption_model.fc import *
185 | vocab_size = 9489
186 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,on_gpu=True)
187 | model.load('sc_cider_model/fc_model/model.best')
188 | results = []
189 | for kkk in range(5000 / opts.batch_size):
190 | data = loader.get_batch('test')
191 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
192 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
193 | fc_feats, att_feats = tmp
194 | image_id = [data['infos'][i]['id'] for i in range(batch_size)]
195 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20)
196 | results += greedy_res
197 |
198 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w'))
199 | gt_file = osp.join('data/features', 'captions_test.json')
200 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1]
201 | bad_count = [count_bad(results[i]['caption'], 20) for i in range(5000)]  # max_step matches the max_length=20 used above
202 | total_bad_count = sum(bad_count)
203 | print score , total_bad_count
204 |
--------------------------------------------------------------------------------
/MoreNet/caption_model/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/MoreNet/caption_model/fc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | import torch.nn.functional as F
5 | from torch.autograd import Variable
6 | import pickle
7 | class FCModel(nn.Module):
8 | def __init__(self, batch_size, cell_size, image_dim,
9 | vocab_size, lr,ngram=0, on_gpu=False):
10 | super(FCModel, self).__init__()
11 | # Settings
12 | self.batch_size = batch_size
13 | self.cell_size = cell_size
14 | self.image_dim = image_dim
15 | self.vocab_size = vocab_size
16 | self.lr = lr
17 | self.on_gpu = on_gpu
18 |
19 | # Word embedding lookup table
20 | self.word_embedding = nn.Embedding(vocab_size, cell_size)
21 |
22 | # Image embedding mlp
23 | self.image_embedding = nn.Linear(image_dim, cell_size, bias=False)
24 |
25 | # State initializer
26 | self.c_initializer = nn.Linear(cell_size, cell_size, bias=False)
27 | self.h_initializer = nn.Linear(cell_size, cell_size, bias=False)
28 |
29 | # Recurrent layer
30 | self.rnn = nn.LSTMCell(cell_size, cell_size)
31 |
32 | # Word predicting mlp
33 | self.predictor = nn.Linear(cell_size, vocab_size)
34 | if ngram == 3:
35 | trigram = pickle.load(open('data/trigram.pkl', 'rb'))
36 | self.trigram_mask = {}
37 | for tri in trigram:
38 | temp = np.zeros((vocab_size,))
39 | for word in trigram[tri]:
40 | temp[word] = 1
41 | self.trigram_mask[tri] = temp
42 | elif ngram == 4:
43 | fourgram = pickle.load(open('data/fourgram.pkl', 'rb'))
44 | self.fourgram_mask = {}
45 | for four in fourgram:
46 | temp = np.zeros((vocab_size,))
47 | for word in fourgram[four]:
48 | temp[word] = 1
49 | self.fourgram_mask[four] = temp
50 |
51 | # Onehot encoder
52 | self.onehot = torch.eye(vocab_size)
53 | if self.on_gpu:
54 | self.onehot = self.onehot.cuda()
55 |
56 | # Optimizer
57 | self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
58 |
59 | # Move to gpu if necessary
60 | if self.on_gpu:
61 | self.cuda()
62 |
63 | def forward(self, word_emb, state):
64 | # Get states
65 | h_tm1, c_tm1 = state
66 |
67 | # RNN input
68 | h_t, c_t = self.rnn(word_emb, (h_tm1, c_tm1))
69 |
70 | # Next word's logits
71 | logits = self.predictor(h_t)
72 |
73 | return logits, (h_t, c_t)
74 |
75 | def initial_state(self, image):
76 | image = Variable(torch.Tensor(image))
77 | if self.on_gpu:
78 | image = image.cuda()
79 |
80 | # Image embedding
81 | feat = self.image_embedding(image)
82 |
83 | # Initial state (batch_size, rnn_size)
84 | h0 = nn.Tanh()(self.h_initializer(feat))
85 | c0 = nn.Tanh()(self.c_initializer(feat))
86 |
87 | return h0, c0
88 |
89 | def train_on_batch(self, image, sentence, mask, reward):
90 | # Convert numpy to torch
91 | self.batch_size = image.shape[0]
92 | sentence = Variable(torch.LongTensor(sentence.tolist()))
93 | mask = torch.Tensor(mask)
94 | reward = torch.Tensor(reward)
95 | T = sentence.size()[0] - 1
96 |
97 | # If using gpu
98 | if self.on_gpu:
99 | sentence = sentence.cuda()
100 | mask = mask.cuda()
101 | reward = reward.cuda()
102 |
103 | # Initial state of RNN
104 | state = self.initial_state(image)
105 |
106 | # Word embedding for input sequence
107 | inputs = self.word_embedding(sentence[:-1, :])
108 |
109 | # Recurrent computation
110 | logits = []
111 | for i in xrange(T):
112 | word = inputs[i, :, :]
113 | logit, state = self.forward(word, state)
114 | logits.append(logit.unsqueeze(0))
115 | logits = torch.cat(logits, 0) # (T, batch_size, vocab_size)
116 | logits = logits.resize(T*self.batch_size, self.vocab_size)
117 |
118 | # Next word's distribution
119 | prob = nn.Softmax()(logits).data
120 |
121 | # Ground-truth
122 | targets = sentence.data[1:, :].view(T*self.batch_size)
123 | gt_prob = self.onehot.index_select(0, targets)
124 |
125 | # Gradients
126 | logit_grad = (prob - gt_prob).view(T, self.batch_size, self.vocab_size)
127 | logit_grad = logit_grad * mask.view(T, self.batch_size, 1).expand_as(logit_grad)
128 | logit_grad = logit_grad * reward.view(1, self.batch_size, 1).expand_as(logit_grad)
129 | logit_grad = logit_grad / mask.sum(0).view(1, self.batch_size, 1).expand_as(logit_grad)
130 | logit_grad = logit_grad / self.batch_size
131 | logit_grad = logit_grad.view(T*self.batch_size, self.vocab_size)
132 |
133 | # Gradient descent
134 | self.optimizer.zero_grad()
135 | logits.backward(gradient=logit_grad)
136 | self.optimizer.step()
137 | loss = -1
138 |
139 | return loss
140 |
141 | def single_step(self, state, words, manner='greedy'):
142 | words = Variable(torch.LongTensor(words), volatile=True)
143 | if self.on_gpu:
144 | words = words.cuda()
145 |
146 | # Word embedding
147 | words = self.word_embedding(words.unsqueeze(0)).squeeze(0)
148 |
149 | # Take a rnn step
150 | logits, new_state = self.forward(words, state)
151 |
152 | # Next words
153 | if manner == 'greedy':
154 | new_words = logits.data.cpu().numpy().argmax(1)
155 | elif manner == 'sample':
156 | # Gumbel argmax trick
157 | if self.on_gpu:
158 | U = torch.cuda.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1)
159 | else:
160 | U = torch.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1)
161 | V = logits.data - torch.log(-U.log())
162 | new_words = V.cpu().numpy().argmax(1)
163 | else:
164 | raise ValueError('Unknown manner: [{}]'.format(manner))
165 |
166 | return new_state, new_words
167 |
168 | def inference(self, vocab, image_ids, image, manner='greedy',
169 | max_length=16, verbose=0, batch_size=None):
170 | # Choose batch-size
171 | self.batch_size = image.shape[0]
172 |
173 | # Beginning tokens
174 | init_word = torch.LongTensor([0] * self.batch_size)
175 |
176 | # Initialization
177 | results = []
178 | captions = []
179 |
180 | # Iteratively generate words
181 | state = self.initial_state(image)
182 | sentences = []
183 | word = init_word
184 | for _ in xrange(max_length):
185 | state, word = self.single_step(state, word, manner=manner)
186 | sentences.append(word)
187 | sentences = np.array(sentences).transpose()
188 |
189 | # Translate indexes to sentences
190 | for j in xrange(sentences.shape[0]):
191 | idxs = np.where(sentences[j, :] == 0)[0]
192 | end_index = idxs[0] if len(idxs) > 0 else max_length
193 | cap = ' '.join([vocab[w] for w in sentences[j, :end_index]])
194 | if verbose > 0:
195 | print 'id={}, {}'.format(image_ids[j], cap)
196 | captions.append(sentences[j, :end_index])
197 | results.append({'image_id': image_ids[j], 'caption': cap})
198 | # Type: captions (np.array), results (natural language)
199 | return captions, results
200 |
201 | def ngram_single_step(self, state, words,temp_mask, manner='greedy'):
202 | words = Variable(torch.LongTensor(words), volatile=True)
203 | Temp_mask = Variable(torch.Tensor(temp_mask))
204 | if self.on_gpu:
205 | words = words.cuda()
206 | Temp_mask = Temp_mask.cuda()
207 | Temp_mask = Temp_mask * 100000 - 100000  # allowed words add 0, masked words add -100000 (acts as -inf on the logits)
208 | # Word embedding
209 | words = self.word_embedding(words.unsqueeze(0)).squeeze(0)
210 |
211 | # Take a rnn step
212 | logits, new_state = self.forward(words, state)
213 | logits = logits + Temp_mask
214 | # Next words
215 | if manner == 'greedy':
216 | new_words = logits.data.cpu().numpy().argmax(1)
217 | elif manner == 'sample':
218 | # Gumbel argmax trick
219 | if self.on_gpu:
220 | U = torch.cuda.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1)
221 | else:
222 | U = torch.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1)
223 | V = logits.data - torch.log(-U.log())
224 | new_words = V.cpu().numpy().argmax(1)
225 | else:
226 | raise ValueError('Unknown manner: [{}]'.format(manner))
227 |
228 | return new_state, new_words
229 |
230 | def fourgram_inference(self, vocab, image_ids, image, manner='greedy',
231 | max_length=16):
232 | # Choose batch-size
233 | self.batch_size = image.shape[0]
234 |
235 | # Beginning tokens
236 | init_word = torch.LongTensor([9488] * self.batch_size)
237 |
238 | # Initialization
239 | results = []
240 | captions = []
241 |
242 | # Iteratively generate words
243 | state = self.initial_state(image)
244 | sentences = []
245 | word = init_word
246 | sentencemask = np.zeros((max_length + 3, self.batch_size), dtype=np.int32)
247 | sentencemask[0:3,:] = 9488
248 | for jj in xrange(max_length):
249 | temp_mask = self.get_four_Mask(sentencemask,jj+3)
250 | state, word = self.ngram_single_step(state, word,temp_mask,manner=manner)
251 | sentencemask[jj+3,:] = word
252 | sentences.append(word)
253 | sentences = np.array(sentences).transpose()
254 |
255 | # Translate indexes to sentences
256 | for j in xrange(sentences.shape[0]):
257 | idxs = np.where(sentences[j, :] == 0)[0]
258 | end_index = idxs[0] if len(idxs) > 0 else max_length
259 | cap = ' '.join([vocab[w] for w in sentences[j, :end_index]])
260 | captions.append(sentences[j, :end_index])
261 | results.append({'image_id': image_ids[j], 'caption': cap})
262 |
263 | # Type: captions (np.array), results (natural language)
264 | return captions, results
265 |
266 | def get_four_Mask(self,sentencemask,index):
267 | tempmask = np.zeros((self.batch_size,self.vocab_size))
268 | for hh in range(self.batch_size):
269 | temp = tuple(list(sentencemask[index-3:index,hh]))
270 | if temp in self.fourgram_mask:
271 | tempmask[hh] = self.fourgram_mask[temp]
272 | else:
273 | tempmask[hh][0] = 1 # END token
274 | return tempmask
275 |
276 | def trigram_inference(self, vocab, image_ids, image, manner='greedy',max_length=16):
277 | # Choose batch-size
278 | self.batch_size = image.shape[0]
279 | results = []
280 | captions = []
281 |
282 | # Iteratively generate words
283 | state = self.initial_state(image)
284 | sentences = []
285 | init_word = torch.LongTensor([9488] * self.batch_size)
286 | word = init_word
287 | sentencemask = np.zeros((max_length + 2, self.batch_size), dtype=np.int32)
288 | sentencemask[0:2,:] = 9488
289 | for jj in xrange(max_length):
290 | temp_mask = self.get_tri_Mask(sentencemask,jj+2)
291 | state, word = self.ngram_single_step(state, word,temp_mask,manner=manner)
292 | sentencemask[jj+2,:] = word
293 | sentences.append(word)
294 | sentences = np.array(sentences).transpose()
295 |
296 | # Translate indexes to sentences
297 | for j in xrange(sentences.shape[0]):
298 | idxs = np.where(sentences[j, :] == 0)[0]
299 | end_index = idxs[0] if len(idxs) > 0 else max_length
300 | cap = ' '.join([vocab[w] for w in sentences[j, :end_index]])
301 | captions.append(sentences[j, :end_index])
302 | results.append({'image_id': image_ids[j], 'caption': cap})
303 |
304 | # Type: captions (np.array), results (natural language)
305 | return captions, results
306 |
307 | def get_tri_Mask(self,sentencemask,index):
308 | tempmask = np.zeros((self.batch_size,self.vocab_size))
309 | for hh in range(self.batch_size):
310 | temp = tuple(list(sentencemask[index-2:index,hh]))
311 | if temp in self.trigram_mask:
312 | tempmask[hh] = self.trigram_mask[temp]
313 | else:
314 | tempmask[hh][0] = 1 # END token
315 | return tempmask
316 |
317 | def save(self, file_path):
318 | with open(file_path, 'wb') as f:
319 | torch.save(self.state_dict(), f)
320 |
321 | def load(self, file_path):
322 | with open(file_path, 'rb') as f:
323 | self.load_state_dict(torch.load(f))
324 |
325 |
--------------------------------------------------------------------------------
/MoreNet/caption_model/rnnlm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import numpy as np
6 |
7 | class LM(nn.Module):
8 | def __init__(self, batch_size, hidden_size,
9 | vocab_size, word_embed_size,
10 | lr, on_gpu=True):
11 | super(LM, self).__init__()
12 | self.lr = lr
13 | self.vocab_size = vocab_size
14 | self.word_embed_size = word_embed_size
15 | self.hidden_size = hidden_size
16 | self.batch_size = batch_size
17 | self.on_gpu = on_gpu
18 |
19 | # word embedding layer
20 | self.word_embedding_layer = nn.Embedding(vocab_size,word_embed_size)
21 |
22 | # language model LSTM
23 | self.rnn = nn.LSTMCell(word_embed_size,hidden_size)
24 |
25 | # predict layer
26 | self.predict_layer = nn.Linear(hidden_size,vocab_size)
27 |
28 | # Optimizer
29 | self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
30 |
31 | self.onehot = torch.eye(vocab_size)
32 | if self.on_gpu:
33 | self.onehot = self.onehot.cuda()
34 | # Move to gpu if necessary
35 | if self.on_gpu:
36 | self.cuda()
37 | def init_state(self):
38 | h = Variable(torch.zeros(self.batch_size, self.hidden_size))
39 | c = Variable(torch.zeros(self.batch_size, self.hidden_size))
40 | if self.on_gpu:
41 | h = h.cuda()
42 | c = c.cuda()
43 | return h,c
44 |
45 | def forward(self, word_emb, state):
46 | # Get states
47 | h_tm1, c_tm1 = state
48 | # RNN input
49 | h_t, c_t = self.rnn(word_emb, (h_tm1, c_tm1))
50 | # Next word's logits
51 | logits = self.predict_layer(h_t)
52 | return logits, (h_t, c_t)
53 |
54 | def train_on_batch(self,sentence, mask, reward):
55 | # Convert numpy to torch
56 | sentence = Variable(torch.LongTensor(sentence.tolist()))
57 | mask = torch.Tensor(mask)
58 | reward = torch.Tensor(reward)
59 | T = sentence.size()[0] - 1
60 | # If using gpu
61 |
62 | sentence = sentence.cuda()
63 | mask = mask.cuda()
64 | reward = reward.cuda()
65 |
66 | # Initial state of RNN
67 | state = self.init_state()
68 | # Word embedding for input sequence
69 | inputs = self.word_embedding_layer(sentence[:-1, :])
70 | # Recurrent computation
71 | logits = []
72 | for i in xrange(T):
73 | word = inputs[i, :, :]
74 | logit, state = self.forward(word, state)
75 | logits.append(logit.unsqueeze(0))
76 | logits = torch.cat(logits, 0) # (T, batch_size, vocab_size)
77 | logits = logits.resize(T*self.batch_size, self.vocab_size)
78 | # Next word's distribution
79 | prob = F.softmax(logits).data
80 | # Ground-truth
81 | targets = sentence.data[1:, :].view(T*self.batch_size)
82 | gt_prob = self.onehot.index_select(0, targets)
83 | # Gradients
84 | logit_grad = (prob - gt_prob).view(T, self.batch_size, self.vocab_size)
85 | logit_grad = logit_grad * mask.view(T, self.batch_size, 1).expand_as(logit_grad)
86 | logit_grad = logit_grad * reward.view(1, self.batch_size, 1).expand_as(logit_grad)
87 | logit_grad = logit_grad / mask.sum(0).view(1, self.batch_size, 1).expand_as(logit_grad)
88 | logit_grad = logit_grad / self.batch_size
89 | logit_grad = logit_grad.view(T*self.batch_size, self.vocab_size)
90 |
91 | # Gradient descent
92 | self.optimizer.zero_grad()
93 | logits.backward(gradient=logit_grad)
94 | self.optimizer.step()
95 | targets = targets.cpu().numpy()
96 | loss = - np.log(prob[np.arange(T * self.batch_size), targets])
97 | return loss.mean().numpy()
98 |
99 | def test_on_batch(self,sentence, mask):
100 | sentence = Variable(torch.LongTensor(sentence.tolist()))
101 | mask = torch.Tensor(mask)
102 | T = sentence.size()[0] - 1
103 | # If using gpu
104 | if self.on_gpu:
105 | sentence = sentence.cuda()
106 | mask = mask.cuda()
107 | # Initial state of RNN
108 | state = self.init_state()
109 | # Word embedding for input sequence
110 | inputs = self.word_embedding_layer(sentence[:-1, :])
111 | # Recurrent computation
112 | logits = []
113 | for i in xrange(T):
114 | word = inputs[i, :, :]
115 | logit, state = self.forward(word, state)
116 | logits.append(logit.unsqueeze(0))
117 | logits = torch.cat(logits, 0) # (T, batch_size, vocab_size)
118 | logits = logits.resize(T * self.batch_size, self.vocab_size)
119 | # Next word's distribution
120 | prob = F.softmax(logits).data
121 | prob = prob.view(T,self.batch_size,self.vocab_size)
122 | # Ground-truth
123 | return prob
124 |
125 | def single_step_prob(self,word,state):
126 | word = Variable(torch.LongTensor(word.tolist()))
127 | if self.on_gpu:
128 | word = word.cuda()
129 | word_emb = self.word_embedding_layer(word)
130 | logit, state2 = self.forward(word_emb, state) # logit : (batch_size, vocab_size)
131 | prob = F.softmax(logit).data # (batch_size, vocab_size)
132 | return prob,state2
133 |
134 | def save(self, file_path):
135 | with open(file_path, 'wb') as f:
136 | torch.save(self.state_dict(), f)
137 |
138 | def load(self, file_path):
139 | with open(file_path, 'rb') as f:
140 | self.load_state_dict(torch.load(f))
141 |
--------------------------------------------------------------------------------
/MoreNet/train_fourgram.py:
--------------------------------------------------------------------------------
1 | from caption_model.att import *
2 | from caption_model.fc import *
3 | from mycider import *
4 | from multiprocessing import Pool
5 | import os
6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
7 | import ngram_opts
8 | from tools import *
9 | from dataloader import *
10 | opts = ngram_opts.parse_opt()
11 | if opts.caption_model == 'fc':
12 | opts.use_att = False
13 | else:
14 | opts.use_att = True
15 |
16 | batch_size = opts.batch_size
17 |
18 | loader = KKDataLoader(opts)
19 | vocabs = loader.get_vocab()
20 | vocab = ['#END#']
21 | for i in range(len(vocabs)):
22 | ids = str(i+1)
23 | vocab.append(vocabs[ids])
24 |
25 | if not os.path.exists('fourgram_cider_model'):
26 | os.mkdir('fourgram_cider_model')
27 |
28 | if opts.use_att:
29 | save_dir = 'fourgram_cider_model/' + 'att_model'
30 | else:
31 | save_dir = 'fourgram_cider_model/' + 'fc_model'
32 | if not os.path.exists(save_dir):
33 | os.mkdir(save_dir)
34 | print(save_dir + ' has been built')
35 |
36 |
37 | image_dim = 2048
38 | vocab_size = loader.vocab_size + 2
39 | cell_size = 512
40 | lr = 0.00005
41 | if opts.use_att:
42 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=4,on_gpu=True)
43 | model.load('warm_model/att_warm/model.init')
44 | else:
45 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=4,on_gpu=True)
46 | model.load('warm_model/fc_warm/model.init')
47 |
48 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json'))
49 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0)
50 |
51 | def cider_temp(res):
52 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]})
53 | score, _ = cider_scorer.compute_score()
54 | return score
55 |
56 | pool = Pool(processes=5)
57 | best_score = -1
58 | logger = Logger(save_dir)
59 | iter = 0
60 | finish_iter = 100000
61 | timer = Timer()
62 | timer.tic()
63 | while iter < finish_iter:
64 | iter += 1
65 | data = loader.get_batch('train')
66 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
67 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
68 | fc_feats, att_feats = tmp
69 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
70 | if opts.use_att:
71 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3])
72 | feature = att_feats
73 | else:
74 | feature = fc_feats
75 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16)
76 | greedy_scores = np.array(pool.map(cider_temp, greedy_res))
77 |
78 | all_caps, all_results, all_scores = [], [], []
79 | for _ in xrange(20):
80 | # Generate captions by sampling
81 | sample_caps, sample_results = model.fourgram_inference(vocab, image_id, feature,
82 | manner='sample',
83 | max_length=16)
84 |
85 | # Compute cider scores for sampled captions
86 | sample_scores = np.array(pool.map(cider_temp, sample_results))
87 | all_caps.append(sample_caps)
88 | all_results.append(sample_results)
89 | all_scores.append(sample_scores)
90 |
91 | all_scores = np.array(all_scores)
92 | sample_caps, sample_results, sample_scores = [], [], []
93 | for n in xrange(opts.batch_size):
94 | best_i = all_scores[:, n].argmax()
95 | sample_caps.append(all_caps[best_i][n])
96 | sample_results.append(all_results[best_i][n])
97 | sample_scores.append(all_scores[best_i, n])
98 | sample_scores = np.array(sample_scores)
99 |
100 | max_length = max([cap.shape[0] for cap in sample_caps])
101 | caption = np.zeros([max_length + 2, opts.batch_size], dtype=np.int32)
102 | for n in xrange(opts.batch_size):
103 | L = sample_caps[n].shape[0]
104 | caption[1:L + 1, n] = sample_caps[n]
105 | caption[L + 1:, n] = 0
106 | caption[0,:] = 9488
107 | mask = np.zeros([max_length + 1, opts.batch_size], dtype=np.float32)
108 | for n in xrange(opts.batch_size):
109 | L = sample_caps[n].shape[0]
110 | mask[:L + 1, n] = 1
111 | reward = (sample_scores - greedy_scores).astype(np.float32)
112 | print image_id[0]
113 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption']
114 | print 'sample: ', sample_scores[0], sample_results[0]['caption']
115 | loss_train = model.train_on_batch(feature, caption[1:,:], mask, reward)
116 | if iter % 300 == 0:
117 | results = []
118 | for nn in range(5000/opts.batch_size):
119 | data = loader.get_batch('val')
120 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
121 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
122 | fc_feats, att_feats = tmp
123 | if opts.use_att:
124 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],
125 | att_feats.shape[3])
126 | feature_val = att_feats
127 | else:
128 | feature_val = fc_feats
129 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
130 |
131 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature_val,manner='greedy',max_length=16)
132 | # Generate sentences for validation set
133 | results += greedy_res
134 | # Evaluate generated captions
135 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w'))
136 | gt_file = osp.join('data/features', 'captions_val.json')
137 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1]
138 |
139 | if score > best_score:
140 | best_score = score
141 | model.save(osp.join(save_dir, 'model.best'))
142 | model.save(osp.join(save_dir,'model.ckpt'))
143 | # Output training information
144 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec'
145 | .format(iter, -1, score, best_score, finish_iter, timer.toc()))
146 | # Reset loss and timer
147 | train_losses = []
148 | timer.tic()
149 |
150 | # If early-stop condition triggers
151 | if iter > finish_iter:
152 | break
153 |
154 |
--------------------------------------------------------------------------------
/MoreNet/train_rnnlm.py:
--------------------------------------------------------------------------------
1 | import ngram_opts
2 | from dataloader import *
3 | import os
4 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
5 | from caption_model.rnnlm import *
6 |
7 | opts = ngram_opts.parse_opt()
8 | if opts.caption_model == 'fc':
9 | opts.use_att = False
10 | else:
11 | opts.use_att = True
12 | loader = KKDataLoader(opts)
13 |
14 |
15 | batch_size = 100
16 | hidden_size = 256
17 | word_embed_size = 256
18 | vocab_size = loader.vocab_size + 2 # +2 reserves ids for the END (0) and start (9488) tokens
19 | lr = 0.0001
20 | lm = LM(batch_size, hidden_size,vocab_size, word_embed_size,lr)
21 | lm.load('warm_model/rnnlm/model.init')
22 | Labels = loader.h5_label_file['labels']
23 | new_labels = np.zeros((Labels.shape[1]+1,Labels.shape[0]),dtype=Labels.dtype)
24 | new_labels[0,:] = 9488 # Set start token to 9488, the total vocab size is 9489
25 | for i in range(Labels.shape[0]):
26 | new_labels[1:,i] = Labels[i,:]
27 |
28 | Ind = range(len(Labels))
29 | mask = np.ones((16,100))
30 | reward = np.ones((100,))
31 | import random
32 | for i in range(1000):
33 | random.shuffle(Ind)
34 | Loss = []
35 | for j in range(100):
36 | index = Ind[j*batch_size:(j+1)*batch_size]
37 | batch_sen = new_labels[:,index]
38 | loss = lm.train_on_batch(batch_sen,mask,reward)
39 | Loss.append(loss)
40 | print i,np.mean(Loss)
41 | if i % 10 == 0:
42 | lm.save('warm_model2/rnnlm/model.init')
43 |
44 |
--------------------------------------------------------------------------------
/MoreNet/train_rnnlm_cider.py:
--------------------------------------------------------------------------------
1 | from caption_model.att import *
2 | from caption_model.fc import *
3 | from mycider import *
4 | from multiprocessing import Pool
5 | from caption_model.rnnlm import *
6 | import os
7 | #os.environ["CUDA_VISIBLE_DEVICES"] = "2"
8 | import ngram_opts
9 | from tools import *
10 | from dataloader import *
11 | opts = ngram_opts.parse_opt()
12 | if opts.caption_model == 'fc':
13 | opts.use_att = False
14 | else:
15 | opts.use_att = True
16 |
17 | batch_size = opts.batch_size
18 |
19 | loader = KKDataLoader(opts)
20 | vocabs = loader.get_vocab()
21 | vocab = ['#END#']
22 | for i in range(len(vocabs)):
23 | ids = str(i+1)
24 | vocab.append(vocabs[ids])
25 |
26 | if opts.use_att:
27 | save_dir = 'rnnlm_cider_model/' + 'att_model'
28 | else:
29 | save_dir = 'rnnlm_cider_model/' + 'fc_model'
30 | if not os.path.exists(save_dir):
31 | os.mkdir(save_dir)
32 | print(save_dir + ' has been built')
33 |
34 |
35 | image_dim = opts.fc_feat_size
36 | vocab_size = loader.vocab_size + 2
37 | cell_size = opts.rnn_size
38 | lr = 0.00005
39 | if opts.use_att:
40 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True)
41 | model.load('warm_model/att_warm/model.init')
42 | else:
43 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True)
44 | model.load('warm_model/fc_warm/model.init')
45 |
46 | word_embed_size = 256
47 | hidden_size = 256
48 | manner = 'sample'
49 | lm = LM(batch_size, hidden_size,vocab_size+1, word_embed_size,lr)
50 | lm.load('warm_model/rnnlm/model.init')
51 |
52 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json'))
53 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0)
54 | def cider_temp(res):
55 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]})
56 | score, _ = cider_scorer.compute_score()
57 | return score
58 |
59 | pool = Pool(processes=4)
60 | logger = Logger(save_dir)
61 | best_score = -1
62 | iters = 0
63 | finish_iter = 100
64 | timer = Timer()
65 | timer.tic()
66 | best_count = 0
67 | max_step = 14
68 | while iters < finish_iter:
69 | iters += 1
70 | data = loader.get_batch('train')
71 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
72 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
73 | fc_feats, att_feats = tmp
74 | if opts.use_att:
75 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3])
76 | feature = att_feats
77 | else:
78 | feature = fc_feats
79 |
80 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
81 |
82 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16)
83 | greedy_scores = np.array(pool.map(cider_temp, greedy_res))
84 |
85 | all_caps, all_results, all_scores = [], [], []
86 | for _ in xrange(30):
87 | # Generate captions by sampling
88 | if opts.use_att:
89 | sample_caps, sample_results = att_lm_caption(lm,model,image_id,vocab,loader,feature,max_step,'sample')
90 | else:
91 | sample_caps, sample_results = lm_caption(lm, model, image_id, vocab, loader, feature, max_step,'sample')
92 | # Compute cider scores for sampled captions
93 | sample_scores = np.array(pool.map(cider_temp, sample_results))
94 | all_caps.append(sample_caps)
95 | all_results.append(sample_results)
96 | all_scores.append(sample_scores)
97 |
98 | all_scores = np.array(all_scores)
99 | sample_caps, sample_results, sample_scores = [], [], []
100 | for n in xrange(opts.batch_size):
101 | best_i = all_scores[:, n].argmax()
102 | sample_caps.append(all_caps[best_i][:,n])
103 | sample_results.append(all_results[best_i][n])
104 | sample_scores.append(all_scores[best_i, n])
105 | sample_scores = np.array(sample_scores)
106 | sample_caps = np.array(sample_caps)
107 | sample_caps = sample_caps.transpose()
108 |
109 | mask = np.ones((sample_caps.shape[0]-1,sample_caps.shape[1]))
110 | for n in range(opts.batch_size):
111 | index = np.where(sample_caps[:,n] == 0)[0]
112 | if len(index) > 1:
113 | mask[index[1]-1:,n] = 0
114 |
115 | reward = (sample_scores - greedy_scores).astype(np.float32)
116 | sample_caps[0,:] = 9488
117 | print iters, image_id[0]
118 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption']
119 | print 'sample: ', sample_scores[0], sample_results[0]['caption']
120 |
121 | loss_train = model.train_on_batch(feature,sample_caps, mask, reward)
122 | if iters % 300 == 0:
123 | results = []
124 | for kkk in range(5000/opts.batch_size):
125 | data = loader.get_batch('val')
126 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
127 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
128 | fc_feats, att_feats = tmp
129 | if opts.use_att:
130 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],
131 | att_feats.shape[3])
132 | feature = att_feats
133 | else:
134 | feature = fc_feats
135 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
136 |
137 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16)
138 | results += greedy_res
139 | # Evaluate generated captions
140 | json.dump(results, open(osp.join(save_dir, 'rl_result.json'), 'w'))
141 | gt_file = osp.join('data/features', 'captions_val.json')
142 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'rl_result.json'))[-1]
143 |
144 | if score > best_score:
145 | best_score = score
146 | model.save(osp.join(save_dir, 'model.best'))
147 | model.save(osp.join(save_dir,'model.ckpt'))
148 | # Output training information
149 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec'
150 | .format(iters, -1, score, best_score, finish_iter, timer.toc()))
151 |
152 | train_losses = []
153 | timer.tic()
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
--------------------------------------------------------------------------------
/MoreNet/train_sc_cider.py:
--------------------------------------------------------------------------------
1 | from caption_model.att import *
2 | from caption_model.fc import *
3 | from mycider import *
4 | from multiprocessing import Pool
5 | import os
6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
7 | import ngram_opts
8 | from tools import *
9 | from dataloader import *
10 | opts = ngram_opts.parse_opt()
11 | if opts.caption_model == 'fc':
12 | opts.use_att = False
13 | else:
14 | opts.use_att = True
15 |
16 | batch_size = opts.batch_size
17 |
18 | loader = KKDataLoader(opts)
19 | vocabs = loader.get_vocab()
20 | vocab = ['#END#']
21 | for i in range(len(vocabs)):
22 | ids = str(i+1)
23 | vocab.append(vocabs[ids])
24 |
25 | if not os.path.exists('sc_cider_model'):
26 | os.mkdir('sc_cider_model')
27 |
28 | if opts.use_att:
29 | save_dir = 'sc_cider_model/' + 'att_model'
30 | else:
31 | save_dir = 'sc_cider_model/' + 'fc_model'
32 | if not os.path.exists(save_dir):
33 | os.mkdir(save_dir)
34 | print(save_dir + ' has been built')
35 |
36 |
37 | image_dim = 2048
38 | vocab_size = loader.vocab_size + 1
39 | cell_size = 512
40 | lr = 0.00005
41 | if opts.use_att:
42 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=0,on_gpu=True)
43 | model.load('warm_model/att_warm/model.init')
44 | else:
45 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=0,on_gpu=True)
46 | model.load('warm_model/fc_warm/model.init')
47 |
48 |
49 | # Initialize cider-scorer
50 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json'))
51 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0)
52 |
53 | def cider_temp(res):
54 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]})
55 | score, _ = cider_scorer.compute_score()
56 | return score
57 |
58 | pool = Pool(processes=5)
59 |
60 | best_score = -1
61 | logger = Logger(save_dir)
62 | iter = 0
63 | finish_iter = 1000000
64 | timer = Timer()
65 | timer.tic()
66 | while iter < finish_iter:
67 | iter += 1
68 | data = loader.get_batch('train')
69 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
70 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
71 | fc_feats, att_feats = tmp
72 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
73 | if opts.use_att:
74 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3])
75 | feature = att_feats
76 | else:
77 | feature = fc_feats
78 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16)
79 | greedy_scores = np.array(pool.map(cider_temp, greedy_res))
80 |
81 | all_caps, all_results, all_scores = [], [], []
82 | for _ in xrange(20):
83 | # Generate captions by sampling
84 | sample_caps, sample_results = model.inference(vocab, image_id, feature,
85 | manner='sample',
86 | max_length=16)
87 |
88 | # Compute cider scores for sampled captions
89 | sample_scores = np.array(pool.map(cider_temp, sample_results))
90 | all_caps.append(sample_caps)
91 | all_results.append(sample_results)
92 | all_scores.append(sample_scores)
93 |
94 | all_scores = np.array(all_scores)
95 | sample_caps, sample_results, sample_scores = [], [], []
96 | for n in xrange(opts.batch_size):
97 | best_i = all_scores[:, n].argmax()
98 | sample_caps.append(all_caps[best_i][n])
99 | sample_results.append(all_results[best_i][n])
100 | sample_scores.append(all_scores[best_i, n])
101 | sample_scores = np.array(sample_scores)
102 |
103 | max_length = max([cap.shape[0] for cap in sample_caps])
104 | caption = np.zeros([max_length + 2, opts.batch_size], dtype=np.int32)
105 | for n in xrange(opts.batch_size):
106 | L = sample_caps[n].shape[0]
107 | caption[1:L + 1, n] = sample_caps[n]
108 | caption[L + 1:, n] = 0
109 | caption[0,:] = 9488
110 | mask = np.zeros([max_length + 1, opts.batch_size], dtype=np.float32)
111 | for n in xrange(opts.batch_size):
112 | L = sample_caps[n].shape[0]
113 | mask[:L + 1, n] = 1
114 | reward = (sample_scores - greedy_scores).astype(np.float32)
115 | print image_id[0]
116 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption']
117 | print 'sample: ', sample_scores[0], sample_results[0]['caption']
118 | loss_train = model.train_on_batch(feature, caption[1:,:], mask, reward)
119 | if iter % 300 == 0:
120 | results = []
121 | for nn in range(5000/opts.batch_size):
122 | data = loader.get_batch('val')
123 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
124 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
125 | fc_feats, att_feats = tmp
126 | if opts.use_att:
127 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],
128 | att_feats.shape[3])
129 | feature_val = att_feats
130 | else:
131 | feature_val = fc_feats
132 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
133 |
134 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature_val,manner='greedy',max_length=16)
135 | # Generate sentences for validation set
136 | results += greedy_res
137 | # Evaluate generated captions
138 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w'))
139 | gt_file = osp.join('data/features', 'captions_val.json')
140 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1]
141 | # json.dump(results, open(osp.join(save_dir, 'kk_rl_result_'+ str(iter) + '.json'), 'w'))
142 | # Update if finding new best model
143 | if score > best_score:
144 | best_score = score
145 | model.save(osp.join(save_dir, 'model.best'))
146 | model.save(osp.join(save_dir,'model.ckpt'))
147 | # Output training information
148 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec'
149 | .format(iter, -1, score, best_score, finish_iter, timer.toc()))
150 | # Reset loss and timer
151 | train_losses = []
152 | timer.tic()
153 |
154 | # If early-stop condition triggers
155 | if iter > finish_iter:
156 | break
157 |
158 |
--------------------------------------------------------------------------------
/MoreNet/train_trigram.py:
--------------------------------------------------------------------------------
1 | from caption_model.att import *
2 | from caption_model.fc import *
3 | from mycider import *
4 | from multiprocessing import Pool
5 | import os
6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
7 | import ngram_opts
8 | from tools import *
9 | from dataloader import *
10 | opts = ngram_opts.parse_opt()
11 | if opts.caption_model == 'fc':
12 | opts.use_att = False
13 | else:
14 | opts.use_att = True
15 | batch_size = opts.batch_size
16 |
17 | loader = KKDataLoader(opts)
18 | vocabs = loader.get_vocab()
19 | vocab = ['#END#']
20 | for i in range(len(vocabs)):
21 | ids = str(i+1)
22 | vocab.append(vocabs[ids])
23 |
24 | if not os.path.exists('trigram_cider_model'):
25 | os.mkdir('trigram_cider_model')
26 |
27 | if opts.use_att:
28 | save_dir = 'trigram_cider_model/' + 'att_model'
29 | else:
30 | save_dir = 'trigram_cider_model/' + 'fc_model'
31 | if not os.path.exists(save_dir):
32 | os.mkdir(save_dir)
33 | print(save_dir + ' has been built')
34 |
35 |
36 | image_dim = 2048
37 | vocab_size = loader.vocab_size + 2
38 | cell_size = 512
39 | lr = 0.00005
40 | if opts.use_att:
41 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=3,on_gpu=True)
42 | model.load('warm_model/att_warm/model.init')
43 | else:
44 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=3,on_gpu=True)
45 | model.load('warm_model/fc_warm/model.init')
46 |
47 |
48 | # Initialize cider-scorer
49 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json'))
50 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0)
51 |
52 | def cider_temp(res):
53 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]})
54 | score, _ = cider_scorer.compute_score()
55 | return score
56 |
57 | pool = Pool(processes=5)
58 |
59 | best_score = -1
60 | logger = Logger(save_dir)
61 | iter = 0
62 | finish_iter = 100000
63 | timer = Timer()
64 | timer.tic()
65 | while iter < finish_iter:
66 | iter += 1
67 | data = loader.get_batch('train')
68 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
69 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
70 | fc_feats, att_feats = tmp
71 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
72 | if opts.use_att:
73 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3])
74 | feature = att_feats
75 | else:
76 | feature = fc_feats
77 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16)
78 | greedy_scores = np.array(pool.map(cider_temp, greedy_res))
79 |
80 | all_caps, all_results, all_scores = [], [], []
81 | for _ in xrange(20):
82 | # Generate captions by sampling
83 | sample_caps, sample_results = model.trigram_inference(vocab, image_id, feature,
84 | manner='sample',
85 | max_length=16)
86 |
87 | # Compute cider scores for sampled captions
88 | sample_scores = np.array(pool.map(cider_temp, sample_results))
89 | all_caps.append(sample_caps)
90 | all_results.append(sample_results)
91 | all_scores.append(sample_scores)
92 |
93 | all_scores = np.array(all_scores)
94 | sample_caps, sample_results, sample_scores = [], [], []
95 | for n in xrange(opts.batch_size):
96 | best_i = all_scores[:, n].argmax()
97 | sample_caps.append(all_caps[best_i][n])
98 | sample_results.append(all_results[best_i][n])
99 | sample_scores.append(all_scores[best_i, n])
100 | sample_scores = np.array(sample_scores)
101 |
102 | max_length = max([cap.shape[0] for cap in sample_caps])
103 | caption = np.zeros([max_length + 2, opts.batch_size], dtype=np.int32)
104 | for n in xrange(opts.batch_size):
105 | L = sample_caps[n].shape[0]
106 | caption[1:L + 1, n] = sample_caps[n]
107 | caption[L + 1:, n] = 0
108 | caption[0,:] = 9488
109 | mask = np.zeros([max_length + 1, opts.batch_size], dtype=np.float32)
110 | for n in xrange(opts.batch_size):
111 | L = sample_caps[n].shape[0]
112 | mask[:L + 1, n] = 1
113 |
114 | reward = (sample_scores - greedy_scores).astype(np.float32)
115 | print image_id[0]
116 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption']
117 | print 'sample: ', sample_scores[0], sample_results[0]['caption']
118 | loss_train = model.train_on_batch(feature, caption[1:,:], mask, reward)
119 | if iter % 300 == 0:
120 | results = []
121 | for nn in range(5000/opts.batch_size):
122 | data = loader.get_batch('val')
123 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
124 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
125 | fc_feats, att_feats = tmp
126 | if opts.use_att:
127 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],
128 | att_feats.shape[3])
129 | feature_val = att_feats
130 | else:
131 | feature_val = fc_feats
132 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
133 |
134 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature_val,manner='greedy',max_length=16)
135 | # Generate sentences for validation set
136 | results += greedy_res
137 | # Evaluate generated captions
138 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w'))
139 | gt_file = osp.join('data/features', 'captions_val.json')
140 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1]
141 | # json.dump(results, open(osp.join(save_dir, 'kk_rl_result_'+ str(iter) + '.json'), 'w'))
142 | # Update if finding new best model
143 | if score > best_score:
144 | best_score = score
145 | model.save(osp.join(save_dir, 'model.best'))
146 | model.save(osp.join(save_dir,'model.ckpt'))
147 | # Output training information
148 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec'
149 | .format(iter, -1, score, best_score, finish_iter, timer.toc()))
150 | # Reset loss and timer
151 | train_losses = []
152 | timer.tic()
153 |
154 | # If early-stop condition triggers
155 | if iter > finish_iter:
156 | break
157 |
158 |
--------------------------------------------------------------------------------
/MoreNet/train_warm.py:
--------------------------------------------------------------------------------
1 | from caption_model.att import *
2 | from caption_model.fc import *
3 | import os
4 | #os.environ["CUDA_VISIBLE_DEVICES"] = "0"
5 | import ngram_opts
6 | from tools import *
7 | from dataloader import *
8 | opts = ngram_opts.parse_opt()
9 | if opts.caption_model == 'fc':
10 | opts.use_att = False
11 | else:
12 | opts.use_att = True
13 |
14 | batch_size = opts.batch_size
15 |
16 | loader = KKDataLoader(opts)
17 | vocabs = loader.get_vocab()
18 | vocab = ['#END#']
19 | for i in range(len(vocabs)):
20 | ids = str(i+1)
21 | vocab.append(vocabs[ids])
22 |
23 | if opts.use_att:
24 | save_dir = 'warm_model/' + 'att_warm'
25 | else:
26 | save_dir = 'warm_model/' + 'fc_warm'
27 | if not os.path.exists(save_dir):
28 | os.mkdir(save_dir)
29 | print(save_dir + ' has been built')
30 |
31 |
32 | image_dim = 2048
33 | vocab_size = loader.vocab_size + 2 # set start token to 9488
34 | cell_size = 512
35 | lr = 0.00005
36 | if opts.use_att:
37 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True)
38 | else:
39 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True)
40 |
41 |
42 | iters = 0
43 | best_score = -1
44 | train_losses = []
45 | timer = Timer()
46 | timer.tic()
47 | logger = Logger(save_dir)
48 | finish_iter = 1000000
49 | while iters < finish_iter:
50 | iters += 1
51 | data = loader.get_batch('train')
52 | tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']]
53 |
54 | fc_feats, att_feats, labels, masks = tmp
55 | if opts.use_att:
56 | feature = att_feats.reshape(att_feats.shape[0],att_feats.shape[1]* att_feats.shape[2],att_feats.shape[3])
57 | else:
58 | feature = fc_feats
59 | Label = labels.transpose()
60 | Label[0,:] = 9488
61 | Mask = masks.transpose()[0:-1,:]
62 | reward = np.ones((opts.batch_size*5,))
63 |
64 | train_losses.append(model.train_on_batch(feature,Label, Mask, reward))
65 | # Validation
66 | if iters % 500 == 0 and iters >= 20000:
67 | results = []
68 | for nn in range(5000/opts.batch_size):
69 | datas = loader.get_batch('val')
70 | tmp = [datas['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
71 | datas['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
72 | fc_feats, att_feats = tmp
73 | if opts.use_att:
74 | feature = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],
75 | att_feats.shape[3])
76 | else:
77 | feature = fc_feats
78 | image_id = [datas['infos'][i]['id'] for i in range(opts.batch_size)]
79 |
80 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature,manner='greedy',max_length=16)
81 |
82 | results += greedy_res
83 |
84 | json.dump(results, open(osp.join(save_dir, 'tmp_result.json'), 'w'))
85 | gt_file = osp.join('data/features', 'captions_val.json')
86 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'tmp_result.json'))[-1]
87 |
88 | if score > best_score:
89 | best_score = score
90 | model.save(osp.join(save_dir, 'model.init'))
91 |
92 | # Output training information
93 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec'
94 | .format(iters, np.mean(train_losses), score, best_score, finish_iter, timer.toc()))
95 | # Reset loss and timer
96 | train_losses = []
97 | timer.tic()
98 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | PyTorch implementation of [Improving Reinforcement Learning Based Image Captioning with Natural Language Prior](https://arxiv.org/abs/1809.06227)
2 |
3 | ## Requirements
4 | Python 2.7
5 |
6 | PyTorch 0.4 (along with torchvision)
7 |
8 | cider package (copy from [here](https://drive.google.com/open?id=15jqeHYQD0LJjp_e86QvJipUL4_-MHH5p) and extract it to `cider/`)
9 |
10 | pycoco package (copy from [here](https://drive.google.com/open?id=1B71eCxPj8h7cw5SGVyKOLPsjbbM6dFAF) and extract it to `pycoco/`)
11 |
12 | You also need to download a pretrained ResNet model for both training and evaluation. The models can be downloaded from [here](https://drive.google.com/open?id=1YD7YjPPoK-WGZhmeTcV8LEp_3hYoBcpq) and should be placed in `data/imagenet_weights`.
13 |
14 | ## Train your own network on COCO
15 |
16 | ### Download COCO captions and preprocess them
17 |
18 | Download the preprocessed COCO captions from [link](https://drive.google.com/open?id=1RzIFR-12fxptp6wm8bqteosLhmRh0cyJ), which follow Karpathy's split. Copy `dataset_coco.json`, `captions_train.json`, `captions_val.json` and `captions_test.json` into `data/features`.
19 |
20 | Then do:
21 |
22 | ```bash
23 | $ python scripts/prepro_labels.py --input_json data/dataset_coco.json --output_json data/cocotalk.json --output_h5 data/cocotalk
24 | ```
25 |
26 | `prepro_labels.py` maps all words that occur <= 5 times to a special `UNK` token and creates a vocabulary for all the remaining words. The image information and vocabulary are dumped into `data/cocotalk.json`, and the discretized caption data are dumped into `data/cocotalk_label.h5`.
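A quick way to sanity-check the preprocessing output is to open the two files it produces (a minimal sketch; it only relies on the `ix_to_word`/`images` fields and the `labels` datasets that `dataloader.py` reads):

```python
import json
import h5py

info = json.load(open('data/cocotalk.json'))
print(len(info['ix_to_word']))    # vocabulary size (index 0 is reserved for the END token)
print(info['images'][0])          # id, file_path and split of the first image

h5 = h5py.File('data/cocotalk_label.h5', 'r')
print(h5['labels'].shape)         # (num_captions, max_seq_length) word indices
print(h5['label_start_ix'][:5])   # per-image caption ranges (1-indexed)
```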
27 |
28 | ### Download COCO dataset and pre-extract the image features
29 |
30 | Download the COCO images from [link](http://mscoco.org/dataset/#download). We need the 2014 training and 2014 validation images. Put `train2014/` and `val2014/` in the same directory, denoted as `$IMAGE_ROOT`.
31 |
32 | Then:
33 |
34 | ```bash
35 | $ python scripts/prepro_feats.py --input_json data/dataset_coco.json --output_dir data/cocotalk --images_root $IMAGE_ROOT
36 | ```
37 |
38 |
39 | `prepro_feats.py` extracts the ResNet-101 features (both the fc feature and the last conv feature) of each image. The features are saved in `data/cocotalk_fc` and `data/cocotalk_att`; the resulting files total about 200GB.
40 |
41 | (Check the prepro scripts for more options, like other resnet models or other attention sizes.)
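The extracted features can then be loaded per image the same way `dataloader.py` does it (a minimal sketch; the image id below is a placeholder, and the 14x14 spatial grid assumes the default attention size):

```python
import numpy as np

image_id = 391895  # placeholder COCO image id
fc_feat = np.load('data/cocotalk_fc/%d.npy' % image_id)             # (2048,) pooled ResNet-101 feature
att_feat = np.load('data/cocotalk_att/%d.npz' % image_id)['feat']   # (14, 14, 2048) spatial features
print(fc_feat.shape, att_feat.shape)
```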
42 |
43 | ### Warm Start
44 |
45 | To help the CIDEr-based REINFORCE algorithm converge faster and more stably, we first warm-start the captioning model by running the script below:
46 |
47 | ```bash
48 | $ python train_warm.py --caption_model fc
49 | ```
50 | If you want to use the attention model, run:
51 | ```bash
52 | $ python train_warm.py --caption_model att
53 | ```
54 | You can also download our pretrained warm-start models from this [link](https://drive.google.com/open?id=1ZmAqqknqPVnwmiPS2KF6wQCURVhTuZp2). The best CIDEr scores on the validation set are 90.1 for FC and 94.2 for attention.
55 |
56 | ### Train using Self-critical
57 | ```bash
58 | $ python train_sc_cider.py --caption_model att
59 | ```
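The key quantity in `train_sc_cider.py` is the self-critical reward: the CIDEr score of a sampled caption minus the CIDEr score of the greedy baseline for the same image. A minimal sketch of just that step:

```python
import numpy as np

def self_critical_reward(sample_scores, greedy_scores):
    # positive when the sampled caption beats the greedy baseline, negative otherwise
    return (np.asarray(sample_scores) - np.asarray(greedy_scores)).astype(np.float32)

print(self_critical_reward([1.10, 0.80], [0.95, 0.95]))  # [ 0.15 -0.15]
```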
60 | You will see a large boost in CIDEr score, but with lots of bad endings.
61 | 
62 |
63 |
64 |
65 | ### Train using Ngram constraint
66 |
67 | First, preprocess the dataset to build the n-gram data:
68 | ```bash
69 | $ python get_ngram.py
70 | ```
71 | This will generate `fourgram.pkl` and `trigram.pkl` in `data/`.
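Each pickle maps a tuple of preceding word indices (three for `fourgram.pkl`, two for `trigram.pkl`, with index 9488 used as BEGIN padding) to a dictionary of observed next-word indices and their counts, as built in `get_ngram.py`. A minimal sketch for inspecting one:

```python
import pickle

fourgram = pickle.load(open('data/fourgram.pkl'))
print(len(fourgram))                       # number of distinct 3-word contexts
context, next_words = fourgram.items()[0]  # Python 2 dict.items() returns a list
print(context, next_words)                 # e.g. (9488, 9488, 9488) -> {next_word_index: count, ...}
```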
72 |
73 | Then run:
74 | ```bash
75 | $ python train_fourgram.py --caption_model fc
76 | ```
77 | It takes roughly 40,000 iterations to converge, and the experiment details are written to `experiment.log` in `save_dir`, like:
78 | 
79 |
80 |
81 | ### Train using a Neural Language Model
82 |
83 | First train a neural language model, or download our pretrained LSTM language model from [link](https://drive.google.com/open?id=1ZmAqqknqPVnwmiPS2KF6wQCURVhTuZp2).
84 | ```bash
85 | $ python train_rnnlm.py
86 | ```
87 |
88 | Then train the RL model with the neural-language-model constraint, starting from the same warm-start model:
89 | ```bash
90 | $ python train_rnnlm_cider.py --caption_model fc
91 | ```
92 | or
93 | ```bash
94 | $ python train_rnnlm_cider.py --caption_model att
95 | ```
96 | It takes roughly 36,000 iterations to converge, and the experiment details are written to `experiment.log` in `save_dir`.
97 |
98 | 
99 |
100 |
101 | ### Evaluating `CIDEr`, `METEOR`, `ROUGE_L` and `BLEU` scores with bad-ending removal
102 | ```bash
103 | $ python Eval_model.py --caption_model fc --rl_type fourgram
104 | ```
105 |
106 | ### Try another network structure
107 | We also tried another neural network structure and obtained similar results. Please see `MoreNet.md` for more details.
108 |
109 | ## Acknowledgements
110 | Thanks to the original [self-critical](https://github.com/ruotianluo/self-critical.pytorch) implementation by ruotianluo.
111 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | #'Prior Image Caption'
--------------------------------------------------------------------------------
/caption_model/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/caption_model/fc.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import numpy as np
4 | import torch.nn.functional as F
5 | from torch.autograd import Variable
6 | import pickle
7 |
8 | class FCModel(nn.Module):
9 | def __init__(self, batch_size, cell_size, image_dim,
10 | vocab_size, lr, ngram=0, on_gpu=False):
11 | super(FCModel, self).__init__()
12 | # Settings
13 | self.batch_size = batch_size
14 | self.cell_size = cell_size
15 | self.image_dim = image_dim
16 | self.vocab_size = vocab_size
17 | self.lr = lr
18 | self.on_gpu = on_gpu
19 |
20 | # Word embedding lookup table
21 | self.word_embedding = nn.Embedding(vocab_size, cell_size)
22 |
23 | # Image embedding mlp
24 | self.image_embedding = nn.Linear(image_dim, cell_size, bias=False)
25 |
26 |         # Recurrent layer
27 | self.rnn = nn.LSTMCell(cell_size, cell_size)
28 |
29 | # Word predicting mlp
30 | self.predictor = nn.Linear(cell_size, vocab_size)
31 |
32 | # Onehot encoder
33 |         self.onehot = torch.eye(vocab_size)
34 | if self.on_gpu:
35 | self.onehot = self.onehot.cuda()
36 |
37 | # Optimizer
38 | self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
39 |
40 | if ngram == 3:
41 | trigram = pickle.load(open('data/features/trigram.pkl'))
42 | self.trigram_mask = {}
43 | for tri in trigram:
44 | temp = np.zeros((vocab_size,))
45 | for word in trigram[tri]:
46 | temp[word] = 1
47 | self.trigram_mask[tri] = temp
48 | elif ngram == 4:
49 | fourgram = pickle.load(open('data/features/fourgram.pkl'))
50 | self.fourgram_mask = {}
51 | for four in fourgram:
52 | temp = np.zeros((vocab_size,))
53 | for word in fourgram[four]:
54 | temp[word] = 1
55 | self.fourgram_mask[four] = temp
56 |
57 | # Move to gpu if necessary
58 | if self.on_gpu:
59 | self.cuda()
60 |
61 | def forward(self, word_emb, state):
62 | # Get states
63 | h_tm1, c_tm1 = state
64 |
65 | # RNN input
66 | h_t, c_t = self.rnn(word_emb, (h_tm1, c_tm1))
67 |
68 |         # Next word's logits
69 | logits = self.predictor(h_t)
70 |
71 | return logits, (h_t, c_t)
72 |
73 | def initial_state(self, image):
74 | image = Variable(torch.Tensor(image))
75 | if self.on_gpu:
76 | image = image.cuda()
77 |
78 | # Image embedding
79 | first_word = self.image_embedding(image)
80 | h = Variable(torch.zeros(self.batch_size, self.cell_size))
81 | c = Variable(torch.zeros(self.batch_size, self.cell_size))
82 | if self.on_gpu:
83 | h = h.cuda()
84 | c = c.cuda()
85 | zero_state = h, c
86 |
87 | return first_word, zero_state
88 |
89 | def train_on_batch(self, image, sentence, mask, reward):
90 | # Convert numpy to torch
91 | self.batch_size = image.shape[0]
92 | sentence = Variable(torch.LongTensor(sentence.tolist()))
93 | mask = torch.Tensor(mask)
94 | reward = torch.Tensor(reward)
95 | T = sentence.size()[0]
96 |
97 | # If using gpu
98 | if self.on_gpu:
99 | sentence = sentence.cuda()
100 | mask = mask.cuda()
101 | reward = reward.cuda()
102 |
103 | # Initial state of RNN
104 | first_word, state = self.initial_state(image)
105 |
106 | # Word embedding for input sequence
107 | inputs = self.word_embedding(sentence[:-1, :])
108 |
109 | # Recurrent computation
110 | logits = []
111 | for i in xrange(T):
112 | if i == 0:
113 | word = first_word
114 | else:
115 | word = inputs[i-1, :, :]
116 | logit, state = self.forward(word, state)
117 | logits.append(logit.unsqueeze(0))
118 | logits = torch.cat(logits, 0) # (T, batch_size, vocab_size)
119 | logits = logits.resize(T*self.batch_size, self.vocab_size)
120 |
121 | # Next word's distribution
122 | prob = nn.Softmax()(logits).data
123 |
124 | # Ground-truth
125 | targets = sentence.data[:, :].view(T*self.batch_size)
126 | gt_prob = self.onehot.index_select(0, targets)
127 |
128 | # Gradients
129 | logit_grad = (prob - gt_prob).view(T, self.batch_size, self.vocab_size)
130 | logit_grad = logit_grad * mask.view(T, self.batch_size, 1).expand_as(logit_grad)
131 | logit_grad = logit_grad * reward.view(1, self.batch_size, 1).expand_as(logit_grad)
132 | logit_grad = logit_grad / mask.sum(0).view(1, self.batch_size, 1).expand_as(logit_grad)
133 | logit_grad = logit_grad / self.batch_size
134 | logit_grad = logit_grad.view(T*self.batch_size, self.vocab_size)
135 |
136 | # Gradient descent
137 | self.optimizer.zero_grad()
138 | logits.backward(gradient=logit_grad)
139 | self.optimizer.step()
140 | loss = -1
141 |
142 | return loss
143 |
144 | def single_step(self, time_step,state, words, manner='greedy'):
145 | if time_step != 0:
146 | words = Variable(torch.LongTensor(words), volatile=True)
147 | if self.on_gpu:
148 | words = words.cuda()
149 | # Word embedding
150 | words = self.word_embedding(words.unsqueeze(0)).squeeze(0)
151 |
152 | # Take a rnn step
153 | logits, new_state = self.forward(words, state)
154 |
155 | # Next words
156 | if manner == 'greedy':
157 | new_words = logits.data.cpu().numpy().argmax(1)
158 | elif manner == 'sample':
159 | # Gumbel argmax trick
160 | if self.on_gpu:
161 | U = torch.cuda.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1)
162 | else:
163 | U = torch.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1)
164 | V = logits.data - torch.log(-U.log())
165 | new_words = V.cpu().numpy().argmax(1)
166 | else:
167 | raise ValueError('Unknown manner: [{}]'.format(manner))
168 |
169 | return new_state, new_words
170 |
171 | def inference(self, vocab, image_ids, image, manner='greedy',max_length=16):
172 | # Choose batch-size
173 | self.batch_size = image.shape[0]
174 |
175 |         # Initialization
176 | results = []
177 | captions = []
178 |
179 | # Iteratively generate words
180 | first_word, state = self.initial_state(image)
181 | sentences = []
182 | word = first_word
183 | for i in xrange(max_length):
184 | state, word = self.single_step(i,state, word, manner=manner)
185 | sentences.append(word)
186 | sentences = np.array(sentences).transpose()
187 |
188 | # Translate indexes to sentences
189 | for j in xrange(sentences.shape[0]):
190 | idxs = np.where(sentences[j, :] == 0)[0]
191 | end_index = idxs[0] if len(idxs) > 0 else max_length
192 | cap = ' '.join([vocab[w] for w in sentences[j, :end_index]])
193 | captions.append(sentences[j, :end_index])
194 | results.append({'image_id': image_ids[j], 'caption': cap})
195 | # Type: captions (np.array), results (natural language)
196 | return captions, results
197 |
198 | def ngram_single_step(self,time_step,state, words,temp_mask, manner='greedy'):
199 | if time_step != 0:
200 | words = Variable(torch.LongTensor(words), volatile=True)
201 | if self.on_gpu:
202 | words = words.cuda()
203 | # Word embedding
204 | words = self.word_embedding(words.unsqueeze(0)).squeeze(0)
205 |
206 | Temp_mask = Variable(torch.Tensor(temp_mask))
207 | if self.on_gpu:
208 | Temp_mask = Temp_mask.cuda()
209 | Temp_mask = Temp_mask * 100000 - 100000
210 |
211 | # Take a rnn step
212 | logits, new_state = self.forward(words, state)
213 | logits = logits + Temp_mask
214 | # Next words
215 | if manner == 'greedy':
216 | new_words = logits.data.cpu().numpy().argmax(1)
217 | elif manner == 'sample':
218 | # Gumbel argmax trick
219 | if self.on_gpu:
220 | U = torch.cuda.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1)
221 | else:
222 | U = torch.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1)
223 | V = logits.data - torch.log(-U.log())
224 | new_words = V.cpu().numpy().argmax(1)
225 | else:
226 | raise ValueError('Unknown manner: [{}]'.format(manner))
227 |
228 | return new_state, new_words
229 |
230 | def fourgram_inference(self, vocab, image_ids, image, manner='greedy',max_length=16):
231 | # Choose batch-size
232 | self.batch_size = image.shape[0]
233 |
234 |         # Initialization
235 | results = []
236 | captions = []
237 |
238 | # Iteratively generate words
239 | init_word, state = self.initial_state(image)
240 | sentences = []
241 | word = init_word
242 | sentencemask = np.zeros((max_length + 3, self.batch_size), dtype=np.int32)
243 | sentencemask[0:3,:] = 9488
244 | for jj in xrange(max_length):
245 | temp_mask = self.get_four_Mask(sentencemask,jj+3)
246 | state, word = self.ngram_single_step(jj,state, word,temp_mask,manner=manner)
247 | sentencemask[jj+3,:] = word
248 | sentences.append(word)
249 | sentences = np.array(sentences).transpose()
250 |
251 | # Translate indexes to sentences
252 | for j in xrange(sentences.shape[0]):
253 | idxs = np.where(sentences[j, :] == 0)[0]
254 | end_index = idxs[0] if len(idxs) > 0 else max_length
255 | cap = ' '.join([vocab[w] for w in sentences[j, :end_index]])
256 | captions.append(sentences[j, :end_index])
257 | results.append({'image_id': image_ids[j], 'caption': cap})
258 |
259 | # Type: captions (np.array), results (natural language)
260 | return captions, results
261 |
262 | def get_four_Mask(self,sentencemask,index):
263 | tempmask = np.zeros((self.batch_size,self.vocab_size))
264 | for hh in range(self.batch_size):
265 | temp = tuple(list(sentencemask[index-3:index,hh]))
266 | if temp in self.fourgram_mask:
267 | tempmask[hh] = self.fourgram_mask[temp]
268 | else:
269 | tempmask[hh][0] = 1 # END token
270 | return tempmask
271 |
272 |
273 | def trigram_inference(self, vocab, image_ids, image, manner='greedy',max_length=16):
274 | # Choose batch-size
275 | self.batch_size = image.shape[0]
276 |
277 | # Initiazization
278 | results = []
279 | captions = []
280 |
281 | # Iteratively generate words
282 | init_word, state = self.initial_state(image)
283 | sentences = []
284 | word = init_word
285 | sentencemask = np.zeros((max_length + 2, self.batch_size), dtype=np.int32)
286 | sentencemask[0:2,:] = 9488
287 | for jj in xrange(max_length):
288 | temp_mask = self.get_tri_Mask(sentencemask,jj+2)
289 | state, word = self.ngram_single_step(jj,state, word,temp_mask,manner=manner)
290 | sentencemask[jj+2,:] = word
291 | sentences.append(word)
292 | sentences = np.array(sentences).transpose()
293 |
294 | # Translate indexes to sentences
295 | for j in xrange(sentences.shape[0]):
296 | idxs = np.where(sentences[j, :] == 0)[0]
297 | end_index = idxs[0] if len(idxs) > 0 else max_length
298 | cap = ' '.join([vocab[w] for w in sentences[j, :end_index]])
299 | captions.append(sentences[j, :end_index])
300 | results.append({'image_id': image_ids[j], 'caption': cap})
301 |
302 | # Type: captions (np.array), results (natural language)
303 | return captions, results
304 |
305 | def get_tri_Mask(self,sentencemask,index):
306 | tempmask = np.zeros((self.batch_size,self.vocab_size))
307 | for hh in range(self.batch_size):
308 | temp = tuple(list(sentencemask[index-2:index,hh]))
309 | if temp in self.trigram_mask:
310 | tempmask[hh] = self.trigram_mask[temp]
311 | else:
312 | tempmask[hh][0] = 1 # END token
313 | return tempmask
314 |
315 | def save(self, file_path):
316 | with open(file_path, 'wb') as f:
317 | torch.save(self.state_dict(), f)
318 |
319 | def load(self, file_path):
320 | with open(file_path, 'rb') as f:
321 | self.load_state_dict(torch.load(f))
322 |
323 |
--------------------------------------------------------------------------------
/caption_model/rnnlm.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import numpy as np
6 |
7 | class LM(nn.Module):
8 | def __init__(self, batch_size, hidden_size,
9 | vocab_size, word_embed_size,
10 | lr, on_gpu=True):
11 | super(LM, self).__init__()
12 | self.lr = lr
13 | self.vocab_size = vocab_size
14 | self.word_embed_size = word_embed_size
15 | self.hidden_size = hidden_size
16 | self.batch_size = batch_size
17 | self.on_gpu = on_gpu
18 |
19 | # word embedding layer
20 | self.word_embedding_layer = nn.Embedding(vocab_size,word_embed_size)
21 |
22 | # language model LSTM
23 | self.rnn = nn.LSTMCell(word_embed_size,hidden_size)
24 |
25 | # predict layer
26 | self.predict_layer = nn.Linear(hidden_size,vocab_size)
27 |
28 | # Optimizer
29 | self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
30 |
31 |         self.onehot = torch.eye(vocab_size)
32 | if self.on_gpu:
33 | self.onehot = self.onehot.cuda()
34 | # Move to gpu if necessary
35 | if self.on_gpu:
36 | self.cuda()
37 | def init_state(self):
38 | h = Variable(torch.zeros(self.batch_size, self.hidden_size))
39 | c = Variable(torch.zeros(self.batch_size, self.hidden_size))
40 | if self.on_gpu:
41 | h = h.cuda()
42 | c = c.cuda()
43 | return h,c
44 |
45 | def forward(self, word_emb, state):
46 | # Get states
47 | h_tm1, c_tm1 = state
48 | # RNN input
49 | h_t, c_t = self.rnn(word_emb, (h_tm1, c_tm1))
50 |         # Next word's logits
51 | logits = self.predict_layer(h_t)
52 | return logits, (h_t, c_t)
53 |
54 | def train_on_batch(self,sentence, mask, reward):
55 | # Convert numpy to torch
56 | sentence = Variable(torch.LongTensor(sentence.tolist()))
57 | mask = torch.Tensor(mask)
58 | reward = torch.Tensor(reward)
59 | T = sentence.size()[0] - 1
60 |         # If using gpu
61 |         if self.on_gpu:
62 |             sentence = sentence.cuda()
63 |             mask = mask.cuda()
64 |             reward = reward.cuda()
65 |
66 | # Initial state of RNN
67 | state = self.init_state()
68 | # Word embedding for input sequence
69 | inputs = self.word_embedding_layer(sentence[:-1, :])
70 | # Recurrent computation
71 | logits = []
72 | for i in xrange(T):
73 | word = inputs[i, :, :]
74 | logit, state = self.forward(word, state)
75 | logits.append(logit.unsqueeze(0))
76 | logits = torch.cat(logits, 0) # (T, batch_size, vocab_size)
77 | logits = logits.resize(T*self.batch_size, self.vocab_size)
78 | # Next word's distribution
79 | prob = F.softmax(logits).data
80 | # Ground-truth
81 | targets = sentence.data[1:, :].view(T*self.batch_size)
82 | gt_prob = self.onehot.index_select(0, targets)
83 | # Gradients
84 | logit_grad = (prob - gt_prob).view(T, self.batch_size, self.vocab_size)
85 | logit_grad = logit_grad * mask.view(T, self.batch_size, 1).expand_as(logit_grad)
86 | logit_grad = logit_grad * reward.view(1, self.batch_size, 1).expand_as(logit_grad)
87 | logit_grad = logit_grad / mask.sum(0).view(1, self.batch_size, 1).expand_as(logit_grad)
88 | logit_grad = logit_grad / self.batch_size
89 | logit_grad = logit_grad.view(T*self.batch_size, self.vocab_size)
90 |
91 | # Gradient descent
92 | self.optimizer.zero_grad()
93 | logits.backward(gradient=logit_grad)
94 | self.optimizer.step()
95 | targets = targets.cpu().numpy()
96 | loss = - np.log(prob[np.arange(T * self.batch_size), targets])
97 | return loss.mean().numpy()
98 |
99 | def test_on_batch(self,sentence, mask):
100 | sentence = Variable(torch.LongTensor(sentence.tolist()))
101 | mask = torch.Tensor(mask)
102 | T = sentence.size()[0] - 1
103 | # If using gpu
104 | if self.on_gpu:
105 | sentence = sentence.cuda()
106 | mask = mask.cuda()
107 | # Initial state of RNN
108 | state = self.init_state()
109 | # Word embedding for input sequence
110 | inputs = self.word_embedding_layer(sentence[:-1, :])
111 | # Recurrent computation
112 | logits = []
113 | for i in xrange(T):
114 | word = inputs[i, :, :]
115 | logit, state = self.forward(word, state)
116 | logits.append(logit.unsqueeze(0))
117 | logits = torch.cat(logits, 0) # (T, batch_size, vocab_size)
118 | logits = logits.resize(T * self.batch_size, self.vocab_size)
119 | # Next word's distribution
120 | prob = F.softmax(logits).data
121 | prob = prob.view(T,self.batch_size,self.vocab_size)
122 | # Ground-truth
123 | return prob
124 |
125 | def single_step_prob(self,word,state):
126 | word = Variable(torch.LongTensor(word.tolist()))
127 | if self.on_gpu:
128 | word = word.cuda()
129 | word_emb = self.word_embedding_layer(word)
130 | logit, state2 = self.forward(word_emb, state) # logit : (batch_size, vocab_size)
131 | prob = F.softmax(logit).data # (batch_size, vocab_size)
132 | return prob,state2
133 |
134 | def save(self, file_path):
135 | with open(file_path, 'wb') as f:
136 | torch.save(self.state_dict(), f)
137 |
138 | def load(self, file_path):
139 | with open(file_path, 'rb') as f:
140 | self.load_state_dict(torch.load(f))
141 |
--------------------------------------------------------------------------------
/cider/README.md:
--------------------------------------------------------------------------------
1 | Consensus-based Image Description Evaluation (CIDEr Code)
2 | ===================
3 |
4 | Evaluation code for CIDEr metric. Provides CIDEr as well as
5 | CIDEr-D (CIDEr Defended) which is more robust to gaming effects.
6 |
7 | ## Important Note ##
8 | CIDEr by default (with the idf parameter set to "corpus" mode) computes IDF values using the reference sentences provided. Thus, the CIDEr score for a reference dataset with only 1 image will be zero. When evaluating using one (or a few) images, set idf to "coco-val-df" instead, which uses IDF from the MSCOCO Validation Dataset for reliable results.
9 |
10 | ## Requirements ##
11 | - java 1.8.0
12 | - python 2.7
13 |
14 | For running the IPython notebook file, update your IPython to [Jupyter](https://jupyter.org/).
15 |
16 | ## Files ##
17 | ./
18 | - cidereval.py (demo script)
19 |
20 | ./PyDataFormat
21 | - loadData.py (load the json files for references and candidates)
22 |
23 | - {$result\_file}.json (file with the CIDEr and CIDEr-D scores)
24 |
25 | ./pycocoevalcap: The folder where all evaluation codes are stored.
26 | - evals.py: Performs tokenization and runs both the metrics
27 | - tokenizer: Python wrapper of Stanford CoreNLP PTBTokenizer
28 | - cider: CIDEr evaluation codes
29 | - ciderD: CIDEr-D evaluation codes
30 |
31 | ## Instructions ##
32 | 1. Edit the params.json file to contain path to reference and candidate json files, and the result file where the scores are stored\*.
33 | 2. Set the "idf" value in params.json to "corpus" if not evaluating on a single image/instance. Set the "idf" value to "coco-val-df" if evaluating on a single image. In this case IDF values from the MSCOCO dataset are used. If using some other corpus, get the document frequencies into a similar format as "coco-val-df", and put them in the data/ folder as a pickle file. Then set mode to the name of the document frequency file (without the '.p' extension).
34 | 3. Sample json reference and candidate files are pascal50S.json and pascal_candsB.json
35 | 4. CIDEr scores are stored in "scores" variable: scores['CIDEr'] -> CIDEr scores, scores['CIDErD'] -> CIDEr-D scores
36 |
37 | *Even when evaluating with independent candidate/references (for eg. when using "coco-val-df"), put multiple candidate and reference entries into the same json files. This is much faster than having separate candidate and reference files and calling the evaluation code separately on each candidate/reference file.
38 | ## References ##
39 |
40 | - PTBTokenizer: We use the [Stanford Tokenizer](http://nlp.stanford.edu/software/tokenizer.shtml) which is included in [Stanford CoreNLP 3.4.1](http://nlp.stanford.edu/software/corenlp.shtml).
41 | - CIDEr: [CIDEr: Consensus-based Image Description Evaluation] (http://arxiv.org/pdf/1411.5726.pdf)
42 |
43 | ## Developers ##
44 | - Ramakrishna Vedantam (Virginia Tech)
45 |
46 | ## Acknowledgments ##
47 | - MS COCO Caption Evaluation Team
48 |
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | data
--------------------------------------------------------------------------------
/data/karpathy/__init__.py:
--------------------------------------------------------------------------------
1 | '''Image Caption with Prior'''
--------------------------------------------------------------------------------
/dataloader.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import json
6 | import h5py
7 | import os
8 | import numpy as np
9 | import random
10 |
11 | import torch
12 | import torch.utils.data as Data
13 |
14 | import multiprocessing
15 |
16 |
17 | def get_npy_data(ix, fc_file, att_file, use_att):
18 | if use_att == True:
19 | return (np.load(fc_file), np.load(att_file)['feat'], ix)
20 | else:
21 | return (np.load(fc_file), np.zeros((1, 1, 1)), ix)
22 |
23 |
24 | class KKDataLoader(Data.Dataset):
25 |
26 | def reset_iterator(self, split):
27 | del self._prefetch_process[split]
28 | self._prefetch_process[split] = BlobFetcher(split, self, split == 'train')
29 | self.iterators[split] = 0
30 |
31 | def get_vocab_size(self):
32 | return self.vocab_size
33 |
34 | def get_vocab(self):
35 | return self.ix_to_word
36 |
37 | def get_seq_length(self):
38 | return self.seq_length
39 |
40 | def __init__(self, opt):
41 | self.opt = opt
42 | self.batch_size = self.opt.batch_size
43 | self.seq_per_img = opt.seq_per_img
44 | self.use_att = getattr(opt, 'use_att', True)
45 |
46 | # load the json file which contains additional information about the dataset
47 | print('DataLoader loading json file: ', opt.input_json)
48 | self.info = json.load(open(self.opt.input_json))
49 | self.ix_to_word = self.info['ix_to_word']
50 | self.vocab_size = len(self.ix_to_word)
51 | print('vocab size is ', self.vocab_size)
52 |
53 | # open the hdf5 file
54 | print('DataLoader loading h5 file: ', opt.input_fc_dir, opt.input_att_dir, opt.input_label_h5)
55 | self.h5_label_file = h5py.File(self.opt.input_label_h5, 'r', driver='core')
56 |
57 | self.input_fc_dir = self.opt.input_fc_dir
58 | self.input_att_dir = self.opt.input_att_dir
59 |
60 | # load in the sequence data
61 | seq_size = self.h5_label_file['labels'].shape
62 | self.seq_length = seq_size[1]
63 | print('max sequence length in data is', self.seq_length)
64 | # load the pointers in full to RAM (should be small enough)
65 | self.label_start_ix = self.h5_label_file['label_start_ix'][:]
66 | self.label_end_ix = self.h5_label_file['label_end_ix'][:]
67 |
68 | self.num_images = self.label_start_ix.shape[0]
69 | print('read %d image features' % (self.num_images))
70 |
71 | # separate out indexes for each of the provided splits
72 | self.split_ix = {'train': [], 'val': [], 'test': []}
73 | for ix in range(len(self.info['images'])):
74 | img = self.info['images'][ix]
75 | if img['split'] == 'train':
76 | self.split_ix['train'].append(ix)
77 | elif img['split'] == 'val':
78 | self.split_ix['val'].append(ix)
79 | elif img['split'] == 'test':
80 | self.split_ix['test'].append(ix)
81 | elif opt.train_only == 0:
82 | self.split_ix['train'].append(ix)
83 |
84 | print('assigned %d images to split train' % len(self.split_ix['train']))
85 | print('assigned %d images to split val' % len(self.split_ix['val']))
86 | print('assigned %d images to split test' % len(self.split_ix['test']))
87 |
88 | self.iterators = {'train': 0, 'val': 0, 'test': 0}
89 |
90 | self._prefetch_process = {} # The three prefetch process
91 | for split in self.iterators.keys():
92 | self._prefetch_process[split] = BlobFetcher(split, self, split == 'train')
93 |         # Terminate the child processes when the parent exits
94 |
95 | def cleanup():
96 | print('Terminating BlobFetcher')
97 | for split in self.iterators.keys():
98 | del self._prefetch_process[split]
99 |
100 | import atexit
101 | atexit.register(cleanup)
102 |
103 | def get_batch(self, split, batch_size=None, seq_per_img=None):
104 | batch_size = batch_size or self.batch_size
105 | seq_per_img = seq_per_img or self.seq_per_img
106 |
107 | fc_batch = [] # np.ndarray((batch_size * seq_per_img, self.opt.fc_feat_size), dtype = 'float32')
108 | att_batch = [] # np.ndarray((batch_size * seq_per_img, 14, 14, self.opt.att_feat_size), dtype = 'float32')
109 | label_batch = np.zeros([batch_size * seq_per_img, self.seq_length + 2], dtype='int')
110 | mask_batch = np.zeros([batch_size * seq_per_img, self.seq_length + 2], dtype='float32')
111 |
112 | wrapped = False
113 |
114 | infos = []
115 | gts = []
116 |
117 | for i in range(batch_size):
118 | import time
119 | t_start = time.time()
120 | # fetch image
121 | tmp_fc, tmp_att, \
122 | ix, tmp_wrapped = self._prefetch_process[split].get()
123 | fc_batch += [tmp_fc] * seq_per_img
124 | att_batch += [tmp_att] * seq_per_img
125 |
126 | # fetch the sequence labels
127 | ix1 = self.label_start_ix[ix] - 1 # label_start_ix starts from 1
128 | ix2 = self.label_end_ix[ix] - 1
129 | ncap = ix2 - ix1 + 1 # number of captions available for this image
130 | assert ncap > 0, 'an image does not have any label. this can be handled but right now isn\'t'
131 |
132 | if ncap < seq_per_img:
133 | # we need to subsample (with replacement)
134 | seq = np.zeros([seq_per_img, self.seq_length], dtype='int')
135 | for q in range(seq_per_img):
136 | ixl = random.randint(ix1, ix2)
137 | seq[q, :] = self.h5_label_file['labels'][ixl, :self.seq_length]
138 | else:
139 | ixl = random.randint(ix1, ix2 - seq_per_img + 1)
140 | seq = self.h5_label_file['labels'][ixl: ixl + seq_per_img, :self.seq_length]
141 |
142 | label_batch[i * seq_per_img: (i + 1) * seq_per_img, 1: self.seq_length + 1] = seq
143 |
144 | if tmp_wrapped:
145 | wrapped = True
146 |
147 | # Used for reward evaluation
148 | gts.append(self.h5_label_file['labels'][self.label_start_ix[ix] - 1: self.label_end_ix[ix]])
149 |
150 | # record associated info as well
151 | info_dict = {}
152 | info_dict['ix'] = ix
153 | info_dict['id'] = self.info['images'][ix]['id']
154 | info_dict['file_path'] = self.info['images'][ix]['file_path']
155 | infos.append(info_dict)
156 | # print(i, time.time() - t_start)
157 |
158 | # generate mask
159 | t_start = time.time()
160 | nonzeros = np.array(list(map(lambda x: (x != 0).sum() + 2, label_batch)))
161 | for ix, row in enumerate(mask_batch):
162 | row[:nonzeros[ix]] = 1
163 | # print('mask', time.time() - t_start)
164 |
165 | datas = {}
166 | datas['fc_feats'] = np.stack(fc_batch)
167 | datas['att_feats'] = np.stack(att_batch)
168 | datas['labels'] = label_batch
169 | datas['gts'] = gts
170 | datas['masks'] = mask_batch
171 | datas['bounds'] = {'it_pos_now': self.iterators[split], 'it_max': len(self.split_ix[split]), 'wrapped': wrapped}
172 | datas['infos'] = infos
173 |
174 | return datas
175 |
176 |     # It's not coherent to make DataLoader a subclass of Dataset, but essentially, we only need to implement the following two functions,
177 |     # so that the torch.utils.data.DataLoader can load the data according to the index.
178 |     # However, it's a minimal change to switch to pytorch data loading.
179 | def __getitem__(self, index):
180 | """This function returns a tuple that is further passed to collate_fn
181 | """
182 | ix = index # self.split_ix[index]
183 | return get_npy_data(ix, \
184 | os.path.join(self.input_fc_dir, str(self.info['images'][ix]['id']) + '.npy'),
185 | os.path.join(self.input_att_dir, str(self.info['images'][ix]['id']) + '.npz'),
186 | self.use_att
187 | )
188 |
189 | def __len__(self):
190 | return len(self.info['images'])
191 | class SubsetSampler(torch.utils.data.sampler.Sampler):
192 |     r"""Samples elements sequentially, in the given order, from a list of indices, without replacement.
193 | Arguments:
194 | indices (list): a list of indices
195 | """
196 |
197 | def __init__(self, indices):
198 | self.indices = indices
199 |
200 | def __iter__(self):
201 | return (self.indices[i] for i in range(len(self.indices)))
202 |
203 | def __len__(self):
204 | return len(self.indices)
205 |
206 |
207 | class BlobFetcher():
208 | """Experimental class for prefetching blobs in a separate process."""
209 |
210 | def __init__(self, split, dataloader, if_shuffle=False):
211 | """
212 | db is a list of tuples containing: imcrop_name, caption, bbox_feat of gt box, imname
213 | """
214 | self.split = split
215 | self.dataloader = dataloader
216 | self.if_shuffle = if_shuffle
217 |
218 | # Add more in the queue
219 | # def ReSet(self):
220 | # """
221 | # Two cases:
222 | # 1. not hasattr(self, 'split_loader'): Resume from previous training. Create the dataset given the saved split_ix and iterator
223 | # 2. wrapped: a new epoch, the split_ix and iterator have been updated in the get_minibatch_inds already.
224 | # """
225 | # # batch_size is 0, the merge is done in DataLoader class
226 | # self.split_loader = iter(Data.DataLoader(dataset=self.dataloader,
227 | # batch_size=1,
228 | # shuffle=False,
229 | # num_workers=4))
230 | def ReSet(self):
231 | """
232 | Two cases for this function to be triggered:
233 | 1. not hasattr(self, 'split_loader'): Resume from previous training. Create the dataset given the saved split_ix and iterator
234 | 2. wrapped: a new epoch, the split_ix and iterator have been updated in the get_minibatch_inds already.
235 | """
236 | # batch_size is 1, the merge is done in DataLoader class
237 | self.split_loader = iter(Data.DataLoader(dataset=self.dataloader,
238 | batch_size=1,
239 | sampler=SubsetSampler(self.dataloader.split_ix[self.split][
240 | self.dataloader.iterators[self.split]:]),
241 | shuffle=False,
242 | pin_memory=True,
243 | num_workers=4, # 4 is usually enough
244 | collate_fn=lambda x: x[0]))
245 |
246 | def _get_next_minibatch_inds(self):
247 | max_index = len(self.dataloader.split_ix[self.split])
248 | wrapped = False
249 |
250 | ri = self.dataloader.iterators[self.split]
251 | ix = self.dataloader.split_ix[self.split][ri]
252 |
253 | ri_next = ri + 1
254 | if ri_next >= max_index:
255 | ri_next = 0
256 | if self.if_shuffle:
257 | random.shuffle(self.dataloader.split_ix[self.split])
258 | wrapped = True
259 | self.dataloader.iterators[self.split] = ri_next
260 |
261 | return ix, wrapped
262 |
263 | def get(self):
264 | if not hasattr(self, 'split_loader'):
265 | self.ReSet()
266 |
267 | ix, wrapped = self._get_next_minibatch_inds()
268 | tmp = self.split_loader.next()
269 | if wrapped:
270 | self.ReSet()
271 |
272 | assert tmp[2] == ix, "ix not equal"
273 |
274 | return tmp + [wrapped]
275 |
--------------------------------------------------------------------------------
/get_ngram.py:
--------------------------------------------------------------------------------
1 | import json
2 | from dataloader import *
3 | from tools import remove_badEnding
4 | import pickle
5 | coco = json.load(open('data/cocotalk.json'))
6 | ix_to_word = coco['ix_to_word']
7 |
8 | vocab = {}
9 | for i in ix_to_word:
10 | vocab[ix_to_word[i]] = int(i)
11 |
12 |
13 | fourgram = {}
14 | trigram = {}
15 | vocab_size = 9488 # 0:'#END' 9487:'UNK' 9488:'#BEGIN'
16 |
17 | train_data = json.load(open('data/captions_train.json'))
18 | for all_cap in train_data['captions']:
19 | for cap in all_cap:
20 | tokens = cap.split(' ')
21 | L = len(tokens)
22 | index = []
23 | for i in range(len(tokens)):
24 | if tokens[i] in vocab:
25 | index.append(vocab[tokens[i]])
26 | else:
27 | index.append(vocab_size-1)
28 | index += [0] * (16-L)
29 | fourgram_seq = [vocab_size,vocab_size,vocab_size] + index
30 | trigram_seq = [vocab_size,vocab_size] + index
31 | for j in range(16):
32 | fourgram_tuple = tuple(fourgram_seq[j:j+3])
33 | trigram_tuple = tuple(trigram_seq[j:j+2])
34 | if fourgram_tuple not in fourgram:
35 | fourgram[fourgram_tuple] = {}
36 | fourgram[fourgram_tuple][index[j]] = 1
37 | else:
38 | if index[j] not in fourgram[fourgram_tuple]:
39 | fourgram[fourgram_tuple][index[j]] = 1
40 | else:
41 | fourgram[fourgram_tuple][index[j]] += 1
42 |
43 | if trigram_tuple not in trigram:
44 | trigram[trigram_tuple] = {}
45 | trigram[trigram_tuple][index[j]] = 1
46 | else:
47 | if index[j] not in trigram[trigram_tuple]:
48 | trigram[trigram_tuple][index[j]] = 1
49 | else:
50 | trigram[trigram_tuple][index[j]] += 1
51 |
52 | f = open('data/fourgram.pkl','w')
53 | pickle.dump(fourgram,f)
54 | f.close()
55 |
56 | f = open('data/trigram.pkl','w')
57 | pickle.dump(trigram,f)
58 | f.close()
59 |
--------------------------------------------------------------------------------
/images/badending.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/images/badending.png
--------------------------------------------------------------------------------
/images/fourgram_att.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/images/fourgram_att.png
--------------------------------------------------------------------------------
/images/rnn_att.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/images/rnn_att.png
--------------------------------------------------------------------------------
/images/rnn_fc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/images/rnn_fc.png
--------------------------------------------------------------------------------
/misc/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/__init__.pyc
--------------------------------------------------------------------------------
/misc/ngram_reward.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import numpy as np
6 | import time
7 | import misc.utils as utils
8 | from collections import OrderedDict
9 | import torch
10 | from torch.autograd import Variable
11 |
12 | import sys
13 |
14 | sys.path.append("cider")
15 | from pyciderevalcap.ciderD.ciderD import CiderD
16 |
17 | CiderD_scorer = None
18 |
19 |
20 | # CiderD_scorer = CiderD(df='corpus')
21 |
22 | def init_cider_scorer(cached_tokens):
23 | global CiderD_scorer
24 | CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
25 |
26 |
27 | def array_to_str(arr):
28 | out = ''
29 | for i in range(len(arr)):
30 | out += str(arr[i]) + ' '
31 | if arr[i] == 0:
32 | break
33 | return out.strip()
34 |
35 |
36 | def get_self_critical_reward(model, fc_feats, att_feats, data, gen_result):
37 | batch_size = gen_result.size(0) # batch_size = sample_size * seq_per_img
38 | seq_per_img = batch_size // len(data['gts'])
39 |
40 | # get greedy decoding baseline
41 | greedy_res, _ = model.ngram_sample(Variable(fc_feats.data, volatile=True), Variable(att_feats.data, volatile=True))
42 |
43 | res = OrderedDict()
44 |
45 | gen_result = gen_result.cpu().numpy()
46 | greedy_res = greedy_res.cpu().numpy()
47 | for i in range(batch_size):
48 | res[i] = [array_to_str(gen_result[i])]
49 | for i in range(batch_size):
50 | res[batch_size + i] = [array_to_str(greedy_res[i])]
51 |
52 | gts = OrderedDict()
53 | for i in range(len(data['gts'])):
54 | gts[i] = [array_to_str(data['gts'][i][j]) for j in range(len(data['gts'][i]))]
55 |
56 | # _, scores = Bleu(4).compute_score(gts, res)
57 | # scores = np.array(scores[3])
58 | res = [{'image_id': i, 'caption': res[i]} for i in range(2 * batch_size)]
59 | gts = {i: gts[i % batch_size // seq_per_img] for i in range(2 * batch_size)}
60 | _, scores = CiderD_scorer.compute_score(gts, res)
61 | print('Cider scores:', _)
62 |
63 | scores = scores[:batch_size] - scores[batch_size:]
64 |
65 | rewards = np.repeat(scores[:, np.newaxis], gen_result.shape[1], 1)
66 |
67 | return rewards
--------------------------------------------------------------------------------
/misc/ngram_reward.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/ngram_reward.pyc
--------------------------------------------------------------------------------
/misc/ngram_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 | bad_endding = ['with','in','on','of','a','at','to','for','an','this','his','her','that']
5 | import collections
6 | import torch
7 | import torch.nn as nn
8 | from torch.autograd import Variable
9 | import numpy as np
10 |
11 | def if_use_att(caption_model):
12 | # Decide if load attention feature according to caption model
13 | if caption_model in ['show_tell', 'all_img', 'fc']:
14 | return False
15 | return True
16 |
17 | def delete_badding(txt):
18 | if txt.endswith('with'):
19 |         txt = txt[0:len(txt)-5]
20 |     return txt
21 |
22 | # Input: seq, N*D numpy array, with element 0 .. vocab_size. 0 is END token.
23 | def decode_sequence(ix_to_word, seq):
24 | N, D = seq.size()
25 | out = []
26 | for i in range(N):
27 | txt = ''
28 | for j in range(D):
29 | ix = seq[i,j]
30 | if ix > 0 :
31 | if j >= 1:
32 | txt = txt + ' '
33 | txt = txt + ix_to_word[str(ix)]
34 | else:
35 | break
36 | out.append(txt)
37 | return out
38 | def delete_decode_sequence(ix_to_word, seq):
39 | N, D = seq.size()
40 | out = []
41 | for i in range(N):
42 | flag = 0
43 | for j in range(D):
44 | ix = seq[i,D-1-j]
45 | if ix > 0 and ix_to_word[str(ix)] not in bad_endding:
46 | flag = D-j
47 | break
48 | txt = ' '.join([ix_to_word[str(ix)] for ix in seq[i,0:flag]])
49 | out.append(txt)
50 | return out
51 | def to_contiguous(tensor):
52 | if tensor.is_contiguous():
53 | return tensor
54 | else:
55 | return tensor.contiguous()
56 |
57 | class RewardCriterion(nn.Module):
58 | def __init__(self):
59 | super(RewardCriterion, self).__init__()
60 |
61 | def forward(self, input, seq, reward):
62 | input = to_contiguous(input).view(-1)
63 | reward = to_contiguous(reward).view(-1)
64 | mask = (seq>0).float()
65 | mask = to_contiguous(torch.cat([mask.new(mask.size(0), 1).fill_(1), mask[:, :-1]], 1)).view(-1)
66 | output = - input * reward * Variable(mask)
67 | output = torch.sum(output) / torch.sum(mask)
68 |
69 | return output
70 | class LanguageModelCriterion(nn.Module):
71 | def __init__(self):
72 | super(LanguageModelCriterion, self).__init__()
73 |
74 | def forward(self, input, target, mask):
75 | # truncate to the same size
76 | target = target[:, :input.size(1)]
77 | mask = mask[:, :input.size(1)]
78 | input = to_contiguous(input).view(-1, input.size(2))
79 | target = to_contiguous(target).view(-1, 1)
80 | mask = to_contiguous(mask).view(-1, 1)
81 | output = - input.gather(1, target) * mask
82 | output = torch.sum(output) / torch.sum(mask)
83 |
84 | return output
85 |
86 | def set_lr(optimizer, lr):
87 | for group in optimizer.param_groups:
88 | group['lr'] = lr
89 |
90 | def clip_gradient(optimizer, grad_clip):
91 | for group in optimizer.param_groups:
92 | for param in group['params']:
93 | param.grad.data.clamp_(-grad_clip, grad_clip)
--------------------------------------------------------------------------------
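A hedged usage sketch of RewardCriterion above: input holds the log-probability of each sampled word, seq the sampled word ids (0 marks END), and reward the tiled advantage produced by the reward module; every value below is made up for illustration.

import torch
from torch.autograd import Variable
from misc.ngram_utils import RewardCriterion

crit = RewardCriterion()
logprobs = Variable(torch.randn(2, 4))            # stand-in for per-word log-probs
seq = torch.LongTensor([[5, 3, 0, 0],             # sampled word ids, 0 = END
                        [7, 2, 9, 0]])
reward = Variable(torch.ones(2, 4) * 0.5)         # tiled advantage
loss = crit(logprobs, seq, reward)                # masked policy-gradient loss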
/misc/ngram_utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/ngram_utils.pyc
--------------------------------------------------------------------------------
/misc/resnet.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import math
3 | import torch.utils.model_zoo as model_zoo
4 |
5 |
6 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
7 | 'resnet152']
8 |
9 |
10 | model_urls = {
11 | 'resnet18': 'https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth',
12 | 'resnet34': 'https://s3.amazonaws.com/pytorch/models/resnet34-333f7ec4.pth',
13 | 'resnet50': 'https://s3.amazonaws.com/pytorch/models/resnet50-19c8e357.pth',
14 | 'resnet101': 'https://s3.amazonaws.com/pytorch/models/resnet101-5d3b4d8f.pth',
15 | 'resnet152': 'https://s3.amazonaws.com/pytorch/models/resnet152-b121ed2d.pth',
16 | }
17 |
18 |
19 | def conv3x3(in_planes, out_planes, stride=1):
20 | "3x3 convolution with padding"
21 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
22 | padding=1, bias=False)
23 |
24 |
25 | class BasicBlock(nn.Module):
26 | expansion = 1
27 |
28 | def __init__(self, inplanes, planes, stride=1, downsample=None):
29 | super(BasicBlock, self).__init__()
30 | self.conv1 = conv3x3(inplanes, planes, stride)
31 | self.bn1 = nn.BatchNorm2d(planes)
32 | self.relu = nn.ReLU(inplace=True)
33 | self.conv2 = conv3x3(planes, planes)
34 | self.bn2 = nn.BatchNorm2d(planes)
35 | self.downsample = downsample
36 | self.stride = stride
37 |
38 | def forward(self, x):
39 | residual = x
40 |
41 | out = self.conv1(x)
42 | out = self.bn1(out)
43 | out = self.relu(out)
44 |
45 | out = self.conv2(out)
46 | out = self.bn2(out)
47 |
48 | if self.downsample is not None:
49 | residual = self.downsample(x)
50 |
51 | out += residual
52 | out = self.relu(out)
53 |
54 | return out
55 |
56 |
57 | class Bottleneck(nn.Module):
58 | expansion = 4
59 |
60 | def __init__(self, inplanes, planes, stride=1, downsample=None):
61 | super(Bottleneck, self).__init__()
62 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False) # change
63 | self.bn1 = nn.BatchNorm2d(planes)
64 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, # change
65 | padding=1, bias=False)
66 | self.bn2 = nn.BatchNorm2d(planes)
67 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
68 | self.bn3 = nn.BatchNorm2d(planes * 4)
69 | self.relu = nn.ReLU(inplace=True)
70 | self.downsample = downsample
71 | self.stride = stride
72 |
73 | def forward(self, x):
74 | residual = x
75 |
76 | out = self.conv1(x)
77 | out = self.bn1(out)
78 | out = self.relu(out)
79 |
80 | out = self.conv2(out)
81 | out = self.bn2(out)
82 | out = self.relu(out)
83 |
84 | out = self.conv3(out)
85 | out = self.bn3(out)
86 |
87 | if self.downsample is not None:
88 | residual = self.downsample(x)
89 |
90 | out += residual
91 | out = self.relu(out)
92 |
93 | return out
94 |
95 |
96 | class ResNet(nn.Module):
97 | def __init__(self, block, layers, num_classes=1000):
98 | self.inplanes = 64
99 | super(ResNet, self).__init__()
100 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
101 | bias=False)
102 | self.bn1 = nn.BatchNorm2d(64)
103 | self.relu = nn.ReLU(inplace=True)
104 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True) # change
105 | self.layer1 = self._make_layer(block, 64, layers[0])
106 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
107 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
108 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
109 | self.avgpool = nn.AvgPool2d(7)
110 | self.fc = nn.Linear(512 * block.expansion, num_classes)
111 |
112 | for m in self.modules():
113 | if isinstance(m, nn.Conv2d):
114 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
115 | m.weight.data.normal_(0, math.sqrt(2. / n))
116 | elif isinstance(m, nn.BatchNorm2d):
117 | m.weight.data.fill_(1)
118 | m.bias.data.zero_()
119 |
120 | def _make_layer(self, block, planes, blocks, stride=1):
121 | downsample = None
122 | if stride != 1 or self.inplanes != planes * block.expansion:
123 | downsample = nn.Sequential(
124 | nn.Conv2d(self.inplanes, planes * block.expansion,
125 | kernel_size=1, stride=stride, bias=False),
126 | nn.BatchNorm2d(planes * block.expansion),
127 | )
128 |
129 | layers = []
130 | layers.append(block(self.inplanes, planes, stride, downsample))
131 | self.inplanes = planes * block.expansion
132 | for i in range(1, blocks):
133 | layers.append(block(self.inplanes, planes))
134 |
135 | return nn.Sequential(*layers)
136 |
137 | def forward(self, x):
138 | x = self.conv1(x)
139 | x = self.bn1(x)
140 | x = self.relu(x)
141 | x = self.maxpool(x)
142 |
143 | x = self.layer1(x)
144 | x = self.layer2(x)
145 | x = self.layer3(x)
146 | x = self.layer4(x)
147 |
148 | x = self.avgpool(x)
149 | x = x.view(x.size(0), -1)
150 | x = self.fc(x)
151 |
152 | return x
153 |
154 |
155 | def resnet18(pretrained=False):
156 | """Constructs a ResNet-18 model.
157 |
158 | Args:
159 | pretrained (bool): If True, returns a model pre-trained on ImageNet
160 | """
161 | model = ResNet(BasicBlock, [2, 2, 2, 2])
162 | if pretrained:
163 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
164 | return model
165 |
166 |
167 | def resnet34(pretrained=False):
168 | """Constructs a ResNet-34 model.
169 |
170 | Args:
171 | pretrained (bool): If True, returns a model pre-trained on ImageNet
172 | """
173 | model = ResNet(BasicBlock, [3, 4, 6, 3])
174 | if pretrained:
175 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
176 | return model
177 |
178 |
179 | def resnet50(pretrained=False):
180 | """Constructs a ResNet-50 model.
181 |
182 | Args:
183 | pretrained (bool): If True, returns a model pre-trained on ImageNet
184 | """
185 | model = ResNet(Bottleneck, [3, 4, 6, 3])
186 | if pretrained:
187 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
188 | return model
189 |
190 |
191 | def resnet101(pretrained=False):
192 | """Constructs a ResNet-101 model.
193 |
194 | Args:
195 | pretrained (bool): If True, returns a model pre-trained on ImageNet
196 | """
197 | model = ResNet(Bottleneck, [3, 4, 23, 3])
198 | if pretrained:
199 | model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
200 | return model
201 |
202 |
203 | def resnet152(pretrained=False):
204 | """Constructs a ResNet-152 model.
205 |
206 | Args:
207 | pretrained (bool): If True, returns a model pre-trained on ImageNet
208 | """
209 | model = ResNet(Bottleneck, [3, 8, 36, 3])
210 | if pretrained:
211 | model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
212 | return model
--------------------------------------------------------------------------------
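Hedged sketch of how these constructors are used elsewhere in this repo: scripts/prepro_feats.py builds the architecture and loads weights from a local file rather than the model-zoo URLs above (the path below mirrors its defaults and is illustrative).

import torch
import misc.resnet as resnet

net = resnet.resnet101()        # architecture only, no pretrained weights
net.load_state_dict(torch.load('./data/imagenet_weights/resnet101.pth'))
net.eval()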
/misc/resnet.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/resnet.pyc
--------------------------------------------------------------------------------
/misc/resnet_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.autograd import Variable
4 | import torch.nn.functional as F
5 |
6 | class myResnet(nn.Module):
7 | def __init__(self, resnet):
8 | super(myResnet, self).__init__()
9 | self.resnet = resnet
10 |
11 | def forward(self, img, att_size=14):
12 | x = img.unsqueeze(0)
13 |
14 | x = self.resnet.conv1(x)
15 | x = self.resnet.bn1(x)
16 | x = self.resnet.relu(x)
17 | x = self.resnet.maxpool(x)
18 |
19 | x = self.resnet.layer1(x)
20 | x = self.resnet.layer2(x)
21 | x = self.resnet.layer3(x)
22 | x = self.resnet.layer4(x)
23 |
24 | fc = x.mean(3).mean(2).squeeze()
25 | att = F.adaptive_avg_pool2d(x,[att_size,att_size]).squeeze().permute(1, 2, 0)
26 |
27 | return fc, att
28 |
29 |
--------------------------------------------------------------------------------
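A minimal sketch of the wrapper above: it returns the pooled 2048-d fc feature and an att_size x att_size x 2048 spatial feature for a single preprocessed image (the random tensor stands in for a real image).

import torch
from torch.autograd import Variable
import misc.resnet as resnet
from misc.resnet_utils import myResnet

my_resnet = myResnet(resnet.resnet101())
img = Variable(torch.randn(3, 448, 448))      # one preprocessed image, C x H x W
fc, att = my_resnet(img, att_size=14)         # fc: (2048,), att: (14, 14, 2048)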
/misc/resnet_utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/resnet_utils.pyc
--------------------------------------------------------------------------------
/misc/rewards.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import numpy as np
6 | import time
7 | import misc.utils as utils
8 | from collections import OrderedDict
9 | import torch
10 | from torch.autograd import Variable
11 |
12 | import sys
13 |
14 | sys.path.append("cider")
15 | from pyciderevalcap.ciderD.ciderD import CiderD
16 |
17 | CiderD_scorer = None
18 |
19 |
20 | # CiderD_scorer = CiderD(df='corpus')
21 |
22 | def init_cider_scorer(cached_tokens):
23 | global CiderD_scorer
24 | CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens)
25 |
26 |
27 | def array_to_str(arr):
28 | out = ''
29 | for i in range(len(arr)):
30 | out += str(arr[i]) + ' '
31 | if arr[i] == 0:
32 | break
33 | return out.strip()
34 |
35 |
36 | def get_self_critical_reward(model, fc_feats, att_feats, data, gen_result):
37 | batch_size = gen_result.size(0) # batch_size = sample_size * seq_per_img
38 | seq_per_img = batch_size // len(data['gts'])
39 |
40 | # get greedy decoding baseline
41 | greedy_res, _ = model.sample(Variable(fc_feats.data, volatile=True), Variable(att_feats.data, volatile=True))
42 |
43 | res = OrderedDict()
44 |
45 | gen_result = gen_result.cpu().numpy()
46 | greedy_res = greedy_res.cpu().numpy()
47 | for i in range(batch_size):
48 | res[i] = [array_to_str(gen_result[i])]
49 | for i in range(batch_size):
50 | res[batch_size + i] = [array_to_str(greedy_res[i])]
51 |
52 | gts = OrderedDict()
53 | for i in range(len(data['gts'])):
54 | gts[i] = [array_to_str(data['gts'][i][j]) for j in range(len(data['gts'][i]))]
55 |
56 | # _, scores = Bleu(4).compute_score(gts, res)
57 | # scores = np.array(scores[3])
58 | res = [{'image_id': i, 'caption': res[i]} for i in range(2 * batch_size)]
59 | gts = {i: gts[i % batch_size // seq_per_img] for i in range(2 * batch_size)}
60 | _, scores = CiderD_scorer.compute_score(gts, res)
61 | print('Cider scores:', _)
62 |
63 | scores = scores[:batch_size] - scores[batch_size:]
64 |
65 | rewards = np.repeat(scores[:, np.newaxis], gen_result.shape[1], 1)
66 |
67 | return rewards
--------------------------------------------------------------------------------
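Small illustrative sketch of the helpers above, assuming the cider submodule mentioned in cider/README.md is in place. init_cider_scorer points CiderD at the cached document-frequency file produced by scripts/prepro_ngrams.py (the name follows the --cached_tokens default in ngram_opts.py), and array_to_str truncates an index sequence at the END token.

from misc.rewards import init_cider_scorer, array_to_str

init_cider_scorer('coco-train-idxs')     # sets the module-level CiderD_scorer
print(array_to_str([12, 7, 0, 0, 0]))    # -> '12 7 0'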
/misc/rewards.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/rewards.pyc
--------------------------------------------------------------------------------
/misc/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import collections
6 | import torch
7 | import torch.nn as nn
8 | from torch.autograd import Variable
9 | import numpy as np
10 |
11 | def if_use_att(caption_model):
12 |     # Decide whether to load attention features, depending on the caption model
13 | if caption_model in ['show_tell', 'all_img', 'fc']:
14 | return False
15 | return True
16 |
17 | # Input: seq, an N*D torch tensor with elements in 0 .. vocab_size; 0 is the END token.
18 | def decode_sequence(ix_to_word, seq):
19 | N, D = seq.size()
20 | out = []
21 | for i in range(N):
22 | txt = ''
23 | for j in range(D):
24 | ix = seq[i,j]
25 | if ix > 0 :
26 | if j >= 1:
27 | txt = txt + ' '
28 | txt = txt + ix_to_word[str(ix)]
29 | else:
30 | break
31 | out.append(txt)
32 | return out
33 |
34 | def to_contiguous(tensor):
35 | if tensor.is_contiguous():
36 | return tensor
37 | else:
38 | return tensor.contiguous()
39 |
40 | class RewardCriterion(nn.Module):
41 | def __init__(self):
42 | super(RewardCriterion, self).__init__()
43 |
44 | def forward(self, input, seq, reward):
45 | input = to_contiguous(input).view(-1)
46 | reward = to_contiguous(reward).view(-1)
47 | mask = (seq>0).float()
48 | mask = to_contiguous(torch.cat([mask.new(mask.size(0), 1).fill_(1), mask[:, :-1]], 1)).view(-1)
49 | output = - input * reward * Variable(mask)
50 | output = torch.sum(output) / torch.sum(mask)
51 |
52 | return output
53 | class LanguageModelCriterion(nn.Module):
54 | def __init__(self):
55 | super(LanguageModelCriterion, self).__init__()
56 |
57 | def forward(self, input, target, mask):
58 | # truncate to the same size
59 | target = target[:, :input.size(1)]
60 | mask = mask[:, :input.size(1)]
61 | input = to_contiguous(input).view(-1, input.size(2))
62 | target = to_contiguous(target).view(-1, 1)
63 | mask = to_contiguous(mask).view(-1, 1)
64 | output = - input.gather(1, target) * mask
65 | output = torch.sum(output) / torch.sum(mask)
66 |
67 | return output
68 |
69 | def set_lr(optimizer, lr):
70 | for group in optimizer.param_groups:
71 | group['lr'] = lr
72 |
73 | def clip_gradient(optimizer, grad_clip):
74 | for group in optimizer.param_groups:
75 | for param in group['params']:
76 | param.grad.data.clamp_(-grad_clip, grad_clip)
77 |
--------------------------------------------------------------------------------
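Hedged usage sketch of LanguageModelCriterion: input holds per-word log-probabilities of shape (batch, seq, vocab), target the gold word indices, and mask marks real words versus padding; all values below are made up.

import torch
import torch.nn.functional as F
from torch.autograd import Variable
from misc.utils import LanguageModelCriterion

crit = LanguageModelCriterion()
logprobs = F.log_softmax(Variable(torch.randn(2, 3, 10)), dim=2)
target = Variable(torch.LongTensor([[1, 4, 0], [2, 5, 7]]))
mask = Variable(torch.FloatTensor([[1, 1, 0], [1, 1, 1]]))
loss = crit(logprobs, target, mask)      # average negative log-likelihood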
/misc/utils.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/utils.pyc
--------------------------------------------------------------------------------
/mycider.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 | import math
4 | from collections import defaultdict
5 |
6 |
7 | def transfer_result_to_res(data):
8 | res = {}
9 | for i in range(len(data)):
10 | res[data[i]['image_id']] = [data[i]['caption']]
11 | return res
12 |
13 |
14 | def transfer_json_to_cider_gts(json_file):
15 | print '... changing standard format for cider calculation'
16 | with open(json_file) as f:
17 | data = json.load(f)
18 | image_index = data['image_ids']
19 | index_caption = data['captions']
20 | gts_caption = {}
21 | for i in range(len(image_index)):
22 | gts_caption[image_index[i]] = index_caption[i]
23 |     print '... finished changing standard format'
24 | return gts_caption
25 |
26 |
27 | def precook(s, n=4, out=False):
28 | """
29 | Takes a string as input and returns an object that can be given to
30 | either cook_refs or cook_test. This is optional: cook_refs and cook_test
31 | can take string arguments as well.
32 | :param s: string : sentence to be converted into ngrams
33 | :param n: int : number of ngrams for which representation is calculated
34 |     :return: term frequency vector for occurring ngrams
35 | """
36 | words = s.split()
37 | counts = defaultdict(int)
38 | for k in xrange(1,n+1):
39 | for i in xrange(len(words)-k+1):
40 | ngram = tuple(words[i:i+k])
41 | counts[ngram] += 1
42 | return counts
43 |
44 |
45 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
46 | '''Takes a list of reference sentences for a single segment
47 | and returns an object that encapsulates everything that BLEU
48 | needs to know about them.
49 | :param refs: list of string : reference sentences for some image
50 | :param n: int : number of ngrams for which (ngram) representation is calculated
51 | :return: result (list of dict)
52 | '''
53 | return [precook(ref, n) for ref in refs]
54 |
55 |
56 | def cook_test(test, n=4):
57 | '''Takes a test sentence and returns an object that
58 | encapsulates everything that BLEU needs to know about it.
59 | :param test: list of string : hypothesis sentence for some image
60 | :param n: int : number of ngrams for which (ngram) representation is calculated
61 | :return: result (dict)
62 | '''
63 | return precook(test, n, True)
64 |
65 |
66 | class CiderScorer(object):
67 | def __init__(self, refs=None, n=4, sigma=6.0):
68 | self.n = n
69 | self.sigma = sigma
70 | self.crefs = []
71 | self.ref_to_imageId = {}
72 | self.build_cook_refs(refs)
73 | self.document_frequency = defaultdict(float)
74 | self.compute_doc_freq()
75 |
76 | def compute_doc_freq(self):
77 | """
78 |         Compute document frequency for reference data.
79 |         This will be used to compute idf (inverse document frequency) later.
80 |         The document frequency is stored in the object.
81 | :return: None
82 | """
83 |         print '... computing document frequency over reference captions'
84 | for refs in self.crefs:
85 | # refs, k ref captions of one image
86 | for ngram in set([ngram for ref in refs for (ngram, count) in ref.iteritems()]):
87 | self.document_frequency[ngram] += 1
88 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
89 |
90 | def Counts2vec(self,cnts):
91 | """
92 | Function maps counts of ngram to vector of tfidf weights.
93 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights.
94 | The n-th entry of array denotes length of n-grams.
95 | :param cnts:
96 | :return: vec (array of dict), norm (array of float), length (int)
97 | """
98 | vec = [defaultdict(float) for _ in range(self.n)]
99 | length = 0
100 | norm = [0.0 for _ in range(self.n)]
101 | for (ngram, term_freq) in cnts.iteritems():
102 | # give word count 1 if it doesn't appear in reference corpus
103 | df = np.log(max(1.0, self.document_frequency[ngram]))
104 | # ngram index
105 | n = len(ngram) - 1
106 | # tf (term_freq) * idf (precomputed idf) for n-grams
107 | vec[n][ngram] = float(term_freq) * (self.ref_len - df)
108 | # compute norm for the vector. the norm will be used for computing similarity
109 | norm[n] += pow(vec[n][ngram], 2)
110 |
111 | if n == 1:
112 | length += term_freq
113 | norm = [np.sqrt(n) for n in norm]
114 | return vec, norm, length
115 |
116 | def Sim(self, vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
117 | """
118 | Compute the cosine similarity of two vectors.
119 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis
120 | :param vec_ref: array of dictionary for vector corresponding to reference
121 | :param norm_hyp: array of float for vector corresponding to hypothesis
122 | :param norm_ref: array of float for vector corresponding to reference
123 | :param length_hyp: int containing length of hypothesis
124 | :param length_ref: int containing length of reference
125 | :return: array of score for each n-grams cosine similarity
126 | """
127 | delta = float(length_hyp - length_ref)
128 |         # measure cosine similarity
129 | val = np.array([0.0 for _ in range(self.n)])
130 | for n in range(self.n):
131 | # ngram
132 | for (ngram, count) in vec_hyp[n].iteritems():
133 | # vrama91 : added clipping
134 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
135 |
136 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
137 | val[n] /= (norm_hyp[n] * norm_ref[n])
138 |
139 | assert (not math.isnan(val[n]))
140 | # vrama91: added a length based gaussian penalty
141 | val[n] *= np.e ** (-(delta ** 2) / (2 * self.sigma ** 2))
142 | return val
143 |
144 | # compute log reference length
145 |
146 | def build_cook_refs(self, refs):
147 | count = 0
148 | if refs is not None:
149 | for item in refs:
150 | self.ref_to_imageId[item] = count
151 | self.crefs.append(cook_refs(refs[item], n= self.n))
152 | count = count + 1
153 |
154 | def cook_append_test(self, test=None):
155 | self.ctest = []
156 | self.test_to_imageId = {}
157 | Counttest = 0
158 | if test is not None:
159 | for item in test:
160 | self.test_to_imageId[Counttest] = item
161 | self.ctest.append(cook_test(test[item][0], n=self.n))
162 | Counttest = Counttest + 1
163 | else:
164 | self.ctest.append(None)
165 |
166 | def compute_cider(self):
167 | def counts2vec(cnts):
168 | """
169 | Function maps counts of ngram to vector of tfidf weights.
170 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights.
171 | The n-th entry of array denotes length of n-grams.
172 | :param cnts:
173 | :return: vec (array of dict), norm (array of float), length (int)
174 | """
175 | vec = [defaultdict(float) for _ in range(self.n)]
176 | length = 0
177 | norm = [0.0 for _ in range(self.n)]
178 | for (ngram, term_freq) in cnts.iteritems():
179 | # give word count 1 if it doesn't appear in reference corpus
180 | df = np.log(max(1.0, self.document_frequency[ngram]))
181 | # ngram index
182 | n = len(ngram) - 1
183 | # tf (term_freq) * idf (precomputed idf) for n-grams
184 | vec[n][ngram] = float(term_freq) * (self.ref_len - df)
185 | # compute norm for the vector. the norm will be used for computing similarity
186 | norm[n] += pow(vec[n][ngram], 2)
187 |
188 | if n == 1:
189 | length += term_freq
190 | norm = [np.sqrt(n) for n in norm]
191 | return vec, norm, length
192 |
193 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
194 | """
195 | Compute the cosine similarity of two vectors.
196 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis
197 | :param vec_ref: array of dictionary for vector corresponding to reference
198 | :param norm_hyp: array of float for vector corresponding to hypothesis
199 | :param norm_ref: array of float for vector corresponding to reference
200 | :param length_hyp: int containing length of hypothesis
201 | :param length_ref: int containing length of reference
202 | :return: array of score for each n-grams cosine similarity
203 | """
204 | delta = float(length_hyp - length_ref)
205 |             # measure cosine similarity
206 | val = np.array([0.0 for _ in range(self.n)])
207 | for n in range(self.n):
208 | # ngram
209 | for (ngram, count) in vec_hyp[n].iteritems():
210 | # vrama91 : added clipping
211 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
212 |
213 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
214 | val[n] /= (norm_hyp[n] * norm_ref[n])
215 |
216 | assert (not math.isnan(val[n]))
217 | # vrama91: added a length based gaussian penalty
218 | val[n] *= np.e ** (-(delta ** 2) / (2 * self.sigma ** 2))
219 | return val
220 |
221 | # compute log reference length
222 | self.ref_len = np.log(float(len(self.crefs)))
223 |
224 | scores = []
225 | # for test, refs in zip(self.ctest, self.crefs):
226 | for id in range(len(self.ctest)):
227 | test = self.ctest[id]
228 | refs = self.crefs[self.ref_to_imageId[self.test_to_imageId[id]]]
229 | # compute vector for test captions
230 | vec, norm, length = counts2vec(test)
231 | # compute vector for ref captions
232 | score = np.array([0.0 for _ in range(self.n)])
233 | for ref in refs:
234 | vec_ref, norm_ref, length_ref = counts2vec(ref)
235 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
236 | # change by vrama91 - mean of ngram scores, instead of sum
237 | score_avg = np.mean(score)
238 | # divide by number of references
239 | score_avg /= len(refs)
240 | # multiply score by 10
241 | score_avg *= 10.0
242 | # append score of an image to the score list
243 | scores.append(score_avg)
244 | return scores
245 |
246 | def compute_score(self, option=None, verbose=0):
247 | # compute idf
248 | #if first_time == 1:
249 |
250 | # assert to check document frequency
251 | #assert (len(self.ctest) >= max(self.document_frequency.values()))
252 | # compute cider score
253 | score = self.compute_cider()
254 | # debug
255 | # print score
256 | return np.mean(np.array(score)), np.array(score)
--------------------------------------------------------------------------------
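A hedged usage sketch of the scorer above (Python 2, like the rest of this file), assuming a ground-truth json with 'image_ids'/'captions' fields and a result json that is a list of {'image_id', 'caption'} entries; both paths are illustrative.

import json
from mycider import CiderScorer, transfer_json_to_cider_gts, transfer_result_to_res

gts = transfer_json_to_cider_gts('data/captions_gt.json')        # illustrative path
scorer = CiderScorer(refs=gts, n=4)
res = transfer_result_to_res(json.load(open('results.json')))    # illustrative path
scorer.cook_append_test(res)
mean_score, per_image_scores = scorer.compute_score()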
/ngram_opts.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | def parse_opt():
4 | parser = argparse.ArgumentParser()
5 | # Data input settings
6 | parser.add_argument('--input_json', type=str, default='data/features/cocotalk.json',
7 | help='path to the json file containing additional info and vocab')
8 | parser.add_argument('--input_fc_dir', type=str, default='data/features_fc',
9 | help='path to the directory containing the preprocessed fc feats')
10 | parser.add_argument('--input_att_dir', type=str, default='data/features_att',
11 | help='path to the directory containing the preprocessed att feats')
12 | parser.add_argument('--input_label_h5', type=str, default='data/features/cocotalk_label.h5',
13 | help='path to the h5file containing the preprocessed dataset')
14 | parser.add_argument('--cached_tokens', type=str, default='coco-train-idxs',
15 | help='Cached token file for calculating cider score during self critical training.')
16 |
17 | # Model settings
18 | parser.add_argument('--caption_model', type=str, default="fc",
19 | help='fc, att')
20 | parser.add_argument('--rnn_size', type=int, default=512,
21 | help='size of the rnn in number of hidden nodes in each layer')
22 | parser.add_argument('--fc_feat_size', type=int, default=2048,
23 | help='2048 for resnet, 4096 for vgg')
24 | parser.add_argument('--att_feat_size', type=int, default=2048,
25 | help='2048 for resnet, 512 for vgg')
26 |
27 | args = parser.parse_args()
28 | return args
29 |
--------------------------------------------------------------------------------
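The options above are consumed by the training scripts and by Eval_model.py; a minimal sketch of reading them with their defaults:

import ngram_opts

opts = ngram_opts.parse_opt()
print('%s %d' % (opts.caption_model, opts.fc_feat_size))   # defaults: fc 2048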
/scripts/__init__.py:
--------------------------------------------------------------------------------
1 | #'Prior Image Caption'
--------------------------------------------------------------------------------
/scripts/prepro_feats.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 | import torch
5 | torch.cuda.set_device(3)
6 | import os
7 | import json
8 | import argparse
9 | from random import shuffle, seed
10 | import string
11 | # non-standard dependencies:
12 | import h5py
13 | from six.moves import cPickle
14 | import numpy as np
15 | import torch
16 | import torchvision.models as models
17 | from torch.autograd import Variable
18 | import skimage.io
19 |
20 | from torchvision import transforms as trn
21 |
22 | preprocess = trn.Compose([
23 | #trn.ToTensor(),
24 | trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
25 | ])
26 |
27 | from misc.resnet_utils import myResnet
28 | import misc.resnet as resnet
29 |
30 | def main(params):
31 | net = getattr(resnet, params['model'])()
32 | net.load_state_dict(torch.load(os.path.join(params['model_root'],params['model']+'.pth')))
33 | my_resnet = myResnet(net)
34 | my_resnet.cuda()
35 | my_resnet.eval()
36 |
37 | imgs = json.load(open(params['input_json'], 'r'))
38 | imgs = imgs['images']
39 | N = len(imgs)
40 |
41 | seed(123) # make reproducible
42 |
43 | dir_fc = params['output_dir']+'_fc'
44 | dir_att = params['output_dir']+'_att'
45 | if not os.path.isdir(dir_fc):
46 | os.mkdir(dir_fc)
47 | if not os.path.isdir(dir_att):
48 | os.mkdir(dir_att)
49 |
50 | for i,img in enumerate(imgs):
51 | # load the image
52 | I = skimage.io.imread(os.path.join(params['images_root'], img['filepath'], img['filename']))
53 | # handle grayscale input images
54 | if len(I.shape) == 2:
55 | I = I[:,:,np.newaxis]
56 | I = np.concatenate((I,I,I), axis=2)
57 |
58 | I = I.astype('float32')/255.0
59 | I = torch.from_numpy(I.transpose([2,0,1])).cuda()
60 | I = Variable(preprocess(I), volatile=True)
61 | tmp_fc, tmp_att = my_resnet(I, params['att_size'])
62 |         # save the fc feature as .npy and the att feature as compressed .npz
63 | np.save(os.path.join(dir_fc, str(img['cocoid'])), tmp_fc.data.cpu().float().numpy())
64 | np.savez_compressed(os.path.join(dir_att, str(img['cocoid'])), feat=tmp_att.data.cpu().float().numpy())
65 |
66 | if i % 1000 == 0:
67 | print('processing %d/%d (%.2f%% done)' % (i, N, i*100.0/N))
68 | print('wrote ', params['output_dir'])
69 |
70 | if __name__ == "__main__":
71 |
72 | parser = argparse.ArgumentParser()
73 |
74 | # input json
75 | parser.add_argument('--input_json', default='data/dataset_coco.json',required=True, help='input json file to process into hdf5')
76 | parser.add_argument('--output_dir', default='data/cocotalk', help='output h5 file')
77 |
78 | # options
79 | parser.add_argument('--images_root', default='/home/trunk/RTrunk1/kk/MSCOCO/2014/images.cocodataset.org/zips/coco', help='root location in which images are stored, to be prepended to file_path in input json')
80 | parser.add_argument('--att_size', default=14, type=int, help='14x14 or 7x7')
81 | parser.add_argument('--model', default='resnet101', type=str, help='resnet101, resnet152')
82 | parser.add_argument('--model_root', default='./data/imagenet_weights', type=str, help='model root')
83 |
84 | args = parser.parse_args()
85 | params = vars(args) # convert to ordinary dict
86 | print('parsed input parameters:')
87 | print(json.dumps(params, indent = 2))
88 | main(params)
89 |
--------------------------------------------------------------------------------
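A hedged sketch of driving the feature extraction from Python instead of the command line; the params dict mirrors the argparse options above, and the image root is a placeholder. Note that the script pins GPU 3 via torch.cuda.set_device(3) at import time.

from scripts.prepro_feats import main

main({'input_json': 'data/dataset_coco.json',
      'output_dir': 'data/features',              # writes data/features_fc and _att
      'images_root': '/path/to/coco/images',      # placeholder
      'att_size': 14,
      'model': 'resnet101',
      'model_root': './data/imagenet_weights'})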
/scripts/prepro_labels.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import os
6 | import json
7 | import argparse
8 | from random import shuffle, seed
9 | import string
10 | # non-standard dependencies:
11 | import h5py
12 | import numpy as np
13 | import torch
14 | import torchvision.models as models
15 | from torch.autograd import Variable
16 | import skimage.io
17 |
18 | def build_vocab(imgs, params):
19 | count_thr = params['word_count_threshold']
20 |
21 | # count up the number of words
22 | counts = {}
23 | for img in imgs:
24 | for sent in img['sentences']:
25 | for w in sent['tokens']:
26 | counts[w] = counts.get(w, 0) + 1
27 | cw = sorted([(count,w) for w,count in counts.items()], reverse=True)
28 | print('top words and their counts:')
29 | print('\n'.join(map(str,cw[:20])))
30 |
31 | # print some stats
32 | total_words = sum(counts.values())
33 | print('total words:', total_words)
34 | bad_words = [w for w,n in counts.items() if n <= count_thr]
35 | vocab = [w for w,n in counts.items() if n > count_thr]
36 | bad_count = sum(counts[w] for w in bad_words)
37 | print('number of bad words: %d/%d = %.2f%%' % (len(bad_words), len(counts), len(bad_words)*100.0/len(counts)))
38 | print('number of words in vocab would be %d' % (len(vocab), ))
39 | print('number of UNKs: %d/%d = %.2f%%' % (bad_count, total_words, bad_count*100.0/total_words))
40 |
41 | # lets look at the distribution of lengths as well
42 | sent_lengths = {}
43 | for img in imgs:
44 | for sent in img['sentences']:
45 | txt = sent['tokens']
46 | nw = len(txt)
47 | sent_lengths[nw] = sent_lengths.get(nw, 0) + 1
48 | max_len = max(sent_lengths.keys())
49 | print('max length sentence in raw data: ', max_len)
50 | print('sentence length distribution (count, number of words):')
51 | sum_len = sum(sent_lengths.values())
52 | for i in range(max_len+1):
53 | print('%2d: %10d %f%%' % (i, sent_lengths.get(i,0), sent_lengths.get(i,0)*100.0/sum_len))
54 |
55 | # lets now produce the final annotations
56 | if bad_count > 0:
57 | # additional special UNK token we will use below to map infrequent words to
58 | print('inserting the special UNK token')
59 | vocab.append('UNK')
60 |
61 | for img in imgs:
62 | img['final_captions'] = []
63 | for sent in img['sentences']:
64 | txt = sent['tokens']
65 | caption = [w if counts.get(w,0) > count_thr else 'UNK' for w in txt]
66 | img['final_captions'].append(caption)
67 |
68 | return vocab
69 |
70 | def encode_captions(imgs, params, wtoi):
71 | """
72 | encode all captions into one large array, which will be 1-indexed.
73 | also produces label_start_ix and label_end_ix which store 1-indexed
74 | and inclusive (Lua-style) pointers to the first and last caption for
75 | each image in the dataset.
76 | """
77 |
78 | max_length = params['max_length']
79 | N = len(imgs)
80 | M = sum(len(img['final_captions']) for img in imgs) # total number of captions
81 |
82 | label_arrays = []
83 | label_start_ix = np.zeros(N, dtype='uint32') # note: these will be one-indexed
84 | label_end_ix = np.zeros(N, dtype='uint32')
85 | label_length = np.zeros(M, dtype='uint32')
86 | caption_counter = 0
87 | counter = 1
88 | for i,img in enumerate(imgs):
89 | n = len(img['final_captions'])
90 | assert n > 0, 'error: some image has no captions'
91 |
92 | Li = np.zeros((n, max_length), dtype='uint32')
93 | for j,s in enumerate(img['final_captions']):
94 | label_length[caption_counter] = min(max_length, len(s)) # record the length of this sequence
95 | caption_counter += 1
96 | for k,w in enumerate(s):
97 | if k < max_length:
98 | Li[j,k] = wtoi[w]
99 |
100 | # note: word indices are 1-indexed, and captions are padded with zeros
101 | label_arrays.append(Li)
102 | label_start_ix[i] = counter
103 | label_end_ix[i] = counter + n - 1
104 |
105 | counter += n
106 |
107 | L = np.concatenate(label_arrays, axis=0) # put all the labels together
108 | assert L.shape[0] == M, 'lengths don\'t match? that\'s weird'
109 | assert np.all(label_length > 0), 'error: some caption had no words?'
110 |
111 | print('encoded captions to array of size ', L.shape)
112 | return L, label_start_ix, label_end_ix, label_length
113 |
114 | def main(params):
115 |
116 | imgs = json.load(open(params['input_json'], 'r'))
117 | imgs = imgs['images']
118 |
119 | seed(123) # make reproducible
120 |
121 | # create the vocab
122 | vocab = build_vocab(imgs, params)
123 | itow = {i+1:w for i,w in enumerate(vocab)} # a 1-indexed vocab translation table
124 | wtoi = {w:i+1 for i,w in enumerate(vocab)} # inverse table
125 |
126 | # encode captions in large arrays, ready to ship to hdf5 file
127 | L, label_start_ix, label_end_ix, label_length = encode_captions(imgs, params, wtoi)
128 |
129 | # create output h5 file
130 | N = len(imgs)
131 | f_lb = h5py.File(params['output_h5']+'_label.h5', "w")
132 | f_lb.create_dataset("labels", dtype='uint32', data=L)
133 | f_lb.create_dataset("label_start_ix", dtype='uint32', data=label_start_ix)
134 | f_lb.create_dataset("label_end_ix", dtype='uint32', data=label_end_ix)
135 | f_lb.create_dataset("label_length", dtype='uint32', data=label_length)
136 | f_lb.close()
137 |
138 | # create output json file
139 | out = {}
140 | out['ix_to_word'] = itow # encode the (1-indexed) vocab
141 | out['images'] = []
142 | for i,img in enumerate(imgs):
143 |
144 | jimg = {}
145 | jimg['split'] = img['split']
146 | if 'filename' in img: jimg['file_path'] = os.path.join(img['filepath'], img['filename']) # copy it over, might need
147 |         if 'cocoid' in img: jimg['id'] = img['cocoid'] # copy over & maintain an id, if present (e.g. coco ids, useful)
148 |
149 | out['images'].append(jimg)
150 |
151 | json.dump(out, open(params['output_json'], 'w'))
152 | print('wrote ', params['output_json'])
153 |
154 | if __name__ == "__main__":
155 |
156 | parser = argparse.ArgumentParser()
157 |
158 | # input json
159 | parser.add_argument('--input_json', required=True, help='input json file to process into hdf5')
160 | parser.add_argument('--output_json', default='data.json', help='output json file')
161 | parser.add_argument('--output_h5', default='data', help='output h5 file')
162 |
163 | # options
164 | parser.add_argument('--max_length', default=16, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.')
165 | parser.add_argument('--word_count_threshold', default=5, type=int, help='only words that occur more than this number of times will be put in vocab')
166 |
167 | args = parser.parse_args()
168 | params = vars(args) # convert to ordinary dict
169 | print('parsed input parameters:')
170 | print(json.dumps(params, indent = 2))
171 | main(params)
172 |
--------------------------------------------------------------------------------
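A hedged sketch of running the label preprocessing from Python; the output paths are chosen to line up with the defaults in ngram_opts.py (the script appends '_label.h5' to output_h5).

from scripts.prepro_labels import main

main({'input_json': 'data/dataset_coco.json',
      'output_json': 'data/features/cocotalk.json',
      'output_h5': 'data/features/cocotalk',      # becomes cocotalk_label.h5
      'max_length': 16,
      'word_count_threshold': 5})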
/scripts/prepro_ngrams.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 | from six.moves import cPickle
5 | from collections import defaultdict
6 |
7 | def precook(s, n=4, out=False):
8 | """
9 | Takes a string as input and returns an object that can be given to
10 | either cook_refs or cook_test. This is optional: cook_refs and cook_test
11 | can take string arguments as well.
12 | :param s: string : sentence to be converted into ngrams
13 | :param n: int : number of ngrams for which representation is calculated
14 |     :return: term frequency vector for occurring ngrams
15 | """
16 | words = s.split()
17 | counts = defaultdict(int)
18 | for k in xrange(1,n+1):
19 | for i in xrange(len(words)-k+1):
20 | ngram = tuple(words[i:i+k])
21 | counts[ngram] += 1
22 | return counts
23 |
24 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
25 | '''Takes a list of reference sentences for a single segment
26 | and returns an object that encapsulates everything that BLEU
27 | needs to know about them.
28 | :param refs: list of string : reference sentences for some image
29 | :param n: int : number of ngrams for which (ngram) representation is calculated
30 | :return: result (list of dict)
31 | '''
32 | return [precook(ref, n) for ref in refs]
33 |
34 | def create_crefs(refs):
35 | crefs = []
36 | for ref in refs:
37 | # ref is a list of 5 captions
38 | crefs.append(cook_refs(ref))
39 | return crefs
40 |
41 | def compute_doc_freq(crefs):
42 | '''
43 |     Compute document frequency for reference data.
44 |     This will be used to compute idf (inverse document frequency) later.
45 |     The document frequency is stored in the object.
46 | :return: None
47 | '''
48 | document_frequency = defaultdict(float)
49 | for refs in crefs:
50 | # refs, k ref captions of one image
51 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]):
52 | document_frequency[ngram] += 1
53 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
54 | return document_frequency
55 |
56 | def build_dict(imgs, wtoi, params):
57 | wtoi[''] = 0
58 |
59 | count_imgs = 0
60 |
61 | refs_words = []
62 | refs_idxs = []
63 | for img in imgs:
64 | if (params['split'] == img['split']) or \
65 | (params['split'] == 'train' and img['split'] == 'restval') or \
66 | (params['split'] == 'all'):
67 | #(params['split'] == 'val' and img['split'] == 'restval') or \
68 | ref_words = []
69 | ref_idxs = []
70 | for sent in img['sentences']:
71 | tmp_tokens = sent['tokens'] + ['']
72 | tmp_tokens = [_ if _ in wtoi else 'UNK' for _ in tmp_tokens]
73 | ref_words.append(' '.join(tmp_tokens))
74 | ref_idxs.append(' '.join([str(wtoi[_]) for _ in tmp_tokens]))
75 | refs_words.append(ref_words)
76 | refs_idxs.append(ref_idxs)
77 | count_imgs += 1
78 | print('total imgs:', count_imgs)
79 |
80 | ngram_words = compute_doc_freq(create_crefs(refs_words))
81 | ngram_idxs = compute_doc_freq(create_crefs(refs_idxs))
82 | return ngram_words, ngram_idxs, count_imgs
83 |
84 | def main(params):
85 |
86 | imgs = json.load(open(params['input_json'], 'r'))
87 | itow = json.load(open(params['dict_json'], 'r'))['ix_to_word']
88 | wtoi = {w:i for i,w in itow.items()}
89 |
90 | imgs = imgs['images']
91 |
92 | ngram_words, ngram_idxs, ref_len = build_dict(imgs, wtoi, params)
93 |
94 | cPickle.dump({'document_frequency': ngram_words, 'ref_len': ref_len}, open(params['output_pkl']+'-words.p','w'), protocol=cPickle.HIGHEST_PROTOCOL)
95 | cPickle.dump({'document_frequency': ngram_idxs, 'ref_len': ref_len}, open(params['output_pkl']+'-idxs.p','w'), protocol=cPickle.HIGHEST_PROTOCOL)
96 |
97 | if __name__ == "__main__":
98 |
99 | parser = argparse.ArgumentParser()
100 |
101 | # input json
102 | parser.add_argument('--input_json', default='/home-nfs/rluo/rluo/nips/code/prepro/dataset_coco.json', help='input json file to process into hdf5')
103 | parser.add_argument('--dict_json', default='data/cocotalk.json', help='output json file')
104 | parser.add_argument('--output_pkl', default='data/coco-all', help='output pickle file')
105 | parser.add_argument('--split', default='all', help='test, val, train, all')
106 | args = parser.parse_args()
107 | params = vars(args) # convert to ordinary dict
108 |
109 | main(params)
110 |
--------------------------------------------------------------------------------
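A hedged sketch of building the cached n-gram statistics from Python. With split 'train' and this output prefix, the '-idxs.p' pickle is assumed to correspond to the 'coco-train-idxs' name used by --cached_tokens in ngram_opts.py.

from scripts.prepro_ngrams import main

main({'input_json': 'data/dataset_coco.json',
      'dict_json': 'data/features/cocotalk.json',
      'output_pkl': 'data/coco-train',   # writes data/coco-train-words.p and -idxs.p
      'split': 'train'})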
/scripts/resnet.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import math
3 | import torch.utils.model_zoo as model_zoo
4 |
5 |
6 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
7 | 'resnet152']
8 |
9 |
10 | model_urls = {
11 | 'resnet18': 'https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth',
12 | 'resnet34': 'https://s3.amazonaws.com/pytorch/models/resnet34-333f7ec4.pth',
13 | 'resnet50': 'https://s3.amazonaws.com/pytorch/models/resnet50-19c8e357.pth',
14 | 'resnet101': 'https://s3.amazonaws.com/pytorch/models/resnet101-5d3b4d8f.pth',
15 | 'resnet152': 'https://s3.amazonaws.com/pytorch/models/resnet152-b121ed2d.pth',
16 | }
17 |
18 |
19 | def conv3x3(in_planes, out_planes, stride=1):
20 | "3x3 convolution with padding"
21 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
22 | padding=1, bias=False)
23 |
24 |
25 | class BasicBlock(nn.Module):
26 | expansion = 1
27 |
28 | def __init__(self, inplanes, planes, stride=1, downsample=None):
29 | super(BasicBlock, self).__init__()
30 | self.conv1 = conv3x3(inplanes, planes, stride)
31 | self.bn1 = nn.BatchNorm2d(planes)
32 | self.relu = nn.ReLU(inplace=True)
33 | self.conv2 = conv3x3(planes, planes)
34 | self.bn2 = nn.BatchNorm2d(planes)
35 | self.downsample = downsample
36 | self.stride = stride
37 |
38 | def forward(self, x):
39 | residual = x
40 |
41 | out = self.conv1(x)
42 | out = self.bn1(out)
43 | out = self.relu(out)
44 |
45 | out = self.conv2(out)
46 | out = self.bn2(out)
47 |
48 | if self.downsample is not None:
49 | residual = self.downsample(x)
50 |
51 | out += residual
52 | out = self.relu(out)
53 |
54 | return out
55 |
56 |
57 | class Bottleneck(nn.Module):
58 | expansion = 4
59 |
60 | def __init__(self, inplanes, planes, stride=1, downsample=None):
61 | super(Bottleneck, self).__init__()
62 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False) # change
63 | self.bn1 = nn.BatchNorm2d(planes)
64 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, # change
65 | padding=1, bias=False)
66 | self.bn2 = nn.BatchNorm2d(planes)
67 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
68 | self.bn3 = nn.BatchNorm2d(planes * 4)
69 | self.relu = nn.ReLU(inplace=True)
70 | self.downsample = downsample
71 | self.stride = stride
72 |
73 | def forward(self, x):
74 | residual = x
75 |
76 | out = self.conv1(x)
77 | out = self.bn1(out)
78 | out = self.relu(out)
79 |
80 | out = self.conv2(out)
81 | out = self.bn2(out)
82 | out = self.relu(out)
83 |
84 | out = self.conv3(out)
85 | out = self.bn3(out)
86 |
87 | if self.downsample is not None:
88 | residual = self.downsample(x)
89 |
90 | out += residual
91 | out = self.relu(out)
92 |
93 | return out
94 |
95 |
96 | class ResNet(nn.Module):
97 | def __init__(self, block, layers, num_classes=1000):
98 | self.inplanes = 64
99 | super(ResNet, self).__init__()
100 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
101 | bias=False)
102 | self.bn1 = nn.BatchNorm2d(64)
103 | self.relu = nn.ReLU(inplace=True)
104 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True) # change
105 | self.layer1 = self._make_layer(block, 64, layers[0])
106 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
107 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
108 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
109 | self.avgpool = nn.AvgPool2d(7)
110 | self.fc = nn.Linear(512 * block.expansion, num_classes)
111 |
112 | for m in self.modules():
113 | if isinstance(m, nn.Conv2d):
114 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
115 | m.weight.data.normal_(0, math.sqrt(2. / n))
116 | elif isinstance(m, nn.BatchNorm2d):
117 | m.weight.data.fill_(1)
118 | m.bias.data.zero_()
119 |
120 | def _make_layer(self, block, planes, blocks, stride=1):
121 | downsample = None
122 | if stride != 1 or self.inplanes != planes * block.expansion:
123 | downsample = nn.Sequential(
124 | nn.Conv2d(self.inplanes, planes * block.expansion,
125 | kernel_size=1, stride=stride, bias=False),
126 | nn.BatchNorm2d(planes * block.expansion),
127 | )
128 |
129 | layers = []
130 | layers.append(block(self.inplanes, planes, stride, downsample))
131 | self.inplanes = planes * block.expansion
132 | for i in range(1, blocks):
133 | layers.append(block(self.inplanes, planes))
134 |
135 | return nn.Sequential(*layers)
136 |
137 | def forward(self, x):
138 | x = self.conv1(x)
139 | x = self.bn1(x)
140 | x = self.relu(x)
141 | x = self.maxpool(x)
142 |
143 | x = self.layer1(x)
144 | x = self.layer2(x)
145 | x = self.layer3(x)
146 | x = self.layer4(x)
147 |
148 | x = self.avgpool(x)
149 | x = x.view(x.size(0), -1)
150 | x = self.fc(x)
151 |
152 | return x
153 |
154 |
155 | def resnet18(pretrained=False):
156 | """Constructs a ResNet-18 model.
157 |
158 | Args:
159 | pretrained (bool): If True, returns a model pre-trained on ImageNet
160 | """
161 | model = ResNet(BasicBlock, [2, 2, 2, 2])
162 | if pretrained:
163 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
164 | return model
165 |
166 |
167 | def resnet34(pretrained=False):
168 | """Constructs a ResNet-34 model.
169 |
170 | Args:
171 | pretrained (bool): If True, returns a model pre-trained on ImageNet
172 | """
173 | model = ResNet(BasicBlock, [3, 4, 6, 3])
174 | if pretrained:
175 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
176 | return model
177 |
178 |
179 | def resnet50(pretrained=False):
180 | """Constructs a ResNet-50 model.
181 |
182 | Args:
183 | pretrained (bool): If True, returns a model pre-trained on ImageNet
184 | """
185 | model = ResNet(Bottleneck, [3, 4, 6, 3])
186 | if pretrained:
187 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
188 | return model
189 |
190 |
191 | def resnet101(pretrained=False):
192 | """Constructs a ResNet-101 model.
193 |
194 | Args:
195 | pretrained (bool): If True, returns a model pre-trained on ImageNet
196 | """
197 | model = ResNet(Bottleneck, [3, 4, 23, 3])
198 | if pretrained:
199 | model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
200 | return model
201 |
202 |
203 | def resnet152(pretrained=False):
204 | """Constructs a ResNet-152 model.
205 |
206 | Args:
207 | pretrained (bool): If True, returns a model pre-trained on ImageNet
208 | """
209 | model = ResNet(Bottleneck, [3, 8, 36, 3])
210 | if pretrained:
211 | model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
212 | return model
--------------------------------------------------------------------------------
/scripts/resnet_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.autograd import Variable
4 | import torch.nn.functional as F
5 |
6 | class myResnet(nn.Module):
7 | def __init__(self, resnet):
8 | super(myResnet, self).__init__()
9 | self.resnet = resnet
10 |
11 | def forward(self, img, att_size=14):
12 | x = img.unsqueeze(0)
13 |
14 | x = self.resnet.conv1(x)
15 | x = self.resnet.bn1(x)
16 | x = self.resnet.relu(x)
17 | x = self.resnet.maxpool(x)
18 |
19 | x = self.resnet.layer1(x)
20 | x = self.resnet.layer2(x)
21 | x = self.resnet.layer3(x)
22 | x = self.resnet.layer4(x)
23 |
24 | fc = x.mean(3).mean(2).squeeze()
25 | att = F.adaptive_avg_pool2d(x,[att_size,att_size]).squeeze().permute(1, 2, 0)
26 |
27 | return fc, att
28 |
29 |
--------------------------------------------------------------------------------
/scripts/utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import collections
6 | import torch
7 | import torch.nn as nn
8 | from torch.autograd import Variable
9 |
10 | def if_use_att(caption_model):
11 |     # Decide whether to load attention features, depending on the caption model
12 | if caption_model in ['show_tell', 'all_img', 'fc']:
13 | return False
14 | return True
15 |
16 | # Input: seq, an N*D torch tensor with elements in 0 .. vocab_size; 0 is the END token.
17 | def decode_sequence(ix_to_word, seq):
18 | N, D = seq.size()
19 | out = []
20 | for i in range(N):
21 | txt = ''
22 | for j in range(D):
23 | ix = seq[i,j]
24 | if ix > 0 :
25 | if j >= 1:
26 | txt = txt + ' '
27 | txt = txt + ix_to_word[str(ix)]
28 | else:
29 | break
30 | out.append(txt)
31 | return out
32 |
33 | def to_contiguous(tensor):
34 | if tensor.is_contiguous():
35 | return tensor
36 | else:
37 | return tensor.contiguous()
38 |
39 | class LanguageModelCriterion(nn.Module):
40 | def __init__(self):
41 | super(LanguageModelCriterion, self).__init__()
42 |
43 | def forward(self, input, target, mask):
44 | # truncate to the same size
45 | target = target[:, :input.size(1)]
46 | mask = mask[:, :input.size(1)]
47 | input = to_contiguous(input).view(-1, input.size(2))
48 | target = to_contiguous(target).view(-1, 1)
49 | mask = to_contiguous(mask).view(-1, 1)
50 | output = - input.gather(1, target) * mask
51 | output = torch.sum(output) / torch.sum(mask)
52 |
53 | return output
54 |
55 | def set_lr(optimizer, lr):
56 | for group in optimizer.param_groups:
57 | group['lr'] = lr
58 |
59 | def clip_gradient(optimizer, grad_clip):
60 | for group in optimizer.param_groups:
61 | for param in group['params']:
62 | param.grad.data.clamp_(-grad_clip, grad_clip)
--------------------------------------------------------------------------------
/tools.py:
--------------------------------------------------------------------------------
1 | import time
2 | import json
3 | import logging
4 | import numpy as np
5 | import os.path as osp
6 | from pycoco.bleu.bleu import Bleu
7 | from pycoco.meteor.meteor import Meteor
8 | from pycoco.rouge.rouge import Rouge
9 | from pycoco.cider.cider import Cider
10 | import torch
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 | from torch.autograd import Variable
15 |
16 |
17 | class Timer:
18 | def __init__(self):
19 | self.start_time = 0
20 | self.end_time = 0
21 | self.total_time = 0
22 | self.avg_time = 0
23 | self.n_toc = 0
24 |
25 | def tic(self):
26 | self.n_toc = 0
27 | self.start_time = time.time()
28 |
29 | def toc(self):
30 | self.end_time = time.time()
31 | self.total_time = self.end_time - self.start_time
32 | self.n_toc += 1.
33 | self.avg_time = self.total_time / self.n_toc
34 | return self.total_time
35 |
36 |
37 | class Logger:
38 | """
39 | When receiving a message, first print it on screen, then write it into log file.
40 | If save_dir is None, it writes no log and only prints on screen.
41 | """
42 | def __init__(self, save_dir):
43 | if save_dir is not None:
44 | self.logger = logging.getLogger()
45 | logging.basicConfig(filename=osp.join(save_dir, 'experiment.log'), format='%(asctime)s | %(message)s')
46 | logging.root.setLevel(level=logging.INFO)
47 | else:
48 | self.logger = None
49 |
50 | def info(self, msg, to_file=True):
51 | print msg
52 | if self.logger is not None and to_file:
53 | self.logger.info(msg)
54 | def evaluate(gt_file, re_file, logger=None):
55 | """
56 |     This function is adapted from the MSCOCO evaluation code.
57 |     The reference sentences are read from gt_file,
58 |     the generated sentences to be evaluated are read from re_file.
59 |
60 | """
61 | gts = json.load(open(gt_file, 'r'))
62 | scorers = [
63 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
64 | #(Meteor(), "METEOR"),
65 | # (Rouge(), "ROUGE_L"),
66 | (Cider(), "CIDEr")
67 | ]
68 | metrics = []
69 | res = json.load(open(re_file, 'r'))
70 | res = {c['image_id']: [c['caption']] for c in res}
71 | gts = {k: v for k, v in zip(gts['image_ids'], gts['captions']) if k in res}
72 | for scorer, method in scorers:
73 | if logger is not None:
74 | logger.info('computing %s score...' % (scorer.method()))
75 | score, scores = scorer.compute_score(gts, res)
76 | if type(method) == list:
77 | for sc, scs, m in zip(score, scores, method):
78 | if logger is not None:
79 | logger.info("%s: %0.3f" % (m, sc))
80 | metrics.extend(score)
81 | else:
82 | if logger is not None:
83 | logger.info("%s: %0.3f" % (method, score))
84 | metrics.append(score)
85 | return metrics
86 |
87 |
88 | def lm_caption_step(w_t,lm_state_t,caption_state_t,lm,caption_model,eta,manner):
89 | word = Variable(torch.LongTensor(w_t.tolist()))
90 | if lm.on_gpu:
91 | word = word.cuda()
92 | word_emb = lm.word_embedding_layer(word)
93 | logit, lm_state_t_1 = lm.forward(word_emb, lm_state_t) # logit : (batch_size, vocab_size)
94 | prob = F.softmax(logit) # (batch_size, vocab_size)
95 | P = prob - eta
96 | P *= 10000000
97 | mask = F.sigmoid(P).data.cpu().numpy()
98 | caption_state_t_1,w_t_1 = caption_model.ngram_single_step(caption_state_t, w_t,mask,manner)
99 | return w_t_1,lm_state_t_1,caption_state_t_1
100 |
101 |
102 | def lm_caption(lm,model,image_ids,vocab,loader,feature,max_step,manner):
103 | w_0 = np.ones((len(image_ids),), dtype=np.int32) * 9488 # set start token for rnn language model
104 | lm_state_0 = lm.init_state()
105 | cap_state_0 = model.initial_state(feature)
106 | eta_0 = 0.00005
107 | cap = np.zeros((max_step, len(image_ids)), dtype=np.int32)
108 | if manner == 'sample':
109 | res = []
110 | for step in range(max_step-1):
111 | w_1, lm_state_1, cap_state_1 = lm_caption_step(w_0, lm_state_0, cap_state_0, lm, model, eta_0*(2**step), manner)
112 | cap[step + 1, :] = w_1[:]
113 | w_0 = w_1
114 | lm_state_0 = lm_state_1
115 | cap_state_0 = cap_state_1
116 | for i in range(loader.batch_size):
117 | index = np.where(cap[1:,i] == 0)[0]
118 | if len(index) > 0:
119 | s = ' '.join(vocab[w] for w in cap[1:index[0]+1, i])
120 | else:
121 | s = ' '.join(vocab[w] for w in cap[1:, i])
122 | res.append({'image_id': image_ids[i], 'caption': s})
123 | else:
124 | cap, res = model.inference(vocab, image_ids, feature, manner='greedy', max_length=max_step)
125 | return cap,res
126 |
127 | def att_lm_caption_step(w_t,lm_state_t,patches,caption_state_t,lm,caption_model,eta,manner):
128 | word = Variable(torch.LongTensor(w_t.tolist()))
129 | if lm.on_gpu:
130 | word = word.cuda()
131 | word_emb = lm.word_embedding_layer(word)
132 | logit, lm_state_t_1 = lm.forward(word_emb, lm_state_t) # logit : (batch_size, vocab_size)
133 | prob = F.softmax(logit) # (batch_size, vocab_size)
134 | P = prob - eta
135 | P *= 10000000
136 | mask = F.sigmoid(P).data.cpu().numpy()
137 | caption_state_t_1,w_t_1 = caption_model.ngram_single_step(caption_state_t, w_t,patches,mask,manner)
138 | return w_t_1,lm_state_t_1,caption_state_t_1
139 |
140 | def att_lm_caption(lm,model,image_ids,vocab,loader,features,max_step,manner):
141 | w_0 = np.ones((len(image_ids),), dtype=np.int32) * 9488 # set start token for rnn language model
142 | lm_state_0 = lm.init_state()
143 | eta_0 = 0.00005
144 | cap = np.zeros((max_step, len(image_ids)), dtype=np.int32)
145 | if manner == 'sample':
146 | pathes, cap_state_0 = model.initial_state(features)
147 | res = []
148 | for step in range(max_step-1):
149 | w_1, lm_state_1, cap_state_1 = att_lm_caption_step(w_0, lm_state_0,pathes, cap_state_0, lm, model, eta_0*(2**step), manner)
150 | cap[step + 1, :] = w_1[:]
151 | w_0 = w_1
152 | lm_state_0 = lm_state_1
153 | cap_state_0 = cap_state_1
154 | for i in range(loader.batch_size):
155 | index = np.where(cap[1:,i] == 0)[0]
156 | if len(index) > 0:
157 | s = ' '.join(vocab[w] for w in cap[1:index[0]+1, i])
158 | else:
159 | s = ' '.join(vocab[w] for w in cap[1:, i])
160 | res.append({'image_id': image_ids[i], 'caption': s})
161 | else:
162 | cap, res = model.inference(vocab, image_ids, features, manner='greedy', max_length=max_step)
163 | return cap,res
164 |
165 |
166 | def lm2_caption_step(w_t,first_word,lm_state_t,caption_state_t,lm,caption_model,eta,manner,step):
167 | word = Variable(torch.LongTensor(w_t.tolist()))
168 | if lm.on_gpu:
169 | word = word.cuda()
170 | word_emb = lm.word_embedding_layer(word)
171 | logit, lm_state_t_1 = lm.forward(word_emb, lm_state_t) # logit : (batch_size, vocab_size)
172 | prob = F.softmax(logit) # (batch_size, vocab_size)
173 | P = prob - eta
174 | P *= 10000000
175 | mask = F.sigmoid(P).data.cpu().numpy()[:,:-1] # drop the start token
176 | if step == 0:
177 | caption_state_t_1,w_t_1 = caption_model.ngram_single_step(0,caption_state_t, first_word,mask,manner)
178 | else:
179 | caption_state_t_1, w_t_1 = caption_model.ngram_single_step(step, caption_state_t, w_t, mask, manner)
180 | return w_t_1,lm_state_t_1,caption_state_t_1
181 |
182 |
183 | def lm2_caption(lm,model,image_ids,vocab,loader,feature,max_step,manner):
184 | w_0 = np.ones((len(image_ids),), dtype=np.int32) * 9488 # set start token for rnn language model
185 | lm_state_0 = lm.init_state()
186 | first_word, cap_state_0 = model.initial_state(feature)
187 | eta_0 = 0.00005
188 | cap = np.zeros((max_step, len(image_ids)), dtype=np.int32)
189 | if manner == 'sample':
190 | res = []
191 | for step in range(max_step-1):
192 | w_1, lm_state_1, cap_state_1 = lm2_caption_step(w_0,first_word, lm_state_0, cap_state_0, lm, model, eta_0*(2**step), manner,step)
193 | cap[step + 1, :] = w_1[:]
194 | w_0 = w_1
195 | lm_state_0 = lm_state_1
196 | cap_state_0 = cap_state_1
197 | for i in range(loader.batch_size):
198 | index = np.where(cap[1:,i] == 0)[0]
199 | if len(index) > 0:
200 | s = ' '.join(vocab[w] for w in cap[1:index[0]+1, i])
201 | else:
202 | s = ' '.join(vocab[w] for w in cap[1:, i])
203 | res.append({'image_id': image_ids[i], 'caption': s})
204 | else:
205 | cap, res = model.inference(vocab, image_ids, feature, manner='greedy', max_length=max_step)
206 | return cap[1:,:],res
207 |
208 | def att2_lm_caption_step(w_t,first_word,lm_state_t,patches,caption_state_t,lm,caption_model,eta,manner,step):
209 | word = Variable(torch.LongTensor(w_t.tolist()))
210 | if lm.on_gpu:
211 | word = word.cuda()
212 | word_emb = lm.word_embedding_layer(word)
213 | logit, lm_state_t_1 = lm.forward(word_emb, lm_state_t) # logit : (batch_size, vocab_size)
214 | prob = F.softmax(logit) # (batch_size, vocab_size)
215 | P = prob - eta
216 | P *= 10000000
217 | mask = F.sigmoid(P).data.cpu().numpy()[:,:-1] # drop the start token
218 | if step == 0:
219 | caption_state_t_1,w_t_1 = caption_model.ngram_single_step(0,caption_state_t,first_word,patches,mask,manner)
220 | else:
221 | caption_state_t_1, w_t_1 = caption_model.ngram_single_step(step,caption_state_t, w_t, patches, mask, manner)
222 | return w_t_1,lm_state_t_1,caption_state_t_1
223 |
224 | def att2_lm_caption(lm,model,image_ids,vocab,loader,features,max_step,manner):
225 | w_0 = np.ones((len(image_ids),), dtype=np.int32) * 9488 # set start token for rnn language model
226 | lm_state_0 = lm.init_state()
227 | eta_0 = 0.00005
228 | cap = np.zeros((max_step, len(image_ids)), dtype=np.int32)
229 | if manner == 'sample':
230 | patches,first_word, cap_state_0 = model.initial_state(features)
231 | res = []
232 | for step in range(max_step-1):
233 | w_1, lm_state_1, cap_state_1 = att2_lm_caption_step(w_0,first_word, lm_state_0,patches, cap_state_0, lm, model, eta_0*(2**step), manner,step)
234 | cap[step + 1, :] = w_1[:]
235 | w_0 = w_1
236 | lm_state_0 = lm_state_1
237 | cap_state_0 = cap_state_1
238 | for i in range(loader.batch_size):
239 | index = np.where(cap[1:,i] == 0)[0]
240 | if len(index) > 0:
241 | s = ' '.join(vocab[w] for w in cap[1:index[0]+1, i])
242 | else:
243 | s = ' '.join(vocab[w] for w in cap[1:, i])
244 | res.append({'image_id': image_ids[i], 'caption': s})
245 | else:
246 | cap, res = model.inference(vocab, image_ids, features, manner='greedy', max_length=max_step)
247 | return cap[1:,:],res
248 |
249 |
--------------------------------------------------------------------------------
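
The *_caption_step helpers above gate the caption model's next word with a soft vocabulary mask from the language model: a sigmoid of a heavily scaled margin approximates the indicator P(word) > eta, and the threshold eta doubles every decoding step. A minimal standalone sketch of that trick, assuming only a dummy logit tensor:

import torch
import torch.nn.functional as F

def lm_mask(logits, eta, scale=1e7):
    # softmax -> word probabilities; the scaled sigmoid saturates to ~0/1,
    # so the mask keeps only words the language model assigns prob > eta
    prob = F.softmax(logits, dim=-1)
    return torch.sigmoid(scale * (prob - eta))

logits = torch.randn(2, 10)        # dummy (batch, vocab) LM logits
for step in range(3):
    eta = 0.00005 * (2 ** step)    # same doubling schedule as lm_caption() above
    print('step %d: %s' % (step, lm_mask(logits, eta).round().tolist()))
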
/train_fourgram.py:
--------------------------------------------------------------------------------
1 | from caption_model.att import *
2 | from caption_model.fc import *
3 | from mycider import *
4 | from multiprocessing import Pool
5 | import os
6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
7 | import ngram_opts
8 | from tools import *
9 | from dataloader import *
10 | opts = ngram_opts.parse_opt()
11 | if opts.caption_model == 'fc':
12 | opts.use_att = False
13 | else:
14 | opts.use_att = True
15 |
16 | batch_size = opts.batch_size
17 |
18 | loader = KKDataLoader(opts)
19 | vocabs = loader.get_vocab()
20 | vocab = ['#END#']
21 | for i in range(len(vocabs)):
22 | ids = str(i+1)
23 | vocab.append(vocabs[ids])
24 |
25 | if not os.path.exists('fourgram_cider_model'):
26 | os.mkdir('fourgram_cider_model')
27 |
28 | if opts.use_att:
29 | save_dir = 'fourgram_cider_model/' + 'att_model'
30 | else:
31 | save_dir = 'fourgram_cider_model/' + 'fc_model'
32 | if not os.path.exists(save_dir):
33 | os.mkdir(save_dir)
34 | print(save_dir + ' has been created')
35 |
36 |
37 | image_dim = 2048
38 | vocab_size = loader.vocab_size + 1
39 | cell_size = 512
40 | lr = 0.00005
41 | if opts.use_att:
42 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=4,on_gpu=True)
43 | model.load('warm_model/att_warm/model.init')
44 | else:
45 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=4,on_gpu=True)
46 | model.load('warm_model/fc_warm/model.init')
47 |
48 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json'))
49 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0)
50 |
51 | def cider_temp(res):
52 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]})
53 | score, _ = cider_scorer.compute_score()
54 | return score
55 |
56 | pool = Pool(processes=5)
57 | best_score = -1
58 | logger = Logger(save_dir)
59 | iter = 0
60 | finish_iter = 100000
61 | timer = Timer()
62 | timer.tic()
63 | while iter < finish_iter:
64 | iter += 1
65 | data = loader.get_batch('train')
66 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
67 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
68 | fc_feats, att_feats = tmp
69 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
70 | if opts.use_att:
71 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3])
72 | feature = att_feats
73 | else:
74 | feature = fc_feats
75 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16)
76 | greedy_scores = np.array(pool.map(cider_temp, greedy_res))
77 |
78 | all_caps, all_results, all_scores = [], [], []
79 | for _ in xrange(20):
80 | # Generate captions by sampling
81 | sample_caps, sample_results = model.fourgram_inference(vocab, image_id, feature,
82 | manner='sample',
83 | max_length=16)
84 |
85 | # Compute cider scores for sampled captions
86 | sample_scores = np.array(pool.map(cider_temp, sample_results))
87 | all_caps.append(sample_caps)
88 | all_results.append(sample_results)
89 | all_scores.append(sample_scores)
90 |
91 | all_scores = np.array(all_scores)
92 | sample_caps, sample_results, sample_scores = [], [], []
93 | for n in xrange(opts.batch_size):
94 | best_i = all_scores[:, n].argmax()
95 | sample_caps.append(all_caps[best_i][n])
96 | sample_results.append(all_results[best_i][n])
97 | sample_scores.append(all_scores[best_i, n])
98 | sample_scores = np.array(sample_scores)
99 |
100 | max_length = max([cap.shape[0] for cap in sample_caps])
101 | caption = np.zeros([max_length + 2, opts.batch_size], dtype=np.int32)
102 | for n in xrange(opts.batch_size):
103 | L = sample_caps[n].shape[0]
104 | caption[1:L + 1, n] = sample_caps[n]
105 | caption[L + 1:, n] = 0
106 | mask = np.zeros([max_length + 1, opts.batch_size], dtype=np.float32)
107 | for n in xrange(opts.batch_size):
108 | L = sample_caps[n].shape[0]
109 | mask[:L + 1, n] = 1
110 | reward = (sample_scores - greedy_scores).astype(np.float32)
111 | print image_id[0]
112 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption']
113 | print 'sample: ', sample_scores[0], sample_results[0]['caption']
114 | loss_train = model.train_on_batch(feature, caption[1:,:], mask, reward)
115 | if iter % 300 == 0:
116 | results = []
117 | for nn in range(5000/opts.batch_size):
118 | data = loader.get_batch('val')
119 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
120 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
121 | fc_feats, att_feats = tmp
122 | if opts.use_att:
123 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],
124 | att_feats.shape[3])
125 | feature_val = att_feats
126 | else:
127 | feature_val = fc_feats
128 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
129 |
130 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature_val,manner='greedy',max_length=16)
131 | # Generate sentences for validation set
132 | results += greedy_res
133 | # Evaluate generated captions
134 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w'))
135 | gt_file = osp.join('data/features', 'captions_val.json')
136 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1]
137 |
138 | if score > best_score:
139 | best_score = score
140 | model.save(osp.join(save_dir, 'model.best'))
141 | model.save(osp.join(save_dir,'model.ckpt'))
142 | # Output training information
143 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec'
144 | .format(iter, -1, score, best_score, finish_iter, timer.toc()))
145 | # Reset loss and timer
146 | train_losses = []
147 | timer.tic()
148 |
149 | # If early-stop condition triggers
150 | if iter > finish_iter:
151 | break
152 |
153 |
--------------------------------------------------------------------------------
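
For reference, a minimal numpy sketch of the reward used in train_fourgram.py above: each image keeps the best of its K sampled captions, and the policy-gradient reward is that caption's CIDEr minus the greedy baseline's CIDEr (the names below are illustrative):

import numpy as np

def best_of_k_reward(sample_scores, greedy_scores):
    # sample_scores: (K, batch) CIDEr of K sampled captions per image
    # greedy_scores: (batch,)   CIDEr of the greedy baseline caption
    best = sample_scores.max(axis=0)
    return (best - greedy_scores).astype(np.float32)

sample_scores = np.array([[0.80, 0.40],
                          [1.10, 0.50],
                          [0.90, 0.30]])
greedy_scores = np.array([1.00, 0.45])
print(best_of_k_reward(sample_scores, greedy_scores))   # approx. [0.10 0.05]
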
/train_rnnlm.py:
--------------------------------------------------------------------------------
1 | import ngram_opts
2 | from dataloader import *
3 | import os
4 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
5 | from caption_model.rnnlm import *
6 |
7 | opts = ngram_opts.parse_opt()
8 | if opts.caption_model == 'fc':
9 | opts.use_att = False
10 | else:
11 | opts.use_att = True
12 | loader = KKDataLoader(opts)
13 | if not os.path.exists('warm_model/rnnlm'):
14 | os.makedirs('warm_model/rnnlm')  # lm.save() below writes checkpoints into this directory
15 | batch_size = 100
16 | hidden_size = 256
17 | word_embed_size = 256
18 | vocab_size = loader.vocab_size + 2  # +1 for the END token (index 0) and +1 for the start token (index 9488)
19 | lr = 0.0001
20 | lm = LM(batch_size, hidden_size,vocab_size, word_embed_size,lr)
21 | Labels = loader.h5_label_file['labels']
22 | new_labels = np.zeros((Labels.shape[1]+1,Labels.shape[0]),dtype=Labels.dtype)
23 | new_labels[0,:] = 9488 # start token index is 9488; the total vocab size is then 9489
24 | for i in range(Labels.shape[0]):
25 | new_labels[1:,i] = Labels[i,:]
26 |
27 | Ind = range(len(Labels))
28 | mask = np.ones((16,100))
29 | reward = np.ones((100,))
30 | import random
31 | for i in range(1000):
32 | random.shuffle(Ind)
33 | Loss = []
34 | for j in range(100):  # only the first 100 shuffled mini-batches (10,000 captions) are used per epoch
35 | index = Ind[j*batch_size:(j+1)*batch_size]
36 | batch_sen = new_labels[:,index]
37 | loss = lm.train_on_batch(batch_sen,mask,reward)
38 | Loss.append(loss)
39 | print i,np.mean(Loss)
40 | if i % 10 == 0:
41 | lm.save('warm_model/rnnlm/model.init')
42 |
43 |
--------------------------------------------------------------------------------
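
The label tensor fed to the language model in train_rnnlm.py above is time-major with a start token prepended to every caption. A small sketch of that layout, assuming captions are stored row-wise as 0-padded word indices and 9488 is the start index as in the script:

import numpy as np

START = 9488
labels = np.array([[12, 7, 0, 0],      # two toy captions, 0 = END/padding
                   [5, 3, 9, 0]], dtype=np.int32)
new_labels = np.zeros((labels.shape[1] + 1, labels.shape[0]), dtype=labels.dtype)
new_labels[0, :] = START               # every column starts with the start token
new_labels[1:, :] = labels.T           # time-major: (seq_len + 1, n_captions)
print(new_labels)
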
/train_rnnlm_cider.py:
--------------------------------------------------------------------------------
1 | from caption_model.att import *
2 | from caption_model.fc import *
3 | from mycider import *
4 | from multiprocessing import Pool
5 | from caption_model.rnnlm import *
6 | import os
7 | os.environ["CUDA_VISIBLE_DEVICES"] = "2"
8 | import ngram_opts
9 | from tools import *
10 | from dataloader import *
11 | opts = ngram_opts.parse_opt()
12 | if opts.caption_model == 'fc':
13 | opts.use_att = False
14 | else:
15 | opts.use_att = True
16 |
17 | batch_size = opts.batch_size
18 |
19 | loader = KKDataLoader(opts)
20 | vocabs = loader.get_vocab()
21 | vocab = ['#END#']
22 | for i in range(len(vocabs)):
23 | ids = str(i+1)
24 | vocab.append(vocabs[ids])
25 |
26 | if opts.use_att:
27 | save_dir = 'rnnlm_cider_model/' + 'att_model'
28 | else:
29 | save_dir = 'rnnlm_cider_model/' + 'fc_model'
30 | if not os.path.exists(save_dir):
31 | os.makedirs(save_dir)  # also creates the missing parent directory 'rnnlm_cider_model'
32 | print(save_dir + ' has been created')
33 |
34 |
35 | image_dim = opts.fc_feat_size
36 | vocab_size = loader.vocab_size + 1
37 | cell_size = opts.rnn_size
38 | lr = 0.00005
39 | if opts.use_att:
40 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True)
41 | model.load('warm_model/att_warm/model.init')
42 | else:
43 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True)
44 | model.load('warm_model/fc_warm/model.init')
45 |
46 | word_embed_size = 256
47 | hidden_size = 256
48 | manner = 'sample'
49 | lm = LM(batch_size, hidden_size,vocab_size+1, word_embed_size,lr)
50 | lm.load('warm_model/rnnlm/model.init')
51 |
52 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json'))
53 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0)
54 | def cider_temp(res):
55 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]})
56 | score, _ = cider_scorer.compute_score()
57 | return score
58 |
59 | pool = Pool(processes=4)
60 | logger = Logger(save_dir)
61 | best_score = -1
62 | iters = 0
63 | finish_iter = 1000000
64 | timer = Timer()
65 | timer.tic()
66 | best_count = 0
67 | max_step = 14
68 | while iters < finish_iter:
69 | iters += 1
70 | data = loader.get_batch('train')
71 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
72 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
73 | fc_feats, att_feats = tmp
74 | if opts.use_att:
75 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3])
76 | feature = att_feats
77 | else:
78 | feature = fc_feats
79 |
80 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
81 |
82 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16)
83 | greedy_scores = np.array(pool.map(cider_temp, greedy_res))
84 |
85 | all_caps, all_results, all_scores = [], [], []
86 | for _ in xrange(20):
87 | if opts.use_att:
88 | sample_caps, sample_results = att2_lm_caption(lm,model,image_id,vocab,loader,feature,max_step,'sample')
89 | else:
90 | sample_caps, sample_results = lm2_caption(lm, model, image_id, vocab, loader, feature, max_step,'sample')
91 | # Compute cider scores for sampled captions
92 | sample_scores = np.array(pool.map(cider_temp, sample_results))
93 | all_caps.append(sample_caps)
94 | all_results.append(sample_results)
95 | all_scores.append(sample_scores)
96 |
97 | all_scores = np.array(all_scores)
98 | sample_caps, sample_results, sample_scores = [], [], []
99 | for n in xrange(opts.batch_size):
100 | best_i = all_scores[:, n].argmax()
101 | sample_caps.append(all_caps[best_i][:,n])
102 | sample_results.append(all_results[best_i][n])
103 | sample_scores.append(all_scores[best_i, n])
104 | sample_scores = np.array(sample_scores)
105 | sample_caps = np.array(sample_caps)
106 | sample_caps = sample_caps.transpose()
107 |
108 | mask = np.ones((sample_caps.shape[0],sample_caps.shape[1]))
109 | for n in range(opts.batch_size):
110 | index = np.where(sample_caps[:,n] == 0)[0]
111 | if len(index) > 1:
112 | mask[index[1]:,n] = 0  # keep the first END token in the loss, zero out everything after it
113 |
114 | reward = (sample_scores - greedy_scores).astype(np.float32)
115 | print iters, image_id[0]
116 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption']
117 | print 'sample: ', sample_scores[0], sample_results[0]['caption']
118 |
119 | loss_train = model.train_on_batch(feature,sample_caps, mask, reward)
120 | if iters % 300 == 0:
121 | results = []
122 | for kkk in range(5000/opts.batch_size):
123 | data = loader.get_batch('val')
124 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
125 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
126 | fc_feats, att_feats = tmp
127 | if opts.use_att:
128 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],
129 | att_feats.shape[3])
130 | feature = att_feats
131 | else:
132 | feature = fc_feats
133 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
134 |
135 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16)
136 | results += greedy_res
137 | # Evaluate generated captions
138 | json.dump(results, open(osp.join(save_dir, 'rl_result.json'), 'w'))
139 | gt_file = osp.join('data/features', 'captions_val.json')
140 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'rl_result.json'))[-1]
141 |
142 | if score > best_score:
143 | best_score = score
144 | model.save(osp.join(save_dir, 'model.best'))
145 | model.save(osp.join(save_dir,'model.ckpt'))
146 | # Output training information
147 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec'
148 | .format(iters, -1, score, best_score, finish_iter, timer.toc()))
149 |
150 | train_losses = []
151 | timer.tic()
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
--------------------------------------------------------------------------------
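
A compact sketch of the loss mask built in train_rnnlm_cider.py above: the first END token (index 0) of each sampled caption stays in the loss so the model learns to stop, and every position after it is masked out:

import numpy as np

def end_mask(caps):
    # caps: (T, batch) sampled word indices, 0 == END token
    mask = np.ones(caps.shape, dtype=np.float32)
    for n in range(caps.shape[1]):
        ends = np.where(caps[:, n] == 0)[0]
        if len(ends) > 1:              # keep the first END, drop the rest
            mask[ends[1]:, n] = 0
    return mask

caps = np.array([[4, 6],
                 [2, 0],
                 [0, 0],
                 [0, 0]])
print(end_mask(caps))                  # columns: [1 1 1 0] and [1 1 0 0]
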
/train_sc_cider.py:
--------------------------------------------------------------------------------
1 | from caption_model.att import *
2 | from caption_model.fc import *
3 | from mycider import *
4 | from multiprocessing import Pool
5 | import os
6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
7 | import ngram_opts
8 | from tools import *
9 | from dataloader import *
10 | opts = ngram_opts.parse_opt()
11 | if opts.caption_model == 'fc':
12 | opts.use_att = False
13 | else:
14 | opts.use_att = True
15 |
16 | batch_size = opts.batch_size
17 |
18 | loader = KKDataLoader(opts)
19 | vocabs = loader.get_vocab()
20 | vocab = ['#END#']
21 | for i in range(len(vocabs)):
22 | ids = str(i+1)
23 | vocab.append(vocabs[ids])
24 |
25 | if not os.path.exists('sc_cider_model'):
26 | os.mkdir('sc_cider_model')
27 |
28 | if opts.use_att:
29 | save_dir = 'sc_cider_model/' + 'att_model'
30 | else:
31 | save_dir = 'sc_cider_model/' + 'fc_model'
32 | if not os.path.exists(save_dir):
33 | os.mkdir(save_dir)
34 | print(save_dir + ' has been created')
35 |
36 |
37 | image_dim = 2048
38 | vocab_size = loader.vocab_size + 1
39 | cell_size = 512
40 | lr = 0.00005
41 | if opts.use_att:
42 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=0,on_gpu=True)
43 | model.load('warm_model/att_warm/model.init')
44 | else:
45 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=0,on_gpu=True)
46 | model.load('warm_model/fc_warm/model.init')
47 |
48 |
49 | # Initialize cider-scorer
50 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json'))
51 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0)
52 |
53 | def cider_temp(res):
54 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]})
55 | score, _ = cider_scorer.compute_score()
56 | return score
57 |
58 | pool = Pool(processes=5)
59 |
60 | best_score = -1
61 | logger = Logger(save_dir)
62 | iter = 0
63 | finish_iter = 1000000
64 | timer = Timer()
65 | timer.tic()
66 | while iter < finish_iter:
67 | iter += 1
68 | data = loader.get_batch('train')
69 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
70 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
71 | fc_feats, att_feats = tmp
72 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
73 | if opts.use_att:
74 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3])
75 | feature = att_feats
76 | else:
77 | feature = fc_feats
78 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16)
79 | greedy_scores = np.array(pool.map(cider_temp, greedy_res))
80 |
81 | all_caps, all_results, all_scores = [], [], []
82 | for _ in xrange(20):
83 | # Generate captions by sampling
84 | sample_caps, sample_results = model.inference(vocab, image_id, feature,
85 | manner='sample',
86 | max_length=16)
87 |
88 | # Compute cider scores for sampled captions
89 | sample_scores = np.array(pool.map(cider_temp, sample_results))
90 | all_caps.append(sample_caps)
91 | all_results.append(sample_results)
92 | all_scores.append(sample_scores)
93 |
94 | all_scores = np.array(all_scores)
95 | sample_caps, sample_results, sample_scores = [], [], []
96 | for n in xrange(opts.batch_size):
97 | best_i = all_scores[:, n].argmax()
98 | sample_caps.append(all_caps[best_i][n])
99 | sample_results.append(all_results[best_i][n])
100 | sample_scores.append(all_scores[best_i, n])
101 | sample_scores = np.array(sample_scores)
102 |
103 | max_length = max([cap.shape[0] for cap in sample_caps])
104 | caption = np.zeros([max_length + 2, opts.batch_size], dtype=np.int32)
105 | for n in xrange(opts.batch_size):
106 | L = sample_caps[n].shape[0]
107 | caption[1:L + 1, n] = sample_caps[n]
108 | caption[L + 1:, n] = 0
109 | mask = np.zeros([max_length + 1, opts.batch_size], dtype=np.float32)
110 | for n in xrange(opts.batch_size):
111 | L = sample_caps[n].shape[0]
112 | mask[:L + 1, n] = 1
113 | reward = (sample_scores - greedy_scores).astype(np.float32)
114 | print image_id[0]
115 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption']
116 | print 'sample: ', sample_scores[0], sample_results[0]['caption']
117 | loss_train = model.train_on_batch(feature, caption[1:,:], mask, reward)
118 | if iter % 300 == 0:
119 | results = []
120 | for nn in range(5000/opts.batch_size):
121 | data = loader.get_batch('val')
122 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
123 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
124 | fc_feats, att_feats = tmp
125 | if opts.use_att:
126 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],
127 | att_feats.shape[3])
128 | feature_val = att_feats
129 | else:
130 | feature_val = fc_feats
131 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
132 |
133 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature_val,manner='greedy',max_length=16)
134 | # Generate sentences for validation set
135 | results += greedy_res
136 | # Evaluate generated captions
137 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w'))
138 | gt_file = osp.join('data/features', 'captions_val.json')
139 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1]
140 | # json.dump(results, open(osp.join(save_dir, 'kk_rl_result_'+ str(iter) + '.json'), 'w'))
141 | # Update if finding new best model
142 | if score > best_score:
143 | best_score = score
144 | model.save(osp.join(save_dir, 'model.best'))
145 | model.save(osp.join(save_dir,'model.ckpt'))
146 | # Output training information
147 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec'
148 | .format(iter, -1, score, best_score, finish_iter, timer.toc()))
149 | # Reset loss and timer
150 | train_losses = []
151 | timer.tic()
152 |
153 | # If early-stop condition triggers
154 | if iter > finish_iter:
155 | break
156 |
157 |
--------------------------------------------------------------------------------
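
The caption/mask packing repeated in train_fourgram.py and train_sc_cider.py above (and again in train_trigram.py below) can be read as one helper. A sketch with hypothetical names, keeping the same time-major layout: a leading zero slot, the sampled words in rows 1..L, and one END position covered by the mask:

import numpy as np

def pack_captions(sample_caps, batch_size):
    # sample_caps: list of 1-D int arrays, one sampled caption per image
    max_len = max(cap.shape[0] for cap in sample_caps)
    caption = np.zeros((max_len + 2, batch_size), dtype=np.int32)
    mask = np.zeros((max_len + 1, batch_size), dtype=np.float32)
    for n, cap in enumerate(sample_caps):
        L = cap.shape[0]
        caption[1:L + 1, n] = cap      # words in rows 1..L, 0 (END) elsewhere
        mask[:L + 1, n] = 1            # cover the words plus one END position
    return caption, mask

caption, mask = pack_captions([np.array([7, 3]), np.array([5, 2, 9])], 2)
print(caption)
print(mask)
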
/train_trigram.py:
--------------------------------------------------------------------------------
1 | from caption_model.att import *
2 | from caption_model.fc import *
3 | from mycider import *
4 | from multiprocessing import Pool
5 | import os
6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
7 | import ngram_opts
8 | from tools import *
9 | from dataloader import *
10 | opts = ngram_opts.parse_opt()
11 | if opts.caption_model == 'fc':
12 | opts.use_att = False
13 | else:
14 | opts.use_att = True
15 | batch_size = opts.batch_size
16 |
17 | loader = KKDataLoader(opts)
18 | vocabs = loader.get_vocab()
19 | vocab = ['#END#']
20 | for i in range(len(vocabs)):
21 | ids = str(i+1)
22 | vocab.append(vocabs[ids])
23 |
24 | if not os.path.exists('trigram_cider_model'):
25 | os.mkdir('trigram_cider_model')
26 |
27 | if opts.use_att:
28 | save_dir = 'trigram_cider_model/' + 'att_model'
29 | else:
30 | save_dir = 'trigram_cider_model/' + 'fc_model'
31 | if not os.path.exists(save_dir):
32 | os.mkdir(save_dir)
33 | print(save_dir + ' has been created')
34 |
35 |
36 | image_dim = 2048
37 | vocab_size = loader.vocab_size + 1
38 | cell_size = 512
39 | lr = 0.00005
40 | if opts.use_att:
41 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=3,on_gpu=True)
42 | model.load('warm_model/att_warm/model.init')
43 | else:
44 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=3,on_gpu=True)
45 | model.load('warm_model/fc_warm/model.init')
46 |
47 |
48 | # Initialize cider-scorer
49 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json'))
50 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0)
51 |
52 | def cider_temp(res):
53 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]})
54 | score, _ = cider_scorer.compute_score()
55 | return score
56 |
57 | pool = Pool(processes=5)
58 |
59 | best_score = -1
60 | logger = Logger(save_dir)
61 | iter = 0
62 | finish_iter = 100000
63 | timer = Timer()
64 | timer.tic()
65 | while iter < finish_iter:
66 | iter += 1
67 | data = loader.get_batch('train')
68 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
69 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
70 | fc_feats, att_feats = tmp
71 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
72 | if opts.use_att:
73 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3])
74 | feature = att_feats
75 | else:
76 | feature = fc_feats
77 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16)
78 | greedy_scores = np.array(pool.map(cider_temp, greedy_res))
79 |
80 | all_caps, all_results, all_scores = [], [], []
81 | for _ in xrange(20):
82 | # Generate captions by sampling
83 | sample_caps, sample_results = model.trigram_inference(vocab, image_id, feature,
84 | manner='sample',
85 | max_length=16)
86 | sample_scores = np.array(pool.map(cider_temp, sample_results))
87 | all_caps.append(sample_caps)
88 | all_results.append(sample_results)
89 | all_scores.append(sample_scores)
90 |
91 | all_scores = np.array(all_scores)
92 | sample_caps, sample_results, sample_scores = [], [], []
93 | for n in xrange(opts.batch_size):
94 | best_i = all_scores[:, n].argmax()
95 | sample_caps.append(all_caps[best_i][n])
96 | sample_results.append(all_results[best_i][n])
97 | sample_scores.append(all_scores[best_i, n])
98 | sample_scores = np.array(sample_scores)
99 |
100 | max_length = max([cap.shape[0] for cap in sample_caps])
101 | caption = np.zeros([max_length + 2, opts.batch_size], dtype=np.int32)
102 | for n in xrange(opts.batch_size):
103 | L = sample_caps[n].shape[0]
104 | caption[1:L + 1, n] = sample_caps[n]
105 | caption[L + 1:, n] = 0
106 | mask = np.zeros([max_length + 1, opts.batch_size], dtype=np.float32)
107 | for n in xrange(opts.batch_size):
108 | L = sample_caps[n].shape[0]
109 | mask[:L + 1, n] = 1
110 | reward = (sample_scores - greedy_scores).astype(np.float32)
111 | print image_id[0]
112 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption']
113 | print 'sample: ', sample_scores[0], sample_results[0]['caption']
114 | loss_train = model.train_on_batch(feature, caption[1:,:], mask, reward)
115 | if iter % 300 == 0:
116 | results = []
117 | for nn in range(5000/opts.batch_size):
118 | data = loader.get_batch('val')
119 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
120 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
121 | fc_feats, att_feats = tmp
122 | if opts.use_att:
123 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],
124 | att_feats.shape[3])
125 | feature_val = att_feats
126 | else:
127 | feature_val = fc_feats
128 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)]
129 |
130 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature_val,manner='greedy',max_length=16)
131 | # Generate sentences for validation set
132 | results += greedy_res
133 | # Evaluate generated captions
134 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w'))
135 | gt_file = osp.join('data/features', 'captions_val.json')
136 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1]
137 | if score > best_score:
138 | best_score = score
139 | model.save(osp.join(save_dir, 'model.best'))
140 | model.save(osp.join(save_dir,'model.ckpt'))
141 | # Output training information
142 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec'
143 | .format(iter, -1, score, best_score, finish_iter, timer.toc()))
144 | # Reset loss and timer
145 | train_losses = []
146 | timer.tic()
147 |
148 | # If early-stop condition triggers
149 | if iter > finish_iter:
150 | break
151 |
152 |
--------------------------------------------------------------------------------
/train_warm.py:
--------------------------------------------------------------------------------
1 | from caption_model.att import *
2 | from caption_model.fc import *
3 | import os
4 | import sys
5 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
6 | import ngram_opts
7 | from tools import *
8 | from dataloader import *
9 | opts = ngram_opts.parse_opt()
10 | if opts.caption_model == 'fc':
11 | opts.use_att = False
12 | else:
13 | opts.use_att = True
14 | batch_size = opts.batch_size
15 |
16 | loader = KKDataLoader(opts)
17 | vocabs = loader.get_vocab()
18 | vocab = ['#END#']
19 | for i in range(len(vocabs)):
20 | ids = str(i+1)
21 | vocab.append(vocabs[ids])
22 | if opts.use_att:
23 | save_dir = 'warm_model/' + 'att_warm'
24 | else:
25 | save_dir = 'warm_model/' + 'fc_warm'
26 | if not os.path.exists(save_dir):
27 | os.makedirs(save_dir)  # also creates the parent 'warm_model' directory on the first run
28 | print(save_dir + ' has been created')
29 |
30 | image_dim = opts.fc_feat_size
31 | vocab_size = loader.vocab_size+1 # no start token for ngram model warm start
32 | cell_size = opts.rnn_size
33 | lr = 0.00005
34 | if opts.use_att:
35 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True)
36 | else:
37 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True)
38 |
39 | logger = Logger(save_dir)
40 | iters = 0
41 | best_score = -1
42 | timer = Timer()
43 | timer.tic()
44 | finish_iter = 1000000
45 | while iters < finish_iter:
46 | iters += 1
47 | data = loader.get_batch('train')
48 | tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']]
49 | fc_feats, att_feats, labels, masks = tmp
50 | if opts.use_att:
51 | feature = att_feats.reshape(att_feats.shape[0],att_feats.shape[1]* att_feats.shape[2],att_feats.shape[3])
52 | else:
53 | feature = fc_feats
54 | Label = labels.transpose()[1:,:]
55 | Mask = masks.transpose()[0:-1,:]
56 | reward = np.ones((opts.batch_size*loader.seq_per_img,))  # one reward per (image, caption) pair; all ones for the warm start
57 | loss = model.train_on_batch(feature,Label, Mask, reward)
58 | if iters % 500 == 0:
59 | results = []
60 | for n_batches in range(5000/opts.batch_size):
61 | datas = loader.get_batch('val')
62 | tmp = [datas['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
63 | datas['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
64 | fc_feats, att_feats = tmp
65 | if opts.use_att:
66 | feature = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],
67 | att_feats.shape[3])
68 | else:
69 | feature = fc_feats
70 | image_id = [datas['infos'][i]['id'] for i in range(opts.batch_size)]
71 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature,manner='greedy',max_length=16)
72 | results += greedy_res
73 | json.dump(results, open(osp.join(save_dir, 'tmp_result.json'), 'w'))
74 | gt_file = osp.join('data/features', 'captions_val.json')
75 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'tmp_result.json'))[-1]
76 |
77 | if score > best_score:
78 | best_score = score
79 | model.save(osp.join(save_dir, 'model.init'))
80 | logger.info('[{}],CIDEr score/CIDEr best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec'
81 | .format(iters, score, best_score, finish_iter, timer.toc()))
82 | timer.tic()
83 |
--------------------------------------------------------------------------------
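
train_warm.py above drives the same train_on_batch interface as the RL scripts but with a constant reward of 1. Under the (assumed) reward-weighted log-likelihood objective of self-critical training, that reduces the warm start to ordinary masked cross-entropy, as this toy computation illustrates:

import numpy as np

def reward_weighted_nll(log_probs, mask, reward):
    # log_probs, mask: (T, batch); reward: (batch,), broadcast over time steps
    return -(log_probs * mask * reward[None, :]).sum() / mask.sum()

log_probs = np.log(np.array([[0.5, 0.9],
                             [0.25, 0.8]]))
mask = np.ones_like(log_probs)
print(reward_weighted_nll(log_probs, mask, np.ones(2)))   # plain masked cross-entropy
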
/vis/index.html:
--------------------------------------------------------------------------------
1 | <!-- neuraltalk2 results visualization page; the inline style and script blocks are not included in this dump -->
--------------------------------------------------------------------------------