├── Eval_model.py ├── LICENSE ├── MoreNet.md ├── MoreNet ├── Eval_model.py ├── caption_model │ ├── __init__.py │ ├── att.py │ ├── fc.py │ └── rnnlm.py ├── train_fourgram.py ├── train_rnnlm.py ├── train_rnnlm_cider.py ├── train_sc_cider.py ├── train_trigram.py └── train_warm.py ├── README.md ├── __init__.py ├── caption_model ├── __init__.py ├── att.py ├── fc.py └── rnnlm.py ├── cider └── README.md ├── data ├── README.md └── karpathy │ └── __init__.py ├── dataloader.py ├── get_ngram.py ├── images ├── badending.png ├── fourgram_att.png ├── rnn_att.png └── rnn_fc.png ├── misc ├── __init__.pyc ├── ngram_reward.py ├── ngram_reward.pyc ├── ngram_utils.py ├── ngram_utils.pyc ├── resnet.py ├── resnet.pyc ├── resnet_utils.py ├── resnet_utils.pyc ├── rewards.py ├── rewards.pyc ├── utils.py └── utils.pyc ├── mycider.py ├── ngram_opts.py ├── scripts ├── __init__.py ├── prepro_feats.py ├── prepro_labels.py ├── prepro_ngrams.py ├── resnet.py ├── resnet_utils.py └── utils.py ├── tools.py ├── train_fourgram.py ├── train_rnnlm.py ├── train_rnnlm_cider.py ├── train_sc_cider.py ├── train_trigram.py ├── train_warm.py └── vis ├── index.html └── jquery-1.8.3.min.js /Eval_model.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | import logging 4 | import numpy as np 5 | import os.path as osp 6 | from pycoco.bleu.bleu import Bleu 7 | from pycoco.meteor.meteor import Meteor 8 | from pycoco.rouge.rouge import Rouge 9 | from pycoco.cider.cider import Cider 10 | bad_endings = ['a','an','the','in','for','at','of','with','before','after','on','upon','near','to','is','are','am'] 11 | 12 | def count_bad(sen,max_step): 13 | sen = sen.split(' ') 14 | if len(sen) < max_step and sen[-1] in bad_endings: 15 | return 1 16 | else: 17 | return 0 18 | 19 | 20 | def evaluate(gt_file, re_file, logger=None): 21 | """ 22 | This function is reformed from MSCOCO evaluating code. 23 | The reference sentences are read from gt_file, 24 | the generated sentences to be evaluated are read from res_file 25 | 26 | """ 27 | gts = json.load(open(gt_file, 'r')) 28 | scorers = [ 29 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 30 | (Meteor(), "METEOR"), 31 | (Rouge(), "ROUGE_L"), 32 | (Cider(), "CIDEr") 33 | ] 34 | metrics = [] 35 | res = json.load(open(re_file, 'r')) 36 | res = {c['image_id']: [c['caption']] for c in res} 37 | gts = {k: v for k, v in zip(gts['image_ids'], gts['captions']) if k in res} 38 | for scorer, method in scorers: 39 | if logger is not None: 40 | logger.info('computing %s score...' 
% (scorer.method())) 41 | score, scores = scorer.compute_score(gts, res) 42 | if type(method) == list: 43 | for sc, scs, m in zip(score, scores, method): 44 | if logger is not None: 45 | logger.info("%s: %0.3f" % (m, sc)) 46 | metrics.extend(score) 47 | else: 48 | if logger is not None: 49 | logger.info("%s: %0.3f" % (method, score)) 50 | metrics.append(score) 51 | return metrics 52 | 53 | 54 | 55 | import sys 56 | import ngram_opts 57 | from dataloader import * 58 | opts = ngram_opts.parse_opt() 59 | lr = 0.0005 60 | opts.batch_size = 50 61 | loader = KKDataLoader(opts) 62 | vocabs = loader.get_vocab() 63 | vocab = ['#END#'] 64 | for i in range(len(vocabs)): 65 | ids = str(i+1) 66 | vocab.append(vocabs[ids]) 67 | save_dir = 'eval' 68 | model_type = opts.caption_model # fc or attention 69 | rl_type = opts.rl_type # 'fourgram', 'trigram', 'rnnlm' 70 | batch_size = opts.batch_size 71 | image_dim = 2048 72 | cell_size = 512 73 | if rl_type == 'fourgram': 74 | if model_type == 'att': 75 | from caption_model.att import * 76 | vocab_size = 9489 77 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, 78 | ngram=4, on_gpu=True) 79 | model.load('fourgram_cider_model/att_model/model.best') 80 | results = [] 81 | for kkk in range(5000 / opts.batch_size): 82 | data = loader.get_batch('test') 83 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 84 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 85 | fc_feats, att_feats = tmp 86 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 87 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3]) 88 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20) 89 | results += greedy_res 90 | else: 91 | from caption_model.fc import * 92 | vocab_size = 9489 93 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, 94 | ngram=4, on_gpu=True) 95 | model.load('fourgram_cider_model/fc_model/model.best') 96 | results = [] 97 | for kkk in range(5000 / opts.batch_size): 98 | data = loader.get_batch('test') 99 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 100 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 101 | fc_feats, att_feats = tmp 102 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 103 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20) 104 | results += greedy_res 105 | elif rl_type =='trigram': 106 | if model_type == 'att': 107 | from caption_model.att import * 108 | vocab_size = 9489 109 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, 110 | ngram=3, on_gpu=True) 111 | model.load('trigram_cider_model/att_model/model.best') 112 | results = [] 113 | for kkk in range(5000 / opts.batch_size): 114 | data = loader.get_batch('test') 115 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 116 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 117 | fc_feats, att_feats = tmp 118 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 119 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3]) 120 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20) 121 
| results += greedy_res 122 | else: 123 | from caption_model.fc import * 124 | vocab_size = 9489 125 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, 126 | ngram=3, on_gpu=True) 127 | model.load('trigram_cider_model/fc_model/model.best') 128 | results = [] 129 | for kkk in range(5000 / opts.batch_size): 130 | data = loader.get_batch('test') 131 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 132 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 133 | fc_feats, att_feats = tmp 134 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 135 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20) 136 | results += greedy_res 137 | elif rl_type =='rnnlm': 138 | if model_type == 'att': 139 | from caption_model.att import * 140 | vocab_size = 9489 141 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True) 142 | model.load('rnnlm_cider_model/att_model/model.best') 143 | results = [] 144 | for kkk in range(5000 / opts.batch_size): 145 | data = loader.get_batch('test') 146 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 147 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 148 | fc_feats, att_feats = tmp 149 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 150 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3]) 151 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20) 152 | results += greedy_res 153 | else: 154 | from caption_model.fc import * 155 | vocab_size = 9489 156 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,on_gpu=True) 157 | model.load('rnnlm_cider_model/fc_model/model.best') 158 | results = [] 159 | for kkk in range(5000 / opts.batch_size): 160 | data = loader.get_batch('test') 161 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 162 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 163 | fc_feats, att_feats = tmp 164 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 165 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20) 166 | results += greedy_res 167 | elif rl_type =='sc': 168 | if model_type == 'att': 169 | from caption_model.att import * 170 | vocab_size = 9489 171 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True) 172 | model.load('sc_cider_model/att_model/model.best') 173 | results = [] 174 | for kkk in range(5000 / opts.batch_size): 175 | data = loader.get_batch('test') 176 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 177 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 178 | fc_feats, att_feats = tmp 179 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 180 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3]) 181 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20) 182 | results += greedy_res 183 | else: 184 | from caption_model.fc import * 185 | vocab_size = 9489 186 | model = FCModel(batch_size=batch_size, image_dim=image_dim, 
vocab_size=vocab_size, cell_size=cell_size, lr=lr,on_gpu=True)
187 |         model.load('sc_cider_model/fc_model/model.best')
188 |         results = []
189 |         for kkk in range(5000 / opts.batch_size):
190 |             data = loader.get_batch('test')
191 |             tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
192 |                    data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
193 |             fc_feats, att_feats = tmp
194 |             image_id = [data['infos'][i]['id'] for i in range(batch_size)]
195 |             greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20)
196 |             results += greedy_res
197 | 
198 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w'))
199 | gt_file = osp.join('data/features', 'captions_test.json')
200 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1]
201 | bad_count = [count_bad(results[i]['caption'], 20) for i in range(5000)]  # max_step matches the inference max_length (20)
202 | total_bad_count = sum(bad_count)
203 | print score, total_bad_count
204 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 Tszhang Guo
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/MoreNet.md:
--------------------------------------------------------------------------------
1 | We use the same data preprocessing as in README.md, then copy all files in `MoreNet/` into the main directory, replacing the originals.
2 | ### Warm Start
3 | To help the CIDEr-based REINFORCE algorithm converge more stably and quickly, we first warm-start the captioning model by running the script below:
4 | 
5 | ```bash
6 | $ python train_warm.py --caption_model fc
7 | ```
8 | If you want to use the attention model instead, run
9 | ```bash
10 | $ python train_warm.py --caption_model att
11 | ```
12 | Alternatively, download our pretrained warm-start model from this [link](https://drive.google.com/open?id=1fj_Dgy9Gmxc9t6phzWKaH6DUZZXB-a6T).
13 | 
14 | ### Train using Self-critical
15 | ```bash
16 | $ python train_sc_cider.py --caption_model att
17 | ```
18 | You will also see a large boost in CIDEr score, but with many bad endings.
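The number of bad endings can be checked with `Eval_model.py`, which reports the CIDEr score on the test split together with the count of captions whose last word falls in its `bad_endings` list. A usage sketch (the exact option names are defined in `ngram_opts.py`, so the `--rl_type` flag shown here is an assumption):
```bash
$ python Eval_model.py --caption_model att --rl_type sc
```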
19 | 
20 | ### Train using Ngram constraint
21 | ```bash
22 | $ python train_fourgram.py --caption_model fc
23 | ```
24 | 
25 | ### Train using Neural Language model
26 | 
27 | First, train a neural language model, or download our pretrained LSTM language model from this [link](https://drive.google.com/open?id=1fj_Dgy9Gmxc9t6phzWKaH6DUZZXB-a6T).
28 | ```bash
29 | $ python train_rnnlm.py
30 | ```
31 | 
32 | Then train the RL model with the neural-language-model constraint, starting from the same warm-start model:
33 | ```bash
34 | $ python train_rnnlm_cider.py --caption_model fc
35 | ```
36 | or
37 | ```bash
38 | $ python train_rnnlm_cider.py --caption_model att
39 | ```
40 | 
41 | 
--------------------------------------------------------------------------------
/MoreNet/Eval_model.py:
--------------------------------------------------------------------------------
1 | import time
2 | import json
3 | import logging
4 | import numpy as np
5 | import os.path as osp
6 | from pycoco.bleu.bleu import Bleu
7 | from pycoco.meteor.meteor import Meteor
8 | from pycoco.rouge.rouge import Rouge
9 | from pycoco.cider.cider import Cider
10 | bad_endings = ['a','an','the','in','for','at','of','with','before','after','on','upon','near','to','is','are','am']
11 | 
12 | def count_bad(sen,max_step):
13 |     sen = sen.split(' ')
14 |     if len(sen) < max_step and sen[-1] in bad_endings:
15 |         return 1
16 |     else:
17 |         return 0
18 | 
19 | 
20 | def evaluate(gt_file, re_file, logger=None):
21 |     """
22 |     This function is adapted from the MS COCO evaluation code.
23 |     The reference sentences are read from gt_file,
24 |     the generated sentences to be evaluated are read from re_file.
25 | 
26 |     """
27 |     gts = json.load(open(gt_file, 'r'))
28 |     scorers = [
29 |         (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
30 |         (Meteor(), "METEOR"),
31 |         (Rouge(), "ROUGE_L"),
32 |         (Cider(), "CIDEr")
33 |     ]
34 |     metrics = []
35 |     res = json.load(open(re_file, 'r'))
36 |     res = {c['image_id']: [c['caption']] for c in res}
37 |     gts = {k: v for k, v in zip(gts['image_ids'], gts['captions']) if k in res}
38 |     for scorer, method in scorers:
39 |         if logger is not None:
40 |             logger.info('computing %s score...'
% (scorer.method())) 41 | score, scores = scorer.compute_score(gts, res) 42 | if type(method) == list: 43 | for sc, scs, m in zip(score, scores, method): 44 | if logger is not None: 45 | logger.info("%s: %0.3f" % (m, sc)) 46 | metrics.extend(score) 47 | else: 48 | if logger is not None: 49 | logger.info("%s: %0.3f" % (method, score)) 50 | metrics.append(score) 51 | return metrics 52 | 53 | 54 | 55 | import sys 56 | import ngram_opts 57 | from dataloader import * 58 | opts = ngram_opts.parse_opt() 59 | lr = 0.0005 60 | opts.batch_size = 50 61 | loader = KKDataLoader(opts) 62 | vocabs = loader.get_vocab() 63 | vocab = ['#END#'] 64 | for i in range(len(vocabs)): 65 | ids = str(i+1) 66 | vocab.append(vocabs[ids]) 67 | save_dir = 'eval' 68 | model_type = opts.caption_model # fc or attention 69 | rl_type = opts.rl_type # 'fourgram', 'trigram', 'rnnlm' 70 | batch_size = opts.batch_size 71 | image_dim = 2048 72 | cell_size = 512 73 | if rl_type == 'fourgram': 74 | if model_type == 'att': 75 | from caption_model.att import * 76 | vocab_size = 9489 77 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, 78 | ngram=4, on_gpu=True) 79 | model.load('fourgram_cider_model/att_model/model.best') 80 | results = [] 81 | for kkk in range(5000 / opts.batch_size): 82 | data = loader.get_batch('test') 83 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 84 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 85 | fc_feats, att_feats = tmp 86 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 87 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3]) 88 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20) 89 | results += greedy_res 90 | else: 91 | from caption_model.fc import * 92 | vocab_size = 9489 93 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, 94 | ngram=4, on_gpu=True) 95 | model.load('fourgram_cider_model/fc_model/model.best') 96 | results = [] 97 | for kkk in range(5000 / opts.batch_size): 98 | data = loader.get_batch('test') 99 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 100 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 101 | fc_feats, att_feats = tmp 102 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 103 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20) 104 | results += greedy_res 105 | elif rl_type =='trigram': 106 | if model_type == 'att': 107 | from caption_model.att import * 108 | vocab_size = 9489 109 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, 110 | ngram=3, on_gpu=True) 111 | model.load('trigram_cider_model/att_model/model.best') 112 | results = [] 113 | for kkk in range(5000 / opts.batch_size): 114 | data = loader.get_batch('test') 115 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 116 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 117 | fc_feats, att_feats = tmp 118 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 119 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3]) 120 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20) 121 
| results += greedy_res 122 | else: 123 | from caption_model.fc import * 124 | vocab_size = 9489 125 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, 126 | ngram=3, on_gpu=True) 127 | model.load('trigram_cider_model/fc_model/model.best') 128 | results = [] 129 | for kkk in range(5000 / opts.batch_size): 130 | data = loader.get_batch('test') 131 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 132 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 133 | fc_feats, att_feats = tmp 134 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 135 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20) 136 | results += greedy_res 137 | elif rl_type =='rnnlm': 138 | if model_type == 'att': 139 | from caption_model.att import * 140 | vocab_size = 9489 141 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True) 142 | model.load('rnnlm_cider_model/att_model/model.best') 143 | results = [] 144 | for kkk in range(5000 / opts.batch_size): 145 | data = loader.get_batch('test') 146 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 147 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 148 | fc_feats, att_feats = tmp 149 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 150 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3]) 151 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20) 152 | results += greedy_res 153 | else: 154 | from caption_model.fc import * 155 | vocab_size = 9489 156 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr,on_gpu=True) 157 | model.load('rnnlm_cider_model/fc_model/model.best') 158 | results = [] 159 | for kkk in range(5000 / opts.batch_size): 160 | data = loader.get_batch('test') 161 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 162 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 163 | fc_feats, att_feats = tmp 164 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 165 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20) 166 | results += greedy_res 167 | elif rl_type =='sc': 168 | if model_type == 'att': 169 | from caption_model.att import * 170 | vocab_size = 9489 171 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True) 172 | model.load('sc_cider_model/att_model/model.best') 173 | results = [] 174 | for kkk in range(5000 / opts.batch_size): 175 | data = loader.get_batch('test') 176 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 177 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 178 | fc_feats, att_feats = tmp 179 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 180 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2],att_feats.shape[3]) 181 | greedy_cap, greedy_res = model.inference(vocab, image_id, att_feats, manner='greedy', max_length=20) 182 | results += greedy_res 183 | else: 184 | from caption_model.fc import * 185 | vocab_size = 9489 186 | model = FCModel(batch_size=batch_size, image_dim=image_dim, 
vocab_size=vocab_size, cell_size=cell_size, lr=lr,on_gpu=True) 187 | model.load('sc_cider_model/fc_model/model.best') 188 | results = [] 189 | for kkk in range(5000 / opts.batch_size): 190 | data = loader.get_batch('test') 191 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 192 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 193 | fc_feats, att_feats = tmp 194 | image_id = [data['infos'][i]['id'] for i in range(batch_size)] 195 | greedy_cap, greedy_res = model.inference(vocab, image_id, fc_feats, manner='greedy', max_length=20) 196 | results += greedy_res 197 | 198 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w')) 199 | gt_file = osp.join('data/features', 'captions_test.json') 200 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1] 201 | bad_count = [count_bad(results[i]['caption']) for i in range(5000)] 202 | total_bad_count = sum(bad_count) 203 | print score , total_bad_count 204 | -------------------------------------------------------------------------------- /MoreNet/caption_model/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /MoreNet/caption_model/fc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | import pickle 7 | class FCModel(nn.Module): 8 | def __init__(self, batch_size, cell_size, image_dim, 9 | vocab_size, lr,ngram=0, on_gpu=False): 10 | super(FCModel, self).__init__() 11 | # Settings 12 | self.batch_size = batch_size 13 | self.cell_size = cell_size 14 | self.image_dim = image_dim 15 | self.vocab_size = vocab_size 16 | self.lr = lr 17 | self.on_gpu = on_gpu 18 | 19 | # Word embedding lookup table 20 | self.word_embedding = nn.Embedding(vocab_size, cell_size) 21 | 22 | # Image embedding mlp 23 | self.image_embedding = nn.Linear(image_dim, cell_size, bias=False) 24 | 25 | # State initializer 26 | self.c_initializer = nn.Linear(cell_size, cell_size, bias=False) 27 | self.h_initializer = nn.Linear(cell_size, cell_size, bias=False) 28 | 29 | # Recurrent layer 30 | self.rnn = nn.LSTMCell(cell_size, cell_size) 31 | 32 | # Word predicting mlp 33 | self.predictor = nn.Linear(cell_size, vocab_size) 34 | if ngram == 3: 35 | trigram = pickle.load(open('data/trigram.pkl')) 36 | self.trigram_mask = {} 37 | for tri in trigram: 38 | temp = np.zeros((vocab_size,)) 39 | for word in trigram[tri]: 40 | temp[word] = 1 41 | self.trigram_mask[tri] = temp 42 | elif ngram == 4: 43 | fourgram = pickle.load(open('data/fourgram.pkl')) 44 | self.fourgram_mask = {} 45 | for four in fourgram: 46 | temp = np.zeros((vocab_size,)) 47 | for word in fourgram[four]: 48 | temp[word] = 1 49 | self.fourgram_mask[four] = temp 50 | 51 | # Onehot encoder 52 | self.onehot = torch.torch.eye(vocab_size) 53 | if self.on_gpu: 54 | self.onehot = self.onehot.cuda() 55 | 56 | # Optimizer 57 | self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr) 58 | 59 | # Move to gpu if necessary 60 | if self.on_gpu: 61 | self.cuda() 62 | 63 | def forward(self, word_emb, state): 64 | # Get states 65 | h_tm1, c_tm1 = state 66 | 67 | # RNN input 68 | h_t, c_t = self.rnn(word_emb, (h_tm1, c_tm1)) 69 | 70 | # Next word's logtis 71 | logits = self.predictor(h_t) 72 | 73 | return logits, (h_t, c_t) 74 | 75 | def 
initial_state(self, image): 76 | image = Variable(torch.Tensor(image)) 77 | if self.on_gpu: 78 | image = image.cuda() 79 | 80 | # Image embedding 81 | feat = self.image_embedding(image) 82 | 83 | # Initial state (batch_size, rnn_size) 84 | h0 = nn.Tanh()(self.h_initializer(feat)) 85 | c0 = nn.Tanh()(self.c_initializer(feat)) 86 | 87 | return h0, c0 88 | 89 | def train_on_batch(self, image, sentence, mask, reward): 90 | # Convert numpy to torch 91 | self.batch_size = image.shape[0] 92 | sentence = Variable(torch.LongTensor(sentence.tolist())) 93 | mask = torch.Tensor(mask) 94 | reward = torch.Tensor(reward) 95 | T = sentence.size()[0] - 1 96 | 97 | # If using gpu 98 | if self.on_gpu: 99 | sentence = sentence.cuda() 100 | mask = mask.cuda() 101 | reward = reward.cuda() 102 | 103 | # Initial state of RNN 104 | state = self.initial_state(image) 105 | 106 | # Word embedding for input sequence 107 | inputs = self.word_embedding(sentence[:-1, :]) 108 | 109 | # Recurrent computation 110 | logits = [] 111 | for i in xrange(T): 112 | word = inputs[i, :, :] 113 | logit, state = self.forward(word, state) 114 | logits.append(logit.unsqueeze(0)) 115 | logits = torch.cat(logits, 0) # (T, batch_size, vocab_size) 116 | logits = logits.resize(T*self.batch_size, self.vocab_size) 117 | 118 | # Next word's distribution 119 | prob = nn.Softmax()(logits).data 120 | 121 | # Ground-truth 122 | targets = sentence.data[1:, :].view(T*self.batch_size) 123 | gt_prob = self.onehot.index_select(0, targets) 124 | 125 | # Gradients 126 | logit_grad = (prob - gt_prob).view(T, self.batch_size, self.vocab_size) 127 | logit_grad = logit_grad * mask.view(T, self.batch_size, 1).expand_as(logit_grad) 128 | logit_grad = logit_grad * reward.view(1, self.batch_size, 1).expand_as(logit_grad) 129 | logit_grad = logit_grad / mask.sum(0).view(1, self.batch_size, 1).expand_as(logit_grad) 130 | logit_grad = logit_grad / self.batch_size 131 | logit_grad = logit_grad.view(T*self.batch_size, self.vocab_size) 132 | 133 | # Gradient descent 134 | self.optimizer.zero_grad() 135 | logits.backward(gradient=logit_grad) 136 | self.optimizer.step() 137 | loss = -1 138 | 139 | return loss 140 | 141 | def single_step(self, state, words, manner='greedy'): 142 | words = Variable(torch.LongTensor(words), volatile=True) 143 | if self.on_gpu: 144 | words = words.cuda() 145 | 146 | # Word embedding 147 | words = self.word_embedding(words.unsqueeze(0)).squeeze(0) 148 | 149 | # Take a rnn step 150 | logits, new_state = self.forward(words, state) 151 | 152 | # Next words 153 | if manner == 'greedy': 154 | new_words = logits.data.cpu().numpy().argmax(1) 155 | elif manner == 'sample': 156 | # Gumbel argmax trick 157 | if self.on_gpu: 158 | U = torch.cuda.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1) 159 | else: 160 | U = torch.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1) 161 | V = logits.data - torch.log(-U.log()) 162 | new_words = V.cpu().numpy().argmax(1) 163 | else: 164 | raise ValueError('Unknown manner: [{}]'.format(manner)) 165 | 166 | return new_state, new_words 167 | 168 | def inference(self, vocab, image_ids, image, manner='greedy', 169 | max_length=16, verbose=0, batch_size=None): 170 | # Choose batch-size 171 | self.batch_size = image.shape[0] 172 | 173 | # Beginning tokens 174 | init_word = torch.LongTensor([0] * self.batch_size) 175 | 176 | # Initiazization 177 | results = [] 178 | captions = [] 179 | 180 | # Iteratively generate words 181 | state = self.initial_state(image) 182 | sentences = [] 183 | word = init_word 
184 | for _ in xrange(max_length): 185 | state, word = self.single_step(state, word, manner=manner) 186 | sentences.append(word) 187 | sentences = np.array(sentences).transpose() 188 | 189 | # Translate indexes to sentences 190 | for j in xrange(sentences.shape[0]): 191 | idxs = np.where(sentences[j, :] == 0)[0] 192 | end_index = idxs[0] if len(idxs) > 0 else max_length 193 | cap = ' '.join([vocab[w] for w in sentences[j, :end_index]]) 194 | if verbose > 0: 195 | print 'id={}, {}'.format(image_ids[j], cap) 196 | captions.append(sentences[j, :end_index]) 197 | results.append({'image_id': image_ids[j], 'caption': cap}) 198 | # Type: captions (np.array), results (natural language) 199 | return captions, results 200 | 201 | def ngram_single_step(self, state, words,temp_mask, manner='greedy'): 202 | words = Variable(torch.LongTensor(words), volatile=True) 203 | Temp_mask = Variable(torch.Tensor(temp_mask)) 204 | if self.on_gpu: 205 | words = words.cuda() 206 | Temp_mask = Temp_mask.cuda() 207 | Temp_mask = Temp_mask * 100000 - 100000 208 | # Word embedding 209 | words = self.word_embedding(words.unsqueeze(0)).squeeze(0) 210 | 211 | # Take a rnn step 212 | logits, new_state = self.forward(words, state) 213 | logits = logits + Temp_mask 214 | # Next words 215 | if manner == 'greedy': 216 | new_words = logits.data.cpu().numpy().argmax(1) 217 | elif manner == 'sample': 218 | # Gumbel argmax trick 219 | if self.on_gpu: 220 | U = torch.cuda.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1) 221 | else: 222 | U = torch.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1) 223 | V = logits.data - torch.log(-U.log()) 224 | new_words = V.cpu().numpy().argmax(1) 225 | else: 226 | raise ValueError('Unknown manner: [{}]'.format(manner)) 227 | 228 | return new_state, new_words 229 | 230 | def fourgram_inference(self, vocab, image_ids, image, manner='greedy', 231 | max_length=16): 232 | # Choose batch-size 233 | self.batch_size = image.shape[0] 234 | 235 | # Beginning tokens 236 | init_word = torch.LongTensor([9488] * self.batch_size) 237 | 238 | # Initiazization 239 | results = [] 240 | captions = [] 241 | 242 | # Iteratively generate words 243 | state = self.initial_state(image) 244 | sentences = [] 245 | word = init_word 246 | sentencemask = np.zeros((max_length + 3, self.batch_size), dtype=np.int32) 247 | sentencemask[0:3,:] = 9488 248 | for jj in xrange(max_length): 249 | temp_mask = self.get_four_Mask(sentencemask,jj+3) 250 | state, word = self.ngram_single_step(state, word,temp_mask,manner=manner) 251 | sentencemask[jj+3,:] = word 252 | sentences.append(word) 253 | sentences = np.array(sentences).transpose() 254 | 255 | # Translate indexes to sentences 256 | for j in xrange(sentences.shape[0]): 257 | idxs = np.where(sentences[j, :] == 0)[0] 258 | end_index = idxs[0] if len(idxs) > 0 else max_length 259 | cap = ' '.join([vocab[w] for w in sentences[j, :end_index]]) 260 | captions.append(sentences[j, :end_index]) 261 | results.append({'image_id': image_ids[j], 'caption': cap}) 262 | 263 | # Type: captions (np.array), results (natural language) 264 | return captions, results 265 | 266 | def get_four_Mask(self,sentencemask,index): 267 | tempmask = np.zeros((self.batch_size,self.vocab_size)) 268 | for hh in range(self.batch_size): 269 | temp = tuple(list(sentencemask[index-3:index,hh])) 270 | if temp in self.fourgram_mask: 271 | tempmask[hh] = self.fourgram_mask[temp] 272 | else: 273 | tempmask[hh][0] = 1 # END token 274 | return tempmask 275 | 276 | def trigram_inference(self, vocab, 
image_ids, image, manner='greedy',max_length=16): 277 | # Choose batch-size 278 | self.batch_size = image.shape[0] 279 | results = [] 280 | captions = [] 281 | 282 | # Iteratively generate words 283 | state = self.initial_state(image) 284 | sentences = [] 285 | init_word = torch.LongTensor([9488] * self.batch_size) 286 | word = init_word 287 | sentencemask = np.zeros((max_length + 2, self.batch_size), dtype=np.int32) 288 | sentencemask[0:2,:] = 9488 289 | for jj in xrange(max_length): 290 | temp_mask = self.get_tri_Mask(sentencemask,jj+2) 291 | state, word = self.ngram_single_step(state, word,temp_mask,manner=manner) 292 | sentencemask[jj+2,:] = word 293 | sentences.append(word) 294 | sentences = np.array(sentences).transpose() 295 | 296 | # Translate indexes to sentences 297 | for j in xrange(sentences.shape[0]): 298 | idxs = np.where(sentences[j, :] == 0)[0] 299 | end_index = idxs[0] if len(idxs) > 0 else max_length 300 | cap = ' '.join([vocab[w] for w in sentences[j, :end_index]]) 301 | captions.append(sentences[j, :end_index]) 302 | results.append({'image_id': image_ids[j], 'caption': cap}) 303 | 304 | # Type: captions (np.array), results (natural language) 305 | return captions, results 306 | 307 | def get_tri_Mask(self,sentencemask,index): 308 | tempmask = np.zeros((self.batch_size,self.vocab_size)) 309 | for hh in range(self.batch_size): 310 | temp = tuple(list(sentencemask[index-2:index,hh])) 311 | if temp in self.trigram_mask: 312 | tempmask[hh] = self.trigram_mask[temp] 313 | else: 314 | tempmask[hh][0] = 1 # END token 315 | return tempmask 316 | 317 | def save(self, file_path): 318 | with open(file_path, 'wb') as f: 319 | torch.save(self.state_dict(), f) 320 | 321 | def load(self, file_path): 322 | with open(file_path, 'rb') as f: 323 | self.load_state_dict(torch.load(f)) 324 | 325 | -------------------------------------------------------------------------------- /MoreNet/caption_model/rnnlm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import numpy as np 6 | 7 | class LM(nn.Module): 8 | def __init__(self, batch_size, hidden_size, 9 | vocab_size, word_embed_size, 10 | lr, on_gpu=True): 11 | super(LM, self).__init__() 12 | self.lr = lr 13 | self.vocab_size = vocab_size 14 | self.word_embed_size = word_embed_size 15 | self.hidden_size = hidden_size 16 | self.batch_size = batch_size 17 | self.on_gpu = on_gpu 18 | 19 | # word embedding layer 20 | self.word_embedding_layer = nn.Embedding(vocab_size,word_embed_size) 21 | 22 | # language model LSTM 23 | self.rnn = nn.LSTMCell(word_embed_size,hidden_size) 24 | 25 | # predict layer 26 | self.predict_layer = nn.Linear(hidden_size,vocab_size) 27 | 28 | # Optimizer 29 | self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr) 30 | 31 | self.onehot = torch.torch.eye(vocab_size) 32 | if self.on_gpu: 33 | self.onehot = self.onehot.cuda() 34 | # Move to gpu if necessary 35 | if self.on_gpu: 36 | self.cuda() 37 | def init_state(self): 38 | h = Variable(torch.zeros(self.batch_size, self.hidden_size)) 39 | c = Variable(torch.zeros(self.batch_size, self.hidden_size)) 40 | if self.on_gpu: 41 | h = h.cuda() 42 | c = c.cuda() 43 | return h,c 44 | 45 | def forward(self, word_emb, state): 46 | # Get states 47 | h_tm1, c_tm1 = state 48 | # RNN input 49 | h_t, c_t = self.rnn(word_emb, (h_tm1, c_tm1)) 50 | # Next word's logtis 51 | logits = self.predict_layer(h_t) 52 | return logits, (h_t, 
c_t) 53 | 54 | def train_on_batch(self,sentence, mask, reward): 55 | # Convert numpy to torch 56 | sentence = Variable(torch.LongTensor(sentence.tolist())) 57 | mask = torch.Tensor(mask) 58 | reward = torch.Tensor(reward) 59 | T = sentence.size()[0] - 1 60 | # If using gpu 61 | 62 | sentence = sentence.cuda() 63 | mask = mask.cuda() 64 | reward = reward.cuda() 65 | 66 | # Initial state of RNN 67 | state = self.init_state() 68 | # Word embedding for input sequence 69 | inputs = self.word_embedding_layer(sentence[:-1, :]) 70 | # Recurrent computation 71 | logits = [] 72 | for i in xrange(T): 73 | word = inputs[i, :, :] 74 | logit, state = self.forward(word, state) 75 | logits.append(logit.unsqueeze(0)) 76 | logits = torch.cat(logits, 0) # (T, batch_size, vocab_size) 77 | logits = logits.resize(T*self.batch_size, self.vocab_size) 78 | # Next word's distribution 79 | prob = F.softmax(logits).data 80 | # Ground-truth 81 | targets = sentence.data[1:, :].view(T*self.batch_size) 82 | gt_prob = self.onehot.index_select(0, targets) 83 | # Gradients 84 | logit_grad = (prob - gt_prob).view(T, self.batch_size, self.vocab_size) 85 | logit_grad = logit_grad * mask.view(T, self.batch_size, 1).expand_as(logit_grad) 86 | logit_grad = logit_grad * reward.view(1, self.batch_size, 1).expand_as(logit_grad) 87 | logit_grad = logit_grad / mask.sum(0).view(1, self.batch_size, 1).expand_as(logit_grad) 88 | logit_grad = logit_grad / self.batch_size 89 | logit_grad = logit_grad.view(T*self.batch_size, self.vocab_size) 90 | 91 | # Gradient descent 92 | self.optimizer.zero_grad() 93 | logits.backward(gradient=logit_grad) 94 | self.optimizer.step() 95 | targets = targets.cpu().numpy() 96 | loss = - np.log(prob[np.arange(T * self.batch_size), targets]) 97 | return loss.mean().numpy() 98 | 99 | def test_on_batch(self,sentence, mask): 100 | sentence = Variable(torch.LongTensor(sentence.tolist())) 101 | mask = torch.Tensor(mask) 102 | T = sentence.size()[0] - 1 103 | # If using gpu 104 | if self.on_gpu: 105 | sentence = sentence.cuda() 106 | mask = mask.cuda() 107 | # Initial state of RNN 108 | state = self.init_state() 109 | # Word embedding for input sequence 110 | inputs = self.word_embedding_layer(sentence[:-1, :]) 111 | # Recurrent computation 112 | logits = [] 113 | for i in xrange(T): 114 | word = inputs[i, :, :] 115 | logit, state = self.forward(word, state) 116 | logits.append(logit.unsqueeze(0)) 117 | logits = torch.cat(logits, 0) # (T, batch_size, vocab_size) 118 | logits = logits.resize(T * self.batch_size, self.vocab_size) 119 | # Next word's distribution 120 | prob = F.softmax(logits).data 121 | prob = prob.view(T,self.batch_size,self.vocab_size) 122 | # Ground-truth 123 | return prob 124 | 125 | def single_step_prob(self,word,state): 126 | word = Variable(torch.LongTensor(word.tolist())) 127 | if self.on_gpu: 128 | word = word.cuda() 129 | word_emb = self.word_embedding_layer(word) 130 | logit, state2 = self.forward(word_emb, state) # logit : (batch_size, vocab_size) 131 | prob = F.softmax(logit).data # (batch_size, vocab_size) 132 | return prob,state2 133 | 134 | def save(self, file_path): 135 | with open(file_path, 'wb') as f: 136 | torch.save(self.state_dict(), f) 137 | 138 | def load(self, file_path): 139 | with open(file_path, 'rb') as f: 140 | self.load_state_dict(torch.load(f)) 141 | -------------------------------------------------------------------------------- /MoreNet/train_fourgram.py: -------------------------------------------------------------------------------- 1 | from caption_model.att 
import * 2 | from caption_model.fc import * 3 | from mycider import * 4 | from multiprocessing import Pool 5 | import os 6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 7 | import ngram_opts 8 | from tools import * 9 | from dataloader import * 10 | opts = ngram_opts.parse_opt() 11 | if opts.caption_model == 'fc': 12 | opts.use_att = False 13 | else: 14 | opts.use_att = True 15 | 16 | batch_size = opts.batch_size 17 | 18 | loader = KKDataLoader(opts) 19 | vocabs = loader.get_vocab() 20 | vocab = ['#END#'] 21 | for i in range(len(vocabs)): 22 | ids = str(i+1) 23 | vocab.append(vocabs[ids]) 24 | 25 | if not os.path.exists('fourgram_cider_model'): 26 | os.mkdir('fourgram_cider_model') 27 | 28 | if opts.use_att: 29 | save_dir = 'fourgram_cider_model/' + 'att_model' 30 | else: 31 | save_dir = 'fourgram_cider_model/' + 'fc_model' 32 | if not os.path.exists(save_dir): 33 | os.mkdir(save_dir) 34 | print(save_dir + ' has been built') 35 | 36 | 37 | image_dim = 2048 38 | vocab_size = loader.vocab_size + 2 39 | cell_size = 512 40 | lr = 0.00005 41 | if opts.use_att: 42 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=4,on_gpu=True) 43 | model.load('warm_model/att_warm/model.init') 44 | else: 45 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=4,on_gpu=True) 46 | model.load('warm_model/fc_warm/model.init') 47 | 48 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json')) 49 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0) 50 | 51 | def cider_temp(res): 52 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]}) 53 | score, _ = cider_scorer.compute_score() 54 | return score 55 | 56 | pool = Pool(processes=5) 57 | best_score = -1 58 | logger = Logger(save_dir) 59 | iter = 0 60 | finish_iter = 100000 61 | timer = Timer() 62 | timer.tic() 63 | while iter < finish_iter: 64 | iter += 1 65 | data = loader.get_batch('train') 66 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 67 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 68 | fc_feats, att_feats = tmp 69 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 70 | if opts.use_att: 71 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3]) 72 | feature = att_feats 73 | else: 74 | feature = fc_feats 75 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16) 76 | greedy_scores = np.array(pool.map(cider_temp, greedy_res)) 77 | 78 | all_caps, all_results, all_scores = [], [], [] 79 | for _ in xrange(20): 80 | # Generate captions by sampling 81 | sample_caps, sample_results = model.fourgram_inference(vocab, image_id, feature, 82 | manner='sample', 83 | max_length=16) 84 | 85 | # Compute cider scores for sampled captions 86 | sample_scores = np.array(pool.map(cider_temp, sample_results)) 87 | all_caps.append(sample_caps) 88 | all_results.append(sample_results) 89 | all_scores.append(sample_scores) 90 | 91 | all_scores = np.array(all_scores) 92 | sample_caps, sample_results, sample_scores = [], [], [] 93 | for n in xrange(opts.batch_size): 94 | best_i = all_scores[:, n].argmax() 95 | sample_caps.append(all_caps[best_i][n]) 96 | sample_results.append(all_results[best_i][n]) 97 | sample_scores.append(all_scores[best_i, n]) 98 | sample_scores = np.array(sample_scores) 99 | 100 | max_length = max([cap.shape[0] for 
cap in sample_caps]) 101 | caption = np.zeros([max_length + 2, opts.batch_size], dtype=np.int32) 102 | for n in xrange(opts.batch_size): 103 | L = sample_caps[n].shape[0] 104 | caption[1:L + 1, n] = sample_caps[n] 105 | caption[L + 1:, n] = 0 106 | caption[0,:] = 9488 107 | mask = np.zeros([max_length + 1, opts.batch_size], dtype=np.float32) 108 | for n in xrange(opts.batch_size): 109 | L = sample_caps[n].shape[0] 110 | mask[:L + 1, n] = 1 111 | reward = (sample_scores - greedy_scores).astype(np.float32) 112 | print image_id[0] 113 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption'] 114 | print 'sample: ', sample_scores[0], sample_results[0]['caption'] 115 | loss_train = model.train_on_batch(feature, caption[1:,:], mask, reward) 116 | if iter % 300 == 0: 117 | results = [] 118 | for nn in range(5000/opts.batch_size): 119 | data = loader.get_batch('val') 120 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 121 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 122 | fc_feats, att_feats = tmp 123 | if opts.use_att: 124 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], 125 | att_feats.shape[3]) 126 | feature_val = att_feats 127 | else: 128 | feature_val = fc_feats 129 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 130 | 131 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature_val,manner='greedy',max_length=16) 132 | # Generate sentences for validation set 133 | results += greedy_res 134 | # Evaluate generated captions 135 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w')) 136 | gt_file = osp.join('data/features', 'captions_val.json') 137 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1] 138 | 139 | if score > best_score: 140 | best_score = score 141 | model.save(osp.join(save_dir, 'model.best')) 142 | model.save(osp.join(save_dir,'model.ckpt')) 143 | # Output training information 144 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec' 145 | .format(iter, -1, score, best_score, finish_iter, timer.toc())) 146 | # Reset loss and timer 147 | train_losses = [] 148 | timer.tic() 149 | 150 | # If early-stop condition triggers 151 | if iter > finish_iter: 152 | break 153 | 154 | -------------------------------------------------------------------------------- /MoreNet/train_rnnlm.py: -------------------------------------------------------------------------------- 1 | import ngram_opts 2 | from dataloader import * 3 | import os 4 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 5 | from caption_model.rnnlm import * 6 | 7 | opts = ngram_opts.parse_opt() 8 | if opts.caption_model == 'fc': 9 | opts.use_att = False 10 | else: 11 | opts.use_att = True 12 | loader = KKDataLoader(opts) 13 | 14 | 15 | batch_size = 100 16 | hidden_size = 256 17 | word_embed_size = 256 18 | vocab_size = loader.vocab_size + 2 # set start token 19 | lr = 0.0001 20 | lm = LM(batch_size, hidden_size,vocab_size, word_embed_size,lr) 21 | lm.load('warm_model/rnnlm/model.init') 22 | Labels = loader.h5_label_file['labels'] 23 | new_labels = np.zeros((Labels.shape[1]+1,Labels.shape[0]),dtype=Labels.dtype) 24 | new_labels[0,:] = 9488 # Set start token to 9488, the total vocab size is 9489 25 | for i in range(Labels.shape[0]): 26 | new_labels[1:,i] = Labels[i,:] 27 | 28 | Ind = range(len(Labels)) 29 | mask = np.ones((16,100)) 30 | reward = np.ones((100,)) 31 | import random 32 | for i in range(1000): 33 | 
random.shuffle(Ind) 34 | Loss = [] 35 | for j in range(100): 36 | index = Ind[j*batch_size:(j+1)*batch_size] 37 | batch_sen = new_labels[:,index] 38 | loss = lm.train_on_batch(batch_sen,mask,reward) 39 | Loss.append(loss) 40 | print i,np.mean(Loss) 41 | if i % 10 == 0: 42 | lm.save('warm_model2/rnnlm/model.init') 43 | 44 | -------------------------------------------------------------------------------- /MoreNet/train_rnnlm_cider.py: -------------------------------------------------------------------------------- 1 | from caption_model.att import * 2 | from caption_model.fc import * 3 | from mycider import * 4 | from multiprocessing import Pool 5 | from caption_model.rnnlm import * 6 | import os 7 | #os.environ["CUDA_VISIBLE_DEVICES"] = "2" 8 | import ngram_opts 9 | from tools import * 10 | from dataloader import * 11 | opts = ngram_opts.parse_opt() 12 | if opts.caption_model == 'fc': 13 | opts.use_att = False 14 | else: 15 | opts.use_att = True 16 | 17 | batch_size = opts.batch_size 18 | 19 | loader = KKDataLoader(opts) 20 | vocabs = loader.get_vocab() 21 | vocab = ['#END#'] 22 | for i in range(len(vocabs)): 23 | ids = str(i+1) 24 | vocab.append(vocabs[ids]) 25 | 26 | if opts.use_att: 27 | save_dir = 'rnnlm_cider_model/' + 'att_model' 28 | else: 29 | save_dir = 'rnnlm_cider_model/' + 'fc_model' 30 | if not os.path.exists(save_dir): 31 | os.mkdir(save_dir) 32 | print(save_dir + ' has been built') 33 | 34 | 35 | image_dim = opts.fc_feat_size 36 | vocab_size = loader.vocab_size + 2 37 | cell_size = opts.rnn_size 38 | lr = 0.00005 39 | if opts.use_att: 40 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True) 41 | model.load('warm_model/att_warm/model.init') 42 | else: 43 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True) 44 | model.load('warm_model/fc_warm/model.init') 45 | 46 | word_embed_size = 256 47 | hidden_size = 256 48 | manner = 'sample' 49 | lm = LM(batch_size, hidden_size,vocab_size+1, word_embed_size,lr) 50 | lm.load('warm_model/rnnlm/model.init') 51 | 52 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json')) 53 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0) 54 | def cider_temp(res): 55 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]}) 56 | score, _ = cider_scorer.compute_score() 57 | return score 58 | 59 | pool = Pool(processes=4) 60 | logger = Logger(save_dir) 61 | best_score = -1 62 | iters = 0 63 | finish_iter = 100 64 | timer = Timer() 65 | timer.tic() 66 | best_count = 0 67 | max_step = 14 68 | while iters < finish_iter: 69 | iters += 1 70 | data = loader.get_batch('train') 71 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 72 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 73 | fc_feats, att_feats = tmp 74 | if opts.use_att: 75 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3]) 76 | feature = att_feats 77 | else: 78 | feature = fc_feats 79 | 80 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 81 | 82 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16) 83 | greedy_scores = np.array(pool.map(cider_temp, greedy_res)) 84 | 85 | all_caps, all_results, all_scores = [], [], [] 86 | for _ in xrange(30): 87 | # Generate captions by sampling 88 | if opts.use_att: 89 | sample_caps, sample_results = 
att_lm_caption(lm,model,image_id,vocab,loader,feature,max_step,'sample') 90 | else: 91 | sample_caps, sample_results = lm_caption(lm, model, image_id, vocab, loader, feature, max_step,'sample') 92 | # Compute cider scores for sampled captions 93 | sample_scores = np.array(pool.map(cider_temp, sample_results)) 94 | all_caps.append(sample_caps) 95 | all_results.append(sample_results) 96 | all_scores.append(sample_scores) 97 | 98 | all_scores = np.array(all_scores) 99 | sample_caps, sample_results, sample_scores = [], [], [] 100 | for n in xrange(opts.batch_size): 101 | best_i = all_scores[:, n].argmax() 102 | sample_caps.append(all_caps[best_i][:,n]) 103 | sample_results.append(all_results[best_i][n]) 104 | sample_scores.append(all_scores[best_i, n]) 105 | sample_scores = np.array(sample_scores) 106 | sample_caps = np.array(sample_caps) 107 | sample_caps = sample_caps.transpose() 108 | 109 | mask = np.ones((sample_caps.shape[0]-1,sample_caps.shape[1])) 110 | for n in range(opts.batch_size): 111 | index = np.where(sample_caps[:,n] == 0)[0] 112 | if len(index) > 1: 113 | mask[index[1]-1:,n] = 0 114 | 115 | reward = (sample_scores - greedy_scores).astype(np.float32) 116 | sample_caps[0,:] = 9488 117 | print iters, image_id[0] 118 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption'] 119 | print 'sample: ', sample_scores[0], sample_results[0]['caption'] 120 | 121 | loss_train = model.train_on_batch(feature,sample_caps, mask, reward) 122 | if iters % 300 == 0: 123 | results = [] 124 | for kkk in range(5000/opts.batch_size): 125 | data = loader.get_batch('val') 126 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 127 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 128 | fc_feats, att_feats = tmp 129 | if opts.use_att: 130 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], 131 | att_feats.shape[3]) 132 | feature = att_feats 133 | else: 134 | feature = fc_feats 135 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 136 | 137 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16) 138 | results += greedy_res 139 | # Evaluate generated captions 140 | json.dump(results, open(osp.join(save_dir, 'rl_result.json'), 'w')) 141 | gt_file = osp.join('data/features', 'captions_val.json') 142 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'rl_result.json'))[-1] 143 | 144 | if score > best_score: 145 | best_score = score 146 | model.save(osp.join(save_dir, 'model.best')) 147 | model.save(osp.join(save_dir,'model.ckpt')) 148 | # Output training information 149 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec' 150 | .format(iters, -1, score, best_score, finish_iter, timer.toc())) 151 | 152 | train_losses = [] 153 | timer.tic() 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /MoreNet/train_sc_cider.py: -------------------------------------------------------------------------------- 1 | from caption_model.att import * 2 | from caption_model.fc import * 3 | from mycider import * 4 | from multiprocessing import Pool 5 | import os 6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 7 | import ngram_opts 8 | from tools import * 9 | from dataloader import * 10 | opts = ngram_opts.parse_opt() 11 | if opts.caption_model == 'fc': 12 | opts.use_att = False 13 | else: 14 | opts.use_att = True 15 | 16 | batch_size = opts.batch_size 
17 | 18 | loader = KKDataLoader(opts) 19 | vocabs = loader.get_vocab() 20 | vocab = ['#END#'] 21 | for i in range(len(vocabs)): 22 | ids = str(i+1) 23 | vocab.append(vocabs[ids]) 24 | 25 | if not os.path.exists('sc_cider_model'): 26 | os.mkdir('sc_cider_model') 27 | 28 | if opts.use_att: 29 | save_dir = 'sc_cider_model/' + 'att_model' 30 | else: 31 | save_dir = 'sc_cider_model/' + 'fc_model' 32 | if not os.path.exists(save_dir): 33 | os.mkdir(save_dir) 34 | print(save_dir + ' has been built') 35 | 36 | 37 | image_dim = 2048 38 | vocab_size = loader.vocab_size + 1 39 | cell_size = 512 40 | lr = 0.00005 41 | if opts.use_att: 42 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=0,on_gpu=True) 43 | model.load('warm_model/att_warm/model.init') 44 | else: 45 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=0,on_gpu=True) 46 | model.load('warm_model/fc_warm/model.init') 47 | 48 | 49 | # Initialize cider-scorer 50 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json')) 51 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0) 52 | 53 | def cider_temp(res): 54 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]}) 55 | score, _ = cider_scorer.compute_score() 56 | return score 57 | 58 | pool = Pool(processes=5) 59 | 60 | best_score = -1 61 | logger = Logger(save_dir) 62 | iter = 0 63 | finish_iter = 1000000 64 | timer = Timer() 65 | timer.tic() 66 | while iter < finish_iter: 67 | iter += 1 68 | data = loader.get_batch('train') 69 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 70 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 71 | fc_feats, att_feats = tmp 72 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 73 | if opts.use_att: 74 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3]) 75 | feature = att_feats 76 | else: 77 | feature = fc_feats 78 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16) 79 | greedy_scores = np.array(pool.map(cider_temp, greedy_res)) 80 | 81 | all_caps, all_results, all_scores = [], [], [] 82 | for _ in xrange(20): 83 | # Generate captions by sampling 84 | sample_caps, sample_results = model.inference(vocab, image_id, feature, 85 | manner='sample', 86 | max_length=16) 87 | 88 | # Compute cider scores for sampled captions 89 | sample_scores = np.array(pool.map(cider_temp, sample_results)) 90 | all_caps.append(sample_caps) 91 | all_results.append(sample_results) 92 | all_scores.append(sample_scores) 93 | 94 | all_scores = np.array(all_scores) 95 | sample_caps, sample_results, sample_scores = [], [], [] 96 | for n in xrange(opts.batch_size): 97 | best_i = all_scores[:, n].argmax() 98 | sample_caps.append(all_caps[best_i][n]) 99 | sample_results.append(all_results[best_i][n]) 100 | sample_scores.append(all_scores[best_i, n]) 101 | sample_scores = np.array(sample_scores) 102 | 103 | max_length = max([cap.shape[0] for cap in sample_caps]) 104 | caption = np.zeros([max_length + 2, opts.batch_size], dtype=np.int32) 105 | for n in xrange(opts.batch_size): 106 | L = sample_caps[n].shape[0] 107 | caption[1:L + 1, n] = sample_caps[n] 108 | caption[L + 1:, n] = 0 109 | caption[0,:] = 9488 110 | mask = np.zeros([max_length + 1, opts.batch_size], dtype=np.float32) 111 | for n in xrange(opts.batch_size): 112 | L = 
sample_caps[n].shape[0] 113 | mask[:L + 1, n] = 1 114 | reward = (sample_scores - greedy_scores).astype(np.float32) 115 | print image_id[0] 116 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption'] 117 | print 'sample: ', sample_scores[0], sample_results[0]['caption'] 118 | loss_train = model.train_on_batch(feature, caption[1:,:], mask, reward) 119 | if iter % 300 == 0: 120 | results = [] 121 | for nn in range(5000/opts.batch_size): 122 | data = loader.get_batch('val') 123 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 124 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 125 | fc_feats, att_feats = tmp 126 | if opts.use_att: 127 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], 128 | att_feats.shape[3]) 129 | feature_val = att_feats 130 | else: 131 | feature_val = fc_feats 132 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 133 | 134 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature_val,manner='greedy',max_length=16) 135 | # Generate sentences for validation set 136 | results += greedy_res 137 | # Evaluate generated captions 138 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w')) 139 | gt_file = osp.join('data/features', 'captions_val.json') 140 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1] 141 | # json.dump(results, open(osp.join(save_dir, 'kk_rl_result_'+ str(iter) + '.json'), 'w')) 142 | # Update if finding new best model 143 | if score > best_score: 144 | best_score = score 145 | model.save(osp.join(save_dir, 'model.best')) 146 | model.save(osp.join(save_dir,'model.ckpt')) 147 | # Output training information 148 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec' 149 | .format(iter, -1, score, best_score, finish_iter, timer.toc())) 150 | # Reset loss and timer 151 | train_losses = [] 152 | timer.tic() 153 | 154 | # If early-stop condition triggers 155 | if iter > finish_iter: 156 | break 157 | 158 | -------------------------------------------------------------------------------- /MoreNet/train_trigram.py: -------------------------------------------------------------------------------- 1 | from caption_model.att import * 2 | from caption_model.fc import * 3 | from mycider import * 4 | from multiprocessing import Pool 5 | import os 6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 7 | import ngram_opts 8 | from tools import * 9 | from dataloader import * 10 | opts = ngram_opts.parse_opt() 11 | if opts.caption_model == 'fc': 12 | opts.use_att = False 13 | else: 14 | opts.use_att = True 15 | batch_size = opts.batch_size 16 | 17 | loader = KKDataLoader(opts) 18 | vocabs = loader.get_vocab() 19 | vocab = ['#END#'] 20 | for i in range(len(vocabs)): 21 | ids = str(i+1) 22 | vocab.append(vocabs[ids]) 23 | 24 | if not os.path.exists('trigram_cider_model'): 25 | os.mkdir('trigram_cider_model') 26 | 27 | if opts.use_att: 28 | save_dir = 'trigram_cider_model/' + 'att_model' 29 | else: 30 | save_dir = 'trigram_cider_model/' + 'fc_model' 31 | if not os.path.exists(save_dir): 32 | os.mkdir(save_dir) 33 | print(save_dir + ' has been built') 34 | 35 | 36 | image_dim = 2048 37 | vocab_size = loader.vocab_size + 2 38 | cell_size = 512 39 | lr = 0.00005 40 | if opts.use_att: 41 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=3,on_gpu=True) 42 | model.load('warm_model/att_warm/model.init') 43 | else: 44 | model 
= FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=3,on_gpu=True) 45 | model.load('warm_model/fc_warm/model.init') 46 | 47 | 48 | # Initialize cider-scorer 49 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json')) 50 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0) 51 | 52 | def cider_temp(res): 53 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]}) 54 | score, _ = cider_scorer.compute_score() 55 | return score 56 | 57 | pool = Pool(processes=5) 58 | 59 | best_score = -1 60 | logger = Logger(save_dir) 61 | iter = 0 62 | finish_iter = 100000 63 | timer = Timer() 64 | timer.tic() 65 | while iter < finish_iter: 66 | iter += 1 67 | data = loader.get_batch('train') 68 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 69 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 70 | fc_feats, att_feats = tmp 71 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 72 | if opts.use_att: 73 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3]) 74 | feature = att_feats 75 | else: 76 | feature = fc_feats 77 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16) 78 | greedy_scores = np.array(pool.map(cider_temp, greedy_res)) 79 | 80 | all_caps, all_results, all_scores = [], [], [] 81 | for _ in xrange(20): 82 | # Generate captions by sampling 83 | sample_caps, sample_results = model.trigram_inference(vocab, image_id, feature, 84 | manner='sample', 85 | max_length=16) 86 | 87 | # Compute cider scores for sampled captions 88 | sample_scores = np.array(pool.map(cider_temp, sample_results)) 89 | all_caps.append(sample_caps) 90 | all_results.append(sample_results) 91 | all_scores.append(sample_scores) 92 | 93 | all_scores = np.array(all_scores) 94 | sample_caps, sample_results, sample_scores = [], [], [] 95 | for n in xrange(opts.batch_size): 96 | best_i = all_scores[:, n].argmax() 97 | sample_caps.append(all_caps[best_i][n]) 98 | sample_results.append(all_results[best_i][n]) 99 | sample_scores.append(all_scores[best_i, n]) 100 | sample_scores = np.array(sample_scores) 101 | 102 | max_length = max([cap.shape[0] for cap in sample_caps]) 103 | caption = np.zeros([max_length + 2, opts.batch_size], dtype=np.int32) 104 | for n in xrange(opts.batch_size): 105 | L = sample_caps[n].shape[0] 106 | caption[1:L + 1, n] = sample_caps[n] 107 | caption[L + 1:, n] = 0 108 | caption[0,:] = 9488 109 | mask = np.zeros([max_length + 1, opts.batch_size], dtype=np.float32) 110 | for n in xrange(opts.batch_size): 111 | L = sample_caps[n].shape[0] 112 | mask[:L + 1, n] = 1 113 | 114 | reward = (sample_scores - greedy_scores).astype(np.float32) 115 | print image_id[0] 116 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption'] 117 | print 'sample: ', sample_scores[0], sample_results[0]['caption'] 118 | loss_train = model.train_on_batch(feature, caption[1:,:], mask, reward) 119 | if iter % 300 == 0: 120 | results = [] 121 | for nn in range(5000/opts.batch_size): 122 | data = loader.get_batch('val') 123 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 124 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 125 | fc_feats, att_feats = tmp 126 | if opts.use_att: 127 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], 128 | att_feats.shape[3]) 129 | feature_val = 
att_feats 130 | else: 131 | feature_val = fc_feats 132 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 133 | 134 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature_val,manner='greedy',max_length=16) 135 | # Generate sentences for validation set 136 | results += greedy_res 137 | # Evaluate generated captions 138 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w')) 139 | gt_file = osp.join('data/features', 'captions_val.json') 140 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1] 141 | # json.dump(results, open(osp.join(save_dir, 'kk_rl_result_'+ str(iter) + '.json'), 'w')) 142 | # Update if finding new best model 143 | if score > best_score: 144 | best_score = score 145 | model.save(osp.join(save_dir, 'model.best')) 146 | model.save(osp.join(save_dir,'model.ckpt')) 147 | # Output training information 148 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec' 149 | .format(iter, -1, score, best_score, finish_iter, timer.toc())) 150 | # Reset loss and timer 151 | train_losses = [] 152 | timer.tic() 153 | 154 | # If early-stop condition triggers 155 | if iter > finish_iter: 156 | break 157 | 158 | -------------------------------------------------------------------------------- /MoreNet/train_warm.py: -------------------------------------------------------------------------------- 1 | from caption_model.att import * 2 | from caption_model.fc import * 3 | import os 4 | #os.environ["CUDA_VISIBLE_DEVICES"] = "0" 5 | import ngram_opts 6 | from tools import * 7 | from dataloader import * 8 | opts = ngram_opts.parse_opt() 9 | if opts.caption_model == 'fc': 10 | opts.use_att = False 11 | else: 12 | opts.use_att = True 13 | 14 | batch_size = opts.batch_size 15 | 16 | loader = KKDataLoader(opts) 17 | vocabs = loader.get_vocab() 18 | vocab = ['#END#'] 19 | for i in range(len(vocabs)): 20 | ids = str(i+1) 21 | vocab.append(vocabs[ids]) 22 | 23 | if opts.use_att: 24 | save_dir = 'warm_model/' + 'att_warm' 25 | else: 26 | save_dir = 'warm_model/' + 'fc_warm' 27 | if not os.path.exists(save_dir): 28 | os.mkdir(save_dir) 29 | print(save_dir + ' has been built') 30 | 31 | 32 | image_dim = 2048 33 | vocab_size = loader.vocab_size + 2 # set start token to 9488 34 | cell_size = 512 35 | lr = 0.00005 36 | if opts.use_att: 37 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True) 38 | else: 39 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True) 40 | 41 | 42 | iters = 0 43 | best_score = -1 44 | train_losses = [] 45 | timer = Timer() 46 | timer.tic() 47 | logger = Logger(save_dir) 48 | finish_iter = 1000000 49 | while iters < 1000000: 50 | iters += 1 51 | data = loader.get_batch('train') 52 | tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']] 53 | 54 | fc_feats, att_feats, labels, masks = tmp 55 | if opts.use_att: 56 | feature = att_feats.reshape(att_feats.shape[0],att_feats.shape[1]* att_feats.shape[2],att_feats.shape[3]) 57 | else: 58 | feature = fc_feats 59 | Label = labels.transpose() 60 | Label[0,:] = 9488 61 | Mask = masks.transpose()[0:-1,:] 62 | reward = np.ones((opts.batch_size*5,)) 63 | 64 | train_losses.append(model.train_on_batch(feature,Label, Mask, reward)) 65 | # Validation 66 | if iters % 500 == 0 and iters >= 20000: 67 | results = [] 68 | for nn in range(5000/opts.batch_size): 69 | datas = loader.get_batch('val') 
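            # The loader repeats each image's features seq_per_img times (once per
            # reference caption), so taking every seq_per_img-th row below recovers a
            # single fc/att feature per distinct image before greedy decoding.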
70 | tmp = [datas['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 71 | datas['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 72 | fc_feats, att_feats = tmp 73 | if opts.use_att: 74 | feature = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], 75 | att_feats.shape[3]) 76 | else: 77 | feature = fc_feats 78 | image_id = [datas['infos'][i]['id'] for i in range(opts.batch_size)] 79 | 80 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature,manner='greedy',max_length=16) 81 | 82 | results += greedy_res 83 | 84 | json.dump(results, open(osp.join(save_dir, 'tmp_result.json'), 'w')) 85 | gt_file = osp.join('data/features', 'captions_val.json') 86 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'tmp_result.json'))[-1] 87 | 88 | if score > best_score: 89 | best_score = score 90 | model.save(osp.join(save_dir, 'model.init')) 91 | 92 | # Output training information 93 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec' 94 | .format(iters, np.mean(train_losses), score, best_score, finish_iter, timer.toc())) 95 | # Reset loss and timer 96 | train_losses = [] 97 | timer.tic() 98 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Pytorch Implementation of [Improving Reinforcement Learning Based Image Captioning with Natural Language Prior](https://arxiv.org/abs/1809.06227) 2 | 3 | ## Requirements 4 | Python 2.7 5 | 6 | PyTorch 0.4 (along with torchvision) 7 | 8 | cider package (copy from [Here](https://drive.google.com/open?id=15jqeHYQD0LJjp_e86QvJipUL4_-MHH5p) and dump them to `cider/`) 9 | 10 | pycoco package (copy from [Here](https://drive.google.com/open?id=1B71eCxPj8h7cw5SGVyKOLPsjbbM6dFAF) and extract them to `pycoco/`) 11 | 12 | You need to download pretrained resnet model for both training and evaluation. The models can be downloaded from [here](https://drive.google.com/open?id=1YD7YjPPoK-WGZhmeTcV8LEp_3hYoBcpq), and should be placed in `data/imagenet_weights`. 13 | 14 | ## Train your own network on COCO 15 | 16 | ### Download COCO captions and preprocess them 17 | 18 | Download preprocessed coco captions from [link](https://drive.google.com/open?id=1RzIFR-12fxptp6wm8bqteosLhmRh0cyJ) following Karpathy's split. Copy `dataset_coco.json`,`captions_train.json`,`captions_val.json` and `captions_test.json` in to `data/features`. 19 | 20 | Then do: 21 | 22 | ```bash 23 | $ python scripts/prepro_labels.py --input_json data/dataset_coco.json --output_json data/cocotalk.json --output_h5 data/cocotalk 24 | ``` 25 | 26 | `prepro_labels.py` will map all words that occur <= 5 times to a special `UNK` token, and create a vocabulary for all the remaining words. The image information and vocabulary are dumped into `data/cocotalk.json` and discretized caption data are dumped into `data/cocotalk_label.h5`. 27 | 28 | ### Download COCO dataset and pre-extract the image features 29 | 30 | Download the coco images from [link](http://mscoco.org/dataset/#download). We need 2014 training images and 2014 val. images. You should put the `train2014/` and `val2014/` in the same directory, denoted as `$IMAGE_ROOT`. 
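For reference, the expected layout is simply the two official COCO folders under a single root (only the name of `$IMAGE_ROOT` itself is your choice):

```
$IMAGE_ROOT/
├── train2014/   # 2014 training images
└── val2014/     # 2014 validation images
```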
31 | 
32 | Then:
33 | 
34 | ```
35 | $ python scripts/prepro_feats.py --input_json data/dataset_coco.json --output_dir data/cocotalk --images_root $IMAGE_ROOT
36 | ```
37 | 
38 | 
39 | `prepro_feats.py` extracts the resnet101 features (both the fc feature and the last conv feature) of each image. The features are saved in `data/cocotalk_fc` and `data/cocotalk_att`, and the resulting files are about 200GB.
40 | 
41 | (Check the prepro scripts for more options, like other resnet models or other attention sizes.)
42 | 
43 | ### Warm Start
44 | 
45 | In order to help the CIDEr-based REINFORCE algorithm converge more stably and quickly, we need to warm-start the captioning model by running the script below:
46 | 
47 | ```bash
48 | $ python train_warm.py --caption_model fc
49 | ```
50 | If you want to use the attention model, then run
51 | ```bash
52 | $ python train_warm.py --caption_model att
53 | ```
54 | You can also download our pretrained warm-start models from this [link](https://drive.google.com/open?id=1ZmAqqknqPVnwmiPS2KF6wQCURVhTuZp2). The best CIDEr scores on the validation set are 90.1 for FC and 94.2 for Attention.
55 | 
56 | ### Train using Self-critical
57 | ```bash
58 | $ python train_sc_cider.py --caption_model att
59 | ```
60 | You will see a large boost in CIDEr score, but with lots of bad endings.
61 | ![Image text](https://github.com/tgGuo15/PriorImageCaption/blob/master/images/badending.png)
62 | 
63 | 
64 | 
65 | ### Train using Ngram constraint
66 | 
67 | First, preprocess the dataset to get the ngram data:
68 | ```
69 | $ python get_ngram.py
70 | ```
71 | which will generate `fourgram.pkl` and `trigram.pkl` in `data/`.
72 | 
73 | Then
74 | ```bash
75 | $ python train_fourgram.py --caption_model fc
76 | ```
77 | It takes almost 40,000 iterations to converge, and the experiment details are written to `experiment.log` in `save_dir`, like
78 | ![Image text](https://github.com/tgGuo15/PriorImageCaption/blob/master/images/fourgram_att.png)
79 | 
80 | 
81 | ### Train using Neural Language model
82 | 
83 | First, train a neural language model, or download our pretrained LSTM language model from this [link](https://drive.google.com/open?id=1ZmAqqknqPVnwmiPS2KF6wQCURVhTuZp2).
84 | ```
85 | $ python train_rnnlm.py
86 | ```
87 | 
88 | Then train the RL setting with the neural language model constraint, starting from the same warm-start model.
89 | ```bash
90 | $ python train_rnnlm_cider.py --caption_model fc
91 | ```
92 | or
93 | ```bash
94 | $ python train_rnnlm_cider.py --caption_model att
95 | ```
96 | It takes almost 36,000 iterations to converge, and the experiment details are written to `experiment.log` in `save_dir`.
97 | 
98 | ![Image text](https://github.com/tgGuo15/PriorImageCaption/blob/master/images/rnn_att.png)
99 | 
100 | 
101 | ### Evaluating `CIDEr`, `METEOR`, `ROUGE_L`, `BLEU` scores with Bad Ending removal
102 | ```bash
103 | $ python Eval_model.py --caption_model fc --rl_type fourgram
104 | ```
105 | 
106 | ### Try another network structure
107 | We also tried another network structure and obtained similar results. Please see `MoreNet.md` for more details.
108 | 
109 | ## Acknowledgements
110 | Thanks to the original [self-critical](https://github.com/ruotianluo/self-critical.pytorch) implementation by ruotianluo.
111 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | #'Prior Image Caption' -------------------------------------------------------------------------------- /caption_model/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /caption_model/fc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | import pickle 7 | 8 | class FCModel(nn.Module): 9 | def __init__(self, batch_size, cell_size, image_dim, 10 | vocab_size, lr, ngram=0, on_gpu=False): 11 | super(FCModel, self).__init__() 12 | # Settings 13 | self.batch_size = batch_size 14 | self.cell_size = cell_size 15 | self.image_dim = image_dim 16 | self.vocab_size = vocab_size 17 | self.lr = lr 18 | self.on_gpu = on_gpu 19 | 20 | # Word embedding lookup table 21 | self.word_embedding = nn.Embedding(vocab_size, cell_size) 22 | 23 | # Image embedding mlp 24 | self.image_embedding = nn.Linear(image_dim, cell_size, bias=False) 25 | 26 | # Recurrent layera 27 | self.rnn = nn.LSTMCell(cell_size, cell_size) 28 | 29 | # Word predicting mlp 30 | self.predictor = nn.Linear(cell_size, vocab_size) 31 | 32 | # Onehot encoder 33 | self.onehot = torch.torch.eye(vocab_size) 34 | if self.on_gpu: 35 | self.onehot = self.onehot.cuda() 36 | 37 | # Optimizer 38 | self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr) 39 | 40 | if ngram == 3: 41 | trigram = pickle.load(open('data/features/trigram.pkl')) 42 | self.trigram_mask = {} 43 | for tri in trigram: 44 | temp = np.zeros((vocab_size,)) 45 | for word in trigram[tri]: 46 | temp[word] = 1 47 | self.trigram_mask[tri] = temp 48 | elif ngram == 4: 49 | fourgram = pickle.load(open('data/features/fourgram.pkl')) 50 | self.fourgram_mask = {} 51 | for four in fourgram: 52 | temp = np.zeros((vocab_size,)) 53 | for word in fourgram[four]: 54 | temp[word] = 1 55 | self.fourgram_mask[four] = temp 56 | 57 | # Move to gpu if necessary 58 | if self.on_gpu: 59 | self.cuda() 60 | 61 | def forward(self, word_emb, state): 62 | # Get states 63 | h_tm1, c_tm1 = state 64 | 65 | # RNN input 66 | h_t, c_t = self.rnn(word_emb, (h_tm1, c_tm1)) 67 | 68 | # Next word's logtis 69 | logits = self.predictor(h_t) 70 | 71 | return logits, (h_t, c_t) 72 | 73 | def initial_state(self, image): 74 | image = Variable(torch.Tensor(image)) 75 | if self.on_gpu: 76 | image = image.cuda() 77 | 78 | # Image embedding 79 | first_word = self.image_embedding(image) 80 | h = Variable(torch.zeros(self.batch_size, self.cell_size)) 81 | c = Variable(torch.zeros(self.batch_size, self.cell_size)) 82 | if self.on_gpu: 83 | h = h.cuda() 84 | c = c.cuda() 85 | zero_state = h, c 86 | 87 | return first_word, zero_state 88 | 89 | def train_on_batch(self, image, sentence, mask, reward): 90 | # Convert numpy to torch 91 | self.batch_size = image.shape[0] 92 | sentence = Variable(torch.LongTensor(sentence.tolist())) 93 | mask = torch.Tensor(mask) 94 | reward = torch.Tensor(reward) 95 | T = sentence.size()[0] 96 | 97 | # If using gpu 98 | if self.on_gpu: 99 | sentence = sentence.cuda() 100 | mask = mask.cuda() 101 | reward = reward.cuda() 102 | 103 | # Initial state of RNN 104 | first_word, state = self.initial_state(image) 105 | 
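        # The recurrent loop below collects the per-step logits. Instead of building a
        # scalar loss, this method forms the cross-entropy gradient by hand,
        #   d(loss)/d(logits) = softmax(logits) - onehot(target),
        # multiplies it by the mask and the per-sentence reward, normalizes by sentence
        # length and batch size, and backpropagates it directly via
        # logits.backward(gradient=logit_grad). With reward = 1 this reduces to plain
        # cross-entropy training (which is how train_warm.py uses it); the returned
        # loss is only a placeholder (-1).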
106 | # Word embedding for input sequence 107 | inputs = self.word_embedding(sentence[:-1, :]) 108 | 109 | # Recurrent computation 110 | logits = [] 111 | for i in xrange(T): 112 | if i == 0: 113 | word = first_word 114 | else: 115 | word = inputs[i-1, :, :] 116 | logit, state = self.forward(word, state) 117 | logits.append(logit.unsqueeze(0)) 118 | logits = torch.cat(logits, 0) # (T, batch_size, vocab_size) 119 | logits = logits.resize(T*self.batch_size, self.vocab_size) 120 | 121 | # Next word's distribution 122 | prob = nn.Softmax()(logits).data 123 | 124 | # Ground-truth 125 | targets = sentence.data[:, :].view(T*self.batch_size) 126 | gt_prob = self.onehot.index_select(0, targets) 127 | 128 | # Gradients 129 | logit_grad = (prob - gt_prob).view(T, self.batch_size, self.vocab_size) 130 | logit_grad = logit_grad * mask.view(T, self.batch_size, 1).expand_as(logit_grad) 131 | logit_grad = logit_grad * reward.view(1, self.batch_size, 1).expand_as(logit_grad) 132 | logit_grad = logit_grad / mask.sum(0).view(1, self.batch_size, 1).expand_as(logit_grad) 133 | logit_grad = logit_grad / self.batch_size 134 | logit_grad = logit_grad.view(T*self.batch_size, self.vocab_size) 135 | 136 | # Gradient descent 137 | self.optimizer.zero_grad() 138 | logits.backward(gradient=logit_grad) 139 | self.optimizer.step() 140 | loss = -1 141 | 142 | return loss 143 | 144 | def single_step(self, time_step,state, words, manner='greedy'): 145 | if time_step != 0: 146 | words = Variable(torch.LongTensor(words), volatile=True) 147 | if self.on_gpu: 148 | words = words.cuda() 149 | # Word embedding 150 | words = self.word_embedding(words.unsqueeze(0)).squeeze(0) 151 | 152 | # Take a rnn step 153 | logits, new_state = self.forward(words, state) 154 | 155 | # Next words 156 | if manner == 'greedy': 157 | new_words = logits.data.cpu().numpy().argmax(1) 158 | elif manner == 'sample': 159 | # Gumbel argmax trick 160 | if self.on_gpu: 161 | U = torch.cuda.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1) 162 | else: 163 | U = torch.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1) 164 | V = logits.data - torch.log(-U.log()) 165 | new_words = V.cpu().numpy().argmax(1) 166 | else: 167 | raise ValueError('Unknown manner: [{}]'.format(manner)) 168 | 169 | return new_state, new_words 170 | 171 | def inference(self, vocab, image_ids, image, manner='greedy',max_length=16): 172 | # Choose batch-size 173 | self.batch_size = image.shape[0] 174 | 175 | # Initiazization 176 | results = [] 177 | captions = [] 178 | 179 | # Iteratively generate words 180 | first_word, state = self.initial_state(image) 181 | sentences = [] 182 | word = first_word 183 | for i in xrange(max_length): 184 | state, word = self.single_step(i,state, word, manner=manner) 185 | sentences.append(word) 186 | sentences = np.array(sentences).transpose() 187 | 188 | # Translate indexes to sentences 189 | for j in xrange(sentences.shape[0]): 190 | idxs = np.where(sentences[j, :] == 0)[0] 191 | end_index = idxs[0] if len(idxs) > 0 else max_length 192 | cap = ' '.join([vocab[w] for w in sentences[j, :end_index]]) 193 | captions.append(sentences[j, :end_index]) 194 | results.append({'image_id': image_ids[j], 'caption': cap}) 195 | # Type: captions (np.array), results (natural language) 196 | return captions, results 197 | 198 | def ngram_single_step(self,time_step,state, words,temp_mask, manner='greedy'): 199 | if time_step != 0: 200 | words = Variable(torch.LongTensor(words), volatile=True) 201 | if self.on_gpu: 202 | words = words.cuda() 203 | # 
Word embedding 204 | words = self.word_embedding(words.unsqueeze(0)).squeeze(0) 205 | 206 | Temp_mask = Variable(torch.Tensor(temp_mask)) 207 | if self.on_gpu: 208 | Temp_mask = Temp_mask.cuda() 209 | Temp_mask = Temp_mask * 100000 - 100000 210 | 211 | # Take a rnn step 212 | logits, new_state = self.forward(words, state) 213 | logits = logits + Temp_mask 214 | # Next words 215 | if manner == 'greedy': 216 | new_words = logits.data.cpu().numpy().argmax(1) 217 | elif manner == 'sample': 218 | # Gumbel argmax trick 219 | if self.on_gpu: 220 | U = torch.cuda.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1) 221 | else: 222 | U = torch.FloatTensor(self.batch_size, self.vocab_size).uniform_(0, 1) 223 | V = logits.data - torch.log(-U.log()) 224 | new_words = V.cpu().numpy().argmax(1) 225 | else: 226 | raise ValueError('Unknown manner: [{}]'.format(manner)) 227 | 228 | return new_state, new_words 229 | 230 | def fourgram_inference(self, vocab, image_ids, image, manner='greedy',max_length=16): 231 | # Choose batch-size 232 | self.batch_size = image.shape[0] 233 | 234 | # Initiazization 235 | results = [] 236 | captions = [] 237 | 238 | # Iteratively generate words 239 | init_word, state = self.initial_state(image) 240 | sentences = [] 241 | word = init_word 242 | sentencemask = np.zeros((max_length + 3, self.batch_size), dtype=np.int32) 243 | sentencemask[0:3,:] = 9488 244 | for jj in xrange(max_length): 245 | temp_mask = self.get_four_Mask(sentencemask,jj+3) 246 | state, word = self.ngram_single_step(jj,state, word,temp_mask,manner=manner) 247 | sentencemask[jj+3,:] = word 248 | sentences.append(word) 249 | sentences = np.array(sentences).transpose() 250 | 251 | # Translate indexes to sentences 252 | for j in xrange(sentences.shape[0]): 253 | idxs = np.where(sentences[j, :] == 0)[0] 254 | end_index = idxs[0] if len(idxs) > 0 else max_length 255 | cap = ' '.join([vocab[w] for w in sentences[j, :end_index]]) 256 | captions.append(sentences[j, :end_index]) 257 | results.append({'image_id': image_ids[j], 'caption': cap}) 258 | 259 | # Type: captions (np.array), results (natural language) 260 | return captions, results 261 | 262 | def get_four_Mask(self,sentencemask,index): 263 | tempmask = np.zeros((self.batch_size,self.vocab_size)) 264 | for hh in range(self.batch_size): 265 | temp = tuple(list(sentencemask[index-3:index,hh])) 266 | if temp in self.fourgram_mask: 267 | tempmask[hh] = self.fourgram_mask[temp] 268 | else: 269 | tempmask[hh][0] = 1 # END token 270 | return tempmask 271 | 272 | 273 | def trigram_inference(self, vocab, image_ids, image, manner='greedy',max_length=16): 274 | # Choose batch-size 275 | self.batch_size = image.shape[0] 276 | 277 | # Initiazization 278 | results = [] 279 | captions = [] 280 | 281 | # Iteratively generate words 282 | init_word, state = self.initial_state(image) 283 | sentences = [] 284 | word = init_word 285 | sentencemask = np.zeros((max_length + 2, self.batch_size), dtype=np.int32) 286 | sentencemask[0:2,:] = 9488 287 | for jj in xrange(max_length): 288 | temp_mask = self.get_tri_Mask(sentencemask,jj+2) 289 | state, word = self.ngram_single_step(jj,state, word,temp_mask,manner=manner) 290 | sentencemask[jj+2,:] = word 291 | sentences.append(word) 292 | sentences = np.array(sentences).transpose() 293 | 294 | # Translate indexes to sentences 295 | for j in xrange(sentences.shape[0]): 296 | idxs = np.where(sentences[j, :] == 0)[0] 297 | end_index = idxs[0] if len(idxs) > 0 else max_length 298 | cap = ' '.join([vocab[w] for w in sentences[j, 
:end_index]]) 299 | captions.append(sentences[j, :end_index]) 300 | results.append({'image_id': image_ids[j], 'caption': cap}) 301 | 302 | # Type: captions (np.array), results (natural language) 303 | return captions, results 304 | 305 | def get_tri_Mask(self,sentencemask,index): 306 | tempmask = np.zeros((self.batch_size,self.vocab_size)) 307 | for hh in range(self.batch_size): 308 | temp = tuple(list(sentencemask[index-2:index,hh])) 309 | if temp in self.trigram_mask: 310 | tempmask[hh] = self.trigram_mask[temp] 311 | else: 312 | tempmask[hh][0] = 1 # END token 313 | return tempmask 314 | 315 | def save(self, file_path): 316 | with open(file_path, 'wb') as f: 317 | torch.save(self.state_dict(), f) 318 | 319 | def load(self, file_path): 320 | with open(file_path, 'rb') as f: 321 | self.load_state_dict(torch.load(f)) 322 | 323 | -------------------------------------------------------------------------------- /caption_model/rnnlm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | import numpy as np 6 | 7 | class LM(nn.Module): 8 | def __init__(self, batch_size, hidden_size, 9 | vocab_size, word_embed_size, 10 | lr, on_gpu=True): 11 | super(LM, self).__init__() 12 | self.lr = lr 13 | self.vocab_size = vocab_size 14 | self.word_embed_size = word_embed_size 15 | self.hidden_size = hidden_size 16 | self.batch_size = batch_size 17 | self.on_gpu = on_gpu 18 | 19 | # word embedding layer 20 | self.word_embedding_layer = nn.Embedding(vocab_size,word_embed_size) 21 | 22 | # language model LSTM 23 | self.rnn = nn.LSTMCell(word_embed_size,hidden_size) 24 | 25 | # predict layer 26 | self.predict_layer = nn.Linear(hidden_size,vocab_size) 27 | 28 | # Optimizer 29 | self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr) 30 | 31 | self.onehot = torch.torch.eye(vocab_size) 32 | if self.on_gpu: 33 | self.onehot = self.onehot.cuda() 34 | # Move to gpu if necessary 35 | if self.on_gpu: 36 | self.cuda() 37 | def init_state(self): 38 | h = Variable(torch.zeros(self.batch_size, self.hidden_size)) 39 | c = Variable(torch.zeros(self.batch_size, self.hidden_size)) 40 | if self.on_gpu: 41 | h = h.cuda() 42 | c = c.cuda() 43 | return h,c 44 | 45 | def forward(self, word_emb, state): 46 | # Get states 47 | h_tm1, c_tm1 = state 48 | # RNN input 49 | h_t, c_t = self.rnn(word_emb, (h_tm1, c_tm1)) 50 | # Next word's logtis 51 | logits = self.predict_layer(h_t) 52 | return logits, (h_t, c_t) 53 | 54 | def train_on_batch(self,sentence, mask, reward): 55 | # Convert numpy to torch 56 | sentence = Variable(torch.LongTensor(sentence.tolist())) 57 | mask = torch.Tensor(mask) 58 | reward = torch.Tensor(reward) 59 | T = sentence.size()[0] - 1 60 | # If using gpu 61 | 62 | sentence = sentence.cuda() 63 | mask = mask.cuda() 64 | reward = reward.cuda() 65 | 66 | # Initial state of RNN 67 | state = self.init_state() 68 | # Word embedding for input sequence 69 | inputs = self.word_embedding_layer(sentence[:-1, :]) 70 | # Recurrent computation 71 | logits = [] 72 | for i in xrange(T): 73 | word = inputs[i, :, :] 74 | logit, state = self.forward(word, state) 75 | logits.append(logit.unsqueeze(0)) 76 | logits = torch.cat(logits, 0) # (T, batch_size, vocab_size) 77 | logits = logits.resize(T*self.batch_size, self.vocab_size) 78 | # Next word's distribution 79 | prob = F.softmax(logits).data 80 | # Ground-truth 81 | targets = sentence.data[1:, :].view(T*self.batch_size) 82 | 
gt_prob = self.onehot.index_select(0, targets) 83 | # Gradients 84 | logit_grad = (prob - gt_prob).view(T, self.batch_size, self.vocab_size) 85 | logit_grad = logit_grad * mask.view(T, self.batch_size, 1).expand_as(logit_grad) 86 | logit_grad = logit_grad * reward.view(1, self.batch_size, 1).expand_as(logit_grad) 87 | logit_grad = logit_grad / mask.sum(0).view(1, self.batch_size, 1).expand_as(logit_grad) 88 | logit_grad = logit_grad / self.batch_size 89 | logit_grad = logit_grad.view(T*self.batch_size, self.vocab_size) 90 | 91 | # Gradient descent 92 | self.optimizer.zero_grad() 93 | logits.backward(gradient=logit_grad) 94 | self.optimizer.step() 95 | targets = targets.cpu().numpy() 96 | loss = - np.log(prob[np.arange(T * self.batch_size), targets]) 97 | return loss.mean().numpy() 98 | 99 | def test_on_batch(self,sentence, mask): 100 | sentence = Variable(torch.LongTensor(sentence.tolist())) 101 | mask = torch.Tensor(mask) 102 | T = sentence.size()[0] - 1 103 | # If using gpu 104 | if self.on_gpu: 105 | sentence = sentence.cuda() 106 | mask = mask.cuda() 107 | # Initial state of RNN 108 | state = self.init_state() 109 | # Word embedding for input sequence 110 | inputs = self.word_embedding_layer(sentence[:-1, :]) 111 | # Recurrent computation 112 | logits = [] 113 | for i in xrange(T): 114 | word = inputs[i, :, :] 115 | logit, state = self.forward(word, state) 116 | logits.append(logit.unsqueeze(0)) 117 | logits = torch.cat(logits, 0) # (T, batch_size, vocab_size) 118 | logits = logits.resize(T * self.batch_size, self.vocab_size) 119 | # Next word's distribution 120 | prob = F.softmax(logits).data 121 | prob = prob.view(T,self.batch_size,self.vocab_size) 122 | # Ground-truth 123 | return prob 124 | 125 | def single_step_prob(self,word,state): 126 | word = Variable(torch.LongTensor(word.tolist())) 127 | if self.on_gpu: 128 | word = word.cuda() 129 | word_emb = self.word_embedding_layer(word) 130 | logit, state2 = self.forward(word_emb, state) # logit : (batch_size, vocab_size) 131 | prob = F.softmax(logit).data # (batch_size, vocab_size) 132 | return prob,state2 133 | 134 | def save(self, file_path): 135 | with open(file_path, 'wb') as f: 136 | torch.save(self.state_dict(), f) 137 | 138 | def load(self, file_path): 139 | with open(file_path, 'rb') as f: 140 | self.load_state_dict(torch.load(f)) 141 | -------------------------------------------------------------------------------- /cider/README.md: -------------------------------------------------------------------------------- 1 | Consensus-based Image Description Evaluation (CIDEr Code) 2 | =================== 3 | 4 | Evaluation code for CIDEr metric. Provides CIDEr as well as 5 | CIDEr-D (CIDEr Defended) which is more robust to gaming effects. 6 | 7 | ## Important Note ## 8 | CIDEr by default (with idf parameter set to "corpus" mode) computes IDF values using the reference sentences provided. Thus, CIDEr score for a reference dataset with only 1 image will be zero. When evaluating using one (or few) images, set idf to "coco-val-df" instead, which uses IDF from the MSCOCO Vaildation Dataset for reliable results. 
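For concreteness, a minimal `params.json` for the single-image case could look like the sketch below. Only the `"idf"` field is described in this README; the file-name values follow the sample reference/candidate files mentioned in the Instructions section, and the key names themselves are illustrative assumptions rather than a documented schema:

```json
{
  "pathToData": "data/",
  "refName": "pascal50S.json",
  "candName": "pascal_candsB.json",
  "resultFile": "results.json",
  "idf": "coco-val-df"
}
```

Switch `"idf"` back to `"corpus"` when the references form a full dataset rather than a single image.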
9 | 10 | ## Requirements ## 11 | - java 1.8.0 12 | - python 2.7 13 | 14 | For running the ipython notebook file, update your Ipython to [Jupyter](https://jupyter.org/) 15 | 16 | ## Files ## 17 | ./ 18 | - cidereval.py (demo script) 19 | 20 | ./PyDataFormat 21 | - loadData.py (load the json files for references and candidates) 22 | 23 | - {$result\_file}.json (file with the CIDEr and CIDEr-D scores) 24 | 25 | ./pycocoevalcap: The folder where all evaluation codes are stored. 26 | - evals.py: Performs tokenization and runs both the metrics 27 | - tokenizer: Python wrapper of Stanford CoreNLP PTBTokenizer 28 | - cider: CIDEr evaluation codes 29 | - ciderD: CIDEr-D evaluation codes 30 | 31 | ## Instructions ## 32 | 1. Edit the params.json file to contain path to reference and candidate json files, and the result file where the scores are stored\*. 33 | 2. Set the "idf" value in params.json to "corpus" if not evaluating on a single image/instance. Set the "idf" value to "coco-val-df" if evaluating on a single image. In this case IDF values from the MSCOCO dataset are used. If using some other corpus, get the document frequencies into a similar format as "coco-val-df", and put them in the data/ folder as a pickle file. Then set mode to the name of the document frequency file (without the '.p' extension). 34 | 3. Sample json reference and candidate files are pascal50S.json and pascal_candsB.json 35 | 4. CIDEr scores are stored in "scores" variable: scores['CIDEr'] -> CIDEr scores, scores['CIDErD'] -> CIDEr-D scores 36 | 37 | *Even when evaluating with independent candidate/references (for eg. when using "coco-val-df"), put multiple candidate and reference entries into the same json files. This is much faster than having separate candidate and reference files and calling the evaluation code separately on each candidate/reference file. 38 | ## References ## 39 | 40 | - PTBTokenizer: We use the [Stanford Tokenizer](http://nlp.stanford.edu/software/tokenizer.shtml) which is included in [Stanford CoreNLP 3.4.1](http://nlp.stanford.edu/software/corenlp.shtml). 
41 | - CIDEr: [CIDEr: Consensus-based Image Description Evaluation] (http://arxiv.org/pdf/1411.5726.pdf) 42 | 43 | ## Developers ## 44 | - Ramakrishna Vedantam (Virgina Tech) 45 | 46 | ## Acknowledgments ## 47 | - MS COCO Caption Evaluation Team 48 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | data -------------------------------------------------------------------------------- /data/karpathy/__init__.py: -------------------------------------------------------------------------------- 1 | '''Image Caption with Prior''' -------------------------------------------------------------------------------- /dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import json 6 | import h5py 7 | import os 8 | import numpy as np 9 | import random 10 | 11 | import torch 12 | import torch.utils.data as Data 13 | 14 | import multiprocessing 15 | 16 | 17 | def get_npy_data(ix, fc_file, att_file, use_att): 18 | if use_att == True: 19 | return (np.load(fc_file), np.load(att_file)['feat'], ix) 20 | else: 21 | return (np.load(fc_file), np.zeros((1, 1, 1)), ix) 22 | 23 | 24 | class KKDataLoader(Data.Dataset): 25 | 26 | def reset_iterator(self, split): 27 | del self._prefetch_process[split] 28 | self._prefetch_process[split] = BlobFetcher(split, self, split == 'train') 29 | self.iterators[split] = 0 30 | 31 | def get_vocab_size(self): 32 | return self.vocab_size 33 | 34 | def get_vocab(self): 35 | return self.ix_to_word 36 | 37 | def get_seq_length(self): 38 | return self.seq_length 39 | 40 | def __init__(self, opt): 41 | self.opt = opt 42 | self.batch_size = self.opt.batch_size 43 | self.seq_per_img = opt.seq_per_img 44 | self.use_att = getattr(opt, 'use_att', True) 45 | 46 | # load the json file which contains additional information about the dataset 47 | print('DataLoader loading json file: ', opt.input_json) 48 | self.info = json.load(open(self.opt.input_json)) 49 | self.ix_to_word = self.info['ix_to_word'] 50 | self.vocab_size = len(self.ix_to_word) 51 | print('vocab size is ', self.vocab_size) 52 | 53 | # open the hdf5 file 54 | print('DataLoader loading h5 file: ', opt.input_fc_dir, opt.input_att_dir, opt.input_label_h5) 55 | self.h5_label_file = h5py.File(self.opt.input_label_h5, 'r', driver='core') 56 | 57 | self.input_fc_dir = self.opt.input_fc_dir 58 | self.input_att_dir = self.opt.input_att_dir 59 | 60 | # load in the sequence data 61 | seq_size = self.h5_label_file['labels'].shape 62 | self.seq_length = seq_size[1] 63 | print('max sequence length in data is', self.seq_length) 64 | # load the pointers in full to RAM (should be small enough) 65 | self.label_start_ix = self.h5_label_file['label_start_ix'][:] 66 | self.label_end_ix = self.h5_label_file['label_end_ix'][:] 67 | 68 | self.num_images = self.label_start_ix.shape[0] 69 | print('read %d image features' % (self.num_images)) 70 | 71 | # separate out indexes for each of the provided splits 72 | self.split_ix = {'train': [], 'val': [], 'test': []} 73 | for ix in range(len(self.info['images'])): 74 | img = self.info['images'][ix] 75 | if img['split'] == 'train': 76 | self.split_ix['train'].append(ix) 77 | elif img['split'] == 'val': 78 | self.split_ix['val'].append(ix) 79 | elif img['split'] == 'test': 80 | self.split_ix['test'].append(ix) 81 | 
elif opt.train_only == 0: 82 | self.split_ix['train'].append(ix) 83 | 84 | print('assigned %d images to split train' % len(self.split_ix['train'])) 85 | print('assigned %d images to split val' % len(self.split_ix['val'])) 86 | print('assigned %d images to split test' % len(self.split_ix['test'])) 87 | 88 | self.iterators = {'train': 0, 'val': 0, 'test': 0} 89 | 90 | self._prefetch_process = {} # The three prefetch process 91 | for split in self.iterators.keys(): 92 | self._prefetch_process[split] = BlobFetcher(split, self, split == 'train') 93 | # Terminate the child process when the parent exists 94 | 95 | def cleanup(): 96 | print('Terminating BlobFetcher') 97 | for split in self.iterators.keys(): 98 | del self._prefetch_process[split] 99 | 100 | import atexit 101 | atexit.register(cleanup) 102 | 103 | def get_batch(self, split, batch_size=None, seq_per_img=None): 104 | batch_size = batch_size or self.batch_size 105 | seq_per_img = seq_per_img or self.seq_per_img 106 | 107 | fc_batch = [] # np.ndarray((batch_size * seq_per_img, self.opt.fc_feat_size), dtype = 'float32') 108 | att_batch = [] # np.ndarray((batch_size * seq_per_img, 14, 14, self.opt.att_feat_size), dtype = 'float32') 109 | label_batch = np.zeros([batch_size * seq_per_img, self.seq_length + 2], dtype='int') 110 | mask_batch = np.zeros([batch_size * seq_per_img, self.seq_length + 2], dtype='float32') 111 | 112 | wrapped = False 113 | 114 | infos = [] 115 | gts = [] 116 | 117 | for i in range(batch_size): 118 | import time 119 | t_start = time.time() 120 | # fetch image 121 | tmp_fc, tmp_att, \ 122 | ix, tmp_wrapped = self._prefetch_process[split].get() 123 | fc_batch += [tmp_fc] * seq_per_img 124 | att_batch += [tmp_att] * seq_per_img 125 | 126 | # fetch the sequence labels 127 | ix1 = self.label_start_ix[ix] - 1 # label_start_ix starts from 1 128 | ix2 = self.label_end_ix[ix] - 1 129 | ncap = ix2 - ix1 + 1 # number of captions available for this image 130 | assert ncap > 0, 'an image does not have any label. 
this can be handled but right now isn\'t' 131 | 132 | if ncap < seq_per_img: 133 | # we need to subsample (with replacement) 134 | seq = np.zeros([seq_per_img, self.seq_length], dtype='int') 135 | for q in range(seq_per_img): 136 | ixl = random.randint(ix1, ix2) 137 | seq[q, :] = self.h5_label_file['labels'][ixl, :self.seq_length] 138 | else: 139 | ixl = random.randint(ix1, ix2 - seq_per_img + 1) 140 | seq = self.h5_label_file['labels'][ixl: ixl + seq_per_img, :self.seq_length] 141 | 142 | label_batch[i * seq_per_img: (i + 1) * seq_per_img, 1: self.seq_length + 1] = seq 143 | 144 | if tmp_wrapped: 145 | wrapped = True 146 | 147 | # Used for reward evaluation 148 | gts.append(self.h5_label_file['labels'][self.label_start_ix[ix] - 1: self.label_end_ix[ix]]) 149 | 150 | # record associated info as well 151 | info_dict = {} 152 | info_dict['ix'] = ix 153 | info_dict['id'] = self.info['images'][ix]['id'] 154 | info_dict['file_path'] = self.info['images'][ix]['file_path'] 155 | infos.append(info_dict) 156 | # print(i, time.time() - t_start) 157 | 158 | # generate mask 159 | t_start = time.time() 160 | nonzeros = np.array(list(map(lambda x: (x != 0).sum() + 2, label_batch))) 161 | for ix, row in enumerate(mask_batch): 162 | row[:nonzeros[ix]] = 1 163 | # print('mask', time.time() - t_start) 164 | 165 | datas = {} 166 | datas['fc_feats'] = np.stack(fc_batch) 167 | datas['att_feats'] = np.stack(att_batch) 168 | datas['labels'] = label_batch 169 | datas['gts'] = gts 170 | datas['masks'] = mask_batch 171 | datas['bounds'] = {'it_pos_now': self.iterators[split], 'it_max': len(self.split_ix[split]), 'wrapped': wrapped} 172 | datas['infos'] = infos 173 | 174 | return datas 175 | 176 | # It's not coherent to make DataLoader a subclass of Dataset, but essentially, we only need to implement the following to functions, 177 | # so that the torch.utils.data.DataLoader can load the data according the index. 178 | # However, it's minimum change to switch to pytorch data loading. 179 | def __getitem__(self, index): 180 | """This function returns a tuple that is further passed to collate_fn 181 | """ 182 | ix = index # self.split_ix[index] 183 | return get_npy_data(ix, \ 184 | os.path.join(self.input_fc_dir, str(self.info['images'][ix]['id']) + '.npy'), 185 | os.path.join(self.input_att_dir, str(self.info['images'][ix]['id']) + '.npz'), 186 | self.use_att 187 | ) 188 | 189 | def __len__(self): 190 | return len(self.info['images']) 191 | class SubsetSampler(torch.utils.data.sampler.Sampler): 192 | r"""Samples elements randomly from a given list of indices, without replacement. 193 | Arguments: 194 | indices (list): a list of indices 195 | """ 196 | 197 | def __init__(self, indices): 198 | self.indices = indices 199 | 200 | def __iter__(self): 201 | return (self.indices[i] for i in range(len(self.indices))) 202 | 203 | def __len__(self): 204 | return len(self.indices) 205 | 206 | 207 | class BlobFetcher(): 208 | """Experimental class for prefetching blobs in a separate process.""" 209 | 210 | def __init__(self, split, dataloader, if_shuffle=False): 211 | """ 212 | db is a list of tuples containing: imcrop_name, caption, bbox_feat of gt box, imname 213 | """ 214 | self.split = split 215 | self.dataloader = dataloader 216 | self.if_shuffle = if_shuffle 217 | 218 | # Add more in the queue 219 | # def ReSet(self): 220 | # """ 221 | # Two cases: 222 | # 1. not hasattr(self, 'split_loader'): Resume from previous training. Create the dataset given the saved split_ix and iterator 223 | # 2. 
wrapped: a new epoch, the split_ix and iterator have been updated in the get_minibatch_inds already. 224 | # """ 225 | # # batch_size is 0, the merge is done in DataLoader class 226 | # self.split_loader = iter(Data.DataLoader(dataset=self.dataloader, 227 | # batch_size=1, 228 | # shuffle=False, 229 | # num_workers=4)) 230 | def ReSet(self): 231 | """ 232 | Two cases for this function to be triggered: 233 | 1. not hasattr(self, 'split_loader'): Resume from previous training. Create the dataset given the saved split_ix and iterator 234 | 2. wrapped: a new epoch, the split_ix and iterator have been updated in the get_minibatch_inds already. 235 | """ 236 | # batch_size is 1, the merge is done in DataLoader class 237 | self.split_loader = iter(Data.DataLoader(dataset=self.dataloader, 238 | batch_size=1, 239 | sampler=SubsetSampler(self.dataloader.split_ix[self.split][ 240 | self.dataloader.iterators[self.split]:]), 241 | shuffle=False, 242 | pin_memory=True, 243 | num_workers=4, # 4 is usually enough 244 | collate_fn=lambda x: x[0])) 245 | 246 | def _get_next_minibatch_inds(self): 247 | max_index = len(self.dataloader.split_ix[self.split]) 248 | wrapped = False 249 | 250 | ri = self.dataloader.iterators[self.split] 251 | ix = self.dataloader.split_ix[self.split][ri] 252 | 253 | ri_next = ri + 1 254 | if ri_next >= max_index: 255 | ri_next = 0 256 | if self.if_shuffle: 257 | random.shuffle(self.dataloader.split_ix[self.split]) 258 | wrapped = True 259 | self.dataloader.iterators[self.split] = ri_next 260 | 261 | return ix, wrapped 262 | 263 | def get(self): 264 | if not hasattr(self, 'split_loader'): 265 | self.ReSet() 266 | 267 | ix, wrapped = self._get_next_minibatch_inds() 268 | tmp = self.split_loader.next() 269 | if wrapped: 270 | self.ReSet() 271 | 272 | assert tmp[2] == ix, "ix not equal" 273 | 274 | return tmp + [wrapped] 275 | -------------------------------------------------------------------------------- /get_ngram.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dataloader import * 3 | from tools import remove_badEnding 4 | import pickle 5 | coco = json.load(open('data/cocotalk.json')) 6 | ix_to_word = coco['ix_to_word'] 7 | 8 | vocab = {} 9 | for i in ix_to_word: 10 | vocab[ix_to_word[i]] = int(i) 11 | 12 | 13 | fourgram = {} 14 | trigram = {} 15 | vocab_size = 9488 # 0:'#END' 9487:'UNK' 9488:'#BEGIN' 16 | 17 | train_data = json.load(open('data/captions_train.json')) 18 | for all_cap in train_data['captions']: 19 | for cap in all_cap: 20 | tokens = cap.split(' ') 21 | L = len(tokens) 22 | index = [] 23 | for i in range(len(tokens)): 24 | if tokens[i] in vocab: 25 | index.append(vocab[tokens[i]]) 26 | else: 27 | index.append(vocab_size-1) 28 | index += [0] * (16-L) 29 | fourgram_seq = [vocab_size,vocab_size,vocab_size] + index 30 | trigram_seq = [vocab_size,vocab_size] + index 31 | for j in range(16): 32 | fourgram_tuple = tuple(fourgram_seq[j:j+3]) 33 | trigram_tuple = tuple(trigram_seq[j:j+2]) 34 | if fourgram_tuple not in fourgram: 35 | fourgram[fourgram_tuple] = {} 36 | fourgram[fourgram_tuple][index[j]] = 1 37 | else: 38 | if index[j] not in fourgram[fourgram_tuple]: 39 | fourgram[fourgram_tuple][index[j]] = 1 40 | else: 41 | fourgram[fourgram_tuple][index[j]] += 1 42 | 43 | if trigram_tuple not in trigram: 44 | trigram[trigram_tuple] = {} 45 | trigram[trigram_tuple][index[j]] = 1 46 | else: 47 | if index[j] not in trigram[trigram_tuple]: 48 | trigram[trigram_tuple][index[j]] = 1 49 | else: 50 | 
trigram[trigram_tuple][index[j]] += 1 51 | 52 | f = open('data/fourgram.pkl','w') 53 | pickle.dump(fourgram,f) 54 | f.close() 55 | 56 | f = open('data/trigram.pkl','w') 57 | pickle.dump(trigram,f) 58 | f.close() 59 | -------------------------------------------------------------------------------- /images/badending.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/images/badending.png -------------------------------------------------------------------------------- /images/fourgram_att.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/images/fourgram_att.png -------------------------------------------------------------------------------- /images/rnn_att.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/images/rnn_att.png -------------------------------------------------------------------------------- /images/rnn_fc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/images/rnn_fc.png -------------------------------------------------------------------------------- /misc/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/__init__.pyc -------------------------------------------------------------------------------- /misc/ngram_reward.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import time 7 | import misc.utils as utils 8 | from collections import OrderedDict 9 | import torch 10 | from torch.autograd import Variable 11 | 12 | import sys 13 | 14 | sys.path.append("cider") 15 | from pyciderevalcap.ciderD.ciderD import CiderD 16 | 17 | CiderD_scorer = None 18 | 19 | 20 | # CiderD_scorer = CiderD(df='corpus') 21 | 22 | def init_cider_scorer(cached_tokens): 23 | global CiderD_scorer 24 | CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens) 25 | 26 | 27 | def array_to_str(arr): 28 | out = '' 29 | for i in range(len(arr)): 30 | out += str(arr[i]) + ' ' 31 | if arr[i] == 0: 32 | break 33 | return out.strip() 34 | 35 | 36 | def get_self_critical_reward(model, fc_feats, att_feats, data, gen_result): 37 | batch_size = gen_result.size(0) # batch_size = sample_size * seq_per_img 38 | seq_per_img = batch_size // len(data['gts']) 39 | 40 | # get greedy decoding baseline 41 | greedy_res, _ = model.ngram_sample(Variable(fc_feats.data, volatile=True), Variable(att_feats.data, volatile=True)) 42 | 43 | res = OrderedDict() 44 | 45 | gen_result = gen_result.cpu().numpy() 46 | greedy_res = greedy_res.cpu().numpy() 47 | for i in range(batch_size): 48 | res[i] = [array_to_str(gen_result[i])] 49 | for i in range(batch_size): 50 | res[batch_size + i] = [array_to_str(greedy_res[i])] 51 | 52 | gts = OrderedDict() 53 | for i in range(len(data['gts'])): 54 | gts[i] = [array_to_str(data['gts'][i][j]) for j in range(len(data['gts'][i]))] 55 | 56 | # 
_, scores = Bleu(4).compute_score(gts, res) 57 | # scores = np.array(scores[3]) 58 | res = [{'image_id': i, 'caption': res[i]} for i in range(2 * batch_size)] 59 | gts = {i: gts[i % batch_size // seq_per_img] for i in range(2 * batch_size)} 60 | _, scores = CiderD_scorer.compute_score(gts, res) 61 | print('Cider scores:', _) 62 | 63 | scores = scores[:batch_size] - scores[batch_size:] 64 | 65 | rewards = np.repeat(scores[:, np.newaxis], gen_result.shape[1], 1) 66 | 67 | return rewards -------------------------------------------------------------------------------- /misc/ngram_reward.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/ngram_reward.pyc -------------------------------------------------------------------------------- /misc/ngram_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | bad_endding = ['with','in','on','of','a','at','to','for','an','this','his','her','that'] 5 | import collections 6 | import torch 7 | import torch.nn as nn 8 | from torch.autograd import Variable 9 | import numpy as np 10 | 11 | def if_use_att(caption_model): 12 | # Decide if load attention feature according to caption model 13 | if caption_model in ['show_tell', 'all_img', 'fc']: 14 | return False 15 | return True 16 | 17 | def delete_badding(txt): 18 | if txt.endswith('with'): 19 | txt = txt[0:len(txt)-5] 20 | 21 | 22 | # Input: seq, N*D numpy array, with element 0 .. vocab_size. 0 is END token. 23 | def decode_sequence(ix_to_word, seq): 24 | N, D = seq.size() 25 | out = [] 26 | for i in range(N): 27 | txt = '' 28 | for j in range(D): 29 | ix = seq[i,j] 30 | if ix > 0 : 31 | if j >= 1: 32 | txt = txt + ' ' 33 | txt = txt + ix_to_word[str(ix)] 34 | else: 35 | break 36 | out.append(txt) 37 | return out 38 | def delete_decode_sequence(ix_to_word, seq): 39 | N, D = seq.size() 40 | out = [] 41 | for i in range(N): 42 | flag = 0 43 | for j in range(D): 44 | ix = seq[i,D-1-j] 45 | if ix > 0 and ix_to_word[str(ix)] not in bad_endding: 46 | flag = D-j 47 | break 48 | txt = ' '.join([ix_to_word[str(ix)] for ix in seq[i,0:flag]]) 49 | out.append(txt) 50 | return out 51 | def to_contiguous(tensor): 52 | if tensor.is_contiguous(): 53 | return tensor 54 | else: 55 | return tensor.contiguous() 56 | 57 | class RewardCriterion(nn.Module): 58 | def __init__(self): 59 | super(RewardCriterion, self).__init__() 60 | 61 | def forward(self, input, seq, reward): 62 | input = to_contiguous(input).view(-1) 63 | reward = to_contiguous(reward).view(-1) 64 | mask = (seq>0).float() 65 | mask = to_contiguous(torch.cat([mask.new(mask.size(0), 1).fill_(1), mask[:, :-1]], 1)).view(-1) 66 | output = - input * reward * Variable(mask) 67 | output = torch.sum(output) / torch.sum(mask) 68 | 69 | return output 70 | class LanguageModelCriterion(nn.Module): 71 | def __init__(self): 72 | super(LanguageModelCriterion, self).__init__() 73 | 74 | def forward(self, input, target, mask): 75 | # truncate to the same size 76 | target = target[:, :input.size(1)] 77 | mask = mask[:, :input.size(1)] 78 | input = to_contiguous(input).view(-1, input.size(2)) 79 | target = to_contiguous(target).view(-1, 1) 80 | mask = to_contiguous(mask).view(-1, 1) 81 | output = - input.gather(1, target) * mask 82 | output = torch.sum(output) / torch.sum(mask) 83 | 84 | 
return output 85 | 86 | def set_lr(optimizer, lr): 87 | for group in optimizer.param_groups: 88 | group['lr'] = lr 89 | 90 | def clip_gradient(optimizer, grad_clip): 91 | for group in optimizer.param_groups: 92 | for param in group['params']: 93 | param.grad.data.clamp_(-grad_clip, grad_clip) -------------------------------------------------------------------------------- /misc/ngram_utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/ngram_utils.pyc -------------------------------------------------------------------------------- /misc/resnet.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import math 3 | import torch.utils.model_zoo as model_zoo 4 | 5 | 6 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 7 | 'resnet152'] 8 | 9 | 10 | model_urls = { 11 | 'resnet18': 'https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth', 12 | 'resnet34': 'https://s3.amazonaws.com/pytorch/models/resnet34-333f7ec4.pth', 13 | 'resnet50': 'https://s3.amazonaws.com/pytorch/models/resnet50-19c8e357.pth', 14 | 'resnet101': 'https://s3.amazonaws.com/pytorch/models/resnet101-5d3b4d8f.pth', 15 | 'resnet152': 'https://s3.amazonaws.com/pytorch/models/resnet152-b121ed2d.pth', 16 | } 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1): 20 | "3x3 convolution with padding" 21 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 22 | padding=1, bias=False) 23 | 24 | 25 | class BasicBlock(nn.Module): 26 | expansion = 1 27 | 28 | def __init__(self, inplanes, planes, stride=1, downsample=None): 29 | super(BasicBlock, self).__init__() 30 | self.conv1 = conv3x3(inplanes, planes, stride) 31 | self.bn1 = nn.BatchNorm2d(planes) 32 | self.relu = nn.ReLU(inplace=True) 33 | self.conv2 = conv3x3(planes, planes) 34 | self.bn2 = nn.BatchNorm2d(planes) 35 | self.downsample = downsample 36 | self.stride = stride 37 | 38 | def forward(self, x): 39 | residual = x 40 | 41 | out = self.conv1(x) 42 | out = self.bn1(out) 43 | out = self.relu(out) 44 | 45 | out = self.conv2(out) 46 | out = self.bn2(out) 47 | 48 | if self.downsample is not None: 49 | residual = self.downsample(x) 50 | 51 | out += residual 52 | out = self.relu(out) 53 | 54 | return out 55 | 56 | 57 | class Bottleneck(nn.Module): 58 | expansion = 4 59 | 60 | def __init__(self, inplanes, planes, stride=1, downsample=None): 61 | super(Bottleneck, self).__init__() 62 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False) # change 63 | self.bn1 = nn.BatchNorm2d(planes) 64 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, # change 65 | padding=1, bias=False) 66 | self.bn2 = nn.BatchNorm2d(planes) 67 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 68 | self.bn3 = nn.BatchNorm2d(planes * 4) 69 | self.relu = nn.ReLU(inplace=True) 70 | self.downsample = downsample 71 | self.stride = stride 72 | 73 | def forward(self, x): 74 | residual = x 75 | 76 | out = self.conv1(x) 77 | out = self.bn1(out) 78 | out = self.relu(out) 79 | 80 | out = self.conv2(out) 81 | out = self.bn2(out) 82 | out = self.relu(out) 83 | 84 | out = self.conv3(out) 85 | out = self.bn3(out) 86 | 87 | if self.downsample is not None: 88 | residual = self.downsample(x) 89 | 90 | out += residual 91 | out = self.relu(out) 92 | 93 | return out 94 | 95 | 96 | class ResNet(nn.Module): 97 | def 
__init__(self, block, layers, num_classes=1000): 98 | self.inplanes = 64 99 | super(ResNet, self).__init__() 100 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 101 | bias=False) 102 | self.bn1 = nn.BatchNorm2d(64) 103 | self.relu = nn.ReLU(inplace=True) 104 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True) # change 105 | self.layer1 = self._make_layer(block, 64, layers[0]) 106 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 107 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 108 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 109 | self.avgpool = nn.AvgPool2d(7) 110 | self.fc = nn.Linear(512 * block.expansion, num_classes) 111 | 112 | for m in self.modules(): 113 | if isinstance(m, nn.Conv2d): 114 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 115 | m.weight.data.normal_(0, math.sqrt(2. / n)) 116 | elif isinstance(m, nn.BatchNorm2d): 117 | m.weight.data.fill_(1) 118 | m.bias.data.zero_() 119 | 120 | def _make_layer(self, block, planes, blocks, stride=1): 121 | downsample = None 122 | if stride != 1 or self.inplanes != planes * block.expansion: 123 | downsample = nn.Sequential( 124 | nn.Conv2d(self.inplanes, planes * block.expansion, 125 | kernel_size=1, stride=stride, bias=False), 126 | nn.BatchNorm2d(planes * block.expansion), 127 | ) 128 | 129 | layers = [] 130 | layers.append(block(self.inplanes, planes, stride, downsample)) 131 | self.inplanes = planes * block.expansion 132 | for i in range(1, blocks): 133 | layers.append(block(self.inplanes, planes)) 134 | 135 | return nn.Sequential(*layers) 136 | 137 | def forward(self, x): 138 | x = self.conv1(x) 139 | x = self.bn1(x) 140 | x = self.relu(x) 141 | x = self.maxpool(x) 142 | 143 | x = self.layer1(x) 144 | x = self.layer2(x) 145 | x = self.layer3(x) 146 | x = self.layer4(x) 147 | 148 | x = self.avgpool(x) 149 | x = x.view(x.size(0), -1) 150 | x = self.fc(x) 151 | 152 | return x 153 | 154 | 155 | def resnet18(pretrained=False): 156 | """Constructs a ResNet-18 model. 157 | 158 | Args: 159 | pretrained (bool): If True, returns a model pre-trained on ImageNet 160 | """ 161 | model = ResNet(BasicBlock, [2, 2, 2, 2]) 162 | if pretrained: 163 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18'])) 164 | return model 165 | 166 | 167 | def resnet34(pretrained=False): 168 | """Constructs a ResNet-34 model. 169 | 170 | Args: 171 | pretrained (bool): If True, returns a model pre-trained on ImageNet 172 | """ 173 | model = ResNet(BasicBlock, [3, 4, 6, 3]) 174 | if pretrained: 175 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34'])) 176 | return model 177 | 178 | 179 | def resnet50(pretrained=False): 180 | """Constructs a ResNet-50 model. 181 | 182 | Args: 183 | pretrained (bool): If True, returns a model pre-trained on ImageNet 184 | """ 185 | model = ResNet(Bottleneck, [3, 4, 6, 3]) 186 | if pretrained: 187 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50'])) 188 | return model 189 | 190 | 191 | def resnet101(pretrained=False): 192 | """Constructs a ResNet-101 model. 193 | 194 | Args: 195 | pretrained (bool): If True, returns a model pre-trained on ImageNet 196 | """ 197 | model = ResNet(Bottleneck, [3, 4, 23, 3]) 198 | if pretrained: 199 | model.load_state_dict(model_zoo.load_url(model_urls['resnet101'])) 200 | return model 201 | 202 | 203 | def resnet152(pretrained=False): 204 | """Constructs a ResNet-152 model. 
205 | 206 | Args: 207 | pretrained (bool): If True, returns a model pre-trained on ImageNet 208 | """ 209 | model = ResNet(Bottleneck, [3, 8, 36, 3]) 210 | if pretrained: 211 | model.load_state_dict(model_zoo.load_url(model_urls['resnet152'])) 212 | return model -------------------------------------------------------------------------------- /misc/resnet.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/resnet.pyc -------------------------------------------------------------------------------- /misc/resnet_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import torch.nn.functional as F 5 | 6 | class myResnet(nn.Module): 7 | def __init__(self, resnet): 8 | super(myResnet, self).__init__() 9 | self.resnet = resnet 10 | 11 | def forward(self, img, att_size=14): 12 | x = img.unsqueeze(0) 13 | 14 | x = self.resnet.conv1(x) 15 | x = self.resnet.bn1(x) 16 | x = self.resnet.relu(x) 17 | x = self.resnet.maxpool(x) 18 | 19 | x = self.resnet.layer1(x) 20 | x = self.resnet.layer2(x) 21 | x = self.resnet.layer3(x) 22 | x = self.resnet.layer4(x) 23 | 24 | fc = x.mean(3).mean(2).squeeze() 25 | att = F.adaptive_avg_pool2d(x,[att_size,att_size]).squeeze().permute(1, 2, 0) 26 | 27 | return fc, att 28 | 29 | -------------------------------------------------------------------------------- /misc/resnet_utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/resnet_utils.pyc -------------------------------------------------------------------------------- /misc/rewards.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import time 7 | import misc.utils as utils 8 | from collections import OrderedDict 9 | import torch 10 | from torch.autograd import Variable 11 | 12 | import sys 13 | 14 | sys.path.append("cider") 15 | from pyciderevalcap.ciderD.ciderD import CiderD 16 | 17 | CiderD_scorer = None 18 | 19 | 20 | # CiderD_scorer = CiderD(df='corpus') 21 | 22 | def init_cider_scorer(cached_tokens): 23 | global CiderD_scorer 24 | CiderD_scorer = CiderD_scorer or CiderD(df=cached_tokens) 25 | 26 | 27 | def array_to_str(arr): 28 | out = '' 29 | for i in range(len(arr)): 30 | out += str(arr[i]) + ' ' 31 | if arr[i] == 0: 32 | break 33 | return out.strip() 34 | 35 | 36 | def get_self_critical_reward(model, fc_feats, att_feats, data, gen_result): 37 | batch_size = gen_result.size(0) # batch_size = sample_size * seq_per_img 38 | seq_per_img = batch_size // len(data['gts']) 39 | 40 | # get greedy decoding baseline 41 | greedy_res, _ = model.sample(Variable(fc_feats.data, volatile=True), Variable(att_feats.data, volatile=True)) 42 | 43 | res = OrderedDict() 44 | 45 | gen_result = gen_result.cpu().numpy() 46 | greedy_res = greedy_res.cpu().numpy() 47 | for i in range(batch_size): 48 | res[i] = [array_to_str(gen_result[i])] 49 | for i in range(batch_size): 50 | res[batch_size + i] = [array_to_str(greedy_res[i])] 51 | 52 | gts = OrderedDict() 53 | for i in range(len(data['gts'])): 54 | gts[i] = [array_to_str(data['gts'][i][j]) for j in 
range(len(data['gts'][i]))] 55 | 56 | # _, scores = Bleu(4).compute_score(gts, res) 57 | # scores = np.array(scores[3]) 58 | res = [{'image_id': i, 'caption': res[i]} for i in range(2 * batch_size)] 59 | gts = {i: gts[i % batch_size // seq_per_img] for i in range(2 * batch_size)} 60 | _, scores = CiderD_scorer.compute_score(gts, res) 61 | print('Cider scores:', _) 62 | 63 | scores = scores[:batch_size] - scores[batch_size:] 64 | 65 | rewards = np.repeat(scores[:, np.newaxis], gen_result.shape[1], 1) 66 | 67 | return rewards -------------------------------------------------------------------------------- /misc/rewards.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/rewards.pyc -------------------------------------------------------------------------------- /misc/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import collections 6 | import torch 7 | import torch.nn as nn 8 | from torch.autograd import Variable 9 | import numpy as np 10 | 11 | def if_use_att(caption_model): 12 | # Decide if load attention feature according to caption model 13 | if caption_model in ['show_tell', 'all_img', 'fc']: 14 | return False 15 | return True 16 | 17 | # Input: seq, N*D numpy array, with element 0 .. vocab_size. 0 is END token. 18 | def decode_sequence(ix_to_word, seq): 19 | N, D = seq.size() 20 | out = [] 21 | for i in range(N): 22 | txt = '' 23 | for j in range(D): 24 | ix = seq[i,j] 25 | if ix > 0 : 26 | if j >= 1: 27 | txt = txt + ' ' 28 | txt = txt + ix_to_word[str(ix)] 29 | else: 30 | break 31 | out.append(txt) 32 | return out 33 | 34 | def to_contiguous(tensor): 35 | if tensor.is_contiguous(): 36 | return tensor 37 | else: 38 | return tensor.contiguous() 39 | 40 | class RewardCriterion(nn.Module): 41 | def __init__(self): 42 | super(RewardCriterion, self).__init__() 43 | 44 | def forward(self, input, seq, reward): 45 | input = to_contiguous(input).view(-1) 46 | reward = to_contiguous(reward).view(-1) 47 | mask = (seq>0).float() 48 | mask = to_contiguous(torch.cat([mask.new(mask.size(0), 1).fill_(1), mask[:, :-1]], 1)).view(-1) 49 | output = - input * reward * Variable(mask) 50 | output = torch.sum(output) / torch.sum(mask) 51 | 52 | return output 53 | class LanguageModelCriterion(nn.Module): 54 | def __init__(self): 55 | super(LanguageModelCriterion, self).__init__() 56 | 57 | def forward(self, input, target, mask): 58 | # truncate to the same size 59 | target = target[:, :input.size(1)] 60 | mask = mask[:, :input.size(1)] 61 | input = to_contiguous(input).view(-1, input.size(2)) 62 | target = to_contiguous(target).view(-1, 1) 63 | mask = to_contiguous(mask).view(-1, 1) 64 | output = - input.gather(1, target) * mask 65 | output = torch.sum(output) / torch.sum(mask) 66 | 67 | return output 68 | 69 | def set_lr(optimizer, lr): 70 | for group in optimizer.param_groups: 71 | group['lr'] = lr 72 | 73 | def clip_gradient(optimizer, grad_clip): 74 | for group in optimizer.param_groups: 75 | for param in group['params']: 76 | param.grad.data.clamp_(-grad_clip, grad_clip) 77 | -------------------------------------------------------------------------------- /misc/utils.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/tgGuo15/PriorImageCaption/4ee6017d642116145cc74c6f752685bd2d19b1cc/misc/utils.pyc -------------------------------------------------------------------------------- /mycider.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import math 4 | from collections import defaultdict 5 | 6 | 7 | def transfer_result_to_res(data): 8 | res = {} 9 | for i in range(len(data)): 10 | res[data[i]['image_id']] = [data[i]['caption']] 11 | return res 12 | 13 | 14 | def transfer_json_to_cider_gts(json_file): 15 | print '... changing standard format for cider calculation' 16 | with open(json_file) as f: 17 | data = json.load(f) 18 | image_index = data['image_ids'] 19 | index_caption = data['captions'] 20 | gts_caption = {} 21 | for i in range(len(image_index)): 22 | gts_caption[image_index[i]] = index_caption[i] 23 | print '... finishing changing standard format' 24 | return gts_caption 25 | 26 | 27 | def precook(s, n=4, out=False): 28 | """ 29 | Takes a string as input and returns an object that can be given to 30 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 31 | can take string arguments as well. 32 | :param s: string : sentence to be converted into ngrams 33 | :param n: int : number of ngrams for which representation is calculated 34 | :return: term frequency vector for occuring ngrams 35 | """ 36 | words = s.split() 37 | counts = defaultdict(int) 38 | for k in xrange(1,n+1): 39 | for i in xrange(len(words)-k+1): 40 | ngram = tuple(words[i:i+k]) 41 | counts[ngram] += 1 42 | return counts 43 | 44 | 45 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 46 | '''Takes a list of reference sentences for a single segment 47 | and returns an object that encapsulates everything that BLEU 48 | needs to know about them. 49 | :param refs: list of string : reference sentences for some image 50 | :param n: int : number of ngrams for which (ngram) representation is calculated 51 | :return: result (list of dict) 52 | ''' 53 | return [precook(ref, n) for ref in refs] 54 | 55 | 56 | def cook_test(test, n=4): 57 | '''Takes a test sentence and returns an object that 58 | encapsulates everything that BLEU needs to know about it. 59 | :param test: list of string : hypothesis sentence for some image 60 | :param n: int : number of ngrams for which (ngram) representation is calculated 61 | :return: result (dict) 62 | ''' 63 | return precook(test, n, True) 64 | 65 | 66 | class CiderScorer(object): 67 | def __init__(self, refs=None, n=4, sigma=6.0): 68 | self.n = n 69 | self.sigma = sigma 70 | self.crefs = [] 71 | self.ref_to_imageId = {} 72 | self.build_cook_refs(refs) 73 | self.document_frequency = defaultdict(float) 74 | self.compute_doc_freq() 75 | 76 | def compute_doc_freq(self): 77 | """ 78 | Compute term frequency for reference data. 79 | This will be used to compute idf (inverse document frequency later) 80 | The term frequency is stored in the object 81 | :return: None 82 | """ 83 | print 'done for stats' 84 | for refs in self.crefs: 85 | # refs, k ref captions of one image 86 | for ngram in set([ngram for ref in refs for (ngram, count) in ref.iteritems()]): 87 | self.document_frequency[ngram] += 1 88 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 89 | 90 | def Counts2vec(self,cnts): 91 | """ 92 | Function maps counts of ngram to vector of tfidf weights. 93 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 
94 | The n-th entry of array denotes length of n-grams. 95 | :param cnts: 96 | :return: vec (array of dict), norm (array of float), length (int) 97 | """ 98 | vec = [defaultdict(float) for _ in range(self.n)] 99 | length = 0 100 | norm = [0.0 for _ in range(self.n)] 101 | for (ngram, term_freq) in cnts.iteritems(): 102 | # give word count 1 if it doesn't appear in reference corpus 103 | df = np.log(max(1.0, self.document_frequency[ngram])) 104 | # ngram index 105 | n = len(ngram) - 1 106 | # tf (term_freq) * idf (precomputed idf) for n-grams 107 | vec[n][ngram] = float(term_freq) * (self.ref_len - df) 108 | # compute norm for the vector. the norm will be used for computing similarity 109 | norm[n] += pow(vec[n][ngram], 2) 110 | 111 | if n == 1: 112 | length += term_freq 113 | norm = [np.sqrt(n) for n in norm] 114 | return vec, norm, length 115 | 116 | def Sim(self, vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 117 | """ 118 | Compute the cosine similarity of two vectors. 119 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 120 | :param vec_ref: array of dictionary for vector corresponding to reference 121 | :param norm_hyp: array of float for vector corresponding to hypothesis 122 | :param norm_ref: array of float for vector corresponding to reference 123 | :param length_hyp: int containing length of hypothesis 124 | :param length_ref: int containing length of reference 125 | :return: array of score for each n-grams cosine similarity 126 | """ 127 | delta = float(length_hyp - length_ref) 128 | # measure consine similarity 129 | val = np.array([0.0 for _ in range(self.n)]) 130 | for n in range(self.n): 131 | # ngram 132 | for (ngram, count) in vec_hyp[n].iteritems(): 133 | # vrama91 : added clipping 134 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 135 | 136 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 137 | val[n] /= (norm_hyp[n] * norm_ref[n]) 138 | 139 | assert (not math.isnan(val[n])) 140 | # vrama91: added a length based gaussian penalty 141 | val[n] *= np.e ** (-(delta ** 2) / (2 * self.sigma ** 2)) 142 | return val 143 | 144 | # compute log reference length 145 | 146 | def build_cook_refs(self, refs): 147 | count = 0 148 | if refs is not None: 149 | for item in refs: 150 | self.ref_to_imageId[item] = count 151 | self.crefs.append(cook_refs(refs[item], n= self.n)) 152 | count = count + 1 153 | 154 | def cook_append_test(self, test=None): 155 | self.ctest = [] 156 | self.test_to_imageId = {} 157 | Counttest = 0 158 | if test is not None: 159 | for item in test: 160 | self.test_to_imageId[Counttest] = item 161 | self.ctest.append(cook_test(test[item][0], n=self.n)) 162 | Counttest = Counttest + 1 163 | else: 164 | self.ctest.append(None) 165 | 166 | def compute_cider(self): 167 | def counts2vec(cnts): 168 | """ 169 | Function maps counts of ngram to vector of tfidf weights. 170 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 171 | The n-th entry of array denotes length of n-grams. 
172 | :param cnts: 173 | :return: vec (array of dict), norm (array of float), length (int) 174 | """ 175 | vec = [defaultdict(float) for _ in range(self.n)] 176 | length = 0 177 | norm = [0.0 for _ in range(self.n)] 178 | for (ngram, term_freq) in cnts.iteritems(): 179 | # give word count 1 if it doesn't appear in reference corpus 180 | df = np.log(max(1.0, self.document_frequency[ngram])) 181 | # ngram index 182 | n = len(ngram) - 1 183 | # tf (term_freq) * idf (precomputed idf) for n-grams 184 | vec[n][ngram] = float(term_freq) * (self.ref_len - df) 185 | # compute norm for the vector. the norm will be used for computing similarity 186 | norm[n] += pow(vec[n][ngram], 2) 187 | 188 | if n == 1: 189 | length += term_freq 190 | norm = [np.sqrt(n) for n in norm] 191 | return vec, norm, length 192 | 193 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 194 | """ 195 | Compute the cosine similarity of two vectors. 196 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 197 | :param vec_ref: array of dictionary for vector corresponding to reference 198 | :param norm_hyp: array of float for vector corresponding to hypothesis 199 | :param norm_ref: array of float for vector corresponding to reference 200 | :param length_hyp: int containing length of hypothesis 201 | :param length_ref: int containing length of reference 202 | :return: array of score for each n-grams cosine similarity 203 | """ 204 | delta = float(length_hyp - length_ref) 205 | # measure consine similarity 206 | val = np.array([0.0 for _ in range(self.n)]) 207 | for n in range(self.n): 208 | # ngram 209 | for (ngram, count) in vec_hyp[n].iteritems(): 210 | # vrama91 : added clipping 211 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 212 | 213 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 214 | val[n] /= (norm_hyp[n] * norm_ref[n]) 215 | 216 | assert (not math.isnan(val[n])) 217 | # vrama91: added a length based gaussian penalty 218 | val[n] *= np.e ** (-(delta ** 2) / (2 * self.sigma ** 2)) 219 | return val 220 | 221 | # compute log reference length 222 | self.ref_len = np.log(float(len(self.crefs))) 223 | 224 | scores = [] 225 | # for test, refs in zip(self.ctest, self.crefs): 226 | for id in range(len(self.ctest)): 227 | test = self.ctest[id] 228 | refs = self.crefs[self.ref_to_imageId[self.test_to_imageId[id]]] 229 | # compute vector for test captions 230 | vec, norm, length = counts2vec(test) 231 | # compute vector for ref captions 232 | score = np.array([0.0 for _ in range(self.n)]) 233 | for ref in refs: 234 | vec_ref, norm_ref, length_ref = counts2vec(ref) 235 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 236 | # change by vrama91 - mean of ngram scores, instead of sum 237 | score_avg = np.mean(score) 238 | # divide by number of references 239 | score_avg /= len(refs) 240 | # multiply score by 10 241 | score_avg *= 10.0 242 | # append score of an image to the score list 243 | scores.append(score_avg) 244 | return scores 245 | 246 | def compute_score(self, option=None, verbose=0): 247 | # compute idf 248 | #if first_time == 1: 249 | 250 | # assert to check document frequency 251 | #assert (len(self.ctest) >= max(self.document_frequency.values())) 252 | # compute cider score 253 | score = self.compute_cider() 254 | # debug 255 | # print score 256 | return np.mean(np.array(score)), np.array(score) -------------------------------------------------------------------------------- /ngram_opts.py: 
-------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def parse_opt(): 4 | parser = argparse.ArgumentParser() 5 | # Data input settings 6 | parser.add_argument('--input_json', type=str, default='data/features/cocotalk.json', 7 | help='path to the json file containing additional info and vocab') 8 | parser.add_argument('--input_fc_dir', type=str, default='data/features_fc', 9 | help='path to the directory containing the preprocessed fc feats') 10 | parser.add_argument('--input_att_dir', type=str, default='data/features_att', 11 | help='path to the directory containing the preprocessed att feats') 12 | parser.add_argument('--input_label_h5', type=str, default='data/features/cocotalk_label.h5', 13 | help='path to the h5file containing the preprocessed dataset') 14 | parser.add_argument('--cached_tokens', type=str, default='coco-train-idxs', 15 | help='Cached token file for calculating cider score during self critical training.') 16 | 17 | # Model settings 18 | parser.add_argument('--caption_model', type=str, default="fc", 19 | help='fc, att') 20 | parser.add_argument('--rnn_size', type=int, default=512, 21 | help='size of the rnn in number of hidden nodes in each layer') 22 | parser.add_argument('--fc_feat_size', type=int, default=2048, 23 | help='2048 for resnet, 4096 for vgg') 24 | parser.add_argument('--att_feat_size', type=int, default=2048, 25 | help='2048 for resnet, 512 for vgg') 26 | 27 | args = parser.parse_args() 28 | return args 29 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | #'Prior Image Caption' -------------------------------------------------------------------------------- /scripts/prepro_feats.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | import torch 5 | torch.cuda.set_device(3) 6 | import os 7 | import json 8 | import argparse 9 | from random import shuffle, seed 10 | import string 11 | # non-standard dependencies: 12 | import h5py 13 | from six.moves import cPickle 14 | import numpy as np 15 | import torch 16 | import torchvision.models as models 17 | from torch.autograd import Variable 18 | import skimage.io 19 | 20 | from torchvision import transforms as trn 21 | 22 | preprocess = trn.Compose([ 23 | #trn.ToTensor(), 24 | trn.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 25 | ]) 26 | 27 | from misc.resnet_utils import myResnet 28 | import misc.resnet as resnet 29 | 30 | def main(params): 31 | net = getattr(resnet, params['model'])() 32 | net.load_state_dict(torch.load(os.path.join(params['model_root'],params['model']+'.pth'))) 33 | my_resnet = myResnet(net) 34 | my_resnet.cuda() 35 | my_resnet.eval() 36 | 37 | imgs = json.load(open(params['input_json'], 'r')) 38 | imgs = imgs['images'] 39 | N = len(imgs) 40 | 41 | seed(123) # make reproducible 42 | 43 | dir_fc = params['output_dir']+'_fc' 44 | dir_att = params['output_dir']+'_att' 45 | if not os.path.isdir(dir_fc): 46 | os.mkdir(dir_fc) 47 | if not os.path.isdir(dir_att): 48 | os.mkdir(dir_att) 49 | 50 | for i,img in enumerate(imgs): 51 | # load the image 52 | I = skimage.io.imread(os.path.join(params['images_root'], img['filepath'], img['filename'])) 53 | # handle grayscale input images 54 | if len(I.shape) == 2: 55 | I = I[:,:,np.newaxis] 56 | I = 
np.concatenate((I,I,I), axis=2) 57 | 58 | I = I.astype('float32')/255.0 59 | I = torch.from_numpy(I.transpose([2,0,1])).cuda() 60 | I = Variable(preprocess(I), volatile=True) 61 | tmp_fc, tmp_att = my_resnet(I, params['att_size']) 62 | # write to pkl 63 | np.save(os.path.join(dir_fc, str(img['cocoid'])), tmp_fc.data.cpu().float().numpy()) 64 | np.savez_compressed(os.path.join(dir_att, str(img['cocoid'])), feat=tmp_att.data.cpu().float().numpy()) 65 | 66 | if i % 1000 == 0: 67 | print('processing %d/%d (%.2f%% done)' % (i, N, i*100.0/N)) 68 | print('wrote ', params['output_dir']) 69 | 70 | if __name__ == "__main__": 71 | 72 | parser = argparse.ArgumentParser() 73 | 74 | # input json 75 | parser.add_argument('--input_json', default='data/dataset_coco.json',required=True, help='input json file to process into hdf5') 76 | parser.add_argument('--output_dir', default='data/cocotalk', help='output h5 file') 77 | 78 | # options 79 | parser.add_argument('--images_root', default='/home/trunk/RTrunk1/kk/MSCOCO/2014/images.cocodataset.org/zips/coco', help='root location in which images are stored, to be prepended to file_path in input json') 80 | parser.add_argument('--att_size', default=14, type=int, help='14x14 or 7x7') 81 | parser.add_argument('--model', default='resnet101', type=str, help='resnet101, resnet152') 82 | parser.add_argument('--model_root', default='./data/imagenet_weights', type=str, help='model root') 83 | 84 | args = parser.parse_args() 85 | params = vars(args) # convert to ordinary dict 86 | print('parsed input parameters:') 87 | print(json.dumps(params, indent = 2)) 88 | main(params) 89 | -------------------------------------------------------------------------------- /scripts/prepro_labels.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import os 6 | import json 7 | import argparse 8 | from random import shuffle, seed 9 | import string 10 | # non-standard dependencies: 11 | import h5py 12 | import numpy as np 13 | import torch 14 | import torchvision.models as models 15 | from torch.autograd import Variable 16 | import skimage.io 17 | 18 | def build_vocab(imgs, params): 19 | count_thr = params['word_count_threshold'] 20 | 21 | # count up the number of words 22 | counts = {} 23 | for img in imgs: 24 | for sent in img['sentences']: 25 | for w in sent['tokens']: 26 | counts[w] = counts.get(w, 0) + 1 27 | cw = sorted([(count,w) for w,count in counts.items()], reverse=True) 28 | print('top words and their counts:') 29 | print('\n'.join(map(str,cw[:20]))) 30 | 31 | # print some stats 32 | total_words = sum(counts.values()) 33 | print('total words:', total_words) 34 | bad_words = [w for w,n in counts.items() if n <= count_thr] 35 | vocab = [w for w,n in counts.items() if n > count_thr] 36 | bad_count = sum(counts[w] for w in bad_words) 37 | print('number of bad words: %d/%d = %.2f%%' % (len(bad_words), len(counts), len(bad_words)*100.0/len(counts))) 38 | print('number of words in vocab would be %d' % (len(vocab), )) 39 | print('number of UNKs: %d/%d = %.2f%%' % (bad_count, total_words, bad_count*100.0/total_words)) 40 | 41 | # lets look at the distribution of lengths as well 42 | sent_lengths = {} 43 | for img in imgs: 44 | for sent in img['sentences']: 45 | txt = sent['tokens'] 46 | nw = len(txt) 47 | sent_lengths[nw] = sent_lengths.get(nw, 0) + 1 48 | max_len = max(sent_lengths.keys()) 49 | print('max length sentence in 
raw data: ', max_len) 50 | print('sentence length distribution (count, number of words):') 51 | sum_len = sum(sent_lengths.values()) 52 | for i in range(max_len+1): 53 | print('%2d: %10d %f%%' % (i, sent_lengths.get(i,0), sent_lengths.get(i,0)*100.0/sum_len)) 54 | 55 | # lets now produce the final annotations 56 | if bad_count > 0: 57 | # additional special UNK token we will use below to map infrequent words to 58 | print('inserting the special UNK token') 59 | vocab.append('UNK') 60 | 61 | for img in imgs: 62 | img['final_captions'] = [] 63 | for sent in img['sentences']: 64 | txt = sent['tokens'] 65 | caption = [w if counts.get(w,0) > count_thr else 'UNK' for w in txt] 66 | img['final_captions'].append(caption) 67 | 68 | return vocab 69 | 70 | def encode_captions(imgs, params, wtoi): 71 | """ 72 | encode all captions into one large array, which will be 1-indexed. 73 | also produces label_start_ix and label_end_ix which store 1-indexed 74 | and inclusive (Lua-style) pointers to the first and last caption for 75 | each image in the dataset. 76 | """ 77 | 78 | max_length = params['max_length'] 79 | N = len(imgs) 80 | M = sum(len(img['final_captions']) for img in imgs) # total number of captions 81 | 82 | label_arrays = [] 83 | label_start_ix = np.zeros(N, dtype='uint32') # note: these will be one-indexed 84 | label_end_ix = np.zeros(N, dtype='uint32') 85 | label_length = np.zeros(M, dtype='uint32') 86 | caption_counter = 0 87 | counter = 1 88 | for i,img in enumerate(imgs): 89 | n = len(img['final_captions']) 90 | assert n > 0, 'error: some image has no captions' 91 | 92 | Li = np.zeros((n, max_length), dtype='uint32') 93 | for j,s in enumerate(img['final_captions']): 94 | label_length[caption_counter] = min(max_length, len(s)) # record the length of this sequence 95 | caption_counter += 1 96 | for k,w in enumerate(s): 97 | if k < max_length: 98 | Li[j,k] = wtoi[w] 99 | 100 | # note: word indices are 1-indexed, and captions are padded with zeros 101 | label_arrays.append(Li) 102 | label_start_ix[i] = counter 103 | label_end_ix[i] = counter + n - 1 104 | 105 | counter += n 106 | 107 | L = np.concatenate(label_arrays, axis=0) # put all the labels together 108 | assert L.shape[0] == M, 'lengths don\'t match? that\'s weird' 109 | assert np.all(label_length > 0), 'error: some caption had no words?' 
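    # --- Added illustrative note (hypothetical numbers, not part of the original script) ---
    # Layout produced above: if the first image has 5 final captions and the
    # second has 3, L stacks all 8 caption rows and the 1-indexed, inclusive
    # pointers come out as
    #   label_start_ix = [1, 6]   # image 0 owns rows 1..5, image 1 owns rows 6..8
    #   label_end_ix   = [5, 8]
    # while label_length[k] = min(max_length, len(caption_k)) for each row k.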
110 | 111 | print('encoded captions to array of size ', L.shape) 112 | return L, label_start_ix, label_end_ix, label_length 113 | 114 | def main(params): 115 | 116 | imgs = json.load(open(params['input_json'], 'r')) 117 | imgs = imgs['images'] 118 | 119 | seed(123) # make reproducible 120 | 121 | # create the vocab 122 | vocab = build_vocab(imgs, params) 123 | itow = {i+1:w for i,w in enumerate(vocab)} # a 1-indexed vocab translation table 124 | wtoi = {w:i+1 for i,w in enumerate(vocab)} # inverse table 125 | 126 | # encode captions in large arrays, ready to ship to hdf5 file 127 | L, label_start_ix, label_end_ix, label_length = encode_captions(imgs, params, wtoi) 128 | 129 | # create output h5 file 130 | N = len(imgs) 131 | f_lb = h5py.File(params['output_h5']+'_label.h5', "w") 132 | f_lb.create_dataset("labels", dtype='uint32', data=L) 133 | f_lb.create_dataset("label_start_ix", dtype='uint32', data=label_start_ix) 134 | f_lb.create_dataset("label_end_ix", dtype='uint32', data=label_end_ix) 135 | f_lb.create_dataset("label_length", dtype='uint32', data=label_length) 136 | f_lb.close() 137 | 138 | # create output json file 139 | out = {} 140 | out['ix_to_word'] = itow # encode the (1-indexed) vocab 141 | out['images'] = [] 142 | for i,img in enumerate(imgs): 143 | 144 | jimg = {} 145 | jimg['split'] = img['split'] 146 | if 'filename' in img: jimg['file_path'] = os.path.join(img['filepath'], img['filename']) # copy it over, might need 147 | if 'cocoid' in img: jimg['id'] = img['cocoid'] # copy over & mantain an id, if present (e.g. coco ids, useful) 148 | 149 | out['images'].append(jimg) 150 | 151 | json.dump(out, open(params['output_json'], 'w')) 152 | print('wrote ', params['output_json']) 153 | 154 | if __name__ == "__main__": 155 | 156 | parser = argparse.ArgumentParser() 157 | 158 | # input json 159 | parser.add_argument('--input_json', required=True, help='input json file to process into hdf5') 160 | parser.add_argument('--output_json', default='data.json', help='output json file') 161 | parser.add_argument('--output_h5', default='data', help='output h5 file') 162 | 163 | # options 164 | parser.add_argument('--max_length', default=16, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.') 165 | parser.add_argument('--word_count_threshold', default=5, type=int, help='only words that occur more than this number of times will be put in vocab') 166 | 167 | args = parser.parse_args() 168 | params = vars(args) # convert to ordinary dict 169 | print('parsed input parameters:') 170 | print(json.dumps(params, indent = 2)) 171 | main(params) 172 | -------------------------------------------------------------------------------- /scripts/prepro_ngrams.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | from six.moves import cPickle 5 | from collections import defaultdict 6 | 7 | def precook(s, n=4, out=False): 8 | """ 9 | Takes a string as input and returns an object that can be given to 10 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 11 | can take string arguments as well. 
12 | :param s: string : sentence to be converted into ngrams 13 | :param n: int : number of ngrams for which representation is calculated 14 | :return: term frequency vector for occuring ngrams 15 | """ 16 | words = s.split() 17 | counts = defaultdict(int) 18 | for k in xrange(1,n+1): 19 | for i in xrange(len(words)-k+1): 20 | ngram = tuple(words[i:i+k]) 21 | counts[ngram] += 1 22 | return counts 23 | 24 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 25 | '''Takes a list of reference sentences for a single segment 26 | and returns an object that encapsulates everything that BLEU 27 | needs to know about them. 28 | :param refs: list of string : reference sentences for some image 29 | :param n: int : number of ngrams for which (ngram) representation is calculated 30 | :return: result (list of dict) 31 | ''' 32 | return [precook(ref, n) for ref in refs] 33 | 34 | def create_crefs(refs): 35 | crefs = [] 36 | for ref in refs: 37 | # ref is a list of 5 captions 38 | crefs.append(cook_refs(ref)) 39 | return crefs 40 | 41 | def compute_doc_freq(crefs): 42 | ''' 43 | Compute term frequency for reference data. 44 | This will be used to compute idf (inverse document frequency later) 45 | The term frequency is stored in the object 46 | :return: None 47 | ''' 48 | document_frequency = defaultdict(float) 49 | for refs in crefs: 50 | # refs, k ref captions of one image 51 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]): 52 | document_frequency[ngram] += 1 53 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 54 | return document_frequency 55 | 56 | def build_dict(imgs, wtoi, params): 57 | wtoi[''] = 0 58 | 59 | count_imgs = 0 60 | 61 | refs_words = [] 62 | refs_idxs = [] 63 | for img in imgs: 64 | if (params['split'] == img['split']) or \ 65 | (params['split'] == 'train' and img['split'] == 'restval') or \ 66 | (params['split'] == 'all'): 67 | #(params['split'] == 'val' and img['split'] == 'restval') or \ 68 | ref_words = [] 69 | ref_idxs = [] 70 | for sent in img['sentences']: 71 | tmp_tokens = sent['tokens'] + [''] 72 | tmp_tokens = [_ if _ in wtoi else 'UNK' for _ in tmp_tokens] 73 | ref_words.append(' '.join(tmp_tokens)) 74 | ref_idxs.append(' '.join([str(wtoi[_]) for _ in tmp_tokens])) 75 | refs_words.append(ref_words) 76 | refs_idxs.append(ref_idxs) 77 | count_imgs += 1 78 | print('total imgs:', count_imgs) 79 | 80 | ngram_words = compute_doc_freq(create_crefs(refs_words)) 81 | ngram_idxs = compute_doc_freq(create_crefs(refs_idxs)) 82 | return ngram_words, ngram_idxs, count_imgs 83 | 84 | def main(params): 85 | 86 | imgs = json.load(open(params['input_json'], 'r')) 87 | itow = json.load(open(params['dict_json'], 'r'))['ix_to_word'] 88 | wtoi = {w:i for i,w in itow.items()} 89 | 90 | imgs = imgs['images'] 91 | 92 | ngram_words, ngram_idxs, ref_len = build_dict(imgs, wtoi, params) 93 | 94 | cPickle.dump({'document_frequency': ngram_words, 'ref_len': ref_len}, open(params['output_pkl']+'-words.p','w'), protocol=cPickle.HIGHEST_PROTOCOL) 95 | cPickle.dump({'document_frequency': ngram_idxs, 'ref_len': ref_len}, open(params['output_pkl']+'-idxs.p','w'), protocol=cPickle.HIGHEST_PROTOCOL) 96 | 97 | if __name__ == "__main__": 98 | 99 | parser = argparse.ArgumentParser() 100 | 101 | # input json 102 | parser.add_argument('--input_json', default='/home-nfs/rluo/rluo/nips/code/prepro/dataset_coco.json', help='input json file to process into hdf5') 103 | parser.add_argument('--dict_json', default='data/cocotalk.json', help='output json file') 104 
| parser.add_argument('--output_pkl', default='data/coco-all', help='output pickle file') 105 | parser.add_argument('--split', default='all', help='test, val, train, all') 106 | args = parser.parse_args() 107 | params = vars(args) # convert to ordinary dict 108 | 109 | main(params) 110 | -------------------------------------------------------------------------------- /scripts/resnet.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import math 3 | import torch.utils.model_zoo as model_zoo 4 | 5 | 6 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 7 | 'resnet152'] 8 | 9 | 10 | model_urls = { 11 | 'resnet18': 'https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth', 12 | 'resnet34': 'https://s3.amazonaws.com/pytorch/models/resnet34-333f7ec4.pth', 13 | 'resnet50': 'https://s3.amazonaws.com/pytorch/models/resnet50-19c8e357.pth', 14 | 'resnet101': 'https://s3.amazonaws.com/pytorch/models/resnet101-5d3b4d8f.pth', 15 | 'resnet152': 'https://s3.amazonaws.com/pytorch/models/resnet152-b121ed2d.pth', 16 | } 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1): 20 | "3x3 convolution with padding" 21 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 22 | padding=1, bias=False) 23 | 24 | 25 | class BasicBlock(nn.Module): 26 | expansion = 1 27 | 28 | def __init__(self, inplanes, planes, stride=1, downsample=None): 29 | super(BasicBlock, self).__init__() 30 | self.conv1 = conv3x3(inplanes, planes, stride) 31 | self.bn1 = nn.BatchNorm2d(planes) 32 | self.relu = nn.ReLU(inplace=True) 33 | self.conv2 = conv3x3(planes, planes) 34 | self.bn2 = nn.BatchNorm2d(planes) 35 | self.downsample = downsample 36 | self.stride = stride 37 | 38 | def forward(self, x): 39 | residual = x 40 | 41 | out = self.conv1(x) 42 | out = self.bn1(out) 43 | out = self.relu(out) 44 | 45 | out = self.conv2(out) 46 | out = self.bn2(out) 47 | 48 | if self.downsample is not None: 49 | residual = self.downsample(x) 50 | 51 | out += residual 52 | out = self.relu(out) 53 | 54 | return out 55 | 56 | 57 | class Bottleneck(nn.Module): 58 | expansion = 4 59 | 60 | def __init__(self, inplanes, planes, stride=1, downsample=None): 61 | super(Bottleneck, self).__init__() 62 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False) # change 63 | self.bn1 = nn.BatchNorm2d(planes) 64 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, # change 65 | padding=1, bias=False) 66 | self.bn2 = nn.BatchNorm2d(planes) 67 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 68 | self.bn3 = nn.BatchNorm2d(planes * 4) 69 | self.relu = nn.ReLU(inplace=True) 70 | self.downsample = downsample 71 | self.stride = stride 72 | 73 | def forward(self, x): 74 | residual = x 75 | 76 | out = self.conv1(x) 77 | out = self.bn1(out) 78 | out = self.relu(out) 79 | 80 | out = self.conv2(out) 81 | out = self.bn2(out) 82 | out = self.relu(out) 83 | 84 | out = self.conv3(out) 85 | out = self.bn3(out) 86 | 87 | if self.downsample is not None: 88 | residual = self.downsample(x) 89 | 90 | out += residual 91 | out = self.relu(out) 92 | 93 | return out 94 | 95 | 96 | class ResNet(nn.Module): 97 | def __init__(self, block, layers, num_classes=1000): 98 | self.inplanes = 64 99 | super(ResNet, self).__init__() 100 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 101 | bias=False) 102 | self.bn1 = nn.BatchNorm2d(64) 103 | self.relu = nn.ReLU(inplace=True) 104 | self.maxpool = 
nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True) # change 105 | self.layer1 = self._make_layer(block, 64, layers[0]) 106 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 107 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 108 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 109 | self.avgpool = nn.AvgPool2d(7) 110 | self.fc = nn.Linear(512 * block.expansion, num_classes) 111 | 112 | for m in self.modules(): 113 | if isinstance(m, nn.Conv2d): 114 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 115 | m.weight.data.normal_(0, math.sqrt(2. / n)) 116 | elif isinstance(m, nn.BatchNorm2d): 117 | m.weight.data.fill_(1) 118 | m.bias.data.zero_() 119 | 120 | def _make_layer(self, block, planes, blocks, stride=1): 121 | downsample = None 122 | if stride != 1 or self.inplanes != planes * block.expansion: 123 | downsample = nn.Sequential( 124 | nn.Conv2d(self.inplanes, planes * block.expansion, 125 | kernel_size=1, stride=stride, bias=False), 126 | nn.BatchNorm2d(planes * block.expansion), 127 | ) 128 | 129 | layers = [] 130 | layers.append(block(self.inplanes, planes, stride, downsample)) 131 | self.inplanes = planes * block.expansion 132 | for i in range(1, blocks): 133 | layers.append(block(self.inplanes, planes)) 134 | 135 | return nn.Sequential(*layers) 136 | 137 | def forward(self, x): 138 | x = self.conv1(x) 139 | x = self.bn1(x) 140 | x = self.relu(x) 141 | x = self.maxpool(x) 142 | 143 | x = self.layer1(x) 144 | x = self.layer2(x) 145 | x = self.layer3(x) 146 | x = self.layer4(x) 147 | 148 | x = self.avgpool(x) 149 | x = x.view(x.size(0), -1) 150 | x = self.fc(x) 151 | 152 | return x 153 | 154 | 155 | def resnet18(pretrained=False): 156 | """Constructs a ResNet-18 model. 157 | 158 | Args: 159 | pretrained (bool): If True, returns a model pre-trained on ImageNet 160 | """ 161 | model = ResNet(BasicBlock, [2, 2, 2, 2]) 162 | if pretrained: 163 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18'])) 164 | return model 165 | 166 | 167 | def resnet34(pretrained=False): 168 | """Constructs a ResNet-34 model. 169 | 170 | Args: 171 | pretrained (bool): If True, returns a model pre-trained on ImageNet 172 | """ 173 | model = ResNet(BasicBlock, [3, 4, 6, 3]) 174 | if pretrained: 175 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34'])) 176 | return model 177 | 178 | 179 | def resnet50(pretrained=False): 180 | """Constructs a ResNet-50 model. 181 | 182 | Args: 183 | pretrained (bool): If True, returns a model pre-trained on ImageNet 184 | """ 185 | model = ResNet(Bottleneck, [3, 4, 6, 3]) 186 | if pretrained: 187 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50'])) 188 | return model 189 | 190 | 191 | def resnet101(pretrained=False): 192 | """Constructs a ResNet-101 model. 193 | 194 | Args: 195 | pretrained (bool): If True, returns a model pre-trained on ImageNet 196 | """ 197 | model = ResNet(Bottleneck, [3, 4, 23, 3]) 198 | if pretrained: 199 | model.load_state_dict(model_zoo.load_url(model_urls['resnet101'])) 200 | return model 201 | 202 | 203 | def resnet152(pretrained=False): 204 | """Constructs a ResNet-152 model. 
205 | 206 | Args: 207 | pretrained (bool): If True, returns a model pre-trained on ImageNet 208 | """ 209 | model = ResNet(Bottleneck, [3, 8, 36, 3]) 210 | if pretrained: 211 | model.load_state_dict(model_zoo.load_url(model_urls['resnet152'])) 212 | return model -------------------------------------------------------------------------------- /scripts/resnet_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import torch.nn.functional as F 5 | 6 | class myResnet(nn.Module): 7 | def __init__(self, resnet): 8 | super(myResnet, self).__init__() 9 | self.resnet = resnet 10 | 11 | def forward(self, img, att_size=14): 12 | x = img.unsqueeze(0) 13 | 14 | x = self.resnet.conv1(x) 15 | x = self.resnet.bn1(x) 16 | x = self.resnet.relu(x) 17 | x = self.resnet.maxpool(x) 18 | 19 | x = self.resnet.layer1(x) 20 | x = self.resnet.layer2(x) 21 | x = self.resnet.layer3(x) 22 | x = self.resnet.layer4(x) 23 | 24 | fc = x.mean(3).mean(2).squeeze() 25 | att = F.adaptive_avg_pool2d(x,[att_size,att_size]).squeeze().permute(1, 2, 0) 26 | 27 | return fc, att 28 | 29 | -------------------------------------------------------------------------------- /scripts/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import collections 6 | import torch 7 | import torch.nn as nn 8 | from torch.autograd import Variable 9 | 10 | def if_use_att(caption_model): 11 | # Decide if load attention feature according to caption model 12 | if caption_model in ['show_tell', 'all_img', 'fc']: 13 | return False 14 | return True 15 | 16 | # Input: seq, N*D numpy array, with element 0 .. vocab_size. 0 is END token. 
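# --- Added illustrative note (toy example, assuming the old-style tensor
# indexing used throughout this repo) ---
# With ix_to_word = {'1': 'a', '2': 'dog', '3': 'runs'} and
#   seq = torch.LongTensor([[1, 2, 3, 0, 0]])
# decode_sequence(ix_to_word, seq) walks each row until it hits the 0 (END)
# token and returns ['a dog runs'].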
17 | def decode_sequence(ix_to_word, seq): 18 | N, D = seq.size() 19 | out = [] 20 | for i in range(N): 21 | txt = '' 22 | for j in range(D): 23 | ix = seq[i,j] 24 | if ix > 0 : 25 | if j >= 1: 26 | txt = txt + ' ' 27 | txt = txt + ix_to_word[str(ix)] 28 | else: 29 | break 30 | out.append(txt) 31 | return out 32 | 33 | def to_contiguous(tensor): 34 | if tensor.is_contiguous(): 35 | return tensor 36 | else: 37 | return tensor.contiguous() 38 | 39 | class LanguageModelCriterion(nn.Module): 40 | def __init__(self): 41 | super(LanguageModelCriterion, self).__init__() 42 | 43 | def forward(self, input, target, mask): 44 | # truncate to the same size 45 | target = target[:, :input.size(1)] 46 | mask = mask[:, :input.size(1)] 47 | input = to_contiguous(input).view(-1, input.size(2)) 48 | target = to_contiguous(target).view(-1, 1) 49 | mask = to_contiguous(mask).view(-1, 1) 50 | output = - input.gather(1, target) * mask 51 | output = torch.sum(output) / torch.sum(mask) 52 | 53 | return output 54 | 55 | def set_lr(optimizer, lr): 56 | for group in optimizer.param_groups: 57 | group['lr'] = lr 58 | 59 | def clip_gradient(optimizer, grad_clip): 60 | for group in optimizer.param_groups: 61 | for param in group['params']: 62 | param.grad.data.clamp_(-grad_clip, grad_clip) -------------------------------------------------------------------------------- /tools.py: -------------------------------------------------------------------------------- 1 | import time 2 | import json 3 | import logging 4 | import numpy as np 5 | import os.path as osp 6 | from pycoco.bleu.bleu import Bleu 7 | from pycoco.meteor.meteor import Meteor 8 | from pycoco.rouge.rouge import Rouge 9 | from pycoco.cider.cider import Cider 10 | import torch 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from torch.autograd import Variable 15 | 16 | 17 | class Timer: 18 | def __init__(self): 19 | self.start_time = 0 20 | self.end_time = 0 21 | self.total_time = 0 22 | self.avg_time = 0 23 | self.n_toc = 0 24 | 25 | def tic(self): 26 | self.n_toc = 0 27 | self.start_time = time.time() 28 | 29 | def toc(self): 30 | self.end_time = time.time() 31 | self.total_time = self.end_time - self.start_time 32 | self.n_toc += 1. 33 | self.avg_time = self.total_time / self.n_toc 34 | return self.total_time 35 | 36 | 37 | class Logger: 38 | """ 39 | When receiving a message, first print it on screen, then write it into log file. 40 | If save_dir is None, it writes no log and only prints on screen. 41 | """ 42 | def __init__(self, save_dir): 43 | if save_dir is not None: 44 | self.logger = logging.getLogger() 45 | logging.basicConfig(filename=osp.join(save_dir, 'experiment.log'), format='%(asctime)s | %(message)s') 46 | logging.root.setLevel(level=logging.INFO) 47 | else: 48 | self.logger = None 49 | 50 | def info(self, msg, to_file=True): 51 | print msg 52 | if self.logger is not None and to_file: 53 | self.logger.info(msg) 54 | def evaluate(gt_file, re_file, logger=None): 55 | """ 56 | This function is reformed from MSCOCO evaluating code. 
57 | The reference sentences are read from gt_file, 58 | the generated sentences to be evaluated are read from res_file 59 | 60 | """ 61 | gts = json.load(open(gt_file, 'r')) 62 | scorers = [ 63 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 64 | #(Meteor(), "METEOR"), 65 | # (Rouge(), "ROUGE_L"), 66 | (Cider(), "CIDEr") 67 | ] 68 | metrics = [] 69 | res = json.load(open(re_file, 'r')) 70 | res = {c['image_id']: [c['caption']] for c in res} 71 | gts = {k: v for k, v in zip(gts['image_ids'], gts['captions']) if k in res} 72 | for scorer, method in scorers: 73 | if logger is not None: 74 | logger.info('computing %s score...' % (scorer.method())) 75 | score, scores = scorer.compute_score(gts, res) 76 | if type(method) == list: 77 | for sc, scs, m in zip(score, scores, method): 78 | if logger is not None: 79 | logger.info("%s: %0.3f" % (m, sc)) 80 | metrics.extend(score) 81 | else: 82 | if logger is not None: 83 | logger.info("%s: %0.3f" % (method, score)) 84 | metrics.append(score) 85 | return metrics 86 | 87 | 88 | def lm_caption_step(w_t,lm_state_t,caption_state_t,lm,caption_model,eta,manner): 89 | word = Variable(torch.LongTensor(w_t.tolist())) 90 | if lm.on_gpu: 91 | word = word.cuda() 92 | word_emb = lm.word_embedding_layer(word) 93 | logit, lm_state_t_1 = lm.forward(word_emb, lm_state_t) # logit : (batch_size, vocab_size) 94 | prob = F.softmax(logit) # (batch_size, vocab_size) 95 | P = prob - eta 96 | P *= 10000000 97 | mask = F.sigmoid(P).data.cpu().numpy() 98 | caption_state_t_1,w_t_1 = caption_model.ngram_single_step(caption_state_t, w_t,mask,manner) 99 | return w_t_1,lm_state_t_1,caption_state_t_1 100 | 101 | 102 | def lm_caption(lm,model,image_ids,vocab,loader,feature,max_step,manner): 103 | w_0 = np.ones((len(image_ids),), dtype=np.int32) * 9488 # set start token for rnn language model 104 | lm_state_0 = lm.init_state() 105 | cap_state_0 = model.initial_state(feature) 106 | eta_0 = 0.00005 107 | cap = np.zeros((max_step, len(image_ids)), dtype=np.int32) 108 | if manner == 'sample': 109 | res = [] 110 | for step in range(max_step-1): 111 | w_1, lm_state_1, cap_state_1 = lm_caption_step(w_0, lm_state_0, cap_state_0, lm, model, eta_0*(2**step), manner) 112 | cap[step + 1, :] = w_1[:] 113 | w_0 = w_1 114 | lm_state_0 = lm_state_1 115 | cap_state_0 = cap_state_1 116 | for i in range(loader.batch_size): 117 | index = np.where(cap[1:,i] == 0)[0] 118 | if len(index) > 0: 119 | s = ' '.join(vocab[w] for w in cap[1:index[0]+1, i]) 120 | else: 121 | s = ' '.join(vocab[w] for w in cap[1:, i]) 122 | res.append({'image_id': image_ids[i], 'caption': s}) 123 | else: 124 | cap, res = model.inference(vocab, image_ids, feature, manner='greedy', max_length=max_step) 125 | return cap,res 126 | 127 | def att_lm_caption_step(w_t,lm_state_t,patches,caption_state_t,lm,caption_model,eta,manner): 128 | word = Variable(torch.LongTensor(w_t.tolist())) 129 | if lm.on_gpu: 130 | word = word.cuda() 131 | word_emb = lm.word_embedding_layer(word) 132 | logit, lm_state_t_1 = lm.forward(word_emb, lm_state_t) # logit : (batch_size, vocab_size) 133 | prob = F.softmax(logit) # (batch_size, vocab_size) 134 | P = prob - eta 135 | P *= 10000000 136 | mask = F.sigmoid(P).data.cpu().numpy() 137 | caption_state_t_1,w_t_1 = caption_model.ngram_single_step(caption_state_t, w_t,patches,mask,manner) 138 | return w_t_1,lm_state_t_1,caption_state_t_1 139 | 140 | def att_lm_caption(lm,model,image_ids,vocab,loader,features,max_step,manner): 141 | w_0 = np.ones((len(image_ids),), dtype=np.int32) * 9488 # set start token for 
rnn language model 142 | lm_state_0 = lm.init_state() 143 | eta_0 = 0.00005 144 | cap = np.zeros((max_step, len(image_ids)), dtype=np.int32) 145 | if manner == 'sample': 146 | pathes, cap_state_0 = model.initial_state(features) 147 | res = [] 148 | for step in range(max_step-1): 149 | w_1, lm_state_1, cap_state_1 = att_lm_caption_step(w_0, lm_state_0,pathes, cap_state_0, lm, model, eta_0*(2**step), manner) 150 | cap[step + 1, :] = w_1[:] 151 | w_0 = w_1 152 | lm_state_0 = lm_state_1 153 | cap_state_0 = cap_state_1 154 | for i in range(loader.batch_size): 155 | index = np.where(cap[1:,i] == 0)[0] 156 | if len(index) > 0: 157 | s = ' '.join(vocab[w] for w in cap[1:index[0]+1, i]) 158 | else: 159 | s = ' '.join(vocab[w] for w in cap[1:, i]) 160 | res.append({'image_id': image_ids[i], 'caption': s}) 161 | else: 162 | cap, res = model.inference(vocab, image_ids, features, manner='greedy', max_length=max_step) 163 | return cap,res 164 | 165 | 166 | def lm2_caption_step(w_t,first_word,lm_state_t,caption_state_t,lm,caption_model,eta,manner,step): 167 | word = Variable(torch.LongTensor(w_t.tolist())) 168 | if lm.on_gpu: 169 | word = word.cuda() 170 | word_emb = lm.word_embedding_layer(word) 171 | logit, lm_state_t_1 = lm.forward(word_emb, lm_state_t) # logit : (batch_size, vocab_size) 172 | prob = F.softmax(logit) # (batch_size, vocab_size) 173 | P = prob - eta 174 | P *= 10000000 175 | mask = F.sigmoid(P).data.cpu().numpy()[:,:-1] # drop the start token 176 | if step == 0: 177 | caption_state_t_1,w_t_1 = caption_model.ngram_single_step(0,caption_state_t, first_word,mask,manner) 178 | else: 179 | caption_state_t_1, w_t_1 = caption_model.ngram_single_step(step, caption_state_t, w_t, mask, manner) 180 | return w_t_1,lm_state_t_1,caption_state_t_1 181 | 182 | 183 | def lm2_caption(lm,model,image_ids,vocab,loader,feature,max_step,manner): 184 | w_0 = np.ones((len(image_ids),), dtype=np.int32) * 9488 # set start token for rnn language model 185 | lm_state_0 = lm.init_state() 186 | first_word, cap_state_0 = model.initial_state(feature) 187 | eta_0 = 0.00005 188 | cap = np.zeros((max_step, len(image_ids)), dtype=np.int32) 189 | if manner == 'sample': 190 | res = [] 191 | for step in range(max_step-1): 192 | w_1, lm_state_1, cap_state_1 = lm2_caption_step(w_0,first_word, lm_state_0, cap_state_0, lm, model, eta_0*(2**step), manner,step) 193 | cap[step + 1, :] = w_1[:] 194 | w_0 = w_1 195 | lm_state_0 = lm_state_1 196 | cap_state_0 = cap_state_1 197 | for i in range(loader.batch_size): 198 | index = np.where(cap[1:,i] == 0)[0] 199 | if len(index) > 0: 200 | s = ' '.join(vocab[w] for w in cap[1:index[0]+1, i]) 201 | else: 202 | s = ' '.join(vocab[w] for w in cap[1:, i]) 203 | res.append({'image_id': image_ids[i], 'caption': s}) 204 | else: 205 | cap, res = model.inference(vocab, image_ids, feature, manner='greedy', max_length=max_step) 206 | return cap[1:,:],res 207 | 208 | def att2_lm_caption_step(w_t,first_word,lm_state_t,patches,caption_state_t,lm,caption_model,eta,manner,step): 209 | word = Variable(torch.LongTensor(w_t.tolist())) 210 | if lm.on_gpu: 211 | word = word.cuda() 212 | word_emb = lm.word_embedding_layer(word) 213 | logit, lm_state_t_1 = lm.forward(word_emb, lm_state_t) # logit : (batch_size, vocab_size) 214 | prob = F.softmax(logit) # (batch_size, vocab_size) 215 | P = prob - eta 216 | P *= 10000000 217 | mask = F.sigmoid(P).data.cpu().numpy()[:,:-1] # drop the start token 218 | if step == 0: 219 | caption_state_t_1,w_t_1 = 
caption_model.ngram_single_step(0,caption_state_t,first_word,patches,mask,manner) 220 | else: 221 | caption_state_t_1, w_t_1 = caption_model.ngram_single_step(step,caption_state_t, w_t, patches, mask, manner) 222 | return w_t_1,lm_state_t_1,caption_state_t_1 223 | 224 | def att2_lm_caption(lm,model,image_ids,vocab,loader,features,max_step,manner): 225 | w_0 = np.ones((len(image_ids),), dtype=np.int32) * 9488 # set start token for rnn language model 226 | lm_state_0 = lm.init_state() 227 | eta_0 = 0.00005 228 | cap = np.zeros((max_step, len(image_ids)), dtype=np.int32) 229 | if manner == 'sample': 230 | patches,first_word, cap_state_0 = model.initial_state(features) 231 | res = [] 232 | for step in range(max_step-1): 233 | w_1, lm_state_1, cap_state_1 = att2_lm_caption_step(w_0,first_word, lm_state_0,patches, cap_state_0, lm, model, eta_0*(2**step), manner,step) 234 | cap[step + 1, :] = w_1[:] 235 | w_0 = w_1 236 | lm_state_0 = lm_state_1 237 | cap_state_0 = cap_state_1 238 | for i in range(loader.batch_size): 239 | index = np.where(cap[1:,i] == 0)[0] 240 | if len(index) > 0: 241 | s = ' '.join(vocab[w] for w in cap[1:index[0]+1, i]) 242 | else: 243 | s = ' '.join(vocab[w] for w in cap[1:, i]) 244 | res.append({'image_id': image_ids[i], 'caption': s}) 245 | else: 246 | cap, res = model.inference(vocab, image_ids, features, manner='greedy', max_length=max_step) 247 | return cap[1:,:],res 248 | 249 | -------------------------------------------------------------------------------- /train_fourgram.py: -------------------------------------------------------------------------------- 1 | from caption_model.att import * 2 | from caption_model.fc import * 3 | from mycider import * 4 | from multiprocessing import Pool 5 | import os 6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 7 | import ngram_opts 8 | from tools import * 9 | from dataloader import * 10 | opts = ngram_opts.parse_opt() 11 | if opts.caption_model == 'fc': 12 | opts.use_att = False 13 | else: 14 | opts.use_att = True 15 | 16 | batch_size = opts.batch_size 17 | 18 | loader = KKDataLoader(opts) 19 | vocabs = loader.get_vocab() 20 | vocab = ['#END#'] 21 | for i in range(len(vocabs)): 22 | ids = str(i+1) 23 | vocab.append(vocabs[ids]) 24 | 25 | if not os.path.exists('fourgram_cider_model'): 26 | os.mkdir('fourgram_cider_model') 27 | 28 | if opts.use_att: 29 | save_dir = 'fourgram_cider_model/' + 'att_model' 30 | else: 31 | save_dir = 'fourgram_cider_model/' + 'fc_model' 32 | if not os.path.exists(save_dir): 33 | os.mkdir(save_dir) 34 | print(save_dir + ' has been built') 35 | 36 | 37 | image_dim = 2048 38 | vocab_size = loader.vocab_size + 1 39 | cell_size = 512 40 | lr = 0.00005 41 | if opts.use_att: 42 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=4,on_gpu=True) 43 | model.load('warm_model/att_warm/model.init') 44 | else: 45 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=4,on_gpu=True) 46 | model.load('warm_model/fc_warm/model.init') 47 | 48 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json')) 49 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0) 50 | 51 | def cider_temp(res): 52 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]}) 53 | score, _ = cider_scorer.compute_score() 54 | return score 55 | 56 | pool = Pool(processes=5) 57 | best_score = -1 58 | logger = Logger(save_dir) 59 | iter = 0 60 | finish_iter = 100000 61 | timer = Timer() 
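# --- Added illustrative note (hypothetical numbers, not part of the original script) ---
# The loop below is self-critical: for every image, 20 sampled captions are
# scored with cider_temp(), the best one is kept, and its advantage over the
# greedy caption becomes the reward passed to model.train_on_batch(), e.g.
# greedy CIDEr 0.95 vs. best sampled CIDEr 1.10 -> reward 0.15.
# cider_temp() scores a single {'image_id': ..., 'caption': ...} dict, which is
# why pool.map() can apply it to a whole batch of results in parallel.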
62 | timer.tic() 63 | while iter < finish_iter: 64 | iter += 1 65 | data = loader.get_batch('train') 66 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 67 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 68 | fc_feats, att_feats = tmp 69 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 70 | if opts.use_att: 71 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3]) 72 | feature = att_feats 73 | else: 74 | feature = fc_feats 75 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16) 76 | greedy_scores = np.array(pool.map(cider_temp, greedy_res)) 77 | 78 | all_caps, all_results, all_scores = [], [], [] 79 | for _ in xrange(20): 80 | # Generate captions by sampling 81 | sample_caps, sample_results = model.fourgram_inference(vocab, image_id, feature, 82 | manner='sample', 83 | max_length=16) 84 | 85 | # Compute cider scores for sampled captions 86 | sample_scores = np.array(pool.map(cider_temp, sample_results)) 87 | all_caps.append(sample_caps) 88 | all_results.append(sample_results) 89 | all_scores.append(sample_scores) 90 | 91 | all_scores = np.array(all_scores) 92 | sample_caps, sample_results, sample_scores = [], [], [] 93 | for n in xrange(opts.batch_size): 94 | best_i = all_scores[:, n].argmax() 95 | sample_caps.append(all_caps[best_i][n]) 96 | sample_results.append(all_results[best_i][n]) 97 | sample_scores.append(all_scores[best_i, n]) 98 | sample_scores = np.array(sample_scores) 99 | 100 | max_length = max([cap.shape[0] for cap in sample_caps]) 101 | caption = np.zeros([max_length + 2, opts.batch_size], dtype=np.int32) 102 | for n in xrange(opts.batch_size): 103 | L = sample_caps[n].shape[0] 104 | caption[1:L + 1, n] = sample_caps[n] 105 | caption[L + 1:, n] = 0 106 | mask = np.zeros([max_length + 1, opts.batch_size], dtype=np.float32) 107 | for n in xrange(opts.batch_size): 108 | L = sample_caps[n].shape[0] 109 | mask[:L + 1, n] = 1 110 | reward = (sample_scores - greedy_scores).astype(np.float32) 111 | print image_id[0] 112 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption'] 113 | print 'sample: ', sample_scores[0], sample_results[0]['caption'] 114 | loss_train = model.train_on_batch(feature, caption[1:,:], mask, reward) 115 | if iter % 300 == 0: 116 | results = [] 117 | for nn in range(5000/opts.batch_size): 118 | data = loader.get_batch('val') 119 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 120 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 121 | fc_feats, att_feats = tmp 122 | if opts.use_att: 123 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], 124 | att_feats.shape[3]) 125 | feature_val = att_feats 126 | else: 127 | feature_val = fc_feats 128 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 129 | 130 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature_val,manner='greedy',max_length=16) 131 | # Generate sentences for validation set 132 | results += greedy_res 133 | # Evaluate generated captions 134 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w')) 135 | gt_file = osp.join('data/features', 'captions_val.json') 136 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1] 137 | 138 | if score > best_score: 139 | best_score = score 140 | model.save(osp.join(save_dir, 'model.best')) 141 | 
model.save(osp.join(save_dir,'model.ckpt')) 142 | # Output training information 143 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec' 144 | .format(iter, -1, score, best_score, finish_iter, timer.toc())) 145 | # Reset loss and timer 146 | train_losses = [] 147 | timer.tic() 148 | 149 | # If early-stop condition triggers 150 | if iter > finish_iter: 151 | break 152 | 153 | -------------------------------------------------------------------------------- /train_rnnlm.py: -------------------------------------------------------------------------------- 1 | import ngram_opts 2 | from dataloader import * 3 | import os 4 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 5 | from caption_model.rnnlm import * 6 | 7 | opts = ngram_opts.parse_opt() 8 | if opts.caption_model == 'fc': 9 | opts.use_att = False 10 | else: 11 | opts.use_att = True 12 | loader = KKDataLoader(opts) 13 | 14 | 15 | batch_size = 100 16 | hidden_size = 256 17 | word_embed_size = 256 18 | vocab_size = loader.vocab_size + 2 # set start token 19 | lr = 0.0001 20 | lm = LM(batch_size, hidden_size,vocab_size, word_embed_size,lr) 21 | Labels = loader.h5_label_file['labels'] 22 | new_labels = np.zeros((Labels.shape[1]+1,Labels.shape[0]),dtype=Labels.dtype) 23 | new_labels[0,:] = 9488 # Set start token to 9488, the total vocab size is 9489 24 | for i in range(Labels.shape[0]): 25 | new_labels[1:,i] = Labels[i,:] 26 | 27 | Ind = range(len(Labels)) 28 | mask = np.ones((16,100)) 29 | reward = np.ones((100,)) 30 | import random 31 | for i in range(1000): 32 | random.shuffle(Ind) 33 | Loss = [] 34 | for j in range(100): 35 | index = Ind[j*batch_size:(j+1)*batch_size] 36 | batch_sen = new_labels[:,index] 37 | loss = lm.train_on_batch(batch_sen,mask,reward) 38 | Loss.append(loss) 39 | print i,np.mean(Loss) 40 | if i % 10 == 0: 41 | lm.save('warm_model/rnnlm/model.init') 42 | 43 | -------------------------------------------------------------------------------- /train_rnnlm_cider.py: -------------------------------------------------------------------------------- 1 | from caption_model.att import * 2 | from caption_model.fc import * 3 | from mycider import * 4 | from multiprocessing import Pool 5 | from caption_model.rnnlm import * 6 | import os 7 | os.environ["CUDA_VISIBLE_DEVICES"] = "2" 8 | import ngram_opts 9 | from tools import * 10 | from dataloader import * 11 | opts = ngram_opts.parse_opt() 12 | if opts.caption_model == 'fc': 13 | opts.use_att = False 14 | else: 15 | opts.use_att = True 16 | 17 | batch_size = opts.batch_size 18 | 19 | loader = KKDataLoader(opts) 20 | vocabs = loader.get_vocab() 21 | vocab = ['#END#'] 22 | for i in range(len(vocabs)): 23 | ids = str(i+1) 24 | vocab.append(vocabs[ids]) 25 | 26 | if opts.use_att: 27 | save_dir = 'rnnlm_cider_model/' + 'att_model' 28 | else: 29 | save_dir = 'rnnlm_cider_model/' + 'fc_model' 30 | if not os.path.exists(save_dir): 31 | os.mkdir(save_dir) 32 | print(save_dir + ' has been built') 33 | 34 | 35 | image_dim = opts.fc_feat_size 36 | vocab_size = loader.vocab_size + 1 37 | cell_size = opts.rnn_size 38 | lr = 0.00005 39 | if opts.use_att: 40 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True) 41 | model.load('warm_model/att_warm/model.init') 42 | else: 43 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True) 44 | model.load('warm_model/fc_warm/model.init') 45 | 46 | word_embed_size = 256 47 | hidden_size = 
256 48 | manner = 'sample' 49 | lm = LM(batch_size, hidden_size,vocab_size+1, word_embed_size,lr) 50 | lm.load('warm_model/rnnlm/model.init') 51 | 52 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json')) 53 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0) 54 | def cider_temp(res): 55 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]}) 56 | score, _ = cider_scorer.compute_score() 57 | return score 58 | 59 | pool = Pool(processes=4) 60 | logger = Logger(save_dir) 61 | best_score = -1 62 | iters = 0 63 | finish_iter = 1000000 64 | timer = Timer() 65 | timer.tic() 66 | best_count = 0 67 | max_step = 14 68 | while iters < finish_iter: 69 | iters += 1 70 | data = loader.get_batch('train') 71 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 72 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 73 | fc_feats, att_feats = tmp 74 | if opts.use_att: 75 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3]) 76 | feature = att_feats 77 | else: 78 | feature = fc_feats 79 | 80 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 81 | 82 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16) 83 | greedy_scores = np.array(pool.map(cider_temp, greedy_res)) 84 | 85 | all_caps, all_results, all_scores = [], [], [] 86 | for _ in xrange(20): 87 | if opts.use_att: 88 | sample_caps, sample_results = att2_lm_caption(lm,model,image_id,vocab,loader,feature,max_step,'sample') 89 | else: 90 | sample_caps, sample_results = lm2_caption(lm, model, image_id, vocab, loader, feature, max_step,'sample') 91 | # Compute cider scores for sampled captions 92 | sample_scores = np.array(pool.map(cider_temp, sample_results)) 93 | all_caps.append(sample_caps) 94 | all_results.append(sample_results) 95 | all_scores.append(sample_scores) 96 | 97 | all_scores = np.array(all_scores) 98 | sample_caps, sample_results, sample_scores = [], [], [] 99 | for n in xrange(opts.batch_size): 100 | best_i = all_scores[:, n].argmax() 101 | sample_caps.append(all_caps[best_i][:,n]) 102 | sample_results.append(all_results[best_i][n]) 103 | sample_scores.append(all_scores[best_i, n]) 104 | sample_scores = np.array(sample_scores) 105 | sample_caps = np.array(sample_caps) 106 | sample_caps = sample_caps.transpose() 107 | 108 | mask = np.ones((sample_caps.shape[0],sample_caps.shape[1])) 109 | for n in range(opts.batch_size): 110 | index = np.where(sample_caps[:,n] == 0)[0] 111 | if len(index) > 1: 112 | mask[index[1]:,n] = 0 113 | 114 | reward = (sample_scores - greedy_scores).astype(np.float32) 115 | print iters, image_id[0] 116 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption'] 117 | print 'sample: ', sample_scores[0], sample_results[0]['caption'] 118 | 119 | loss_train = model.train_on_batch(feature,sample_caps, mask, reward) 120 | if iters % 300 == 0: 121 | results = [] 122 | for kkk in range(5000/opts.batch_size): 123 | data = loader.get_batch('val') 124 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 125 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 126 | fc_feats, att_feats = tmp 127 | if opts.use_att: 128 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], 129 | att_feats.shape[3]) 130 | feature = att_feats 131 | else: 132 | feature = fc_feats 133 | image_id = [data['infos'][i]['id'] for i in 
range(opts.batch_size)] 134 | 135 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16) 136 | results += greedy_res 137 | # Evaluate generated captions 138 | json.dump(results, open(osp.join(save_dir, 'rl_result.json'), 'w')) 139 | gt_file = osp.join('data/features', 'captions_val.json') 140 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'rl_result.json'))[-1] 141 | 142 | if score > best_score: 143 | best_score = score 144 | model.save(osp.join(save_dir, 'model.best')) 145 | model.save(osp.join(save_dir,'model.ckpt')) 146 | # Output training information 147 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec' 148 | .format(iters, -1, score, best_score, finish_iter, timer.toc())) 149 | 150 | train_losses = [] 151 | timer.tic() 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /train_sc_cider.py: -------------------------------------------------------------------------------- 1 | from caption_model.att import * 2 | from caption_model.fc import * 3 | from mycider import * 4 | from multiprocessing import Pool 5 | import os 6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 7 | import ngram_opts 8 | from tools import * 9 | from dataloader import * 10 | opts = ngram_opts.parse_opt() 11 | if opts.caption_model == 'fc': 12 | opts.use_att = False 13 | else: 14 | opts.use_att = True 15 | 16 | batch_size = opts.batch_size 17 | 18 | loader = KKDataLoader(opts) 19 | vocabs = loader.get_vocab() 20 | vocab = ['#END#'] 21 | for i in range(len(vocabs)): 22 | ids = str(i+1) 23 | vocab.append(vocabs[ids]) 24 | 25 | if not os.path.exists('sc_cider_model'): 26 | os.mkdir('sc_cider_model') 27 | 28 | if opts.use_att: 29 | save_dir = 'sc_cider_model/' + 'att_model' 30 | else: 31 | save_dir = 'sc_cider_model/' + 'fc_model' 32 | if not os.path.exists(save_dir): 33 | os.mkdir(save_dir) 34 | print(save_dir + ' has been built') 35 | 36 | 37 | image_dim = 2048 38 | vocab_size = loader.vocab_size + 1 39 | cell_size = 512 40 | lr = 0.00005 41 | if opts.use_att: 42 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=0,on_gpu=True) 43 | model.load('warm_model/att_warm/model.init') 44 | else: 45 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=0,on_gpu=True) 46 | model.load('warm_model/fc_warm/model.init') 47 | 48 | 49 | # Initialize cider-scorer 50 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json')) 51 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0) 52 | 53 | def cider_temp(res): 54 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]}) 55 | score, _ = cider_scorer.compute_score() 56 | return score 57 | 58 | pool = Pool(processes=5) 59 | 60 | best_score = -1 61 | logger = Logger(save_dir) 62 | iter = 0 63 | finish_iter = 1000000 64 | timer = Timer() 65 | timer.tic() 66 | while iter < finish_iter: 67 | iter += 1 68 | data = loader.get_batch('train') 69 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 70 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 71 | fc_feats, att_feats = tmp 72 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 73 | if opts.use_att: 74 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3]) 75 | feature = 
att_feats 76 | else: 77 | feature = fc_feats 78 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16) 79 | greedy_scores = np.array(pool.map(cider_temp, greedy_res)) 80 | 81 | all_caps, all_results, all_scores = [], [], [] 82 | for _ in xrange(20): 83 | # Generate captions by sampling 84 | sample_caps, sample_results = model.inference(vocab, image_id, feature, 85 | manner='sample', 86 | max_length=16) 87 | 88 | # Compute cider scores for sampled captions 89 | sample_scores = np.array(pool.map(cider_temp, sample_results)) 90 | all_caps.append(sample_caps) 91 | all_results.append(sample_results) 92 | all_scores.append(sample_scores) 93 | 94 | all_scores = np.array(all_scores) 95 | sample_caps, sample_results, sample_scores = [], [], [] 96 | for n in xrange(opts.batch_size): 97 | best_i = all_scores[:, n].argmax() 98 | sample_caps.append(all_caps[best_i][n]) 99 | sample_results.append(all_results[best_i][n]) 100 | sample_scores.append(all_scores[best_i, n]) 101 | sample_scores = np.array(sample_scores) 102 | 103 | max_length = max([cap.shape[0] for cap in sample_caps]) 104 | caption = np.zeros([max_length + 2, opts.batch_size], dtype=np.int32) 105 | for n in xrange(opts.batch_size): 106 | L = sample_caps[n].shape[0] 107 | caption[1:L + 1, n] = sample_caps[n] 108 | caption[L + 1:, n] = 0 109 | mask = np.zeros([max_length + 1, opts.batch_size], dtype=np.float32) 110 | for n in xrange(opts.batch_size): 111 | L = sample_caps[n].shape[0] 112 | mask[:L + 1, n] = 1 113 | reward = (sample_scores - greedy_scores).astype(np.float32) 114 | print image_id[0] 115 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption'] 116 | print 'sample: ', sample_scores[0], sample_results[0]['caption'] 117 | loss_train = model.train_on_batch(feature, caption[1:,:], mask, reward) 118 | if iter % 300 == 0: 119 | results = [] 120 | for nn in range(5000/opts.batch_size): 121 | data = loader.get_batch('val') 122 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 123 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 124 | fc_feats, att_feats = tmp 125 | if opts.use_att: 126 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], 127 | att_feats.shape[3]) 128 | feature_val = att_feats 129 | else: 130 | feature_val = fc_feats 131 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 132 | 133 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature_val,manner='greedy',max_length=16) 134 | # Generate sentences for validation set 135 | results += greedy_res 136 | # Evaluate generated captions 137 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w')) 138 | gt_file = osp.join('data/features', 'captions_val.json') 139 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1] 140 | # json.dump(results, open(osp.join(save_dir, 'kk_rl_result_'+ str(iter) + '.json'), 'w')) 141 | # Update if finding new best model 142 | if score > best_score: 143 | best_score = score 144 | model.save(osp.join(save_dir, 'model.best')) 145 | model.save(osp.join(save_dir,'model.ckpt')) 146 | # Output training information 147 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec' 148 | .format(iter, -1, score, best_score, finish_iter, timer.toc())) 149 | # Reset loss and timer 150 | train_losses = [] 151 | timer.tic() 152 | 153 | # If early-stop condition triggers 154 | if iter > finish_iter: 155 | break 156 | 157 | 
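
The update rule shared by train_sc_cider.py, train_trigram.py and train_fourgram.py is standard self-critical sequence training: each sampled caption's log-likelihood is weighted by its CIDEr advantage over the greedy baseline, using the caption/mask/reward arrays built above. The following is a minimal sketch of that weighted loss, assuming a hypothetical log_probs tensor of shape (steps, batch); the repository's actual implementation lives in train_on_batch inside caption_model/fc.py and caption_model/att.py and may differ in detail.

import torch

def self_critical_loss(log_probs, mask, reward):
    # log_probs: (steps, batch) log-probabilities of the sampled words
    # mask:      (steps, batch) 1 for real tokens (including the closing #END#), 0 for padding
    # reward:    (batch,)       CIDEr(sample) - CIDEr(greedy), the self-critical advantage
    mask = torch.as_tensor(mask, dtype=log_probs.dtype, device=log_probs.device)
    reward = torch.as_tensor(reward, dtype=log_probs.dtype, device=log_probs.device)
    weighted = -log_probs * mask * reward.unsqueeze(0)  # broadcast the advantage over time steps
    return weighted.sum() / mask.sum().clamp(min=1.0)   # normalize by the number of real tokens
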
-------------------------------------------------------------------------------- /train_trigram.py: -------------------------------------------------------------------------------- 1 | from caption_model.att import * 2 | from caption_model.fc import * 3 | from mycider import * 4 | from multiprocessing import Pool 5 | import os 6 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 7 | import ngram_opts 8 | from tools import * 9 | from dataloader import * 10 | opts = ngram_opts.parse_opt() 11 | if opts.caption_model == 'fc': 12 | opts.use_att = False 13 | else: 14 | opts.use_att = True 15 | batch_size = opts.batch_size 16 | 17 | loader = KKDataLoader(opts) 18 | vocabs = loader.get_vocab() 19 | vocab = ['#END#'] 20 | for i in range(len(vocabs)): 21 | ids = str(i+1) 22 | vocab.append(vocabs[ids]) 23 | 24 | if not os.path.exists('trigram_cider_model'): 25 | os.mkdir('trigram_cider_model') 26 | 27 | if opts.use_att: 28 | save_dir = 'trigram_cider_model/' + 'att_model' 29 | else: 30 | save_dir = 'trigram_cider_model/' + 'fc_model' 31 | if not os.path.exists(save_dir): 32 | os.mkdir(save_dir) 33 | print(save_dir + ' has been built') 34 | 35 | 36 | image_dim = 2048 37 | vocab_size = loader.vocab_size + 1 38 | cell_size = 512 39 | lr = 0.00005 40 | if opts.use_att: 41 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=3,on_gpu=True) 42 | model.load('warm_model/att_warm/model.init') 43 | else: 44 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, ngram=3,on_gpu=True) 45 | model.load('warm_model/fc_warm/model.init') 46 | 47 | 48 | # Initialize cider-scorer 49 | gts = transfer_json_to_cider_gts(osp.join('data/features', 'captions_train.json')) 50 | cider_scorer = CiderScorer(refs=gts, n=4, sigma=6.0) 51 | 52 | def cider_temp(res): 53 | cider_scorer.cook_append_test(test={res['image_id']: [res['caption']]}) 54 | score, _ = cider_scorer.compute_score() 55 | return score 56 | 57 | pool = Pool(processes=5) 58 | 59 | best_score = -1 60 | logger = Logger(save_dir) 61 | iter = 0 62 | finish_iter = 100000 63 | timer = Timer() 64 | timer.tic() 65 | while iter < finish_iter: 66 | iter += 1 67 | data = loader.get_batch('train') 68 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 69 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 70 | fc_feats, att_feats = tmp 71 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 72 | if opts.use_att: 73 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], att_feats.shape[3]) 74 | feature = att_feats 75 | else: 76 | feature = fc_feats 77 | greedy_cap, greedy_res = model.inference(vocab, image_id, feature, manner='greedy', max_length=16) 78 | greedy_scores = np.array(pool.map(cider_temp, greedy_res)) 79 | 80 | all_caps, all_results, all_scores = [], [], [] 81 | for _ in xrange(20): 82 | # Generate captions by sampling 83 | sample_caps, sample_results = model.trigram_inference(vocab, image_id, feature, 84 | manner='sample', 85 | max_length=16) 86 | sample_scores = np.array(pool.map(cider_temp, sample_results)) 87 | all_caps.append(sample_caps) 88 | all_results.append(sample_results) 89 | all_scores.append(sample_scores) 90 | 91 | all_scores = np.array(all_scores) 92 | sample_caps, sample_results, sample_scores = [], [], [] 93 | for n in xrange(opts.batch_size): 94 | best_i = all_scores[:, n].argmax() 95 | sample_caps.append(all_caps[best_i][n]) 96 | 
sample_results.append(all_results[best_i][n]) 97 | sample_scores.append(all_scores[best_i, n]) 98 | sample_scores = np.array(sample_scores) 99 | 100 | max_length = max([cap.shape[0] for cap in sample_caps]) 101 | caption = np.zeros([max_length + 2, opts.batch_size], dtype=np.int32) 102 | for n in xrange(opts.batch_size): 103 | L = sample_caps[n].shape[0] 104 | caption[1:L + 1, n] = sample_caps[n] 105 | caption[L + 1:, n] = 0 106 | mask = np.zeros([max_length + 1, opts.batch_size], dtype=np.float32) 107 | for n in xrange(opts.batch_size): 108 | L = sample_caps[n].shape[0] 109 | mask[:L + 1, n] = 1 110 | reward = (sample_scores - greedy_scores).astype(np.float32) 111 | print image_id[0] 112 | print 'greedy: ', greedy_scores[0], greedy_res[0]['caption'] 113 | print 'sample: ', sample_scores[0], sample_results[0]['caption'] 114 | loss_train = model.train_on_batch(feature, caption[1:,:], mask, reward) 115 | if iter % 300 == 0: 116 | results = [] 117 | for nn in range(5000/opts.batch_size): 118 | data = loader.get_batch('val') 119 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 120 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 121 | fc_feats, att_feats = tmp 122 | if opts.use_att: 123 | att_feats = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], 124 | att_feats.shape[3]) 125 | feature_val = att_feats 126 | else: 127 | feature_val = fc_feats 128 | image_id = [data['infos'][i]['id'] for i in range(opts.batch_size)] 129 | 130 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature_val,manner='greedy',max_length=16) 131 | # Generate sentences for validation set 132 | results += greedy_res 133 | # Evaluate generated captions 134 | json.dump(results, open(osp.join(save_dir, 'result.json'), 'w')) 135 | gt_file = osp.join('data/features', 'captions_val.json') 136 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'result.json'))[-1] 137 | if score > best_score: 138 | best_score = score 139 | model.save(osp.join(save_dir, 'model.best')) 140 | model.save(osp.join(save_dir,'model.ckpt')) 141 | # Output training information 142 | logger.info('[{}], tr_loss={:.5f}, score/best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec' 143 | .format(iter, -1, score, best_score, finish_iter, timer.toc())) 144 | # Reset loss and timer 145 | train_losses = [] 146 | timer.tic() 147 | 148 | # If early-stop condition triggers 149 | if iter > finish_iter: 150 | break 151 | 152 | -------------------------------------------------------------------------------- /train_warm.py: -------------------------------------------------------------------------------- 1 | from caption_model.att import * 2 | from caption_model.fc import * 3 | import os 4 | import sys 5 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 6 | import ngram_opts 7 | from tools import * 8 | from dataloader import * 9 | opts = ngram_opts.parse_opt() 10 | if opts.caption_model == 'fc': 11 | opts.use_att = False 12 | else: 13 | opts.use_att = True 14 | batch_size = opts.batch_size 15 | 16 | loader = KKDataLoader(opts) 17 | vocabs = loader.get_vocab() 18 | vocab = ['#END#'] 19 | for i in range(len(vocabs)): 20 | ids = str(i+1) 21 | vocab.append(vocabs[ids]) 22 | if opts.use_att: 23 | save_dir = 'warm_model/' + 'att_warm' 24 | else: 25 | save_dir = 'warm_model/' + 'fc_warm' 26 | if not os.path.exists(save_dir): 27 | os.mkdir(save_dir) 28 | print(save_dir + ' has been built') 29 | 30 | image_dim = opts.fc_feat_size 31 | vocab_size = loader.vocab_size+1 # no start token for 
ngram model warm start 32 | cell_size = opts.rnn_size 33 | lr = 0.00005 34 | if opts.use_att: 35 | model = AttModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True) 36 | else: 37 | model = FCModel(batch_size=batch_size, image_dim=image_dim, vocab_size=vocab_size, cell_size=cell_size, lr=lr, on_gpu=True) 38 | 39 | logger = Logger(save_dir) 40 | iters = 0 41 | best_score = -1 42 | timer = Timer() 43 | timer.tic() 44 | finish_iter = 1000000 45 | while iters < finish_iter: 46 | iters += 1 47 | data = loader.get_batch('train') 48 | tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']] 49 | fc_feats, att_feats, labels, masks = tmp 50 | if opts.use_att: 51 | feature = att_feats.reshape(att_feats.shape[0],att_feats.shape[1]* att_feats.shape[2],att_feats.shape[3]) 52 | else: 53 | feature = fc_feats 54 | Label = labels.transpose()[1:,:] 55 | Mask = masks.transpose()[0:-1,:] 56 | reward = np.ones((opts.batch_size*5,)) 57 | loss = model.train_on_batch(feature,Label, Mask, reward) 58 | if iters % 500 == 0: 59 | results = [] 60 | for n_batches in range(5000/opts.batch_size): 61 | datas = loader.get_batch('val') 62 | tmp = [datas['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 63 | datas['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 64 | fc_feats, att_feats = tmp 65 | if opts.use_att: 66 | feature = att_feats.reshape(att_feats.shape[0], att_feats.shape[1] * att_feats.shape[2], 67 | att_feats.shape[3]) 68 | else: 69 | feature = fc_feats 70 | image_id = [datas['infos'][i]['id'] for i in range(opts.batch_size)] 71 | greedy_cap, greedy_res = model.inference(vocab,image_id,feature,manner='greedy',max_length=16) 72 | results += greedy_res 73 | json.dump(results, open(osp.join(save_dir, 'tmp_result.json'), 'w')) 74 | gt_file = osp.join('data/features', 'captions_val.json') 75 | score = evaluate(gt_file=gt_file, re_file=osp.join(save_dir, 'tmp_result.json'))[-1] 76 | 77 | if score > best_score: 78 | best_score = score 79 | model.save(osp.join(save_dir, 'model.init')) 80 | logger.info('[{}],CIDEr score/CIDEr best={:.3f}/{:.3f}, finish->{}, time={:.1f}sec' 81 | .format(iters, score, best_score, finish_iter, timer.toc())) 82 | timer.tic() 83 | -------------------------------------------------------------------------------- /vis/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | neuraltalk2 results visualization 7 | 8 | 42 | 43 | 44 |
45 | 72 | 73 | 74 | --------------------------------------------------------------------------------
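
On the rnnlm-constrained sampling in tools.py (lm2_caption_step and att2_lm_caption_step above): the language model's softmax is turned into a near-binary mask, so the caption model can only sample words to which the LM assigns probability above a threshold eta, and eta doubles at every step (eta_0 * 2**step with eta_0 = 0.00005), tightening the constraint as the caption grows. The sketch below mirrors the expression in tools.py (prob - eta, scaled by 1e7, passed through a sigmoid, start-token column dropped); the clipping is an added safeguard against overflow and is not in the original code.

import numpy as np

def lm_word_mask(prob, eta):
    # prob: (batch, vocab) softmax output of the RNN language model
    # eta:  scalar threshold, eta_0 * 2**step in the training scripts
    z = np.clip(1e7 * (prob - eta), -60.0, 60.0)  # clip only to avoid overflow in exp
    mask = 1.0 / (1.0 + np.exp(-z))               # ~1 where prob > eta, ~0 otherwise
    return mask[:, :-1]                           # drop the start-token column, as in tools.py

With eta_0 = 0.00005 the first steps are barely constrained, while by around step 10 only words the language model rates above roughly 5% probability remain available to the sampler.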