├── .gitignore ├── .gitmodules ├── README.md ├── captioning.lua ├── classification.lua ├── images ├── bathroom.jpg ├── cat_dog.jpg ├── firehydrants.jpg └── tabbycat_dog.jpg ├── misc ├── DeconvReLU.lua ├── GuidedBackpropReLU.lua ├── prepro_ques.py └── utils.lua ├── models └── download_models.sh └── visual_question_answering.lua /.gitignore: -------------------------------------------------------------------------------- 1 | models/ 2 | output/ 3 | !models/download_models.sh 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "VQA_LSTM_CNN"] 2 | path = VQA_LSTM_CNN 3 | url = git@github.com:VT-vision-lab/VQA_LSTM_CNN.git 4 | [submodule "neuraltalk2"] 5 | path = neuraltalk2 6 | url = git@github.com:karpathy/neuraltalk2.git 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ## Grad-CAM: Gradient-weighted Class Activation Mapping 3 | 4 | Code for the paper 5 | 6 | **[Grad-CAM: Why did you say that? Visual Explanations from Deep Networks via Gradient-based Localization][7]** 7 | Ramprasaath R. Selvaraju, Abhishek Das, Ramakrishna Vedantam, Michael Cogswell, Devi Parikh, Dhruv Batra 8 | [https://arxiv.org/abs/1610.02391][7] 9 | 10 | 11 | Demo: [gradcam.cloudcv.org][8] 12 | 13 | ![Overview](http://i.imgur.com/JaGbdZ5.png) 14 | 15 | ### Usage 16 | 17 | Download Caffe model(s) and prototxt for VGG-16/VGG-19/AlexNet using `sh models/download_models.sh`. 18 | 19 | #### Classification 20 | 21 | ``` 22 | th classification.lua -input_image_path images/cat_dog.jpg -label 243 -gpuid 0 23 | th classification.lua -input_image_path images/cat_dog.jpg -label 283 -gpuid 0 24 | ``` 25 | 26 | ##### Options 27 | 28 | - `proto_file`: Path to the `deploy.prototxt` file for the CNN Caffe model. Default is `models/VGG_ILSVRC_16_layers_deploy.prototxt` 29 | - `model_file`: Path to the `.caffemodel` file for the CNN Caffe model. Default is `models/VGG_ILSVRC_16_layers.caffemodel` 30 | - `input_image_path`: Path to the input image. Default is `images/cat_dog.jpg` 31 | - `input_sz`: Input image size. Default is 224 (Change to 227 if using AlexNet) 32 | - `layer_name`: Layer to use for Grad-CAM. Default is `relu5_3` (use `relu5_4` for VGG-19 and `relu5` for AlexNet) 33 | - `label`: Class label to generate grad-CAM for (-1 = use predicted class, 283 = Tiger cat, 243 = Boxer). Default is -1. These correspond to ILSVRC synset IDs 34 | - `out_path`: Path to save images in. Default is `output/` 35 | - `gpuid`: 0-indexed id of GPU to use. Default is -1 = CPU 36 | - `backend`: Backend to use with [loadcaffe][3]. Default is `nn` 37 | - `save_as_heatmap`: Whether to save heatmap or raw Grad-CAM. 1 = save heatmap, 0 = save raw Grad-CAM. 
Default is 1 38 | 39 | ##### Examples 40 | 41 | 'border collie' (233) 42 | 43 | ![](http://i.imgur.com/nTaVH57.png) 44 | ![](http://i.imgur.com/fjZ8E3Z.png) 45 | ![](http://i.imgur.com/RzPhOYo.png) 46 | 47 | 'tabby cat' (282) 48 | 49 | ![](http://i.imgur.com/nTaVH57.png) 50 | ![](http://i.imgur.com/94ZMSNI.png) 51 | ![](http://i.imgur.com/wmtUTgj.png) 52 | 53 | 'boxer' (243) 54 | 55 | ![](http://i.imgur.com/OAoSQYT.png) 56 | ![](http://i.imgur.com/iZuijZy.png) 57 | ![](http://i.imgur.com/o7RStQm.png) 58 | 59 | 'tiger cat' (283) 60 | 61 | ![](http://i.imgur.com/OAoSQYT.png) 62 | ![](http://i.imgur.com/NzXRy5E.png) 63 | ![](http://i.imgur.com/fP0Dd87.png) 64 | 65 | #### Visual Question Answering 66 | 67 | Clone the [VQA][5] ([http://arxiv.org/abs/1505.00468][4]) sub-repository (`git submodule init && git submodule update`), and download and unzip the provided extracted features and pretrained model. 68 | 69 | ``` 70 | th visual_question_answering.lua -input_image_path images/cat_dog.jpg -question 'What animal?' -answer 'dog' -gpuid 0 71 | th visual_question_answering.lua -input_image_path images/cat_dog.jpg -question 'What animal?' -answer 'cat' -gpuid 0 72 | 73 | ``` 74 | 75 | ##### Options 76 | 77 | - `proto_file`: Path to the `deploy.prototxt` file for the CNN Caffe model. Default is `models/VGG_ILSVRC_19_layers_deploy.prototxt` 78 | - `model_file`: Path to the `.caffemodel` file for the CNN Caffe model. Default is `models/VGG_ILSVRC_19_layers.caffemodel` 79 | - `input_image_path`: Path to the input image. Default is `images/cat_dog.jpg` 80 | - `input_sz`: Input image size. Default is 224 (Change to 227 if using AlexNet) 81 | - `layer_name`: Layer to use for Grad-CAM. Default is `relu5_4` (use `relu5_3` for VGG-16 and `relu5` for AlexNet) 82 | - `question`: Input question. Default is `What animal?` 83 | - `answer`: Optional answer (For eg. "cat") to generate Grad-CAM for ('' = use predicted answer). Default is '' 84 | - `out_path`: Path to save images in. Default is `output/` 85 | - `model_path`: Path to VQA model checkpoint. Default is `VQA_LSTM_CNN/lstm.t7` 86 | - `gpuid`: 0-indexed id of GPU to use. Default is -1 = CPU 87 | - `backend`: Backend to use with [loadcaffe][3]. Default is `cudnn` 88 | - `save_as_heatmap`: Whether to save heatmap or raw Grad-CAM. 1 = save heatmap, 0 = save raw Grad-CAM. Default is 1 89 | 90 | ##### Examples 91 | 92 | What animal? Dog 93 | 94 | ![](http://i.imgur.com/OAoSQYT.png) 95 | ![](http://i.imgur.com/QBTstax.png) 96 | ![](http://i.imgur.com/NRyhfdL.png) 97 | 98 | What animal? Cat 99 | 100 | ![](http://i.imgur.com/OAoSQYT.png) 101 | ![](http://i.imgur.com/hqBWRAm.png) 102 | ![](http://i.imgur.com/lwj5oAX.png) 103 | 104 | What color is the fire hydrant? Green 105 | 106 | ![](http://i.imgur.com/Zak2NZW.png) 107 | ![](http://i.imgur.com/GbhRhkg.png) 108 | ![](http://i.imgur.com/lrAgGj0.png) 109 | 110 | What color is the fire hydrant? Yellow 111 | 112 | ![](http://i.imgur.com/Zak2NZW.png) 113 | ![](http://i.imgur.com/cHzOo7k.png) 114 | ![](http://i.imgur.com/CJ6QiGD.png) 115 | 116 | What color is the fire hydrant? Green and Yellow 117 | 118 | ![](http://i.imgur.com/Zak2NZW.png) 119 | ![](http://i.imgur.com/i7AwHXx.png) 120 | ![](http://i.imgur.com/7N6BVgq.png) 121 | 122 | What color is the fire hydrant? Red and Yellow 123 | 124 | ![](http://i.imgur.com/Zak2NZW.png) 125 | ![](http://i.imgur.com/uISYeOR.png) 126 | ![](http://i.imgur.com/ebZVlTI.png) 127 | 128 | #### Image Captioning 129 | 130 | Clone the [neuraltalk2][6] sub-repository. 
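If the submodule has not been fetched yet, one way to pull just `neuraltalk2` is the following (a minimal sketch using the submodule path declared in `.gitmodules`; a plain `git submodule init && git submodule update`, as in the VQA section above, works too):

```
git submodule init neuraltalk2
git submodule update neuraltalk2
```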
Running `sh models/download_models.sh` will download the pretrained model and place it in the neuraltalk2 folder. 131 | 132 | Change lines 2-4 of `neuraltalk2/misc/LanguageModel.lua` to the following: 133 | 134 | ``` 135 | local utils = require 'neuraltalk2.misc.utils' 136 | local net_utils = require 'neuraltalk2.misc.net_utils' 137 | local LSTM = require 'neuraltalk2.misc.LSTM' 138 | ``` 139 | 140 | 141 | ``` 142 | th captioning.lua -input_image_path images/cat_dog.jpg -caption 'a dog and cat posing for a picture' -gpuid 0 143 | th captioning.lua -input_image_path images/cat_dog.jpg -caption '' -gpuid 0 144 | 145 | ``` 146 | ##### Options 147 | 148 | - `input_image_path`: Path to the input image. Default is `images/cat_dog.jpg` 149 | - `input_sz`: Input image size. Default is 224 (Change to 227 if using AlexNet) 150 | - `layer`: Layer to use for Grad-CAM. Default is 30 (relu5_3 for vgg16) 151 | - `caption`: Optional input caption. No input will use the generated caption as default 152 | - `out_path`: Path to save images in. Default is `output/` 153 | - `model_path`: Path to captioning model checkpoint. Default is `neuraltalk2/model_id1-501-1448236541.t7` 154 | - `gpuid`: 0-indexed id of GPU to use. Default is -1 = CPU 155 | - `backend`: Backend to use with [loadcaffe][3]. Default is `cudnn` 156 | - `save_as_heatmap`: Whether to save heatmap or raw Grad-CAM. 1 = save heatmap, 0 = save raw Grad-CAM. Default is 1 157 | 158 | ##### Examples 159 | 160 | a dog and cat posing for a picture 161 | 162 | ![](http://i.imgur.com/OAoSQYT.png) 163 | ![](http://i.imgur.com/TiKdMMw.png) 164 | ![](http://i.imgur.com/GSQeR2M.png) 165 | 166 | a bathroom with a toilet and a sink 167 | 168 | ![](http://i.imgur.com/gE6VXql.png) 169 | ![](http://i.imgur.com/K3E9TWS.png) 170 | ![](http://i.imgur.com/em2oHRy.png) 171 | 172 | ### License 173 | 174 | BSD 175 | 176 | #### 3rd-party 177 | 178 | - [VQA_LSTM_CNN][5], BSD 179 | - [neuraltalk2][6], BSD 180 | 181 | [3]: https://github.com/szagoruyko/loadcaffe 182 | [4]: http://arxiv.org/abs/1505.00468 183 | [5]: https://github.com/VT-vision-lab/VQA_LSTM_CNN 184 | [6]: https://github.com/karpathy/neuraltalk2 185 | [7]: https://arxiv.org/abs/1610.02391 186 | [8]: http://gradcam.cloudcv.org/ 187 | [9]: https://ramprs.github.io/ 188 | [10]: http://abhishekdas.com/ 189 | [11]: http://vrama91.github.io/ 190 | [12]: http://mcogswell.io/ 191 | [13]: https://computing.ece.vt.edu/~parikh/ 192 | [14]: https://computing.ece.vt.edu/~dbatra/ 193 | -------------------------------------------------------------------------------- /captioning.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'nn' 3 | require 'lfs' 4 | require 'image' 5 | utils = require 'misc.utils' 6 | 7 | cmd = torch.CmdLine() 8 | cmd:text('Options') 9 | 10 | -- Model parameters 11 | cmd:option('-input_sz', 224, 'Input image dimensions (use 224 for VGG Net)') 12 | cmd:option('-backend', 'cudnn') 13 | 14 | -- Grad-CAM parameters 15 | cmd:option('-layer', 30, 'Layer to use for Grad-CAM (use 30 for relu5_3 for VGG-16 )') 16 | cmd:option('-input_image_path', 'images/cat_dog.jpg', 'Input image path') 17 | cmd:option('-caption', 'a dog and a cat posing for a picture', 'Optional input caption. No input will use the generated caption as default') 18 | cmd:option('-save_as_heatmap', 1, 'Whether to save heatmap or raw Grad-CAM. 
1 = save heatmap, 0 = save raw Grad-CAM.') 19 | 20 | -- Captioning model parameters 21 | cmd:option('-model_path', 'neuraltalk2/model_id1-501-1448236541.t7', 'Path to captioning model checkpoint') 22 | 23 | -- Miscellaneous 24 | cmd:option('-seed', 123, 'Torch manual random number generator seed') 25 | cmd:option('-gpuid', -1, '0-indexed id of GPU to use. -1 = CPU') 26 | cmd:option('-out_path', 'output/', 'Output path') 27 | 28 | -- Parse command-line parameters 29 | opt = cmd:parse(arg or {}) 30 | print(opt) 31 | 32 | torch.manualSeed(opt.seed) 33 | torch.setdefaulttensortype('torch.FloatTensor') 34 | lfs.mkdir(opt.out_path) 35 | 36 | if opt.gpuid >= 0 then 37 | require 'cunn' 38 | require 'cudnn' 39 | require 'cutorch' 40 | cutorch.setDevice(opt.gpuid + 1) 41 | cutorch.manualSeed(opt.seed) 42 | end 43 | 44 | -- neuraltalk2-specific dependencies 45 | -- https://github.com/karpathy/neuraltalk2 46 | local lm_misc_utils = require 'neuraltalk2.misc.utils' 47 | require 'neuraltalk2.misc.LanguageModel' 48 | local net_utils = require 'neuraltalk2.misc.net_utils' 49 | 50 | -- Load the models 51 | local cnn_lm_model = torch.load(opt.model_path) 52 | local cnn = cnn_lm_model.protos.cnn 53 | local lm = cnn_lm_model.protos.lm 54 | local vocab = cnn_lm_model.vocab 55 | 56 | net_utils.unsanitize_gradients(cnn) 57 | local lm_modules = lm:getModulesList() 58 | for k,v in pairs(lm_modules) do 59 | net_utils.unsanitize_gradients(v) 60 | end 61 | 62 | -- Set to evaluate mode 63 | lm:evaluate() 64 | cnn:evaluate() 65 | 66 | local img = utils.preprocess(opt.input_image_path, opt.input_sz, opt.input_sz):float() 67 | 68 | -- Clone & replace ReLUs for Guided Backprop 69 | local cnn_gb = cnn:clone() 70 | cnn_gb:replace(utils.guidedbackprop) 71 | 72 | -- Ship model to GPU 73 | if opt.gpuid >= 0 then 74 | cnn:cuda() 75 | cnn_gb:cuda() 76 | img = img:cuda() 77 | lm:cuda() 78 | end 79 | 80 | -- Forward pass 81 | im_feats = cnn:forward(img) 82 | im_feat = im_feats:view(1, -1) 83 | im_feat_gb = cnn_gb:forward(img) 84 | 85 | -- get the prediction from model 86 | local seq, seqlogps = lm:sample(im_feat, sample_opts) 87 | seq[{{}, 1}] = seq 88 | 89 | local caption = net_utils.decode_sequence(vocab, seq) 90 | 91 | if opt.caption == '' then 92 | print("No caption provided, using generated caption for Grad-CAM.") 93 | opt.caption = caption[1] 94 | end 95 | 96 | print("Generated caption: ", caption[1]) 97 | print("Grad-CAM caption: ", opt.caption) 98 | 99 | local seq_length = opt.seq_length or 16 100 | 101 | local labels = utils.sent_to_label(vocab, opt.caption, seq_length) 102 | if opt.gpuid >=0 then labels = labels:cuda() end 103 | 104 | local logprobs = lm:forward({im_feat, labels}) 105 | 106 | local doutput = utils.create_grad_input_lm(logprobs, labels) 107 | if opt.gpuid >=0 then doutput = doutput:cuda() end 108 | 109 | -- lm backward 110 | local dlm, ddummy = unpack(lm:backward({im_feat, labels}, doutput)) 111 | local dcnn = dlm[1] 112 | 113 | -- Grad-CAM 114 | local gcam = utils.grad_cam(cnn, opt.layer, dcnn) 115 | gcam = image.scale(gcam:float(), opt.input_sz, opt.input_sz) 116 | local hm = utils.to_heatmap(gcam) 117 | if opt.save_as_heatmap == 1 then 118 | image.save(opt.out_path .. 'caption_gcam_hm_' .. opt.caption .. '.png', image.toDisplayTensor(hm)) 119 | else 120 | image.save(opt.out_path .. 'caption_gcam_' .. opt.caption .. 
'.png', image.toDisplayTensor(gcam)) 121 | end 122 | 123 | -- Guided Backprop 124 | local gb_viz = cnn_gb:backward(img, dcnn) 125 | -- BGR to RGB 126 | gb_viz = gb_viz:index(1, torch.LongTensor{3, 2, 1}) 127 | image.save(opt.out_path .. 'caption_gb_' .. opt.caption .. '.png', image.toDisplayTensor(gb_viz)) 128 | 129 | -- Guided Grad-CAM 130 | local gb_gcam = gb_viz:float():cmul(gcam:expandAs(gb_viz)) 131 | image.save(opt.out_path .. 'caption_gb_gcam_' .. opt.caption .. '.png', image.toDisplayTensor(gb_gcam)) 132 | -------------------------------------------------------------------------------- /classification.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'nn' 3 | require 'lfs' 4 | require 'image' 5 | require 'loadcaffe' 6 | utils = require 'misc.utils' 7 | 8 | cmd = torch.CmdLine() 9 | cmd:text('Options') 10 | 11 | -- Model parameters 12 | cmd:option('-proto_file', 'models/VGG_ILSVRC_16_layers_deploy.prototxt') 13 | cmd:option('-model_file', 'models/VGG_ILSVRC_16_layers.caffemodel') 14 | cmd:option('-input_sz', 224, 'Input image dimensions (use 227 for AlexNet)') 15 | cmd:option('-backend', 'nn') 16 | 17 | -- Grad-CAM parameters 18 | cmd:option('-layer_name', 'relu5_3', 'Layer to use for Grad-CAM (use relu5_4 for VGG-19 and relu5 for AlexNet)') 19 | cmd:option('-input_image_path', 'images/cat_dog.jpg', 'Input image path') 20 | cmd:option('-output_image_name', '', 'Output image name') 21 | cmd:option('-label',-1, 'Class label to generate grad-CAM for (-1 = use predicted class, 283 = Tiger cat, 243 = Boxer)') 22 | cmd:option('-save_as_heatmap', 1, 'Whether to save heatmap or raw Grad-CAM. 1 = save heatmap, 0 = save raw Grad-CAM.') 23 | 24 | -- Miscellaneous 25 | cmd:option('-seed', 123, 'Torch manual random number generator seed') 26 | cmd:option('-gpuid', -1, '0-indexed id of GPU to use. 
-1 = CPU') 27 | cmd:option('-out_path', 'output/', 'Output path') 28 | 29 | -- Parse command-line parameters 30 | opt = cmd:parse(arg or {}) 31 | print(opt) 32 | 33 | torch.manualSeed(opt.seed) 34 | torch.setdefaulttensortype('torch.FloatTensor') 35 | lfs.mkdir(opt.out_path) 36 | 37 | if opt.gpuid >= 0 then 38 | require 'cunn' 39 | require 'cutorch' 40 | cutorch.setDevice(opt.gpuid + 1) 41 | cutorch.manualSeed(opt.seed) 42 | end 43 | 44 | -- Load CNN 45 | local cnn = loadcaffe.load(opt.proto_file, opt.model_file, opt.backend) 46 | 47 | -- Set to evaluate and remove softmax layer 48 | cnn:evaluate() 49 | cnn:remove() 50 | 51 | -- Clone & replace ReLUs for Guided Backprop 52 | local cnn_gb = cnn:clone() 53 | cnn_gb:replace(utils.guidedbackprop) 54 | 55 | -- Load image 56 | local img = utils.preprocess(opt.input_image_path, opt.input_sz, opt.input_sz) 57 | 58 | -- Transfer to GPU 59 | if opt.gpuid >= 0 then 60 | cnn:cuda() 61 | cnn_gb:cuda() 62 | img = img:cuda() 63 | else 64 | img = img:float() 65 | end 66 | 67 | -- Forward pass 68 | local output = cnn:forward(img) 69 | local output_gb = cnn_gb:forward(img) 70 | 71 | -- Take argmax 72 | local score, pred_label = torch.max(output,1) 73 | 74 | if opt.label == -1 then 75 | print("No label provided, using predicted label ", pred_label) 76 | opt.label = pred_label[1] 77 | end 78 | 79 | -- Set gradInput 80 | local doutput = utils.create_grad_input(cnn.modules[#cnn.modules], opt.label) 81 | 82 | -- Grad-CAM 83 | local gcam = utils.grad_cam(cnn, opt.layer_name, doutput) 84 | gcam = image.scale(gcam:float(), opt.input_sz, opt.input_sz) 85 | local hm = utils.to_heatmap(gcam) 86 | 87 | if opt.output_image_name == "" then 88 | opt.output_image_name = opt.label 89 | end 90 | 91 | if opt.save_as_heatmap == 1 then 92 | image.save(opt.out_path .. 'classify_gcam_hm_' .. opt.output_image_name .. '.png', image.toDisplayTensor(hm)) 93 | else 94 | image.save(opt.out_path .. 'classify_gcam_' .. opt.output_image_name .. '.png', image.toDisplayTensor(gcam)) 95 | end 96 | 97 | -- Guided Backprop 98 | local gb_viz = cnn_gb:backward(img, doutput) 99 | -- BGR to RGB 100 | gb_viz = gb_viz:index(1, torch.LongTensor{3, 2, 1}) 101 | image.save(opt.out_path .. 'classify_gb_' .. opt.output_image_name .. '.png', image.toDisplayTensor(gb_viz)) 102 | 103 | -- Guided Grad-CAM 104 | local gb_gcam = gb_viz:float():cmul(gcam:expandAs(gb_viz)) 105 | image.save(opt.out_path .. 'classify_gb_gcam_' .. opt.output_image_name .. 
'.png', image.toDisplayTensor(gb_gcam)) 106 | -------------------------------------------------------------------------------- /images/bathroom.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ramprs/grad-cam/ea86dc1c57c83bb183d574965f4c6e5dde56bcca/images/bathroom.jpg -------------------------------------------------------------------------------- /images/cat_dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ramprs/grad-cam/ea86dc1c57c83bb183d574965f4c6e5dde56bcca/images/cat_dog.jpg -------------------------------------------------------------------------------- /images/firehydrants.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ramprs/grad-cam/ea86dc1c57c83bb183d574965f4c6e5dde56bcca/images/firehydrants.jpg -------------------------------------------------------------------------------- /images/tabbycat_dog.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ramprs/grad-cam/ea86dc1c57c83bb183d574965f4c6e5dde56bcca/images/tabbycat_dog.jpg -------------------------------------------------------------------------------- /misc/DeconvReLU.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | 3 | Implementation of DeconvReLU that backpropagates positive gradients 4 | irrespective of activations; From the paper: 5 | 6 | Visualizing and Understanding Convolutional Networks 7 | Matthew D. Zeiler, Rob Fergus 8 | https://arxiv.org/abs/1311.2901 9 | 10 | ]]-- 11 | 12 | local DeconvReLU = torch.class('nn.DeconvReLU', 'nn.Module') 13 | 14 | function DeconvReLU:updateOutput(input) 15 | self.output:resizeAs(input):copy(input) 16 | return self.output:cmul(torch.gt(input,0):typeAs(input)) 17 | end 18 | 19 | function DeconvReLU:updateGradInput(input, gradOutput) 20 | self.gradInput:resizeAs(gradOutput):copy(gradOutput) 21 | return self.gradInput:cmul(torch.gt(gradOutput,0):typeAs(input)) 22 | end 23 | -------------------------------------------------------------------------------- /misc/GuidedBackpropReLU.lua: -------------------------------------------------------------------------------- 1 | --[[ 2 | 3 | Implementation of GuidedBackpropReLU that backpropagates positive gradients 4 | to input elements with positive activations; From the paper: 5 | 6 | Striving for Simplicity: The All Convolutional Net 7 | Jost Tobias Springenberg, Alexey Dosovitskiy, Thomas Brox, Martin Riedmiller 8 | http://arxiv.org/abs/1412.6806 9 | 10 | ]]-- 11 | 12 | local GuidedBackpropReLU = torch.class('nn.GuidedBackpropReLU', 'nn.Module') 13 | 14 | function GuidedBackpropReLU:updateOutput(input) 15 | self.output:resizeAs(input):copy(input) 16 | return self.output:cmul(torch.gt(input,0):typeAs(input)) 17 | end 18 | 19 | function GuidedBackpropReLU:updateGradInput(input, gradOutput) 20 | self.gradInput:resizeAs(gradOutput):copy(gradOutput) 21 | return self.gradInput:cmul(torch.gt(input,0):typeAs(input)):cmul(torch.gt(gradOutput,0):typeAs(input)) 22 | end 23 | -------------------------------------------------------------------------------- /misc/prepro_ques.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocesses a question 3 | Adapted from https://github.com/VT-vision-lab/VQA_LSTM_CNN/blob/master/prepro.py 4 | """ 5 | import copy 6 | from random import 
shuffle, seed 7 | import sys 8 | import os.path 9 | import argparse 10 | import glob 11 | import numpy as np 12 | from scipy.misc import imread, imresize 13 | import scipy.io 14 | import pdb 15 | import string 16 | import h5py 17 | from nltk.tokenize import word_tokenize 18 | import json 19 | 20 | import re 21 | 22 | 23 | def tokenize(sentence): 24 | return [i for i in re.split(r"([-.\"',:? !\$#@~()*&\^%;\[\]/\\\+<>\n=])", sentence) if i!='' and i!=' ' and i!='\n']; 25 | 26 | def prepro_question(s, method='nltk'): 27 | if method == 'nltk': 28 | txt = word_tokenize(str(s).lower()) 29 | else: 30 | txt = tokenize(s) 31 | return txt 32 | 33 | def apply_vocab_question(tokens, wtoi): 34 | # apply the vocab on test. 35 | question = [w if wtoi.get(w,len(wtoi)+1) != (len(wtoi)+1) else 'UNK' for w in tokens] 36 | return question 37 | 38 | def encode_question(ques, wtoi): 39 | max_length = 26 40 | 41 | label_arrays = np.zeros((max_length), dtype='uint32') 42 | label_length = min(max_length, len(ques)) # record the length of this sequence 43 | for k, w in enumerate(ques): 44 | if k < max_length : 45 | print(w) 46 | label_arrays[k] = wtoi[w] 47 | 48 | return label_arrays, label_length 49 | 50 | def feat_ques(question): 51 | # tokenization and preprocessing training question 52 | tokens = prepro_question(question) 53 | 54 | # create the vocab for question 55 | # Load Vocabulary File 56 | with open('VQA_LSTM_CNN/data_prepro.json', 'r') as f: 57 | itow = json.load(f)['ix_to_word'] 58 | wtoi = {w:i for i,w in itow.items()} # inverse table 59 | 60 | fin_ques = apply_vocab_question(tokens, wtoi) 61 | ques, ques_length = encode_question(fin_ques, wtoi) 62 | q = {} 63 | q['ques'] = ques.tolist() 64 | q['ques_length'] = ques_length 65 | with open('ques_feat.json','w') as q_file: 66 | json.dump(q,q_file) 67 | print ques.tolist() 68 | 69 | return ques.tolist(), ques_length 70 | 71 | def main(params): 72 | question = params['question'] 73 | ques, ques_length = feat_ques(question) 74 | 75 | if __name__ == "__main__": 76 | parser = argparse.ArgumentParser() 77 | 78 | parser.add_argument('-d', '--question', dest='question', default='what is the man doing', help='question string') 79 | args = parser.parse_args() 80 | params = vars(args) 81 | main(params) 82 | -------------------------------------------------------------------------------- /misc/utils.lua: -------------------------------------------------------------------------------- 1 | local utils = {} 2 | 3 | -- Preprocess the image before passing it to a Caffe model. 
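-- (loads the image, replicates greyscale inputs to 3 channels and drops the alpha
-- channel of RGBA inputs, resizes to width x height, reorders RGB to BGR, rescales
-- to [0, 255] and subtracts the ILSVRC mean pixel; returns the preprocessed tensor
-- along with the original image height and width)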
4 | function utils.preprocess(path, width, height) 5 | local width = width or 224 6 | local height = height or 224 7 | 8 | -- load image 9 | local orig_image = image.load(path) 10 | 11 | -- handle greyscale and rgba images 12 | if orig_image:size(1) == 1 then 13 | orig_image = orig_image:repeatTensor(3, 1, 1) 14 | elseif orig_image:size(1) == 4 then 15 | orig_image = orig_image[{{1,3},{},{}}] 16 | end 17 | 18 | -- get the dimensions of the original image 19 | local im_height = orig_image:size(2) 20 | local im_width = orig_image:size(3) 21 | 22 | -- scale and subtract mean 23 | local img = image.scale(orig_image, width, height):double() 24 | local mean_pixel = torch.DoubleTensor({103.939, 116.779, 123.68}) 25 | img = img:index(1, torch.LongTensor{3, 2, 1}):mul(255.0) 26 | mean_pixel = mean_pixel:view(3, 1, 1):expandAs(img) 27 | img:add(-1, mean_pixel) 28 | return img, im_height, im_width 29 | end 30 | 31 | -- Replace ReLUs with DeconvReLUs 32 | function utils.deconv(m) 33 | require 'misc.DeconvReLU' 34 | local name = torch.typename(m) 35 | if name == 'nn.ReLU' or name == 'cudnn.ReLU' then 36 | return nn.DeconvReLU() 37 | else 38 | return m 39 | end 40 | end 41 | 42 | -- Replace ReLUs with GuidedBackpropReLUs 43 | function utils.guidedbackprop(m) 44 | require 'misc.GuidedBackpropReLU' 45 | local name = torch.typename(m) 46 | if name == 'nn.ReLU' or name == 'cudnn.ReLU' then 47 | return nn.GuidedBackpropReLU() 48 | else 49 | return m 50 | end 51 | end 52 | 53 | -- Get layer id from name 54 | function utils.cnn_layer_id(cnn, layer_name) 55 | for i = 1, #cnn.modules do 56 | local layer = cnn:get(i) 57 | local name = layer.name 58 | if name == layer_name then 59 | return i 60 | end 61 | end 62 | return -1 63 | end 64 | 65 | -- Synthesize gradInput tensor 66 | function utils.create_grad_input(module, label) 67 | local doutput = module.output:clone():view(-1) 68 | doutput:fill(0) 69 | doutput[label] = 1 70 | return doutput 71 | end 72 | 73 | -- Creates gradInput for neuraltalk2 Language Model 74 | function utils.create_grad_input_lm(input, labels) 75 | local output = torch.zeros(input:size()):fill(0) 76 | for t =1,labels:size(1) do 77 | if labels[t][1]~=0 then 78 | output[t+1][1][labels[t][1]] = 1 79 | end 80 | end 81 | return output 82 | end 83 | 84 | -- Generate Grad-CAM 85 | function utils.grad_cam(cnn, layer_name, doutput) 86 | -- Split model into two 87 | local model1, model2 = nn.Sequential(), nn.Sequential() 88 | if tonumber(layer_name) == nil then 89 | 90 | for i = 1, #cnn.modules do 91 | model1:add(cnn:get(i)) 92 | layer_id = i 93 | if cnn:get(i).name == layer_name then 94 | break 95 | end 96 | end 97 | else 98 | 99 | layer_id = tonumber(layer_name) 100 | for i = 1, #cnn.modules do 101 | model1:add(cnn:get(i)) 102 | if i == layer_id then 103 | break 104 | end 105 | end 106 | end 107 | 108 | for i = layer_id+1, #cnn.modules do 109 | model2:add(cnn:get(i)) 110 | end 111 | 112 | -- Get activations and gradients 113 | model2:zeroGradParameters() 114 | model2:backward(model1.output, doutput) 115 | 116 | -- Get the activations from model1 and and gradients from model2 117 | local activations = model1.output:squeeze() 118 | local gradients = model2.gradInput:squeeze() 119 | 120 | -- Global average pool gradients 121 | local weights = torch.sum(gradients:view(activations:size(1), -1), 2) 122 | 123 | -- Summing and rectifying weighted activations across depth 124 | local map = torch.sum(torch.cmul(activations, weights:view(activations:size(1), 1, 1):expandAs(activations)), 1) 125 | map = 
map:cmul(torch.gt(map,0):typeAs(map)) 126 | 127 | return map 128 | end 129 | 130 | function utils.table_invert(t) 131 | local s = {} 132 | for k,v in pairs(t) do 133 | s[v] = k 134 | end 135 | return s 136 | end 137 | 138 | function utils.sent_to_label(vocab, sent, seq_length) 139 | local inv_vocab = utils.table_invert(vocab) 140 | local labels = torch.zeros(seq_length,1) 141 | local i = 0 142 | for word in sent:gmatch'%w+' do 143 | -- we replace out of vocabulary words with UNK 144 | if inv_vocab[word] == nil then 145 | word = 'UNK' 146 | end 147 | local ix_word = inv_vocab[word] 148 | i = i+1 149 | labels[{{i},{1}}] = ix_word 150 | end 151 | return labels 152 | end 153 | 154 | function utils.to_heatmap(map) 155 | map = image.toDisplayTensor(map) 156 | local cmap = torch.Tensor(3, map:size(2), map:size(3)):fill(1) 157 | for i = 1, map:size(2) do 158 | for j = 1, map:size(3) do 159 | local value = map[1][i][j] 160 | if value <= 0.25 then 161 | cmap[1][i][j] = 0 162 | cmap[2][i][j] = 4*value 163 | elseif value <= 0.5 then 164 | cmap[1][i][j] = 0 165 | cmap[3][i][j] = 2 - 4*value 166 | elseif value <= 0.75 then 167 | cmap[1][i][j] = 4*value - 2 168 | cmap[3][i][j] = 0 169 | else 170 | cmap[2][i][j] = 4 - 4*value 171 | cmap[3][i][j] = 0 172 | end 173 | end 174 | end 175 | return cmap 176 | end 177 | 178 | return utils 179 | -------------------------------------------------------------------------------- /models/download_models.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | cd models 4 | 5 | # AlexNet 6 | # wget -c https://raw.githubusercontent.com/BVLC/caffe/master/models/bvlc_alexnet/deploy.prototxt -O bvlc_alexnet_deploy.prototxt 7 | # wget -c http://dl.caffe.berkeleyvision.org/bvlc_alexnet.caffemodel 8 | 9 | # VGG-16 10 | wget -c https://gist.githubusercontent.com/ksimonyan/211839e770f7b538e2d8/raw/c3ba00e272d9f48594acef1f67e5fd12aff7a806/VGG_ILSVRC_16_layers_deploy.prototxt 11 | wget -c http://www.robots.ox.ac.uk/~vgg/software/very_deep/caffe/VGG_ILSVRC_16_layers.caffemodel 12 | 13 | # VGG-19 14 | wget -c https://gist.githubusercontent.com/ksimonyan/3785162f95cd2d5fee77/raw/bb2b4fe0a9bb0669211cf3d0bc949dfdda173e9e/VGG_ILSVRC_19_layers_deploy.prototxt 15 | wget -c http://www.robots.ox.ac.uk/~vgg/software/very_deep/caffe/VGG_ILSVRC_19_layers.caffemodel 16 | 17 | cd .. 18 | 19 | # VQA 20 | cd VQA_LSTM_CNN 21 | wget -c https://filebox.ece.vt.edu/~jiasenlu/codeRelease/vqaRelease/train_only/data_train_val.zip 22 | wget -c https://filebox.ece.vt.edu/~jiasenlu/codeRelease/vqaRelease/train_only/pretrained_lstm_train_val.t7.zip 23 | unzip data_train_val.zip 24 | unzip pretrained_lstm_train_val.t7.zip 25 | cd .. 26 | 27 | # neuraltalk2 28 | cd neuraltalk2 29 | wget http://cs.stanford.edu/people/karpathy/neuraltalk2/checkpoint_v1.zip 30 | unzip checkpoint_v1.zip 31 | cd .. 
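# Note: the VQA_LSTM_CNN and neuraltalk2 steps above assume those submodules have
# already been checked out (git submodule init && git submodule update);
# the cd commands fail otherwise.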
32 | -------------------------------------------------------------------------------- /visual_question_answering.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | require 'nn' 3 | require 'lfs' 4 | require 'image' 5 | require 'loadcaffe' 6 | utils = require 'misc.utils' 7 | 8 | cmd = torch.CmdLine() 9 | cmd:text('Options') 10 | 11 | -- Model parameters 12 | cmd:option('-proto_file', 'models/VGG_ILSVRC_19_layers_deploy.prototxt') 13 | cmd:option('-model_file', 'models/VGG_ILSVRC_19_layers.caffemodel') 14 | cmd:option('-input_sz', 224, 'Input image dimensions (use 227 for AlexNet)') 15 | cmd:option('-backend', 'nn') 16 | 17 | -- Grad-CAM parameters 18 | cmd:option('-layer_name', 'relu5_4', 'Layer to use for Grad-CAM (use relu5_3 for VGG-16 and relu5 for AlexNet)') 19 | cmd:option('-input_image_path', 'images/cat_dog.jpg', 'Input image path') 20 | cmd:option('-question', 'What animal?', 'Input question') 21 | cmd:option('-answer', '', 'Optional answer (For eg. "cat") to generate Grad-CAM for ("" = use predicted answer).') 22 | cmd:option('-save_as_heatmap', 1, 'Whether to save heatmap or raw Grad-CAM. 1 = save heatmap, 0 = save raw Grad-CAM.') 23 | 24 | -- VQA model parameters 25 | cmd:option('-model_path', 'VQA_LSTM_CNN/lstm.t7', 'Path to VQA model checkpoint') 26 | cmd:option('-input_encoding_size', 200, 'Encoding size of each token in the vocabulary') 27 | cmd:option('-rnn_size', 512, 'Size of the LSTM hidden state') 28 | cmd:option('-rnn_layers', 2, 'Number of the LSTM layers') 29 | cmd:option('-common_embedding_size', 1024, 'Size of the common embedding vector') 30 | cmd:option('-num_output', 1000, 'Number of output answers') 31 | 32 | -- Miscellaneous 33 | cmd:option('-seed', 123, 'Torch manual random number generator seed') 34 | cmd:option('-gpuid', -1, '0-indexed id of GPU to use. 
-1 = CPU') 35 | cmd:option('-out_path', 'output/', 'Output path') 36 | 37 | -- Parse command-line parameters 38 | opt = cmd:parse(arg or {}) 39 | print(opt) 40 | 41 | torch.manualSeed(opt.seed) 42 | torch.setdefaulttensortype('torch.DoubleTensor') 43 | lfs.mkdir(opt.out_path) 44 | 45 | if opt.gpuid >= 0 then 46 | require 'cunn' 47 | require 'cutorch' 48 | cutorch.setDevice(opt.gpuid + 1) 49 | cutorch.manualSeed(opt.seed) 50 | end 51 | 52 | -- Load CNN 53 | local cnn = loadcaffe.load(opt.proto_file, opt.model_file, opt.backend) 54 | 55 | -- Set to evaluate and remove linear+softmax layer 56 | cnn:evaluate() 57 | cnn:remove() 58 | cnn:remove() 59 | cnn:add(nn.Normalize(2)) 60 | 61 | -- Clone & replace ReLUs for Guided Backprop 62 | local cnn_gb = cnn:clone() 63 | cnn_gb:replace(utils.guidedbackprop) 64 | 65 | -- VQA-specific dependencies 66 | -- https://github.com/VT-vision-lab/VQA_LSTM_CNN/blob/master/eval.lua 67 | require 'VQA_LSTM_CNN/misc.netdef' 68 | require 'VQA_LSTM_CNN/misc.RNNUtils' 69 | LSTM = require 'VQA_LSTM_CNN/misc.LSTM' 70 | cjson = require 'cjson' 71 | 72 | -- Load vocabulary 73 | local file = io.open('VQA_LSTM_CNN/data_prepro.json','r') 74 | local text = file:read() 75 | file:close() 76 | local json_file = cjson.decode(text) 77 | local vocabulary_size_q = 0 78 | for i, w in pairs(json_file['ix_to_word']) do vocabulary_size_q = vocabulary_size_q + 1 end 79 | 80 | -- VQA model definition 81 | local embedding_net_q = nn.Sequential() 82 | :add(nn.Linear(vocabulary_size_q, opt.input_encoding_size)) 83 | :add(nn.Dropout(0.5)) 84 | :add(nn.Tanh()) 85 | 86 | local encoder_net_q = LSTM.lstm_conventional(opt.input_encoding_size, opt.rnn_size, 1, opt.rnn_layers, 0.5) 87 | 88 | local multimodal_net = nn.Sequential() 89 | :add(netdef.AxB(2 * opt.rnn_size * opt.rnn_layers, 4096, opt.common_embedding_size, 0.5)) 90 | :add(nn.Dropout(0.5)) 91 | :add(nn.Linear(opt.common_embedding_size, opt.num_output)) 92 | 93 | local dummy_state_q = torch.Tensor(opt.rnn_size * opt.rnn_layers * 2):fill(0) 94 | local dummy_output_q = torch.Tensor(1):fill(0) 95 | 96 | -- Ship model to GPU 97 | if opt.gpuid >= 0 then 98 | embedding_net_q:cuda() 99 | encoder_net_q:cuda() 100 | multimodal_net:cuda() 101 | dummy_state_q = dummy_state_q:cuda() 102 | dummy_output_q = dummy_output_q:cuda() 103 | end 104 | 105 | -- Set to evaluate 106 | embedding_net_q:evaluate() 107 | encoder_net_q:evaluate() 108 | multimodal_net:evaluate() 109 | 110 | -- Zero gradients 111 | embedding_net_q:zeroGradParameters() 112 | encoder_net_q:zeroGradParameters() 113 | multimodal_net:zeroGradParameters() 114 | 115 | -- Load pretrained VQA model 116 | embedding_w_q, embedding_dw_q = embedding_net_q:getParameters() 117 | encoder_w_q, encoder_dw_q = encoder_net_q:getParameters() 118 | multimodal_w, multimodal_dw = multimodal_net:getParameters() 119 | 120 | model_param = torch.load(opt.model_path) 121 | embedding_w_q:copy(model_param['embedding_w_q']) 122 | encoder_w_q:copy(model_param['encoder_w_q']) 123 | multimodal_w:copy(model_param['multimodal_w']) 124 | 125 | local encoder_net_buffer_q = dupe_rnn(encoder_net_q, 26) 126 | 127 | -- Load image 128 | local img = utils.preprocess(opt.input_image_path, opt.input_sz, opt.input_sz) 129 | 130 | -- Ship CNNs and image to GPU 131 | if opt.gpuid >= 0 then 132 | cnn:cuda() 133 | cnn_gb:cuda() 134 | img = img:cuda() 135 | end 136 | 137 | -- Forward pass 138 | fv_im = cnn:forward(img) 139 | fv_im_gb = cnn_gb:forward(img) 140 | 141 | -- Tokenize question 142 | local cmd = 'python misc/prepro_ques.py 
--question "'.. opt.question..'"' 143 | os.execute(cmd) 144 | file = io.open('ques_feat.json') 145 | text = file:read() 146 | file:close() 147 | q_feats = cjson.decode(text) 148 | 149 | question = right_align(torch.LongTensor{q_feats.ques}, torch.LongTensor{q_feats.ques_length}) 150 | fv_sorted_q = sort_encoding_onehot_right_align(question, torch.LongTensor{q_feats.ques_length}, vocabulary_size_q) 151 | 152 | -- Ship question features to GPU 153 | if opt.gpuid >= 0 then 154 | fv_sorted_q[1] = fv_sorted_q[1]:cuda() 155 | fv_sorted_q[3] = fv_sorted_q[3]:cuda() 156 | fv_sorted_q[4] = fv_sorted_q[4]:cuda() 157 | else 158 | fv_sorted_q[1] = fv_sorted_q[1]:double() 159 | end 160 | 161 | local question_max_length = fv_sorted_q[2]:size(1) 162 | 163 | -- Embedding forward 164 | local word_embedding_q = split_vector(embedding_net_q:forward(fv_sorted_q[1]), fv_sorted_q[2]) 165 | 166 | -- Encoder forward 167 | local states_q, _ = rnn_forward(encoder_net_buffer_q, torch.repeatTensor(dummy_state_q:fill(0), 1, 1), word_embedding_q, fv_sorted_q[2]) 168 | 169 | -- Multimodal forward 170 | local tv_q = states_q[question_max_length + 1]:index(1, fv_sorted_q[4]) 171 | local scores = multimodal_net:forward({tv_q, fv_im}) 172 | 173 | -- Get predictions 174 | _, pred = torch.max(scores:double(), 2) 175 | answer = json_file['ix_to_ans'][tostring(pred[{1, 1}])] 176 | 177 | local inv_vocab = utils.table_invert(json_file['ix_to_ans']) 178 | -- Replace out of vocabulary answers with predicted answer 179 | if opt.answer ~= '' and inv_vocab[opt.answer] ~= nil then answer_idx = inv_vocab[opt.answer] else opt.answer = answer answer_idx = inv_vocab[answer] end 180 | 181 | print("Question: ", opt.question) 182 | print("Predicted answer: ", answer) 183 | print("Grad-CAM answer: ", opt.answer) 184 | 185 | -- Set gradInput 186 | local doutput = utils.create_grad_input(multimodal_net.modules[#multimodal_net.modules], answer_idx) 187 | 188 | -- Multimodal backward 189 | local tmp = multimodal_net:backward({tv_q, fv_im}, doutput:view(1,-1)) 190 | local dcnn = tmp[2] 191 | 192 | -- Grad-CAM 193 | local gcam = utils.grad_cam(cnn, opt.layer_name, dcnn) 194 | gcam = image.scale(gcam:float(), opt.input_sz, opt.input_sz) 195 | local hm = utils.to_heatmap(gcam) 196 | if opt.save_as_heatmap == 1 then 197 | image.save(opt.out_path .. 'vqa_gcam_hm_' .. opt.answer .. '.png', image.toDisplayTensor(hm)) 198 | else 199 | image.save(opt.out_path .. 'vqa_gcam_' .. opt.answer .. '.png', image.toDisplayTensor(gcam)) 200 | end 201 | 202 | -- Guided Backprop 203 | local gb_viz = cnn_gb:backward(img, dcnn) 204 | -- BGR to RGB 205 | gb_viz = gb_viz:index(1, torch.LongTensor{3, 2, 1}) 206 | image.save(opt.out_path .. 'vqa_gb_' .. opt.answer .. '.png', image.toDisplayTensor(gb_viz)) 207 | 208 | -- Guided Grad-CAM 209 | local gb_gcam = gb_viz:float():cmul(gcam:expandAs(gb_viz)) 210 | image.save(opt.out_path .. 'vqa_gb_gcam_' .. opt.answer .. '.png', image.toDisplayTensor(gb_gcam)) 211 | --------------------------------------------------------------------------------