├── .gitignore ├── ILSVRC_evaluate_bbox.m ├── ILSVRC_generate_heatmap.m ├── README.md ├── bboxgenerator ├── Makefile ├── cut ├── dt.c ├── dt.h ├── dt_box ├── dt_box.cpp ├── gc.cpp ├── heatmap_6.jpg ├── heatmap_6.txt └── sample_6.jpg ├── categories1000.mat ├── data_img1.mat ├── data_img2.mat ├── data_net.mat ├── demo.m ├── generate_bbox.m ├── ilsvrc_2012_mean.mat ├── img1.jpg ├── img2.jpg ├── map2jpg.m ├── mergeTenCrop.m ├── models ├── categoriesImageNet.mat ├── categories_places205.mat ├── deploy_alexnetplusCAM_imagenet.prototxt ├── deploy_alexnetplusCAM_places205.prototxt ├── deploy_googlenetCAM.prototxt ├── deploy_googlenetCAM_places205.prototxt ├── deploy_vgg16CAM.prototxt └── download.sh ├── prepare_image.m ├── py_demo.py ├── py_generate_bbox.py ├── py_map2jpg.py ├── py_returnCAMmap.py └── returnCAMmap.m /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | models/*.caffemodel 3 | -------------------------------------------------------------------------------- /ILSVRC_evaluate_bbox.m: -------------------------------------------------------------------------------- 1 | 2 | 3 | datasetName = 'ILSVRCvalSet'; 4 | load('imagenet_toolkit/ILSVRC2014_devkit/evaluation/cache_groundtruth.mat'); 5 | load('imagenet_toolkit/ILSVRC2014_devkit/data/meta_clsloc.mat'); 6 | datasetPath = 'dataset/ILSVRC2012'; 7 | load([datasetPath '/imageListVal.mat']); 8 | load('sizeImg_ILSVRC2014.mat'); 9 | 10 | % datasetName = 'ILSVRCtestSet'; 11 | % datasetPath = '/data/vision/torralba/deeplearning/imagenet_toolkit'; 12 | % load([datasetPath '/imageListTest.mat']); 13 | 14 | 15 | nImgs = size(imageList,1); 16 | 17 | ground_truth_file='imagenet_toolkit/ILSVRC2014_devkit/data/ILSVRC2014_clsloc_validation_ground_truth.txt'; 18 | gt_labels = dlmread(ground_truth_file); 19 | 20 | categories_gt = []; 21 | categoryIDMap = containers.Map(); 22 | for i=1:numel(synsets) 23 | categories_gt{synsets(i).ILSVRC2014_ID,1} = synsets(i).words; 24 | categories_gt{synsets(i).ILSVRC2014_ID,2} = synsets(i).WNID; 25 | categoryIDMap(synsets(i).WNID) = i; 26 | end 27 | 28 | 29 | 30 | %% network to evaluate 31 | % backpropa-heatmap 32 | %netName = 'caffeNet_imagenet'; 33 | %netName = 'googlenetBVLC_imagenet'; 34 | %netName = 'VGG16_imagenet'; 35 | 36 | % CAM-based network 37 | %netName = 'NIN'; 38 | %netName = 'CAM_imagenetCNNaveSumDeep'; 39 | %netName = 'CAM_googlenetBVLC_imagenet';% the direct output 40 | netName = 'CAM_googlenetBVLCshrink_imagenet'; 41 | %netName = 'CAM_googlenetBVLCshrink_imagenet_maxpool'; 42 | %netName = 'CAM_VGG16_imagenet'; 43 | %netName = 'CAM_alexnet'; 44 | 45 | load('categoriesImageNet.mat'); 46 | 47 | visualizationPointer = 0; 48 | 49 | topCategoryNum = 5; 50 | predictionResult_bbox1 = zeros(nImgs, topCategoryNum*5); 51 | predictionResult_bbox2 = zeros(nImgs, topCategoryNum*5); 52 | predictionResult_bboxCombine = zeros(nImgs, topCategoryNum*5); 53 | 54 | if matlabpool('size')==0 55 | try 56 | matlabpool 57 | catch e 58 | end 59 | end 60 | 61 | heatMapFolder = ['heatMap-' datasetName '-' netName]; 62 | bbox_threshold = [20, 100, 110]; 63 | curParaThreshold = [num2str(bbox_threshold(1)) ' ' num2str(bbox_threshold(2)) ' ' num2str(bbox_threshold(3))]; 64 | parfor i=1:size(imageList,1) 65 | curImgIDX = i; 66 | 67 | height_original = sizeFull_imageList(curImgIDX,1);%tmp.Height; 68 | weight_original = sizeFull_imageList(curImgIDX,2);%tmp.Width; 69 | 70 | [a b c] = fileparts(imageList{curImgIDX,1}); 71 | curPath_fullSizeImg = 
['/data/vision/torralba/deeplearning/imagenet_toolkit/ILSVRC2012_img_val/' b c]; 72 | curMatFile = [heatMapFolder '/' b '.mat']; 73 | [heatMapSet, value_category, IDX_category] = loadHeatMap( curMatFile); 74 | 75 | curResult_bbox1 = []; 76 | curResult_bbox2 = []; 77 | curResult_bboxCombine = []; 78 | for j=1:5 79 | curHeatMapFile = [heatMapFolder '/top' num2str(j) '/' b '.jpg']; 80 | 81 | curBBoxFile = [heatMapFolder '/top' num2str(j) '/' b '_default.txt']; 82 | %curBBoxFileGraphcut = [heatMapFolder '/top' num2str(j) '/' b '_graphcut.txt']; 83 | curCategory = categories{IDX_category(j),1}; 84 | %imwrite(curHeatMap, ['result_bbox/heatmap_tmp' b randString '.jpg']); 85 | if ~exist(curBBoxFile) 86 | %system(['/data/vision/torralba/deeplearning/package/bbox_hui/final ' curHeatMapFile ' ' curBBoxFile]); 87 | 88 | system(['/data/vision/torralba/deeplearning/package/bbox_hui_new/./dt_box ' curHeatMapFile ' ' curParaThreshold ' ' curBBoxFile]); 89 | end 90 | curPredictCategory = categories{IDX_category(j),1}; 91 | curPredictCategoryID = categories{IDX_category(j),1}(1:9); 92 | curPredictCategoryGTID = categoryIDMap(curPredictCategoryID); 93 | 94 | 95 | boxData = dlmread(curBBoxFile); 96 | boxData_formulate = [boxData(1:4:end)' boxData(2:4:end)' boxData(1:4:end)'+boxData(3:4:end)' boxData(2:4:end)'+boxData(4:4:end)']; 97 | boxData_formulate = [min(boxData_formulate(:,1),boxData_formulate(:,3)),min(boxData_formulate(:,2),boxData_formulate(:,4)),max(boxData_formulate(:,1),boxData_formulate(:,3)),max(boxData_formulate(:,2),boxData_formulate(:,4))]; 98 | 99 | % try 100 | % boxDataGraphcut = dlmread(curBBoxFileGraphcut); 101 | % boxData_formulateGraphcut = [boxDataGraphcut(1:4:end)' boxDataGraphcut(2:4:end)' boxDataGraphcut(1:4:end)'+boxDataGraphcut(3:4:end)' boxDataGraphcut(2:4:end)'+boxDataGraphcut(4:4:end)']; 102 | % catch exception 103 | % boxDataGraphcut = dlmread(curBBoxFile); 104 | % boxData_formulateGraphcut = [boxDataGraphcut(1:4:end)' boxDataGraphcut(2:4:end)' boxDataGraphcut(1:4:end)'+boxDataGraphcut(3:4:end)' boxDataGraphcut(2:4:end)'+boxDataGraphcut(4:4:end)']; 105 | % boxData_formulateGraphcut = boxData_formulateGraphcut(1,:); 106 | % end 107 | 108 | bbox = boxData_formulate(1,:); 109 | curPredictTuple = [curPredictCategoryGTID bbox(1) bbox(2) bbox(3) bbox(4)]; 110 | curResult_bbox1 = [curResult_bbox1 curPredictTuple]; 111 | curResult_bboxCombine = [curResult_bboxCombine curPredictTuple]; 112 | 113 | bbox = boxData_formulate(2,:); 114 | %bbox = boxData_formulateGraphcut(1,:); 115 | curPredictTuple = [curPredictCategoryGTID bbox(1) bbox(2) bbox(3) bbox(4)]; 116 | curResult_bbox2 = [curResult_bbox2 curPredictTuple]; 117 | 118 | curResult_bboxCombine = [curResult_bboxCombine curPredictTuple]; 119 | if visualizationPointer == 1 120 | 121 | curHeatMap = imread(curHeatMapFile); 122 | curHeatMap = imresize(curHeatMap,[height_original weight_original]); 123 | 124 | subplot(1,2,1),hold off, imshow(curPath_fullSizeImg); 125 | hold on 126 | curBox = boxData_formulate(1,:); 127 | rectangle('Position',[curBox(1) curBox(2) curBox(3)-curBox(1) curBox(4)-curBox(2)],'EdgeColor',[1 0 0]); 128 | subplot(1,2,2),imagesc(curHeatMap); 129 | title(curCategory); 130 | waitforbuttonpress 131 | end 132 | end 133 | 134 | predictionResult_bbox1(i, :) = curResult_bbox1; 135 | predictionResult_bbox2(i, :) = curResult_bbox2; 136 | predictionResult_bboxCombine(i,:) = curResult_bboxCombine(1:topCategoryNum*5); 137 | disp([netName ' processing ' b]) 138 | end 139 | 140 | 141 | 
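%% each row of predictionResult_* holds five [categoryID x1 y1 x2 y2] tuples (one per top-5 guess), the flat format expected by the devkit's simpleEvaluation below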
addpath('imagenet_toolkit/ILSVRC2014_devkit/evaluation'); 142 | disp([netName '--------bbox1' ]); 143 | [cls_error, clsloc_error] = simpleEvaluation(predictionResult_bbox1); 144 | disp([(1:5)',clsloc_error,cls_error]); 145 | 146 | disp([netName '--------bbox2' ]); 147 | [cls_error, clsloc_error] = simpleEvaluation(predictionResult_bbox2); 148 | disp([(1:5)',clsloc_error,cls_error]); 149 | 150 | disp([netName '--------bboxCombine' ]); 151 | [cls_error, clsloc_error] = simpleEvaluation(predictionResult_bboxCombine); 152 | disp([(1:5)',clsloc_error,cls_error]); 153 | -------------------------------------------------------------------------------- /ILSVRC_generate_heatmap.m: -------------------------------------------------------------------------------- 1 | % raw script used to generate heatmaps for the ILSVRC localization experiment 2 | % please load the necessary packages (e.g., matcaffe and the ILSVRC toolbox) correctly; some functions in matcaffe may already be deprecated. 3 | % take it as an example of how to reproduce the ILSVRC localization experiment. 4 | % 5 | % Bolei Zhou. 6 | 7 | addpath('caffeCPU2/matlab/caffe'); 8 | 9 | modelSetFolder = 'CAMnet'; 10 | 11 | %% CAMnet 12 | 13 | 14 | % netName = 'CAM_googlenetBVLC_imagenet'; 15 | % model_file = [modelSetFolder '/googlenet_imagenet/bvlc_googlenet.caffemodel']; 16 | % model_def_file = [modelSetFolder '/googlenet_imagenet/deploy.prototxt']; 17 | 18 | % netName = 'CAM_alexnet'; 19 | % model_file = [modelSetFolder '/alexnet/CAMmodels/caffeNetCAM_imagenet_train_iter_100000.caffemodel']; 20 | % model_def_file = [modelSetFolder '/alexnet/deploy_caffeNetCAM.prototxt']; 21 | 22 | netName = 'CAM_googlenetBVLCshrink_imagenet'; 23 | model_file = [modelSetFolder '/googlenet_imagenet/CAMmodels/imagenet_googleletCAM_train_iter_80000.caffemodel']; 24 | model_def_file = [modelSetFolder '/googlenet_imagenet/deploy_googlenetCAM.prototxt']; 25 | 26 | 27 | % netName = 'CAM_VGG16_imagenet'; 28 | % model_file = [modelSetFolder '/VGGnet/models/vgg16CAM_train_iter_50000.caffemodel']; 29 | % model_def_file = [modelSetFolder '/VGGnet/deploy_vgg16CAM.prototxt']; 30 | 31 | 32 | %% loading the network 33 | caffe('init', model_def_file, model_file,'test'); 34 | caffe('set_mode_gpu'); 35 | caffe('set_device',0); 36 | 37 | %% inspect the network weights and layers 38 | 39 | weights = caffe('get_weights'); 40 | weights_LR = squeeze(weights(end,1).weights{1,1}); 41 | bias_LR = weights(end,1).weights{2,1}; 42 | layernames = caffe('get_names'); 43 | response = caffe('get_all_layers'); 44 | netInfo = cell(size(layernames,1),3); 45 | for i=1:size(layernames,1) 46 | netInfo{i,1} = layernames{i}; 47 | netInfo{i,2} = i; 48 | netInfo{i,3} = size(response{i,1}); 49 | end 50 | 51 | load('categoriesImageNet.mat'); 52 | d = load('/data/vision/torralba/small-projects/bolei_deep/caffe/ilsvrc_2012_mean.mat'); 53 | IMAGE_MEAN = d.image_mean; 54 | IMAGE_DIM = 256; 55 | CROPPED_DIM = netInfo{1,3}(1); 56 | 57 | weightInfo = cell(size(weights,1),1); 58 | for i=1:size(weights,1) 59 | weightInfo{i,1} = weights(i,1).layer_names; 60 | weightInfo{i,2} = weights(i,1).weights{1,1}; 61 | weightInfo{i,3} = size(weights(i,1).weights{1,1}); 62 | end 63 | 64 | %% testing to predict some image 65 | 66 | datasetName = 'ILSVRCvalSet'; 67 | datasetPath = '/data/vision/torralba/gigaSUN/deeplearning/dataset/ILSVRC2012'; 68 | load([datasetPath '/imageListVal.mat']); 69 | load('sizeImg_ILSVRC2014.mat'); 70 | % datasetName = 'ILSVRCtestSet'; 71 | % datasetPath = 
'/data/vision/torralba/deeplearning/imagenet_toolkit'; 72 | % load([datasetPath '/imageListTest.mat']); 73 | 74 | 75 | 76 | saveFolder = ['heatMap-' datasetName '-' netName]; 77 | if ~exist(saveFolder) 78 | mkdir(saveFolder); 79 | end 80 | for i=1:5 81 | if ~exist([saveFolder '/top' num2str(i)]) 82 | mkdir([saveFolder '/top' num2str(i)]); 83 | end 84 | end 85 | 86 | for i = 1:size(imageList,1) 87 | curImgIDX = i; 88 | [a b c] = fileparts(imageList{curImgIDX,1}); 89 | saveMatFile = [saveFolder '/' b '.mat']; 90 | if ~exist(saveMatFile) 91 | height_original = sizeFull_imageList(curImgIDX,1);%tmp.Height; 92 | weight_original = sizeFull_imageList(curImgIDX,2);%tmp.Width; 93 | 94 | 95 | curImg = imread(imageList{curImgIDX,1}); 96 | 97 | if size(curImg,3)==1 98 | curImg = repmat(curImg,[1 1 3]); 99 | end 100 | 101 | 102 | scores = caffe('forward', {prepare_img(curImg, IMAGE_MEAN, CROPPED_DIM)}); 103 | response = caffe('get_all_layers'); 104 | scoresMean = mean(squeeze(scores{1}),2); 105 | [value_category, IDX_category] = sort(scoresMean,'descend'); 106 | 107 | 108 | featureObjectSwitchSpatial = squeeze(response{end-3,1}); 109 | [curColumnMap] = returnColumnMap(featureObjectSwitchSpatial, weights_LR(:,IDX_category(1:5))); 110 | 111 | 112 | 113 | for j=1:5 114 | curFeatureMap = squeeze(curColumnMap(:,:,j,:)); 115 | curFeatureMap_crop = imresize(curFeatureMap,[netInfo{1,3}(1) netInfo{1,3}(2)]); 116 | gradients = zeros([netInfo{1,3}(1) netInfo{1,3}(2) 3 10]); 117 | gradients(:,:,1,:) = curFeatureMap_crop; 118 | gradients(:,:,2,:) = curFeatureMap_crop; 119 | gradients(:,:,3,:) = curFeatureMap_crop; 120 | 121 | [alignImgMean alignImgSet] = crop2img(gradients); 122 | alignImgMean = single(alignImgMean); 123 | alignImgMean = imresize(alignImgMean, [height_original weight_original]); 124 | alignImgMean = alignImgMean./max(alignImgMean(:)); 125 | 126 | 127 | imwrite(alignImgMean, [saveFolder '/top' num2str(j) '/' b '.jpg']); 128 | 129 | end 130 | value_category = single(value_category); 131 | IDX_category = single(IDX_category); 132 | save(saveMatFile,'value_category','IDX_category'); 133 | disp([netName ' processing ' b]); 134 | end 135 | end 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Class Activation Mapping for Python 2 | I have rewritten the files demo.m and generate_bbox.m in Python so that the scripts can be used without Matlab. To run them in Python, one just needs to run 3 | ``` 4 | python py_demo.py 5 | ``` 6 | and 7 | ``` 8 | python py_generate_bbox.py 9 | ``` 10 | # Sample code for the Class Activation Mapping 11 | We propose a simple technique to expose the implicit attention of Convolutional Neural Networks on the image. It highlights the most informative image regions relevant to the predicted class. You can get an attention-based model instantly by tweaking your own CNN a little. The paper is published at [CVPR'16](http://arxiv.org/pdf/1512.04150.pdf). 
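For a class c, the class activation map is just the class-weighted sum of the last convolutional feature maps, M_c(x, y) = sum_k w_k^c * f_k(x, y). Below is a minimal numpy sketch of that computation (the operation implemented by returnCAMmap.m / py_returnCAMmap.py); the names `conv_feats` and `fc_weights` are illustrative, not this repo's API:
```
import numpy as np

def compute_cam(conv_feats, fc_weights, class_idx):
    # conv_feats: (h, w, k) activations of the last conv layer (e.g. CAM_conv)
    # fc_weights: (k, n_classes) weights of the final FC layer (e.g. CAM_fc)
    h, w, k = conv_feats.shape
    cam = conv_feats.reshape(h * w, k).dot(fc_weights[:, class_idx])
    cam = cam.reshape(h, w)
    cam = cam - cam.min()
    return cam / cam.max()  # normalize to [0, 1] before resizing/colormapping
```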
12 | 13 | The framework of Class Activation Mapping is shown below: 14 | ![Framework](http://cnnlocalization.csail.mit.edu/framework.jpg) 15 | 16 | Some predicted class activation maps are: 17 | ![Results](http://cnnlocalization.csail.mit.edu/example.jpg) 18 | 19 | ### Pre-trained models: 20 | * GoogLeNet-CAM model on ImageNet: ```models/deploy_googlenetCAM.prototxt``` weights:[http://cnnlocalization.csail.mit.edu/demoCAM/models/imagenet_googleletCAM_train_iter_120000.caffemodel] 21 | * VGG16-CAM model on ImageNet: ```models/deploy_vgg16CAM.prototxt``` weights:[http://cnnlocalization.csail.mit.edu/demoCAM/models/vgg16CAM_train_iter_90000.caffemodel] 22 | * GoogLeNet-CAM model on Places205: ```models/deploy_googlenetCAM_places205.prototxt``` weights:[http://cnnlocalization.csail.mit.edu/demoCAM/models/places_googleletCAM_train_iter_120000.caffemodel] 23 | * AlexNet+-CAM on ImageNet: ```models/deploy_alexnetplusCAM_imagenet.prototxt``` weights:[http://cnnlocalization.csail.mit.edu/demoCAM/models/alexnetplusCAM_imagenet.caffemodel] 24 | * AlexNet+-CAM on Places205 (used in the [online demo](http://places.csail.mit.edu/demo.html)): ```models/deploy_alexnetplusCAM_places205.prototxt``` weights:[http://cnnlocalization.csail.mit.edu/demoCAM/models/alexnetplusCAM_places205.caffemodel] 25 | 26 | ### Usage Instructions: 27 | * Install [caffe](https://github.com/BVLC/caffe), compile matcaffe (the matlab wrapper for caffe), and make sure you can run the prediction example code classification.m. 28 | * Clone the code from Github: 29 | ``` 30 | git clone https://github.com/metalbubble/CAM.git 31 | cd CAM 32 | ``` 33 | * Download the pretrained networks: 34 | ``` 35 | sh models/download.sh 36 | ``` 37 | * Run the demo code to generate the heatmap: in the matlab terminal, 38 | ``` 39 | demo 40 | ``` 41 | * Run the demo code to generate bounding boxes from the heatmap: in the matlab terminal, 42 | ``` 43 | generate_bbox 44 | ``` 45 | 46 | A demo video of what the CNN is looking at is [here](https://www.youtube.com/watch?v=fZvOy0VXWAI). A reimplementation in TensorFlow is [here](https://github.com/jazzsaxmafia/Weakly_detector). 47 | 48 | ### Reference: 49 | ``` 50 | @inproceedings{zhou2016cvpr, 51 | author = {Zhou, Bolei and Khosla, Aditya and Lapedriza, Agata and Oliva, Aude and Torralba, Antonio}, 52 | title = {Learning Deep Features for Discriminative Localization}, 53 | booktitle = {Computer Vision and Pattern Recognition}, 54 | year = {2016} 55 | } 56 | ``` 57 | ### License: 58 | The pre-trained models and the CAM technique are released for unrestricted use. 59 | 60 | Contact [Bolei Zhou](http://people.csail.mit.edu/bzhou/) if you have questions. 
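### Bounding box output format:
```bboxgenerator/dt_box``` writes all detected boxes to a single line of space-separated ```x y width height``` quadruples (see ```bboxgenerator/heatmap_6.txt``` and the output() function in dt_box.cpp). Here is a small Python sketch of the parsing and corner-format conversion that generate_bbox.m does with dlmread:
```
def read_boxes(path):
    # each box is an "x y width height" quadruple on one whitespace-separated line
    vals = [int(v) for v in open(path).read().split()]
    boxes = [vals[i:i + 4] for i in range(0, len(vals), 4)]
    # convert [x, y, w, h] to corner format [x1, y1, x2, y2]
    return [[x, y, x + w, y + h] for (x, y, w, h) in boxes]

print(read_boxes('bboxgenerator/heatmap_6.txt'))
# -> [[41, 145, 403, 347], [102, 188, 344, 314]]
```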
61 | 62 | -------------------------------------------------------------------------------- /bboxgenerator/Makefile: -------------------------------------------------------------------------------- 1 | all: cut dt_box 2 | 3 | 4 | cut: 5 | g++ -g -O3 gc.cpp -o cut `pkg-config --libs opencv` -lm 6 | 7 | dt_box: 8 | g++ -g -O3 dt.c dt_box.cpp -o dt_box `pkg-config --libs opencv` -lm 9 | 10 | .PHONY: clean 11 | clean: 12 | rm cut dt_box 13 | -------------------------------------------------------------------------------- /bboxgenerator/cut: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcucurull/CAM-Python/f1f83e8433c0c34d532b4878adc9d7b69948c2d7/bboxgenerator/cut -------------------------------------------------------------------------------- /bboxgenerator/dt.c: -------------------------------------------------------------------------------- 1 | /** 2 | * Distance transform for binary image or gray-scale image. 3 | * @param 4 | * @return 5 | */ 6 | 7 | #include <stdlib.h> 8 | #include <math.h> 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | #define INF 1E20 15 | 16 | #define SQUARE(q) ((q)*(q)) 17 | #define MAX(a,b) ((a) > (b) ? (a) : (b)) 18 | #define ROUND(t) ((int)((t) + 0.5)) 19 | #define BOUND_8U(t) ((t) < 0 ? 0 : (t) > 255 ? 255 : (t)) 20 | 21 | static void dt_row(const double *f, int n, double *d, double *z, int *v) { 22 | int k, q; 23 | 24 | v[0] = 0; 25 | z[0] = -INF; 26 | z[1] = +INF; 27 | 28 | k = 0; 29 | for (q = 1; q < n; ++q) { 30 | double s = ((f[q]+SQUARE(q))-(f[v[k]]+SQUARE(v[k])))/(double)(2*q-2*v[k]); 31 | while (s <= z[k]) { 32 | k--; 33 | s = ((f[q]+SQUARE(q))-(f[v[k]]+SQUARE(v[k])))/(double)(2*q-2*v[k]); 34 | } 35 | k++; 36 | v[k] = q; 37 | z[k] = s; 38 | z[k+1] = +INF; 39 | } 40 | 41 | k = 0; 42 | for (q = 0; q < n; ++q) { 43 | while (z[k+1] < q) 44 | k++; 45 | d[q] = SQUARE(q-v[k]) + f[v[k]]; 46 | } 47 | } 48 | 49 | void 50 | dt(double *m, int rows, int cols) { 51 | const int n = MAX(rows, cols); 52 | double *f = (double *)malloc(sizeof(f[0]) * n); 53 | double *d = (double *)malloc(sizeof(d[0]) * n); 54 | double *z = (double *)malloc(sizeof(z[0]) * (n+ 1)); 55 | int *v = (int *)malloc(sizeof(v[0]) * n); 56 | int x, y; 57 | 58 | for (x = 0; x < cols; ++x) { 59 | for (y = 0; y < rows; ++y) { 60 | f[y] = m[y*cols + x]; 61 | } 62 | dt_row(f, rows, d, z, v); 63 | for (y = 0; y < rows; ++y) { 64 | m[y*cols + x] = d[y]; 65 | } 66 | } 67 | 68 | for (y = 0; y < rows; ++y) { 69 | for (x = 0; x < cols; ++x) { 70 | f[x] = m[y*cols + x]; 71 | } 72 | dt_row(f, cols, d, z, v); 73 | for (x = 0; x < cols; ++x) { 74 | m[y*cols + x] = d[x]; 75 | } 76 | } 77 | 78 | free(f); 79 | free(d); 80 | free(z); 81 | free(v); 82 | } 83 | 84 | static void 85 | min_max(const double *m, int sz, double *min, double *max) { 86 | double mi = m[0], ma = m[0]; 87 | int i = 1; 88 | for (; i < sz; ++i) { 89 | if (m[i] > ma) { 90 | ma = m[i]; 91 | } 92 | else if (m[i] < mi) { 93 | mi = m[i]; 94 | } 95 | } 96 | *min = mi; 97 | *max = ma; 98 | } 99 | 100 | static void 101 | double_to_image(const double *m, int rows, int cols, unsigned char *data, int step) { 102 | int i, j; 103 | double mi, ma, scale; 104 | min_max(m, rows * cols, &mi, &ma); 105 | 106 | if (mi == ma) { 107 | return ; 108 | } 109 | 110 | scale = 255.0 / (ma - mi); 111 | 112 | for (i = 0; i < rows; ++i) { 113 | for (j = 0; j < cols; ++j) { 114 | const double s = m[i*cols + j] * scale; 115 | const int t = ROUND(s); 116 | data[i*step + j] = BOUND_8U(t); 117 | } 118 | } 119 | } 120
| 121 | static void 122 | sqrt_m(double *m, int sz) { 123 | int i = 0; 124 | for (; i < sz; ++i) { 125 | m[i] = sqrt(m[i]); 126 | } 127 | } 128 | 129 | void 130 | dt_gray(unsigned char *gray, int rows, int cols, int step) { 131 | double *m = (double *)malloc(sizeof(m[0]) * rows * cols); 132 | int i, j; 133 | const double vstep = 100.0; /* big enough to transform the distance... */ 134 | for (i = 0; i < rows; ++i) { 135 | for (j = 0; j < cols; ++j) { 136 | m[i*cols + j] = vstep * (double)gray[i*step + j]; 137 | } 138 | } 139 | 140 | dt(m, rows, cols); 141 | sqrt_m(m, rows * cols); 142 | double_to_image(m, rows, cols, gray, step); 143 | free(m); 144 | } 145 | 146 | void 147 | dt_binary(unsigned char *bimg, int rows, int cols, int step) { 148 | double *m = (double *)malloc(sizeof(m[0]) * rows * cols); 149 | int i, j; 150 | for (i = 0; i < rows; ++i) { 151 | for (j = 0; j < cols; ++j) { 152 | m[i*cols + j] = bimg[i*step + j] > 0 ? +INF : 0.0; 153 | } 154 | } 155 | 156 | dt(m, rows, cols); 157 | sqrt_m(m, rows*cols); 158 | double_to_image(m, rows, cols, bimg, step); 159 | free(m); 160 | } 161 | 162 | 163 | #ifdef __cplusplus 164 | } 165 | #endif 166 | -------------------------------------------------------------------------------- /bboxgenerator/dt.h: -------------------------------------------------------------------------------- 1 | #ifndef DT_H 2 | #define DT_H 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | void dt(double *m, int rows, int cols); 9 | void dt_binary(unsigned char *bimg, int rows, int cols, int step); 10 | void dt_gray(unsigned char *gray, int rows, int cols, int step); 11 | 12 | #ifdef __cplusplus 13 | } 14 | #endif 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /bboxgenerator/dt_box: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcucurull/CAM-Python/f1f83e8433c0c34d532b4878adc9d7b69948c2d7/bboxgenerator/dt_box -------------------------------------------------------------------------------- /bboxgenerator/dt_box.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | ---------------------------------------- 3 | Given a heatmap, output the bboxes. 4 | 5 | 0. Get the DT-ed images 6 | 1. detect all contours in the thresholded maps 7 | 2. merge based on some rules. 8 | 3. 
output the bboxes 9 | ---------------------------------------- 10 | */ 11 | #include <cv.h> 12 | #include "dt.h" 13 | #include <highgui.h> 14 | #include <stdio.h> 15 | #include <vector> 16 | #include <algorithm> 17 | 18 | using namespace cv; 19 | using std::vector; 20 | 21 | #define SCALE_NUM 3 22 | 23 | struct Data 24 | { 25 | Data() : size(SCALE_NUM) 26 | { 27 | for (int i = 0; i < SCALE_NUM; ++i) 28 | { 29 | images[i] = NULL; 30 | } 31 | } 32 | 33 | ~Data() 34 | { 35 | for (int i = 0; i < SCALE_NUM; ++i) 36 | { 37 | if (images[i]) 38 | { 39 | cvReleaseImage(&(images[i])); 40 | images[i] = NULL; 41 | } 42 | } 43 | } 44 | int size; 45 | IplImage *images[SCALE_NUM]; 46 | }; 47 | 48 | 49 | static int g_Ths[SCALE_NUM] = {30, 90, 150}; 50 | 51 | static Data * 52 | fromDT(const IplImage *gray) 53 | { 54 | Data *data = new Data; 55 | for (int i = 0; i < data->size; ++i) 56 | { 57 | data->images[i] = cvCreateImage(cvGetSize(gray), 8, 1); 58 | cvThreshold(gray, data->images[i], g_Ths[i], 255, CV_THRESH_BINARY); 59 | dt_binary((unsigned char*)data->images[i]->imageData, data->images[i]->height, data->images[i]->width, data->images[i]->widthStep); 60 | cvThreshold(data->images[i], data->images[i], 10, 255, CV_THRESH_BINARY); 61 | } 62 | return data; 63 | } 64 | 65 | 66 | static int 67 | LIMIT(int v, int L, int R) 68 | { 69 | return v < L ? L : (v > R ? R : v); 70 | } 71 | 72 | static vector<CvRect> 73 | getBBox(struct Data *data) 74 | { 75 | vector<CvRect> bboxes; 76 | const int W = data->images[0]->width; 77 | const int H = data->images[0]->height; 78 | 79 | for (int i = 0; i < data->size; ++i) 80 | { 81 | cv::Mat a = cv::cvarrToMat(data->images[i]); 82 | vector< vector<Point> > contours; 83 | vector<Vec4i> hie; 84 | cv::findContours(a, contours, hie, CV_RETR_CCOMP, CV_CHAIN_APPROX_SIMPLE); 85 | for (int j = 0; j < contours.size(); ++j) 86 | { 87 | cv::Rect bb = cv::boundingRect( contours[j] ); 88 | CvRect cr; 89 | cr.x = LIMIT(bb.x, 0, W-5); 90 | cr.y = LIMIT(bb.y, 0, H-5); 91 | cr.width = LIMIT(bb.width, 0, W - bb.x-5); 92 | cr.height = LIMIT(bb.height, 0, H-bb.y-5); 93 | //printf("%d, %d, %d, %d\n", W, H, cr.width, cr.height); 94 | bboxes.push_back(cr); 95 | } 96 | } 97 | return bboxes; 98 | } 99 | 100 | 101 | /* 102 | ---------------------------------------- 103 | x_overlap = Math.max(0, Math.min(x12,x22) - Math.max(x11,x21)); 104 | y_overlap = Math.max(0, Math.min(y12,y22) - Math.max(y11,y21)); 105 | overlapArea = x_overlap * y_overlap; 106 | ---------------------------------------- 107 | */ 108 | static bool 109 | big_overlap(const CvRect &a, const CvRect &b) 110 | { 111 | int t = (double)std::max(a.width * a.height, b.width * b.height) * 0.5; 112 | int x11, y11, x12, y12, x21, y21, x22, y22; 113 | x11 = a.x; 114 | y11 = a.y; 115 | x12 = a.x + a.width; 116 | y12 = a.y + a.height; 117 | x21 = b.x; 118 | y21 = b.y; 119 | x22 = b.x + b.width; 120 | y22 = b.y + b.height; 121 | int x_overlap = std::max(0, std::min(x12,x22) - std::max(x11,x21)); 122 | int y_overlap = std::max(0, std::min(y12,y22) - std::max(y11,y21)); 123 | int overlapArea = x_overlap * y_overlap; 124 | return overlapArea > t; 125 | } 126 | 127 | /* 128 | ---------------------------------------- 129 | 1. Overlap > max(area(A), area(B)) * 0.5 130 | 131 | 0. rank BB 132 | 1. 
from big to small: 133 | 134 | ---------------------------------------- 135 | */ 136 | static void 137 | mergeBBox(vector<CvRect> &bboxes) 138 | { 139 | for (int i = 0; i < bboxes.size(); ++i) 140 | { 141 | for (int j = i + 1; j < bboxes.size(); ++j) 142 | { 143 | if (big_overlap(bboxes[i], bboxes[j])) 144 | { 145 | // remove small one 146 | bboxes.erase(bboxes.begin() + j); --j; /* stay at this index: the next box shifted into slot j */ 147 | } 148 | } 149 | } 150 | return ; 151 | } 152 | 153 | static bool 154 | my_cmp(const CvRect& a, const CvRect& b) 155 | { 156 | return a.width * a.height > b.width * b.height; 157 | } 158 | 159 | 160 | static void 161 | rankBBox(vector<CvRect> &bboxes) 162 | { 163 | std::sort(bboxes.begin(), bboxes.end(), my_cmp); 164 | } 165 | 166 | 167 | static void 168 | draw(const vector<CvRect> &rects, const char *iname) 169 | { 170 | IplImage *img = cvLoadImage(iname, 1); 171 | const CvScalar color = cvScalar(0,0,255,0); 172 | 173 | for (int i = 0; i < rects.size(); ++i) 174 | { 175 | CvRect r = rects[i]; 176 | cvRectangle(img, cvPoint(r.x, r.y), cvPoint(r.x + r.width, r.y + r.height), color, 3, 8, 0); 177 | } 178 | cvNamedWindow("draw", 1); 179 | cvShowImage("draw", img); 180 | cvWaitKey(0); 181 | cvReleaseImage(&img); 182 | } 183 | 184 | 185 | static void 186 | output(const vector<CvRect> &rects, const char *filen) 187 | { 188 | FILE *fp = fopen(filen, "w"); 189 | assert(fp != NULL); 190 | for (int i = 0; i < rects.size(); ++i) 191 | { 192 | fprintf(fp, "%d %d %d %d ", rects[i].x, rects[i].y, rects[i].width, rects[i].height); 193 | } 194 | fclose(fp); 195 | return ; 196 | } 197 | 198 | static void 199 | output(const vector<CvRect> &rects) 200 | { 201 | for (int i = 0; i < rects.size(); ++i) 202 | { 203 | printf("%d %d %d %d ", rects[i].x, rects[i].y, rects[i].width, rects[i].height); 204 | } 205 | printf("\n"); 206 | } 207 | 208 | int 209 | main(int argc, char *argv[]) 210 | { 211 | if (argc != 5 && argc != 6) 212 | { 213 | puts(">>>./program image.jpg th0 th1 th2\nor"); 214 | puts(">>>./program image.jpg th0 th1 th2 output.txt"); 215 | return -1; 216 | } 217 | 218 | IplImage *gray = cvLoadImage(argv[1], 0); 219 | if (!gray) 220 | { 221 | puts("Can not open image, dude!\n"); 222 | } 223 | 224 | // set the thresholds 225 | { 226 | int t0, t1, t2; 227 | t0 = atoi(argv[2]); 228 | t1 = atoi(argv[3]); 229 | t2 = atoi(argv[4]); 230 | if (0 < t0 && t0 < t1 && t1 < t2 && t2 < 255) 231 | { 232 | g_Ths[0] = t0; 233 | g_Ths[1] = t1; 234 | g_Ths[2] = t2; 235 | } 236 | } 237 | 238 | 239 | Data *data = fromDT(gray); 240 | vector<CvRect> rects = getBBox(data); 241 | rankBBox(rects); 242 | mergeBBox(rects); 243 | 244 | 245 | //if (argc == 4) 246 | // draw(rects, argv[3]); 247 | 248 | 249 | 250 | if (argc == 6) 251 | output(rects, argv[5]); 252 | else 253 | output(rects); 254 | 255 | delete data; 256 | cvReleaseImage(&gray); 257 | return 0; 258 | } 259 | -------------------------------------------------------------------------------- /bboxgenerator/gc.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | ---------------------------------------- 3 | Using the heat map as the foreground input of 4 | the grabcut. 5 | 6 | Update: 7 | 0. output the biggest bounding box 8 | 1. expose the two thresholding values to the command line. 9 | ---------------------------------------- 10 | */ 11 | #include <opencv2/opencv.hpp> 12 | #include <stdio.h> 13 | #include <vector> 14 | 15 | using namespace cv; 16 | using std::vector; 17 | 18 | static int g_th0 = 10; 19 | static int g_th1 = 40; 20 | 21 | /* 22 | ---------------------------------------- 23 | Using the simplest thresholding to get the foreground. 
24 | ---------------------------------------- 25 | */ 26 | static Mat 27 | foreground(const Mat &heatmap) 28 | { 29 | Mat bm; 30 | Mat re = heatmap.clone(); 31 | re.setTo(GC_BGD); 32 | 33 | threshold(heatmap, bm, g_th0, 255, THRESH_BINARY); 34 | re.setTo(GC_PR_BGD, bm); 35 | threshold(heatmap, bm, g_th1, 255, THRESH_BINARY); 36 | re.setTo(GC_PR_FGD, bm); 37 | 38 | return re; 39 | } 40 | 41 | 42 | static Mat 43 | cut(const Mat &src, const Mat &heatmap) 44 | { 45 | Mat mask = foreground(heatmap); 46 | Mat bgModel,fgModel; 47 | grabCut(src, mask, Rect(), bgModel,fgModel, 1, cv::GC_INIT_WITH_MASK); 48 | Mat1b mask_fgpf = ( mask == cv::GC_FGD) | (mask == cv::GC_PR_FGD); 49 | Mat3b tmp = Mat3b::zeros(src.rows, src.cols); 50 | src.copyTo(tmp, mask_fgpf); 51 | return tmp; 52 | } 53 | 54 | /* 55 | ---------------------------------------- 56 | The same as cut_mask, but saves the segmented image 57 | ---------------------------------------- 58 | */ 59 | static Mat 60 | cut_mask_save(const Mat &src, const Mat &heatmap, const char *dstname) 61 | { 62 | Mat mask = foreground(heatmap); 63 | Mat bgModel,fgModel; 64 | grabCut(src, mask, Rect(), bgModel,fgModel, 1, cv::GC_INIT_WITH_MASK); 65 | Mat mask_fgpf = (mask == cv::GC_FGD) | (mask == cv::GC_PR_FGD); 66 | Mat tmp = Mat3b::zeros(src.rows, src.cols); 67 | src.copyTo(tmp, mask_fgpf); 68 | imwrite(dstname, tmp); 69 | return mask_fgpf; 70 | } 71 | 72 | 73 | /* 74 | ---------------------------------------- 75 | cut, return the mask. 76 | ---------------------------------------- 77 | */ 78 | static Mat 79 | cut_mask(const Mat &src, const Mat &heatmap) 80 | { 81 | Mat mask = foreground(heatmap); 82 | Mat bgModel,fgModel; 83 | grabCut(src, mask, Rect(), bgModel,fgModel, 1, cv::GC_INIT_WITH_MASK); 84 | return ( mask == cv::GC_FGD) | (mask == cv::GC_PR_FGD); 85 | } 86 | 87 | 88 | static bool 89 | rect_cmp(const Rect& a, const Rect& b) 90 | { 91 | return a.area()> b.area(); 92 | } 93 | 94 | 95 | static vector<Rect> 96 | bbox(Mat &mask) 97 | { 98 | vector<Rect> rs; 99 | vector< vector<Point> > contours; 100 | vector<Vec4i> hie; 101 | findContours(mask, contours, hie, CV_RETR_CCOMP, CV_CHAIN_APPROX_SIMPLE); 102 | for (int j = 0; j < contours.size(); ++j) 103 | { 104 | const Rect bb = boundingRect( contours[j] ); 105 | if (bb.area() > 10) 106 | { 107 | rs.push_back(bb); 108 | } 109 | 110 | } 111 | 112 | if (rs.size() == 0) 113 | { 114 | rs.push_back(Rect(0,0,mask.cols, mask.rows)); 115 | return rs; 116 | } 117 | 118 | sort(rs.begin(), rs.end(), rect_cmp); 119 | return rs; 120 | } 121 | 122 | 123 | static void 124 | output(const vector<Rect> &rs, const char *filen) 125 | { 126 | FILE *fp = fopen(filen, "w"); 127 | assert(fp != NULL); 128 | for (int i = 0; i < rs.size(); ++i) 129 | { 130 | fprintf(fp, "%d %d %d %d ", rs[i].x, rs[i].y, rs[i].width, rs[i].height); 131 | } 132 | 133 | fclose(fp); 134 | return ; 135 | } 136 | 137 | int 138 | main(int argc, char *argv[]) 139 | { 140 | if (argc != 4 && argc != 6 && argc != 7) 141 | { 142 | puts(">>./cut sample.jpg heat.jpg output.txt\nor"); 143 | puts(">>./cut sample.jpg heat.jpg output.txt th1[=10] th2[=40]\nor"); 144 | puts(">>./cut sample.jpg heat.jpg output.txt th1[=10] th2[=40] save_image_name.jpg"); 145 | return 0; 146 | } 147 | 148 | if (argc == 6) 149 | { 150 | int t0 = atoi(argv[4]); 151 | int t1 = atoi(argv[5]); 152 | if (0 <= t0 && t0 < t1 && t1 <= 255) 153 | { 154 | g_th0 = t0; 155 | g_th1 = t1; 156 | } 157 | } 158 | 159 | Mat src = imread(argv[1], 1); 160 | Mat heat = imread(argv[2], 0); 161 | Mat m; 162 | 163 | if (argc == 7) 164 | m = 
cut_mask_save(src, heat, argv[6]); 165 | else 166 | m = cut_mask(src, heat); 167 | 168 | vector<Rect> bbs = bbox(m); 169 | output(bbs, argv[3]); 170 | //rectangle(src, box, Scalar(0,0,255)); 171 | //imwrite(argv[3], src); 172 | //imshow("result", src); 173 | //waitKey(0); 174 | return 0; 175 | } 176 | -------------------------------------------------------------------------------- /bboxgenerator/heatmap_6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcucurull/CAM-Python/f1f83e8433c0c34d532b4878adc9d7b69948c2d7/bboxgenerator/heatmap_6.jpg -------------------------------------------------------------------------------- /bboxgenerator/heatmap_6.txt: -------------------------------------------------------------------------------- 1 | 41 145 362 202 102 188 242 126 -------------------------------------------------------------------------------- /bboxgenerator/sample_6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcucurull/CAM-Python/f1f83e8433c0c34d532b4878adc9d7b69948c2d7/bboxgenerator/sample_6.jpg -------------------------------------------------------------------------------- /categories1000.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcucurull/CAM-Python/f1f83e8433c0c34d532b4878adc9d7b69948c2d7/categories1000.mat -------------------------------------------------------------------------------- /data_img1.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcucurull/CAM-Python/f1f83e8433c0c34d532b4878adc9d7b69948c2d7/data_img1.mat -------------------------------------------------------------------------------- /data_img2.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcucurull/CAM-Python/f1f83e8433c0c34d532b4878adc9d7b69948c2d7/data_img2.mat -------------------------------------------------------------------------------- /data_net.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcucurull/CAM-Python/f1f83e8433c0c34d532b4878adc9d7b69948c2d7/data_net.mat -------------------------------------------------------------------------------- /demo.m: -------------------------------------------------------------------------------- 1 | % Sample code to generate class activation map from 10 crops of activations 2 | % Bolei Zhou, March 15, 2016 3 | % for the online prediction, make sure you have compiled matcaffe 4 | 5 | clear 6 | addpath('/xxx/yyy/caffe/matlab'); 7 | 8 | imgID = 2; % 1 or 2 9 | img = imread(['img' num2str(imgID) '.jpg']); 10 | img = imresize(img, [256 256]); 11 | online = 0; % whether to extract features online or load pre-extracted features 12 | 13 | load('categories1000.mat'); 14 | if online == 1 15 | % load the CAM model and extract features 16 | 17 | net_weights = ['models/imagenet_googleletCAM_train_iter_120000.caffemodel']; 18 | net_model = ['models/deploy_googlenetCAM.prototxt']; 19 | net = caffe.Net(net_model, net_weights, 'test'); 20 | 21 | weights_LR = net.params('CAM_fc',1).get_data();% get the softmax layer of the network 22 | 23 | scores = net.forward({prepare_image(img)});% extract conv features online 24 | activation_lastconv = net.blobs('CAM_conv').get_data(); 25 | scores = scores{1}; 26 | else 27 | % use the extracted features and softmax parameters 
cached beforehand 28 | load('data_net.mat'); % it contains the softmax weights and the category names of the network 29 | load(['data_img' num2str(imgID) '.mat']); % it contains the pre-extracted conv features 30 | end 31 | 32 | 33 | 34 | 35 | %% Class Activation Mapping 36 | 37 | topNum = 5; % generate heatmap for top X prediction results 38 | scoresMean = mean(scores,2); 39 | [value_category, IDX_category] = sort(scoresMean,'descend'); 40 | [curCAMmapAll] = returnCAMmap(activation_lastconv, weights_LR(:,IDX_category(1:topNum))); 41 | 42 | curResult = im2double(img); 43 | curPrediction = ''; 44 | 45 | for j=1:topNum 46 | curCAMmap_crops = squeeze(curCAMmapAll(:,:,j,:)); 47 | curCAMmapLarge_crops = imresize(curCAMmap_crops,[256 256]); 48 | curCAMLarge = mergeTenCrop(curCAMmapLarge_crops); 49 | curHeatMap = imresize(im2double(curCAMLarge),[256 256]); 50 | curHeatMap = im2double(curHeatMap); 51 | 52 | curHeatMap = map2jpg(curHeatMap,[], 'jet'); 53 | curHeatMap = im2double(img)*0.2+curHeatMap*0.7; 54 | curResult = [curResult ones(size(curHeatMap,1),8,3) curHeatMap]; 55 | curPrediction = [curPrediction ' --top' num2str(j) ':' categories{IDX_category(j)}]; 56 | 57 | end 58 | figure,imshow(curResult);title(curPrediction) 59 | 60 | if online==1 61 | caffe.reset_all(); 62 | end 63 | 64 | -------------------------------------------------------------------------------- /generate_bbox.m: -------------------------------------------------------------------------------- 1 | %% Here is the code to generate the bounding box from the heatmap 2 | % 3 | % to reproduce the ILSVRC localization result, you need to first generate 4 | % the heatmap for each testing image by merging the heatmaps from the 5 | % 10 crops (exactly what the demo code does), then resize the merged heatmap back to the original size of 6 | % that image. Then use this bbox generator to generate the bbox from the resized heatmap. 7 | % 8 | % The source code of the bbox generator is also released. You will probably need 9 | % to install the correct version of OpenCV to compile it. 10 | % 11 | % Special thanks to Hui Li for helping on this code. 
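% Note: dt_box writes all boxes to one line of space-separated "x y width height"
% quadruples; dlmread below reads that line, and the two boxData_formulate lines
% convert each box to [x1 y1 x2 y2] corner format for plotting.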
12 | % 13 | % Bolei Zhou, April 19, 2016 14 | 15 | bbox_threshold = [20, 100, 110]; % parameters for the bbox generator 16 | curParaThreshold = [num2str(bbox_threshold(1)) ' ' num2str(bbox_threshold(2)) ' ' num2str(bbox_threshold(3))]; 17 | curHeatMapFile = 'bboxgenerator/heatmap_6.jpg'; 18 | curImgFile = 'bboxgenerator/sample_6.jpg'; 19 | curBBoxFile = 'bboxgenerator/heatmap_6.txt'; 20 | system(['bboxgenerator/./dt_box ' curHeatMapFile ' ' curParaThreshold ' ' curBBoxFile]); 21 | 22 | boxData = dlmread(curBBoxFile); 23 | boxData_formulate = [boxData(1:4:end)' boxData(2:4:end)' boxData(1:4:end)'+boxData(3:4:end)' boxData(2:4:end)'+boxData(4:4:end)']; 24 | boxData_formulate = [min(boxData_formulate(:,1),boxData_formulate(:,3)),min(boxData_formulate(:,2),boxData_formulate(:,4)),max(boxData_formulate(:,1),boxData_formulate(:,3)),max(boxData_formulate(:,2),boxData_formulate(:,4))]; 25 | 26 | curHeatMap = imread(curHeatMapFile); 27 | %curHeatMap = imresize(curHeatMap,[height_original weight_original]); 28 | 29 | subplot(1,2,1),hold off, imshow(curImgFile); 30 | hold on 31 | for i=1:size(boxData_formulate,1) 32 | curBox = boxData_formulate(i,:); 33 | rectangle('Position',[curBox(1) curBox(2) curBox(3)-curBox(1) curBox(4)-curBox(2)],'EdgeColor',[1 0 0]); 34 | end 35 | subplot(1,2,2),imagesc(curHeatMap); -------------------------------------------------------------------------------- /ilsvrc_2012_mean.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcucurull/CAM-Python/f1f83e8433c0c34d532b4878adc9d7b69948c2d7/ilsvrc_2012_mean.mat -------------------------------------------------------------------------------- /img1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcucurull/CAM-Python/f1f83e8433c0c34d532b4878adc9d7b69948c2d7/img1.jpg -------------------------------------------------------------------------------- /img2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcucurull/CAM-Python/f1f83e8433c0c34d532b4878adc9d7b69948c2d7/img2.jpg -------------------------------------------------------------------------------- /map2jpg.m: -------------------------------------------------------------------------------- 1 | function [img] = map2jpg(imgmap, range, colorMap) 2 | imgmap = double(imgmap); 3 | if(~exist('range', 'var') || isempty(range)), range = [min(imgmap(:)) max(imgmap(:))]; end 4 | 5 | heatmap_gray = mat2gray(imgmap, range); 6 | heatmap_x = gray2ind(heatmap_gray, 256); 7 | heatmap_x(isnan(imgmap)) = 0; 8 | 9 | if(~exist('colorMap', 'var')) 10 | img = ind2rgb(heatmap_x, jet(256)); 11 | else 12 | img = ind2rgb(heatmap_x, eval([colorMap '(256)'])); 13 | end 14 | 15 | -------------------------------------------------------------------------------- /mergeTenCrop.m: -------------------------------------------------------------------------------- 1 | function alignImgMean = mergeTenCrop( CAMmap_crops) 2 | % align the ten crops of CAMmaps back to one image (take a look at caffe 3 | % matlab wrapper about how ten crops are generated) 4 | cropImgSet = zeros([256 256 3 10]); 5 | cropImgSet(:,:,1,:) = CAMmap_crops; 6 | cropImgSet(:,:,2,:) = CAMmap_crops; 7 | cropImgSet(:,:,3,:) = CAMmap_crops; 8 | 9 | 10 | squareSize = 256; 11 | cropSize = size(cropImgSet,1); 12 | indices = [0 squareSize-cropSize] + 1; 13 | 14 | alignImgSet = zeros(256,256,size(cropImgSet,3),'single'); 15 | 16 | 17 | curr = 1; 18 
| for i = indices 19 | for j = indices 20 | 21 | curCrop1 = permute(cropImgSet(:,:,:,curr),[2 1 3 4]); 22 | curCrop2 = permute(cropImgSet(end:-1:1,:,:,curr+5),[2 1 3 4]); 23 | 24 | 25 | alignImgSet(i:i+cropSize-1, j:j+cropSize-1,:,curr) = curCrop1; 26 | alignImgSet(i:i+cropSize-1, j:j+cropSize-1,:, curr+5) = curCrop2; 27 | 28 | curr = curr + 1; 29 | 30 | end 31 | end 32 | center = floor(indices(2) / 2)+1; 33 | curCrop1 = permute(cropImgSet(:,:,:,5),[2 1 3 4]); 34 | curCrop2 = permute(cropImgSet(end:-1:1,:,:,10),[2 1 3 4]); 35 | alignImgSet(center:center+cropSize-1, center:center+cropSize-1,:,5) = curCrop1; 36 | alignImgSet(center:center+cropSize-1, center:center+cropSize-1,:, 10) = curCrop2; 37 | alignImgMean = squeeze(sum(sum(abs(alignImgSet),3),4)); 38 | 39 | end 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /models/categoriesImageNet.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcucurull/CAM-Python/f1f83e8433c0c34d532b4878adc9d7b69948c2d7/models/categoriesImageNet.mat -------------------------------------------------------------------------------- /models/categories_places205.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gcucurull/CAM-Python/f1f83e8433c0c34d532b4878adc9d7b69948c2d7/models/categories_places205.mat -------------------------------------------------------------------------------- /models/deploy_alexnetplusCAM_imagenet.prototxt: -------------------------------------------------------------------------------- 1 | name: "imagenetCNN_alexnetdeep" 2 | input: "data" 3 | input_dim: 10 4 | input_dim: 3 5 | input_dim: 227 6 | input_dim: 227 7 | layers { 8 | name: "conv1" 9 | type: CONVOLUTION 10 | bottom: "data" 11 | top: "conv1" 12 | blobs_lr: 1 13 | blobs_lr: 2 14 | weight_decay: 1 15 | weight_decay: 0 16 | convolution_param { 17 | num_output: 96 18 | kernel_size: 11 19 | stride: 4 20 | weight_filler { 21 | type: "gaussian" 22 | std: 0.01 23 | } 24 | bias_filler { 25 | type: "constant" 26 | value: 0 27 | } 28 | } 29 | } 30 | layers { 31 | name: "relu1" 32 | type: RELU 33 | bottom: "conv1" 34 | top: "conv1" 35 | } 36 | layers { 37 | name: "pool1" 38 | type: POOLING 39 | bottom: "conv1" 40 | top: "pool1" 41 | pooling_param { 42 | pool: MAX 43 | kernel_size: 3 44 | stride: 2 45 | } 46 | } 47 | layers { 48 | name: "norm1" 49 | type: LRN 50 | bottom: "pool1" 51 | top: "norm1" 52 | lrn_param { 53 | local_size: 5 54 | alpha: 0.0001 55 | beta: 0.75 56 | } 57 | } 58 | layers { 59 | name: "conv2" 60 | type: CONVOLUTION 61 | bottom: "norm1" 62 | top: "conv2" 63 | blobs_lr: 1 64 | blobs_lr: 2 65 | weight_decay: 1 66 | weight_decay: 0 67 | convolution_param { 68 | num_output: 256 69 | pad: 2 70 | kernel_size: 5 71 | group: 2 72 | weight_filler { 73 | type: "gaussian" 74 | std: 0.01 75 | } 76 | bias_filler { 77 | type: "constant" 78 | value: 1 79 | } 80 | } 81 | } 82 | layers { 83 | name: "relu2" 84 | type: RELU 85 | bottom: "conv2" 86 | top: "conv2" 87 | } 88 | layers { 89 | name: "pool2" 90 | type: POOLING 91 | bottom: "conv2" 92 | top: "pool2" 93 | pooling_param { 94 | pool: MAX 95 | kernel_size: 3 96 | stride: 2 97 | } 98 | } 99 | layers { 100 | name: "norm2" 101 | type: LRN 102 | bottom: "pool2" 103 | top: "norm2" 104 | lrn_param { 105 | local_size: 5 106 | alpha: 0.0001 107 | beta: 0.75 108 | } 109 | } 110 | layers { 111 | name: "conv3" 112 | type: CONVOLUTION 113 | bottom: "norm2" 114 | 
top: "conv3" 115 | blobs_lr: 1 116 | blobs_lr: 2 117 | weight_decay: 1 118 | weight_decay: 0 119 | convolution_param { 120 | num_output: 384 121 | pad: 1 122 | kernel_size: 3 123 | weight_filler { 124 | type: "gaussian" 125 | std: 0.01 126 | } 127 | bias_filler { 128 | type: "constant" 129 | value: 0 130 | } 131 | } 132 | } 133 | layers { 134 | name: "relu3" 135 | type: RELU 136 | bottom: "conv3" 137 | top: "conv3" 138 | } 139 | layers { 140 | name: "conv4" 141 | type: CONVOLUTION 142 | bottom: "conv3" 143 | top: "conv4" 144 | blobs_lr: 1 145 | blobs_lr: 2 146 | weight_decay: 1 147 | weight_decay: 0 148 | convolution_param { 149 | num_output: 384 150 | pad: 1 151 | kernel_size: 3 152 | group: 2 153 | weight_filler { 154 | type: "gaussian" 155 | std: 0.01 156 | } 157 | bias_filler { 158 | type: "constant" 159 | value: 1 160 | } 161 | } 162 | } 163 | layers { 164 | name: "relu4" 165 | type: RELU 166 | bottom: "conv4" 167 | top: "conv4" 168 | } 169 | layers { 170 | name: "conv5" 171 | type: CONVOLUTION 172 | bottom: "conv4" 173 | top: "conv5" 174 | blobs_lr: 1 175 | blobs_lr: 2 176 | weight_decay: 1 177 | weight_decay: 0 178 | convolution_param { 179 | num_output: 384 180 | pad: 1 181 | kernel_size: 3 182 | group: 2 183 | weight_filler { 184 | type: "gaussian" 185 | std: 0.01 186 | } 187 | bias_filler { 188 | type: "constant" 189 | value: 1 190 | } 191 | } 192 | } 193 | layers { 194 | name: "relu5" 195 | type: RELU 196 | bottom: "conv5" 197 | top: "conv5" 198 | } 199 | layers { 200 | name: "pool5" 201 | type: POOLING 202 | bottom: "conv5" 203 | top: "pool5" 204 | pooling_param { 205 | pool: MAX 206 | kernel_size: 3 207 | stride: 1 208 | } 209 | } 210 | layers { 211 | name: "conv6" 212 | type: CONVOLUTION 213 | bottom: "pool5" 214 | top: "conv6" 215 | blobs_lr: 1 216 | blobs_lr: 2 217 | weight_decay: 1 218 | weight_decay: 0 219 | convolution_param { 220 | num_output: 512 221 | pad: 1 222 | kernel_size: 3 223 | group: 2 224 | weight_filler { 225 | type: "gaussian" 226 | std: 0.01 227 | } 228 | bias_filler { 229 | type: "constant" 230 | value: 1 231 | } 232 | } 233 | } 234 | layers { 235 | name: "relu6" 236 | type: RELU 237 | bottom: "conv6" 238 | top: "conv6" 239 | } 240 | layers { 241 | name: "conv7" 242 | type: CONVOLUTION 243 | bottom: "conv6" 244 | top: "conv7" 245 | blobs_lr: 1 246 | blobs_lr: 2 247 | weight_decay: 1 248 | weight_decay: 0 249 | convolution_param { 250 | num_output: 512 251 | pad: 1 252 | kernel_size: 3 253 | group: 2 254 | weight_filler { 255 | type: "gaussian" 256 | std: 0.01 257 | } 258 | bias_filler { 259 | type: "constant" 260 | value: 1 261 | } 262 | } 263 | } 264 | layers { 265 | name: "relu7" 266 | type: RELU 267 | bottom: "conv7" 268 | top: "conv7" 269 | } 270 | layers { 271 | name: "pool8_global" 272 | type: POOLING 273 | bottom: "conv7" 274 | top: "pool8_global" 275 | pooling_param { 276 | pool: AVE 277 | kernel_size: 11 278 | stride: 11 279 | } 280 | } 281 | layers { 282 | name: "drop8" 283 | type: DROPOUT 284 | bottom: "pool8_global" 285 | top: "pool8_global" 286 | dropout_param { 287 | dropout_ratio: 0.5 288 | } 289 | } 290 | layers { 291 | name: "fc9" 292 | type: INNER_PRODUCT 293 | bottom: "pool8_global" 294 | top: "fc9" 295 | blobs_lr: 1 296 | blobs_lr: 2 297 | weight_decay: 1 298 | weight_decay: 0 299 | inner_product_param { 300 | num_output: 1000 301 | weight_filler { 302 | type: "gaussian" 303 | std: 0.01 304 | } 305 | bias_filler { 306 | type: "constant" 307 | value: 0 308 | } 309 | } 310 | } 311 | layers { 312 | bottom: "fc9" 313 | top: "prob" 314 | 
name: "prob" 315 | type: SOFTMAX 316 | } 317 | 318 | -------------------------------------------------------------------------------- /models/deploy_alexnetplusCAM_places205.prototxt: -------------------------------------------------------------------------------- 1 | name: "placesCNNobjectdiscoveryAverageSumDeepNoDropout" 2 | input: "data" 3 | input_dim: 10 4 | input_dim: 3 5 | input_dim: 227 6 | input_dim: 227 7 | layers { 8 | name: "conv1" 9 | type: CONVOLUTION 10 | bottom: "data" 11 | top: "conv1" 12 | blobs_lr: 1 13 | blobs_lr: 2 14 | weight_decay: 1 15 | weight_decay: 0 16 | convolution_param { 17 | num_output: 96 18 | kernel_size: 11 19 | stride: 4 20 | weight_filler { 21 | type: "gaussian" 22 | std: 0.01 23 | } 24 | bias_filler { 25 | type: "constant" 26 | value: 0 27 | } 28 | } 29 | } 30 | layers { 31 | name: "relu1" 32 | type: RELU 33 | bottom: "conv1" 34 | top: "conv1" 35 | } 36 | layers { 37 | name: "pool1" 38 | type: POOLING 39 | bottom: "conv1" 40 | top: "pool1" 41 | pooling_param { 42 | pool: MAX 43 | kernel_size: 3 44 | stride: 2 45 | } 46 | } 47 | layers { 48 | name: "norm1" 49 | type: LRN 50 | bottom: "pool1" 51 | top: "norm1" 52 | lrn_param { 53 | local_size: 5 54 | alpha: 0.0001 55 | beta: 0.75 56 | } 57 | } 58 | layers { 59 | name: "conv2" 60 | type: CONVOLUTION 61 | bottom: "norm1" 62 | top: "conv2" 63 | blobs_lr: 1 64 | blobs_lr: 2 65 | weight_decay: 1 66 | weight_decay: 0 67 | convolution_param { 68 | num_output: 256 69 | pad: 2 70 | kernel_size: 5 71 | group: 2 72 | weight_filler { 73 | type: "gaussian" 74 | std: 0.01 75 | } 76 | bias_filler { 77 | type: "constant" 78 | value: 1 79 | } 80 | } 81 | } 82 | layers { 83 | name: "relu2" 84 | type: RELU 85 | bottom: "conv2" 86 | top: "conv2" 87 | } 88 | layers { 89 | name: "pool2" 90 | type: POOLING 91 | bottom: "conv2" 92 | top: "pool2" 93 | pooling_param { 94 | pool: MAX 95 | kernel_size: 3 96 | stride: 2 97 | } 98 | } 99 | layers { 100 | name: "norm2" 101 | type: LRN 102 | bottom: "pool2" 103 | top: "norm2" 104 | lrn_param { 105 | local_size: 5 106 | alpha: 0.0001 107 | beta: 0.75 108 | } 109 | } 110 | layers { 111 | name: "conv3" 112 | type: CONVOLUTION 113 | bottom: "norm2" 114 | top: "conv3" 115 | blobs_lr: 1 116 | blobs_lr: 2 117 | weight_decay: 1 118 | weight_decay: 0 119 | convolution_param { 120 | num_output: 384 121 | pad: 1 122 | kernel_size: 3 123 | weight_filler { 124 | type: "gaussian" 125 | std: 0.01 126 | } 127 | bias_filler { 128 | type: "constant" 129 | value: 0 130 | } 131 | } 132 | } 133 | layers { 134 | name: "relu3" 135 | type: RELU 136 | bottom: "conv3" 137 | top: "conv3" 138 | } 139 | layers { 140 | name: "conv4" 141 | type: CONVOLUTION 142 | bottom: "conv3" 143 | top: "conv4" 144 | blobs_lr: 1 145 | blobs_lr: 2 146 | weight_decay: 1 147 | weight_decay: 0 148 | convolution_param { 149 | num_output: 384 150 | pad: 1 151 | kernel_size: 3 152 | group: 2 153 | weight_filler { 154 | type: "gaussian" 155 | std: 0.01 156 | } 157 | bias_filler { 158 | type: "constant" 159 | value: 1 160 | } 161 | } 162 | } 163 | layers { 164 | name: "relu4" 165 | type: RELU 166 | bottom: "conv4" 167 | top: "conv4" 168 | } 169 | layers { 170 | name: "conv5" 171 | type: CONVOLUTION 172 | bottom: "conv4" 173 | top: "conv5" 174 | blobs_lr: 1 175 | blobs_lr: 2 176 | weight_decay: 1 177 | weight_decay: 0 178 | convolution_param { 179 | num_output: 384 180 | pad: 1 181 | kernel_size: 3 182 | group: 2 183 | weight_filler { 184 | type: "gaussian" 185 | std: 0.01 186 | } 187 | bias_filler { 188 | type: "constant" 189 | value: 1 
190 | } 191 | } 192 | } 193 | layers { 194 | name: "relu5" 195 | type: RELU 196 | bottom: "conv5" 197 | top: "conv5" 198 | } 199 | layers { 200 | name: "pool5" 201 | type: POOLING 202 | bottom: "conv5" 203 | top: "pool5" 204 | pooling_param { 205 | pool: MAX 206 | kernel_size: 3 207 | stride: 1 208 | } 209 | } 210 | layers { 211 | name: "conv6" 212 | type: CONVOLUTION 213 | bottom: "pool5" 214 | top: "conv6" 215 | blobs_lr: 1 216 | blobs_lr: 2 217 | weight_decay: 1 218 | weight_decay: 0 219 | convolution_param { 220 | num_output: 512 221 | pad: 1 222 | kernel_size: 3 223 | group: 2 224 | weight_filler { 225 | type: "gaussian" 226 | std: 0.01 227 | } 228 | bias_filler { 229 | type: "constant" 230 | value: 1 231 | } 232 | } 233 | } 234 | layers { 235 | name: "relu6" 236 | type: RELU 237 | bottom: "conv6" 238 | top: "conv6" 239 | } 240 | layers { 241 | name: "conv7" 242 | type: CONVOLUTION 243 | bottom: "conv6" 244 | top: "conv7" 245 | blobs_lr: 1 246 | blobs_lr: 2 247 | weight_decay: 1 248 | weight_decay: 0 249 | convolution_param { 250 | num_output: 512 251 | pad: 1 252 | kernel_size: 3 253 | group: 2 254 | weight_filler { 255 | type: "gaussian" 256 | std: 0.01 257 | } 258 | bias_filler { 259 | type: "constant" 260 | value: 1 261 | } 262 | } 263 | } 264 | layers { 265 | name: "relu7" 266 | type: RELU 267 | bottom: "conv7" 268 | top: "conv7" 269 | } 270 | layers { 271 | name: "pool8_global" 272 | type: POOLING 273 | bottom: "conv7" 274 | top: "pool8_global" 275 | pooling_param { 276 | pool: AVE 277 | kernel_size: 11 278 | stride: 11 279 | } 280 | } 281 | layers { 282 | name: "fc9" 283 | type: INNER_PRODUCT 284 | bottom: "pool8_global" 285 | top: "fc9" 286 | blobs_lr: 1 287 | blobs_lr: 2 288 | weight_decay: 1 289 | weight_decay: 0 290 | inner_product_param { 291 | num_output: 205 292 | weight_filler { 293 | type: "gaussian" 294 | std: 0.01 295 | } 296 | bias_filler { 297 | type: "constant" 298 | value: 0 299 | } 300 | } 301 | } 302 | layers { 303 | name: "prob" 304 | type: SOFTMAX 305 | bottom: "fc9" 306 | top: "prob" 307 | } 308 | 309 | -------------------------------------------------------------------------------- /models/deploy_googlenetCAM.prototxt: -------------------------------------------------------------------------------- 1 | name: "GoogleNet" 2 | input: "data" 3 | input_dim: 10 4 | input_dim: 3 5 | input_dim: 224 6 | input_dim: 224 7 | force_backward: true 8 | layer { 9 | name: "conv1/7x7_s2" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1/7x7_s2" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 0 20 | } 21 | convolution_param { 22 | num_output: 64 23 | pad: 3 24 | kernel_size: 7 25 | stride: 2 26 | weight_filler { 27 | type: "xavier" 28 | std: 0.1 29 | } 30 | bias_filler { 31 | type: "constant" 32 | value: 0.2 33 | } 34 | } 35 | } 36 | layer { 37 | name: "conv1/relu_7x7" 38 | type: "ReLU" 39 | bottom: "conv1/7x7_s2" 40 | top: "conv1/7x7_s2" 41 | } 42 | layer { 43 | name: "pool1/3x3_s2" 44 | type: "Pooling" 45 | bottom: "conv1/7x7_s2" 46 | top: "pool1/3x3_s2" 47 | pooling_param { 48 | pool: MAX 49 | kernel_size: 3 50 | stride: 2 51 | } 52 | } 53 | layer { 54 | name: "pool1/norm1" 55 | type: "LRN" 56 | bottom: "pool1/3x3_s2" 57 | top: "pool1/norm1" 58 | lrn_param { 59 | local_size: 5 60 | alpha: 0.0001 61 | beta: 0.75 62 | } 63 | } 64 | layer { 65 | name: "conv2/3x3_reduce" 66 | type: "Convolution" 67 | bottom: "pool1/norm1" 68 | top: "conv2/3x3_reduce" 69 | param { 70 | lr_mult: 1 71 | decay_mult: 1 72 | } 73 | 
param { 74 | lr_mult: 2 75 | decay_mult: 0 76 | } 77 | convolution_param { 78 | num_output: 64 79 | kernel_size: 1 80 | weight_filler { 81 | type: "xavier" 82 | std: 0.1 83 | } 84 | bias_filler { 85 | type: "constant" 86 | value: 0.2 87 | } 88 | } 89 | } 90 | layer { 91 | name: "conv2/relu_3x3_reduce" 92 | type: "ReLU" 93 | bottom: "conv2/3x3_reduce" 94 | top: "conv2/3x3_reduce" 95 | } 96 | layer { 97 | name: "conv2/3x3" 98 | type: "Convolution" 99 | bottom: "conv2/3x3_reduce" 100 | top: "conv2/3x3" 101 | param { 102 | lr_mult: 1 103 | decay_mult: 1 104 | } 105 | param { 106 | lr_mult: 2 107 | decay_mult: 0 108 | } 109 | convolution_param { 110 | num_output: 192 111 | pad: 1 112 | kernel_size: 3 113 | weight_filler { 114 | type: "xavier" 115 | std: 0.03 116 | } 117 | bias_filler { 118 | type: "constant" 119 | value: 0.2 120 | } 121 | } 122 | } 123 | layer { 124 | name: "conv2/relu_3x3" 125 | type: "ReLU" 126 | bottom: "conv2/3x3" 127 | top: "conv2/3x3" 128 | } 129 | layer { 130 | name: "conv2/norm2" 131 | type: "LRN" 132 | bottom: "conv2/3x3" 133 | top: "conv2/norm2" 134 | lrn_param { 135 | local_size: 5 136 | alpha: 0.0001 137 | beta: 0.75 138 | } 139 | } 140 | layer { 141 | name: "pool2/3x3_s2" 142 | type: "Pooling" 143 | bottom: "conv2/norm2" 144 | top: "pool2/3x3_s2" 145 | pooling_param { 146 | pool: MAX 147 | kernel_size: 3 148 | stride: 2 149 | } 150 | } 151 | layer { 152 | name: "inception_3a/1x1" 153 | type: "Convolution" 154 | bottom: "pool2/3x3_s2" 155 | top: "inception_3a/1x1" 156 | param { 157 | lr_mult: 1 158 | decay_mult: 1 159 | } 160 | param { 161 | lr_mult: 2 162 | decay_mult: 0 163 | } 164 | convolution_param { 165 | num_output: 64 166 | kernel_size: 1 167 | weight_filler { 168 | type: "xavier" 169 | std: 0.03 170 | } 171 | bias_filler { 172 | type: "constant" 173 | value: 0.2 174 | } 175 | } 176 | } 177 | layer { 178 | name: "inception_3a/relu_1x1" 179 | type: "ReLU" 180 | bottom: "inception_3a/1x1" 181 | top: "inception_3a/1x1" 182 | } 183 | layer { 184 | name: "inception_3a/3x3_reduce" 185 | type: "Convolution" 186 | bottom: "pool2/3x3_s2" 187 | top: "inception_3a/3x3_reduce" 188 | param { 189 | lr_mult: 1 190 | decay_mult: 1 191 | } 192 | param { 193 | lr_mult: 2 194 | decay_mult: 0 195 | } 196 | convolution_param { 197 | num_output: 96 198 | kernel_size: 1 199 | weight_filler { 200 | type: "xavier" 201 | std: 0.09 202 | } 203 | bias_filler { 204 | type: "constant" 205 | value: 0.2 206 | } 207 | } 208 | } 209 | layer { 210 | name: "inception_3a/relu_3x3_reduce" 211 | type: "ReLU" 212 | bottom: "inception_3a/3x3_reduce" 213 | top: "inception_3a/3x3_reduce" 214 | } 215 | layer { 216 | name: "inception_3a/3x3" 217 | type: "Convolution" 218 | bottom: "inception_3a/3x3_reduce" 219 | top: "inception_3a/3x3" 220 | param { 221 | lr_mult: 1 222 | decay_mult: 1 223 | } 224 | param { 225 | lr_mult: 2 226 | decay_mult: 0 227 | } 228 | convolution_param { 229 | num_output: 128 230 | pad: 1 231 | kernel_size: 3 232 | weight_filler { 233 | type: "xavier" 234 | std: 0.03 235 | } 236 | bias_filler { 237 | type: "constant" 238 | value: 0.2 239 | } 240 | } 241 | } 242 | layer { 243 | name: "inception_3a/relu_3x3" 244 | type: "ReLU" 245 | bottom: "inception_3a/3x3" 246 | top: "inception_3a/3x3" 247 | } 248 | layer { 249 | name: "inception_3a/5x5_reduce" 250 | type: "Convolution" 251 | bottom: "pool2/3x3_s2" 252 | top: "inception_3a/5x5_reduce" 253 | param { 254 | lr_mult: 1 255 | decay_mult: 1 256 | } 257 | param { 258 | lr_mult: 2 259 | decay_mult: 0 260 | } 261 | convolution_param { 
262 | num_output: 16 263 | kernel_size: 1 264 | weight_filler { 265 | type: "xavier" 266 | std: 0.2 267 | } 268 | bias_filler { 269 | type: "constant" 270 | value: 0.2 271 | } 272 | } 273 | } 274 | layer { 275 | name: "inception_3a/relu_5x5_reduce" 276 | type: "ReLU" 277 | bottom: "inception_3a/5x5_reduce" 278 | top: "inception_3a/5x5_reduce" 279 | } 280 | layer { 281 | name: "inception_3a/5x5" 282 | type: "Convolution" 283 | bottom: "inception_3a/5x5_reduce" 284 | top: "inception_3a/5x5" 285 | param { 286 | lr_mult: 1 287 | decay_mult: 1 288 | } 289 | param { 290 | lr_mult: 2 291 | decay_mult: 0 292 | } 293 | convolution_param { 294 | num_output: 32 295 | pad: 2 296 | kernel_size: 5 297 | weight_filler { 298 | type: "xavier" 299 | std: 0.03 300 | } 301 | bias_filler { 302 | type: "constant" 303 | value: 0.2 304 | } 305 | } 306 | } 307 | layer { 308 | name: "inception_3a/relu_5x5" 309 | type: "ReLU" 310 | bottom: "inception_3a/5x5" 311 | top: "inception_3a/5x5" 312 | } 313 | layer { 314 | name: "inception_3a/pool" 315 | type: "Pooling" 316 | bottom: "pool2/3x3_s2" 317 | top: "inception_3a/pool" 318 | pooling_param { 319 | pool: MAX 320 | kernel_size: 3 321 | stride: 1 322 | pad: 1 323 | } 324 | } 325 | layer { 326 | name: "inception_3a/pool_proj" 327 | type: "Convolution" 328 | bottom: "inception_3a/pool" 329 | top: "inception_3a/pool_proj" 330 | param { 331 | lr_mult: 1 332 | decay_mult: 1 333 | } 334 | param { 335 | lr_mult: 2 336 | decay_mult: 0 337 | } 338 | convolution_param { 339 | num_output: 32 340 | kernel_size: 1 341 | weight_filler { 342 | type: "xavier" 343 | std: 0.1 344 | } 345 | bias_filler { 346 | type: "constant" 347 | value: 0.2 348 | } 349 | } 350 | } 351 | layer { 352 | name: "inception_3a/relu_pool_proj" 353 | type: "ReLU" 354 | bottom: "inception_3a/pool_proj" 355 | top: "inception_3a/pool_proj" 356 | } 357 | layer { 358 | name: "inception_3a/output" 359 | type: "Concat" 360 | bottom: "inception_3a/1x1" 361 | bottom: "inception_3a/3x3" 362 | bottom: "inception_3a/5x5" 363 | bottom: "inception_3a/pool_proj" 364 | top: "inception_3a/output" 365 | } 366 | layer { 367 | name: "inception_3b/1x1" 368 | type: "Convolution" 369 | bottom: "inception_3a/output" 370 | top: "inception_3b/1x1" 371 | param { 372 | lr_mult: 1 373 | decay_mult: 1 374 | } 375 | param { 376 | lr_mult: 2 377 | decay_mult: 0 378 | } 379 | convolution_param { 380 | num_output: 128 381 | kernel_size: 1 382 | weight_filler { 383 | type: "xavier" 384 | std: 0.03 385 | } 386 | bias_filler { 387 | type: "constant" 388 | value: 0.2 389 | } 390 | } 391 | } 392 | layer { 393 | name: "inception_3b/relu_1x1" 394 | type: "ReLU" 395 | bottom: "inception_3b/1x1" 396 | top: "inception_3b/1x1" 397 | } 398 | layer { 399 | name: "inception_3b/3x3_reduce" 400 | type: "Convolution" 401 | bottom: "inception_3a/output" 402 | top: "inception_3b/3x3_reduce" 403 | param { 404 | lr_mult: 1 405 | decay_mult: 1 406 | } 407 | param { 408 | lr_mult: 2 409 | decay_mult: 0 410 | } 411 | convolution_param { 412 | num_output: 128 413 | kernel_size: 1 414 | weight_filler { 415 | type: "xavier" 416 | std: 0.09 417 | } 418 | bias_filler { 419 | type: "constant" 420 | value: 0.2 421 | } 422 | } 423 | } 424 | layer { 425 | name: "inception_3b/relu_3x3_reduce" 426 | type: "ReLU" 427 | bottom: "inception_3b/3x3_reduce" 428 | top: "inception_3b/3x3_reduce" 429 | } 430 | layer { 431 | name: "inception_3b/3x3" 432 | type: "Convolution" 433 | bottom: "inception_3b/3x3_reduce" 434 | top: "inception_3b/3x3" 435 | param { 436 | lr_mult: 1 437 | 
decay_mult: 1 438 | } 439 | param { 440 | lr_mult: 2 441 | decay_mult: 0 442 | } 443 | convolution_param { 444 | num_output: 192 445 | pad: 1 446 | kernel_size: 3 447 | weight_filler { 448 | type: "xavier" 449 | std: 0.03 450 | } 451 | bias_filler { 452 | type: "constant" 453 | value: 0.2 454 | } 455 | } 456 | } 457 | layer { 458 | name: "inception_3b/relu_3x3" 459 | type: "ReLU" 460 | bottom: "inception_3b/3x3" 461 | top: "inception_3b/3x3" 462 | } 463 | layer { 464 | name: "inception_3b/5x5_reduce" 465 | type: "Convolution" 466 | bottom: "inception_3a/output" 467 | top: "inception_3b/5x5_reduce" 468 | param { 469 | lr_mult: 1 470 | decay_mult: 1 471 | } 472 | param { 473 | lr_mult: 2 474 | decay_mult: 0 475 | } 476 | convolution_param { 477 | num_output: 32 478 | kernel_size: 1 479 | weight_filler { 480 | type: "xavier" 481 | std: 0.2 482 | } 483 | bias_filler { 484 | type: "constant" 485 | value: 0.2 486 | } 487 | } 488 | } 489 | layer { 490 | name: "inception_3b/relu_5x5_reduce" 491 | type: "ReLU" 492 | bottom: "inception_3b/5x5_reduce" 493 | top: "inception_3b/5x5_reduce" 494 | } 495 | layer { 496 | name: "inception_3b/5x5" 497 | type: "Convolution" 498 | bottom: "inception_3b/5x5_reduce" 499 | top: "inception_3b/5x5" 500 | param { 501 | lr_mult: 1 502 | decay_mult: 1 503 | } 504 | param { 505 | lr_mult: 2 506 | decay_mult: 0 507 | } 508 | convolution_param { 509 | num_output: 96 510 | pad: 2 511 | kernel_size: 5 512 | weight_filler { 513 | type: "xavier" 514 | std: 0.03 515 | } 516 | bias_filler { 517 | type: "constant" 518 | value: 0.2 519 | } 520 | } 521 | } 522 | layer { 523 | name: "inception_3b/relu_5x5" 524 | type: "ReLU" 525 | bottom: "inception_3b/5x5" 526 | top: "inception_3b/5x5" 527 | } 528 | layer { 529 | name: "inception_3b/pool" 530 | type: "Pooling" 531 | bottom: "inception_3a/output" 532 | top: "inception_3b/pool" 533 | pooling_param { 534 | pool: MAX 535 | kernel_size: 3 536 | stride: 1 537 | pad: 1 538 | } 539 | } 540 | layer { 541 | name: "inception_3b/pool_proj" 542 | type: "Convolution" 543 | bottom: "inception_3b/pool" 544 | top: "inception_3b/pool_proj" 545 | param { 546 | lr_mult: 1 547 | decay_mult: 1 548 | } 549 | param { 550 | lr_mult: 2 551 | decay_mult: 0 552 | } 553 | convolution_param { 554 | num_output: 64 555 | kernel_size: 1 556 | weight_filler { 557 | type: "xavier" 558 | std: 0.1 559 | } 560 | bias_filler { 561 | type: "constant" 562 | value: 0.2 563 | } 564 | } 565 | } 566 | layer { 567 | name: "inception_3b/relu_pool_proj" 568 | type: "ReLU" 569 | bottom: "inception_3b/pool_proj" 570 | top: "inception_3b/pool_proj" 571 | } 572 | layer { 573 | name: "inception_3b/output" 574 | type: "Concat" 575 | bottom: "inception_3b/1x1" 576 | bottom: "inception_3b/3x3" 577 | bottom: "inception_3b/5x5" 578 | bottom: "inception_3b/pool_proj" 579 | top: "inception_3b/output" 580 | } 581 | layer { 582 | name: "pool3/3x3_s2" 583 | type: "Pooling" 584 | bottom: "inception_3b/output" 585 | top: "pool3/3x3_s2" 586 | pooling_param { 587 | pool: MAX 588 | kernel_size: 3 589 | stride: 2 590 | } 591 | } 592 | layer { 593 | name: "inception_4a/1x1" 594 | type: "Convolution" 595 | bottom: "pool3/3x3_s2" 596 | top: "inception_4a/1x1" 597 | param { 598 | lr_mult: 1 599 | decay_mult: 1 600 | } 601 | param { 602 | lr_mult: 2 603 | decay_mult: 0 604 | } 605 | convolution_param { 606 | num_output: 192 607 | kernel_size: 1 608 | weight_filler { 609 | type: "xavier" 610 | std: 0.03 611 | } 612 | bias_filler { 613 | type: "constant" 614 | value: 0.2 615 | } 616 | } 617 | } 618 | 
layer { 619 | name: "inception_4a/relu_1x1" 620 | type: "ReLU" 621 | bottom: "inception_4a/1x1" 622 | top: "inception_4a/1x1" 623 | } 624 | layer { 625 | name: "inception_4a/3x3_reduce" 626 | type: "Convolution" 627 | bottom: "pool3/3x3_s2" 628 | top: "inception_4a/3x3_reduce" 629 | param { 630 | lr_mult: 1 631 | decay_mult: 1 632 | } 633 | param { 634 | lr_mult: 2 635 | decay_mult: 0 636 | } 637 | convolution_param { 638 | num_output: 96 639 | kernel_size: 1 640 | weight_filler { 641 | type: "xavier" 642 | std: 0.09 643 | } 644 | bias_filler { 645 | type: "constant" 646 | value: 0.2 647 | } 648 | } 649 | } 650 | layer { 651 | name: "inception_4a/relu_3x3_reduce" 652 | type: "ReLU" 653 | bottom: "inception_4a/3x3_reduce" 654 | top: "inception_4a/3x3_reduce" 655 | } 656 | layer { 657 | name: "inception_4a/3x3" 658 | type: "Convolution" 659 | bottom: "inception_4a/3x3_reduce" 660 | top: "inception_4a/3x3" 661 | param { 662 | lr_mult: 1 663 | decay_mult: 1 664 | } 665 | param { 666 | lr_mult: 2 667 | decay_mult: 0 668 | } 669 | convolution_param { 670 | num_output: 208 671 | pad: 1 672 | kernel_size: 3 673 | weight_filler { 674 | type: "xavier" 675 | std: 0.03 676 | } 677 | bias_filler { 678 | type: "constant" 679 | value: 0.2 680 | } 681 | } 682 | } 683 | layer { 684 | name: "inception_4a/relu_3x3" 685 | type: "ReLU" 686 | bottom: "inception_4a/3x3" 687 | top: "inception_4a/3x3" 688 | } 689 | layer { 690 | name: "inception_4a/5x5_reduce" 691 | type: "Convolution" 692 | bottom: "pool3/3x3_s2" 693 | top: "inception_4a/5x5_reduce" 694 | param { 695 | lr_mult: 1 696 | decay_mult: 1 697 | } 698 | param { 699 | lr_mult: 2 700 | decay_mult: 0 701 | } 702 | convolution_param { 703 | num_output: 16 704 | kernel_size: 1 705 | weight_filler { 706 | type: "xavier" 707 | std: 0.2 708 | } 709 | bias_filler { 710 | type: "constant" 711 | value: 0.2 712 | } 713 | } 714 | } 715 | layer { 716 | name: "inception_4a/relu_5x5_reduce" 717 | type: "ReLU" 718 | bottom: "inception_4a/5x5_reduce" 719 | top: "inception_4a/5x5_reduce" 720 | } 721 | layer { 722 | name: "inception_4a/5x5" 723 | type: "Convolution" 724 | bottom: "inception_4a/5x5_reduce" 725 | top: "inception_4a/5x5" 726 | param { 727 | lr_mult: 1 728 | decay_mult: 1 729 | } 730 | param { 731 | lr_mult: 2 732 | decay_mult: 0 733 | } 734 | convolution_param { 735 | num_output: 48 736 | pad: 2 737 | kernel_size: 5 738 | weight_filler { 739 | type: "xavier" 740 | std: 0.03 741 | } 742 | bias_filler { 743 | type: "constant" 744 | value: 0.2 745 | } 746 | } 747 | } 748 | layer { 749 | name: "inception_4a/relu_5x5" 750 | type: "ReLU" 751 | bottom: "inception_4a/5x5" 752 | top: "inception_4a/5x5" 753 | } 754 | layer { 755 | name: "inception_4a/pool" 756 | type: "Pooling" 757 | bottom: "pool3/3x3_s2" 758 | top: "inception_4a/pool" 759 | pooling_param { 760 | pool: MAX 761 | kernel_size: 3 762 | stride: 1 763 | pad: 1 764 | } 765 | } 766 | layer { 767 | name: "inception_4a/pool_proj" 768 | type: "Convolution" 769 | bottom: "inception_4a/pool" 770 | top: "inception_4a/pool_proj" 771 | param { 772 | lr_mult: 1 773 | decay_mult: 1 774 | } 775 | param { 776 | lr_mult: 2 777 | decay_mult: 0 778 | } 779 | convolution_param { 780 | num_output: 64 781 | kernel_size: 1 782 | weight_filler { 783 | type: "xavier" 784 | std: 0.1 785 | } 786 | bias_filler { 787 | type: "constant" 788 | value: 0.2 789 | } 790 | } 791 | } 792 | layer { 793 | name: "inception_4a/relu_pool_proj" 794 | type: "ReLU" 795 | bottom: "inception_4a/pool_proj" 796 | top: "inception_4a/pool_proj" 797 | } 
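# [Annotation; not part of the original prototxt.] The Concat layer below
# stitches the four inception_4a branches (1x1: 192, 3x3: 208, 5x5: 48,
# pool_proj: 64 channels) along the channel axis into a 512-channel output.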
798 | layer { 799 | name: "inception_4a/output" 800 | type: "Concat" 801 | bottom: "inception_4a/1x1" 802 | bottom: "inception_4a/3x3" 803 | bottom: "inception_4a/5x5" 804 | bottom: "inception_4a/pool_proj" 805 | top: "inception_4a/output" 806 | } 807 | layer { 808 | name: "inception_4b/1x1" 809 | type: "Convolution" 810 | bottom: "inception_4a/output" 811 | top: "inception_4b/1x1" 812 | param { 813 | lr_mult: 1 814 | decay_mult: 1 815 | } 816 | param { 817 | lr_mult: 2 818 | decay_mult: 0 819 | } 820 | convolution_param { 821 | num_output: 160 822 | kernel_size: 1 823 | weight_filler { 824 | type: "xavier" 825 | std: 0.03 826 | } 827 | bias_filler { 828 | type: "constant" 829 | value: 0.2 830 | } 831 | } 832 | } 833 | layer { 834 | name: "inception_4b/relu_1x1" 835 | type: "ReLU" 836 | bottom: "inception_4b/1x1" 837 | top: "inception_4b/1x1" 838 | } 839 | layer { 840 | name: "inception_4b/3x3_reduce" 841 | type: "Convolution" 842 | bottom: "inception_4a/output" 843 | top: "inception_4b/3x3_reduce" 844 | param { 845 | lr_mult: 1 846 | decay_mult: 1 847 | } 848 | param { 849 | lr_mult: 2 850 | decay_mult: 0 851 | } 852 | convolution_param { 853 | num_output: 112 854 | kernel_size: 1 855 | weight_filler { 856 | type: "xavier" 857 | std: 0.09 858 | } 859 | bias_filler { 860 | type: "constant" 861 | value: 0.2 862 | } 863 | } 864 | } 865 | layer { 866 | name: "inception_4b/relu_3x3_reduce" 867 | type: "ReLU" 868 | bottom: "inception_4b/3x3_reduce" 869 | top: "inception_4b/3x3_reduce" 870 | } 871 | layer { 872 | name: "inception_4b/3x3" 873 | type: "Convolution" 874 | bottom: "inception_4b/3x3_reduce" 875 | top: "inception_4b/3x3" 876 | param { 877 | lr_mult: 1 878 | decay_mult: 1 879 | } 880 | param { 881 | lr_mult: 2 882 | decay_mult: 0 883 | } 884 | convolution_param { 885 | num_output: 224 886 | pad: 1 887 | kernel_size: 3 888 | weight_filler { 889 | type: "xavier" 890 | std: 0.03 891 | } 892 | bias_filler { 893 | type: "constant" 894 | value: 0.2 895 | } 896 | } 897 | } 898 | layer { 899 | name: "inception_4b/relu_3x3" 900 | type: "ReLU" 901 | bottom: "inception_4b/3x3" 902 | top: "inception_4b/3x3" 903 | } 904 | layer { 905 | name: "inception_4b/5x5_reduce" 906 | type: "Convolution" 907 | bottom: "inception_4a/output" 908 | top: "inception_4b/5x5_reduce" 909 | param { 910 | lr_mult: 1 911 | decay_mult: 1 912 | } 913 | param { 914 | lr_mult: 2 915 | decay_mult: 0 916 | } 917 | convolution_param { 918 | num_output: 24 919 | kernel_size: 1 920 | weight_filler { 921 | type: "xavier" 922 | std: 0.2 923 | } 924 | bias_filler { 925 | type: "constant" 926 | value: 0.2 927 | } 928 | } 929 | } 930 | layer { 931 | name: "inception_4b/relu_5x5_reduce" 932 | type: "ReLU" 933 | bottom: "inception_4b/5x5_reduce" 934 | top: "inception_4b/5x5_reduce" 935 | } 936 | layer { 937 | name: "inception_4b/5x5" 938 | type: "Convolution" 939 | bottom: "inception_4b/5x5_reduce" 940 | top: "inception_4b/5x5" 941 | param { 942 | lr_mult: 1 943 | decay_mult: 1 944 | } 945 | param { 946 | lr_mult: 2 947 | decay_mult: 0 948 | } 949 | convolution_param { 950 | num_output: 64 951 | pad: 2 952 | kernel_size: 5 953 | weight_filler { 954 | type: "xavier" 955 | std: 0.03 956 | } 957 | bias_filler { 958 | type: "constant" 959 | value: 0.2 960 | } 961 | } 962 | } 963 | layer { 964 | name: "inception_4b/relu_5x5" 965 | type: "ReLU" 966 | bottom: "inception_4b/5x5" 967 | top: "inception_4b/5x5" 968 | } 969 | layer { 970 | name: "inception_4b/pool" 971 | type: "Pooling" 972 | bottom: "inception_4a/output" 973 | top: 
"inception_4b/pool" 974 | pooling_param { 975 | pool: MAX 976 | kernel_size: 3 977 | stride: 1 978 | pad: 1 979 | } 980 | } 981 | layer { 982 | name: "inception_4b/pool_proj" 983 | type: "Convolution" 984 | bottom: "inception_4b/pool" 985 | top: "inception_4b/pool_proj" 986 | param { 987 | lr_mult: 1 988 | decay_mult: 1 989 | } 990 | param { 991 | lr_mult: 2 992 | decay_mult: 0 993 | } 994 | convolution_param { 995 | num_output: 64 996 | kernel_size: 1 997 | weight_filler { 998 | type: "xavier" 999 | std: 0.1 1000 | } 1001 | bias_filler { 1002 | type: "constant" 1003 | value: 0.2 1004 | } 1005 | } 1006 | } 1007 | layer { 1008 | name: "inception_4b/relu_pool_proj" 1009 | type: "ReLU" 1010 | bottom: "inception_4b/pool_proj" 1011 | top: "inception_4b/pool_proj" 1012 | } 1013 | layer { 1014 | name: "inception_4b/output" 1015 | type: "Concat" 1016 | bottom: "inception_4b/1x1" 1017 | bottom: "inception_4b/3x3" 1018 | bottom: "inception_4b/5x5" 1019 | bottom: "inception_4b/pool_proj" 1020 | top: "inception_4b/output" 1021 | } 1022 | layer { 1023 | name: "inception_4c/1x1" 1024 | type: "Convolution" 1025 | bottom: "inception_4b/output" 1026 | top: "inception_4c/1x1" 1027 | param { 1028 | lr_mult: 1 1029 | decay_mult: 1 1030 | } 1031 | param { 1032 | lr_mult: 2 1033 | decay_mult: 0 1034 | } 1035 | convolution_param { 1036 | num_output: 128 1037 | kernel_size: 1 1038 | weight_filler { 1039 | type: "xavier" 1040 | std: 0.03 1041 | } 1042 | bias_filler { 1043 | type: "constant" 1044 | value: 0.2 1045 | } 1046 | } 1047 | } 1048 | layer { 1049 | name: "inception_4c/relu_1x1" 1050 | type: "ReLU" 1051 | bottom: "inception_4c/1x1" 1052 | top: "inception_4c/1x1" 1053 | } 1054 | layer { 1055 | name: "inception_4c/3x3_reduce" 1056 | type: "Convolution" 1057 | bottom: "inception_4b/output" 1058 | top: "inception_4c/3x3_reduce" 1059 | param { 1060 | lr_mult: 1 1061 | decay_mult: 1 1062 | } 1063 | param { 1064 | lr_mult: 2 1065 | decay_mult: 0 1066 | } 1067 | convolution_param { 1068 | num_output: 128 1069 | kernel_size: 1 1070 | weight_filler { 1071 | type: "xavier" 1072 | std: 0.09 1073 | } 1074 | bias_filler { 1075 | type: "constant" 1076 | value: 0.2 1077 | } 1078 | } 1079 | } 1080 | layer { 1081 | name: "inception_4c/relu_3x3_reduce" 1082 | type: "ReLU" 1083 | bottom: "inception_4c/3x3_reduce" 1084 | top: "inception_4c/3x3_reduce" 1085 | } 1086 | layer { 1087 | name: "inception_4c/3x3" 1088 | type: "Convolution" 1089 | bottom: "inception_4c/3x3_reduce" 1090 | top: "inception_4c/3x3" 1091 | param { 1092 | lr_mult: 1 1093 | decay_mult: 1 1094 | } 1095 | param { 1096 | lr_mult: 2 1097 | decay_mult: 0 1098 | } 1099 | convolution_param { 1100 | num_output: 256 1101 | pad: 1 1102 | kernel_size: 3 1103 | weight_filler { 1104 | type: "xavier" 1105 | std: 0.03 1106 | } 1107 | bias_filler { 1108 | type: "constant" 1109 | value: 0.2 1110 | } 1111 | } 1112 | } 1113 | layer { 1114 | name: "inception_4c/relu_3x3" 1115 | type: "ReLU" 1116 | bottom: "inception_4c/3x3" 1117 | top: "inception_4c/3x3" 1118 | } 1119 | layer { 1120 | name: "inception_4c/5x5_reduce" 1121 | type: "Convolution" 1122 | bottom: "inception_4b/output" 1123 | top: "inception_4c/5x5_reduce" 1124 | param { 1125 | lr_mult: 1 1126 | decay_mult: 1 1127 | } 1128 | param { 1129 | lr_mult: 2 1130 | decay_mult: 0 1131 | } 1132 | convolution_param { 1133 | num_output: 24 1134 | kernel_size: 1 1135 | weight_filler { 1136 | type: "xavier" 1137 | std: 0.2 1138 | } 1139 | bias_filler { 1140 | type: "constant" 1141 | value: 0.2 1142 | } 1143 | } 1144 | } 1145 | layer 
{ 1146 | name: "inception_4c/relu_5x5_reduce" 1147 | type: "ReLU" 1148 | bottom: "inception_4c/5x5_reduce" 1149 | top: "inception_4c/5x5_reduce" 1150 | } 1151 | layer { 1152 | name: "inception_4c/5x5" 1153 | type: "Convolution" 1154 | bottom: "inception_4c/5x5_reduce" 1155 | top: "inception_4c/5x5" 1156 | param { 1157 | lr_mult: 1 1158 | decay_mult: 1 1159 | } 1160 | param { 1161 | lr_mult: 2 1162 | decay_mult: 0 1163 | } 1164 | convolution_param { 1165 | num_output: 64 1166 | pad: 2 1167 | kernel_size: 5 1168 | weight_filler { 1169 | type: "xavier" 1170 | std: 0.03 1171 | } 1172 | bias_filler { 1173 | type: "constant" 1174 | value: 0.2 1175 | } 1176 | } 1177 | } 1178 | layer { 1179 | name: "inception_4c/relu_5x5" 1180 | type: "ReLU" 1181 | bottom: "inception_4c/5x5" 1182 | top: "inception_4c/5x5" 1183 | } 1184 | layer { 1185 | name: "inception_4c/pool" 1186 | type: "Pooling" 1187 | bottom: "inception_4b/output" 1188 | top: "inception_4c/pool" 1189 | pooling_param { 1190 | pool: MAX 1191 | kernel_size: 3 1192 | stride: 1 1193 | pad: 1 1194 | } 1195 | } 1196 | layer { 1197 | name: "inception_4c/pool_proj" 1198 | type: "Convolution" 1199 | bottom: "inception_4c/pool" 1200 | top: "inception_4c/pool_proj" 1201 | param { 1202 | lr_mult: 1 1203 | decay_mult: 1 1204 | } 1205 | param { 1206 | lr_mult: 2 1207 | decay_mult: 0 1208 | } 1209 | convolution_param { 1210 | num_output: 64 1211 | kernel_size: 1 1212 | weight_filler { 1213 | type: "xavier" 1214 | std: 0.1 1215 | } 1216 | bias_filler { 1217 | type: "constant" 1218 | value: 0.2 1219 | } 1220 | } 1221 | } 1222 | layer { 1223 | name: "inception_4c/relu_pool_proj" 1224 | type: "ReLU" 1225 | bottom: "inception_4c/pool_proj" 1226 | top: "inception_4c/pool_proj" 1227 | } 1228 | layer { 1229 | name: "inception_4c/output" 1230 | type: "Concat" 1231 | bottom: "inception_4c/1x1" 1232 | bottom: "inception_4c/3x3" 1233 | bottom: "inception_4c/5x5" 1234 | bottom: "inception_4c/pool_proj" 1235 | top: "inception_4c/output" 1236 | } 1237 | layer { 1238 | name: "inception_4d/1x1" 1239 | type: "Convolution" 1240 | bottom: "inception_4c/output" 1241 | top: "inception_4d/1x1" 1242 | param { 1243 | lr_mult: 1 1244 | decay_mult: 1 1245 | } 1246 | param { 1247 | lr_mult: 2 1248 | decay_mult: 0 1249 | } 1250 | convolution_param { 1251 | num_output: 112 1252 | kernel_size: 1 1253 | weight_filler { 1254 | type: "xavier" 1255 | std: 0.03 1256 | } 1257 | bias_filler { 1258 | type: "constant" 1259 | value: 0.2 1260 | } 1261 | } 1262 | } 1263 | layer { 1264 | name: "inception_4d/relu_1x1" 1265 | type: "ReLU" 1266 | bottom: "inception_4d/1x1" 1267 | top: "inception_4d/1x1" 1268 | } 1269 | layer { 1270 | name: "inception_4d/3x3_reduce" 1271 | type: "Convolution" 1272 | bottom: "inception_4c/output" 1273 | top: "inception_4d/3x3_reduce" 1274 | param { 1275 | lr_mult: 1 1276 | decay_mult: 1 1277 | } 1278 | param { 1279 | lr_mult: 2 1280 | decay_mult: 0 1281 | } 1282 | convolution_param { 1283 | num_output: 144 1284 | kernel_size: 1 1285 | weight_filler { 1286 | type: "xavier" 1287 | std: 0.09 1288 | } 1289 | bias_filler { 1290 | type: "constant" 1291 | value: 0.2 1292 | } 1293 | } 1294 | } 1295 | layer { 1296 | name: "inception_4d/relu_3x3_reduce" 1297 | type: "ReLU" 1298 | bottom: "inception_4d/3x3_reduce" 1299 | top: "inception_4d/3x3_reduce" 1300 | } 1301 | layer { 1302 | name: "inception_4d/3x3" 1303 | type: "Convolution" 1304 | bottom: "inception_4d/3x3_reduce" 1305 | top: "inception_4d/3x3" 1306 | param { 1307 | lr_mult: 1 1308 | decay_mult: 1 1309 | } 1310 | param { 
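# [Annotation; not part of the original prototxt.] By Caffe convention the
# first param block above configures the weights (lr_mult: 1, decay_mult: 1)
# and this second block the bias (lr_mult: 2, decay_mult: 0), i.e. biases
# learn at twice the base rate and are exempt from weight decay.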
1311 | lr_mult: 2 1312 | decay_mult: 0 1313 | } 1314 | convolution_param { 1315 | num_output: 288 1316 | pad: 1 1317 | kernel_size: 3 1318 | weight_filler { 1319 | type: "xavier" 1320 | std: 0.03 1321 | } 1322 | bias_filler { 1323 | type: "constant" 1324 | value: 0.2 1325 | } 1326 | } 1327 | } 1328 | layer { 1329 | name: "inception_4d/relu_3x3" 1330 | type: "ReLU" 1331 | bottom: "inception_4d/3x3" 1332 | top: "inception_4d/3x3" 1333 | } 1334 | layer { 1335 | name: "inception_4d/5x5_reduce" 1336 | type: "Convolution" 1337 | bottom: "inception_4c/output" 1338 | top: "inception_4d/5x5_reduce" 1339 | param { 1340 | lr_mult: 1 1341 | decay_mult: 1 1342 | } 1343 | param { 1344 | lr_mult: 2 1345 | decay_mult: 0 1346 | } 1347 | convolution_param { 1348 | num_output: 32 1349 | kernel_size: 1 1350 | weight_filler { 1351 | type: "xavier" 1352 | std: 0.2 1353 | } 1354 | bias_filler { 1355 | type: "constant" 1356 | value: 0.2 1357 | } 1358 | } 1359 | } 1360 | layer { 1361 | name: "inception_4d/relu_5x5_reduce" 1362 | type: "ReLU" 1363 | bottom: "inception_4d/5x5_reduce" 1364 | top: "inception_4d/5x5_reduce" 1365 | } 1366 | layer { 1367 | name: "inception_4d/5x5" 1368 | type: "Convolution" 1369 | bottom: "inception_4d/5x5_reduce" 1370 | top: "inception_4d/5x5" 1371 | param { 1372 | lr_mult: 1 1373 | decay_mult: 1 1374 | } 1375 | param { 1376 | lr_mult: 2 1377 | decay_mult: 0 1378 | } 1379 | convolution_param { 1380 | num_output: 64 1381 | pad: 2 1382 | kernel_size: 5 1383 | weight_filler { 1384 | type: "xavier" 1385 | std: 0.03 1386 | } 1387 | bias_filler { 1388 | type: "constant" 1389 | value: 0.2 1390 | } 1391 | } 1392 | } 1393 | layer { 1394 | name: "inception_4d/relu_5x5" 1395 | type: "ReLU" 1396 | bottom: "inception_4d/5x5" 1397 | top: "inception_4d/5x5" 1398 | } 1399 | layer { 1400 | name: "inception_4d/pool" 1401 | type: "Pooling" 1402 | bottom: "inception_4c/output" 1403 | top: "inception_4d/pool" 1404 | pooling_param { 1405 | pool: MAX 1406 | kernel_size: 3 1407 | stride: 1 1408 | pad: 1 1409 | } 1410 | } 1411 | layer { 1412 | name: "inception_4d/pool_proj" 1413 | type: "Convolution" 1414 | bottom: "inception_4d/pool" 1415 | top: "inception_4d/pool_proj" 1416 | param { 1417 | lr_mult: 1 1418 | decay_mult: 1 1419 | } 1420 | param { 1421 | lr_mult: 2 1422 | decay_mult: 0 1423 | } 1424 | convolution_param { 1425 | num_output: 64 1426 | kernel_size: 1 1427 | weight_filler { 1428 | type: "xavier" 1429 | std: 0.1 1430 | } 1431 | bias_filler { 1432 | type: "constant" 1433 | value: 0.2 1434 | } 1435 | } 1436 | } 1437 | layer { 1438 | name: "inception_4d/relu_pool_proj" 1439 | type: "ReLU" 1440 | bottom: "inception_4d/pool_proj" 1441 | top: "inception_4d/pool_proj" 1442 | } 1443 | layer { 1444 | name: "inception_4d/output" 1445 | type: "Concat" 1446 | bottom: "inception_4d/1x1" 1447 | bottom: "inception_4d/3x3" 1448 | bottom: "inception_4d/5x5" 1449 | bottom: "inception_4d/pool_proj" 1450 | top: "inception_4d/output" 1451 | } 1452 | layer { 1453 | name: "inception_4e/1x1" 1454 | type: "Convolution" 1455 | bottom: "inception_4d/output" 1456 | top: "inception_4e/1x1" 1457 | param { 1458 | lr_mult: 1 1459 | decay_mult: 1 1460 | } 1461 | param { 1462 | lr_mult: 2 1463 | decay_mult: 0 1464 | } 1465 | convolution_param { 1466 | num_output: 256 1467 | kernel_size: 1 1468 | weight_filler { 1469 | type: "xavier" 1470 | std: 0.03 1471 | } 1472 | bias_filler { 1473 | type: "constant" 1474 | value: 0.2 1475 | } 1476 | } 1477 | } 1478 | layer { 1479 | name: "inception_4e/relu_1x1" 1480 | type: "ReLU" 1481 | bottom: 
"inception_4e/1x1" 1482 | top: "inception_4e/1x1" 1483 | } 1484 | layer { 1485 | name: "inception_4e/3x3_reduce" 1486 | type: "Convolution" 1487 | bottom: "inception_4d/output" 1488 | top: "inception_4e/3x3_reduce" 1489 | param { 1490 | lr_mult: 1 1491 | decay_mult: 1 1492 | } 1493 | param { 1494 | lr_mult: 2 1495 | decay_mult: 0 1496 | } 1497 | convolution_param { 1498 | num_output: 160 1499 | kernel_size: 1 1500 | weight_filler { 1501 | type: "xavier" 1502 | std: 0.09 1503 | } 1504 | bias_filler { 1505 | type: "constant" 1506 | value: 0.2 1507 | } 1508 | } 1509 | } 1510 | layer { 1511 | name: "inception_4e/relu_3x3_reduce" 1512 | type: "ReLU" 1513 | bottom: "inception_4e/3x3_reduce" 1514 | top: "inception_4e/3x3_reduce" 1515 | } 1516 | layer { 1517 | name: "inception_4e/3x3" 1518 | type: "Convolution" 1519 | bottom: "inception_4e/3x3_reduce" 1520 | top: "inception_4e/3x3" 1521 | param { 1522 | lr_mult: 1 1523 | decay_mult: 1 1524 | } 1525 | param { 1526 | lr_mult: 2 1527 | decay_mult: 0 1528 | } 1529 | convolution_param { 1530 | num_output: 320 1531 | pad: 1 1532 | kernel_size: 3 1533 | weight_filler { 1534 | type: "xavier" 1535 | std: 0.03 1536 | } 1537 | bias_filler { 1538 | type: "constant" 1539 | value: 0.2 1540 | } 1541 | } 1542 | } 1543 | layer { 1544 | name: "inception_4e/relu_3x3" 1545 | type: "ReLU" 1546 | bottom: "inception_4e/3x3" 1547 | top: "inception_4e/3x3" 1548 | } 1549 | layer { 1550 | name: "inception_4e/5x5_reduce" 1551 | type: "Convolution" 1552 | bottom: "inception_4d/output" 1553 | top: "inception_4e/5x5_reduce" 1554 | param { 1555 | lr_mult: 1 1556 | decay_mult: 1 1557 | } 1558 | param { 1559 | lr_mult: 2 1560 | decay_mult: 0 1561 | } 1562 | convolution_param { 1563 | num_output: 32 1564 | kernel_size: 1 1565 | weight_filler { 1566 | type: "xavier" 1567 | std: 0.2 1568 | } 1569 | bias_filler { 1570 | type: "constant" 1571 | value: 0.2 1572 | } 1573 | } 1574 | } 1575 | layer { 1576 | name: "inception_4e/relu_5x5_reduce" 1577 | type: "ReLU" 1578 | bottom: "inception_4e/5x5_reduce" 1579 | top: "inception_4e/5x5_reduce" 1580 | } 1581 | layer { 1582 | name: "inception_4e/5x5" 1583 | type: "Convolution" 1584 | bottom: "inception_4e/5x5_reduce" 1585 | top: "inception_4e/5x5" 1586 | param { 1587 | lr_mult: 1 1588 | decay_mult: 1 1589 | } 1590 | param { 1591 | lr_mult: 2 1592 | decay_mult: 0 1593 | } 1594 | convolution_param { 1595 | num_output: 128 1596 | pad: 2 1597 | kernel_size: 5 1598 | weight_filler { 1599 | type: "xavier" 1600 | std: 0.03 1601 | } 1602 | bias_filler { 1603 | type: "constant" 1604 | value: 0.2 1605 | } 1606 | } 1607 | } 1608 | layer { 1609 | name: "inception_4e/relu_5x5" 1610 | type: "ReLU" 1611 | bottom: "inception_4e/5x5" 1612 | top: "inception_4e/5x5" 1613 | } 1614 | layer { 1615 | name: "inception_4e/pool" 1616 | type: "Pooling" 1617 | bottom: "inception_4d/output" 1618 | top: "inception_4e/pool" 1619 | pooling_param { 1620 | pool: MAX 1621 | kernel_size: 3 1622 | stride: 1 1623 | pad: 1 1624 | } 1625 | } 1626 | layer { 1627 | name: "inception_4e/pool_proj" 1628 | type: "Convolution" 1629 | bottom: "inception_4e/pool" 1630 | top: "inception_4e/pool_proj" 1631 | param { 1632 | lr_mult: 1 1633 | decay_mult: 1 1634 | } 1635 | param { 1636 | lr_mult: 2 1637 | decay_mult: 0 1638 | } 1639 | convolution_param { 1640 | num_output: 128 1641 | kernel_size: 1 1642 | weight_filler { 1643 | type: "xavier" 1644 | std: 0.1 1645 | } 1646 | bias_filler { 1647 | type: "constant" 1648 | value: 0.2 1649 | } 1650 | } 1651 | } 1652 | layer { 1653 | name: 
"inception_4e/relu_pool_proj" 1654 | type: "ReLU" 1655 | bottom: "inception_4e/pool_proj" 1656 | top: "inception_4e/pool_proj" 1657 | } 1658 | layer { 1659 | name: "inception_4e/output" 1660 | type: "Concat" 1661 | bottom: "inception_4e/1x1" 1662 | bottom: "inception_4e/3x3" 1663 | bottom: "inception_4e/5x5" 1664 | bottom: "inception_4e/pool_proj" 1665 | top: "inception_4e/output" 1666 | } 1667 | layer { 1668 | name: "CAM_conv" 1669 | type: "Convolution" 1670 | bottom: "inception_4e/output" 1671 | top: "CAM_conv" 1672 | param { 1673 | lr_mult: 1 1674 | decay_mult: 1 1675 | } 1676 | param { 1677 | lr_mult: 2 1678 | decay_mult: 0 1679 | } 1680 | convolution_param { 1681 | num_output: 1024 1682 | pad: 1 1683 | kernel_size: 3 1684 | group: 2 1685 | weight_filler { 1686 | type: "gaussian" 1687 | std: 0.01 1688 | } 1689 | bias_filler { 1690 | type: "constant" 1691 | value: 1 1692 | } 1693 | } 1694 | } 1695 | layer { 1696 | name: "CAM_relu" 1697 | type: "ReLU" 1698 | bottom: "CAM_conv" 1699 | top: "CAM_conv" 1700 | } 1701 | layer { 1702 | name: "CAM_pool" 1703 | type: "Pooling" 1704 | bottom: "CAM_conv" 1705 | top: "CAM_pool" 1706 | pooling_param { 1707 | pool: AVE 1708 | kernel_size: 14 1709 | stride: 14 1710 | } 1711 | } 1712 | layer { 1713 | name: "CAM_fc" 1714 | type: "InnerProduct" 1715 | bottom: "CAM_pool" 1716 | top: "CAM_fc" 1717 | param { 1718 | lr_mult: 1 1719 | decay_mult: 1 1720 | } 1721 | param { 1722 | lr_mult: 2 1723 | decay_mult: 0 1724 | } 1725 | inner_product_param { 1726 | num_output: 1000 1727 | weight_filler { 1728 | type: "xavier" 1729 | } 1730 | bias_filler { 1731 | type: "constant" 1732 | value: 0 1733 | } 1734 | } 1735 | } 1736 | layer { 1737 | name: "prob" 1738 | type: "Softmax" 1739 | bottom: "CAM_fc" 1740 | top: "prob" 1741 | } 1742 | 1743 | -------------------------------------------------------------------------------- /models/deploy_googlenetCAM_places205.prototxt: -------------------------------------------------------------------------------- 1 | name: "GoogleNet" 2 | input: "data" 3 | input_dim: 10 4 | input_dim: 3 5 | input_dim: 224 6 | input_dim: 224 7 | force_backward: true 8 | layer { 9 | name: "conv1/7x7_s2" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1/7x7_s2" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 0 20 | } 21 | convolution_param { 22 | num_output: 64 23 | pad: 3 24 | kernel_size: 7 25 | stride: 2 26 | weight_filler { 27 | type: "xavier" 28 | std: 0.1 29 | } 30 | bias_filler { 31 | type: "constant" 32 | value: 0.2 33 | } 34 | } 35 | } 36 | layer { 37 | name: "conv1/relu_7x7" 38 | type: "ReLU" 39 | bottom: "conv1/7x7_s2" 40 | top: "conv1/7x7_s2" 41 | } 42 | layer { 43 | name: "pool1/3x3_s2" 44 | type: "Pooling" 45 | bottom: "conv1/7x7_s2" 46 | top: "pool1/3x3_s2" 47 | pooling_param { 48 | pool: MAX 49 | kernel_size: 3 50 | stride: 2 51 | } 52 | } 53 | layer { 54 | name: "pool1/norm1" 55 | type: "LRN" 56 | bottom: "pool1/3x3_s2" 57 | top: "pool1/norm1" 58 | lrn_param { 59 | local_size: 5 60 | alpha: 0.0001 61 | beta: 0.75 62 | } 63 | } 64 | layer { 65 | name: "conv2/3x3_reduce" 66 | type: "Convolution" 67 | bottom: "pool1/norm1" 68 | top: "conv2/3x3_reduce" 69 | param { 70 | lr_mult: 1 71 | decay_mult: 1 72 | } 73 | param { 74 | lr_mult: 2 75 | decay_mult: 0 76 | } 77 | convolution_param { 78 | num_output: 64 79 | kernel_size: 1 80 | weight_filler { 81 | type: "xavier" 82 | std: 0.1 83 | } 84 | bias_filler { 85 | type: "constant" 86 | value: 0.2 87 | } 88 | } 89 | } 90 | layer { 91 
| name: "conv2/relu_3x3_reduce" 92 | type: "ReLU" 93 | bottom: "conv2/3x3_reduce" 94 | top: "conv2/3x3_reduce" 95 | } 96 | layer { 97 | name: "conv2/3x3" 98 | type: "Convolution" 99 | bottom: "conv2/3x3_reduce" 100 | top: "conv2/3x3" 101 | param { 102 | lr_mult: 1 103 | decay_mult: 1 104 | } 105 | param { 106 | lr_mult: 2 107 | decay_mult: 0 108 | } 109 | convolution_param { 110 | num_output: 192 111 | pad: 1 112 | kernel_size: 3 113 | weight_filler { 114 | type: "xavier" 115 | std: 0.03 116 | } 117 | bias_filler { 118 | type: "constant" 119 | value: 0.2 120 | } 121 | } 122 | } 123 | layer { 124 | name: "conv2/relu_3x3" 125 | type: "ReLU" 126 | bottom: "conv2/3x3" 127 | top: "conv2/3x3" 128 | } 129 | layer { 130 | name: "conv2/norm2" 131 | type: "LRN" 132 | bottom: "conv2/3x3" 133 | top: "conv2/norm2" 134 | lrn_param { 135 | local_size: 5 136 | alpha: 0.0001 137 | beta: 0.75 138 | } 139 | } 140 | layer { 141 | name: "pool2/3x3_s2" 142 | type: "Pooling" 143 | bottom: "conv2/norm2" 144 | top: "pool2/3x3_s2" 145 | pooling_param { 146 | pool: MAX 147 | kernel_size: 3 148 | stride: 2 149 | } 150 | } 151 | layer { 152 | name: "inception_3a/1x1" 153 | type: "Convolution" 154 | bottom: "pool2/3x3_s2" 155 | top: "inception_3a/1x1" 156 | param { 157 | lr_mult: 1 158 | decay_mult: 1 159 | } 160 | param { 161 | lr_mult: 2 162 | decay_mult: 0 163 | } 164 | convolution_param { 165 | num_output: 64 166 | kernel_size: 1 167 | weight_filler { 168 | type: "xavier" 169 | std: 0.03 170 | } 171 | bias_filler { 172 | type: "constant" 173 | value: 0.2 174 | } 175 | } 176 | } 177 | layer { 178 | name: "inception_3a/relu_1x1" 179 | type: "ReLU" 180 | bottom: "inception_3a/1x1" 181 | top: "inception_3a/1x1" 182 | } 183 | layer { 184 | name: "inception_3a/3x3_reduce" 185 | type: "Convolution" 186 | bottom: "pool2/3x3_s2" 187 | top: "inception_3a/3x3_reduce" 188 | param { 189 | lr_mult: 1 190 | decay_mult: 1 191 | } 192 | param { 193 | lr_mult: 2 194 | decay_mult: 0 195 | } 196 | convolution_param { 197 | num_output: 96 198 | kernel_size: 1 199 | weight_filler { 200 | type: "xavier" 201 | std: 0.09 202 | } 203 | bias_filler { 204 | type: "constant" 205 | value: 0.2 206 | } 207 | } 208 | } 209 | layer { 210 | name: "inception_3a/relu_3x3_reduce" 211 | type: "ReLU" 212 | bottom: "inception_3a/3x3_reduce" 213 | top: "inception_3a/3x3_reduce" 214 | } 215 | layer { 216 | name: "inception_3a/3x3" 217 | type: "Convolution" 218 | bottom: "inception_3a/3x3_reduce" 219 | top: "inception_3a/3x3" 220 | param { 221 | lr_mult: 1 222 | decay_mult: 1 223 | } 224 | param { 225 | lr_mult: 2 226 | decay_mult: 0 227 | } 228 | convolution_param { 229 | num_output: 128 230 | pad: 1 231 | kernel_size: 3 232 | weight_filler { 233 | type: "xavier" 234 | std: 0.03 235 | } 236 | bias_filler { 237 | type: "constant" 238 | value: 0.2 239 | } 240 | } 241 | } 242 | layer { 243 | name: "inception_3a/relu_3x3" 244 | type: "ReLU" 245 | bottom: "inception_3a/3x3" 246 | top: "inception_3a/3x3" 247 | } 248 | layer { 249 | name: "inception_3a/5x5_reduce" 250 | type: "Convolution" 251 | bottom: "pool2/3x3_s2" 252 | top: "inception_3a/5x5_reduce" 253 | param { 254 | lr_mult: 1 255 | decay_mult: 1 256 | } 257 | param { 258 | lr_mult: 2 259 | decay_mult: 0 260 | } 261 | convolution_param { 262 | num_output: 16 263 | kernel_size: 1 264 | weight_filler { 265 | type: "xavier" 266 | std: 0.2 267 | } 268 | bias_filler { 269 | type: "constant" 270 | value: 0.2 271 | } 272 | } 273 | } 274 | layer { 275 | name: "inception_3a/relu_5x5_reduce" 276 | type: "ReLU" 277 | 
bottom: "inception_3a/5x5_reduce" 278 | top: "inception_3a/5x5_reduce" 279 | } 280 | layer { 281 | name: "inception_3a/5x5" 282 | type: "Convolution" 283 | bottom: "inception_3a/5x5_reduce" 284 | top: "inception_3a/5x5" 285 | param { 286 | lr_mult: 1 287 | decay_mult: 1 288 | } 289 | param { 290 | lr_mult: 2 291 | decay_mult: 0 292 | } 293 | convolution_param { 294 | num_output: 32 295 | pad: 2 296 | kernel_size: 5 297 | weight_filler { 298 | type: "xavier" 299 | std: 0.03 300 | } 301 | bias_filler { 302 | type: "constant" 303 | value: 0.2 304 | } 305 | } 306 | } 307 | layer { 308 | name: "inception_3a/relu_5x5" 309 | type: "ReLU" 310 | bottom: "inception_3a/5x5" 311 | top: "inception_3a/5x5" 312 | } 313 | layer { 314 | name: "inception_3a/pool" 315 | type: "Pooling" 316 | bottom: "pool2/3x3_s2" 317 | top: "inception_3a/pool" 318 | pooling_param { 319 | pool: MAX 320 | kernel_size: 3 321 | stride: 1 322 | pad: 1 323 | } 324 | } 325 | layer { 326 | name: "inception_3a/pool_proj" 327 | type: "Convolution" 328 | bottom: "inception_3a/pool" 329 | top: "inception_3a/pool_proj" 330 | param { 331 | lr_mult: 1 332 | decay_mult: 1 333 | } 334 | param { 335 | lr_mult: 2 336 | decay_mult: 0 337 | } 338 | convolution_param { 339 | num_output: 32 340 | kernel_size: 1 341 | weight_filler { 342 | type: "xavier" 343 | std: 0.1 344 | } 345 | bias_filler { 346 | type: "constant" 347 | value: 0.2 348 | } 349 | } 350 | } 351 | layer { 352 | name: "inception_3a/relu_pool_proj" 353 | type: "ReLU" 354 | bottom: "inception_3a/pool_proj" 355 | top: "inception_3a/pool_proj" 356 | } 357 | layer { 358 | name: "inception_3a/output" 359 | type: "Concat" 360 | bottom: "inception_3a/1x1" 361 | bottom: "inception_3a/3x3" 362 | bottom: "inception_3a/5x5" 363 | bottom: "inception_3a/pool_proj" 364 | top: "inception_3a/output" 365 | } 366 | layer { 367 | name: "inception_3b/1x1" 368 | type: "Convolution" 369 | bottom: "inception_3a/output" 370 | top: "inception_3b/1x1" 371 | param { 372 | lr_mult: 1 373 | decay_mult: 1 374 | } 375 | param { 376 | lr_mult: 2 377 | decay_mult: 0 378 | } 379 | convolution_param { 380 | num_output: 128 381 | kernel_size: 1 382 | weight_filler { 383 | type: "xavier" 384 | std: 0.03 385 | } 386 | bias_filler { 387 | type: "constant" 388 | value: 0.2 389 | } 390 | } 391 | } 392 | layer { 393 | name: "inception_3b/relu_1x1" 394 | type: "ReLU" 395 | bottom: "inception_3b/1x1" 396 | top: "inception_3b/1x1" 397 | } 398 | layer { 399 | name: "inception_3b/3x3_reduce" 400 | type: "Convolution" 401 | bottom: "inception_3a/output" 402 | top: "inception_3b/3x3_reduce" 403 | param { 404 | lr_mult: 1 405 | decay_mult: 1 406 | } 407 | param { 408 | lr_mult: 2 409 | decay_mult: 0 410 | } 411 | convolution_param { 412 | num_output: 128 413 | kernel_size: 1 414 | weight_filler { 415 | type: "xavier" 416 | std: 0.09 417 | } 418 | bias_filler { 419 | type: "constant" 420 | value: 0.2 421 | } 422 | } 423 | } 424 | layer { 425 | name: "inception_3b/relu_3x3_reduce" 426 | type: "ReLU" 427 | bottom: "inception_3b/3x3_reduce" 428 | top: "inception_3b/3x3_reduce" 429 | } 430 | layer { 431 | name: "inception_3b/3x3" 432 | type: "Convolution" 433 | bottom: "inception_3b/3x3_reduce" 434 | top: "inception_3b/3x3" 435 | param { 436 | lr_mult: 1 437 | decay_mult: 1 438 | } 439 | param { 440 | lr_mult: 2 441 | decay_mult: 0 442 | } 443 | convolution_param { 444 | num_output: 192 445 | pad: 1 446 | kernel_size: 3 447 | weight_filler { 448 | type: "xavier" 449 | std: 0.03 450 | } 451 | bias_filler { 452 | type: "constant" 453 | 
value: 0.2 454 | } 455 | } 456 | } 457 | layer { 458 | name: "inception_3b/relu_3x3" 459 | type: "ReLU" 460 | bottom: "inception_3b/3x3" 461 | top: "inception_3b/3x3" 462 | } 463 | layer { 464 | name: "inception_3b/5x5_reduce" 465 | type: "Convolution" 466 | bottom: "inception_3a/output" 467 | top: "inception_3b/5x5_reduce" 468 | param { 469 | lr_mult: 1 470 | decay_mult: 1 471 | } 472 | param { 473 | lr_mult: 2 474 | decay_mult: 0 475 | } 476 | convolution_param { 477 | num_output: 32 478 | kernel_size: 1 479 | weight_filler { 480 | type: "xavier" 481 | std: 0.2 482 | } 483 | bias_filler { 484 | type: "constant" 485 | value: 0.2 486 | } 487 | } 488 | } 489 | layer { 490 | name: "inception_3b/relu_5x5_reduce" 491 | type: "ReLU" 492 | bottom: "inception_3b/5x5_reduce" 493 | top: "inception_3b/5x5_reduce" 494 | } 495 | layer { 496 | name: "inception_3b/5x5" 497 | type: "Convolution" 498 | bottom: "inception_3b/5x5_reduce" 499 | top: "inception_3b/5x5" 500 | param { 501 | lr_mult: 1 502 | decay_mult: 1 503 | } 504 | param { 505 | lr_mult: 2 506 | decay_mult: 0 507 | } 508 | convolution_param { 509 | num_output: 96 510 | pad: 2 511 | kernel_size: 5 512 | weight_filler { 513 | type: "xavier" 514 | std: 0.03 515 | } 516 | bias_filler { 517 | type: "constant" 518 | value: 0.2 519 | } 520 | } 521 | } 522 | layer { 523 | name: "inception_3b/relu_5x5" 524 | type: "ReLU" 525 | bottom: "inception_3b/5x5" 526 | top: "inception_3b/5x5" 527 | } 528 | layer { 529 | name: "inception_3b/pool" 530 | type: "Pooling" 531 | bottom: "inception_3a/output" 532 | top: "inception_3b/pool" 533 | pooling_param { 534 | pool: MAX 535 | kernel_size: 3 536 | stride: 1 537 | pad: 1 538 | } 539 | } 540 | layer { 541 | name: "inception_3b/pool_proj" 542 | type: "Convolution" 543 | bottom: "inception_3b/pool" 544 | top: "inception_3b/pool_proj" 545 | param { 546 | lr_mult: 1 547 | decay_mult: 1 548 | } 549 | param { 550 | lr_mult: 2 551 | decay_mult: 0 552 | } 553 | convolution_param { 554 | num_output: 64 555 | kernel_size: 1 556 | weight_filler { 557 | type: "xavier" 558 | std: 0.1 559 | } 560 | bias_filler { 561 | type: "constant" 562 | value: 0.2 563 | } 564 | } 565 | } 566 | layer { 567 | name: "inception_3b/relu_pool_proj" 568 | type: "ReLU" 569 | bottom: "inception_3b/pool_proj" 570 | top: "inception_3b/pool_proj" 571 | } 572 | layer { 573 | name: "inception_3b/output" 574 | type: "Concat" 575 | bottom: "inception_3b/1x1" 576 | bottom: "inception_3b/3x3" 577 | bottom: "inception_3b/5x5" 578 | bottom: "inception_3b/pool_proj" 579 | top: "inception_3b/output" 580 | } 581 | layer { 582 | name: "pool3/3x3_s2" 583 | type: "Pooling" 584 | bottom: "inception_3b/output" 585 | top: "pool3/3x3_s2" 586 | pooling_param { 587 | pool: MAX 588 | kernel_size: 3 589 | stride: 2 590 | } 591 | } 592 | layer { 593 | name: "inception_4a/1x1" 594 | type: "Convolution" 595 | bottom: "pool3/3x3_s2" 596 | top: "inception_4a/1x1" 597 | param { 598 | lr_mult: 1 599 | decay_mult: 1 600 | } 601 | param { 602 | lr_mult: 2 603 | decay_mult: 0 604 | } 605 | convolution_param { 606 | num_output: 192 607 | kernel_size: 1 608 | weight_filler { 609 | type: "xavier" 610 | std: 0.03 611 | } 612 | bias_filler { 613 | type: "constant" 614 | value: 0.2 615 | } 616 | } 617 | } 618 | layer { 619 | name: "inception_4a/relu_1x1" 620 | type: "ReLU" 621 | bottom: "inception_4a/1x1" 622 | top: "inception_4a/1x1" 623 | } 624 | layer { 625 | name: "inception_4a/3x3_reduce" 626 | type: "Convolution" 627 | bottom: "pool3/3x3_s2" 628 | top: "inception_4a/3x3_reduce" 629 
| param { 630 | lr_mult: 1 631 | decay_mult: 1 632 | } 633 | param { 634 | lr_mult: 2 635 | decay_mult: 0 636 | } 637 | convolution_param { 638 | num_output: 96 639 | kernel_size: 1 640 | weight_filler { 641 | type: "xavier" 642 | std: 0.09 643 | } 644 | bias_filler { 645 | type: "constant" 646 | value: 0.2 647 | } 648 | } 649 | } 650 | layer { 651 | name: "inception_4a/relu_3x3_reduce" 652 | type: "ReLU" 653 | bottom: "inception_4a/3x3_reduce" 654 | top: "inception_4a/3x3_reduce" 655 | } 656 | layer { 657 | name: "inception_4a/3x3" 658 | type: "Convolution" 659 | bottom: "inception_4a/3x3_reduce" 660 | top: "inception_4a/3x3" 661 | param { 662 | lr_mult: 1 663 | decay_mult: 1 664 | } 665 | param { 666 | lr_mult: 2 667 | decay_mult: 0 668 | } 669 | convolution_param { 670 | num_output: 208 671 | pad: 1 672 | kernel_size: 3 673 | weight_filler { 674 | type: "xavier" 675 | std: 0.03 676 | } 677 | bias_filler { 678 | type: "constant" 679 | value: 0.2 680 | } 681 | } 682 | } 683 | layer { 684 | name: "inception_4a/relu_3x3" 685 | type: "ReLU" 686 | bottom: "inception_4a/3x3" 687 | top: "inception_4a/3x3" 688 | } 689 | layer { 690 | name: "inception_4a/5x5_reduce" 691 | type: "Convolution" 692 | bottom: "pool3/3x3_s2" 693 | top: "inception_4a/5x5_reduce" 694 | param { 695 | lr_mult: 1 696 | decay_mult: 1 697 | } 698 | param { 699 | lr_mult: 2 700 | decay_mult: 0 701 | } 702 | convolution_param { 703 | num_output: 16 704 | kernel_size: 1 705 | weight_filler { 706 | type: "xavier" 707 | std: 0.2 708 | } 709 | bias_filler { 710 | type: "constant" 711 | value: 0.2 712 | } 713 | } 714 | } 715 | layer { 716 | name: "inception_4a/relu_5x5_reduce" 717 | type: "ReLU" 718 | bottom: "inception_4a/5x5_reduce" 719 | top: "inception_4a/5x5_reduce" 720 | } 721 | layer { 722 | name: "inception_4a/5x5" 723 | type: "Convolution" 724 | bottom: "inception_4a/5x5_reduce" 725 | top: "inception_4a/5x5" 726 | param { 727 | lr_mult: 1 728 | decay_mult: 1 729 | } 730 | param { 731 | lr_mult: 2 732 | decay_mult: 0 733 | } 734 | convolution_param { 735 | num_output: 48 736 | pad: 2 737 | kernel_size: 5 738 | weight_filler { 739 | type: "xavier" 740 | std: 0.03 741 | } 742 | bias_filler { 743 | type: "constant" 744 | value: 0.2 745 | } 746 | } 747 | } 748 | layer { 749 | name: "inception_4a/relu_5x5" 750 | type: "ReLU" 751 | bottom: "inception_4a/5x5" 752 | top: "inception_4a/5x5" 753 | } 754 | layer { 755 | name: "inception_4a/pool" 756 | type: "Pooling" 757 | bottom: "pool3/3x3_s2" 758 | top: "inception_4a/pool" 759 | pooling_param { 760 | pool: MAX 761 | kernel_size: 3 762 | stride: 1 763 | pad: 1 764 | } 765 | } 766 | layer { 767 | name: "inception_4a/pool_proj" 768 | type: "Convolution" 769 | bottom: "inception_4a/pool" 770 | top: "inception_4a/pool_proj" 771 | param { 772 | lr_mult: 1 773 | decay_mult: 1 774 | } 775 | param { 776 | lr_mult: 2 777 | decay_mult: 0 778 | } 779 | convolution_param { 780 | num_output: 64 781 | kernel_size: 1 782 | weight_filler { 783 | type: "xavier" 784 | std: 0.1 785 | } 786 | bias_filler { 787 | type: "constant" 788 | value: 0.2 789 | } 790 | } 791 | } 792 | layer { 793 | name: "inception_4a/relu_pool_proj" 794 | type: "ReLU" 795 | bottom: "inception_4a/pool_proj" 796 | top: "inception_4a/pool_proj" 797 | } 798 | layer { 799 | name: "inception_4a/output" 800 | type: "Concat" 801 | bottom: "inception_4a/1x1" 802 | bottom: "inception_4a/3x3" 803 | bottom: "inception_4a/5x5" 804 | bottom: "inception_4a/pool_proj" 805 | top: "inception_4a/output" 806 | } 807 | layer { 808 | name: 
"inception_4b/1x1" 809 | type: "Convolution" 810 | bottom: "inception_4a/output" 811 | top: "inception_4b/1x1" 812 | param { 813 | lr_mult: 1 814 | decay_mult: 1 815 | } 816 | param { 817 | lr_mult: 2 818 | decay_mult: 0 819 | } 820 | convolution_param { 821 | num_output: 160 822 | kernel_size: 1 823 | weight_filler { 824 | type: "xavier" 825 | std: 0.03 826 | } 827 | bias_filler { 828 | type: "constant" 829 | value: 0.2 830 | } 831 | } 832 | } 833 | layer { 834 | name: "inception_4b/relu_1x1" 835 | type: "ReLU" 836 | bottom: "inception_4b/1x1" 837 | top: "inception_4b/1x1" 838 | } 839 | layer { 840 | name: "inception_4b/3x3_reduce" 841 | type: "Convolution" 842 | bottom: "inception_4a/output" 843 | top: "inception_4b/3x3_reduce" 844 | param { 845 | lr_mult: 1 846 | decay_mult: 1 847 | } 848 | param { 849 | lr_mult: 2 850 | decay_mult: 0 851 | } 852 | convolution_param { 853 | num_output: 112 854 | kernel_size: 1 855 | weight_filler { 856 | type: "xavier" 857 | std: 0.09 858 | } 859 | bias_filler { 860 | type: "constant" 861 | value: 0.2 862 | } 863 | } 864 | } 865 | layer { 866 | name: "inception_4b/relu_3x3_reduce" 867 | type: "ReLU" 868 | bottom: "inception_4b/3x3_reduce" 869 | top: "inception_4b/3x3_reduce" 870 | } 871 | layer { 872 | name: "inception_4b/3x3" 873 | type: "Convolution" 874 | bottom: "inception_4b/3x3_reduce" 875 | top: "inception_4b/3x3" 876 | param { 877 | lr_mult: 1 878 | decay_mult: 1 879 | } 880 | param { 881 | lr_mult: 2 882 | decay_mult: 0 883 | } 884 | convolution_param { 885 | num_output: 224 886 | pad: 1 887 | kernel_size: 3 888 | weight_filler { 889 | type: "xavier" 890 | std: 0.03 891 | } 892 | bias_filler { 893 | type: "constant" 894 | value: 0.2 895 | } 896 | } 897 | } 898 | layer { 899 | name: "inception_4b/relu_3x3" 900 | type: "ReLU" 901 | bottom: "inception_4b/3x3" 902 | top: "inception_4b/3x3" 903 | } 904 | layer { 905 | name: "inception_4b/5x5_reduce" 906 | type: "Convolution" 907 | bottom: "inception_4a/output" 908 | top: "inception_4b/5x5_reduce" 909 | param { 910 | lr_mult: 1 911 | decay_mult: 1 912 | } 913 | param { 914 | lr_mult: 2 915 | decay_mult: 0 916 | } 917 | convolution_param { 918 | num_output: 24 919 | kernel_size: 1 920 | weight_filler { 921 | type: "xavier" 922 | std: 0.2 923 | } 924 | bias_filler { 925 | type: "constant" 926 | value: 0.2 927 | } 928 | } 929 | } 930 | layer { 931 | name: "inception_4b/relu_5x5_reduce" 932 | type: "ReLU" 933 | bottom: "inception_4b/5x5_reduce" 934 | top: "inception_4b/5x5_reduce" 935 | } 936 | layer { 937 | name: "inception_4b/5x5" 938 | type: "Convolution" 939 | bottom: "inception_4b/5x5_reduce" 940 | top: "inception_4b/5x5" 941 | param { 942 | lr_mult: 1 943 | decay_mult: 1 944 | } 945 | param { 946 | lr_mult: 2 947 | decay_mult: 0 948 | } 949 | convolution_param { 950 | num_output: 64 951 | pad: 2 952 | kernel_size: 5 953 | weight_filler { 954 | type: "xavier" 955 | std: 0.03 956 | } 957 | bias_filler { 958 | type: "constant" 959 | value: 0.2 960 | } 961 | } 962 | } 963 | layer { 964 | name: "inception_4b/relu_5x5" 965 | type: "ReLU" 966 | bottom: "inception_4b/5x5" 967 | top: "inception_4b/5x5" 968 | } 969 | layer { 970 | name: "inception_4b/pool" 971 | type: "Pooling" 972 | bottom: "inception_4a/output" 973 | top: "inception_4b/pool" 974 | pooling_param { 975 | pool: MAX 976 | kernel_size: 3 977 | stride: 1 978 | pad: 1 979 | } 980 | } 981 | layer { 982 | name: "inception_4b/pool_proj" 983 | type: "Convolution" 984 | bottom: "inception_4b/pool" 985 | top: "inception_4b/pool_proj" 986 | param { 987 
| lr_mult: 1 988 | decay_mult: 1 989 | } 990 | param { 991 | lr_mult: 2 992 | decay_mult: 0 993 | } 994 | convolution_param { 995 | num_output: 64 996 | kernel_size: 1 997 | weight_filler { 998 | type: "xavier" 999 | std: 0.1 1000 | } 1001 | bias_filler { 1002 | type: "constant" 1003 | value: 0.2 1004 | } 1005 | } 1006 | } 1007 | layer { 1008 | name: "inception_4b/relu_pool_proj" 1009 | type: "ReLU" 1010 | bottom: "inception_4b/pool_proj" 1011 | top: "inception_4b/pool_proj" 1012 | } 1013 | layer { 1014 | name: "inception_4b/output" 1015 | type: "Concat" 1016 | bottom: "inception_4b/1x1" 1017 | bottom: "inception_4b/3x3" 1018 | bottom: "inception_4b/5x5" 1019 | bottom: "inception_4b/pool_proj" 1020 | top: "inception_4b/output" 1021 | } 1022 | layer { 1023 | name: "inception_4c/1x1" 1024 | type: "Convolution" 1025 | bottom: "inception_4b/output" 1026 | top: "inception_4c/1x1" 1027 | param { 1028 | lr_mult: 1 1029 | decay_mult: 1 1030 | } 1031 | param { 1032 | lr_mult: 2 1033 | decay_mult: 0 1034 | } 1035 | convolution_param { 1036 | num_output: 128 1037 | kernel_size: 1 1038 | weight_filler { 1039 | type: "xavier" 1040 | std: 0.03 1041 | } 1042 | bias_filler { 1043 | type: "constant" 1044 | value: 0.2 1045 | } 1046 | } 1047 | } 1048 | layer { 1049 | name: "inception_4c/relu_1x1" 1050 | type: "ReLU" 1051 | bottom: "inception_4c/1x1" 1052 | top: "inception_4c/1x1" 1053 | } 1054 | layer { 1055 | name: "inception_4c/3x3_reduce" 1056 | type: "Convolution" 1057 | bottom: "inception_4b/output" 1058 | top: "inception_4c/3x3_reduce" 1059 | param { 1060 | lr_mult: 1 1061 | decay_mult: 1 1062 | } 1063 | param { 1064 | lr_mult: 2 1065 | decay_mult: 0 1066 | } 1067 | convolution_param { 1068 | num_output: 128 1069 | kernel_size: 1 1070 | weight_filler { 1071 | type: "xavier" 1072 | std: 0.09 1073 | } 1074 | bias_filler { 1075 | type: "constant" 1076 | value: 0.2 1077 | } 1078 | } 1079 | } 1080 | layer { 1081 | name: "inception_4c/relu_3x3_reduce" 1082 | type: "ReLU" 1083 | bottom: "inception_4c/3x3_reduce" 1084 | top: "inception_4c/3x3_reduce" 1085 | } 1086 | layer { 1087 | name: "inception_4c/3x3" 1088 | type: "Convolution" 1089 | bottom: "inception_4c/3x3_reduce" 1090 | top: "inception_4c/3x3" 1091 | param { 1092 | lr_mult: 1 1093 | decay_mult: 1 1094 | } 1095 | param { 1096 | lr_mult: 2 1097 | decay_mult: 0 1098 | } 1099 | convolution_param { 1100 | num_output: 256 1101 | pad: 1 1102 | kernel_size: 3 1103 | weight_filler { 1104 | type: "xavier" 1105 | std: 0.03 1106 | } 1107 | bias_filler { 1108 | type: "constant" 1109 | value: 0.2 1110 | } 1111 | } 1112 | } 1113 | layer { 1114 | name: "inception_4c/relu_3x3" 1115 | type: "ReLU" 1116 | bottom: "inception_4c/3x3" 1117 | top: "inception_4c/3x3" 1118 | } 1119 | layer { 1120 | name: "inception_4c/5x5_reduce" 1121 | type: "Convolution" 1122 | bottom: "inception_4b/output" 1123 | top: "inception_4c/5x5_reduce" 1124 | param { 1125 | lr_mult: 1 1126 | decay_mult: 1 1127 | } 1128 | param { 1129 | lr_mult: 2 1130 | decay_mult: 0 1131 | } 1132 | convolution_param { 1133 | num_output: 24 1134 | kernel_size: 1 1135 | weight_filler { 1136 | type: "xavier" 1137 | std: 0.2 1138 | } 1139 | bias_filler { 1140 | type: "constant" 1141 | value: 0.2 1142 | } 1143 | } 1144 | } 1145 | layer { 1146 | name: "inception_4c/relu_5x5_reduce" 1147 | type: "ReLU" 1148 | bottom: "inception_4c/5x5_reduce" 1149 | top: "inception_4c/5x5_reduce" 1150 | } 1151 | layer { 1152 | name: "inception_4c/5x5" 1153 | type: "Convolution" 1154 | bottom: "inception_4c/5x5_reduce" 1155 | top: 
"inception_4c/5x5" 1156 | param { 1157 | lr_mult: 1 1158 | decay_mult: 1 1159 | } 1160 | param { 1161 | lr_mult: 2 1162 | decay_mult: 0 1163 | } 1164 | convolution_param { 1165 | num_output: 64 1166 | pad: 2 1167 | kernel_size: 5 1168 | weight_filler { 1169 | type: "xavier" 1170 | std: 0.03 1171 | } 1172 | bias_filler { 1173 | type: "constant" 1174 | value: 0.2 1175 | } 1176 | } 1177 | } 1178 | layer { 1179 | name: "inception_4c/relu_5x5" 1180 | type: "ReLU" 1181 | bottom: "inception_4c/5x5" 1182 | top: "inception_4c/5x5" 1183 | } 1184 | layer { 1185 | name: "inception_4c/pool" 1186 | type: "Pooling" 1187 | bottom: "inception_4b/output" 1188 | top: "inception_4c/pool" 1189 | pooling_param { 1190 | pool: MAX 1191 | kernel_size: 3 1192 | stride: 1 1193 | pad: 1 1194 | } 1195 | } 1196 | layer { 1197 | name: "inception_4c/pool_proj" 1198 | type: "Convolution" 1199 | bottom: "inception_4c/pool" 1200 | top: "inception_4c/pool_proj" 1201 | param { 1202 | lr_mult: 1 1203 | decay_mult: 1 1204 | } 1205 | param { 1206 | lr_mult: 2 1207 | decay_mult: 0 1208 | } 1209 | convolution_param { 1210 | num_output: 64 1211 | kernel_size: 1 1212 | weight_filler { 1213 | type: "xavier" 1214 | std: 0.1 1215 | } 1216 | bias_filler { 1217 | type: "constant" 1218 | value: 0.2 1219 | } 1220 | } 1221 | } 1222 | layer { 1223 | name: "inception_4c/relu_pool_proj" 1224 | type: "ReLU" 1225 | bottom: "inception_4c/pool_proj" 1226 | top: "inception_4c/pool_proj" 1227 | } 1228 | layer { 1229 | name: "inception_4c/output" 1230 | type: "Concat" 1231 | bottom: "inception_4c/1x1" 1232 | bottom: "inception_4c/3x3" 1233 | bottom: "inception_4c/5x5" 1234 | bottom: "inception_4c/pool_proj" 1235 | top: "inception_4c/output" 1236 | } 1237 | layer { 1238 | name: "inception_4d/1x1" 1239 | type: "Convolution" 1240 | bottom: "inception_4c/output" 1241 | top: "inception_4d/1x1" 1242 | param { 1243 | lr_mult: 1 1244 | decay_mult: 1 1245 | } 1246 | param { 1247 | lr_mult: 2 1248 | decay_mult: 0 1249 | } 1250 | convolution_param { 1251 | num_output: 112 1252 | kernel_size: 1 1253 | weight_filler { 1254 | type: "xavier" 1255 | std: 0.03 1256 | } 1257 | bias_filler { 1258 | type: "constant" 1259 | value: 0.2 1260 | } 1261 | } 1262 | } 1263 | layer { 1264 | name: "inception_4d/relu_1x1" 1265 | type: "ReLU" 1266 | bottom: "inception_4d/1x1" 1267 | top: "inception_4d/1x1" 1268 | } 1269 | layer { 1270 | name: "inception_4d/3x3_reduce" 1271 | type: "Convolution" 1272 | bottom: "inception_4c/output" 1273 | top: "inception_4d/3x3_reduce" 1274 | param { 1275 | lr_mult: 1 1276 | decay_mult: 1 1277 | } 1278 | param { 1279 | lr_mult: 2 1280 | decay_mult: 0 1281 | } 1282 | convolution_param { 1283 | num_output: 144 1284 | kernel_size: 1 1285 | weight_filler { 1286 | type: "xavier" 1287 | std: 0.09 1288 | } 1289 | bias_filler { 1290 | type: "constant" 1291 | value: 0.2 1292 | } 1293 | } 1294 | } 1295 | layer { 1296 | name: "inception_4d/relu_3x3_reduce" 1297 | type: "ReLU" 1298 | bottom: "inception_4d/3x3_reduce" 1299 | top: "inception_4d/3x3_reduce" 1300 | } 1301 | layer { 1302 | name: "inception_4d/3x3" 1303 | type: "Convolution" 1304 | bottom: "inception_4d/3x3_reduce" 1305 | top: "inception_4d/3x3" 1306 | param { 1307 | lr_mult: 1 1308 | decay_mult: 1 1309 | } 1310 | param { 1311 | lr_mult: 2 1312 | decay_mult: 0 1313 | } 1314 | convolution_param { 1315 | num_output: 288 1316 | pad: 1 1317 | kernel_size: 3 1318 | weight_filler { 1319 | type: "xavier" 1320 | std: 0.03 1321 | } 1322 | bias_filler { 1323 | type: "constant" 1324 | value: 0.2 1325 | } 1326 | 
} 1327 | } 1328 | layer { 1329 | name: "inception_4d/relu_3x3" 1330 | type: "ReLU" 1331 | bottom: "inception_4d/3x3" 1332 | top: "inception_4d/3x3" 1333 | } 1334 | layer { 1335 | name: "inception_4d/5x5_reduce" 1336 | type: "Convolution" 1337 | bottom: "inception_4c/output" 1338 | top: "inception_4d/5x5_reduce" 1339 | param { 1340 | lr_mult: 1 1341 | decay_mult: 1 1342 | } 1343 | param { 1344 | lr_mult: 2 1345 | decay_mult: 0 1346 | } 1347 | convolution_param { 1348 | num_output: 32 1349 | kernel_size: 1 1350 | weight_filler { 1351 | type: "xavier" 1352 | std: 0.2 1353 | } 1354 | bias_filler { 1355 | type: "constant" 1356 | value: 0.2 1357 | } 1358 | } 1359 | } 1360 | layer { 1361 | name: "inception_4d/relu_5x5_reduce" 1362 | type: "ReLU" 1363 | bottom: "inception_4d/5x5_reduce" 1364 | top: "inception_4d/5x5_reduce" 1365 | } 1366 | layer { 1367 | name: "inception_4d/5x5" 1368 | type: "Convolution" 1369 | bottom: "inception_4d/5x5_reduce" 1370 | top: "inception_4d/5x5" 1371 | param { 1372 | lr_mult: 1 1373 | decay_mult: 1 1374 | } 1375 | param { 1376 | lr_mult: 2 1377 | decay_mult: 0 1378 | } 1379 | convolution_param { 1380 | num_output: 64 1381 | pad: 2 1382 | kernel_size: 5 1383 | weight_filler { 1384 | type: "xavier" 1385 | std: 0.03 1386 | } 1387 | bias_filler { 1388 | type: "constant" 1389 | value: 0.2 1390 | } 1391 | } 1392 | } 1393 | layer { 1394 | name: "inception_4d/relu_5x5" 1395 | type: "ReLU" 1396 | bottom: "inception_4d/5x5" 1397 | top: "inception_4d/5x5" 1398 | } 1399 | layer { 1400 | name: "inception_4d/pool" 1401 | type: "Pooling" 1402 | bottom: "inception_4c/output" 1403 | top: "inception_4d/pool" 1404 | pooling_param { 1405 | pool: MAX 1406 | kernel_size: 3 1407 | stride: 1 1408 | pad: 1 1409 | } 1410 | } 1411 | layer { 1412 | name: "inception_4d/pool_proj" 1413 | type: "Convolution" 1414 | bottom: "inception_4d/pool" 1415 | top: "inception_4d/pool_proj" 1416 | param { 1417 | lr_mult: 1 1418 | decay_mult: 1 1419 | } 1420 | param { 1421 | lr_mult: 2 1422 | decay_mult: 0 1423 | } 1424 | convolution_param { 1425 | num_output: 64 1426 | kernel_size: 1 1427 | weight_filler { 1428 | type: "xavier" 1429 | std: 0.1 1430 | } 1431 | bias_filler { 1432 | type: "constant" 1433 | value: 0.2 1434 | } 1435 | } 1436 | } 1437 | layer { 1438 | name: "inception_4d/relu_pool_proj" 1439 | type: "ReLU" 1440 | bottom: "inception_4d/pool_proj" 1441 | top: "inception_4d/pool_proj" 1442 | } 1443 | layer { 1444 | name: "inception_4d/output" 1445 | type: "Concat" 1446 | bottom: "inception_4d/1x1" 1447 | bottom: "inception_4d/3x3" 1448 | bottom: "inception_4d/5x5" 1449 | bottom: "inception_4d/pool_proj" 1450 | top: "inception_4d/output" 1451 | } 1452 | layer { 1453 | name: "inception_4e/1x1" 1454 | type: "Convolution" 1455 | bottom: "inception_4d/output" 1456 | top: "inception_4e/1x1" 1457 | param { 1458 | lr_mult: 1 1459 | decay_mult: 1 1460 | } 1461 | param { 1462 | lr_mult: 2 1463 | decay_mult: 0 1464 | } 1465 | convolution_param { 1466 | num_output: 256 1467 | kernel_size: 1 1468 | weight_filler { 1469 | type: "xavier" 1470 | std: 0.03 1471 | } 1472 | bias_filler { 1473 | type: "constant" 1474 | value: 0.2 1475 | } 1476 | } 1477 | } 1478 | layer { 1479 | name: "inception_4e/relu_1x1" 1480 | type: "ReLU" 1481 | bottom: "inception_4e/1x1" 1482 | top: "inception_4e/1x1" 1483 | } 1484 | layer { 1485 | name: "inception_4e/3x3_reduce" 1486 | type: "Convolution" 1487 | bottom: "inception_4d/output" 1488 | top: "inception_4e/3x3_reduce" 1489 | param { 1490 | lr_mult: 1 1491 | decay_mult: 1 1492 | } 1493 | 
1494 |     lr_mult: 2
1495 |     decay_mult: 0
1496 |   }
1497 |   convolution_param {
1498 |     num_output: 160
1499 |     kernel_size: 1
1500 |     weight_filler {
1501 |       type: "xavier"
1502 |       std: 0.09
1503 |     }
1504 |     bias_filler {
1505 |       type: "constant"
1506 |       value: 0.2
1507 |     }
1508 |   }
1509 | }
1510 | layer {
1511 |   name: "inception_4e/relu_3x3_reduce"
1512 |   type: "ReLU"
1513 |   bottom: "inception_4e/3x3_reduce"
1514 |   top: "inception_4e/3x3_reduce"
1515 | }
1516 | layer {
1517 |   name: "inception_4e/3x3"
1518 |   type: "Convolution"
1519 |   bottom: "inception_4e/3x3_reduce"
1520 |   top: "inception_4e/3x3"
1521 |   param {
1522 |     lr_mult: 1
1523 |     decay_mult: 1
1524 |   }
1525 |   param {
1526 |     lr_mult: 2
1527 |     decay_mult: 0
1528 |   }
1529 |   convolution_param {
1530 |     num_output: 320
1531 |     pad: 1
1532 |     kernel_size: 3
1533 |     weight_filler {
1534 |       type: "xavier"
1535 |       std: 0.03
1536 |     }
1537 |     bias_filler {
1538 |       type: "constant"
1539 |       value: 0.2
1540 |     }
1541 |   }
1542 | }
1543 | layer {
1544 |   name: "inception_4e/relu_3x3"
1545 |   type: "ReLU"
1546 |   bottom: "inception_4e/3x3"
1547 |   top: "inception_4e/3x3"
1548 | }
1549 | layer {
1550 |   name: "inception_4e/5x5_reduce"
1551 |   type: "Convolution"
1552 |   bottom: "inception_4d/output"
1553 |   top: "inception_4e/5x5_reduce"
1554 |   param {
1555 |     lr_mult: 1
1556 |     decay_mult: 1
1557 |   }
1558 |   param {
1559 |     lr_mult: 2
1560 |     decay_mult: 0
1561 |   }
1562 |   convolution_param {
1563 |     num_output: 32
1564 |     kernel_size: 1
1565 |     weight_filler {
1566 |       type: "xavier"
1567 |       std: 0.2
1568 |     }
1569 |     bias_filler {
1570 |       type: "constant"
1571 |       value: 0.2
1572 |     }
1573 |   }
1574 | }
1575 | layer {
1576 |   name: "inception_4e/relu_5x5_reduce"
1577 |   type: "ReLU"
1578 |   bottom: "inception_4e/5x5_reduce"
1579 |   top: "inception_4e/5x5_reduce"
1580 | }
1581 | layer {
1582 |   name: "inception_4e/5x5"
1583 |   type: "Convolution"
1584 |   bottom: "inception_4e/5x5_reduce"
1585 |   top: "inception_4e/5x5"
1586 |   param {
1587 |     lr_mult: 1
1588 |     decay_mult: 1
1589 |   }
1590 |   param {
1591 |     lr_mult: 2
1592 |     decay_mult: 0
1593 |   }
1594 |   convolution_param {
1595 |     num_output: 128
1596 |     pad: 2
1597 |     kernel_size: 5
1598 |     weight_filler {
1599 |       type: "xavier"
1600 |       std: 0.03
1601 |     }
1602 |     bias_filler {
1603 |       type: "constant"
1604 |       value: 0.2
1605 |     }
1606 |   }
1607 | }
1608 | layer {
1609 |   name: "inception_4e/relu_5x5"
1610 |   type: "ReLU"
1611 |   bottom: "inception_4e/5x5"
1612 |   top: "inception_4e/5x5"
1613 | }
1614 | layer {
1615 |   name: "inception_4e/pool"
1616 |   type: "Pooling"
1617 |   bottom: "inception_4d/output"
1618 |   top: "inception_4e/pool"
1619 |   pooling_param {
1620 |     pool: MAX
1621 |     kernel_size: 3
1622 |     stride: 1
1623 |     pad: 1
1624 |   }
1625 | }
1626 | layer {
1627 |   name: "inception_4e/pool_proj"
1628 |   type: "Convolution"
1629 |   bottom: "inception_4e/pool"
1630 |   top: "inception_4e/pool_proj"
1631 |   param {
1632 |     lr_mult: 1
1633 |     decay_mult: 1
1634 |   }
1635 |   param {
1636 |     lr_mult: 2
1637 |     decay_mult: 0
1638 |   }
1639 |   convolution_param {
1640 |     num_output: 128
1641 |     kernel_size: 1
1642 |     weight_filler {
1643 |       type: "xavier"
1644 |       std: 0.1
1645 |     }
1646 |     bias_filler {
1647 |       type: "constant"
1648 |       value: 0.2
1649 |     }
1650 |   }
1651 | }
1652 | layer {
1653 |   name: "inception_4e/relu_pool_proj"
1654 |   type: "ReLU"
1655 |   bottom: "inception_4e/pool_proj"
1656 |   top: "inception_4e/pool_proj"
1657 | }
1658 | layer {
1659 |   name: "inception_4e/output"
1660 |   type: "Concat"
1661 |   bottom: "inception_4e/1x1"
1662 |   bottom: "inception_4e/3x3"
1663 |   bottom: "inception_4e/5x5"
"inception_4e/5x5" 1664 | bottom: "inception_4e/pool_proj" 1665 | top: "inception_4e/output" 1666 | } 1667 | layer { 1668 | name: "CAM_conv" 1669 | type: "Convolution" 1670 | bottom: "inception_4e/output" 1671 | top: "CAM_conv" 1672 | param { 1673 | lr_mult: 1 1674 | decay_mult: 1 1675 | } 1676 | param { 1677 | lr_mult: 2 1678 | decay_mult: 0 1679 | } 1680 | convolution_param { 1681 | num_output: 1024 1682 | pad: 1 1683 | kernel_size: 3 1684 | group: 2 1685 | weight_filler { 1686 | type: "gaussian" 1687 | std: 0.01 1688 | } 1689 | bias_filler { 1690 | type: "constant" 1691 | value: 1 1692 | } 1693 | } 1694 | } 1695 | layer { 1696 | name: "CAM_relu" 1697 | type: "ReLU" 1698 | bottom: "CAM_conv" 1699 | top: "CAM_conv" 1700 | } 1701 | layer { 1702 | name: "CAM_pool" 1703 | type: "Pooling" 1704 | bottom: "CAM_conv" 1705 | top: "CAM_pool" 1706 | pooling_param { 1707 | pool: AVE 1708 | kernel_size: 14 1709 | stride: 14 1710 | } 1711 | } 1712 | layer { 1713 | name: "CAM_fc" 1714 | type: "InnerProduct" 1715 | bottom: "CAM_pool" 1716 | top: "CAM_fc" 1717 | param { 1718 | lr_mult: 1 1719 | decay_mult: 1 1720 | } 1721 | param { 1722 | lr_mult: 2 1723 | decay_mult: 0 1724 | } 1725 | inner_product_param { 1726 | num_output: 205 1727 | weight_filler { 1728 | type: "xavier" 1729 | } 1730 | bias_filler { 1731 | type: "constant" 1732 | value: 0 1733 | } 1734 | } 1735 | } 1736 | layer { 1737 | name: "prob" 1738 | type: "Softmax" 1739 | bottom: "CAM_fc" 1740 | top: "prob" 1741 | } 1742 | 1743 | -------------------------------------------------------------------------------- /models/deploy_vgg16CAM.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_ILSVRC_16_layers" 2 | input: "data" 3 | input_dim: 10 4 | input_dim: 3 5 | input_dim: 224 6 | input_dim: 224 7 | layers { 8 | bottom: "data" 9 | top: "conv1_1" 10 | name: "conv1_1" 11 | type: CONVOLUTION 12 | convolution_param { 13 | num_output: 64 14 | pad: 1 15 | kernel_size: 3 16 | } 17 | blobs_lr: 1 18 | blobs_lr: 2 19 | weight_decay: 1 20 | weight_decay: 0 21 | } 22 | layers { 23 | bottom: "conv1_1" 24 | top: "conv1_1" 25 | name: "relu1_1" 26 | type: RELU 27 | } 28 | layers { 29 | bottom: "conv1_1" 30 | top: "conv1_2" 31 | name: "conv1_2" 32 | type: CONVOLUTION 33 | convolution_param { 34 | num_output: 64 35 | pad: 1 36 | kernel_size: 3 37 | } 38 | blobs_lr: 1 39 | blobs_lr: 2 40 | weight_decay: 1 41 | weight_decay: 0 42 | } 43 | layers { 44 | bottom: "conv1_2" 45 | top: "conv1_2" 46 | name: "relu1_2" 47 | type: RELU 48 | } 49 | layers { 50 | bottom: "conv1_2" 51 | top: "pool1" 52 | name: "pool1" 53 | type: POOLING 54 | pooling_param { 55 | pool: MAX 56 | kernel_size: 2 57 | stride: 2 58 | } 59 | } 60 | layers { 61 | bottom: "pool1" 62 | top: "conv2_1" 63 | name: "conv2_1" 64 | type: CONVOLUTION 65 | convolution_param { 66 | num_output: 128 67 | pad: 1 68 | kernel_size: 3 69 | } 70 | blobs_lr: 1 71 | blobs_lr: 2 72 | weight_decay: 1 73 | weight_decay: 0 74 | } 75 | layers { 76 | bottom: "conv2_1" 77 | top: "conv2_1" 78 | name: "relu2_1" 79 | type: RELU 80 | } 81 | layers { 82 | bottom: "conv2_1" 83 | top: "conv2_2" 84 | name: "conv2_2" 85 | type: CONVOLUTION 86 | convolution_param { 87 | num_output: 128 88 | pad: 1 89 | kernel_size: 3 90 | } 91 | blobs_lr: 1 92 | blobs_lr: 2 93 | weight_decay: 1 94 | weight_decay: 0 95 | } 96 | layers { 97 | bottom: "conv2_2" 98 | top: "conv2_2" 99 | name: "relu2_2" 100 | type: RELU 101 | } 102 | layers { 103 | bottom: "conv2_2" 104 | top: "pool2" 105 | name: "pool2" 106 | 
--------------------------------------------------------------------------------
/models/deploy_vgg16CAM.prototxt:
--------------------------------------------------------------------------------
1 | name: "VGG_ILSVRC_16_layers"
2 | input: "data"
3 | input_dim: 10
4 | input_dim: 3
5 | input_dim: 224
6 | input_dim: 224
7 | layers {
8 |   bottom: "data"
9 |   top: "conv1_1"
10 |   name: "conv1_1"
11 |   type: CONVOLUTION
12 |   convolution_param {
13 |     num_output: 64
14 |     pad: 1
15 |     kernel_size: 3
16 |   }
17 |   blobs_lr: 1
18 |   blobs_lr: 2
19 |   weight_decay: 1
20 |   weight_decay: 0
21 | }
22 | layers {
23 |   bottom: "conv1_1"
24 |   top: "conv1_1"
25 |   name: "relu1_1"
26 |   type: RELU
27 | }
28 | layers {
29 |   bottom: "conv1_1"
30 |   top: "conv1_2"
31 |   name: "conv1_2"
32 |   type: CONVOLUTION
33 |   convolution_param {
34 |     num_output: 64
35 |     pad: 1
36 |     kernel_size: 3
37 |   }
38 |   blobs_lr: 1
39 |   blobs_lr: 2
40 |   weight_decay: 1
41 |   weight_decay: 0
42 | }
43 | layers {
44 |   bottom: "conv1_2"
45 |   top: "conv1_2"
46 |   name: "relu1_2"
47 |   type: RELU
48 | }
49 | layers {
50 |   bottom: "conv1_2"
51 |   top: "pool1"
52 |   name: "pool1"
53 |   type: POOLING
54 |   pooling_param {
55 |     pool: MAX
56 |     kernel_size: 2
57 |     stride: 2
58 |   }
59 | }
60 | layers {
61 |   bottom: "pool1"
62 |   top: "conv2_1"
63 |   name: "conv2_1"
64 |   type: CONVOLUTION
65 |   convolution_param {
66 |     num_output: 128
67 |     pad: 1
68 |     kernel_size: 3
69 |   }
70 |   blobs_lr: 1
71 |   blobs_lr: 2
72 |   weight_decay: 1
73 |   weight_decay: 0
74 | }
75 | layers {
76 |   bottom: "conv2_1"
77 |   top: "conv2_1"
78 |   name: "relu2_1"
79 |   type: RELU
80 | }
81 | layers {
82 |   bottom: "conv2_1"
83 |   top: "conv2_2"
84 |   name: "conv2_2"
85 |   type: CONVOLUTION
86 |   convolution_param {
87 |     num_output: 128
88 |     pad: 1
89 |     kernel_size: 3
90 |   }
91 |   blobs_lr: 1
92 |   blobs_lr: 2
93 |   weight_decay: 1
94 |   weight_decay: 0
95 | }
96 | layers {
97 |   bottom: "conv2_2"
98 |   top: "conv2_2"
99 |   name: "relu2_2"
100 |   type: RELU
101 | }
102 | layers {
103 |   bottom: "conv2_2"
104 |   top: "pool2"
105 |   name: "pool2"
106 |   type: POOLING
107 |   pooling_param {
108 |     pool: MAX
109 |     kernel_size: 2
110 |     stride: 2
111 |   }
112 | }
113 | layers {
114 |   bottom: "pool2"
115 |   top: "conv3_1"
116 |   name: "conv3_1"
117 |   type: CONVOLUTION
118 |   convolution_param {
119 |     num_output: 256
120 |     pad: 1
121 |     kernel_size: 3
122 |   }
123 |   blobs_lr: 1
124 |   blobs_lr: 2
125 |   weight_decay: 1
126 |   weight_decay: 0
127 | }
128 | layers {
129 |   bottom: "conv3_1"
130 |   top: "conv3_1"
131 |   name: "relu3_1"
132 |   type: RELU
133 | }
134 | layers {
135 |   bottom: "conv3_1"
136 |   top: "conv3_2"
137 |   name: "conv3_2"
138 |   type: CONVOLUTION
139 |   convolution_param {
140 |     num_output: 256
141 |     pad: 1
142 |     kernel_size: 3
143 |   }
144 |   blobs_lr: 1
145 |   blobs_lr: 2
146 |   weight_decay: 1
147 |   weight_decay: 0
148 | }
149 | layers {
150 |   bottom: "conv3_2"
151 |   top: "conv3_2"
152 |   name: "relu3_2"
153 |   type: RELU
154 | }
155 | layers {
156 |   bottom: "conv3_2"
157 |   top: "conv3_3"
158 |   name: "conv3_3"
159 |   type: CONVOLUTION
160 |   convolution_param {
161 |     num_output: 256
162 |     pad: 1
163 |     kernel_size: 3
164 |   }
165 |   blobs_lr: 1
166 |   blobs_lr: 2
167 |   weight_decay: 1
168 |   weight_decay: 0
169 | }
170 | layers {
171 |   bottom: "conv3_3"
172 |   top: "conv3_3"
173 |   name: "relu3_3"
174 |   type: RELU
175 | }
176 | layers {
177 |   bottom: "conv3_3"
178 |   top: "pool3"
179 |   name: "pool3"
180 |   type: POOLING
181 |   pooling_param {
182 |     pool: MAX
183 |     kernel_size: 2
184 |     stride: 2
185 |   }
186 | }
187 | layers {
188 |   bottom: "pool3"
189 |   top: "conv4_1"
190 |   name: "conv4_1"
191 |   type: CONVOLUTION
192 |   convolution_param {
193 |     num_output: 512
194 |     pad: 1
195 |     kernel_size: 3
196 |   }
197 |   blobs_lr: 1
198 |   blobs_lr: 2
199 |   weight_decay: 1
200 |   weight_decay: 0
201 | }
202 | layers {
203 |   bottom: "conv4_1"
204 |   top: "conv4_1"
205 |   name: "relu4_1"
206 |   type: RELU
207 | }
208 | layers {
209 |   bottom: "conv4_1"
210 |   top: "conv4_2"
211 |   name: "conv4_2"
212 |   type: CONVOLUTION
213 |   convolution_param {
214 |     num_output: 512
215 |     pad: 1
216 |     kernel_size: 3
217 |   }
218 |   blobs_lr: 1
219 |   blobs_lr: 2
220 |   weight_decay: 1
221 |   weight_decay: 0
222 | }
223 | layers {
224 |   bottom: "conv4_2"
225 |   top: "conv4_2"
226 |   name: "relu4_2"
227 |   type: RELU
228 | }
229 | layers {
230 |   bottom: "conv4_2"
231 |   top: "conv4_3"
232 |   name: "conv4_3"
233 |   type: CONVOLUTION
234 |   convolution_param {
235 |     num_output: 512
236 |     pad: 1
237 |     kernel_size: 3
238 |   }
239 |   blobs_lr: 1
240 |   blobs_lr: 2
241 |   weight_decay: 1
242 |   weight_decay: 0
243 | }
244 | layers {
245 |   bottom: "conv4_3"
246 |   top: "conv4_3"
247 |   name: "relu4_3"
248 |   type: RELU
249 | }
250 | layers {
251 |   bottom: "conv4_3"
252 |   top: "pool4"
253 |   name: "pool4"
254 |   type: POOLING
255 |   pooling_param {
256 |     pool: MAX
257 |     kernel_size: 2
258 |     stride: 2
259 |   }
260 | }
261 | layers {
262 |   bottom: "pool4"
263 |   top: "conv5_1"
264 |   name: "conv5_1"
265 |   type: CONVOLUTION
266 |   convolution_param {
267 |     num_output: 512
268 |     pad: 1
269 |     kernel_size: 3
270 |   }
271 |   blobs_lr: 1
272 |   blobs_lr: 2
273 |   weight_decay: 1
274 |   weight_decay: 0
275 | }
276 | layers {
277 |   bottom: "conv5_1"
278 |   top: "conv5_1"
279 |   name: "relu5_1"
280 |   type: RELU
281 | }
282 | layers {
283 |   bottom: "conv5_1"
284 |   top: "conv5_2"
285 |   name: "conv5_2"
286 |   type: CONVOLUTION
287 |   convolution_param {
288 |     num_output: 512
289 |     pad: 1
290 |     kernel_size: 3
291 |   }
292 |   blobs_lr: 1
293 |   blobs_lr: 2
294 |   weight_decay: 1
295 |   weight_decay: 0
296 | }
297 | layers {
298 |   bottom: "conv5_2"
299 |   top: "conv5_2"
"conv5_2" 300 | name: "relu5_2" 301 | type: RELU 302 | } 303 | layers { 304 | bottom: "conv5_2" 305 | top: "conv5_3" 306 | name: "conv5_3" 307 | type: CONVOLUTION 308 | convolution_param { 309 | num_output: 512 310 | pad: 1 311 | kernel_size: 3 312 | } 313 | blobs_lr: 1 314 | blobs_lr: 2 315 | weight_decay: 1 316 | weight_decay: 0 317 | } 318 | layers { 319 | bottom: "conv5_3" 320 | top: "conv5_3" 321 | name: "relu5_3" 322 | type: RELU 323 | } 324 | layers { 325 | bottom: "conv5_3" 326 | top: "CAM_conv" 327 | name: "CAM_conv" 328 | type: CONVOLUTION 329 | convolution_param { 330 | num_output: 1024 331 | pad: 1 332 | kernel_size: 3 333 | group: 2 334 | weight_filler { 335 | type: "gaussian" 336 | std: 0.01 337 | } 338 | bias_filler { 339 | type: "constant" 340 | value: 0 341 | } 342 | } 343 | blobs_lr: 1 344 | blobs_lr: 2 345 | weight_decay: 1 346 | weight_decay: 0 347 | } 348 | layers { 349 | bottom: "CAM_conv" 350 | top: "CAM_conv" 351 | name: "CAM_relu" 352 | type: RELU 353 | } 354 | layers { 355 | name: "CAM_pool" 356 | type: POOLING 357 | bottom: "CAM_conv" 358 | top: "CAM_pool" 359 | pooling_param { 360 | pool: AVE 361 | kernel_size: 14 362 | stride: 14 363 | } 364 | } 365 | layers { 366 | bottom: "CAM_pool" 367 | top: "CAM_pool" 368 | name: "CAM_dropout" 369 | type: DROPOUT 370 | dropout_param { 371 | dropout_ratio: 0.5 372 | } 373 | } 374 | layers { 375 | name: "CAM_fc" 376 | bottom: "CAM_pool" 377 | top: "CAM_fc" 378 | type: INNER_PRODUCT 379 | inner_product_param { 380 | num_output: 1000 381 | } 382 | blobs_lr: 1 383 | weight_decay: 1 384 | 385 | } 386 | layers { 387 | bottom: "CAM_fc" 388 | top: "prob" 389 | name: "prob" 390 | type: SOFTMAX 391 | } 392 | 393 | -------------------------------------------------------------------------------- /models/download.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd $(dirname $0) 4 | curl -O http://cnnlocalization.csail.mit.edu/demoCAM/models/imagenet_googlenetCAM_train_iter_120000.caffemodel 5 | -------------------------------------------------------------------------------- /prepare_image.m: -------------------------------------------------------------------------------- 1 | function crops_data = prepare_image(im) 2 | % ------------------------------------------------------------------------ 3 | % caffe/matlab/+caffe/imagenet/ilsvrc_2012_mean.mat contains mean_data that 4 | % is already in W x H x C with BGR channels 5 | d = load('ilsvrc_2012_mean.mat'); 6 | mean_data = d.mean_data; 7 | IMAGE_DIM = 256; 8 | CROPPED_DIM = 224; % 224 for googLeNet , 227 for VGG and AlexNet 9 | 10 | % Convert an image returned by Matlab's imread to im_data in caffe's data 11 | % format: W x H x C with BGR channels 12 | im_data = im(:, :, [3, 2, 1]); % permute channels from RGB to BGR 13 | im_data = permute(im_data, [2, 1, 3]); % flip width and height 14 | im_data = single(im_data); % convert from uint8 to single 15 | im_data = imresize(im_data, [IMAGE_DIM IMAGE_DIM], 'bilinear'); % resize im_data 16 | im_data = im_data - mean_data; % subtract mean_data (already in W x H x C, BGR) 17 | 18 | % oversample (4 corners, center, and their x-axis flips) 19 | crops_data = zeros(CROPPED_DIM, CROPPED_DIM, 3, 10, 'single'); 20 | indices = [0 IMAGE_DIM-CROPPED_DIM] + 1; 21 | n = 1; 22 | for i = indices 23 | for j = indices 24 | crops_data(:, :, :, n) = im_data(i:i+CROPPED_DIM-1, j:j+CROPPED_DIM-1, :); 25 | crops_data(:, :, :, n+5) = crops_data(end:-1:1, :, :, n); 26 | n = n + 1; 27 | end 28 | end 29 | 
29 | center = floor(indices(2) / 2) + 1;
30 | crops_data(:,:,:,5) = ...
31 |     im_data(center:center+CROPPED_DIM-1,center:center+CROPPED_DIM-1,:);
32 | crops_data(:,:,:,10) = crops_data(end:-1:1, :, :, 5);
33 | 
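For the Python demos, the same 10-crop preparation can be sketched with NumPy/OpenCV. This mirrors prepare_image.m step by step but is only an illustration: prepare_image_py is a made-up name, and mean_bgr is assumed to be a full 256x256x3 BGR mean image (the .m file gets its mean from ilsvrc_2012_mean.mat).

import numpy as np
import cv2

def prepare_image_py(im_bgr, mean_bgr, image_dim=256, crop_dim=224):
    # Resize, subtract the mean, then take the 4 corner crops, the center
    # crop, and the horizontal flips of all five.
    im = cv2.resize(im_bgr.astype(np.float32), (image_dim, image_dim)) - mean_bgr
    crops = np.zeros((10, crop_dim, crop_dim, 3), dtype=np.float32)
    n = 0
    for i in (0, image_dim - crop_dim):
        for j in (0, image_dim - crop_dim):
            crops[n] = im[i:i+crop_dim, j:j+crop_dim]
            crops[n+5] = crops[n][:, ::-1]  # mirror of the same crop
            n += 1
    c = (image_dim - crop_dim) // 2
    crops[4] = im[c:c+crop_dim, c:c+crop_dim]
    crops[9] = crops[4][:, ::-1]
    return crops  # (10, H, W, C); transpose to (N, C, H, W) before feeding caffe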
--------------------------------------------------------------------------------
/py_demo.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import sys
3 | import os
4 | try:
5 |     caffe_root = os.environ['CAFFE_ROOT'] + '/'
6 | except KeyError:
7 |     raise KeyError("Define CAFFE_ROOT in ~/.bashrc")
8 | 
9 | sys.path.insert(1, caffe_root+'python/')
10 | import caffe
11 | import cv2
12 | from py_returnCAMmap import py_returnCAMmap
13 | from py_map2jpg import py_map2jpg
14 | import scipy.io
15 | 
16 | def im2double(im):
17 |     return cv2.normalize(im.astype('float'), None, 0.0, 1.0, cv2.NORM_MINMAX)
18 | 
19 | ## Be aware that since Matlab is 1-indexed and column-major,
20 | ## the usual 4 blob dimensions in Matlab are [width, height, channels, num]
21 | 
22 | ## In Python the dimensions are [num, channels, height, width]
23 | 
24 | model = 'googlenet'
25 | if model == 'alexnet':
26 |     net_weights = 'models/alexnetplusCAM_imagenet.caffemodel'
27 |     net_model = 'models/deploy_alexnetplusCAM_imagenet.prototxt'
28 |     out_layer = 'fc9'
29 |     last_conv = 'conv7'
30 |     crop_size = 227
31 | elif model == 'googlenet':
32 |     net_weights = 'models/imagenet_googlenetCAM_train_iter_120000.caffemodel'
33 |     net_model = 'models/deploy_googlenetCAM.prototxt'
34 |     out_layer = 'CAM_fc'
35 |     crop_size = 224
36 |     last_conv = 'CAM_conv'
37 | else:
38 |     raise Exception('This model is not defined')
39 | 
40 | categories = scipy.io.loadmat('categories1000.mat')
41 | 
42 | # load CAM model and extract features
43 | net = caffe.Net(net_model, net_weights, caffe.TEST)
44 | 
45 | transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
46 | transformer.set_transpose('data', (2,0,1))
47 | transformer.set_mean('data', np.load(caffe_root + 'python/caffe/imagenet/ilsvrc_2012_mean.npy').mean(1).mean(1))
48 | #transformer.set_channel_swap('data', (2,1,0)) # not needed here: cv2.imread already returns BGR, which is what the reference model expects
49 | 
50 | weights_LR = net.params[out_layer][0].data # weights of the final FC layer that feeds the softmax
51 | # shape: [1000, N] N-> depends on the network
52 | 
53 | image = cv2.imread('img2.jpg')
54 | image = cv2.resize(image, (256, 256))
55 | 
56 | # Take center crop.
57 | center = np.array(image.shape[:2]) / 2.0
58 | crop = np.tile(center, (1, 2))[0] + np.concatenate([
59 |     -np.array([crop_size, crop_size]) / 2.0,
60 |     np.array([crop_size, crop_size]) / 2.0
61 | ])
62 | crop = crop.astype(int)
63 | input_ = image[crop[0]:crop[2], crop[1]:crop[3], :]
64 | 
65 | # extract conv features
66 | net.blobs['data'].reshape(1, 3, crop_size, crop_size) # run only one image
67 | net.blobs['data'].data[0] = transformer.preprocess('data', input_)
68 | out = net.forward()
69 | scores = out['prob']
70 | activation_lastconv = net.blobs[last_conv].data
71 | 
72 | 
73 | 
74 | 
75 | ## Class Activation Mapping
76 | 
77 | topNum = 5 # generate heatmap for top X prediction results
78 | scoresMean = np.mean(scores, axis=0)
79 | ascending_order = np.argsort(scoresMean)
80 | IDX_category = ascending_order[::-1] # [::-1] to sort in descending order
81 | 
82 | curCAMmapAll = py_returnCAMmap(activation_lastconv, weights_LR[IDX_category[:topNum],:])
83 | 
84 | curResult = im2double(image)
85 | 
86 | for j in range(topNum):
87 |     # for one image
88 |     curCAMmap_crops = curCAMmapAll[:,:,j]
89 |     curCAMmapLarge_crops = cv2.resize(curCAMmap_crops, (256,256))
90 |     curHeatMap = im2double(curCAMmapLarge_crops) # rescale the upsampled CAM to [0, 1]
91 | 
92 | 
93 |     curHeatMap = py_map2jpg(curHeatMap, None, 'jet')
94 |     curHeatMap = im2double(image)*0.2+im2double(curHeatMap)*0.7
95 | 
96 |     cv2.imshow(categories['categories'][IDX_category[j]][0][0], curHeatMap)
97 |     cv2.waitKey(0)
98 | 
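The bbox code below expects one heatmap per image, so the per-crop CAMs have to be merged back into full-image coordinates first; mergeTenCrop.m does this for the Matlab demo. A NumPy sketch of the idea (merge_ten_crop is a made-up name, and it assumes the crop order produced by the preparation code above: 4 corners, center, then their horizontal flips):

import numpy as np

def merge_ten_crop(cams, image_dim=256, crop_dim=224):
    # cams: (10, crop_dim, crop_dim) array of per-crop CAMs
    acc = np.zeros((image_dim, image_dim), dtype=np.float32)
    cnt = np.zeros((image_dim, image_dim), dtype=np.float32)
    c = (image_dim - crop_dim) // 2
    origins = [(i, j) for i in (0, image_dim - crop_dim)
                      for j in (0, image_dim - crop_dim)] + [(c, c)]
    for n, (i, j) in enumerate(origins):
        acc[i:i+crop_dim, j:j+crop_dim] += cams[n]
        acc[i:i+crop_dim, j:j+crop_dim] += cams[n+5][:, ::-1]  # undo the mirror
        cnt[i:i+crop_dim, j:j+crop_dim] += 2
    return acc / cnt  # every pixel is covered by at least one crop

The merged map is then resized to the original image size before running the bbox generator, as the comments below describe.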
--------------------------------------------------------------------------------
/py_generate_bbox.py:
--------------------------------------------------------------------------------
1 | ## Here is the code to generate the bounding box from the heatmap
2 | #
3 | # To reproduce the ILSVRC localization result, first generate the heatmap for
4 | # each testing image by merging the heatmaps from the 10 crops (exactly what
5 | # the demo code does), then resize the merged heatmap back to the original
6 | # size of that image. Then run this bbox generator on the resized heatmap.
7 | #
8 | # The source code of the bbox generator is also released. You may need to
9 | # install a compatible version of OpenCV to compile it.
10 | #
11 | # Special thanks to Hui Li for helping on this code.
12 | #
13 | # Bolei Zhou, April 19, 2016
14 | 
15 | import os
16 | import numpy as np
17 | import cv2
18 | from py_map2jpg import py_map2jpg
19 | 
20 | def im2double(im):
21 |     return cv2.normalize(im.astype('float'), None, 0.0, 1.0, cv2.NORM_MINMAX)
22 | 
23 | bbox_threshold = [20, 100, 110] # parameters for the bbox generator
24 | curParaThreshold = str(bbox_threshold[0])+' '+str(bbox_threshold[1])+' '+str(bbox_threshold[2])+' '
25 | curHeatMapFile = 'bboxgenerator/heatmap_6.jpg'
26 | curImgFile = 'bboxgenerator/sample_6.jpg'
27 | curBBoxFile = 'bboxgenerator/heatmap_6.txt'
28 | 
29 | os.system("bboxgenerator/./dt_box "+curHeatMapFile+' '+curParaThreshold+' '+curBBoxFile)
30 | 
31 | with open(curBBoxFile) as f:
32 |     for line in f:
33 |         items = [int(x) for x in line.strip().split()]  # dt_box writes all boxes as "x y w h" quadruples on a single line
34 | 
35 | boxData1 = np.array(items[0::4]).T
36 | boxData2 = np.array(items[1::4]).T
37 | boxData3 = np.array(items[2::4]).T
38 | boxData4 = np.array(items[3::4]).T
39 | 
40 | boxData_formulate = np.array([boxData1, boxData2, boxData1+boxData3, boxData2+boxData4]).T
41 | 
42 | col1 = np.min(np.array([boxData_formulate[:,0], boxData_formulate[:,2]]), axis=0)
43 | col2 = np.min(np.array([boxData_formulate[:,1], boxData_formulate[:,3]]), axis=0)
44 | col3 = np.max(np.array([boxData_formulate[:,0], boxData_formulate[:,2]]), axis=0)
45 | col4 = np.max(np.array([boxData_formulate[:,1], boxData_formulate[:,3]]), axis=0)
46 | 
47 | boxData_formulate = np.array([col1, col2, col3, col4]).T
48 | 
49 | curHeatMap = cv2.imread(curHeatMapFile)
50 | curImg = cv2.imread(curImgFile)
51 | 
52 | curHeatMap = im2double(curHeatMap)
53 | curHeatMap = py_map2jpg(curHeatMap, None, 'jet')
54 | curHeatMap = im2double(curImg)*0.2+im2double(curHeatMap)*0.7
55 | 
56 | for i in range(boxData_formulate.shape[0]): # for each bbox
57 |     print(boxData_formulate[i][:2])
58 |     print(boxData_formulate[i][2:])
59 |     cv2.rectangle(curHeatMap, tuple(boxData_formulate[i][:2]), tuple(boxData_formulate[i][2:]), (255,0,0), 3)
60 | cv2.imshow('bbox', curHeatMap)
61 | cv2.waitKey(0)
--------------------------------------------------------------------------------
/py_map2jpg.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cv2
3 | 
4 | def py_map2jpg(imgmap, rang, colorMap):
5 |     if rang is None:
6 |         rang = [np.min(imgmap), np.max(imgmap)]  # kept for parity with map2jpg.m; imgmap is assumed pre-scaled to [0, 1]
7 | 
8 |     heatmap_x = np.round(imgmap*255).astype(np.uint8)
9 | 
10 |     return cv2.applyColorMap(heatmap_x, cv2.COLORMAP_JET)  # the colorMap argument is ignored; JET is always applied
--------------------------------------------------------------------------------
/py_returnCAMmap.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | 
3 | def py_returnCAMmap(activation, weights_LR):
4 |     print(activation.shape)
5 | 
6 |     if activation.shape[0] == 1: # only one image
7 |         n_feat, w, h = activation[0].shape  # caffe layout is (channels, height, width), so 'w' here is actually height
8 |         act_vec = np.reshape(activation[0], [n_feat, w*h])
9 |         n_top = weights_LR.shape[0]
10 |         out = np.zeros([w, h, n_top])
11 | 
12 |         for t in range(n_top):
13 |             weights_vec = np.reshape(weights_LR[t], [1, weights_LR[t].shape[0]])
14 |             heatmap_vec = np.dot(weights_vec,act_vec)
15 |             heatmap = np.reshape( np.squeeze(heatmap_vec) , [w, h])
16 |             out[:,:,t] = heatmap
17 |     else: # 10 images (over-sampling)
18 |         raise Exception('Not implemented')
19 | 
20 |     return out
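A quick shape check for py_returnCAMmap with random stand-in arrays (illustrative only; the dimensions match the GoogLeNet-CAM setup used in py_demo.py):

import numpy as np
from py_returnCAMmap import py_returnCAMmap

activation = np.random.rand(1, 1024, 14, 14)  # CAM_conv activations for one image
weights_LR = np.random.rand(5, 1024)          # CAM_fc rows for the top-5 classes
cams = py_returnCAMmap(activation, weights_LR)
print(cams.shape)  # (14, 14, 5): one coarse activation map per class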
--------------------------------------------------------------------------------
/returnCAMmap.m:
--------------------------------------------------------------------------------
1 | function [curColumnMap] = returnCAMmap( featureObjectSwitchSpatial, weights_LR)
2 | %RETURNCAMMAP Compute class activation maps by projecting the class weights
3 | %   in weights_LR onto the spatial feature maps; handles a single image or a 4-D batch (e.g. the 10 oversampled crops).
4 | 
5 | if size(featureObjectSwitchSpatial,4) ==1
6 | 
7 |     featureObjectSwitchSpatial_vectorized = reshape(featureObjectSwitchSpatial,[size(featureObjectSwitchSpatial,1)*size(featureObjectSwitchSpatial,2) size(featureObjectSwitchSpatial,3)]);
8 |     detectionMap = featureObjectSwitchSpatial_vectorized*weights_LR;
9 |     curColumnMap = reshape(detectionMap,[size(featureObjectSwitchSpatial,1),size(featureObjectSwitchSpatial,2), size(weights_LR,2)]);
10 | else
11 |     columnSet = zeros(size(featureObjectSwitchSpatial,1),size(featureObjectSwitchSpatial,2),size(weights_LR,2),size(featureObjectSwitchSpatial,4));
12 |     for i=1:size(featureObjectSwitchSpatial,4)
13 |         curFeatureObjectSwitchSpatial = squeeze(featureObjectSwitchSpatial(:,:,:,i));
14 |         featureObjectSwitchSpatial_vectorized = reshape(curFeatureObjectSwitchSpatial,[size(curFeatureObjectSwitchSpatial,1)*size(curFeatureObjectSwitchSpatial,2) size(curFeatureObjectSwitchSpatial,3)]);
15 |         detectionMap = featureObjectSwitchSpatial_vectorized*weights_LR;
16 |         curColumnMap = reshape(detectionMap,[size(featureObjectSwitchSpatial,1),size(featureObjectSwitchSpatial,2), size(weights_LR,2)]);
17 |         columnSet(:,:,:,i) = curColumnMap;
18 |     end
19 |     curColumnMap = columnSet;
20 | end
21 | 
22 | 
23 | 
24 | end
25 | 
26 | 
--------------------------------------------------------------------------------
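The dt_box binary under bboxgenerator/ is what both bbox scripts shell out to. When compiling it is not an option, a rough pure-Python stand-in can approximate its output by thresholding the heatmap and taking bounding boxes of the connected components; this is only a sketch of the idea, not the dt_box algorithm itself, and the 20% threshold is a guess rather than a tuned value:

import cv2
import numpy as np

def approx_bboxes(heatmap_gray, thresh=0.2):
    # heatmap_gray: 2-D float heatmap; boxes come back as (x, y, w, h) like dt_box
    mask = (heatmap_gray >= thresh * heatmap_gray.max()).astype(np.uint8)
    res = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = res[-2]  # OpenCV 3 returns 3 values, OpenCV 4 returns 2
    boxes = [cv2.boundingRect(c) for c in contours]
    return sorted(boxes, key=lambda b: b[2] * b[3], reverse=True)  # largest first

boxes = approx_bboxes(cv2.imread('bboxgenerator/heatmap_6.jpg', 0) / 255.0)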