├── .gitignore ├── .gitmodules ├── README.md ├── boxinclusion.m ├── boxoverlap.m ├── boxsuppress.m ├── detect.m ├── detectAtMultipleScales.m ├── doc ├── images │ ├── cover.idraw │ └── cover.jpeg ├── instructions.html └── instructions.md ├── evalDetections.m ├── evaluateModel.m ├── exercise1.m ├── exercise2.m ├── exercise3.m ├── exercise4.m ├── exercise5.m ├── extra ├── Makefile ├── download.sh ├── prepareLabData.m └── signs-sample-image.jpg ├── extract.m ├── loadData.m └── setup.m /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | vlfeat 3 | doc/base.css 4 | doc/prism.css 5 | doc/prism.js 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "matconvnet"] 2 | path = matconvnet 3 | url = ssh://git@bitbucket.org/ovl/matconvnet.git 4 | [submodule "extra/practical"] 5 | path = extra/practical 6 | url = git@github.com:vedaldi/practical.git 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Object category detection practical 2 | =================================== 3 | 4 | > A computer vision practical by the Oxford Visual Geometry group, 5 | > authored by Andrea Vedaldi and Andrew Zisserman. 6 | 7 | Start from `doc/instructions.html`. 8 | 9 | Package contents 10 | ---------------- 11 | 12 | The practical consists of four exercises, organized in the following 13 | files: 14 | 15 | * `exercise1.m` -- Part 1: Detection fundamentals 16 | * `exercise2.m` -- Part 2: Multiple scales and learning with an SVM 17 | * `exercise3.m` -- Part 3: Multiple objects and evaluation 18 | * `exercise4.m` -- Part 4: Hard negative mining 19 | * `exercise5.m` -- Part 5: Train your own object detector 20 | 21 | The practical runs in MATLAB and uses 22 | [MatConvNet](http://www.vlfeat.org/matconvnet) and 23 | [VLFeat](http://www.vlfeat.org). This package contains the following 24 | MATLAB functions: 25 | 26 | * `boxinclusion.m`: compute the inclusion of bounding boxes. 27 | * `boxoverlap.m`: compute the overlap of bounding boxes. 28 | * `boxsuppress.m`: non-maxima box suppression. 29 | * `detect.m`: sliding window detector. 30 | * `detectAtMultipleScales.m`: an intermediate example detector. 31 | * `evalDetections.m`: evaluate detections using the PASCAL VOC criterion. 32 | * `evaluateModel.m`: evaluate a detector against a database of images. 33 | * `extract.m`: extract HOG features from bounding boxes. 34 | * `loadData.m`: load practical data. 35 | * `setup.m`: setup MATLAB environment. 36 | 37 | Appendix: Installing from scratch 38 | --------------------------------- 39 | 40 | The practical requires both VLFeat and MatConvNet. VLFeat comes with 41 | pre-built binaries, but MatConvNet does not. 42 | 43 | 1. From Bash, run `./extras/download.sh`. This will download the 44 | German Street Sign Benchmark data and VLFeat. 45 | 2. From MATLAB, run `addpath extras ; prepareLabData.m`. 
46 | 47 | Changes 48 | ------- 49 | 50 | * *2014a* - Initial edition 51 | 52 | License 53 | ------- 54 | 55 | Copyright (c) 2011-13 Andrea Vedaldi 56 | 57 | Permission is hereby granted, free of charge, to any person 58 | obtaining a copy of this software and associated documentation 59 | files (the "Software"), to deal in the Software without 60 | restriction, including without limitation the rights to use, copy, 61 | modify, merge, publish, distribute, sublicense, and/or sell copies 62 | of the Software, and to permit persons to whom the Software is 63 | furnished to do so, subject to the following conditions: 64 | 65 | The above copyright notice and this permission notice shall be 66 | included in all copies or substantial portions of the Software. 67 | 68 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 69 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 70 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 71 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 72 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 73 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 74 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 75 | DEALINGS IN THE SOFTWARE. 76 | -------------------------------------------------------------------------------- /boxinclusion.m: -------------------------------------------------------------------------------- 1 | function dist = calcBoxInclusion(A, B, varargin) 2 | % GETBOXOVERLAP 3 | % A and B have a box for each column, in the format [xmin ymin xmax 4 | % ymax]. The resulting matrix dist has A's boxes along the rows 5 | % and B's boxes along the columns and contains the percentage of 6 | % the area of each box B contained in the box A. 7 | % 8 | % Author:: Andrea Vedaldi 9 | 10 | % AUTORIGHTS 11 | % Copyright (C) 2008-09 Andrea Vedaldi 12 | % 13 | % This file is part of VGG MKL classification and detection code, 14 | % available in the terms of the GNU General Public License version 2. 15 | 16 | opts.pascalFormat = false ; 17 | opts = vl_argparse(opts, varargin) ; 18 | 19 | m = size(A,2) ; 20 | n = size(B,2) ; 21 | O = [] ; 22 | 23 | if m==0 || n==0, dist = zeros(m,n) ; return ; end 24 | 25 | om = ones(1,m) ; 26 | on = ones(1,n) ; 27 | 28 | if opts.pascalFormat 29 | A(3:4,:) = A(3:4,:) + 1 ; 30 | B(3:4,:) = B(3:4,:) + 1 ; 31 | end 32 | 33 | % find length Ox of the overlap range [x1, x2] along x 34 | % x1 cannot be smaller than A.xmin B.xmin 35 | % x2 cannot be larger than A.xmax B.xmax 36 | % Ox is x2 - x1 or 0 37 | 38 | x1 = max(A(1*on,:)', B(1*om,:)) ; 39 | x2 = min(A(3*on,:)', B(3*om,:)) ; 40 | Ox = max(x2 - x1, 0) ; 41 | 42 | y1 = max(A(2*on,:)', B(2*om,:)) ; 43 | y2 = min(A(4*on,:)', B(4*om,:)) ; 44 | Oy = max(y2 - y1, 0) ; 45 | 46 | % are of the intersection 47 | areaInt = Ox .* Oy ; 48 | 49 | % area of the union is sum of areas - inersection 50 | areaA = prod(A(3:4,:) - A(1:2,:)) ; 51 | areaB = prod(B(3:4,:) - B(1:2,:)) ; 52 | 53 | % final distance matrix 54 | dist = areaInt ./ (areaB(om,:) + eps) ; 55 | 56 | -------------------------------------------------------------------------------- /boxoverlap.m: -------------------------------------------------------------------------------- 1 | function dist = calcBoxOverlap(A, B, varargin) 2 | % GETBOXOVERLAP 3 | % A and B have a box for each column, in the format [xmin ymin xmax 4 | % ymax]. The resulting matrix dist has A's boxes along the rows 5 | % and B's boxes along the columns. 
6 | % 7 | % Options: 8 | % 9 | % pascalFormat:: false 10 | % If set to TRUE, then the boxes are assumed to be specified in 11 | % the PASCAL format. In this case the coordinates are indeces of 12 | % the upper-left and bottom-right pixels, not the coordinates of 13 | % 2-D points. The difference is that in the former case the area 14 | % of the box includes the pixels that belongs to the boundary. For 15 | % instance the box [1;1;1;1] has area 1 according to the PASCAL 16 | % convention, and area 0 according to the default convention. 17 | % 18 | % Author:: Andrea Vedaldi 19 | 20 | % AUTORIGHTS 21 | % Copyright (C) 2008-09 Andrea Vedaldi 22 | % 23 | % This file is part of the VGG MKL Class and VGG MKL Det code packages, 24 | % available in the terms of the GNU General Public License version 2. 25 | 26 | opts.pascalFormat = false ; 27 | opts = vl_argparse(opts, varargin) ; 28 | 29 | m = size(A,2) ; 30 | n = size(B,2) ; 31 | O = [] ; 32 | 33 | if m==0 || n==0, dist = zeros(m,n) ; return ; end 34 | 35 | om = ones(1,m) ; 36 | on = ones(1,n) ; 37 | 38 | if opts.pascalFormat 39 | A(3:4,:) = A(3:4,:) + 1 ; 40 | B(3:4,:) = B(3:4,:) + 1 ; 41 | end 42 | 43 | % find length Ox of the overlap range [x1, x2] along x 44 | % x1 cannot be smaller than A.xmin B.xmin 45 | % x2 cannot be larger than A.xmax B.xmax 46 | % Ox is x2 - x1 or 0 47 | 48 | x1 = max(A(1*on,:)', B(1*om,:)) ; 49 | x2 = min(A(3*on,:)', B(3*om,:)) ; 50 | Ox = max(x2 - x1, 0) ; 51 | 52 | y1 = max(A(2*on,:)', B(2*om,:)) ; 53 | y2 = min(A(4*on,:)', B(4*om,:)) ; 54 | Oy = max(y2 - y1, 0) ; 55 | 56 | % are of the intersection, of A, and of B 57 | areaInt = Ox .* Oy ; 58 | areaA = prod(A(3:4,:) - A(1:2,:)) ; 59 | areaB = prod(B(3:4,:) - B(1:2,:)) ; 60 | 61 | % area of the union is sum of areas - inersection 62 | dist = areaInt ./ (areaA(on,:)' + areaB(om,:) - areaInt) ; 63 | -------------------------------------------------------------------------------- /boxsuppress.m: -------------------------------------------------------------------------------- 1 | function keep = boxsuppress(boxes, scores, threshold) 2 | % BOXSUPPRESS Box non-maxima suprression 3 | % KEEP = BOXSUPPRESS(BOXES, SCORES, THRESHOLD) 4 | 5 | % remove any empty box (xmax < xmin or ymax < ymin) 6 | scores(any([-1 0 1 0 ; 0 -1 0 1] * boxes < 0)) = -inf ; 7 | 8 | keep = false(1, size(boxes,2)) ; 9 | while true 10 | [score, best] = max(scores) ; 11 | if score == -inf, break ; end 12 | keep(best) = true ; 13 | remove = boxinclusion(boxes(:,best), boxes, 'pascalFormat', true) >= threshold ; 14 | scores(remove) = -inf ; 15 | scores(best) = -inf ; % `best` is not in `remove` if threshold > 1 16 | end 17 | -------------------------------------------------------------------------------- /detect.m: -------------------------------------------------------------------------------- 1 | function [detections,scores,hog] = detect(im, w, hogCellSize, scales) 2 | 3 | modelWidth = size(w, 2) ; 4 | modelHeight = size(w, 1) ; 5 | 6 | detections = {} ; 7 | scores = {} ; 8 | hog = {} ; 9 | 10 | for s = scales 11 | % scale image 12 | t = imresize(im, 1/s) ; 13 | 14 | % skip if too small 15 | if min([size(t,1), size(t,2)]) < 128, break ; end 16 | 17 | % extract HOG features 18 | hog{end+1} = vl_hog(t, hogCellSize) ; 19 | 20 | % convolve model 21 | sc = vl_nnconv(hog{end}, w, []) ; 22 | 23 | % get all detections 24 | [hy,hx] = ind2sub(size(sc), 1:numel(sc)) ; 25 | 26 | hx = hx(:)' ; 27 | hy = hy(:)' ; 28 | x = (hx - 1) * hogCellSize * s + 1 ; 29 | y = (hy - 1) * hogCellSize * s + 1 ; 30 | detections{end+1} 
= [... 31 | x - 0.5 ; 32 | y - 0.5 ; 33 | x + hogCellSize * modelWidth * s - 0.5 ; 34 | y + hogCellSize * modelHeight * s - 0.5 ;] ; 35 | scores{end+1} = sc(:)' ; 36 | end 37 | 38 | detections = cat(2, detections{:}) ; 39 | scores = cat(2, scores{:}) ; 40 | 41 | [~, perm] = sort(scores, 'descend') ; 42 | 43 | perm = perm(1:1000) ; 44 | scores = scores(perm) ; 45 | detections = detections(:, perm) ; 46 | -------------------------------------------------------------------------------- /detectAtMultipleScales.m: -------------------------------------------------------------------------------- 1 | function detection = detectAtMultipleScales(im, w, hogCellSize, scales) 2 | 3 | modelWidth = size(w, 2) ; 4 | modelHeight = size(w, 1) ; 5 | bestScore = -inf ; 6 | minScore = +inf ; 7 | maxScore = -inf ; 8 | h = [] ; 9 | 10 | for s = scales 11 | % scale image 12 | t = imresize(im, 1/s) ; 13 | 14 | % extract HOG features 15 | hog = vl_hog(t, hogCellSize) ; 16 | 17 | % convolve model 18 | scores = vl_nnconv(hog, w, []) ; 19 | 20 | % pick best response 21 | [score, index] = max(scores(:)) ; 22 | if score > bestScore 23 | bestScore = score ; 24 | [hy, hx] = ind2sub(size(scores), index) ; 25 | x = (hx - 1) * hogCellSize * s + 1 ; 26 | y = (hy - 1) * hogCellSize * s + 1 ; 27 | detection = [ 28 | x - 0.5 ; 29 | y - 0.5 ; 30 | x + hogCellSize * modelWidth * s - 0.5 ; 31 | y + hogCellSize * modelHeight * s - 0.5 ;] ; 32 | end 33 | 34 | % plot score map 35 | vl_tightsubplot(numel(scales),find(s==scales)) ; 36 | imagesc(scores) ; axis off square ; 37 | h(end+1) = gca; 38 | minScore = min([minScore;scores(:)]) ; 39 | maxScore = max([maxScore;scores(:)]) ; 40 | end 41 | 42 | set(h, 'clim', [minScore, maxScore]) ; 43 | 44 | 45 | -------------------------------------------------------------------------------- /doc/images/cover.idraw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vedaldi/practical-object-category-detection/046b6c030babe0f5e76e842abf61145136866308/doc/images/cover.idraw -------------------------------------------------------------------------------- /doc/images/cover.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vedaldi/practical-object-category-detection/046b6c030babe0f5e76e842abf61145136866308/doc/images/cover.jpeg -------------------------------------------------------------------------------- /doc/instructions.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | VGG Practical 7 | 8 | 9 | 10 | 11 |

Object category detection practical

12 |

This is an Oxford Visual Geometry Group computer vision practical, authored by Andrea Vedaldi and Andrew Zisserman (Release 2018a).

13 |

cover

14 |

The goal of object category detection is to identify and localize objects of a given type in an image. Example applications include detecting pedestrians, cars, or traffic signs in street scenes; objects of interest such as tools or animals in web images; or particular features in medical images. Given a target class, such as people, a detector receives as input an image and produces as output zero, one, or more bounding boxes around each occurrence of the object class in the image. The key challenge is that the detector needs to find objects regardless of their location and scale in the image, as well as pose and other variation factors, such as clothing, illumination, and occlusions.

15 |

This practical explores basic techniques in visual object detection, focusing on image based models. The appearance of image patches containing objects is learned using statistical analysis. Then, in order to detect objects in an image, the statistical model is applied to image windows extracted at all possible scales and locations, in order to identify which ones, if any, contain the object.

16 |

In more detail, the practical explores the following topics: (i) using HOG features to describe image regions; (ii) building a HOG-based sliding-window detector to localize objects in images; (iii) working with multiple scales and multiple object occurrences; (iv) using a linear support vector machine to learn the appearance of objects; (v) evaluating an object detector in terms of average precision; (vi) learning an object detector using hard negative mining.

17 |
18 | 59 |
60 |

Getting started

61 |

Read and understand the requirements and installation instructions. The download links for this practical are:

* Code and data: http://www.robots.ox.ac.uk/~vgg/share/practical-category-detection-2018a.tar.gz
* Code only: http://www.robots.ox.ac.uk/~vgg/share/practical-category-detection-2018a-code-only.tar.gz
* Data only: http://www.robots.ox.ac.uk/~vgg/share/practical-category-detection-2018a-data-only.tar.gz
* Git repository (for lab setters and developers): https://github.com/vedaldi/practical-object-category-detection

After the installation is complete, open and edit the script exercise1.m in the MATLAB editor. The script contains commented code and a description for all steps of this exercise, relative to Part I of this document. You can cut and paste this code into the MATLAB window to run it, and will need to modify it as you go through the session. The other files exercise2.m, exercise3.m, and exercise4.m are given for Parts II, III, and IV.

69 |

Each part contains several Questions and Tasks to be answered/completed before proceeding further in the practical.

70 |

Part 1: Detection fundamentals

71 |

Parts I--IV use as a running example the problem of street sign detection, using the data from the German Traffic Sign Detection Benchmark. This data consists of a number of example traffic images, as well as a number of larger test images containing one or more traffic signs at different sizes and locations. It also comes with ground truth annotation, i.e. with specified bounding boxes and sign labels for each sign occurrence, which is required to evaluate the quality of the detector.

72 |

In this part we will build a basic sliding-window object detector based on HOG features. Follow the steps below:

73 |

Step 1.0: Loading the training data

74 |

The MATLAB m-file loadData.m loads the data for the practical into memory. The function loadData(targetClass) takes a targetClass argument specifying the object class of interest. Open the exercise1.m file, select the following part of the code, and execute it in MATLAB (right button > Evaluate selection or Shift+F7).

75 |
% Load the training and testing data (trainImages, trainBoxes, ...)
76 | % The function takes the ID of the type of traffic sign we want to recognize
 77 | % 1 is the 30 km/h speed limit
 78 | loadData(1) ;
 79 | 
80 | 81 |

This loads into the current workspace the following variables:

* trainImages: a list of train image names.
* trainBoxes: a $4\times N$ array of object bounding boxes, in the form $[x_\text{min},y_\text{min},x_\text{max},y_\text{max}]$.
* trainBoxImages: for each bounding box, the name of the image containing it.
* trainBoxLabels: for each bounding box, the object label. It is an index into targetClass.
* trainBoxPatches: a $64 \times 64 \times 3 \times N$ array of image patches, one for each training object. Patches are in RGB format.

An analogous set of variables testImages, testBoxes, and so on is provided for the test data. Familiarise yourself with the contents of these variables.

90 |
91 |

Question: why are there both a trainImages and a trainBoxImages variable?

92 |
93 |

Step 1.1: Visualize the training images

94 |

Select now the part of the code related to section 1.1 and execute it. This will create an image visualizing both the complete list of object training examples and their average.

95 |
96 |

Question: what can you deduce about the object variability from the average image?

97 |

Question: most boxes extend slightly around the object extent. Why do you think this may be valuable in learning a detector?

98 |
99 |

Step 1.2: Extract HOG features from the training images

100 |

Object detectors usually work on top of a layer of low-level features. In this case, we use HOG (Histogram of Oriented Gradients) features. In order to learn a model of the object, we start by extracting features from the image patches corresponding to the available training examples. This is done by the following for loop:

101 |
hogCellSize = 8 ;
102 | trainHog = {} ;
103 | for i = 1:size(trainBoxPatches,4)
104 |   trainHog{i} = vl_hog(trainBoxPatches(:,:,:,i), hogCellSize) ;
105 | end
106 | trainHog = cat(4, trainHog{:}) ;
107 | 
108 | 109 |

HOG is computed by the VLFeat function vl_hog (doc). This function takes as a parameter the size in pixels of each HOG cell hogCellSize. It also takes an RGB image, represented in MATLAB as a $w \times h \times 3$ array (extracted as a slice of trainBoxPatches). The output is a $w/\mathtt{hogCellSize} \times h/\mathtt{hogCellSize} \times 31$ dimensional array. One such array is extracted for each example image, and eventually these are concatenated into a 4D array along the fourth dimension.

110 |
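For concreteness: the training patches loaded here are 64×64 pixels (see trainBoxPatches above), so with hogCellSize = 8 each HOG array is 8×8×31, and trainHog is 8×8×31×N. A quick sanity check (a sketch; the value of N depends on the loaded class):

disp(size(trainBoxPatches))   % e.g. [64 64 3 N]
disp(size(trainHog))          % e.g. [8 8 31 N]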

Step 1.3: Learn a simple HOG template model

111 |

A very basic object model can be obtained by averaging the features of the example objects. This is done by:

112 |
w = mean(trainHog, 4) ;
113 | 
114 | 115 |

The model can be visualized by rendering w as if it was a HOG feature array. This can be done using the render option of vl_hog:

116 |
figure(2) ; clf ;
117 | imagesc(vl_hog('render', w)) ;
118 | 
119 | 120 |

Spend some time to study this plot and make sure you understand what is visualized.

121 |
122 |

Question: Can you make sense of the resulting plot?

123 |
124 |

Step 1.4: Apply the model to a test image

125 |

The model is matched to a test image by: (i) extracting the HOG features of the image and (ii) convolving the model over the resulting feature map:

126 |
im = imread('data/signs-sample-image.jpg') ;
127 | im = im2single(im) ;
128 | hog = vl_hog(im, hogCellSize) ;
129 | scores = vl_nnconv(hog, w, []) ;
130 | 
131 | 132 |

The first two lines read a sample image and convert it to single format. The third line computes the HOG features of the image using the vl_hog function seen above. The fourth line convolves the HOG map hog with the model w. It uses the function vl_nnconv (see footnote 1 at the end of this document) and returns a scores map.

133 |
134 |

Task: Work out the dimension of the scores arrays. Then, check your result with the dimension of the array computed by MATLAB.

135 |

Question: Visualize the image im and the scores array using the provided example code. Does the result match your expectations?

136 |
137 |

Step 1.5: Extract the top detection

138 |

Now that the model has been applied to the image, we have a response map scores. To extract a detection from this, we (i) find the maximum response and (ii) compute the bounding box of the image patch containing the corresponding HOG features. The maximum is found by:

139 |
[best, bestIndex] = max(scores(:)) ;
140 | 
141 | 142 |

Note that bestIndex is a linear index in the range $[1, M]$ where $M$ is the number of possible filter locations. We convert this into a subscript $(h_x,h_y)$ using MATLAB ind2sub function:

143 |
[hy, hx] = ind2sub(size(scores), bestIndex) ;
144 | 
145 | 146 |

$(h_x,h_y)$ are in units of HOG cells. We convert this into pixel coordinates as follows:

147 |
x = (hx - 1) * hogCellSize + 1 ;
148 | y = (hy - 1) * hogCellSize + 1 ;
149 | 
150 | 151 |
152 |

Question: Why do we subtract 1 and then add 1 in these expressions? Which pixel $(x,y)$ of the HOG cell $(h_x,h_y)$ is found?

153 |
154 |

The size of the model template in HOG cells can be computed in several ways; one is simply:

155 |
modelWidth = size(trainHog, 2) ;
156 | modelHeight = size(trainHog, 1) ;
157 | 
158 | 159 |

Now we have enough information to compute the bounding box as follows:

160 |
detection = [
161 |   x - 0.5 ;
162 |   y - 0.5 ;
163 |   x + hogCellSize * modelWidth - 0.5 ;
164 |   y + hogCellSize * modelHeight - 0.5 ;] ;
165 | 
166 | 167 |

Note: the bounding box encloses exactly all the pixels of the HOG template. In MATLAB, pixel centers have integer coordinates and pixel borders are at a distance $\pm1/2$.

168 |
169 |

Question: Use the example code to plot the image and overlay the bounding box of the detected object. Did it work as expected?

170 |
171 |

Part 2: Multiple scales and learning with an SVM

172 |

In this second part, we will: (i) extend the detector to search objects at multiple scales and (ii) learn a better model using a support vector machine. Let's start by loading the data as needed:

173 |
setup ;
174 | targetClass = 'mandatory' ;
175 | loadData(targetClass) ;
176 | 
177 | 178 |

The mandatory target class is simply the union of all mandatory traffic signs.

179 |

Step 2.1: Multi-scale detection

180 |

Objects appear in images at sizes that differ from that of the learned template. In order to find objects of all sizes, we scale the image up and down and search for the object over and over again.

181 |

The set of searched scales is defined as follows:

182 |
% Scale space configuration
183 | minScale = -1 ;
184 | maxScale = 3 ;
185 | numOctaveSubdivisions = 3 ;
186 | scales = 2.^linspace(...
187 |   minScale,...
188 |   maxScale,...
189 |   numOctaveSubdivisions*(maxScale-minScale+1)) ;
190 | 
191 | 192 |
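For example, with the values above the detector visits 15 scales, spaced uniformly in log-space from $2^{-1} = 0.5$ to $2^{3} = 8$. Since the image is resized by a factor 1/s, scale s = 0.5 matches objects about half the template size, while s = 8 matches objects about eight times larger.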

Given the model w, as determined in Part I, we use the function detectAtMultipleScales in order to search for the object at multiple scales:

193 |
detection = detectAtMultipleScales(im, w, hogCellSize, scales) ;
194 | 
195 | 196 |

Note that the function generates a figure as it runs, so prepare a new figure before running it using the figure command if you do not want your current figure to be deleted.

197 |
198 |

Question: Open and study the detectAtMultipleScales function. Convince yourself that it is the same code as before, but operated after rescaling the image a number of times.

199 |

Question: Visualize the resulting detection using the supplied example code. Did it work? If not, can you make sense of the errors?

200 |

Question: Look at the array of scores maps generated by detectAtMultipleScales using the example code. Do they make sense? Is there anything wrong?

201 |
202 |

Step 2.2: Collect positive and negative training data

203 |

The model learned so far is too weak to work well. It is now time to use an SVM to learn a better one. In order to do so, we need to prepare suitable data. We already have positive examples (features extracted from object patches):

204 |
% Collect positive training data
205 | pos = trainHog ;
206 | 
207 | 208 |

In order to collect negative examples (features extracted from non-object patches), we loop through a number of training images and sample patches uniformly:

209 |
210 |
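To give the idea, here is a minimal sketch of such uniform sampling (illustrative only: the data path, the number of patches per image, and the variable names are assumptions; the actual code is in exercise2.m):

neg = {} ;
for i = 1:numel(trainImages)
  t = im2single(imread(fullfile('data', trainImages{i}))) ;
  hog = vl_hog(t, hogCellSize) ;
  % all possible top-left HOG positions of a model-sized window
  width = size(hog,2) - modelWidth + 1 ;
  height = size(hog,1) - modelHeight + 1 ;
  % sample a handful of windows uniformly
  for j = vl_colsubset(1:width*height, 10, 'uniform')
    [hy, hx] = ind2sub([height width], j) ;
    neg{end+1} = hog(hy:hy+modelHeight-1, hx:hx+modelWidth-1, :) ;
  end
end
neg = cat(4, neg{:}) ;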

Task: Identify the code that extracts these patches in exercise2.m and make sure you understand it.

211 |

Question: How many negative examples are we collecting?

212 |
213 |

Step 2.3: Learn a model with an SVM

214 |

Now that we have the data, we can learn an SVM model. To this end we will use the vl_svmtrain function. This function requires the data to be in a $D \times N$ matrix, where $D$ is the feature dimension and $N$ the number of training points. This is done as follows (numPos and numNeg are defined in a later snippet below):

215 |
% Pack the data into a matrix with one datum per column
216 | x = cat(4, pos, neg) ;
217 | x = reshape(x, [], numPos + numNeg) ;
218 | 
219 | 220 |

We also need a vector of binary labels, +1 for positive points and -1 for negative ones:

221 |
% Create a vector of binary labels
222 | y = [ones(1, size(pos,4)) -ones(1, size(neg,4))] ;
223 | 
224 | 225 |

Finally, we need to set the parameter $\lambda$ of the SVM solver. For reasons that will become clearer later, we use instead the equivalent $C$ parameter:

226 |
numPos = size(pos,4) ;
227 | numNeg = size(neg,4) ;
228 | C = 10 ;
229 | lambda = 1 / (C * (numPos + numNeg)) ;
230 | 
231 | 232 |

Learning the SVM is then a one-liner:

233 |
% Learn the SVM using an SVM solver
234 | w = vl_svmtrain(x,y,lambda,'epsilon',0.01,'verbose') ;
235 | 
236 | 237 |
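Note that vl_svmtrain returns w as a $D \times 1$ vector, with $D = \mathtt{modelHeight} \times \mathtt{modelWidth} \times 31$ here. Before it can be rendered or convolved like the template of Part 1, it must be reshaped back into a HOG template; a minimal sketch, assuming modelHeight and modelWidth as in Step 1.5:

% Reshape the SVM weight vector back into a HOG template
w = single(reshape(w, modelHeight, modelWidth, 31)) ;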
238 |

Question: Visualize the learned model w using the supplied code. Does it differ from the naive model learned before? How?

239 |
240 |

Step 2.4: Evaluate the learned model

241 |

Use the detectAtMultipleScales seen above to evaluate the new SVM-based model.

242 |
243 |

Question: Does the learned model perform better than the naive average?

244 |

Task: Try different images. Does this detector work all the time? If not, what types of mistakes do you see? Are these mistakes reasonable?

245 |
246 |

Part 3: Multiple objects and evaluation

247 |

Step 3.1: Multiple detections

248 |

Detecting at multiple scales is insufficient: we must also allow for more than one object occurrence in the image. In order to do so, the package includes a suitable detect function. This function is similar to detectAtMultipleScales, but it returns the top 1000 detector responses rather than just the top one:

249 |
% Compute detections
250 | [detections, scores] = detect(im, w, hogCellSize, scales) ;
251 | 
252 | 253 |
254 |

Task: Open and study detect.m. Make sure that you understand how it works.

255 |

Question: Why do we want to return so many responses? In practice, it is unlikely that more than a handful of object occurrences may be contained in any given image...

256 |
257 |

A single object occurrence generates multiple detector responses at nearby image locations and scales. In order to eliminate these redundant detections, we use a non-maximum suppression algorithm. This is implemented by the boxsuppress.m MATLAB m-file. The algorithm is simple: start from the highest-scoring detection, then remove any other detection whose overlap with it (in the sense defined in Step 3.2 below) is greater than a threshold. The function returns a boolean vector keep of detections to preserve:

258 |
% Non-maximum suppression
259 | keep = boxsuppress(detections, scores, 0.25) ;
260 | 
261 | detections = detections(:, keep) ;
262 | scores = scores(keep) ;
263 | 
264 | 265 |

For efficiency, after non-maximum suppression we keep just ten responses (as we do not expect more than a few objects in any image):

266 |
% Further keep only top detections
267 | detections = detections(:, 1:10) ;
268 | scores = scores(1:10) ;
269 | 
270 | 271 |

Step 3.2: Detector evaluation

272 |

We are now going to look at properly evaluating our detector. We use the PASCAL VOC criterion, computing Average Precision (AP). Consider a test image containing a number of ground truth object occurrences $(g_1,\dots,g_m)$ and a list $(b_1,s_1),\dots,(b_n,s_n)$ of candidate detections $b_i$ with score $s_i$. The following algorithm converts this data into a list of labels and scores $(s_i,y_i)$ that can be used to compute a precision-recall curve, for example using VLFeat vl_pr function. The algorithm, implemented by evalDetections.m, is as follows:

273 |
1. Assign each candidate detection $(b_i,s_i)$ a true or false label $y_i \in \{+1,-1\}$. To do so:
    1. The candidate detections $(b_i,s_i)$ are sorted by decreasing score $s_i$.
    2. For each candidate detection in order:
        a. If there is a matching ground truth detection $g_j$ ($\operatorname{overlap}(b_i,g_j)$ larger than 50%), the candidate detection is considered positive ($y_i=+1$). Furthermore, the ground truth detection is removed from the list and not considered further.
        b. Otherwise, the candidate detection is negative ($y_i=-1$).
2. Add each ground truth object $g_j$ that is still unassigned to the list of candidates as a pair $(g_j, -\infty)$ with label $y_j=+1$.

The overlap metric used to compare a candidate detection to a ground truth bounding box is defined as the ratio of the area of the intersection over the area of the union of the two bounding boxes:
$$
\operatorname{overlap}(A,B) = \frac{|A\cap B|}{|A \cup B|}.
$$

288 |
289 |

Questions:

* Why are ground truth detections removed after being matched?
* What happens if an object is detected twice?
* Can you explain why unassigned ground-truth objects are added to the list of candidates with $-\infty$ score?
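The following MATLAB sketch summarizes the greedy matching procedure (illustrative only; the actual implementation is evalDetections.m, and the overlap is computed by boxoverlap.m):

[~, order] = sort(scores, 'descend') ;
labels = -ones(1, numel(scores)) ;
assigned = false(1, size(gtBoxes,2)) ;
for i = order
  % overlap of this detection with each remaining ground truth box
  ov = boxoverlap(detections(:,i), gtBoxes) ;
  ov(assigned) = 0 ;
  [o, j] = max(ov) ;
  if ~isempty(o) && o > 0.5
    labels(i) = +1 ;      % true positive
    assigned(j) = true ;  % remove this ground truth from the list
  end
end
% unmatched ground truth objects are added as missed detections
scores = [scores, -inf(1, sum(~assigned))] ;
labels = [labels, ones(1, sum(~assigned))] ;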
296 |

In order to apply this algorithm, we first need to find the ground truth bounding boxes in the test image:

297 |
% Find all the objects in the target image
298 | s = find(strcmp(testImages{1}, testBoxImages)) ;
299 | gtBoxes = testBoxes(:, s) ;
300 | 
301 | 302 |

Then evalDetections can be used:

303 |
% No example is considered difficult
304 | gtDifficult = false(1, numel(s)) ;
305 | 
306 | % PASCAL-like evaluation
307 | matches = evalDetections(...
308 |   gtBoxes, gtDifficult, ...
309 |   detections, scores) ;
310 | 
311 | 312 |

The gtDifficult flags can be used to mark some ground truth object occurrences as difficult, so that they are ignored in the evaluation. This is used in the PASCAL VOC challenge, but not here (i.e. no object occurrence is considered difficult).

313 |

evalDetections returns a matches structure with several fields. We focus here on matches.detBoxFlags: this contains a +1 for each detection that was found to be correct and -1 otherwise. We use this to visualize the detection errors:

314 |
% Visualization
315 | figure(1) ; clf ;
316 | imagesc(im) ; axis equal ; hold on ;
317 | vl_plotbox(detections(:, matches.detBoxFlags==+1), 'g', 'linewidth', 2) ;
318 | vl_plotbox(detections(:, matches.detBoxFlags==-1), 'r', 'linewidth', 2) ;
319 | vl_plotbox(gtBoxes, 'b', 'linewidth', 1) ;
320 | axis off ;
321 | 
322 | 323 |
324 |

Task: Use the supplied example code to evaluate the detector on one image. Look carefully at the output and convince yourself that it makes sense.

325 |
326 |

Now plot the PR curve:

327 |
figure(2) ; clf ;
328 | vl_pr(matches.labels, matches.scores) ;
329 | 
330 | 331 |
332 |

Question: There are a large number of errors in each image. Should you worry? In what manner is the PR curve affected? How would you eliminate the vast majority of these errors in practice?

333 |
334 |

Step 3.3: Evaluation on multiple images

335 |

Evaluation is typically done on multiple images rather than just one. This is implemented by the evaluateModel.m m-file.

336 |
337 |

Task: Open evaluateModel.m and make sure you understand the main steps of the evaluation procedure.

338 |
339 |

Use the supplied example code to run the evaluation on the entire test set:

340 |
matches = evaluateModel(testImages, testBoxes, testBoxImages, ...
341 |   w, hogCellSize, scales) ;
342 | 
343 | 344 |

Note: The function processes one image at a time, visualizing the results as it progresses. The PR curve is the result of the accumulation of the detections obtained thus far.

345 |
346 |

Task: Open the evaluateModel.m file in MATLAB and add a breakpoint right at the end of the for loop. Now run the evaluation code again and look at each image individually (use dbcont to go to the next image). Check the correct and incorrect matches in each image and their ranking, and the effect of this on the cumulative precision-recall curve.

347 |
348 |

Part 4: Hard negative mining

349 |

This part explores more advanced learning methods. So far, the SVM has been learned using a small and randomly sampled number of negative examples. However, in principle, every single patch that does not contain the object can be considered as a negative sample. These are of course too many to be used in practice; unfortunately, random sampling is ineffective as the most interesting (confusing) negative samples are a very small and special subset of all the possible ones.

350 |

Hard negative mining is a simple technique that allows finding a small set of key negative examples. The idea is simple: we start by training a model without any negatives at all (in this case the solver learns a 1-class SVM), and then we alternate between evaluating the model on the training data to find erroneous responses and adding the corresponding examples to the training set.

351 |

Step 4.1: Train with hard negative mining

352 |

Use the supplied code in exercise4.m to run hard negative mining. The code repeats SVM training, as seen above, a number of times, progressively increasing the size of the neg array containing the negative samples. This is updated using the output of:

353 |
 [matches, moreNeg] = ...
354 |     evaluateModel(...
355 |     vl_colsubset(trainImages', schedule(t), 'beginning'), ...
356 |     trainBoxes, trainBoxImages, ...
357 |     w, hogCellSize, scales) ;
358 | 
359 | 360 |

Here moreNeg contains the HOG features of the top (highest scoring and hence most confusing) image patches in the supplied training images.

361 |
362 |

Task: Examine evaluateModel.m again to understand how hard negatives are extracted.

363 |

Question: What is the purpose of the construct vl_colsubset(trainImages', schedule(t), 'beginning')? Why do you think we visit more negative images in later iterations?

364 |
365 |

The next step is to fuse the new negative set with the old one:

366 |
% Add negatives
367 | neg = cat(4, neg, moreNeg) ;
368 | 
369 | 370 |

Note that hard negative mining could select the same negatives at different iterations; the following code squashes these duplicates:

371 |
% Remove negative duplicates
372 | z = reshape(neg, [], size(neg,4)) ;
373 | [~,keep] = unique(z','stable','rows') ;
374 | neg = neg(:,:,:,keep) ;
375 | 
376 | 377 |
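Putting the pieces together, the overall mining loop in exercise4.m has roughly the following shape (a sketch: the schedule values and the empty initialization are illustrative assumptions):

% start with no negatives: the first round trains a one-class SVM
neg = zeros(size(pos,1), size(pos,2), size(pos,3), 0, 'single') ;
schedule = [1 2 5 10 100] ;  % illustrative: images mined per round
for t = 1:numel(schedule)
  % train on the current positive and negative sets
  numPos = size(pos,4) ;
  numNeg = size(neg,4) ;
  x = reshape(cat(4, pos, neg), [], numPos + numNeg) ;
  y = [ones(1, numPos) -ones(1, numNeg)] ;
  lambda = 1 / (C * (numPos + numNeg)) ;
  w = vl_svmtrain(x, y, lambda, 'epsilon', 0.01) ;
  w = single(reshape(w, modelHeight, modelWidth, 31)) ;
  % mine hard negatives on a growing subset of the training images
  [~, moreNeg] = evaluateModel(...
    vl_colsubset(trainImages', schedule(t), 'beginning'), ...
    trainBoxes, trainBoxImages, w, hogCellSize, scales) ;
  % fuse with the old negatives and squash duplicates
  neg = cat(4, neg, moreNeg) ;
  z = reshape(neg, [], size(neg,4)) ;
  [~, keep] = unique(z', 'stable', 'rows') ;
  neg = neg(:,:,:,keep) ;
end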

Step 4.2: Evaluate the model on the test data

378 |

Once hard negative mining and training are done, we are ready to evaluate the model on the test data (note that the model is evaluated on the training data for mining). As before:

379 |
evaluateModel(...
380 |     testImages, testBoxes, testBoxImages, ...
381 |     w, hogCellSize, scales) ;
382 | 
383 | 384 |

Part 5: Train your own object detector

385 |

Skip on fast track

386 |

In this last part, you will train your own object detector. To this end, open and look at exercise5.m. You will need to prepare the following data:

387 |

Step 5.1: Preparing the training data

* A folder data/myPositives containing files image1.jpeg, image2.jpeg, ..., each containing a single cropped occurrence of the target object. These crops can be of any size, but should be roughly square.
* A folder data/myNegatives containing images image1.jpeg, image2.jpeg, ..., that do not contain the target object at all.
* A test image data/myTestImage.jpeg containing the target object. This should not be one of the training images.

Run the code in exercise5.m to check that your training data looks right.

394 |
395 |

Task: Understand the limitations of this simple detector and choose a target object that has a good chance of being learnable.

396 |
397 |

Hint: Note in particular that object instances must be similar and roughly aligned. If your object is not symmetric, consider choosing instances that face a particular direction (e.g. left-facing horse head).

398 |

Step 5.2: Learn the model

399 |

Use the code supplied in exercise5.m to learn an SVM model for your object using hard negative mining as in Stage 4.1.

400 |

Step 5.3: Test the model

401 |

Use the code supplied in exercise5.m to evaluate the SVM model on a test image and visualize the result as in Stage 2.1.

402 |
403 |

Task: Make sure you get sensible results. Go back to step 5.1 if needed and adjust your data.

404 |
405 |

Hint: For debugging purposes, try using one of your training images as the test image. Does it work at least in this case?

406 |

Step 5.4: Detecting symmetric objects with multiple aspects

407 |

The basic detectors you have learned so far are not invariant to effects such as object deformations, out-of-plane rotations, and partial occlusions that affect most natural objects. Handling these effects requires additional sophistication, such as deformable templates or mixtures of multiple templates.

408 |

In particular, many objects in nature are symmetric and, as such, their images appear flipped when the objects are seen from the left or the right direction (consider for example a face). This can be handled by a pair of symmetric HOG templates. In this part we will explore this option.

409 |
410 |

Task: Using the procedure above, train a HOG template w for a symmetric object facing in one specific direction. For example, train a left-facing horse head detector.

411 |

Task: Collect test images containing the object facing in both directions. Run your detector and convince yourself that it works well only for the direction it was trained for.

412 |
413 |

HOG features have a well-defined structure that makes it possible to predict how the features transform when the underlying image is flipped. The transformation is in fact a simple permutation of the HOG elements. For a given spatial cell, HOG has 31 dimensions. The following code permutes the dimensions to flip the cell around the vertical axis:

414 |
perm = vl_hog('permutation') ;
415 | hog_flipped = hog(perm) ;
416 | 
417 |

Note that this permutation applies to a single HOG cell. However, the template is a $H \times W \times 31$ dimensional array of HOG cells.

418 |
419 |

Task: Given a hog array of dimension $H \times W \times 31$, write MATLAB code to obtain the flipped feature array hog_flipped.

420 |
421 |

Hint: Recall that the first dimension spans the vertical axis, the second dimension the horizontal axis, and the third dimension feature channels. perm should be applied to the last dimension. Do you need to permute anything else?

422 |
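One possible solution to the task above (a sketch; compare it with your own answer to the hint): mirror the template along the second (horizontal) dimension and permute the feature channels in one indexing operation:

% flip an H x W x 31 HOG array: reverse columns, permute channels
hog_flipped = hog(:, end:-1:1, perm) ;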

Now let us apply flipping to the model trained earlier:

423 |
424 |

Task: Let w be the model you trained before. Use the procedure to flip HOG to generate w_flipped. Then visualize both w and w_flipped as done in Sect. 1.3. Convince yourself that flipping was successful.

425 |
426 |

We now have two models, w and w_flipped, one for each view of the object.

427 |
428 |

Task: Run both models in turn on the same image, obtaining two lists of bounding boxes. Find a way to merge the two lists and visualise the top detections. Convince yourself that you can now detect objects facing either way.

429 |
430 |

Hint: Recall how redundant detections can be removed using non-maximum suppression.

431 |
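A minimal sketch of one way to do this, assuming w and w_flipped are both $H \times W \times 31$ templates and reusing detect and boxsuppress:

% detect with both templates and pool the results
[detsA, scoresA] = detect(im, w, hogCellSize, scales) ;
[detsB, scoresB] = detect(im, w_flipped, hogCellSize, scales) ;
detections = [detsA, detsB] ;
scores = [scoresA, scoresB] ;
% non-maximum suppression across the merged list
keep = boxsuppress(detections, scores, 0.25) ;
detections = detections(:, keep) ;
scores = scores(keep) ;
% sort and keep the top few for display
[scores, order] = sort(scores, 'descend') ;
detections = detections(:, order) ;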

Congratulations: This concludes the practical!

432 |

History

433 | 436 |
437 |
438 |
1. This is part of the MatConvNet toolbox for convolutional neural networks. Nevertheless, there is no neural network discussed here.
461 | 462 | 463 | 464 | 465 | -------------------------------------------------------------------------------- /doc/instructions.md: -------------------------------------------------------------------------------- 1 | # Object category detection practical 2 | 3 | This is an [Oxford Visual Geometry Group](http://www.robots.ox.ac.uk/~vgg) computer vision practical, authored by [Andrea Vedaldi](http://www.robots.ox.ac.uk/~vedaldi/) and Andrew Zisserman (Release 2018a). 4 | 5 | ![cover][1] 6 | 7 | The goal of *object category detection* is to identify and localize objects of a given type in an image. Examples applications include detecting pedestrian, cars, or traffic signs in street scenes, objects of interest such as tools or animals in web images, or particular features in medical image. Given a target class, such as *people*, a *detector* receives as input an image and produces as output zero, one, or more bounding boxes around each occurrence of the object class in the image. The key challenge is that the detector needs to find objects regardless of their location and scale in the image, as well as pose and other variation factors, such as clothing, illumination, occlusions, etc. 8 | 9 | This practical explores basic techniques in visual object detection, focusing on *image based models*. The appearance of image patches containing objects is learned using statistical analysis. Then, in order to detect objects in an image, the statistical model is applied to image windows extracted at all possible scales and locations, in order to identify which ones, if any, contain the object. 10 | 11 | In more detail, the practical explores the following topics: (i) using HOG features to describe image regions, (ii) building a HOG-based sliding-window detector to localize objects in images; (iii) working with multiple scales and multiple object occurrences; (iv) using a linear support vector machine to learn the appearance of objects; (v) evaluating an object detector in term of average precision; (vi) learning an object detector using hard negative mining. 12 | 13 | [TOC] 14 | 15 | ## Getting started 16 | 17 | Read and understand the [requirements and installation instructions](../overview/index.html#installation). The download links for this practical are: 18 | 19 | * Code and data: [practical-category-detection-2018a.tar.gz](http://www.robots.ox.ac.uk/~vgg/share/practical-category-detection-2018a.tar.gz) 20 | * Code only: [practical-category-detection-2018a-code-only.tar.gz](http://www.robots.ox.ac.uk/~vgg/share/practical-category-detection-2018a-code-only.tar.gz) 21 | * Data only: [practical-category-detection-2018a-data-only.tar.gz](http://www.robots.ox.ac.uk/~vgg/share/practical-category-detection-2018a-data-only.tar.gz) 22 | * [Git repository](https://github.com/vedaldi/practical-object-category-detection) (for lab setters and developers) 23 | 24 | After the installation is complete, open and edit the script `exercise1.m` in the MATLAB editor. The script contains commented code and a description for all steps of this exercise, relative to [Part I](#part1) of this document. You can cut and paste this code into the MATLAB window to run it, and will need to modify it as you go through the session. Other files `exercise2.m`, `exercise3.m`, and `exercise4.m` are given for [Part II](#part2), [III](#part3), and [IV](part4). 25 | 26 | Each part contains several **Questions** and **Tasks** to be answered/completed before proceeding further in the practical. 
27 | 28 | ## Part 1: Detection fundamentals {#part1} 29 | 30 | Part I--IV use as running example the problem of street sign detection, using the data from the [German Traffic Sign Detection Benchmark](http://benchmark.ini.rub.de/?section=gtsdb&subsection=news). This data consists of a number of example traffic images, as well as a number of larger test images containing one or more traffic signs at different sizes and locations. It also comes with *ground truth* annotation, i.e. with specified bounding boxes and sign labels for each sign occurrence, which is required to evaluate the quality of the detector. 31 | 32 | In this part we will build a basic sliding-window object detector based on HOG features. Follow the steps below: 33 | 34 | ### Step 1.0: Loading the training data 35 | 36 | The MATLAB m-file `loadData.m` loads the data for the practical into memory. The function `loadData(targetClass)` takes a `targetClass` argument specifying the object class of interest. Open the `example1.m` file, select the following part of the code, and execute it in MATLAB (right button > `Evaluate selection` or `Shift+F7`). 37 | 38 | ```matlab 39 | % Load the training and testing data (trainImages, trainBoxes, ...) 40 | % The functio takes the ID of the type of traffic sign we want to recognize 41 | % 1 is the 30 km/h speed limit 42 | loadData(1) ; 43 | ``` 44 | 45 | This loads into the current workspace the following variables: 46 | 47 | * `trainImages`: a list of train image names. 48 | * `trainBoxes`: a $4\times N$ array of object bounding boxes, in the form $[x_\text{min},y_\text{min},x_\text{max},y_\text{max}]$. 49 | * `trainBoxImages`: for each bounding box, the name of the image containing it. 50 | * `trainBoxLabels`: for each bounding box, the object label. It is one of the index in `targetClass`. 51 | * `trainBoxPatches`: a $64 \times 64 \times 3 \times N$ array of image patches, one for each training object. Patches are in RGB format. 52 | 53 | An analogous set of variables `testImages`, `testBoxes`, and so on are provided for the test data. Familiarise yourself with the contents of these variables. 54 | 55 | > **Question:** why is there a `trainImages` and a `trainBoxImages` variables? 56 | 57 | ### Step 1.1: Visualize the training images 58 | 59 | Select now the part of the code related to section 1.1 and execute it. This will create an image visualizing both the complete list of object training examples and their average. 60 | 61 | > **Question:** what can you deduce about the object variability from the average image? 62 | 63 | > **Question:** most boxes extend slightly around the object extent. Why do you think this may be valuable in learning a detector? 64 | 65 | ### Step 1.2: Extract HOG features from the training images 66 | 67 | Object detectors usually work on top of a layer of low-level features. In this case, we use HOG (*Histogram of Oriented Gradients*) features. In order to learn a model of the object, we start by extracting features from the image patches corresponding to the available training examples. This is done by the following `for` loop: 68 | 69 | ```matlab 70 | hogCellSize = 8 ; 71 | trainHog = {} ; 72 | for i = 1:size(trainBoxPatches,4) 73 | trainHog{i} = vl_hog(trainBoxPatches(:,:,:,i), hogCellSize) ; 74 | end 75 | trainHog = cat(4, trainHog{:}) ; 76 | ``` 77 | 78 | HOG is computed by the [VLFeat](http::www.vlfeat.org) function `vl_hog` ([doc](http://www.vlfeat.org/matlab/vl_hog.html)). 
This function takes as parameter the size in pixels of each HOG cell `hogCellSize`. It also takes a RGB image, represented in MATLAB as a $w \times h \times 3$ array (extracted as a slice of `trainBoxPatches`). The output is a $w/\mathtt{hogCellSize} \times h/\mathtt{hogCellSize} \times 31$ dimensional array. One such array is extracted for each example image end eventually these are concatenated in a 4D array along the fourth dimension. 79 | 80 | ### Step 1.3: Learn a simple HOG template model {#sect13} 81 | 82 | A very basic object model can be obtained by averaging the features of the example objects. This is done by: 83 | 84 | ```matlab 85 | w = mean(trainHog, 4) ; 86 | ``` 87 | 88 | The model can be visualized by *rendering* `w` as if it was a HOG feature array. This can be done using the `render` option of `vl_hog`: 89 | 90 | ```matlab 91 | figure(2) ; clf ; 92 | imagesc(vl_hog('render', w)) ; 93 | ``` 94 | 95 | Spend some time to study this plot and make sure you understand what is visualized. 96 | 97 | > **Question:** Can you make sense of the resulting plot? 98 | 99 | ### Step 1.4: Apply the model to a test image 100 | 101 | The model is matched to a test image by: (i) extracting the HOG features of the image and (ii) convolving the model over the resulting feature map: 102 | 103 | ```matlab 104 | im = imread('data/signs-sample-image.jpg') ; 105 | im = im2single(im) ; 106 | hog = vl_hog(im, hogCellSize) ; 107 | scores = vl_nnconv(hog, w, []) ; 108 | ``` 109 | 110 | The first two lines read a sample image and conver it to single format. The third line computes the HOG features of the image using the `vl_hog` seen above. The fourth line convolves the HOG map `hog` with the model `w`. It uses the function `vl_nnconv`[^nn] and returns a `scores` map. 111 | 112 | > **Task:** Work out the dimension of the `scores` arrays. Then, check your result with the dimension of the array computed by MATLAB. 113 | 114 | > **Question:** Visualize the image `im` and the `scores` array using the provided example code. Does the result match your expectations? 115 | 116 | ### Step 1.5: Extract the top detection 117 | 118 | Now that the model has been applied to the image, we have a response map `scores`. To extract a detection from this, we (i) find the maximum response and (ii) compute the bounding box of the image patch containing the corresponding HOG features. The maximum is found by: 119 | 120 | ```matlab 121 | [best, bestIndex] = max(scores(:)) ; 122 | ``` 123 | 124 | Note that `bestIndex` is a linear index in the range $[1, M]$ where $M$ is the number of possible filter locations. We convert this into a subscript $(h_x,h_y)$ using MATLAB `ind2sub` function: 125 | 126 | ```matlab 127 | [hy, hx] = ind2sub(size(scores), bestIndex) ; 128 | ``` 129 | 130 | $(h_x,h_y)$ are in units of HOG cells. We convert this into pixel coordinates as follows: 131 | 132 | ``` 133 | x = (hx - 1) * hogCellSize + 1 ; 134 | y = (hy - 1) * hogCellSize + 1 ; 135 | ``` 136 | 137 | > **Question:** Why are we subtracting -1 and summing +1? Which pixel $(x,y)$ of the HOG cell $(h_x,h_y)$ is found? 
138 | 139 | The size of the model template in number of HOG cell can be computed in several way; one is simply: 140 | 141 | ```matlab 142 | modelWidth = size(trainHog, 2) ; 143 | modelHeight = size(trainHog, 1) ; 144 | ``` 145 | 146 | Now we have enough information to compute the bounding box as follows: 147 | 148 | ```matlab 149 | detection = [ 150 | x - 0.5 ; 151 | y - 0.5 ; 152 | x + hogCellSize * modelWidth - 0.5 ; 153 | y + hogCellSize * modelHeight - 0.5 ;] ; 154 | ``` 155 | 156 | **Note:** the bounding box encloses exactly all the pixel of the HOG template. In MATLAB, pixel centers have integer coordinates and pixel borders are at a distance $\pm1/2$. 157 | 158 | > **Question:** Use the example code to plot the image and overlay the bounding box of the detected object. Did it work as expected? 159 | 160 | ## Part 2: Multiple scales and learning with an SVM {#part2} 161 | 162 | In this second part, we will: (i) extend the detector to search objects at multiple scales and (ii) learn a better model using a support vector machine. Let's start by loading the data as needed: 163 | 164 | ```matlab 165 | setup ; 166 | targetClass = 'mandatory' ; 167 | loadData(targetClass) ; 168 | ``` 169 | 170 | The `mandatory` target class is simply the union of all mandatory traffic signs. 171 | 172 | ### Step 2.1: Multi-scale detection {#step2.1} 173 | 174 | Objects exist in images at sizes different from one of the learned template. In order to find objects of all sizes, we scale the image up and down and search for the object over and over again. 175 | 176 | The set of searched scales is defined as follows: 177 | 178 | ```matlab 179 | % Scale space configuraiton 180 | minScale = -1 ; 181 | maxScale = 3 ; 182 | numOctaveSubdivisions = 3 ; 183 | scales = 2.^linspace(... 184 | minScale,... 185 | maxScale,... 186 | numOctaveSubdivisions*(maxScale-minScale+1)) ; 187 | ``` 188 | 189 | Given the model `w`, as determined in Part I, we use the function `detectAtMultipleScales` in order to search for the object at multiple scales: 190 | 191 | ```matlab 192 | detection = detectAtMultipleScales(im, w, hogCellSize, scales) ; 193 | ``` 194 | 195 | Note that the function generates a figure as it runs, so prepare a new figure before running it using the `figure` command if you do not want your current figure to be deleted. 196 | 197 | > **Question:** Open and study the `detectAtMultipleScales` function. Convince yourself that it is the same code as before, but operated after rescaling the image a number of times. 198 | 199 | > **Question:** Visualize the resulting detection using the supplied example code. Did it work? If not, can you make sense of the errors? 200 | 201 | > **Question:** Look at the array of `scores` maps generated by `detectAtMultipleScales` using the example code. Do they make sense? Is there anything wrong? 202 | 203 | ### Step 2.2: Collect positive and negative training data 204 | 205 | The model learned so far is too weak to work well. It is now time to use an SVM to learn a better one. In order to do so, we need to prepare suitable data. We already have positive examples (features extracted from object patches): 206 | 207 | ```matlab 208 | % Collect positive training data 209 | pos = trainHog ; 210 | ``` 211 | 212 | Ino order to collect negative examples (features extracted from non-object patches), we loop through a number of training images and sample patches uniformly: 213 | 214 | > **Task:** Identify the code that extract these patches in `example2.m` and make sure you understand it. 
215 | 216 | > **Question:** How many negative examples are we collecting? 217 | 218 | ### Step 2.3: Learn a model with an SVM 219 | 220 | Now that we have the data, we can learn an SVM model. To this end we will use the `vl_svmtrain` function. This function requires the data to be in a $D \times N$ matrix, where $D$ are the feature dimensions and $N$ the number of training points. This is done by: 221 | 222 | ```matlab 223 | % Pack the data into a matrix with one datum per column 224 | x = cat(4, pos, neg) ; 225 | x = reshape(x, [], numPos + numNeg) ; 226 | ``` 227 | 228 | We also need a vector of binary labels, +1 for positive points and -1 for negative ones: 229 | ```matlab 230 | % Create a vector of binary labels 231 | y = [ones(1, size(pos,4)) -ones(1, size(neg,4))] ; 232 | ``` 233 | 234 | Finally, we need to set the parameter $\lambda$ of the SVM solver. For reasons that will become clearer later, we use instead the equivalent $C$ parameter: 235 | ```matlab 236 | numPos = size(pos,4) ; 237 | numNeg = size(neg,4) ; 238 | C = 10 ; 239 | lambda = 1 / (C * (numPos + numNeg)) ; 240 | ``` 241 | 242 | Learning the SVM is then a one-liner: 243 | ``` 244 | % Learn the SVM using an SVM solver 245 | w = vl_svmtrain(x,y,lambda,'epsilon',0.01,'verbose') ; 246 | ``` 247 | 248 | > **Question:** Visualize the learned model `w` using the supplied code. Does it differ from the naive model learned before? How? 249 | 250 | ### Step 2.4: Evaluate the learned model 251 | 252 | Use the `detectAtMultipleScales` seen above to evaluate the new SVM-based model. 253 | 254 | > **Question:** Does the learned model perform better than the naive average? 255 | 256 | > **Task:** Try different images. Does this detector work all the times? If not, what types of mistakes do you see? Are these mistakes reasonable? 257 | 258 | ## Part 3: Multiple objects and evaluation {#part3} 259 | 260 | ### Step 3.1: Multiple detections 261 | 262 | Detecting at multiple scales is insufficient: we must also allow for more than one object occurrence in the image. In order to to so, the package include a suitalbe `detect` function. This function is similar to `detectAtMultipleScales`, but it returns the top 1000 detector responses rather than just the top one: 263 | ```matlab 264 | % Compute detections 265 | [detections, scores] = detect(im, w, hogCellSize, scales) ; 266 | ``` 267 | 268 | > **Task:** Open and study `detect.m`. Make sure that you understand how it works. 269 | 270 | > **Question:** Why do we want to return so many responses? In practice, it is unlikely that more than a handful of object occurrences may be contained in any given image... 271 | 272 | A single object occurrence generates multiple detector responses at nearby image locations and scales. In order to eliminate these redundant detections, we use a *non-maximum suppression* algorithm. This is implemented by the `boxsuppress.m` MATLAB m-file. The algorithm is simple: start from the highest-scoring detection, then remove any other detection whose overlap[^overlap] is greater than a threshold. 
The function returns a boolean vector `keep` of detections to preserve: 273 | 274 | ```matlab 275 | % Non-maximum suppression 276 | keep = boxsuppress(detections, scores, 0.25) ; 277 | 278 | detections = detections(:, keep) ; 279 | scores = scores(keep) ; 280 | ``` 281 | 282 | For efficiency, after non-maximum suppression we keep just ten responses (as we do not expect more than a few objects in any image): 283 | ```matlab 284 | % Further keep only top detections 285 | detections = detections(:, 1:10) ; 286 | scores = scores(1:10) ; 287 | ``` 288 | 289 | ### Step 3.2: Detector evaluation 290 | 291 | We are now going to look at properly evaluating our detector. We use the [PASCAL VOC criterion](http://pascallin.ecs.soton.ac.uk/challenges/VOC/voc2012/devkit_doc.pdf), computing *Average Precision (AP)*. Consider a test image containing a number of ground truth object occurrences $(g_1,\dots,g_m)$ and a list $(b_1,s_1),\dots,(b_n,s_n)$ of candidate detections $b_i$ with score $s_i$. The following algorithm converts this data into a list of labels and scores $(s_i,y_i)$ that can be used to compute a precision-recall curve, for example using VLFeat `vl_pr` function. The algorithm, implemented by `evalDetections.m`, is as follows: 292 | 293 | 1. Assign each candidate detection $(b_i,s_i)$ a true or false label $y_i \in \{+1,-1\}$. To do so: 294 | 1. The candidate detections $(b_i,s_i)$ are sorted by decreasing score $s_i$. 295 | 2. For each candidate detection in order: 296 | a. If there is a matching ground truth detection $g_j$ ($\operatorname{overlap}(b_i,g_j)$ larger than 50%), the candidate detection is considered positive ($y_i=+1$). Furthermore, the ground truth detection is *removed from the list* and not considered further. 297 | b. Otherwise, the candidate detection is negative ($y_i=-1$). 298 | 2. Add each ground truth object $g_i$ that is still unassigned to the list of candidates as pair $(g_j, -\infty)$ with label $y_j=+1$. 299 | 300 | The overlap metric used to compare a candidate detection to a ground truth bounding box is defined as the *ratio of the area of the intersection over the area of the union* of the two bounding boxes: 301 | $$ 302 | \operatorname{overlap}(A,B) = \frac{|A\cap B|}{|A \cup B|}. 303 | $$ 304 | 305 | > **Questions:** 306 | 307 | > * Why are ground truth detections removed after being matched? 308 | > * What happens if an object is detected twice? 309 | > * Can you explain why unassigned ground-truth objects are added to the list of candidates with $-\infty$ score? 310 | 311 | In order to apply this algorithm, we first need to find the ground truth bounding boxes in the test image: 312 | ```matlab 313 | % Find all the objects in the target image 314 | s = find(strcmp(testImages{1}, testBoxImages)) ; 315 | gtBoxes = testBoxes(:, s) ; 316 | ``` 317 | 318 | Then `evalDetections` can be used: 319 | ```matlab 320 | % No example is considered difficult 321 | gtDifficult = false(1, numel(s)) ; 322 | 323 | % PASCAL-like evaluation 324 | matches = evalDetections(... 325 | gtBoxes, gtDifficult, ... 326 | detections, scores) ; 327 | ``` 328 | The `gtDifficult` flags can be used to mark some ground truth object occurrence as *difficult* and hence ignored in the evaluation. This is used in the PASCAL VOC challenge, but not here (i.e. no object occurrence is considered difficult). 329 | 330 | `evalDetections` returns a `matches` structure with several fields. 
304 | 
305 | > **Questions:**
306 | 
307 | > * Why are ground truth detections removed after being matched?
308 | > * What happens if an object is detected twice?
309 | > * Can you explain why unassigned ground-truth objects are added to the list of candidates with $-\infty$ score?
310 | 
311 | In order to apply this algorithm, we first need to find the ground truth bounding boxes in the test image:
312 | ```matlab
313 | % Find all the objects in the target image
314 | s = find(strcmp(testImages{1}, testBoxImages)) ;
315 | gtBoxes = testBoxes(:, s) ;
316 | ```
317 | 
318 | Then `evalDetections` can be used:
319 | ```matlab
320 | % No example is considered difficult
321 | gtDifficult = false(1, numel(s)) ;
322 | 
323 | % PASCAL-like evaluation
324 | matches = evalDetections(...
325 |   gtBoxes, gtDifficult, ...
326 |   detections, scores) ;
327 | ```
328 | The `gtDifficult` flags can be used to mark some ground truth object occurrences as *difficult* and hence ignored in the evaluation. This is used in the PASCAL VOC challenge, but not here (i.e. no object occurrence is considered difficult).
329 | 
330 | `evalDetections` returns a `matches` structure with several fields. We focus here on `matches.detBoxFlags`: this contains a +1 for each detection that was found to be correct and -1 otherwise. We use this to visualize the detection errors:
331 | ```matlab
332 | % Visualization
333 | figure(1) ; clf ;
334 | imagesc(im) ; axis equal ; hold on ;
335 | vl_plotbox(detections(:, matches.detBoxFlags==+1), 'g', 'linewidth', 2) ;
336 | vl_plotbox(detections(:, matches.detBoxFlags==-1), 'r', 'linewidth', 2) ;
337 | vl_plotbox(gtBoxes, 'b', 'linewidth', 1) ;
338 | axis off ;
339 | ```
340 | 
341 | > **Task:** Use the supplied example code to evaluate the detector on one image. Look carefully at the output and convince yourself that it makes sense.
342 | 
343 | Now plot the PR curve:
344 | ```matlab
345 | figure(2) ; clf ;
346 | vl_pr(matches.labels, matches.scores) ;
347 | ```
348 | 
349 | > **Question:** There are a large number of errors in each image. Should you worry? In what manner is the PR curve affected? How would you eliminate the vast majority of these errors in practice?
350 | 
351 | ### Step 3.3: Evaluation on multiple images
352 | 
353 | Evaluation is typically done on multiple images rather than just one. This is implemented by the `evaluateModel.m` m-file.
354 | 
355 | > **Task:** Open `evaluateModel.m` and make sure you understand the main steps of the evaluation procedure.
356 | 
357 | Use the supplied example code to run the evaluation on the entire test set:
358 | ```matlab
359 | matches = evaluateModel(testImages, testBoxes, testBoxImages, ...
360 |   w, hogCellSize, scales) ;
361 | ```
362 | 
363 | **Note:** The function processes one image at a time, visualizing the results as it progresses. The PR curve is the result of the *accumulation* of the detections obtained thus far.
364 | 
365 | > **Task:** Open the `evaluateModel.m` file in MATLAB and add a breakpoint right at the end of the for loop. Now run the evaluation code again and look at each image individually (use `dbcont` to go to the next image). Check the correct and incorrect matches in each image and their ranking, and the effect of these on the cumulative precision-recall curve.
366 | 
367 | ## Part 4: Hard negative mining {#part4}
368 | 
369 | This part explores more advanced learning methods. So far, the SVM has been learned using a small and randomly sampled set of negative examples. However, in principle, every single patch that does not contain the object can be considered a negative sample. These are of course far too many to be used in practice; unfortunately, random sampling is also ineffective, as the most interesting (confusing) negative samples are a very small and special subset of all the possible ones.
370 | 
371 | *Hard negative mining* is a simple technique for finding a small set of key negative examples. The idea is simple: we start by training a model without any negatives at all (in this case the solver learns a 1-class SVM), and then we alternate between evaluating the model on the training data to find erroneous responses and adding the corresponding examples to the training set. In outline, the loop looks as sketched below.
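The following pseudo-MATLAB summarizes the procedure. It is only a sketch under stated assumptions: `numIterations`, `trainImagesSubset` and `trainSvm` are hypothetical placeholders, and `exercise4.m` contains the actual training loop:

```matlab
% Hard negative mining, in outline (sketch only; see exercise4.m).
neg = zeros(size(pos,1), size(pos,2), size(pos,3), 0) ; % start with no negatives
for t = 1:numIterations
  % 1. Train an SVM on the current positive and negative sets
  %    (pack the data, create labels, call vl_svmtrain, reshape w).
  w = trainSvm(pos, neg) ;
  % 2. Run the detector on a (growing) subset of the training images and
  %    collect the highest-scoring erroneous detections as new negatives.
  [matches, moreNeg] = evaluateModel(trainImagesSubset, ...
    trainBoxes, trainBoxImages, w, hogCellSize, scales) ;
  % 3. Grow the negative set (duplicates are removed afterwards).
  neg = cat(4, neg, moreNeg) ;
end
```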
372 | 
373 | ### Step 4.1: Train with hard negative mining {#stage4.1}
374 | 
375 | Use the supplied code in `exercise4.m` to run hard negative mining. The code repeats SVM training, as seen above, a number of times, progressively increasing the size of the `neg` array containing the negative samples. This array is updated using the output of:
376 | 
377 | ```matlab
378 | [matches, moreNeg] = ...
379 |   evaluateModel(...
380 |   vl_colsubset(trainImages', schedule(t), 'beginning'), ...
381 |   trainBoxes, trainBoxImages, ...
382 |   w, hogCellSize, scales) ;
383 | ```
384 | 
385 | Here `moreNeg` contains the HOG features of the top (highest-scoring and hence most confusing) image patches in the supplied training images.
386 | 
387 | > **Task:** Examine `evaluateModel.m` again to understand how hard negatives are extracted.
388 | 
389 | > **Question:** What is the purpose of the construct `vl_colsubset(trainImages', schedule(t), 'beginning')`? Why do you think we visit more negative images in later iterations?
390 | 
391 | The next step is to fuse the new negative set with the old one:
392 | ```matlab
393 | % Add negatives
394 | neg = cat(4, neg, moreNeg) ;
395 | ```
396 | 
397 | Note that hard negative mining could select the same negatives at different iterations; the following code squashes these duplicates:
398 | ```matlab
399 | % Remove negative duplicates
400 | z = reshape(neg, [], size(neg,4)) ;
401 | [~,keep] = unique(z','stable','rows') ;
402 | neg = neg(:,:,:,keep) ;
403 | ```
404 | 
405 | ### Step 4.2: Evaluate the model on the test data
406 | 
407 | Once hard negative mining and training are done, we are ready to evaluate the model on the *test* data (note that the model is evaluated on the *training* data for mining). As before:
408 | ```matlab
409 | evaluateModel(...
410 |   testImages, testBoxes, testBoxImages, ...
411 |   w, hogCellSize, scales) ;
412 | ```
413 | 
414 | ## Part 5: Train your own object detector
415 | 
416 | **Skip on fast track**
417 | 
418 | In this last part, you will learn your own object detector. To this end, open and look at `exercise5.m`. You will need to prepare the following data:
419 | 
420 | ### Step 5.1: Preparing the training data
421 | 
422 | * A folder `data/myPositives` containing files `image1.jpeg`, `image2.jpeg`, ..., each containing a single cropped occurrence of the target object. These crops can be of any size, but should be roughly square.
423 | * A folder `data/myNegatives` containing images `image1.jpeg`, `image2.jpeg`, ..., that *do not* contain the target object at all.
424 | * A test image `data/myTestImage.jpeg` containing the target object. This should not be one of the training images.
425 | 
426 | Run the code in `exercise5.m` to check that your training data looks right.
427 | 
428 | > **Task:** Understand the limitations of this simple detector and choose a target object that has a good chance of being learnable.
429 | 
430 | **Hint:** Note in particular that object instances must be similar and roughly aligned. If your object is not symmetric, consider choosing instances that face a particular direction (e.g. a left-facing horse head).
431 | 
432 | ### Step 5.2: Learn the model
433 | 
434 | Use the code supplied in `exercise5.m` to learn an SVM model for your object using hard negative mining as in [Stage 4.1](#stage4.1).
435 | 
436 | ### Step 5.3: Test the model
437 | 
438 | Use the code supplied in `exercise5.m` to evaluate the SVM model on a test image and visualize the result as in [Stage 2.1](#stage2.1).
439 | 
440 | > **Task:** Make sure you get sensible results. Go back to Step 5.1 if needed and adjust your data.
441 | 
442 | **Hint:** For debugging purposes, try using one of your training images as the test image. Does it work at least in this case?
443 | 
444 | ### Step 5.4: Detecting symmetric objects with multiple aspects
445 | 
446 | The basic detectors you have learned so far are *not* invariant to effects such as object deformations, out-of-plane rotations, and partial occlusions that affect most natural objects.
Handling these effects requires additional sophistication, such as deformable templates or mixtures of multiple templates.
447 | 
448 | In particular, many objects in nature are symmetric and, as such, their images appear flipped when the objects are seen from the left or from the right (consider for example a face). This can be handled by a pair of symmetric HOG templates. In this part we will explore this option.
449 | 
450 | > **Task:** Using the procedure above, train a HOG template `w` for a symmetric object facing in one specific direction. For example, train a left-facing horse head detector.
451 | 
452 | > **Task:** Collect test images containing the object facing in both directions. Run your detector and convince yourself that it works well only for the direction it was trained for.
453 | 
454 | HOG features have a well-defined structure that makes it possible to predict how the features transform when the underlying image is flipped. The transformation is in fact a simple *permutation* of the HOG elements. For a given spatial cell, HOG has 31 dimensions. The following code permutes the dimensions to flip the cell around the vertical axis:
455 | ```matlab
456 | perm = vl_hog('permutation') ;
457 | hog_flipped = hog(perm) ;
458 | ```
459 | Note that this permutation applies to a *single* HOG cell. However, the template is an $H \times W \times 31$-dimensional array of HOG cells.
460 | 
461 | > **Task:** Given a `hog` array of dimension $H \times W \times 31$, write MATLAB code to obtain the flipped feature array `hog_flipped`.
462 | 
463 | **Hint:** Recall that the first dimension spans the vertical axis, the second dimension the horizontal axis, and the third dimension the feature channels. `perm` should be applied to the last dimension. Do you need to permute anything else?
464 | 
465 | Now let us apply flipping to the model trained earlier:
466 | 
467 | > **Task:** Let `w` be the model you trained before. Use the procedure to flip HOG to generate `w_flipped`. Then visualize both `w` and `w_flipped` as done in [Sect. 1.3](#sect13). Convince yourself that flipping was successful.
468 | 
469 | We now have two models, `w` and `w_flipped`, one for each view of the object.
470 | 
471 | > **Task:** Run both models in turn on the same image, obtaining two lists of bounding boxes. Find a way to merge the two lists and visualize the top detections. Convince yourself that you can now detect objects facing either way.
472 | 
473 | **Hint:** Recall how redundant detections can be removed using non-maximum suppression.
474 | 
475 | **Congratulations: This concludes the practical!**
476 | 
477 | [^nn]: This is part of the MatConvNet toolbox for convolutional neural networks. Nevertheless, there is no neural network discussed here.
478 | 
479 | [1]: images/cover.jpeg "cover.jpeg"
480 | 
481 | ## History
482 | 
483 | * Used in the Oxford AIMS CDT, 2014-18
484 | 
--------------------------------------------------------------------------------
/evalDetections.m:
--------------------------------------------------------------------------------
1 | function match = evalDetections(gtBoxes, gtDifficult, detBoxes, detScores, varargin)
2 | % EVALDETECTIONS
3 | % MATCH = EVALDETECTIONS(GTBOXES, GTDIFFICULT, DETBOXES, DETSCORES)
4 | %
5 | % MATCH.DETBOXFLAGS: +1 good, 0 match to difficult/ignored, -1 wrong
6 | % MATCH.DETBOXTOGT: map to matched GT, NaN if no match
7 | % MATCH.GTBOXTODET: map to matched Det, NaN if missed, 0 if difficult, -1 if ignored
8 | % MATCH.SCORES: for evaluation (missed boxes have -inf score)
9 | % MATCH.LABELS: for evaluation (difficult/ignored boxes are assigned 0 label)
10 | %
11 | % The first portion of MATCH.SCORES and MATCH.LABELS corresponds to
12 | % the DETBOXES and DETSCORES passed as input. To these, any
13 | % non-matched ground truth bounding box is appended with -INF
14 | % score.
15 | %
16 | % The boxes are assumed to be given in the PASCAL format, i.e. the
17 | % coordinates are indices of the top-left, bottom-right pixels, not
18 | % dimensionless coordinates of the box boundaries.
19 | %
20 | % The detection scores are NOT used to reorder the detection boxes
21 | % (these should normally be passed by decreasing score) so the
22 | % output variables match the order of the input variables.
23 | %
24 | % Author:: Andrea Vedaldi
25 | 
26 | % AUTORIGHTS
27 | % Copyright (C) 2008-09 Andrea Vedaldi
28 | %
29 | % This file is part of the VGG MKL Class and VGG MKL Det code packages,
30 | % available in the terms of the GNU General Public License version 2.
31 | 
32 | opts.threshold = 0.5 ;
33 | opts.criterion = 'overlap' ;
34 | opts.ignoreDuplicates = false ;
35 | opts.pascalFormat = true ;
36 | opts.display = false ;
37 | opts = vl_argparse(opts, varargin) ;
38 | numGtBoxes = size(gtBoxes, 2) ;
39 | numDetBoxes = size(detBoxes, 2) ;
40 | 
41 | gtBoxToDet = NaN * ones(1, numGtBoxes) ;
42 | detBoxToGt = NaN * ones(1, numDetBoxes) ;
43 | detBoxFlags = - ones(1,numDetBoxes) ;
44 | 
45 | if isempty(gtBoxes)
46 |   match.detBoxFlags = detBoxFlags ;
47 |   match.detBoxToGt = detBoxToGt ;
48 |   match.gtBoxToDet = [] ;
49 |   match.scores = detScores ;
50 |   match.labels = -ones(1,size(detBoxes,2)) ;
51 |   return ;
52 | end
53 | 
54 | % match detected boxes to gt boxes based on the selected criterion
55 | switch lower(opts.criterion)
56 |   case 'overlap'
57 |     criterion = boxoverlap(gtBoxes, detBoxes, 'pascalFormat', opts.pascalFormat) ;
58 |   case 'inclusion'
59 |     criterion = boxinclusion(gtBoxes, detBoxes, 'pascalFormat', opts.pascalFormat) ;
60 |   otherwise
61 |     error('Unknown criterion %s.', opts.criterion) ;
62 | end
63 | [criterion, allDetBoxToGt] = max(criterion', [], 2) ;
64 | 
65 | % prematch detected boxes to difficult gt boxes and remove them from
66 | % the evaluation
67 | selDiff = find((criterion > opts.threshold) & gtDifficult(1,allDetBoxToGt)') ;
68 | detBoxFlags(selDiff) = 0 ;
69 | detBoxToGt(selDiff) = allDetBoxToGt(selDiff) ;
70 | gtBoxToDet(gtDifficult) = 0 ;
71 | 
72 | % match the remaining detected boxes to the non-difficult gt boxes
73 | selDetOk = find(criterion > opts.threshold) ;
74 | 
75 | nMiss = sum(~gtDifficult) ;
76 | for oki = 1:length(selDetOk)
77 |   % if all gt boxes have been assigned stop
78 |   if nMiss == 0 && ~opts.ignoreDuplicates, break ; end
79 | 
80 |   dei = selDetOk(oki) ;
81 |   gti = allDetBoxToGt(dei) ;
82 | 
83 |   % match the gt box to the detection only if the gt box
84 |   % is still unassigned (first detection)
85 |   if isnan(gtBoxToDet(gti))
86 |     gtBoxToDet(gti) = dei ;
87 |     detBoxToGt(dei) = gti ;
88 |     detBoxFlags(dei) = +1 ;
89 |     nMiss = nMiss - 1 ;
90 | 
91 | 
92 | 
93 |   elseif opts.ignoreDuplicates
94 |     % match the detection to the gt box in any case
95 |     % if duplicates are ignored
96 |     detBoxToGt(dei) = gti ;
97 |     detBoxFlags(dei) = 0 ;
98 |   end
99 | end
100 | 
101 | % calculate equivalent (scores, labels) pair
102 | selM = find(detBoxFlags == +1) ;  % match
103 | selDM = find(detBoxFlags == -1) ; % don't match
104 | selDF = find(detBoxFlags == 0) ;  % difficult or ignored
105 | 
106 | scores = [detScores, -inf * ones(1,nMiss)] ;
107 | labels = [ones(size(detScores)), ones(1,nMiss)] ;
108 | labels(selDM) = -1 ;
109 | labels(selDF) = 0 ;
110 | 
111 | match.detBoxFlags = detBoxFlags ;
112 | match.detBoxToGt = detBoxToGt ;
113 | match.gtBoxToDet = gtBoxToDet ;
114 | match.scores = scores ;
115 | match.labels = labels ;
116 | 
117 | if opts.display
118 |   hold on ;
119 |   vl_plotbox(gtBoxes, 'b', 'linewidth', 2) ;
120 |   vl_plotbox(detBoxes(:, detBoxFlags == +1), 'g') ;
121 |   vl_plotbox(detBoxes(:, detBoxFlags == 0), 'y') ;
122 |   vl_plotbox(detBoxes(:, detBoxFlags == -1), 'r') ;
123 | end
124 | 
--------------------------------------------------------------------------------
/evaluateModel.m:
--------------------------------------------------------------------------------
1 | function [matches, negs] = evaluateModel(...
2 |   testImages, testBoxes, testBoxImages, w, hogCellSize, scales)
3 | 
4 | clear matches ;
5 | negs = {} ;
6 | for i=1:numel(testImages)
7 |   % Detect on test image
8 |   im = imread(testImages{i}) ;
9 |   im = im2single(im) ;
10 |   [detections, scores, hog] = detect(im, w, hogCellSize, scales) ;
11 | 
12 |   % Non-maxima suppression
13 |   keep = boxsuppress(detections, scores, 0.5) ;
14 |   keep = find(keep) ;
15 |   keep = vl_colsubset(keep, 15, 'beginning') ;
16 |   detections = detections(:, keep) ;
17 |   scores = scores(keep) ;
18 | 
19 |   % Find all the objects in the target image
20 |   ok = find(strcmp(testImages{i}, testBoxImages)) ;
21 |   gtBoxes = testBoxes(:, ok) ;
22 |   gtDifficult = false(1, numel(ok)) ;
23 |   matches(i) = evalDetections(...
24 |     gtBoxes, gtDifficult, ...
25 |     detections, scores) ;
26 | 
27 |   % Visualize progress
28 |   clf;
29 |   subplot(1,3,[1 2]) ;
30 |   imagesc(im) ; axis equal ; hold on ;
31 |   labels = arrayfun(@(x)sprintf('%d',x),1:size(detections,2),'uniformoutput',0) ;
32 |   sp = fliplr(find(matches(i).detBoxFlags == -1)) ;
33 |   sn = fliplr(find(matches(i).detBoxFlags == +1)) ;
34 |   vl_plotbox(detections(:, sp), 'r', 'linewidth', 1, 'label', labels(sp)) ;
35 |   vl_plotbox(detections(:, sn), 'g', 'linewidth', 2, 'label', labels(sn)) ;
36 |   vl_plotbox(gtBoxes, 'b', 'linewidth', 1) ;
37 |   title(sprintf('Image %d of %d', i, numel(testImages))) ;
38 |   axis off ;
39 | 
40 |   subplot(1,3,3) ;
41 |   vl_pr([matches.labels], [matches.scores]) ;
42 | 
43 |   % If required, collect top negative features
44 |   if nargout > 1
45 |     overlaps = boxoverlap(gtBoxes, detections) ;
46 |     overlaps(end+1,:) = 0 ;
47 |     overlaps = max(overlaps,[],1) ;
48 |     detections(:, overlaps >= 0.25) = [] ;
49 |     detections = vl_colsubset(detections, 10, 'beginning') ;
50 |     negs{end+1} = extract(hog, hogCellSize, scales, w, detections) ;
51 |   end
52 | 
53 |   % Break here with the debugger
54 |   drawnow ;
55 | end
56 | 
57 | if nargout > 1
58 |   negs = cat(4, negs{:}) ;
59 | end
--------------------------------------------------------------------------------
/exercise1.m:
--------------------------------------------------------------------------------
1 | 
2 | % -------------------------------------------------------------------------
3 | % Step 1.0: Load training data
4 | % -------------------------------------------------------------------------
5 | 
6 | setup ;
7 | 
8 | % Load the training and testing data (trainImages, trainBoxes, ...)
9 | % The function takes the ID of the type of traffic sign we want to recognize
10 | % 1 is the 30 km/h speed limit
11 | loadData(1) ;
12 | 
13 | % -------------------------------------------------------------------------
14 | % Step 1.1: Visualize the training images
15 | % -------------------------------------------------------------------------
16 | 
17 | figure(1) ; clf ;
18 | 
19 | subplot(1,2,1) ;
20 | imagesc(vl_imarraysc(trainBoxPatches)) ;
21 | axis off ;
22 | title('Training images (positive samples)') ;
23 | axis equal ;
24 | 
25 | subplot(1,2,2) ;
26 | imagesc(mean(trainBoxPatches,4)) ;
27 | box off ;
28 | title('Average') ;
29 | axis equal ;
30 | 
31 | % -------------------------------------------------------------------------
32 | % Step 1.2: Extract HOG features from the training images
33 | % -------------------------------------------------------------------------
34 | 
35 | hogCellSize = 8 ;
36 | trainHog = {} ;
37 | for i = 1:size(trainBoxPatches,4)
38 |   trainHog{i} = vl_hog(trainBoxPatches(:,:,:,i), hogCellSize) ;
39 | end
40 | trainHog = cat(4, trainHog{:}) ;
41 | 
42 | % -------------------------------------------------------------------------
43 | % Step 1.3: Learn a simple HOG template model
44 | % -------------------------------------------------------------------------
45 | 
46 | w = mean(trainHog, 4) ;
47 | 
48 | save('data/signs-model-1.mat', 'w') ;
49 | 
50 | figure(2) ; clf ;
51 | imagesc(vl_hog('render', w)) ;
52 | colormap gray ;
53 | axis equal ;
54 | title('HOG model') ;
55 | 
56 | % -------------------------------------------------------------------------
57 | % Step 1.4: Apply the model to a test image
58 | % -------------------------------------------------------------------------
59 | 
60 | im = imread('data/signs-sample-image.jpg') ;
61 | im = im2single(im) ;
62 | hog = vl_hog(im, hogCellSize) ;
63 | scores = vl_nnconv(hog, w, []) ;
64 | 
65 | figure(3) ; clf ;
66 | imagesc(scores) ;
67 | title('Detection') ;
68 | colorbar ;
69 | 
70 | % -------------------------------------------------------------------------
71 | % Step 1.5: Extract the top detection
72 | % -------------------------------------------------------------------------
73 | 
74 | [best, bestIndex] = max(scores(:)) ;
75 | 
76 | [hy, hx] = ind2sub(size(scores), bestIndex) ;
77 | x = (hx - 1) * hogCellSize + 1 ;
78 | y = (hy - 1) * hogCellSize + 1 ;
79 | 
80 | modelWidth = size(trainHog, 2) ;
81 | modelHeight = size(trainHog, 1) ;
82 | detection = [
83 |   x - 0.5 ;
84 |   y - 0.5 ;
85 |   x + hogCellSize * modelWidth - 0.5 ;
86 |   y + hogCellSize * modelHeight - 0.5 ;] ;
87 | 
88 | figure(4) ; clf ;
89 | imagesc(im) ; axis equal ;
90 | hold on ;
91 | vl_plotbox(detection, 'g', 'linewidth', 5) ;
92 | title('Top detection') ;
93 | 
94 | 
95 | 
96 | 
--------------------------------------------------------------------------------
/exercise2.m:
--------------------------------------------------------------------------------
1 | % EXERCISE2
2 | setup ;
3 | 
4 | %targetClass = 1 ;
5 | %targetClass = 'prohibitory' ;
6 | targetClass = 'mandatory' ;
7 | %targetClass = 'danger' ;
8 | 
9 | loadData(targetClass) ;
10 | 
11 | % Compute HOG features of examples (see Step 1.2)
12 | hogCellSize = 8 ;
13 | trainHog = {} ;
14 | for i = 1:size(trainBoxPatches,4)
15 |   trainHog{i} = vl_hog(trainBoxPatches(:,:,:,i), hogCellSize) ;
16 | end
17 | trainHog = cat(4, trainHog{:}) ;
18 | 
19 | % Learn a trivial HOG model (see Step 1.3)
20 | w = mean(trainHog, 4) ;
21 | save('data/signs-model-1.mat', 'w', 'targetClass') ;
22 | 
23 | figure(2) ; clf ;
24 | imagesc(vl_hog('render', w)) ;
25 | colormap gray ; axis equal off ;
26 | title('Trivial HOG model') ;
27 | 
28 | % -------------------------------------------------------------------------
29 | % Step 2.1: Multi-scale detection
30 | % -------------------------------------------------------------------------
31 | 
32 | % Scale space configuration
33 | minScale = -1 ;
34 | maxScale = 3 ;
35 | numOctaveSubdivisions = 3 ;
36 | scales = 2.^linspace(...
37 |   minScale,...
38 |   maxScale,...
39 |   numOctaveSubdivisions*(maxScale-minScale+1)) ;
40 | 
41 | im = imread(testImages{3}) ;
42 | im = im2single(im) ;
43 | 
44 | figure(5) ; clf ;
45 | detection = detectAtMultipleScales(im, w, hogCellSize, scales) ;
46 | 
47 | figure(6) ; clf ;
48 | imagesc(im) ; axis equal off ; hold on ;
49 | vl_plotbox(detection, 'g', 'linewidth', 2) ;
50 | title('Trivial detector output') ;
51 | 
52 | % -------------------------------------------------------------------------
53 | % Step 2.2: Collect positive and negative training data
54 | % -------------------------------------------------------------------------
55 | 
56 | % Collect positive training data
57 | pos = trainHog ;
58 | 
59 | % Collect negative training data
60 | neg = {} ;
61 | modelWidth = size(trainHog, 2) ;
62 | modelHeight = size(trainHog, 1) ;
63 | for t=1:numel(trainImages)
64 |   % Get the HOG features of a training image
65 |   trainIm = imread(trainImages{t}) ;
66 |   trainIm = im2single(trainIm) ;
67 |   hog = vl_hog(trainIm, hogCellSize) ;
68 | 
69 |   % Sample uniformly 10 HOG patches
70 |   % Assume that these are negative (almost certainly true)
71 |   width = size(hog,2) - modelWidth + 1 ;
72 |   height = size(hog,1) - modelHeight + 1 ;
73 |   index = vl_colsubset(1:width*height, 10, 'uniform') ;
74 | 
75 |   for j=1:numel(index)
76 |     [hy, hx] = ind2sub([height width], index(j)) ;
77 |     sx = hx + (0:modelWidth-1) ;
78 |     sy = hy + (0:modelHeight-1) ;
79 |     neg{end+1} = hog(sy, sx, :) ;
80 |   end
81 | end
82 | neg = cat(4, neg{:}) ;
83 | 
84 | % -------------------------------------------------------------------------
85 | % Step 2.3: Learn a model with an SVM
86 | % -------------------------------------------------------------------------
87 | 
88 | numPos = size(pos,4) ;
89 | numNeg = size(neg,4) ;
90 | C = 10 ;
91 | lambda = 1 / (C * (numPos + numNeg)) ;
92 | 
93 | % Pack the data into a matrix with one datum per column
94 | x = cat(4, pos, neg) ;
95 | x = reshape(x, [], numPos + numNeg) ;
96 | 
97 | % Create a vector of binary labels
98 | y = [ones(1, size(pos,4)) -ones(1, size(neg,4))] ;
99 | 
100 | % Learn the SVM using an SVM solver
101 | w = vl_svmtrain(x,y,lambda,'epsilon',0.01,'verbose') ;
102 | 
103 | % Reshape the model vector into a model HOG template
104 | w = single(reshape(w, modelHeight, modelWidth, [])) ;
105 | save('data/signs-model-2.mat', 'w', 'targetClass') ;
106 | 
107 | % Plot model
108 | figure(7) ; clf ;
109 | imagesc(vl_hog('render', w)) ;
110 | colormap gray ; axis equal off ;
111 | title('SVM HOG model') ;
112 | 
113 | % -------------------------------------------------------------------------
114 | % Step 2.4: Evaluate learned model
115 | % -------------------------------------------------------------------------
116 | 
117 | % Compute detections
118 | figure(8) ; clf ;
119 | detection = detectAtMultipleScales(im, w, hogCellSize, scales) ;
120 | 
121 | % Plot top detection
122 | figure(9) ; clf ;
123 | imagesc(im) ; axis equal off ; hold on ;
124 | vl_plotbox(detection, 'g', 'linewidth', 2) ;
125 | title('SVM detector output') ;
126 | 
127 | 
128 | 
--------------------------------------------------------------------------------
/exercise3.m:
--------------------------------------------------------------------------------
1 | % EXERCISE3
2 | setup ;
3 | 
4 | % Feature configuration
5 | hogCellSize = 8 ;
6 | numHardNegativeMiningIterations = 3 ;
7 | minScale = -1 ;
8 | maxScale = 3 ;
9 | numOctaveSubdivisions = 3 ;
10 | scales = 2.^linspace(...
11 |   minScale,...
12 |   maxScale,...
13 |   numOctaveSubdivisions*(maxScale-minScale+1)) ;
14 | 
15 | % Load data
16 | load('data/signs-model-2.mat','w','targetClass') ;
17 | loadData(targetClass) ;
18 | 
19 | % -------------------------------------------------------------------------
20 | % Step 3.1: Multiple detections
21 | % -------------------------------------------------------------------------
22 | 
23 | im = imread(testImages{3}) ;
24 | im = im2single(im) ;
25 | 
26 | % Compute detections
27 | [detections, scores] = detect(im, w, hogCellSize, scales) ;
28 | 
29 | % Non-maxima suppression
30 | keep = boxsuppress(detections, scores, 0.25) ;
31 | 
32 | detections = detections(:, keep) ;
33 | scores = scores(keep) ;
34 | 
35 | % Further keep only top detections
36 | detections = detections(:, 1:10) ;
37 | scores = scores(1:10) ;
38 | 
39 | % Plot top detection
40 | figure(10) ; clf ;
41 | imagesc(im) ; axis equal ;
42 | hold on ;
43 | vl_plotbox(detections, 'g', 'linewidth', 2, ...
44 |   'label', arrayfun(@(x)sprintf('%.2f',x),scores,'uniformoutput',0)) ;
45 | title('Multiple detections') ;
46 | 
47 | % -------------------------------------------------------------------------
48 | % Step 3.2: Detector evaluation
49 | % -------------------------------------------------------------------------
50 | 
51 | % Find all the objects in the target image
52 | s = find(strcmp(testImages{3}, testBoxImages)) ;
53 | gtBoxes = testBoxes(:, s) ;
54 | 
55 | % No example is considered difficult
56 | gtDifficult = false(1, numel(s)) ;
57 | 
58 | % PASCAL-like evaluation
59 | matches = evalDetections(...
60 |   gtBoxes, gtDifficult, ...
61 |   detections, scores) ;
62 | 
63 | % Visualization
64 | figure(1) ; clf ;
65 | imagesc(im) ; axis equal ; hold on ;
66 | vl_plotbox(detections(:, matches.detBoxFlags==+1), 'g', 'linewidth', 2) ;
67 | vl_plotbox(detections(:, matches.detBoxFlags==-1), 'r', 'linewidth', 2) ;
68 | vl_plotbox(gtBoxes, 'b', 'linewidth', 1) ;
69 | axis off ;
70 | 
71 | figure(2) ; clf ;
72 | vl_pr(matches.labels, matches.scores) ;
73 | 
74 | % -------------------------------------------------------------------------
75 | % Step 3.3: Evaluation on multiple images
76 | % -------------------------------------------------------------------------
77 | 
78 | figure(3) ; clf ;
79 | 
80 | matches = evaluateModel(testImages, testBoxes, testBoxImages, ...
81 |   w, hogCellSize, scales) ;
82 | 
--------------------------------------------------------------------------------
/exercise4.m:
--------------------------------------------------------------------------------
1 | % EXERCISE4
2 | setup ;
3 | 
4 | % Training configuration
5 | %targetClass = 1 ;
6 | %targetClass = 'prohibitory' ;
7 | targetClass = 'mandatory' ;
8 | %targetClass = 'danger' ;
9 | numHardNegativeMiningIterations = 7 ;
10 | schedule = [1 2 5 5 100 100 100] ;
11 | 
12 | % Scale space configuration
13 | hogCellSize = 8 ;
14 | minScale = -1 ;
15 | maxScale = 3 ;
16 | numOctaveSubdivisions = 3 ;
17 | scales = 2.^linspace(...
18 |   minScale,...
19 |   maxScale,...
20 |   numOctaveSubdivisions*(maxScale-minScale+1)) ;
21 | 
22 | % Load data
23 | loadData(targetClass) ;
24 | 
25 | % Compute HOG features of examples (see Step 1.2)
26 | trainBoxHog = {} ;
27 | for i = 1:size(trainBoxPatches,4)
28 |   trainBoxHog{i} = vl_hog(trainBoxPatches(:,:,:,i), hogCellSize) ;
29 | end
30 | trainBoxHog = cat(4, trainBoxHog{:}) ;
31 | modelWidth = size(trainBoxHog,2) ;
32 | modelHeight = size(trainBoxHog,1) ;
33 | 
34 | % -------------------------------------------------------------------------
35 | % Step 4.1: Train with hard negative mining
36 | % -------------------------------------------------------------------------
37 | 
38 | % Initial positive and negative data
39 | pos = trainBoxHog(:,:,:,ismember(trainBoxLabels,targetClass)) ;
40 | neg = zeros(size(pos,1),size(pos,2),size(pos,3),0) ;
41 | 
42 | for t=1:numHardNegativeMiningIterations
43 |   numPos = size(pos,4) ;
44 |   numNeg = size(neg,4) ;
45 |   C = 1 ;
46 |   lambda = 1 / (C * (numPos + numNeg)) ;
47 | 
48 |   fprintf('Hard negative mining iteration %d: pos %d, neg %d\n', ...
49 |     t, numPos, numNeg) ;
50 | 
51 |   % Train an SVM model (see Step 2.2)
52 |   x = cat(4, pos, neg) ;
53 |   x = reshape(x, [], numPos + numNeg) ;
54 |   y = [ones(1, size(pos,4)) -ones(1, size(neg,4))] ;
55 |   w = vl_svmtrain(x,y,lambda,'epsilon',0.01,'verbose') ;
56 |   w = single(reshape(w, modelHeight, modelWidth, [])) ;
57 | 
58 |   % Plot model
59 |   figure(1) ; clf ;
60 |   imagesc(vl_hog('render', w)) ;
61 |   colormap gray ; axis equal ;
62 |   title(sprintf('SVM HOG model (retraining iteration %d)',t)) ;
63 | 
64 |   % Evaluate on training data and mine hard negatives
65 |   figure(2) ; set(gcf, 'name', sprintf('Retraining iteration %d',t)) ;
66 |   [matches, moreNeg] = ...
67 |     evaluateModel(...
68 |     vl_colsubset(trainImages', schedule(t), 'beginning'), ...
69 |     trainBoxes, trainBoxImages, ...
70 |     w, hogCellSize, scales) ;
71 | 
72 |   % Add negatives
73 |   neg = cat(4, neg, moreNeg) ;
74 | 
75 |   % Remove negative duplicates
76 |   z = reshape(neg, [], size(neg,4)) ;
77 |   [~,keep] = unique(z','stable','rows') ;
78 |   neg = neg(:,:,:,keep) ;
79 | end
80 | 
81 | % -------------------------------------------------------------------------
82 | % Step 4.2: Evaluate the model on the test data
83 | % -------------------------------------------------------------------------
84 | 
85 | figure(3) ; clf ;
86 | evaluateModel(...
87 |   testImages, testBoxes, testBoxImages, ...
88 |   w, hogCellSize, scales) ;
--------------------------------------------------------------------------------
/exercise5.m:
--------------------------------------------------------------------------------
1 | % EXERCISE5
2 | setup ;
3 | 
4 | % Training configuration
5 | targetClass = 1 ;
6 | numHardNegativeMiningIterations = 5 ;
7 | schedule = [1 2 5 5 5] ;
8 | 
9 | % Scale space configuration
10 | hogCellSize = 8 ;
11 | minScale = -1 ;
12 | maxScale = 3 ;
13 | numOctaveSubdivisions = 3 ;
14 | scales = 2.^linspace(...
15 |   minScale,...
16 |   maxScale,...
17 | numOctaveSubdivisions*(maxScale-minScale+1)) ; 18 | 19 | % ------------------------------------------------------------------------- 20 | % Step 5.1: Construct custom training data 21 | % ------------------------------------------------------------------------- 22 | 23 | % Load object examples 24 | trainImages = {} ; 25 | trainBoxes = [] ; 26 | trainBoxPatches = {} ; 27 | trainBoxImages = {} ; 28 | trainBoxLabels = [] ; 29 | 30 | % Construct negative data 31 | names = dir('data/myNegatives/*.jpeg') ; 32 | trainImages = fullfile('data', 'myNegatives', {names.name}) ; 33 | 34 | % Construct positive data 35 | names = dir('data/myPositives/*.jpeg') ; 36 | names = fullfile('data', 'myPositives', {names.name}) ; 37 | for i=1:numel(names) 38 | im = imread(names{i}) ; 39 | im = imresize(im, [64 64]) ; 40 | trainBoxes(:,i) = [0.5 ; 0.5 ; 64.5 ; 64.5] ; 41 | trainBoxPatches{i} = im2single(im) ; 42 | trainBoxImages{i} = names{i} ; 43 | trainBoxLabels(i) = 1 ; 44 | end 45 | trainBoxPatches = cat(4, trainBoxPatches{:}) ; 46 | 47 | % Compute HOG features of examples (see Step 1.2) 48 | trainBoxHog = {} ; 49 | for i = 1:size(trainBoxPatches,4) 50 | trainBoxHog{i} = vl_hog(trainBoxPatches(:,:,:,i), hogCellSize) ; 51 | end 52 | trainBoxHog = cat(4, trainBoxHog{:}) ; 53 | modelWidth = size(trainBoxHog,2) ; 54 | modelHeight = size(trainBoxHog,1) ; 55 | 56 | % ------------------------------------------------------------------------- 57 | % Step 5.2: Visualize the training images 58 | % ------------------------------------------------------------------------- 59 | 60 | figure(1) ; clf ; 61 | 62 | subplot(1,2,1) ; 63 | imagesc(vl_imarraysc(trainBoxPatches)) ; 64 | axis off ; 65 | title('Training images (positive samples)') ; 66 | axis equal ; 67 | 68 | subplot(1,2,2) ; 69 | imagesc(mean(trainBoxPatches,4)) ; 70 | box off ; 71 | title('Average') ; 72 | axis equal ; 73 | 74 | % ------------------------------------------------------------------------- 75 | % Step 5.3: Train with hard negative mining 76 | % ------------------------------------------------------------------------- 77 | 78 | % Initial positive and negative data 79 | pos = trainBoxHog(:,:,:,ismember(trainBoxLabels,targetClass)) ; 80 | neg = zeros(size(pos,1),size(pos,2),size(pos,3),0) ; 81 | 82 | for t=1:numHardNegativeMiningIterations 83 | numPos = size(pos,4) ; 84 | numNeg = size(neg,4) ; 85 | C = 1 ; 86 | lambda = 1 / (C * (numPos + numNeg)) ; 87 | 88 | fprintf('Hard negative mining iteration %d: pos %d, neg %d\n', ... 89 | t, numPos, numNeg) ; 90 | 91 | % Train an SVM model (see Step 2.2) 92 | x = cat(4, pos, neg) ; 93 | x = reshape(x, [], numPos + numNeg) ; 94 | y = [ones(1, size(pos,4)) -ones(1, size(neg,4))] ; 95 | w = vl_svmtrain(x,y,lambda,'epsilon',0.01,'verbose') ; 96 | w = single(reshape(w, modelHeight, modelWidth, [])) ; 97 | 98 | % Plot model 99 | figure(2) ; clf ; 100 | imagesc(vl_hog('render', w)) ; 101 | colormap gray ; 102 | axis equal ; 103 | title('SVM HOG model') ; 104 | 105 | % Evaluate on training data and mine hard negatives 106 | figure(3) ; 107 | [matches, moreNeg] = ... 108 | evaluateModel(... 109 | vl_colsubset(trainImages', schedule(t), 'beginning'), ... 110 | trainBoxes, trainBoxImages, ... 
111 |     w, hogCellSize, scales) ;
112 | 
113 |   % Add negatives
114 |   neg = cat(4, neg, moreNeg) ;
115 | 
116 |   % Remove negative duplicates
117 |   z = reshape(neg, [], size(neg,4)) ;
118 |   [~,keep] = unique(z','stable','rows') ;
119 |   neg = neg(:,:,:,keep) ;
120 | end
121 | 
122 | 
123 | % -------------------------------------------------------------------------
124 | % Step 5.4: Evaluate the model on the test data
125 | % -------------------------------------------------------------------------
126 | 
127 | im = imread('data/myTestImage.jpeg') ;
128 | im = im2single(im) ;
129 | 
130 | % Compute detections, keeping the top ten after non-maximum suppression
131 | [detections, scores] = detect(im, w, hogCellSize, scales) ;
132 | keep = vl_colsubset(find(boxsuppress(detections, scores, 0.25)), 10, 'beginning') ;
133 | detections = detections(:, keep) ;
134 | scores = scores(keep) ;
135 | 
136 | % Plot top detections
137 | figure(3) ; clf ;
138 | imagesc(im) ; axis equal ;
139 | hold on ;
140 | vl_plotbox(detections, 'g', 'linewidth', 2, ...
141 |   'label', arrayfun(@(x)sprintf('%.2f',x),scores,'uniformoutput',0)) ;
142 | title('Multiple detections') ;
--------------------------------------------------------------------------------
/extra/Makefile:
--------------------------------------------------------------------------------
1 | name ?= practical-category-detection
2 | ver ?= 2018a
3 | 
4 | code=\
5 | boxinclusion.m \
6 | boxoverlap.m \
7 | boxsuppress.m \
8 | detect.m \
9 | detectAtMultipleScales.m \
10 | evalDetections.m \
11 | evaluateModel.m \
12 | exercise1.m \
13 | exercise2.m \
14 | exercise3.m \
15 | exercise4.m \
16 | exercise5.m \
17 | extract.m \
18 | loadData.m \
19 | setup.m \
20 | README.md \
21 | vlfeat \
22 | matconvnet
23 | 
24 | doc=\
25 | doc/images \
26 | doc/instructions.html
27 | 
28 | data=\
29 | data/signs.mat \
30 | data/signs-sample-image.jpg \
31 | data/signs
32 | 
33 | include extra/practical/Makefile
--------------------------------------------------------------------------------
/extra/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | mkdir -p data/tmp
4 | 
5 | cp -vf extra/signs-sample-image.jpg data/
6 | 
7 | (
8 |   cd data/tmp
9 |   wget -c http://benchmark.ini.rub.de/Dataset_GTSDB/TrainIJCNN2013.zip
10 |   unzip -n TrainIJCNN2013.zip
11 | )
12 | 
13 | mkdir -p data/signs
14 | mogrify -path data/signs -format jpeg data/tmp/TrainIJCNN2013/*.ppm
--------------------------------------------------------------------------------
/extra/prepareLabData.m:
--------------------------------------------------------------------------------
1 | % PREPARELABDATA
2 | 
3 | % --------------------------------------------------------------------
4 | % Download VLFeat
5 | % --------------------------------------------------------------------
6 | 
7 | if ~exist('vlfeat', 'dir')
8 |   from = 'http://www.vlfeat.org/download/vlfeat-0.9.21-bin.tar.gz' ;
9 |   fprintf('Downloading vlfeat from %s\n', from) ;
10 |   untar(from, 'data') ;
11 |   movefile('data/vlfeat-0.9.21', 'vlfeat') ;
12 | end
13 | 
14 | setup ;
15 | 
16 | % --------------------------------------------------------------------
17 | % Download and preprocess traffic sign data
18 | % --------------------------------------------------------------------
19 | 
20 | prefix = 'data/tmp/TrainIJCNN2013' ;
21 | [names,x1,y1,x2,y2,labels] = textread(fullfile(prefix, 'gt.txt'), ...
22 |   '%s%d%d%d%d%d', 'headerlines', 1, 'delimiter', ';') ;
23 | boxes = [x1, y1, x2, y2]'+1 ;
24 | 
25 | images = fullfile(prefix, names) ;
26 | patches = {} ;
27 | for j = 1:numel(images)
28 |   t = imread(images{j}) ;
29 |   t = im2single(t) ;
30 |   t = imcrop(t, [x1(j) y1(j) x2(j)-x1(j)+1 y2(j)-y1(j)+1]) ;
31 |   t = imresize(t, [64 64]) ;
32 |   patches{j} = t ;
33 |   [~,base,~] = fileparts(images{j}) ;
34 |   images{j} = fullfile('data', 'signs', [base '.jpeg']) ;
35 | end
36 | patches = cat(4, patches{:}) ;
37 | 
38 | train = unique(names) ;
39 | train = train(randperm(numel(train))) ;
40 | train = train(1:400) ;
41 | train = ismember(names, train) ;
42 | test = ~train ;
43 | 
44 | trainImages = unique(images(train)) ;
45 | trainBoxes = boxes(:, train) ;
46 | trainBoxImages = images(train) ;
47 | trainBoxLabels = labels(train) ;
48 | trainBoxPatches = patches(:,:,:,train) ;
49 | 
50 | testImages = unique(images(test)) ;
51 | testBoxes = boxes(:, test) ;
52 | testBoxImages = images(test) ;
53 | testBoxLabels = labels(test) ;
54 | testBoxPatches = patches(:,:,:,test) ;
55 | 
56 | save('data/signs.mat', ...
57 |   'trainImages', ...
58 |   'trainBoxes', ...
59 |   'trainBoxImages', ...
60 |   'trainBoxLabels', ...
61 |   'trainBoxPatches', ...
62 |   'testImages', ...
63 |   'testBoxes', ...
64 |   'testBoxImages', ...
65 |   'testBoxLabels', ...
66 |   'testBoxPatches') ;
67 | 
--------------------------------------------------------------------------------
/extra/signs-sample-image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-object-category-detection/046b6c030babe0f5e76e842abf61145136866308/extra/signs-sample-image.jpg
--------------------------------------------------------------------------------
/extract.m:
--------------------------------------------------------------------------------
1 | function features = extract(hog, hogCellSize, scales, w, detections)
2 | % EXTRACT  Extract the HOG features of the given detection boxes from the HOG pyramid HOG.
3 | modelWidth = size(w,2) ;
4 | modelHeight = size(w,1) ;
5 | 
6 | s = (detections(3,:) - detections(1,:)) / hogCellSize / modelWidth ; % scale of each detection relative to the template
7 | 
8 | features = {} ;
9 | for i=1:size(detections,2)
10 |   [~,j] = min(abs(s(i) - scales)) ; % index of the nearest pyramid level
11 | 
12 |   hx = (detections(1,i) - 0.5) / hogCellSize / s(i) + 1 ;
13 |   hy = (detections(2,i) - 0.5) / hogCellSize / s(i) + 1 ;
14 |   sx = round(hx) + (0:modelWidth-1) ;
15 |   sy = round(hy) + (0:modelHeight-1) ;
16 | 
17 |   features{end+1} = hog{j}(sy, sx, :) ;
18 | end
19 | features = cat(4, features{:}) ;
20 | 
--------------------------------------------------------------------------------
/loadData.m:
--------------------------------------------------------------------------------
1 | function loadData(targetClass, numPosImages, numNegImages)
2 | % LOADDATA Load data for the exercises
3 | % LOADDATA(TARGETCLASS) loads the data configuring it to train
4 | % the specified target class. TARGETCLASS is a vector of one or more
5 | % labels. If more than one label is specified, then multiple classes
6 | % are merged into one.
7 | %
8 | % LOADDATA(TARGETCLASS, NUMPOSIMAGES, NUMNEGIMAGES) allows specifying
9 | % the number of positive and negative images too.
10 | %
11 | % The following variables are created in the workspace:
12 | %
13 | % - trainImages: list of training image names.
14 | % - trainBoxes: 4 x N array of object bounding boxes.
15 | % - trainBoxImages: for each box, the corresponding image.
16 | % - trainBoxLabels: the class label of the box (one of TARGETCLASS).
17 | % - trainBoxPatches: 64 x 64 x 3 x N array of box patches.
18 | % 19 | % The same for the test data. 20 | 21 | if nargin < 2 22 | numPosImages = 20 ; 23 | end 24 | 25 | if nargin < 3 26 | numNegImages = 20 ; 27 | end 28 | 29 | load('data/signs.mat', ... 30 | 'trainImages', ... 31 | 'trainBoxes', ... 32 | 'trainBoxImages', ... 33 | 'trainBoxLabels', ... 34 | 'trainBoxPatches', ... 35 | 'testImages', ... 36 | 'testBoxes', ... 37 | 'testBoxImages', ... 38 | 'testBoxLabels', ... 39 | 'testBoxPatches') ; 40 | 41 | 42 | if isstr(targetClass) 43 | switch lower(targetClass) 44 | case 'prohibitory', targetClass = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 15, 16] ; 45 | case 'mandatory', targetClass = [33, 34, 35, 36, 37, 38, 39, 40] ; 46 | case 'danger', targetClass = [11, 18, 19, 20 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] ; 47 | end 48 | end 49 | 50 | % Select only the target class 51 | ok = ismember(trainBoxLabels, targetClass) ; 52 | trainBoxes = trainBoxes(:,ok) ; 53 | trainBoxImages = trainBoxImages(ok) ; 54 | trainBoxLabels = trainBoxLabels(ok) ; 55 | trainBoxPatches = trainBoxPatches(:,:,:,ok) ; 56 | 57 | ok = ismember(testBoxLabels, targetClass) ; 58 | testBoxes = testBoxes(:,ok) ; 59 | testBoxImages = testBoxImages(ok) ; 60 | testBoxLabels = testBoxLabels(ok) ; 61 | testBoxPatches = testBoxPatches(:,:,:,ok) ; 62 | 63 | % Select a subset of training and testing images 64 | [~,perm] = sort(ismember(trainImages, trainBoxImages),'descend') ; 65 | trainImages = trainImages(vl_colsubset(perm', numPosImages, 'beginning')) ; 66 | 67 | [~,perm] = sort(ismember(testImages, testBoxImages),'descend') ; 68 | testImages = testImages(vl_colsubset(perm', numNegImages, 'beginning')) ; 69 | 70 | vars = {... 71 | 'trainImages', ... 72 | 'trainBoxes', ... 73 | 'trainBoxImages', ... 74 | 'trainBoxLabels', ... 75 | 'trainBoxPatches', ... 76 | 'testImages', ... 77 | 'testBoxes', ... 78 | 'testBoxImages', ... 79 | 'testBoxLabels', ... 80 | 'testBoxPatches', ... 81 | 'targetClass'} ; 82 | 83 | for i = 1:numel(vars) 84 | assignin('caller',vars{i},eval(vars{i})) ; 85 | end 86 | -------------------------------------------------------------------------------- /setup.m: -------------------------------------------------------------------------------- 1 | function setup(varargin) 2 | % SETUP Add the required search paths to MATLAB 3 | run matconvnet/matlab/vl_setupnn ; 4 | run vlfeat/toolbox/vl_setup ; 5 | 6 | opts.useGpu = false ; 7 | opts.verbose = false ; 8 | opts.enableImReadJPEG = false ; 9 | opts = vl_argparse(opts, varargin) ; 10 | 11 | try 12 | vl_nnconv(single(1),single(1),[]) ; 13 | catch 14 | warning('VL_NNCONV() does not seem to be compiled. Trying to compile it now.') ; 15 | vl_compilenn('enableGpu', opts.useGpu, ... 16 | 'enableImReadJPEG', opts.enableImReadJPEG, ... 17 | 'verbose', opts.verbose) ; 18 | end 19 | 20 | if opts.useGpu 21 | try 22 | vl_nnconv(gpuArray(single(1)),gpuArray(single(1)),[]) ; 23 | catch 24 | vl_compilenn('enableGpu', opts.useGpu, ... 25 | 'enableImReadJPEG', opts.enableImReadJPEG, ... 26 | 'verbose', opts.verbose) ; 27 | warning('GPU support does not seem to be compiled in MatConvNet. Trying to compile it now') ; 28 | end 29 | end 30 | --------------------------------------------------------------------------------