├── .gitignore ├── .gitmodules ├── README.md ├── boxinclusion.m ├── boxoverlap.m ├── boxsuppress.m ├── detect.m ├── detectAtMultipleScales.m ├── doc ├── images │ ├── cover.idraw │ └── cover.jpeg ├── instructions.html └── instructions.md ├── evalDetections.m ├── evaluateModel.m ├── exercise1.m ├── exercise2.m ├── exercise3.m ├── exercise4.m ├── exercise5.m ├── extra ├── Makefile ├── download.sh ├── prepareLabData.m └── signs-sample-image.jpg ├── extract.m ├── loadData.m └── setup.m /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | vlfeat 3 | doc/base.css 4 | doc/prism.css 5 | doc/prism.js 6 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "matconvnet"] 2 | path = matconvnet 3 | url = ssh://git@bitbucket.org/ovl/matconvnet.git 4 | [submodule "extra/practical"] 5 | path = extra/practical 6 | url = git@github.com:vedaldi/practical.git 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Object category detection practical 2 | =================================== 3 | 4 | > A computer vision practical by the Oxford Visual Geometry group, 5 | > authored by Andrea Vedaldi and Andrew Zisserman. 6 | 7 | Start from `doc/instructions.html`. 8 | 9 | Package contents 10 | ---------------- 11 | 12 | The practical consists of four exercises, organized in the following 13 | files: 14 | 15 | * `exercise1.m` -- Part 1: Detection fundamentals 16 | * `exercise2.m` -- Part 2: Multiple scales and learning with an SVM 17 | * `exercise3.m` -- Part 3: Multiple objects and evaluation 18 | * `exercise4.m` -- Part 4: Hard negative mining 19 | * `exercise5.m` -- Part 5: Train your own object detector 20 | 21 | The practical runs in MATLAB and uses 22 | [MatConvNet](http://www.vlfeat.org/matconvnet) and 23 | [VLFeat](http://www.vlfeat.org). This package contains the following 24 | MATLAB functions: 25 | 26 | * `boxinclusion.m`: compute the inclusion of bounding boxes. 27 | * `boxoverlap.m`: compute the overlap of bounding boxes. 28 | * `boxsuppress.m`: non-maxima box suppression. 29 | * `detect.m`: sliding window detector. 30 | * `detectAtMultipleScales.m`: an intermediate example detector. 31 | * `evalDetections.m`: evaluate detections using the PASCAL VOC criterion. 32 | * `evaluateModel.m`: evaluate a detector against a database of images. 33 | * `extract.m`: extract HOG features from bounding boxes. 34 | * `loadData.m`: load practical data. 35 | * `setup.m`: setup MATLAB environment. 36 | 37 | Appendix: Installing from scratch 38 | --------------------------------- 39 | 40 | The practical requires both VLFeat and MatConvNet. VLFeat comes with 41 | pre-built binaries, but MatConvNet does not. 42 | 43 | 1. From Bash, run `./extras/download.sh`. This will download the 44 | German Street Sign Benchmark data and VLFeat. 45 | 2. From MATLAB, run `addpath extras ; prepareLabData.m`. 
46 | 47 | Changes 48 | ------- 49 | 50 | * *2014a* - Initial edition 51 | 52 | License 53 | ------- 54 | 55 | Copyright (c) 2011-13 Andrea Vedaldi 56 | 57 | Permission is hereby granted, free of charge, to any person 58 | obtaining a copy of this software and associated documentation 59 | files (the "Software"), to deal in the Software without 60 | restriction, including without limitation the rights to use, copy, 61 | modify, merge, publish, distribute, sublicense, and/or sell copies 62 | of the Software, and to permit persons to whom the Software is 63 | furnished to do so, subject to the following conditions: 64 | 65 | The above copyright notice and this permission notice shall be 66 | included in all copies or substantial portions of the Software. 67 | 68 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 69 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 70 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 71 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 72 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 73 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 74 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 75 | DEALINGS IN THE SOFTWARE. 76 | -------------------------------------------------------------------------------- /boxinclusion.m: -------------------------------------------------------------------------------- 1 | function dist = calcBoxInclusion(A, B, varargin) 2 | % GETBOXOVERLAP 3 | % A and B have a box for each column, in the format [xmin ymin xmax 4 | % ymax]. The resulting matrix dist has A's boxes along the rows 5 | % and B's boxes along the columns and contains the percentage of 6 | % the area of each box B contained in the box A. 7 | % 8 | % Author:: Andrea Vedaldi 9 | 10 | % AUTORIGHTS 11 | % Copyright (C) 2008-09 Andrea Vedaldi 12 | % 13 | % This file is part of VGG MKL classification and detection code, 14 | % available in the terms of the GNU General Public License version 2. 15 | 16 | opts.pascalFormat = false ; 17 | opts = vl_argparse(opts, varargin) ; 18 | 19 | m = size(A,2) ; 20 | n = size(B,2) ; 21 | O = [] ; 22 | 23 | if m==0 || n==0, dist = zeros(m,n) ; return ; end 24 | 25 | om = ones(1,m) ; 26 | on = ones(1,n) ; 27 | 28 | if opts.pascalFormat 29 | A(3:4,:) = A(3:4,:) + 1 ; 30 | B(3:4,:) = B(3:4,:) + 1 ; 31 | end 32 | 33 | % find length Ox of the overlap range [x1, x2] along x 34 | % x1 cannot be smaller than A.xmin B.xmin 35 | % x2 cannot be larger than A.xmax B.xmax 36 | % Ox is x2 - x1 or 0 37 | 38 | x1 = max(A(1*on,:)', B(1*om,:)) ; 39 | x2 = min(A(3*on,:)', B(3*om,:)) ; 40 | Ox = max(x2 - x1, 0) ; 41 | 42 | y1 = max(A(2*on,:)', B(2*om,:)) ; 43 | y2 = min(A(4*on,:)', B(4*om,:)) ; 44 | Oy = max(y2 - y1, 0) ; 45 | 46 | % are of the intersection 47 | areaInt = Ox .* Oy ; 48 | 49 | % area of the union is sum of areas - inersection 50 | areaA = prod(A(3:4,:) - A(1:2,:)) ; 51 | areaB = prod(B(3:4,:) - B(1:2,:)) ; 52 | 53 | % final distance matrix 54 | dist = areaInt ./ (areaB(om,:) + eps) ; 55 | 56 | -------------------------------------------------------------------------------- /boxoverlap.m: -------------------------------------------------------------------------------- 1 | function dist = calcBoxOverlap(A, B, varargin) 2 | % GETBOXOVERLAP 3 | % A and B have a box for each column, in the format [xmin ymin xmax 4 | % ymax]. The resulting matrix dist has A's boxes along the rows 5 | % and B's boxes along the columns. 
6 | % 7 | % Options: 8 | % 9 | % pascalFormat:: false 10 | % If set to TRUE, then the boxes are assumed to be specified in 11 | % the PASCAL format. In this case the coordinates are indeces of 12 | % the upper-left and bottom-right pixels, not the coordinates of 13 | % 2-D points. The difference is that in the former case the area 14 | % of the box includes the pixels that belongs to the boundary. For 15 | % instance the box [1;1;1;1] has area 1 according to the PASCAL 16 | % convention, and area 0 according to the default convention. 17 | % 18 | % Author:: Andrea Vedaldi 19 | 20 | % AUTORIGHTS 21 | % Copyright (C) 2008-09 Andrea Vedaldi 22 | % 23 | % This file is part of the VGG MKL Class and VGG MKL Det code packages, 24 | % available in the terms of the GNU General Public License version 2. 25 | 26 | opts.pascalFormat = false ; 27 | opts = vl_argparse(opts, varargin) ; 28 | 29 | m = size(A,2) ; 30 | n = size(B,2) ; 31 | O = [] ; 32 | 33 | if m==0 || n==0, dist = zeros(m,n) ; return ; end 34 | 35 | om = ones(1,m) ; 36 | on = ones(1,n) ; 37 | 38 | if opts.pascalFormat 39 | A(3:4,:) = A(3:4,:) + 1 ; 40 | B(3:4,:) = B(3:4,:) + 1 ; 41 | end 42 | 43 | % find length Ox of the overlap range [x1, x2] along x 44 | % x1 cannot be smaller than A.xmin B.xmin 45 | % x2 cannot be larger than A.xmax B.xmax 46 | % Ox is x2 - x1 or 0 47 | 48 | x1 = max(A(1*on,:)', B(1*om,:)) ; 49 | x2 = min(A(3*on,:)', B(3*om,:)) ; 50 | Ox = max(x2 - x1, 0) ; 51 | 52 | y1 = max(A(2*on,:)', B(2*om,:)) ; 53 | y2 = min(A(4*on,:)', B(4*om,:)) ; 54 | Oy = max(y2 - y1, 0) ; 55 | 56 | % are of the intersection, of A, and of B 57 | areaInt = Ox .* Oy ; 58 | areaA = prod(A(3:4,:) - A(1:2,:)) ; 59 | areaB = prod(B(3:4,:) - B(1:2,:)) ; 60 | 61 | % area of the union is sum of areas - inersection 62 | dist = areaInt ./ (areaA(on,:)' + areaB(om,:) - areaInt) ; 63 | -------------------------------------------------------------------------------- /boxsuppress.m: -------------------------------------------------------------------------------- 1 | function keep = boxsuppress(boxes, scores, threshold) 2 | % BOXSUPPRESS Box non-maxima suprression 3 | % KEEP = BOXSUPPRESS(BOXES, SCORES, THRESHOLD) 4 | 5 | % remove any empty box (xmax < xmin or ymax < ymin) 6 | scores(any([-1 0 1 0 ; 0 -1 0 1] * boxes < 0)) = -inf ; 7 | 8 | keep = false(1, size(boxes,2)) ; 9 | while true 10 | [score, best] = max(scores) ; 11 | if score == -inf, break ; end 12 | keep(best) = true ; 13 | remove = boxinclusion(boxes(:,best), boxes, 'pascalFormat', true) >= threshold ; 14 | scores(remove) = -inf ; 15 | scores(best) = -inf ; % `best` is not in `remove` if threshold > 1 16 | end 17 | -------------------------------------------------------------------------------- /detect.m: -------------------------------------------------------------------------------- 1 | function [detections,scores,hog] = detect(im, w, hogCellSize, scales) 2 | 3 | modelWidth = size(w, 2) ; 4 | modelHeight = size(w, 1) ; 5 | 6 | detections = {} ; 7 | scores = {} ; 8 | hog = {} ; 9 | 10 | for s = scales 11 | % scale image 12 | t = imresize(im, 1/s) ; 13 | 14 | % skip if too small 15 | if min([size(t,1), size(t,2)]) < 128, break ; end 16 | 17 | % extract HOG features 18 | hog{end+1} = vl_hog(t, hogCellSize) ; 19 | 20 | % convolve model 21 | sc = vl_nnconv(hog{end}, w, []) ; 22 | 23 | % get all detections 24 | [hy,hx] = ind2sub(size(sc), 1:numel(sc)) ; 25 | 26 | hx = hx(:)' ; 27 | hy = hy(:)' ; 28 | x = (hx - 1) * hogCellSize * s + 1 ; 29 | y = (hy - 1) * hogCellSize * s + 1 ; 30 | detections{end+1} 
= [... 31 | x - 0.5 ; 32 | y - 0.5 ; 33 | x + hogCellSize * modelWidth * s - 0.5 ; 34 | y + hogCellSize * modelHeight * s - 0.5 ;] ; 35 | scores{end+1} = sc(:)' ; 36 | end 37 | 38 | detections = cat(2, detections{:}) ; 39 | scores = cat(2, scores{:}) ; 40 | 41 | [~, perm] = sort(scores, 'descend') ; 42 | 43 | perm = perm(1:1000) ; 44 | scores = scores(perm) ; 45 | detections = detections(:, perm) ; 46 | -------------------------------------------------------------------------------- /detectAtMultipleScales.m: -------------------------------------------------------------------------------- 1 | function detection = detectAtMultipleScales(im, w, hogCellSize, scales) 2 | 3 | modelWidth = size(w, 2) ; 4 | modelHeight = size(w, 1) ; 5 | bestScore = -inf ; 6 | minScore = +inf ; 7 | maxScore = -inf ; 8 | h = [] ; 9 | 10 | for s = scales 11 | % scale image 12 | t = imresize(im, 1/s) ; 13 | 14 | % extract HOG features 15 | hog = vl_hog(t, hogCellSize) ; 16 | 17 | % convolve model 18 | scores = vl_nnconv(hog, w, []) ; 19 | 20 | % pick best response 21 | [score, index] = max(scores(:)) ; 22 | if score > bestScore 23 | bestScore = score ; 24 | [hy, hx] = ind2sub(size(scores), index) ; 25 | x = (hx - 1) * hogCellSize * s + 1 ; 26 | y = (hy - 1) * hogCellSize * s + 1 ; 27 | detection = [ 28 | x - 0.5 ; 29 | y - 0.5 ; 30 | x + hogCellSize * modelWidth * s - 0.5 ; 31 | y + hogCellSize * modelHeight * s - 0.5 ;] ; 32 | end 33 | 34 | % plot score map 35 | vl_tightsubplot(numel(scales),find(s==scales)) ; 36 | imagesc(scores) ; axis off square ; 37 | h(end+1) = gca; 38 | minScore = min([minScore;scores(:)]) ; 39 | maxScore = max([maxScore;scores(:)]) ; 40 | end 41 | 42 | set(h, 'clim', [minScore, maxScore]) ; 43 | 44 | 45 | -------------------------------------------------------------------------------- /doc/images/cover.idraw: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vedaldi/practical-object-category-detection/046b6c030babe0f5e76e842abf61145136866308/doc/images/cover.idraw -------------------------------------------------------------------------------- /doc/images/cover.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vedaldi/practical-object-category-detection/046b6c030babe0f5e76e842abf61145136866308/doc/images/cover.jpeg -------------------------------------------------------------------------------- /doc/instructions.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | VGG Practical 7 | 8 | 9 | 10 | 11 |

Object category detection practical

12 |

This is an Oxford Visual Geometry Group computer vision practical, authored by Andrea Vedaldi and Andrew Zisserman (Release 2018a).

13 |

cover

14 |

The goal of object category detection is to identify and localize objects of a given type in an image. Example applications include detecting pedestrians, cars, or traffic signs in street scenes; objects of interest such as tools or animals in web images; or particular features in medical images. Given a target class, such as people, a detector receives as input an image and produces as output zero, one, or more bounding boxes around each occurrence of the object class in the image. The key challenge is that the detector needs to find objects regardless of their location and scale in the image, as well as pose and other variation factors, such as clothing, illumination, and occlusions.

15 |

This practical explores basic techniques in visual object detection, focusing on image based models. The appearance of image patches containing objects is learned using statistical analysis. Then, in order to detect objects in an image, the statistical model is applied to image windows extracted at all possible scales and locations, in order to identify which ones, if any, contain the object.

16 |

In more detail, the practical explores the following topics: (i) using HOG features to describe image regions; (ii) building a HOG-based sliding-window detector to localize objects in images; (iii) working with multiple scales and multiple object occurrences; (iv) using a linear support vector machine to learn the appearance of objects; (v) evaluating an object detector in terms of average precision; (vi) learning an object detector using hard negative mining.

17 |
18 | 59 |
60 |

Getting started

61 |

Read and understand the requirements and installation instructions. The download links for this practical are:

* Code and data: http://www.robots.ox.ac.uk/~vgg/share/practical-category-detection-2018a.tar.gz
* Code only: http://www.robots.ox.ac.uk/~vgg/share/practical-category-detection-2018a-code-only.tar.gz
* Data only: http://www.robots.ox.ac.uk/~vgg/share/practical-category-detection-2018a-data-only.tar.gz
* Git repository (for lab setters and developers): https://github.com/vedaldi/practical-object-category-detection

After the installation is complete, open and edit the script exercise1.m in the MATLAB editor. The script contains commented code and a description for all steps of this exercise, relative to Part I of this document. You can cut and paste this code into the MATLAB window to run it, and will need to modify it as you go through the session. The other files exercise2.m, exercise3.m, and exercise4.m are given for Parts II, III, and IV.

69 |

Each part contains several Questions and Tasks to be answered/completed before proceeding further in the practical.

70 |

Part 1: Detection fundamentals

71 |

Parts I--IV use as a running example the problem of street sign detection, using the data from the German Traffic Sign Detection Benchmark. This data consists of a number of example traffic images, as well as a number of larger test images containing one or more traffic signs at different sizes and locations. It also comes with ground truth annotation, i.e. with specified bounding boxes and sign labels for each sign occurrence, which is required to evaluate the quality of the detector.

72 |

In this part we will build a basic sliding-window object detector based on HOG features. Follow the steps below:

73 |

Step 1.0: Loading the training data

74 |

The MATLAB m-file loadData.m loads the data for the practical into memory. The function loadData(targetClass) takes a targetClass argument specifying the object class of interest. Open the exercise1.m file, select the following part of the code, and execute it in MATLAB (right button > Evaluate selection or Shift+F7).

75 |
% Load the training and testing data (trainImages, trainBoxes, ...)
76 | % The function takes the ID of the type of traffic sign we want to recognize
 77 | % 1 is the 30 km/h speed limit
 78 | loadData(1) ;
 79 | 
80 | 81 |

This loads into the current workspace the following variables:

* trainImages: a list of train image names.
* trainBoxes: a $4\times N$ array of object bounding boxes, in the form $[x_\text{min},y_\text{min},x_\text{max},y_\text{max}]$.
* trainBoxImages: for each bounding box, the name of the image containing it.
* trainBoxLabels: for each bounding box, the object label. It is an index into targetClass.
* trainBoxPatches: a $64 \times 64 \times 3 \times N$ array of image patches, one for each training object. Patches are in RGB format.

An analogous set of variables testImages, testBoxes, and so on is provided for the test data. Familiarise yourself with the contents of these variables.

90 |
91 |

Question: why are there both a trainImages and a trainBoxImages variable?

92 |
93 |

Step 1.1: Visualize the training images

94 |

Select now the part of the code related to section 1.1 and execute it. This will create an image visualizing both the complete list of object training examples and their average.

95 |
96 |

Question: what can you deduce about the object variability from the average image?

97 |

Question: most boxes extend slightly around the object extent. Why do you think this may be valuable in learning a detector?

98 |
99 |

Step 1.2: Extract HOG features from the training images

100 |

Object detectors usually work on top of a layer of low-level features. In this case, we use HOG (Histogram of Oriented Gradients) features. In order to learn a model of the object, we start by extracting features from the image patches corresponding to the available training examples. This is done by the following for loop:

101 |
hogCellSize = 8 ;
102 | trainHog = {} ;
103 | for i = 1:size(trainBoxPatches,4)
104 |   trainHog{i} = vl_hog(trainBoxPatches(:,:,:,i), hogCellSize) ;
105 | end
106 | trainHog = cat(4, trainHog{:}) ;
107 | 
108 | 109 |

HOG is computed by the VLFeat function vl_hog (doc). This function takes as a parameter the size in pixels of each HOG cell hogCellSize. It also takes an RGB image, represented in MATLAB as a $w \times h \times 3$ array (extracted as a slice of trainBoxPatches). The output is a $w/\mathtt{hogCellSize} \times h/\mathtt{hogCellSize} \times 31$ dimensional array. One such array is extracted for each example image, and eventually these are concatenated into a 4D array along the fourth dimension.

110 |
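For concreteness: the training patches loaded here are 64×64 pixels (see trainBoxPatches above), so with hogCellSize = 8 each HOG array is 8×8×31, and trainHog is 8×8×31×N. A quick sanity check (a sketch; the value of N depends on the loaded class):

disp(size(trainBoxPatches))   % e.g. [64 64 3 N]
disp(size(trainHog))          % e.g. [8 8 31 N]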

Step 1.3: Learn a simple HOG template model

111 |

A very basic object model can be obtained by averaging the features of the example objects. This is done by:

112 |
w = mean(trainHog, 4) ;
113 | 
114 | 115 |

The model can be visualized by rendering w as if it was a HOG feature array. This can be done using the render option of vl_hog:

116 |
figure(2) ; clf ;
117 | imagesc(vl_hog('render', w)) ;
118 | 
119 | 120 |

Spend some time to study this plot and make sure you understand what is visualized.

121 |
122 |

Question: Can you make sense of the resulting plot?

123 |
124 |

Step 1.4: Apply the model to a test image

125 |

The model is matched to a test image by: (i) extracting the HOG features of the image and (ii) convolving the model over the resulting feature map:

126 |
im = imread('data/signs-sample-image.jpg') ;
127 | im = im2single(im) ;
128 | hog = vl_hog(im, hogCellSize) ;
129 | scores = vl_nnconv(hog, w, []) ;
130 | 
131 | 132 |

The first two lines read a sample image and convert it to single format. The third line computes the HOG features of the image using the vl_hog function seen above. The fourth line convolves the HOG map hog with the model w. It uses the function vl_nnconv (see footnote 1 at the end of this document) and returns a scores map.

133 |
134 |

Task: Work out the dimension of the scores arrays. Then, check your result with the dimension of the array computed by MATLAB.

135 |

Question: Visualize the image im and the scores array using the provided example code. Does the result match your expectations?

136 |
137 |

Step 1.5: Extract the top detection

138 |

Now that the model has been applied to the image, we have a response map scores. To extract a detection from this, we (i) find the maximum response and (ii) compute the bounding box of the image patch containing the corresponding HOG features. The maximum is found by:

139 |
[best, bestIndex] = max(scores(:)) ;
140 | 
141 | 142 |

Note that bestIndex is a linear index in the range $[1, M]$ where $M$ is the number of possible filter locations. We convert this into a subscript $(h_x,h_y)$ using MATLAB ind2sub function:

143 |
[hy, hx] = ind2sub(size(scores), bestIndex) ;
144 | 
145 | 146 |

$(h_x,h_y)$ are in units of HOG cells. We convert this into pixel coordinates as follows:

147 |
x = (hx - 1) * hogCellSize + 1 ;
148 | y = (hy - 1) * hogCellSize + 1 ;
149 | 
150 | 151 |
152 |

Question: Why do we subtract 1 and then add 1 in these expressions? Which pixel $(x,y)$ of the HOG cell $(h_x,h_y)$ is found?

153 |
154 |

The size of the model template in HOG cells can be computed in several ways; one is simply:

155 |
modelWidth = size(trainHog, 2) ;
156 | modelHeight = size(trainHog, 1) ;
157 | 
158 | 159 |

Now we have enough information to compute the bounding box as follows:

160 |
detection = [
161 |   x - 0.5 ;
162 |   y - 0.5 ;
163 |   x + hogCellSize * modelWidth - 0.5 ;
164 |   y + hogCellSize * modelHeight - 0.5 ;] ;
165 | 
166 | 167 |

Note: the bounding box encloses exactly all the pixels of the HOG template. In MATLAB, pixel centers have integer coordinates and pixel borders are at a distance $\pm1/2$.

168 |
169 |

Question: Use the example code to plot the image and overlay the bounding box of the detected object. Did it work as expected?

170 |
171 |

Part 2: Multiple scales and learning with an SVM

172 |

In this second part, we will: (i) extend the detector to search objects at multiple scales and (ii) learn a better model using a support vector machine. Let's start by loading the data as needed:

173 |
setup ;
174 | targetClass = 'mandatory' ;
175 | loadData(targetClass) ;
176 | 
177 | 178 |

The mandatory target class is simply the union of all mandatory traffic signs.

179 |

Step 2.1: Multi-scale detection

180 |

Objects appear in images at sizes that differ from that of the learned template. In order to find objects of all sizes, we scale the image up and down and search for the object over and over again.

181 |

The set of searched scales is defined as follows:

182 |
% Scale space configuration
183 | minScale = -1 ;
184 | maxScale = 3 ;
185 | numOctaveSubdivisions = 3 ;
186 | scales = 2.^linspace(...
187 |   minScale,...
188 |   maxScale,...
189 |   numOctaveSubdivisions*(maxScale-minScale+1)) ;
190 | 
191 | 192 |
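For example, with the values above the detector visits 15 scales, spaced uniformly in log-space from $2^{-1} = 0.5$ to $2^{3} = 8$. Since the image is resized by a factor 1/s, scale s = 0.5 matches objects about half the template size, while s = 8 matches objects about eight times larger.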

Given the model w, as determined in Part I, we use the function detectAtMultipleScales in order to search for the object at multiple scales:

193 |
detection = detectAtMultipleScales(im, w, hogCellSize, scales) ;
194 | 
195 | 196 |

Note that the function generates a figure as it runs, so prepare a new figure before running it using the figure command if you do not want your current figure to be deleted.

197 |
198 |

Question: Open and study the detectAtMultipleScales function. Convince yourself that it is the same code as before, but operated after rescaling the image a number of times.

199 |

Question: Visualize the resulting detection using the supplied example code. Did it work? If not, can you make sense of the errors?

200 |

Question: Look at the array of scores maps generated by detectAtMultipleScales using the example code. Do they make sense? Is there anything wrong?

201 |
202 |

Step 2.2: Collect positive and negative training data

203 |

The model learned so far is too weak to work well. It is now time to use an SVM to learn a better one. In order to do so, we need to prepare suitable data. We already have positive examples (features extracted from object patches):

204 |
% Collect positive training data
205 | pos = trainHog ;
206 | 
207 | 208 |

In order to collect negative examples (features extracted from non-object patches), we loop through a number of training images and sample patches uniformly:

209 |
210 |
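To give the idea, here is a minimal sketch of such uniform sampling (illustrative only: the data path, the number of patches per image, and the variable names are assumptions; the actual code is in exercise2.m):

neg = {} ;
for i = 1:numel(trainImages)
  t = im2single(imread(fullfile('data', trainImages{i}))) ;
  hog = vl_hog(t, hogCellSize) ;
  % all possible top-left HOG positions of a model-sized window
  width = size(hog,2) - modelWidth + 1 ;
  height = size(hog,1) - modelHeight + 1 ;
  % sample a handful of windows uniformly
  for j = vl_colsubset(1:width*height, 10, 'uniform')
    [hy, hx] = ind2sub([height width], j) ;
    neg{end+1} = hog(hy:hy+modelHeight-1, hx:hx+modelWidth-1, :) ;
  end
end
neg = cat(4, neg{:}) ;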

Task: Identify the code that extracts these patches in exercise2.m and make sure you understand it.

211 |

Question: How many negative examples are we collecting?

212 |
213 |

Step 2.3: Learn a model with an SVM

214 |

Now that we have the data, we can learn an SVM model. To this end we will use the vl_svmtrain function. This function requires the data to be in a $D \times N$ matrix, where $D$ is the feature dimension and $N$ the number of training points. This is done as follows (numPos and numNeg are defined in a later snippet below):

215 |
% Pack the data into a matrix with one datum per column
216 | x = cat(4, pos, neg) ;
217 | x = reshape(x, [], numPos + numNeg) ;
218 | 
219 | 220 |

We also need a vector of binary labels, +1 for positive points and -1 for negative ones:

221 |
% Create a vector of binary labels
222 | y = [ones(1, size(pos,4)) -ones(1, size(neg,4))] ;
223 | 
224 | 225 |

Finally, we need to set the parameter $\lambda$ of the SVM solver. For reasons that will become clearer later, we use instead the equivalent $C$ parameter:

226 |
numPos = size(pos,4) ;
227 | numNeg = size(neg,4) ;
228 | C = 10 ;
229 | lambda = 1 / (C * (numPos + numNeg)) ;
230 | 
231 | 232 |

Learning the SVM is then a one-liner:

233 |
% Learn the SVM using an SVM solver
234 | w = vl_svmtrain(x,y,lambda,'epsilon',0.01,'verbose') ;
235 | 
236 | 237 |
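Note that vl_svmtrain returns w as a $D \times 1$ vector, with $D = \mathtt{modelHeight} \times \mathtt{modelWidth} \times 31$ here. Before it can be rendered or convolved like the template of Part 1, it must be reshaped back into a HOG template; a minimal sketch, assuming modelHeight and modelWidth as in Step 1.5:

% Reshape the SVM weight vector back into a HOG template
w = single(reshape(w, modelHeight, modelWidth, 31)) ;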
238 |

Question: Visualize the learned model w using the supplied code. Does it differ from the naive model learned before? How?

239 |
240 |

Step 2.4: Evaluate the learned model

241 |

Use the detectAtMultipleScales seen above to evaluate the new SVM-based model.

242 |
243 |

Question: Does the learned model perform better than the naive average?

244 |

Task: Try different images. Does this detector work all the time? If not, what types of mistakes do you see? Are these mistakes reasonable?

245 |
246 |

Part 3: Multiple objects and evaluation

247 |

Step 3.1: Multiple detections

248 |

Detecting at multiple scales is insufficient: we must also allow for more than one object occurrence in the image. In order to do so, the package includes a suitable detect function. This function is similar to detectAtMultipleScales, but it returns the top 1000 detector responses rather than just the top one:

249 |
% Compute detections
250 | [detections, scores] = detect(im, w, hogCellSize, scales) ;
251 | 
252 | 253 |
254 |

Task: Open and study detect.m. Make sure that you understand how it works.

255 |

Question: Why do we want to return so many responses? In practice, it is unlikely that more than a handful of object occurrences may be contained in any given image...

256 |
257 |

A single object occurrence generates multiple detector responses at nearby image locations and scales. In order to eliminate these redundant detections, we use a non-maximum suppression algorithm. This is implemented by the boxsuppress.m MATLAB m-file. The algorithm is simple: start from the highest-scoring detection, then remove any other detection whose overlap with it (in the sense defined in Step 3.2 below) is greater than a threshold. The function returns a boolean vector keep of detections to preserve:

258 |
% Non-maximum suppression
259 | keep = boxsuppress(detections, scores, 0.25) ;
260 | 
261 | detections = detections(:, keep) ;
262 | scores = scores(keep) ;
263 | 
264 | 265 |

For efficiency, after non-maximum suppression we keep just ten responses (as we do not expect more than a few objects in any image):

266 |
% Further keep only top detections
267 | detections = detections(:, 1:10) ;
268 | scores = scores(1:10) ;
269 | 
270 | 271 |

Step 3.2: Detector evaluation

272 |

We are now going to look at properly evaluating our detector. We use the PASCAL VOC criterion, computing Average Precision (AP). Consider a test image containing a number of ground truth object occurrences $(g_1,\dots,g_m)$ and a list $(b_1,s_1),\dots,(b_n,s_n)$ of candidate detections $b_i$ with score $s_i$. The following algorithm converts this data into a list of labels and scores $(s_i,y_i)$ that can be used to compute a precision-recall curve, for example using VLFeat vl_pr function. The algorithm, implemented by evalDetections.m, is as follows:

273 |
1. Assign each candidate detection $(b_i,s_i)$ a true or false label $y_i \in \{+1,-1\}$. To do so:
    1. The candidate detections $(b_i,s_i)$ are sorted by decreasing score $s_i$.
    2. For each candidate detection in order:
        a. If there is a matching ground truth detection $g_j$ ($\operatorname{overlap}(b_i,g_j)$ larger than 50%), the candidate detection is considered positive ($y_i=+1$). Furthermore, the ground truth detection is removed from the list and not considered further.
        b. Otherwise, the candidate detection is negative ($y_i=-1$).
2. Add each ground truth object $g_j$ that is still unassigned to the list of candidates as a pair $(g_j, -\infty)$ with label $y_j=+1$.

The overlap metric used to compare a candidate detection to a ground truth bounding box is defined as the ratio of the area of the intersection over the area of the union of the two bounding boxes:
$$
\operatorname{overlap}(A,B) = \frac{|A\cap B|}{|A \cup B|}.
$$

288 |
289 |

Questions:

* Why are ground truth detections removed after being matched?
* What happens if an object is detected twice?
* Can you explain why unassigned ground-truth objects are added to the list of candidates with $-\infty$ score?
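The following MATLAB sketch summarizes the greedy matching procedure (illustrative only; the actual implementation is evalDetections.m, and the overlap is computed by boxoverlap.m):

[~, order] = sort(scores, 'descend') ;
labels = -ones(1, numel(scores)) ;
assigned = false(1, size(gtBoxes,2)) ;
for i = order
  % overlap of this detection with each remaining ground truth box
  ov = boxoverlap(detections(:,i), gtBoxes) ;
  ov(assigned) = 0 ;
  [o, j] = max(ov) ;
  if ~isempty(o) && o > 0.5
    labels(i) = +1 ;      % true positive
    assigned(j) = true ;  % remove this ground truth from the list
  end
end
% unmatched ground truth objects are added as missed detections
scores = [scores, -inf(1, sum(~assigned))] ;
labels = [labels, ones(1, sum(~assigned))] ;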
296 |

In order to apply this algorithm, we first need to find the ground truth bounding boxes in the test image:

297 |
% Find all the objects in the target image
298 | s = find(strcmp(testImages{1}, testBoxImages)) ;
299 | gtBoxes = testBoxes(:, s) ;
300 | 
301 | 302 |

Then evalDetections can be used:

303 |
% No example is considered difficult
304 | gtDifficult = false(1, numel(s)) ;
305 | 
306 | % PASCAL-like evaluation
307 | matches = evalDetections(...
308 |   gtBoxes, gtDifficult, ...
309 |   detections, scores) ;
310 | 
311 | 312 |

The gtDifficult flags can be used to mark some ground truth object occurrences as difficult, so that they are ignored in the evaluation. This is used in the PASCAL VOC challenge, but not here (i.e. no object occurrence is considered difficult).

313 |

evalDetections returns a matches structure with several fields. We focus here on matches.detBoxFlags: this contains a +1 for each detection that was found to be correct and -1 otherwise. We use this to visualize the detection errors:

314 |
% Visualization
315 | figure(1) ; clf ;
316 | imagesc(im) ; axis equal ; hold on ;
317 | vl_plotbox(detections(:, matches.detBoxFlags==+1), 'g', 'linewidth', 2) ;
318 | vl_plotbox(detections(:, matches.detBoxFlags==-1), 'r', 'linewidth', 2) ;
319 | vl_plotbox(gtBoxes, 'b', 'linewidth', 1) ;
320 | axis off ;
321 | 
322 | 323 |
324 |

Task: Use the supplied example code to evaluate the detector on one image. Look carefully at the output and convince yourself that it makes sense.

325 |
326 |

Now plot the PR curve:

327 |
figure(2) ; clf ;
328 | vl_pr(matches.labels, matches.scores) ;
329 | 
330 | 331 |
332 |

Question: There are a large number of errors in each image. Should you worry? In what manner is the PR curve affected? How would you eliminate the vast majority of these errors in practice?

333 |
334 |

Step 3.3: Evaluation on multiple images

335 |

Evaluation is typically done on multiple images rather than just one. This is implemented by the evaluateModel.m m-file.

336 |
337 |

Task: Open evaluateModel.m and make sure you understand the main steps of the evaluation procedure.

338 |
339 |

Use the supplied example code to run the evaluation on the entire test set:

340 |
matches = evaluateModel(testImages, testBoxes, testBoxImages, ...
341 |   w, hogCellSize, scales) ;
342 | 
343 | 344 |

Note: The function processes one image at a time, visualizing the results as it progresses. The PR curve is the result of the accumulation of the detections obtained thus far.

345 |
346 |

Task: Open the evaluateModel.m file in MATLAB and add a breakpoint right at the end of the for loop. Now run the evaluation code again and look at each image individually (use dbcont to go to the next image). Check the correct and incorrect matches in each image and their ranking, and the effect of this on the cumulative precision-recall curve.

347 |
348 |

Part 4: Hard negative mining

349 |

This part explores more advanced learning methods. So far, the SVM has been learned using a small and randomly sampled number of negative examples. However, in principle, every single patch that does not contain the object can be considered as a negative sample. These are of course too many to be used in practice; unfortunately, random sampling is ineffective as the most interesting (confusing) negative samples are a very small and special subset of all the possible ones.

350 |

Hard negative mining is a simple technique that allows finding a small set of key negative examples. The idea is simple: we start by training a model without any negatives at all (in this case the solver learns a 1-class SVM), and then we alternate between evaluating the model on the training data to find erroneous responses and adding the corresponding examples to the training set.

351 |

Step 4.1: Train with hard negative mining

352 |

Use the supplied code in exercise4.m to run hard negative mining. The code repeats SVM training, as seen above, a number of times, progressively increasing the size of the neg array containing the negative samples. This is updated using the output of:

353 |
 [matches, moreNeg] = ...
354 |     evaluateModel(...
355 |     vl_colsubset(trainImages', schedule(t), 'beginning'), ...
356 |     trainBoxes, trainBoxImages, ...
357 |     w, hogCellSize, scales) ;
358 | 
359 | 360 |

Here moreNeg contains the HOG features of the top (highest scoring and hence most confusing) image patches in the supplied training images.

361 |
362 |

Task: Examine evaluateModel.m again to understand how hard negatives are extracted.

363 |

Question: What is the purpose of the construct vl_colsubset(trainImages', schedule(t), 'beginning')? Why do you think we visit more negative images in later iterations?

364 |
365 |

The next step is to fuse the new negative set with the old one:

366 |
% Add negatives
367 | neg = cat(4, neg, moreNeg) ;
368 | 
369 | 370 |

Note that hard negative mining could select the same negatives at different iterations; the following code squashes these duplicates:

371 |
% Remove negative duplicates
372 | z = reshape(neg, [], size(neg,4)) ;
373 | [~,keep] = unique(z','stable','rows') ;
374 | neg = neg(:,:,:,keep) ;
375 | 
376 | 377 |
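Putting the pieces together, the overall mining loop in exercise4.m has roughly the following shape (a sketch: the schedule values and the empty initialization are illustrative assumptions):

% start with no negatives: the first round trains a one-class SVM
neg = zeros(size(pos,1), size(pos,2), size(pos,3), 0, 'single') ;
schedule = [1 2 5 10 100] ;  % illustrative: images mined per round
for t = 1:numel(schedule)
  % train on the current positive and negative sets
  numPos = size(pos,4) ;
  numNeg = size(neg,4) ;
  x = reshape(cat(4, pos, neg), [], numPos + numNeg) ;
  y = [ones(1, numPos) -ones(1, numNeg)] ;
  lambda = 1 / (C * (numPos + numNeg)) ;
  w = vl_svmtrain(x, y, lambda, 'epsilon', 0.01) ;
  w = single(reshape(w, modelHeight, modelWidth, 31)) ;
  % mine hard negatives on a growing subset of the training images
  [~, moreNeg] = evaluateModel(...
    vl_colsubset(trainImages', schedule(t), 'beginning'), ...
    trainBoxes, trainBoxImages, w, hogCellSize, scales) ;
  % fuse with the old negatives and squash duplicates
  neg = cat(4, neg, moreNeg) ;
  z = reshape(neg, [], size(neg,4)) ;
  [~, keep] = unique(z', 'stable', 'rows') ;
  neg = neg(:,:,:,keep) ;
end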

Step 4.2: Evaluate the model on the test data

378 |

Once hard negative mining and training are done, we are ready to evaluate the model on the test data (note that the model is evaluated on the training data for mining). As before:

379 |
evaluateModel(...
380 |     testImages, testBoxes, testBoxImages, ...
381 |     w, hogCellSize, scales) ;
382 | 
383 | 384 |

Part 5: Train your own object detector

385 |

Skip on fast track

386 |

In this last part, you will train your own object detector. To this end, open and look at exercise5.m. You will need to prepare the following data:

387 |

Step 5.1: Preparing the training data

* A folder data/myPositives containing files image1.jpeg, image2.jpeg, ..., each containing a single cropped occurrence of the target object. These crops can be of any size, but should be roughly square.
* A folder data/myNegatives containing images image1.jpeg, image2.jpeg, ..., that do not contain the target object at all.
* A test image data/myTestImage.jpeg containing the target object. This should not be one of the training images.

Run the code in exercise5.m to check that your training data looks right.

394 |
395 |

Task: Understand the limitations of this simple detector and choose a target object that has a good chance of being learnable.

396 |
397 |

Hint: Note in particular that object instances must be similar and roughly aligned. If your object is not symmetric, consider choosing instances that face a particular direction (e.g. left-facing horse head).

398 |

Step 5.2: Learn the model

399 |

Use the code supplied in exercise5.m to learn an SVM model for your object using hard negative mining as in Stage 4.1.

400 |

Step 5.3: Test the model

401 |

Use the code supplied in exercise5.m to evaluate the SVM model on a test image and visualize the result as in Stage 2.1.

402 |
403 |

Task: Make sure you get sensible results. Go back to step 5.1 if needed and adjust your data.

404 |
405 |

Hint: For debugging purposes, try using one of your training images as the test image. Does it work at least in this case?

406 |

Step 5.4: Detecting symmetric objects with multiple aspects

407 |

The basic detectors you have learned so far are not invariant to effects such as object deformations, out-of-plane rotations, and partial occlusions that affect most natural objects. Handling these effects requires additional sophistication, such as deformable templates or mixtures of multiple templates.

408 |

In particular, many objects in nature are symmetric and, as such, their images appear flipped when the objects are seen from the left or the right direction (consider for example a face). This can be handled by a pair of symmetric HOG templates. In this part we will explore this option.

409 |
410 |

Task: Using the procedure above, train a HOG template w for a symmetric object facing in one specific direction. For example, train a left-facing horse head detector.

411 |

Task: Collect test images containing the object facing in both directions. Run your detector and convince yourself that it works well only for the direction it was trained for.

412 |
413 |

HOG features have a well-defined structure that makes it possible to predict how the features transform when the underlying image is flipped. The transformation is in fact a simple permutation of the HOG elements. For a given spatial cell, HOG has 31 dimensions. The following code permutes the dimensions to flip the cell around the vertical axis:

414 |
perm = vl_hog('permutation') ;
415 | hog_flipped = hog(perm) ;
416 | 
417 |

Note that this permutation applies to a single HOG cell. However, the template is a $H \times W \times 31$ dimensional array of HOG cells.

418 |
419 |

Task: Given a hog array of dimension $H \times W \times 31$, write MATLAB code to obtain the flipped feature array hog_flipped.

420 |
421 |

Hint: Recall that the first dimension spans the vertical axis, the second dimension the horizontal axis, and the third dimension feature channels. perm should be applied to the last dimension. Do you need to permute anything else?

422 |
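One possible solution to the task above (a sketch; compare it with your own answer to the hint): mirror the template along the second (horizontal) dimension and permute the feature channels in one indexing operation:

% flip an H x W x 31 HOG array: reverse columns, permute channels
hog_flipped = hog(:, end:-1:1, perm) ;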

Now let us apply flipping to the model trained earlier:

423 |
424 |

Task: Let w be the model you trained before. Use the procedure to flip HOG to generate w_flipped. Then visualize both w and w_flipped as done in Sect. 1.3. Convince yourself that flipping was successful.

425 |
426 |

We now have two models, w and w_flipped, one for each view of the object.

427 |
428 |

Task: Run both models in turn on the same image, obtaining two lists of bounding boxes. Find a way to merge the two lists and visualise the top detections. Convince yourself that you can now detect objects facing either way.

429 |
430 |

Hint: Recall how redundant detections can be removed using non-maximum suppression.

431 |
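A minimal sketch of one way to do this, assuming w and w_flipped are both $H \times W \times 31$ templates and reusing detect and boxsuppress:

% detect with both templates and pool the results
[detsA, scoresA] = detect(im, w, hogCellSize, scales) ;
[detsB, scoresB] = detect(im, w_flipped, hogCellSize, scales) ;
detections = [detsA, detsB] ;
scores = [scoresA, scoresB] ;
% non-maximum suppression across the merged list
keep = boxsuppress(detections, scores, 0.25) ;
detections = detections(:, keep) ;
scores = scores(keep) ;
% sort and keep the top few for display
[scores, order] = sort(scores, 'descend') ;
detections = detections(:, order) ;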

Congratulations: This concludes the practical!

432 |

History

433 | 436 |
437 |
438 |
1. This is part of the MatConvNet toolbox for convolutional neural networks. Nevertheless, there is no neural network discussed here.
461 | 462 | 463 | 464 | 465 | -------------------------------------------------------------------------------- /doc/instructions.md: -------------------------------------------------------------------------------- 1 | # Object category detection practical 2 | 3 | This is an [Oxford Visual Geometry Group](http://www.robots.ox.ac.uk/~vgg) computer vision practical, authored by [Andrea Vedaldi](http://www.robots.ox.ac.uk/~vedaldi/) and Andrew Zisserman (Release 2018a). 4 | 5 | ![cover][1] 6 | 7 | The goal of *object category detection* is to identify and localize objects of a given type in an image. Examples applications include detecting pedestrian, cars, or traffic signs in street scenes, objects of interest such as tools or animals in web images, or particular features in medical image. Given a target class, such as *people*, a *detector* receives as input an image and produces as output zero, one, or more bounding boxes around each occurrence of the object class in the image. The key challenge is that the detector needs to find objects regardless of their location and scale in the image, as well as pose and other variation factors, such as clothing, illumination, occlusions, etc. 8 | 9 | This practical explores basic techniques in visual object detection, focusing on *image based models*. The appearance of image patches containing objects is learned using statistical analysis. Then, in order to detect objects in an image, the statistical model is applied to image windows extracted at all possible scales and locations, in order to identify which ones, if any, contain the object. 10 | 11 | In more detail, the practical explores the following topics: (i) using HOG features to describe image regions, (ii) building a HOG-based sliding-window detector to localize objects in images; (iii) working with multiple scales and multiple object occurrences; (iv) using a linear support vector machine to learn the appearance of objects; (v) evaluating an object detector in term of average precision; (vi) learning an object detector using hard negative mining. 12 | 13 | [TOC] 14 | 15 | ## Getting started 16 | 17 | Read and understand the [requirements and installation instructions](../overview/index.html#installation). The download links for this practical are: 18 | 19 | * Code and data: [practical-category-detection-2018a.tar.gz](http://www.robots.ox.ac.uk/~vgg/share/practical-category-detection-2018a.tar.gz) 20 | * Code only: [practical-category-detection-2018a-code-only.tar.gz](http://www.robots.ox.ac.uk/~vgg/share/practical-category-detection-2018a-code-only.tar.gz) 21 | * Data only: [practical-category-detection-2018a-data-only.tar.gz](http://www.robots.ox.ac.uk/~vgg/share/practical-category-detection-2018a-data-only.tar.gz) 22 | * [Git repository](https://github.com/vedaldi/practical-object-category-detection) (for lab setters and developers) 23 | 24 | After the installation is complete, open and edit the script `exercise1.m` in the MATLAB editor. The script contains commented code and a description for all steps of this exercise, relative to [Part I](#part1) of this document. You can cut and paste this code into the MATLAB window to run it, and will need to modify it as you go through the session. Other files `exercise2.m`, `exercise3.m`, and `exercise4.m` are given for [Part II](#part2), [III](#part3), and [IV](part4). 25 | 26 | Each part contains several **Questions** and **Tasks** to be answered/completed before proceeding further in the practical. 
27 | 28 | ## Part 1: Detection fundamentals {#part1} 29 | 30 | Part I--IV use as running example the problem of street sign detection, using the data from the [German Traffic Sign Detection Benchmark](http://benchmark.ini.rub.de/?section=gtsdb&subsection=news). This data consists of a number of example traffic images, as well as a number of larger test images containing one or more traffic signs at different sizes and locations. It also comes with *ground truth* annotation, i.e. with specified bounding boxes and sign labels for each sign occurrence, which is required to evaluate the quality of the detector. 31 | 32 | In this part we will build a basic sliding-window object detector based on HOG features. Follow the steps below: 33 | 34 | ### Step 1.0: Loading the training data 35 | 36 | The MATLAB m-file `loadData.m` loads the data for the practical into memory. The function `loadData(targetClass)` takes a `targetClass` argument specifying the object class of interest. Open the `example1.m` file, select the following part of the code, and execute it in MATLAB (right button > `Evaluate selection` or `Shift+F7`). 37 | 38 | ```matlab 39 | % Load the training and testing data (trainImages, trainBoxes, ...) 40 | % The functio takes the ID of the type of traffic sign we want to recognize 41 | % 1 is the 30 km/h speed limit 42 | loadData(1) ; 43 | ``` 44 | 45 | This loads into the current workspace the following variables: 46 | 47 | * `trainImages`: a list of train image names. 48 | * `trainBoxes`: a $4\times N$ array of object bounding boxes, in the form $[x_\text{min},y_\text{min},x_\text{max},y_\text{max}]$. 49 | * `trainBoxImages`: for each bounding box, the name of the image containing it. 50 | * `trainBoxLabels`: for each bounding box, the object label. It is one of the index in `targetClass`. 51 | * `trainBoxPatches`: a $64 \times 64 \times 3 \times N$ array of image patches, one for each training object. Patches are in RGB format. 52 | 53 | An analogous set of variables `testImages`, `testBoxes`, and so on are provided for the test data. Familiarise yourself with the contents of these variables. 54 | 55 | > **Question:** why is there a `trainImages` and a `trainBoxImages` variables? 56 | 57 | ### Step 1.1: Visualize the training images 58 | 59 | Select now the part of the code related to section 1.1 and execute it. This will create an image visualizing both the complete list of object training examples and their average. 60 | 61 | > **Question:** what can you deduce about the object variability from the average image? 62 | 63 | > **Question:** most boxes extend slightly around the object extent. Why do you think this may be valuable in learning a detector? 64 | 65 | ### Step 1.2: Extract HOG features from the training images 66 | 67 | Object detectors usually work on top of a layer of low-level features. In this case, we use HOG (*Histogram of Oriented Gradients*) features. In order to learn a model of the object, we start by extracting features from the image patches corresponding to the available training examples. This is done by the following `for` loop: 68 | 69 | ```matlab 70 | hogCellSize = 8 ; 71 | trainHog = {} ; 72 | for i = 1:size(trainBoxPatches,4) 73 | trainHog{i} = vl_hog(trainBoxPatches(:,:,:,i), hogCellSize) ; 74 | end 75 | trainHog = cat(4, trainHog{:}) ; 76 | ``` 77 | 78 | HOG is computed by the [VLFeat](http::www.vlfeat.org) function `vl_hog` ([doc](http://www.vlfeat.org/matlab/vl_hog.html)). 
This function takes as parameter the size in pixels of each HOG cell `hogCellSize`. It also takes a RGB image, represented in MATLAB as a $w \times h \times 3$ array (extracted as a slice of `trainBoxPatches`). The output is a $w/\mathtt{hogCellSize} \times h/\mathtt{hogCellSize} \times 31$ dimensional array. One such array is extracted for each example image end eventually these are concatenated in a 4D array along the fourth dimension. 79 | 80 | ### Step 1.3: Learn a simple HOG template model {#sect13} 81 | 82 | A very basic object model can be obtained by averaging the features of the example objects. This is done by: 83 | 84 | ```matlab 85 | w = mean(trainHog, 4) ; 86 | ``` 87 | 88 | The model can be visualized by *rendering* `w` as if it was a HOG feature array. This can be done using the `render` option of `vl_hog`: 89 | 90 | ```matlab 91 | figure(2) ; clf ; 92 | imagesc(vl_hog('render', w)) ; 93 | ``` 94 | 95 | Spend some time to study this plot and make sure you understand what is visualized. 96 | 97 | > **Question:** Can you make sense of the resulting plot? 98 | 99 | ### Step 1.4: Apply the model to a test image 100 | 101 | The model is matched to a test image by: (i) extracting the HOG features of the image and (ii) convolving the model over the resulting feature map: 102 | 103 | ```matlab 104 | im = imread('data/signs-sample-image.jpg') ; 105 | im = im2single(im) ; 106 | hog = vl_hog(im, hogCellSize) ; 107 | scores = vl_nnconv(hog, w, []) ; 108 | ``` 109 | 110 | The first two lines read a sample image and conver it to single format. The third line computes the HOG features of the image using the `vl_hog` seen above. The fourth line convolves the HOG map `hog` with the model `w`. It uses the function `vl_nnconv`[^nn] and returns a `scores` map. 111 | 112 | > **Task:** Work out the dimension of the `scores` arrays. Then, check your result with the dimension of the array computed by MATLAB. 113 | 114 | > **Question:** Visualize the image `im` and the `scores` array using the provided example code. Does the result match your expectations? 115 | 116 | ### Step 1.5: Extract the top detection 117 | 118 | Now that the model has been applied to the image, we have a response map `scores`. To extract a detection from this, we (i) find the maximum response and (ii) compute the bounding box of the image patch containing the corresponding HOG features. The maximum is found by: 119 | 120 | ```matlab 121 | [best, bestIndex] = max(scores(:)) ; 122 | ``` 123 | 124 | Note that `bestIndex` is a linear index in the range $[1, M]$ where $M$ is the number of possible filter locations. We convert this into a subscript $(h_x,h_y)$ using MATLAB `ind2sub` function: 125 | 126 | ```matlab 127 | [hy, hx] = ind2sub(size(scores), bestIndex) ; 128 | ``` 129 | 130 | $(h_x,h_y)$ are in units of HOG cells. We convert this into pixel coordinates as follows: 131 | 132 | ``` 133 | x = (hx - 1) * hogCellSize + 1 ; 134 | y = (hy - 1) * hogCellSize + 1 ; 135 | ``` 136 | 137 | > **Question:** Why are we subtracting -1 and summing +1? Which pixel $(x,y)$ of the HOG cell $(h_x,h_y)$ is found? 
138 | 139 | The size of the model template in number of HOG cell can be computed in several way; one is simply: 140 | 141 | ```matlab 142 | modelWidth = size(trainHog, 2) ; 143 | modelHeight = size(trainHog, 1) ; 144 | ``` 145 | 146 | Now we have enough information to compute the bounding box as follows: 147 | 148 | ```matlab 149 | detection = [ 150 | x - 0.5 ; 151 | y - 0.5 ; 152 | x + hogCellSize * modelWidth - 0.5 ; 153 | y + hogCellSize * modelHeight - 0.5 ;] ; 154 | ``` 155 | 156 | **Note:** the bounding box encloses exactly all the pixel of the HOG template. In MATLAB, pixel centers have integer coordinates and pixel borders are at a distance $\pm1/2$. 157 | 158 | > **Question:** Use the example code to plot the image and overlay the bounding box of the detected object. Did it work as expected? 159 | 160 | ## Part 2: Multiple scales and learning with an SVM {#part2} 161 | 162 | In this second part, we will: (i) extend the detector to search objects at multiple scales and (ii) learn a better model using a support vector machine. Let's start by loading the data as needed: 163 | 164 | ```matlab 165 | setup ; 166 | targetClass = 'mandatory' ; 167 | loadData(targetClass) ; 168 | ``` 169 | 170 | The `mandatory` target class is simply the union of all mandatory traffic signs. 171 | 172 | ### Step 2.1: Multi-scale detection {#step2.1} 173 | 174 | Objects exist in images at sizes different from one of the learned template. In order to find objects of all sizes, we scale the image up and down and search for the object over and over again. 175 | 176 | The set of searched scales is defined as follows: 177 | 178 | ```matlab 179 | % Scale space configuraiton 180 | minScale = -1 ; 181 | maxScale = 3 ; 182 | numOctaveSubdivisions = 3 ; 183 | scales = 2.^linspace(... 184 | minScale,... 185 | maxScale,... 186 | numOctaveSubdivisions*(maxScale-minScale+1)) ; 187 | ``` 188 | 189 | Given the model `w`, as determined in Part I, we use the function `detectAtMultipleScales` in order to search for the object at multiple scales: 190 | 191 | ```matlab 192 | detection = detectAtMultipleScales(im, w, hogCellSize, scales) ; 193 | ``` 194 | 195 | Note that the function generates a figure as it runs, so prepare a new figure before running it using the `figure` command if you do not want your current figure to be deleted. 196 | 197 | > **Question:** Open and study the `detectAtMultipleScales` function. Convince yourself that it is the same code as before, but operated after rescaling the image a number of times. 198 | 199 | > **Question:** Visualize the resulting detection using the supplied example code. Did it work? If not, can you make sense of the errors? 200 | 201 | > **Question:** Look at the array of `scores` maps generated by `detectAtMultipleScales` using the example code. Do they make sense? Is there anything wrong? 202 | 203 | ### Step 2.2: Collect positive and negative training data 204 | 205 | The model learned so far is too weak to work well. It is now time to use an SVM to learn a better one. In order to do so, we need to prepare suitable data. We already have positive examples (features extracted from object patches): 206 | 207 | ```matlab 208 | % Collect positive training data 209 | pos = trainHog ; 210 | ``` 211 | 212 | Ino order to collect negative examples (features extracted from non-object patches), we loop through a number of training images and sample patches uniformly: 213 | 214 | > **Task:** Identify the code that extract these patches in `example2.m` and make sure you understand it. 
215 | 216 | > **Question:** How many negative examples are we collecting? 217 | 218 | ### Step 2.3: Learn a model with an SVM 219 | 220 | Now that we have the data, we can learn an SVM model. To this end we will use the `vl_svmtrain` function. This function requires the data to be in a $D \times N$ matrix, where $D$ are the feature dimensions and $N$ the number of training points. This is done by: 221 | 222 | ```matlab 223 | % Pack the data into a matrix with one datum per column 224 | x = cat(4, pos, neg) ; 225 | x = reshape(x, [], numPos + numNeg) ; 226 | ``` 227 | 228 | We also need a vector of binary labels, +1 for positive points and -1 for negative ones: 229 | ```matlab 230 | % Create a vector of binary labels 231 | y = [ones(1, size(pos,4)) -ones(1, size(neg,4))] ; 232 | ``` 233 | 234 | Finally, we need to set the parameter $\lambda$ of the SVM solver. For reasons that will become clearer later, we use instead the equivalent $C$ parameter: 235 | ```matlab 236 | numPos = size(pos,4) ; 237 | numNeg = size(neg,4) ; 238 | C = 10 ; 239 | lambda = 1 / (C * (numPos + numNeg)) ; 240 | ``` 241 | 242 | Learning the SVM is then a one-liner: 243 | ``` 244 | % Learn the SVM using an SVM solver 245 | w = vl_svmtrain(x,y,lambda,'epsilon',0.01,'verbose') ; 246 | ``` 247 | 248 | > **Question:** Visualize the learned model `w` using the supplied code. Does it differ from the naive model learned before? How? 249 | 250 | ### Step 2.4: Evaluate the learned model 251 | 252 | Use the `detectAtMultipleScales` seen above to evaluate the new SVM-based model. 253 | 254 | > **Question:** Does the learned model perform better than the naive average? 255 | 256 | > **Task:** Try different images. Does this detector work all the times? If not, what types of mistakes do you see? Are these mistakes reasonable? 257 | 258 | ## Part 3: Multiple objects and evaluation {#part3} 259 | 260 | ### Step 3.1: Multiple detections 261 | 262 | Detecting at multiple scales is insufficient: we must also allow for more than one object occurrence in the image. In order to to so, the package include a suitalbe `detect` function. This function is similar to `detectAtMultipleScales`, but it returns the top 1000 detector responses rather than just the top one: 263 | ```matlab 264 | % Compute detections 265 | [detections, scores] = detect(im, w, hogCellSize, scales) ; 266 | ``` 267 | 268 | > **Task:** Open and study `detect.m`. Make sure that you understand how it works. 269 | 270 | > **Question:** Why do we want to return so many responses? In practice, it is unlikely that more than a handful of object occurrences may be contained in any given image... 271 | 272 | A single object occurrence generates multiple detector responses at nearby image locations and scales. In order to eliminate these redundant detections, we use a *non-maximum suppression* algorithm. This is implemented by the `boxsuppress.m` MATLAB m-file. The algorithm is simple: start from the highest-scoring detection, then remove any other detection whose overlap[^overlap] is greater than a threshold. 
The function returns a boolean vector `keep` of detections to preserve: 273 | 274 | ```matlab 275 | % Non-maximum suppression 276 | keep = boxsuppress(detections, scores, 0.25) ; 277 | 278 | detections = detections(:, keep) ; 279 | scores = scores(keep) ; 280 | ``` 281 | 282 | For efficiency, after non-maximum suppression we keep just ten responses (as we do not expect more than a few objects in any image): 283 | ```matlab 284 | % Further keep only top detections 285 | detections = detections(:, 1:10) ; 286 | scores = scores(1:10) ; 287 | ``` 288 | 289 | ### Step 3.2: Detector evaluation 290 | 291 | We are now going to look at properly evaluating our detector. We use the [PASCAL VOC criterion](http://pascallin.ecs.soton.ac.uk/challenges/VOC/voc2012/devkit_doc.pdf), computing *Average Precision (AP)*. Consider a test image containing a number of ground truth object occurrences $(g_1,\dots,g_m)$ and a list $(b_1,s_1),\dots,(b_n,s_n)$ of candidate detections $b_i$ with score $s_i$. The following algorithm converts this data into a list of labels and scores $(s_i,y_i)$ that can be used to compute a precision-recall curve, for example using VLFeat `vl_pr` function. The algorithm, implemented by `evalDetections.m`, is as follows: 292 | 293 | 1. Assign each candidate detection $(b_i,s_i)$ a true or false label $y_i \in \{+1,-1\}$. To do so: 294 | 1. The candidate detections $(b_i,s_i)$ are sorted by decreasing score $s_i$. 295 | 2. For each candidate detection in order: 296 | a. If there is a matching ground truth detection $g_j$ ($\operatorname{overlap}(b_i,g_j)$ larger than 50%), the candidate detection is considered positive ($y_i=+1$). Furthermore, the ground truth detection is *removed from the list* and not considered further. 297 | b. Otherwise, the candidate detection is negative ($y_i=-1$). 298 | 2. Add each ground truth object $g_i$ that is still unassigned to the list of candidates as pair $(g_j, -\infty)$ with label $y_j=+1$. 299 | 300 | The overlap metric used to compare a candidate detection to a ground truth bounding box is defined as the *ratio of the area of the intersection over the area of the union* of the two bounding boxes: 301 | $$ 302 | \operatorname{overlap}(A,B) = \frac{|A\cap B|}{|A \cup B|}. 303 | $$ 304 | 305 | > **Questions:** 306 | 307 | > * Why are ground truth detections removed after being matched? 308 | > * What happens if an object is detected twice? 309 | > * Can you explain why unassigned ground-truth objects are added to the list of candidates with $-\infty$ score? 310 | 311 | In order to apply this algorithm, we first need to find the ground truth bounding boxes in the test image: 312 | ```matlab 313 | % Find all the objects in the target image 314 | s = find(strcmp(testImages{1}, testBoxImages)) ; 315 | gtBoxes = testBoxes(:, s) ; 316 | ``` 317 | 318 | Then `evalDetections` can be used: 319 | ```matlab 320 | % No example is considered difficult 321 | gtDifficult = false(1, numel(s)) ; 322 | 323 | % PASCAL-like evaluation 324 | matches = evalDetections(... 325 | gtBoxes, gtDifficult, ... 326 | detections, scores) ; 327 | ``` 328 | The `gtDifficult` flags can be used to mark some ground truth object occurrence as *difficult* and hence ignored in the evaluation. This is used in the PASCAL VOC challenge, but not here (i.e. no object occurrence is considered difficult). 329 | 330 | `evalDetections` returns a `matches` structure with several fields. 
304 | 
305 | > **Questions:**
306 | 
307 | > * Why are ground truth detections removed after being matched?
308 | > * What happens if an object is detected twice?
309 | > * Can you explain why unassigned ground-truth objects are added to the list of candidates with $-\infty$ score?
310 | 
311 | In order to apply this algorithm, we first need to find the ground truth bounding boxes in the test image:
312 | ```matlab
313 | % Find all the objects in the target image
314 | s = find(strcmp(testImages{1}, testBoxImages)) ;
315 | gtBoxes = testBoxes(:, s) ;
316 | ```
317 | 
318 | Then `evalDetections` can be used:
319 | ```matlab
320 | % No example is considered difficult
321 | gtDifficult = false(1, numel(s)) ;
322 | 
323 | % PASCAL-like evaluation
324 | matches = evalDetections(...
325 |   gtBoxes, gtDifficult, ...
326 |   detections, scores) ;
327 | ```
328 | The `gtDifficult` flags can be used to mark some ground truth object occurrences as *difficult* and hence ignored in the evaluation. This is used in the PASCAL VOC challenge, but not here (i.e. no object occurrence is considered difficult).
329 | 
330 | `evalDetections` returns a `matches` structure with several fields. We focus here on `matches.detBoxFlags`: this contains a +1 for each detection that was found to be correct and -1 otherwise. We use this to visualize the detection errors:
331 | ```matlab
332 | % Visualization
333 | figure(1) ; clf ;
334 | imagesc(im) ; axis equal ; hold on ;
335 | vl_plotbox(detections(:, matches.detBoxFlags==+1), 'g', 'linewidth', 2) ;
336 | vl_plotbox(detections(:, matches.detBoxFlags==-1), 'r', 'linewidth', 2) ;
337 | vl_plotbox(gtBoxes, 'b', 'linewidth', 1) ;
338 | axis off ;
339 | ```
340 | 
341 | > **Task:** Use the supplied example code to evaluate the detector on one image. Look carefully at the output and convince yourself that it makes sense.
342 | 
343 | Now plot the PR curve:
344 | ```matlab
345 | figure(2) ; clf ;
346 | vl_pr(matches.labels, matches.scores) ;
347 | ```
348 | 
349 | > **Question:** There are a large number of errors in each image. Should you worry? In what manner is the PR curve affected? How would you eliminate the vast majority of these errors in practice?
350 | 
351 | ### Step 3.3: Evaluation on multiple images
352 | 
353 | Evaluation is typically done on multiple images rather than just one. This is implemented by the `evaluateModel.m` m-file.
354 | 
355 | > **Task:** Open `evaluateModel.m` and make sure you understand the main steps of the evaluation procedure.
356 | 
357 | Use the supplied example code to run the evaluation on the entire test set:
358 | ```matlab
359 | matches = evaluateModel(testImages, testBoxes, testBoxImages, ...
360 |   w, hogCellSize, scales) ;
361 | ```
362 | 
363 | **Note:** The function processes one image at a time, visualizing the results as it progresses. The PR curve is the result of the *accumulation* of the detections obtained thus far.
364 | 
365 | > **Task:** Open the `evaluateModel.m` file in MATLAB and add a breakpoint right at the end of the for loop. Now run the evaluation code again and look at each image individually (use `dbcont` to go to the next image). Check the correct and incorrect matches in each image and their ranking, and the effect of these on the cumulative precision-recall curve.
366 | 
367 | ## Part 4: Hard negative mining {#part4}
368 | 
369 | This part explores more advanced learning methods. So far, the SVM has been learned using a small and randomly sampled set of negative examples. However, in principle, every single patch that does not contain the object can be considered a negative sample. These are of course far too many to be used in practice; unfortunately, random sampling is also ineffective, as the most interesting (confusing) negative samples are a very small and special subset of all the possible ones.
370 | 
371 | *Hard negative mining* is a simple technique for finding a small set of key negative examples. The idea is simple: we start by training a model without any negatives at all (in this case the solver learns a 1-class SVM), and then we alternate between evaluating the model on the training data to find erroneous responses and adding the corresponding examples to the training set. In outline, the loop looks as sketched below.
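The following pseudo-MATLAB summarizes the procedure. It is only a sketch under stated assumptions: `numIterations`, `trainImagesSubset` and `trainSvm` are hypothetical placeholders, and `exercise4.m` contains the actual training loop:

```matlab
% Hard negative mining, in outline (sketch only; see exercise4.m).
neg = zeros(size(pos,1), size(pos,2), size(pos,3), 0) ; % start with no negatives
for t = 1:numIterations
  % 1. Train an SVM on the current positive and negative sets
  %    (pack the data, create labels, call vl_svmtrain, reshape w).
  w = trainSvm(pos, neg) ;
  % 2. Run the detector on a (growing) subset of the training images and
  %    collect the highest-scoring erroneous detections as new negatives.
  [matches, moreNeg] = evaluateModel(trainImagesSubset, ...
    trainBoxes, trainBoxImages, w, hogCellSize, scales) ;
  % 3. Grow the negative set (duplicates are removed afterwards).
  neg = cat(4, neg, moreNeg) ;
end
```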
372 | 
373 | ### Step 4.1: Train with hard negative mining {#stage4.1}
374 | 
375 | Use the supplied code in `exercise4.m` to run hard negative mining. The code repeats SVM training, as seen above, a number of times, progressively increasing the size of the `neg` array containing the negative samples. This array is updated using the output of:
376 | 
377 | ```matlab
378 | [matches, moreNeg] = ...
379 |   evaluateModel(...
380 |   vl_colsubset(trainImages', schedule(t), 'beginning'), ...
381 |   trainBoxes, trainBoxImages, ...
382 |   w, hogCellSize, scales) ;
383 | ```
384 | 
385 | Here `moreNeg` contains the HOG features of the top (highest-scoring and hence most confusing) image patches in the supplied training images.
386 | 
387 | > **Task:** Examine `evaluateModel.m` again to understand how hard negatives are extracted.
388 | 
389 | > **Question:** What is the purpose of the construct `vl_colsubset(trainImages', schedule(t), 'beginning')`? Why do you think we visit more negative images in later iterations?
390 | 
391 | The next step is to fuse the new negative set with the old one:
392 | ```matlab
393 | % Add negatives
394 | neg = cat(4, neg, moreNeg) ;
395 | ```
396 | 
397 | Note that hard negative mining could select the same negatives at different iterations; the following code squashes these duplicates:
398 | ```matlab
399 | % Remove negative duplicates
400 | z = reshape(neg, [], size(neg,4)) ;
401 | [~,keep] = unique(z','stable','rows') ;
402 | neg = neg(:,:,:,keep) ;
403 | ```
404 | 
405 | ### Step 4.2: Evaluate the model on the test data
406 | 
407 | Once hard negative mining and training are done, we are ready to evaluate the model on the *test* data (note that the model is evaluated on the *training* data for mining). As before:
408 | ```matlab
409 | evaluateModel(...
410 |   testImages, testBoxes, testBoxImages, ...
411 |   w, hogCellSize, scales) ;
412 | ```
413 | 
414 | ## Part 5: Train your own object detector
415 | 
416 | **Skip on fast track**
417 | 
418 | In this last part, you will learn your own object detector. To this end, open and look at `exercise5.m`. You will need to prepare the following data:
419 | 
420 | ### Step 5.1: Preparing the training data
421 | 
422 | * A folder `data/myPositives` containing files `image1.jpeg`, `image2.jpeg`, ..., each containing a single cropped occurrence of the target object. These crops can be of any size, but should be roughly square.
423 | * A folder `data/myNegatives` containing images `image1.jpeg`, `image2.jpeg`, ..., that *do not* contain the target object at all.
424 | * A test image `data/myTestImage.jpeg` containing the target object. This should not be one of the training images.
425 | 
426 | Run the code in `exercise5.m` to check that your training data looks right.
427 | 
428 | > **Task:** Understand the limitations of this simple detector and choose a target object that has a good chance of being learnable.
429 | 
430 | **Hint:** Note in particular that object instances must be similar and roughly aligned. If your object is not symmetric, consider choosing instances that face a particular direction (e.g. a left-facing horse head).
431 | 
432 | ### Step 5.2: Learn the model
433 | 
434 | Use the code supplied in `exercise5.m` to learn an SVM model for your object using hard negative mining as in [Stage 4.1](#stage4.1).
435 | 
436 | ### Step 5.3: Test the model
437 | 
438 | Use the code supplied in `exercise5.m` to evaluate the SVM model on a test image and visualize the result as in [Stage 2.1](#stage2.1).
439 | 
440 | > **Task:** Make sure you get sensible results. Go back to Step 5.1 if needed and adjust your data.
441 | 
442 | **Hint:** For debugging purposes, try using one of your training images as the test image. Does it work at least in this case?
443 | 
444 | ### Step 5.4: Detecting symmetric objects with multiple aspects
445 | 
446 | The basic detectors you have learned so far are *not* invariant to effects such as object deformations, out-of-plane rotations, and partial occlusions that affect most natural objects.
Handling these effects requires additional sophistication, such as deformable templates or mixtures of multiple templates.
447 | 
448 | In particular, many objects in nature are symmetric and, as such, their images appear flipped when the objects are seen from the left or from the right (consider for example a face). This can be handled by a pair of symmetric HOG templates. In this part we will explore this option.
449 | 
450 | > **Task:** Using the procedure above, train a HOG template `w` for a symmetric object facing in one specific direction. For example, train a left-facing horse head detector.
451 | 
452 | > **Task:** Collect test images containing the object facing in both directions. Run your detector and convince yourself that it works well only for the direction it was trained for.
453 | 
454 | HOG features have a well-defined structure that makes it possible to predict how the features transform when the underlying image is flipped. The transformation is in fact a simple *permutation* of the HOG elements. For a given spatial cell, HOG has 31 dimensions. The following code permutes the dimensions to flip the cell around the vertical axis:
455 | ```matlab
456 | perm = vl_hog('permutation') ;
457 | hog_flipped = hog(perm) ;
458 | ```
459 | Note that this permutation applies to a *single* HOG cell. However, the template is an $H \times W \times 31$-dimensional array of HOG cells.
460 | 
461 | > **Task:** Given a `hog` array of dimension $H \times W \times 31$, write MATLAB code to obtain the flipped feature array `hog_flipped`.
462 | 
463 | **Hint:** Recall that the first dimension spans the vertical axis, the second dimension the horizontal axis, and the third dimension the feature channels. `perm` should be applied to the last dimension. Do you need to permute anything else?
464 | 
465 | Now let us apply flipping to the model trained earlier:
466 | 
467 | > **Task:** Let `w` be the model you trained before. Use the procedure to flip HOG to generate `w_flipped`. Then visualize both `w` and `w_flipped` as done in [Sect. 1.3](#sect13). Convince yourself that flipping was successful.
468 | 
469 | We now have two models, `w` and `w_flipped`, one for each view of the object.
470 | 
471 | > **Task:** Run both models in turn on the same image, obtaining two lists of bounding boxes. Find a way to merge the two lists and visualize the top detections. Convince yourself that you can now detect objects facing either way.
472 | 
473 | **Hint:** Recall how redundant detections can be removed using non-maximum suppression.
474 | 
475 | **Congratulations: This concludes the practical!**
476 | 
477 | [^nn]: This is part of the MatConvNet toolbox for convolutional neural networks. Nevertheless, there is no neural network discussed here.
478 | 
479 | [1]: images/cover.jpeg "cover.jpeg"
480 | 
481 | ## History
482 | 
483 | * Used in the Oxford AIMS CDT, 2014-18
484 | 
--------------------------------------------------------------------------------
/evalDetections.m:
--------------------------------------------------------------------------------
1 | function match = evalDetections(gtBoxes, gtDifficult, detBoxes, detScores, varargin)
2 | % EVALDETECTIONS
3 | % MATCH = EVALDETECTIONS(GTBOXES, GTDIFFICULT, DETBOXES, DETSCORES)
4 | %
5 | % MATCH.DETBOXFLAGS: +1 good, 0 match to difficult/ignored, -1 wrong
6 | % MATCH.DETBOXTOGT: map to matched GT, NaN if no match
7 | % MATCH.GTBOXTODET: map to matched Det, NaN if missed, 0 if difficult, -1 if ignored
8 | % MATCH.SCORES: for evaluation (missed boxes have -inf score)
9 | % MATCH.LABELS: for evaluation (difficult/ignored boxes are assigned 0 label)
10 | %
11 | % The first portion of MATCH.SCORES and MATCH.LABELS corresponds to
12 | % the DETBOXES and DETSCORES passed as input. To these, any
13 | % non-matched ground truth bounding box is appended with -INF
14 | % score.
15 | %
16 | % The boxes are assumed to be given in the PASCAL format, i.e. the
17 | % coordinates are indices of the top-left, bottom-right pixels, not
18 | % dimensionless coordinates of the box boundaries.
19 | %
20 | % The detection scores are NOT used to reorder the detection boxes
21 | % (these should normally be passed by decreasing score) so the
22 | % output variables match the order of the input variables.
23 | %
24 | % Author:: Andrea Vedaldi
25 | 
26 | % AUTORIGHTS
27 | % Copyright (C) 2008-09 Andrea Vedaldi
28 | %
29 | % This file is part of the VGG MKL Class and VGG MKL Det code packages,
30 | % available in the terms of the GNU General Public License version 2.
31 | 
32 | opts.threshold = 0.5 ;
33 | opts.criterion = 'overlap' ;
34 | opts.ignoreDuplicates = false ;
35 | opts.pascalFormat = true ;
36 | opts.display = false ;
37 | opts = vl_argparse(opts, varargin) ;
38 | numGtBoxes = size(gtBoxes, 2) ;
39 | numDetBoxes = size(detBoxes, 2) ;
40 | 
41 | gtBoxToDet = NaN * ones(1, numGtBoxes) ;
42 | detBoxToGt = NaN * ones(1, numDetBoxes) ;
43 | detBoxFlags = - ones(1,numDetBoxes) ;
44 | 
45 | if isempty(gtBoxes)
46 |   match.detBoxFlags = detBoxFlags ;
47 |   match.detBoxToGt = detBoxToGt ;
48 |   match.gtBoxToDet = [] ;
49 |   match.scores = detScores ;
50 |   match.labels = -ones(1,size(detBoxes,2)) ;
51 |   return ;
52 | end
53 | 
54 | % match detected boxes to gt boxes based on the selected criterion
55 | switch lower(opts.criterion)
56 |   case 'overlap'
57 |     criterion = boxoverlap(gtBoxes, detBoxes, 'pascalFormat', opts.pascalFormat) ;
58 |   case 'inclusion'
59 |     criterion = boxinclusion(gtBoxes, detBoxes, 'pascalFormat', opts.pascalFormat) ;
60 |   otherwise
61 |     error('Unknown criterion %s.', opts.criterion) ;
62 | end
63 | [criterion, allDetBoxToGt] = max(criterion', [], 2) ;
64 | 
65 | % prematch detected boxes to difficult gt boxes and remove them from
66 | % the evaluation
67 | selDiff = find((criterion > opts.threshold) & gtDifficult(1,allDetBoxToGt)') ;
68 | detBoxFlags(selDiff) = 0 ;
69 | detBoxToGt(selDiff) = allDetBoxToGt(selDiff) ;
70 | gtBoxToDet(gtDifficult) = 0 ;
71 | 
72 | % match the remaining detected boxes to the non-difficult gt boxes
73 | selDetOk = find(criterion > opts.threshold) ;
74 | 
75 | nMiss = sum(~gtDifficult) ;
76 | for oki = 1:length(selDetOk)
77 |   % if all gt boxes have been assigned stop
78 |   if nMiss == 0 && ~opts.ignoreDuplicates, break ; end
79 | 
80 |   dei = selDetOk(oki) ;
81 |   gti = allDetBoxToGt(dei) ;
82 | 
83 |   % match the gt box to the detection only if the gt box
84 |   % is still unassigned (first detection)
85 |   if isnan(gtBoxToDet(gti))
86 |     gtBoxToDet(gti) = dei ;
87 |     detBoxToGt(dei) = gti ;
88 |     detBoxFlags(dei) = +1 ;
89 |     nMiss = nMiss - 1 ;
90 | 
91 | 
92 | 
93 |   elseif opts.ignoreDuplicates
94 |     % match the detection to the gt box in any case
95 |     % if duplicates are ignored
96 |     detBoxToGt(dei) = gti ;
97 |     detBoxFlags(dei) = 0 ;
98 |   end
99 | end
100 | 
101 | % calculate equivalent (scores, labels) pair
102 | selM = find(detBoxFlags == +1) ;  % match
103 | selDM = find(detBoxFlags == -1) ; % don't match
104 | selDF = find(detBoxFlags == 0) ;  % difficult or ignored
105 | 
106 | scores = [detScores, -inf * ones(1,nMiss)] ;
107 | labels = [ones(size(detScores)), ones(1,nMiss)] ;
108 | labels(selDM) = -1 ;
109 | labels(selDF) = 0 ;
110 | 
111 | match.detBoxFlags = detBoxFlags ;
112 | match.detBoxToGt = detBoxToGt ;
113 | match.gtBoxToDet = gtBoxToDet ;
114 | match.scores = scores ;
115 | match.labels = labels ;
116 | 
117 | if opts.display
118 |   hold on ;
119 |   vl_plotbox(gtBoxes, 'b', 'linewidth', 2) ;
120 |   vl_plotbox(detBoxes(:, detBoxFlags == +1), 'g') ;
121 |   vl_plotbox(detBoxes(:, detBoxFlags == 0), 'y') ;
122 |   vl_plotbox(detBoxes(:, detBoxFlags == -1), 'r') ;
123 | end
124 | 
--------------------------------------------------------------------------------
/evaluateModel.m:
--------------------------------------------------------------------------------
1 | function [matches, negs] = evaluateModel(...
2 |   testImages, testBoxes, testBoxImages, w, hogCellSize, scales)
3 | 
4 | clear matches ;
5 | negs = {} ;
6 | for i=1:numel(testImages)
7 |   % Detect on test image
8 |   im = imread(testImages{i}) ;
9 |   im = im2single(im) ;
10 |   [detections, scores, hog] = detect(im, w, hogCellSize, scales) ;
11 | 
12 |   % Non-maxima suppression
13 |   keep = boxsuppress(detections, scores, 0.5) ;
14 |   keep = find(keep) ;
15 |   keep = vl_colsubset(keep, 15, 'beginning') ;
16 |   detections = detections(:, keep) ;
17 |   scores = scores(keep) ;
18 | 
19 |   % Find all the objects in the target image
20 |   ok = find(strcmp(testImages{i}, testBoxImages)) ;
21 |   gtBoxes = testBoxes(:, ok) ;
22 |   gtDifficult = false(1, numel(ok)) ;
23 |   matches(i) = evalDetections(...
24 |     gtBoxes, gtDifficult, ...
25 |     detections, scores) ;
26 | 
27 |   % Visualize progress
28 |   clf;
29 |   subplot(1,3,[1 2]) ;
30 |   imagesc(im) ; axis equal ; hold on ;
31 |   labels = arrayfun(@(x)sprintf('%d',x),1:size(detections,2),'uniformoutput',0) ;
32 |   sp = fliplr(find(matches(i).detBoxFlags == -1)) ;
33 |   sn = fliplr(find(matches(i).detBoxFlags == +1)) ;
34 |   vl_plotbox(detections(:, sp), 'r', 'linewidth', 1, 'label', labels(sp)) ;
35 |   vl_plotbox(detections(:, sn), 'g', 'linewidth', 2, 'label', labels(sn)) ;
36 |   vl_plotbox(gtBoxes, 'b', 'linewidth', 1) ;
37 |   title(sprintf('Image %d of %d', i, numel(testImages))) ;
38 |   axis off ;
39 | 
40 |   subplot(1,3,3) ;
41 |   vl_pr([matches.labels], [matches.scores]) ;
42 | 
43 |   % If required, collect top negative features
44 |   if nargout > 1
45 |     overlaps = boxoverlap(gtBoxes, detections) ;
46 |     overlaps(end+1,:) = 0 ;
47 |     overlaps = max(overlaps,[],1) ;
48 |     detections(:, overlaps >= 0.25) = [] ;
49 |     detections = vl_colsubset(detections, 10, 'beginning') ;
50 |     negs{end+1} = extract(hog, hogCellSize, scales, w, detections) ;
51 |   end
52 | 
53 |   % Break here with the debugger
54 |   drawnow ;
55 | end
56 | 
57 | if nargout > 1
58 |   negs = cat(4, negs{:}) ;
59 | end
--------------------------------------------------------------------------------
/exercise1.m:
--------------------------------------------------------------------------------
1 | 
2 | % -------------------------------------------------------------------------
3 | % Step 1.0: Load training data
4 | % -------------------------------------------------------------------------
5 | 
6 | setup ;
7 | 
8 | % Load the training and testing data (trainImages, trainBoxes, ...)
9 | % The function takes the ID of the type of traffic sign we want to recognize
10 | % 1 is the 30 km/h speed limit
11 | loadData(1) ;
12 | 
13 | % -------------------------------------------------------------------------
14 | % Step 1.1: Visualize the training images
15 | % -------------------------------------------------------------------------
16 | 
17 | figure(1) ; clf ;
18 | 
19 | subplot(1,2,1) ;
20 | imagesc(vl_imarraysc(trainBoxPatches)) ;
21 | axis off ;
22 | title('Training images (positive samples)') ;
23 | axis equal ;
24 | 
25 | subplot(1,2,2) ;
26 | imagesc(mean(trainBoxPatches,4)) ;
27 | box off ;
28 | title('Average') ;
29 | axis equal ;
30 | 
31 | % -------------------------------------------------------------------------
32 | % Step 1.2: Extract HOG features from the training images
33 | % -------------------------------------------------------------------------
34 | 
35 | hogCellSize = 8 ;
36 | trainHog = {} ;
37 | for i = 1:size(trainBoxPatches,4)
38 |   trainHog{i} = vl_hog(trainBoxPatches(:,:,:,i), hogCellSize) ;
39 | end
40 | trainHog = cat(4, trainHog{:}) ;
41 | 
42 | % -------------------------------------------------------------------------
43 | % Step 1.3: Learn a simple HOG template model
44 | % -------------------------------------------------------------------------
45 | 
46 | w = mean(trainHog, 4) ;
47 | 
48 | save('data/signs-model-1.mat', 'w') ;
49 | 
50 | figure(2) ; clf ;
51 | imagesc(vl_hog('render', w)) ;
52 | colormap gray ;
53 | axis equal ;
54 | title('HOG model') ;
55 | 
56 | % -------------------------------------------------------------------------
57 | % Step 1.4: Apply the model to a test image
58 | % -------------------------------------------------------------------------
59 | 
60 | im = imread('data/signs-sample-image.jpg') ;
61 | im = im2single(im) ;
62 | hog = vl_hog(im, hogCellSize) ;
63 | scores = vl_nnconv(hog, w, []) ;
64 | 
65 | figure(3) ; clf ;
66 | imagesc(scores) ;
67 | title('Detection') ;
68 | colorbar ;
69 | 
70 | % -------------------------------------------------------------------------
71 | % Step 1.5: Extract the top detection
72 | % -------------------------------------------------------------------------
73 | 
74 | [best, bestIndex] = max(scores(:)) ;
75 | 
76 | [hy, hx] = ind2sub(size(scores), bestIndex) ;
77 | x = (hx - 1) * hogCellSize + 1 ;
78 | y = (hy - 1) * hogCellSize + 1 ;
79 | 
80 | modelWidth = size(trainHog, 2) ;
81 | modelHeight = size(trainHog, 1) ;
82 | detection = [
83 |   x - 0.5 ;
84 |   y - 0.5 ;
85 |   x + hogCellSize * modelWidth - 0.5 ;
86 |   y + hogCellSize * modelHeight - 0.5 ;] ;
87 | 
88 | figure(4) ; clf ;
89 | imagesc(im) ; axis equal ;
90 | hold on ;
91 | vl_plotbox(detection, 'g', 'linewidth', 5) ;
92 | title('Top detection') ;
93 | 
94 | 
95 | 
96 | 
--------------------------------------------------------------------------------
/exercise2.m:
--------------------------------------------------------------------------------
1 | % EXERCISE2
2 | setup ;
3 | 
4 | %targetClass = 1 ;
5 | %targetClass = 'prohibitory' ;
6 | targetClass = 'mandatory' ;
7 | %targetClass = 'danger' ;
8 | 
9 | loadData(targetClass) ;
10 | 
11 | % Compute HOG features of examples (see Step 1.2)
12 | hogCellSize = 8 ;
13 | trainHog = {} ;
14 | for i = 1:size(trainBoxPatches,4)
15 |   trainHog{i} = vl_hog(trainBoxPatches(:,:,:,i), hogCellSize) ;
16 | end
17 | trainHog = cat(4, trainHog{:}) ;
18 | 
19 | % Learn a trivial HOG model (see Step 1.3)
20 | w = mean(trainHog, 4) ;
21 | save('data/signs-model-1.mat', 'w', 'targetClass') ;
22 | 
23 | figure(2) ; clf ;
24 | imagesc(vl_hog('render', w)) ;
25 | colormap gray ; axis equal off ;
26 | title('Trivial HOG model') ;
27 | 
28 | % -------------------------------------------------------------------------
29 | % Step 2.1: Multi-scale detection
30 | % -------------------------------------------------------------------------
31 | 
32 | % Scale space configuration
33 | minScale = -1 ;
34 | maxScale = 3 ;
35 | numOctaveSubdivisions = 3 ;
36 | scales = 2.^linspace(...
37 |   minScale,...
38 |   maxScale,...
39 |   numOctaveSubdivisions*(maxScale-minScale+1)) ;
40 | 
41 | im = imread(testImages{3}) ;
42 | im = im2single(im) ;
43 | 
44 | figure(5) ; clf ;
45 | detection = detectAtMultipleScales(im, w, hogCellSize, scales) ;
46 | 
47 | figure(6) ; clf ;
48 | imagesc(im) ; axis equal off ; hold on ;
49 | vl_plotbox(detection, 'g', 'linewidth', 2) ;
50 | title('Trivial detector output') ;
51 | 
52 | % -------------------------------------------------------------------------
53 | % Step 2.2: Collect positive and negative training data
54 | % -------------------------------------------------------------------------
55 | 
56 | % Collect positive training data
57 | pos = trainHog ;
58 | 
59 | % Collect negative training data
60 | neg = {} ;
61 | modelWidth = size(trainHog, 2) ;
62 | modelHeight = size(trainHog, 1) ;
63 | for t=1:numel(trainImages)
64 |   % Get the HOG features of a training image
65 |   trainIm = imread(trainImages{t}) ;
66 |   trainIm = im2single(trainIm) ;
67 |   hog = vl_hog(trainIm, hogCellSize) ;
68 | 
69 |   % Sample uniformly 10 HOG patches
70 |   % Assume that these are negative (almost certainly true)
71 |   width = size(hog,2) - modelWidth + 1 ;
72 |   height = size(hog,1) - modelHeight + 1 ;
73 |   index = vl_colsubset(1:width*height, 10, 'uniform') ;
74 | 
75 |   for j=1:numel(index)
76 |     [hy, hx] = ind2sub([height width], index(j)) ;
77 |     sx = hx + (0:modelWidth-1) ;
78 |     sy = hy + (0:modelHeight-1) ;
79 |     neg{end+1} = hog(sy, sx, :) ;
80 |   end
81 | end
82 | neg = cat(4, neg{:}) ;
83 | 
84 | % -------------------------------------------------------------------------
85 | % Step 2.3: Learn a model with an SVM
86 | % -------------------------------------------------------------------------
87 | 
88 | numPos = size(pos,4) ;
89 | numNeg = size(neg,4) ;
90 | C = 10 ;
91 | lambda = 1 / (C * (numPos + numNeg)) ;
92 | 
93 | % Pack the data into a matrix with one datum per column
94 | x = cat(4, pos, neg) ;
95 | x = reshape(x, [], numPos + numNeg) ;
96 | 
97 | % Create a vector of binary labels
98 | y = [ones(1, size(pos,4)) -ones(1, size(neg,4))] ;
99 | 
100 | % Learn the SVM using an SVM solver
101 | w = vl_svmtrain(x,y,lambda,'epsilon',0.01,'verbose') ;
102 | 
103 | % Reshape the model vector into a model HOG template
104 | w = single(reshape(w, modelHeight, modelWidth, [])) ;
105 | save('data/signs-model-2.mat', 'w', 'targetClass') ;
106 | 
107 | % Plot model
108 | figure(7) ; clf ;
109 | imagesc(vl_hog('render', w)) ;
110 | colormap gray ; axis equal off ;
111 | title('SVM HOG model') ;
112 | 
113 | % -------------------------------------------------------------------------
114 | % Step 2.4: Evaluate learned model
115 | % -------------------------------------------------------------------------
116 | 
117 | % Compute detections
118 | figure(8) ; clf ;
119 | detection = detectAtMultipleScales(im, w, hogCellSize, scales) ;
120 | 
121 | % Plot top detection
122 | figure(9) ; clf ;
123 | imagesc(im) ; axis equal off ; hold on ;
124 | vl_plotbox(detection, 'g', 'linewidth', 2) ;
125 | title('SVM detector output') ;
126 | 
127 | 
128 | 
--------------------------------------------------------------------------------
/exercise3.m:
--------------------------------------------------------------------------------
1 | % EXERCISE3
2 | setup ;
3 | 
4 | % Feature configuration
5 | hogCellSize = 8 ;
6 | numHardNegativeMiningIterations = 3 ;
7 | minScale = -1 ;
8 | maxScale = 3 ;
9 | numOctaveSubdivisions = 3 ;
10 | scales = 2.^linspace(...
11 |   minScale,...
12 |   maxScale,...
13 |   numOctaveSubdivisions*(maxScale-minScale+1)) ;
14 | 
15 | % Load data
16 | load('data/signs-model-2.mat','w','targetClass') ;
17 | loadData(targetClass) ;
18 | 
19 | % -------------------------------------------------------------------------
20 | % Step 3.1: Multiple detections
21 | % -------------------------------------------------------------------------
22 | 
23 | im = imread(testImages{3}) ;
24 | im = im2single(im) ;
25 | 
26 | % Compute detections
27 | [detections, scores] = detect(im, w, hogCellSize, scales) ;
28 | 
29 | % Non-maxima suppression
30 | keep = boxsuppress(detections, scores, 0.25) ;
31 | 
32 | detections = detections(:, keep) ;
33 | scores = scores(keep) ;
34 | 
35 | % Further keep only top detections
36 | detections = detections(:, 1:10) ;
37 | scores = scores(1:10) ;
38 | 
39 | % Plot top detection
40 | figure(10) ; clf ;
41 | imagesc(im) ; axis equal ;
42 | hold on ;
43 | vl_plotbox(detections, 'g', 'linewidth', 2, ...
44 |   'label', arrayfun(@(x)sprintf('%.2f',x),scores,'uniformoutput',0)) ;
45 | title('Multiple detections') ;
46 | 
47 | % -------------------------------------------------------------------------
48 | % Step 3.2: Detector evaluation
49 | % -------------------------------------------------------------------------
50 | 
51 | % Find all the objects in the target image
52 | s = find(strcmp(testImages{3}, testBoxImages)) ;
53 | gtBoxes = testBoxes(:, s) ;
54 | 
55 | % No example is considered difficult
56 | gtDifficult = false(1, numel(s)) ;
57 | 
58 | % PASCAL-like evaluation
59 | matches = evalDetections(...
60 |   gtBoxes, gtDifficult, ...
61 |   detections, scores) ;
62 | 
63 | % Visualization
64 | figure(1) ; clf ;
65 | imagesc(im) ; axis equal ; hold on ;
66 | vl_plotbox(detections(:, matches.detBoxFlags==+1), 'g', 'linewidth', 2) ;
67 | vl_plotbox(detections(:, matches.detBoxFlags==-1), 'r', 'linewidth', 2) ;
68 | vl_plotbox(gtBoxes, 'b', 'linewidth', 1) ;
69 | axis off ;
70 | 
71 | figure(2) ; clf ;
72 | vl_pr(matches.labels, matches.scores) ;
73 | 
74 | % -------------------------------------------------------------------------
75 | % Step 3.3: Evaluation on multiple images
76 | % -------------------------------------------------------------------------
77 | 
78 | figure(3) ; clf ;
79 | 
80 | matches = evaluateModel(testImages, testBoxes, testBoxImages, ...
81 |   w, hogCellSize, scales) ;
82 | 
--------------------------------------------------------------------------------
/exercise4.m:
--------------------------------------------------------------------------------
1 | % EXERCISE4
2 | setup ;
3 | 
4 | % Training configuration
5 | %targetClass = 1 ;
6 | %targetClass = 'prohibitory' ;
7 | targetClass = 'mandatory' ;
8 | %targetClass = 'danger' ;
9 | numHardNegativeMiningIterations = 7 ;
10 | schedule = [1 2 5 5 100 100 100] ;
11 | 
12 | % Scale space configuration
13 | hogCellSize = 8 ;
14 | minScale = -1 ;
15 | maxScale = 3 ;
16 | numOctaveSubdivisions = 3 ;
17 | scales = 2.^linspace(...
18 |   minScale,...
19 |   maxScale,...
20 |   numOctaveSubdivisions*(maxScale-minScale+1)) ;
21 | 
22 | % Load data
23 | loadData(targetClass) ;
24 | 
25 | % Compute HOG features of examples (see Step 1.2)
26 | trainBoxHog = {} ;
27 | for i = 1:size(trainBoxPatches,4)
28 |   trainBoxHog{i} = vl_hog(trainBoxPatches(:,:,:,i), hogCellSize) ;
29 | end
30 | trainBoxHog = cat(4, trainBoxHog{:}) ;
31 | modelWidth = size(trainBoxHog,2) ;
32 | modelHeight = size(trainBoxHog,1) ;
33 | 
34 | % -------------------------------------------------------------------------
35 | % Step 4.1: Train with hard negative mining
36 | % -------------------------------------------------------------------------
37 | 
38 | % Initial positive and negative data
39 | pos = trainBoxHog(:,:,:,ismember(trainBoxLabels,targetClass)) ;
40 | neg = zeros(size(pos,1),size(pos,2),size(pos,3),0) ;
41 | 
42 | for t=1:numHardNegativeMiningIterations
43 |   numPos = size(pos,4) ;
44 |   numNeg = size(neg,4) ;
45 |   C = 1 ;
46 |   lambda = 1 / (C * (numPos + numNeg)) ;
47 | 
48 |   fprintf('Hard negative mining iteration %d: pos %d, neg %d\n', ...
49 |     t, numPos, numNeg) ;
50 | 
51 |   % Train an SVM model (see Step 2.2)
52 |   x = cat(4, pos, neg) ;
53 |   x = reshape(x, [], numPos + numNeg) ;
54 |   y = [ones(1, size(pos,4)) -ones(1, size(neg,4))] ;
55 |   w = vl_svmtrain(x,y,lambda,'epsilon',0.01,'verbose') ;
56 |   w = single(reshape(w, modelHeight, modelWidth, [])) ;
57 | 
58 |   % Plot model
59 |   figure(1) ; clf ;
60 |   imagesc(vl_hog('render', w)) ;
61 |   colormap gray ; axis equal ;
62 |   title(sprintf('SVM HOG model (retraining iteration %d)',t)) ;
63 | 
64 |   % Evaluate on training data and mine hard negatives
65 |   figure(2) ; set(gcf, 'name', sprintf('Retraining iteration %d',t)) ;
66 |   [matches, moreNeg] = ...
67 |     evaluateModel(...
68 |     vl_colsubset(trainImages', schedule(t), 'beginning'), ...
69 |     trainBoxes, trainBoxImages, ...
70 |     w, hogCellSize, scales) ;
71 | 
72 |   % Add negatives
73 |   neg = cat(4, neg, moreNeg) ;
74 | 
75 |   % Remove negative duplicates
76 |   z = reshape(neg, [], size(neg,4)) ;
77 |   [~,keep] = unique(z','stable','rows') ;
78 |   neg = neg(:,:,:,keep) ;
79 | end
80 | 
81 | % -------------------------------------------------------------------------
82 | % Step 4.2: Evaluate the model on the test data
83 | % -------------------------------------------------------------------------
84 | 
85 | figure(3) ; clf ;
86 | evaluateModel(...
87 |   testImages, testBoxes, testBoxImages, ...
88 |   w, hogCellSize, scales) ;
--------------------------------------------------------------------------------
/exercise5.m:
--------------------------------------------------------------------------------
1 | % EXERCISE5
2 | setup ;
3 | 
4 | % Training configuration
5 | targetClass = 1 ;
6 | numHardNegativeMiningIterations = 5 ;
7 | schedule = [1 2 5 5 5] ;
8 | 
9 | % Scale space configuration
10 | hogCellSize = 8 ;
11 | minScale = -1 ;
12 | maxScale = 3 ;
13 | numOctaveSubdivisions = 3 ;
14 | scales = 2.^linspace(...
15 |   minScale,...
16 |   maxScale,...
17 | numOctaveSubdivisions*(maxScale-minScale+1)) ; 18 | 19 | % ------------------------------------------------------------------------- 20 | % Step 5.1: Construct custom training data 21 | % ------------------------------------------------------------------------- 22 | 23 | % Load object examples 24 | trainImages = {} ; 25 | trainBoxes = [] ; 26 | trainBoxPatches = {} ; 27 | trainBoxImages = {} ; 28 | trainBoxLabels = [] ; 29 | 30 | % Construct negative data 31 | names = dir('data/myNegatives/*.jpeg') ; 32 | trainImages = fullfile('data', 'myNegatives', {names.name}) ; 33 | 34 | % Construct positive data 35 | names = dir('data/myPositives/*.jpeg') ; 36 | names = fullfile('data', 'myPositives', {names.name}) ; 37 | for i=1:numel(names) 38 | im = imread(names{i}) ; 39 | im = imresize(im, [64 64]) ; 40 | trainBoxes(:,i) = [0.5 ; 0.5 ; 64.5 ; 64.5] ; 41 | trainBoxPatches{i} = im2single(im) ; 42 | trainBoxImages{i} = names{i} ; 43 | trainBoxLabels(i) = 1 ; 44 | end 45 | trainBoxPatches = cat(4, trainBoxPatches{:}) ; 46 | 47 | % Compute HOG features of examples (see Step 1.2) 48 | trainBoxHog = {} ; 49 | for i = 1:size(trainBoxPatches,4) 50 | trainBoxHog{i} = vl_hog(trainBoxPatches(:,:,:,i), hogCellSize) ; 51 | end 52 | trainBoxHog = cat(4, trainBoxHog{:}) ; 53 | modelWidth = size(trainBoxHog,2) ; 54 | modelHeight = size(trainBoxHog,1) ; 55 | 56 | % ------------------------------------------------------------------------- 57 | % Step 5.2: Visualize the training images 58 | % ------------------------------------------------------------------------- 59 | 60 | figure(1) ; clf ; 61 | 62 | subplot(1,2,1) ; 63 | imagesc(vl_imarraysc(trainBoxPatches)) ; 64 | axis off ; 65 | title('Training images (positive samples)') ; 66 | axis equal ; 67 | 68 | subplot(1,2,2) ; 69 | imagesc(mean(trainBoxPatches,4)) ; 70 | box off ; 71 | title('Average') ; 72 | axis equal ; 73 | 74 | % ------------------------------------------------------------------------- 75 | % Step 5.3: Train with hard negative mining 76 | % ------------------------------------------------------------------------- 77 | 78 | % Initial positive and negative data 79 | pos = trainBoxHog(:,:,:,ismember(trainBoxLabels,targetClass)) ; 80 | neg = zeros(size(pos,1),size(pos,2),size(pos,3),0) ; 81 | 82 | for t=1:numHardNegativeMiningIterations 83 | numPos = size(pos,4) ; 84 | numNeg = size(neg,4) ; 85 | C = 1 ; 86 | lambda = 1 / (C * (numPos + numNeg)) ; 87 | 88 | fprintf('Hard negative mining iteration %d: pos %d, neg %d\n', ... 89 | t, numPos, numNeg) ; 90 | 91 | % Train an SVM model (see Step 2.2) 92 | x = cat(4, pos, neg) ; 93 | x = reshape(x, [], numPos + numNeg) ; 94 | y = [ones(1, size(pos,4)) -ones(1, size(neg,4))] ; 95 | w = vl_svmtrain(x,y,lambda,'epsilon',0.01,'verbose') ; 96 | w = single(reshape(w, modelHeight, modelWidth, [])) ; 97 | 98 | % Plot model 99 | figure(2) ; clf ; 100 | imagesc(vl_hog('render', w)) ; 101 | colormap gray ; 102 | axis equal ; 103 | title('SVM HOG model') ; 104 | 105 | % Evaluate on training data and mine hard negatives 106 | figure(3) ; 107 | [matches, moreNeg] = ... 108 | evaluateModel(... 109 | vl_colsubset(trainImages', schedule(t), 'beginning'), ... 110 | trainBoxes, trainBoxImages, ... 
111 |     w, hogCellSize, scales) ;
112 | 
113 |   % Add negatives
114 |   neg = cat(4, neg, moreNeg) ;
115 | 
116 |   % Remove negative duplicates
117 |   z = reshape(neg, [], size(neg,4)) ;
118 |   [~,keep] = unique(z','stable','rows') ;
119 |   neg = neg(:,:,:,keep) ;
120 | end
121 | 
122 | 
123 | % -------------------------------------------------------------------------
124 | % Step 5.4: Evaluate the model on the test data
125 | % -------------------------------------------------------------------------
126 | 
127 | im = imread('data/myTestImage.jpeg') ;
128 | im = im2single(im) ;
129 | 
130 | % Compute detections, keeping the top ten after non-maximum suppression
131 | [detections, scores] = detect(im, w, hogCellSize, scales) ;
132 | keep = vl_colsubset(find(boxsuppress(detections, scores, 0.25)), 10, 'beginning') ;
133 | detections = detections(:, keep) ;
134 | scores = scores(keep) ;
135 | 
136 | % Plot top detections
137 | figure(3) ; clf ;
138 | imagesc(im) ; axis equal ;
139 | hold on ;
140 | vl_plotbox(detections, 'g', 'linewidth', 2, ...
141 |   'label', arrayfun(@(x)sprintf('%.2f',x),scores,'uniformoutput',0)) ;
142 | title('Multiple detections') ;
--------------------------------------------------------------------------------
/extra/Makefile:
--------------------------------------------------------------------------------
1 | name ?= practical-category-detection
2 | ver ?= 2018a
3 | 
4 | code=\
5 | boxinclusion.m \
6 | boxoverlap.m \
7 | boxsuppress.m \
8 | detect.m \
9 | detectAtMultipleScales.m \
10 | evalDetections.m \
11 | evaluateModel.m \
12 | exercise1.m \
13 | exercise2.m \
14 | exercise3.m \
15 | exercise4.m \
16 | exercise5.m \
17 | extract.m \
18 | loadData.m \
19 | setup.m \
20 | README.md \
21 | vlfeat \
22 | matconvnet
23 | 
24 | doc=\
25 | doc/images \
26 | doc/instructions.html
27 | 
28 | data=\
29 | data/signs.mat \
30 | data/signs-sample-image.jpg \
31 | data/signs
32 | 
33 | include extra/practical/Makefile
--------------------------------------------------------------------------------
/extra/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | mkdir -p data/tmp
4 | 
5 | cp -vf extra/signs-sample-image.jpg data/
6 | 
7 | (
8 |   cd data/tmp
9 |   wget -c http://benchmark.ini.rub.de/Dataset_GTSDB/TrainIJCNN2013.zip
10 |   unzip -n TrainIJCNN2013.zip
11 | )
12 | 
13 | mkdir -p data/signs
14 | mogrify -path data/signs -format jpeg data/tmp/TrainIJCNN2013/*.ppm
--------------------------------------------------------------------------------
/extra/prepareLabData.m:
--------------------------------------------------------------------------------
1 | % PREPARELABDATA
2 | 
3 | % --------------------------------------------------------------------
4 | % Download VLFeat
5 | % --------------------------------------------------------------------
6 | 
7 | if ~exist('vlfeat', 'dir')
8 |   from = 'http://www.vlfeat.org/download/vlfeat-0.9.21-bin.tar.gz' ;
9 |   fprintf('Downloading vlfeat from %s\n', from) ;
10 |   untar(from, 'data') ;
11 |   movefile('data/vlfeat-0.9.21', 'vlfeat') ;
12 | end
13 | 
14 | setup ;
15 | 
16 | % --------------------------------------------------------------------
17 | % Download and preprocess traffic sign data
18 | % --------------------------------------------------------------------
19 | 
20 | prefix = 'data/tmp/TrainIJCNN2013' ;
21 | [names,x1,y1,x2,y2,labels] = textread(fullfile(prefix, 'gt.txt'), ...
22 |   '%s%d%d%d%d%d', 'headerlines', 1, 'delimiter', ';') ;
23 | boxes = [x1, y1, x2, y2]'+1 ;
24 | 
25 | images = fullfile(prefix, names) ;
26 | patches = {} ;
27 | for j = 1:numel(images)
28 |   t = imread(images{j}) ;
29 |   t = im2single(t) ;
30 |   t = imcrop(t, [x1(j) y1(j) x2(j)-x1(j)+1 y2(j)-y1(j)+1]) ;
31 |   t = imresize(t, [64 64]) ;
32 |   patches{j} = t ;
33 |   [~,base,~] = fileparts(images{j}) ;
34 |   images{j} = fullfile('data', 'signs', [base '.jpeg']) ;
35 | end
36 | patches = cat(4, patches{:}) ;
37 | 
38 | train = unique(names) ;
39 | train = train(randperm(numel(train))) ;
40 | train = train(1:400) ;
41 | train = ismember(names, train) ;
42 | test = ~train ;
43 | 
44 | trainImages = unique(images(train)) ;
45 | trainBoxes = boxes(:, train) ;
46 | trainBoxImages = images(train) ;
47 | trainBoxLabels = labels(train) ;
48 | trainBoxPatches = patches(:,:,:,train) ;
49 | 
50 | testImages = unique(images(test)) ;
51 | testBoxes = boxes(:, test) ;
52 | testBoxImages = images(test) ;
53 | testBoxLabels = labels(test) ;
54 | testBoxPatches = patches(:,:,:,test) ;
55 | 
56 | save('data/signs.mat', ...
57 |   'trainImages', ...
58 |   'trainBoxes', ...
59 |   'trainBoxImages', ...
60 |   'trainBoxLabels', ...
61 |   'trainBoxPatches', ...
62 |   'testImages', ...
63 |   'testBoxes', ...
64 |   'testBoxImages', ...
65 |   'testBoxLabels', ...
66 |   'testBoxPatches') ;
67 | 
--------------------------------------------------------------------------------
/extra/signs-sample-image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-object-category-detection/046b6c030babe0f5e76e842abf61145136866308/extra/signs-sample-image.jpg
--------------------------------------------------------------------------------
/extract.m:
--------------------------------------------------------------------------------
1 | function features = extract(hog, hogCellSize, scales, w, detections)
2 | % EXTRACT  Extract the HOG features of the given detection boxes from the HOG pyramid HOG.
3 | modelWidth = size(w,2) ;
4 | modelHeight = size(w,1) ;
5 | 
6 | s = (detections(3,:) - detections(1,:)) / hogCellSize / modelWidth ; % scale of each detection relative to the template
7 | 
8 | features = {} ;
9 | for i=1:size(detections,2)
10 |   [~,j] = min(abs(s(i) - scales)) ; % index of the nearest pyramid level
11 | 
12 |   hx = (detections(1,i) - 0.5) / hogCellSize / s(i) + 1 ;
13 |   hy = (detections(2,i) - 0.5) / hogCellSize / s(i) + 1 ;
14 |   sx = round(hx) + (0:modelWidth-1) ;
15 |   sy = round(hy) + (0:modelHeight-1) ;
16 | 
17 |   features{end+1} = hog{j}(sy, sx, :) ;
18 | end
19 | features = cat(4, features{:}) ;
20 | 
--------------------------------------------------------------------------------
/loadData.m:
--------------------------------------------------------------------------------
1 | function loadData(targetClass, numPosImages, numNegImages)
2 | % LOADDATA Load data for the exercises
3 | % LOADDATA(TARGETCLASS) loads the data configuring it to train
4 | % the specified target class. TARGETCLASS is a vector of one or more
5 | % labels. If more than one label is specified, then multiple classes
6 | % are merged into one.
7 | %
8 | % LOADDATA(TARGETCLASS, NUMPOSIMAGES, NUMNEGIMAGES) allows specifying
9 | % the number of positive and negative images too.
10 | %
11 | % The following variables are created in the workspace:
12 | %
13 | % - trainImages: list of training image names.
14 | % - trainBoxes: 4 x N array of object bounding boxes.
15 | % - trainBoxImages: for each box, the corresponding image.
16 | % - trainBoxLabels: the class label of the box (one of TARGETCLASS).
17 | % - trainBoxPatches: 64 x 64 x 3 x N array of box patches.
18 | % 19 | % The same for the test data. 20 | 21 | if nargin < 2 22 | numPosImages = 20 ; 23 | end 24 | 25 | if nargin < 3 26 | numNegImages = 20 ; 27 | end 28 | 29 | load('data/signs.mat', ... 30 | 'trainImages', ... 31 | 'trainBoxes', ... 32 | 'trainBoxImages', ... 33 | 'trainBoxLabels', ... 34 | 'trainBoxPatches', ... 35 | 'testImages', ... 36 | 'testBoxes', ... 37 | 'testBoxImages', ... 38 | 'testBoxLabels', ... 39 | 'testBoxPatches') ; 40 | 41 | 42 | if isstr(targetClass) 43 | switch lower(targetClass) 44 | case 'prohibitory', targetClass = [0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 15, 16] ; 45 | case 'mandatory', targetClass = [33, 34, 35, 36, 37, 38, 39, 40] ; 46 | case 'danger', targetClass = [11, 18, 19, 20 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] ; 47 | end 48 | end 49 | 50 | % Select only the target class 51 | ok = ismember(trainBoxLabels, targetClass) ; 52 | trainBoxes = trainBoxes(:,ok) ; 53 | trainBoxImages = trainBoxImages(ok) ; 54 | trainBoxLabels = trainBoxLabels(ok) ; 55 | trainBoxPatches = trainBoxPatches(:,:,:,ok) ; 56 | 57 | ok = ismember(testBoxLabels, targetClass) ; 58 | testBoxes = testBoxes(:,ok) ; 59 | testBoxImages = testBoxImages(ok) ; 60 | testBoxLabels = testBoxLabels(ok) ; 61 | testBoxPatches = testBoxPatches(:,:,:,ok) ; 62 | 63 | % Select a subset of training and testing images 64 | [~,perm] = sort(ismember(trainImages, trainBoxImages),'descend') ; 65 | trainImages = trainImages(vl_colsubset(perm', numPosImages, 'beginning')) ; 66 | 67 | [~,perm] = sort(ismember(testImages, testBoxImages),'descend') ; 68 | testImages = testImages(vl_colsubset(perm', numNegImages, 'beginning')) ; 69 | 70 | vars = {... 71 | 'trainImages', ... 72 | 'trainBoxes', ... 73 | 'trainBoxImages', ... 74 | 'trainBoxLabels', ... 75 | 'trainBoxPatches', ... 76 | 'testImages', ... 77 | 'testBoxes', ... 78 | 'testBoxImages', ... 79 | 'testBoxLabels', ... 80 | 'testBoxPatches', ... 81 | 'targetClass'} ; 82 | 83 | for i = 1:numel(vars) 84 | assignin('caller',vars{i},eval(vars{i})) ; 85 | end 86 | -------------------------------------------------------------------------------- /setup.m: -------------------------------------------------------------------------------- 1 | function setup(varargin) 2 | % SETUP Add the required search paths to MATLAB 3 | run matconvnet/matlab/vl_setupnn ; 4 | run vlfeat/toolbox/vl_setup ; 5 | 6 | opts.useGpu = false ; 7 | opts.verbose = false ; 8 | opts.enableImReadJPEG = false ; 9 | opts = vl_argparse(opts, varargin) ; 10 | 11 | try 12 | vl_nnconv(single(1),single(1),[]) ; 13 | catch 14 | warning('VL_NNCONV() does not seem to be compiled. Trying to compile it now.') ; 15 | vl_compilenn('enableGpu', opts.useGpu, ... 16 | 'enableImReadJPEG', opts.enableImReadJPEG, ... 17 | 'verbose', opts.verbose) ; 18 | end 19 | 20 | if opts.useGpu 21 | try 22 | vl_nnconv(gpuArray(single(1)),gpuArray(single(1)),[]) ; 23 | catch 24 | vl_compilenn('enableGpu', opts.useGpu, ... 25 | 'enableImReadJPEG', opts.enableImReadJPEG, ... 26 | 'verbose', opts.verbose) ; 27 | warning('GPU support does not seem to be compiled in MatConvNet. Trying to compile it now') ; 28 | end 29 | end 30 | --------------------------------------------------------------------------------