├── .gitignore
├── .gitmodules
├── README.md
├── addCustomLossLayer.m
├── checkDerivativeNumerically.m
├── data
│   ├── crab.jpg
│   └── ray.jpg
├── doc
│   ├── images
│   │   ├── conv.png
│   │   ├── conv.svg
│   │   ├── cover.png
│   │   ├── mathworks_logo.png
│   │   ├── matlab_set_shortcuts.svg
│   │   ├── nvidia_logo.svg
│   │   ├── oxford.png
│   │   ├── step1.png
│   │   ├── step2.png
│   │   ├── step3.png
│   │   ├── step4.png
│   │   └── text.png
│   ├── instructions.html
│   └── instructions.md
├── exercise1.m
├── exercise2.m
├── exercise3.m
├── extra
│   ├── Makefile
│   ├── getBlurredImagesData.m
│   ├── post.sh
│   └── preprocess.m
├── getBatch.m
├── initializeLargeCNN.m
├── initializeSmallCNN.m
├── l1LossBackward.m
├── l1LossForward.m
├── l2LossBackward.m
├── l2LossForward.m
├── proj.m
├── setup.m
├── showDeblurringResult.m
├── showFeatureChannels.m
└── xavier.m
/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | data/
3 | local
4 | local/
5 | base.css
6 | doc/prism.css
7 | doc/prism.js
8 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "extra/practical"]
2 | path = extra/practical
3 | url = git@github.com:vedaldi/practical.git
4 | [submodule "matconvnet"]
5 | path = matconvnet
6 | url = git@github.com:vlfeat/matconvnet.git
7 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Convolutional neural network practical (2)
2 | ==========================================
3 |
4 | A computer vision practical by the Oxford Visual Geometry group,
5 | authored by Andrea Vedaldi, Karel Lenc, and Joao Henriques.
6 |
7 | Start from `doc/instructions.html`.
8 |
9 | > Note that this practical requires compiling the (included)
10 | > MatConvNet library. This should happen automatically (see the
11 | > `setup.m` script), but make sure that the compilation succeeds on
12 | > the laboratory computers.
13 |
14 | Package contents
15 | ----------------
16 |
17 | The practical consists of three exercises, organized in the following
18 | files:
19 |
20 | * `exercise1.m` -- Part 1: Building blocks: convolution and ReLU
21 | * `exercise2.m` -- Part 2: Derivatives and backpropagation
22 | * `exercise3.m` -- Part 3: Learning a CNN for text deblurring
23 |
24 | The practical runs in MATLAB and uses
25 | [MatConvNet](http://www.vlfeat.org/matconvnet). This package contains
26 | the following MATLAB functions:
27 |
28 | * `checkDerivativeNumerically.m`: check a layer's derivatives numerically.
29 | * `l1LossForward.m`, `l1LossBackward.m`, `l2LossForward.m`, and `l2LossBackward.m`: code (partially) implementing custom loss layers.
30 | * `getBatch.m`: get a batch of images for training.
31 | * `addCustomLossLayer.m`: add a custom loss layer (in SimpleNN format) to a network.
32 | * `initializeSmallCNN.m` and `initializeLargeCNN.m`: initialize CNN models for text deblurring.
33 | * `setup.m`: setup MATLAB environment.
34 | * `showDeblurringResult.m`: show results for the deblurring network.
35 | * `showFeatureChannels.m`: show the feature channels in a tensor.
36 | * `xavier.m`: Xavier initialization of the network weights.
37 |
38 | Appendix: Installing from scratch
39 | ---------------------------------
40 |
41 | The practical requires both VLFeat and MatConvNet. VLFeat comes with
42 | pre-built binaries, but MatConvNet does not.
43 |
44 | 0. Set the current directory to the practical base directory.
45 | 1. From Bash:
46 | 1. Run `git submodule update --init` to download the submodules.
47 | 2. Run `make -f ./extra/Makefile preproc`. This will create a copy
48 | of the data for the practical.
49 | 2. From MATLAB run `addpath extra ; preprocess ;`. This will create
50 | `data/text_imdb.mat`.
51 | 3. Test the practical: from MATLAB run all the exercises in order.
52 |
53 | Changes
54 | -------
55 |
56 | * *2016a* - Initial edition
57 |
58 | License
59 | -------
60 |
61 | Copyright (c) 2016 Andrea Vedaldi, Karel Lenc, and Joao Henriques
62 |
63 | Permission is hereby granted, free of charge, to any person
64 | obtaining a copy of this software and associated documentation
65 | files (the "Software"), to deal in the Software without
66 | restriction, including without limitation the rights to use, copy,
67 | modify, merge, publish, distribute, sublicense, and/or sell copies
68 | of the Software, and to permit persons to whom the Software is
69 | furnished to do so, subject to the following conditions:
70 |
71 | The above copyright notice and this permission notice shall be
72 | included in all copies or substantial portions of the Software.
73 |
74 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
75 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
76 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
77 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
78 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
79 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
80 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
81 | DEALINGS IN THE SOFTWARE.
82 |
--------------------------------------------------------------------------------
/addCustomLossLayer.m:
--------------------------------------------------------------------------------
1 | function net = addCustomLossLayer(net, fwfun, bwfun)
2 | %ADDCUSTOMLOSSLAYER Add a custom loss layer to a network
3 | % NET = ADDCUSTOMLOSSLAYER(NET, FWFUN, BWFUN) adds a custom loss
4 | % layer to the network NET, using FWFUN for the forward pass and
5 | % BWFUN for the backward pass.
6 |
7 | layer.name = 'loss' ;
8 | layer.type = 'custom' ;
9 | layer.forward = @forward ;
10 | layer.backward = @backward ;
11 | layer.class = [] ;
12 |
13 | % Make sure that the loss layer is not added multiple times
14 | if strcmp(net.layers{end}.name, layer.name)
15 | net.layers{end} = layer ;
16 | else
17 | net.layers{end+1} = layer ;
18 | end
19 |
20 | function res_ = forward(layer, res, res_)
21 | res_.x = fwfun(res.x, layer.class) ;
22 | end
23 |
24 | function res = backward(layer, res, res_)
25 | res.dzdx = bwfun(res.x, layer.class, res_.dzdx) ;
26 | end
27 | end
28 |
29 |
30 |
--------------------------------------------------------------------------------
/checkDerivativeNumerically.m:
--------------------------------------------------------------------------------
1 | function err = checkDerivativeNumerically(f, x, dx)
2 | %CHECKDERIVATIVENUMERICALLY Check a layer's derivative numerically
3 | % ERR = CHECKDERIVATIVENUMERICALLY(F, X, DX) takes the scalar function F,
4 | % its tensor input X and its derivative DX at X and compares DX to
5 | % a numerical approximation of the derivative, returning their difference
6 | % ERR.
7 |
8 | y = f(x) ;
9 | dx_numerical = zeros(size(dx), 'single') ;
10 | delta = 0.01 ;
11 |
12 | for n = 1:size(x,4)
13 | for k = 1:size(x,3)
14 | for j = 1:size(x,2)
15 | for i = 1:size(x,1)
16 | xp = x ;
17 | xp(i,j,k,n) = xp(i,j,k,n) + delta ;
18 | yp = f(xp) ;
19 | dx_numerical(i,j,k,n) = (yp - y) / delta ;
20 | end
21 | end
22 | end
23 | end
24 | err = dx_numerical - dx ;
25 |
26 | range = max(abs(dx(:))) * [-1 1] ;
27 | T = size(x,4) ;
28 | for t = 1:size(x,4)
29 | subplot(T,3,1+(t-1)*3) ; bar3(dx(:,:,1,t)) ; zlim(range) ;
30 | title(sprintf('dx(:,:,1,%d) (given)',t)) ;
31 | subplot(T,3,2+(t-1)*3) ; bar3(dx_numerical(:,:,1,t)) ; zlim(range) ;
32 | title(sprintf('dx(:,:,1,%d) (numerical)',t)) ;
33 | subplot(T,3,3+(t-1)*3) ; bar3(abs(err(:,:,1,t))) ; zlim(range) ;
34 | title('absolute difference') ;
35 | end
36 |
--------------------------------------------------------------------------------
/data/crab.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/data/crab.jpg
--------------------------------------------------------------------------------
/data/ray.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/data/ray.jpg
--------------------------------------------------------------------------------
/doc/images/conv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/conv.png
--------------------------------------------------------------------------------
/doc/images/conv.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
131 |
--------------------------------------------------------------------------------
/doc/images/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/cover.png
--------------------------------------------------------------------------------
/doc/images/mathworks_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/mathworks_logo.png
--------------------------------------------------------------------------------
/doc/images/matlab_set_shortcuts.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
744 |
--------------------------------------------------------------------------------
/doc/images/nvidia_logo.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
33 |
--------------------------------------------------------------------------------
/doc/images/oxford.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/oxford.png
--------------------------------------------------------------------------------
/doc/images/step1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/step1.png
--------------------------------------------------------------------------------
/doc/images/step2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/step2.png
--------------------------------------------------------------------------------
/doc/images/step3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/step3.png
--------------------------------------------------------------------------------
/doc/images/step4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/step4.png
--------------------------------------------------------------------------------
/doc/images/text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/text.png
--------------------------------------------------------------------------------
/doc/instructions.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
Convolutional neural networks are an important class of learnable representations applicable, among others, to numerous computer vision problems. Deep CNNs, in particular, are composed of several layers of processing, each involving linear as well as non-linear operators, that are learned jointly, in an end-to-end manner, to solve a particular task. These methods are now the dominant approach for feature extraction from audiovisual and textual data.
19 |
This practical explores the basics of learning (deep) CNNs. The first part introduces typical CNN building blocks, such as ReLU units and linear filters. The second part explores backpropagation, including designing custom layers and verifying them numerically. The last part demonstrates learning a CNN for text deblurring; this differs from the usual problem of image classification and demonstrates the flexibility of these techniques.
20 |
This practical is based on MATLAB and the MatConvNet library. The practical demonstrates how easy it is to use this environment to prototype new network components and architectures. By only using familiar MATLAB syntax, you will be able to implement new layers and take advantage of the GPU for faster computation.
You can either unpack the archive manually, or use the following MATLAB one-liner:
86 |
untar('http://www.robots.ox.ac.uk/~vgg/share/practical-cnn-reg-2016a.tar.gz')
87 | cd practical-cnn-reg-2016a
88 |
89 |
90 |
91 |
Getting started
92 |
After the installation is complete, open and edit the script exercise1.m in the MATLAB editor. The script contains commented code and a description for all steps of this exercise, for Part I of this document. You can cut and paste this code into the MATLAB window to run it, or use the shortcut Ctrl+Enter to run a code section. You will need to modify it as you go through the session. The other files, exercise2.m and exercise3.m, are given for Parts II and III.
93 |
Each part contains several Questions (that may require pen and paper) and Tasks (that require experimentation or coding) to be answered/completed before proceeding further in the practical.
94 |
Part 1: CNN building blocks
95 |
In this part we will explore two fundamental building blocks of CNNs, linear convolution and non-linear activation functions. Open exercise1.m and run up to the setup() command, which initializes the MATLAB environment to use MatConvNet.
96 |
Part 1.1: Convolution
97 |
A convolutional neural network (CNN) is a sequence of linear and non-linear convolution-like operators. The most important example of such operators is linear convolution. In this part, we will explore linear convolution and see how to use it in MatConvNet.
98 |
Recall that linear convolution applies one (or more) filters $\bw$ to an image $\bx$ as follows:
99 |
100 |
Part 1.1.1: Convolution by a single filter
101 |
Start by identifying and then running the following code fragment in exercise1.m:
102 |
% Load an image and convert it to gray scale and single precision
103 | x = im2single(rgb2gray(imread('data/ray.jpg'))) ;
104 |
105 | % Define a filter
106 | w = single([
107 | 0 -1 -0
108 | -1 4 -1
109 | 0 -1 0]) ;
110 |
111 | % Apply the filter to the image
112 | y = vl_nnconv(x, w, []) ;
113 |
114 |
115 |
The code loads the image data/ray.jpg and applies to it a linear filter using the linear convolution operator. The latter is implemented by the MatConvNet function vl_nnconv(). Note that all variables x, w, and y are in single precision; while MatConvNet supports double precision arithmetic too, single precision is usually preferred in applications as memory is often a bottleneck. The result can be visualized as follows:
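% Visualize the results
figure(11) ; clf ; colormap gray ;
set(gcf, 'name', 'Part 1.1: convolution') ;

subplot(2,2,1) ;
imagesc(x) ;
axis off image ;
title('Input image x') ;

subplot(2,2,2) ;
imagesc(w) ;
axis off image ;
title('Filter w') ;

subplot(2,2,3) ;
imagesc(y) ;
axis off image ;
title('Output image y') ;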
Task: Run the code above and examine the result, which should look like the following image:
138 |
139 |
140 |
The input $\bx$ is an $M \times N$ matrix, which can be interpreted as a gray scale image. The filter $\bw$ is the $3 \times 3$ matrix
141 | $$
142 | \bw =
143 | \begin{bmatrix}
144 | 0 & -1 & 0 \\
145 | -1 & 4 & -1 \\
146 | 0 & -1 & 0 \\
147 | \end{bmatrix}
148 | $$
149 | The output of the convolution is a new matrix $\by$ given by[1]
150 | $$
151 | y_{ij} = \sum_{uv} w_{uv}\ x_{i+u,\ j+v}
152 | $$
153 |
154 |
Questions:
155 |
156 |
If $H \times W$ is the size of the input image, $H' \times W'$ the size of the filter, what is the size $H'' \times W''$ of the output image?
157 |
The filter $\bw$ given above is a discretized Laplacian operator. Which type of visual structures (corners, bars, ...) do you think may excite this filter the most?
158 |
159 |
160 |
Part 1.1.2: Convolution by a filter bank
161 |
In neural networks, one usually operates with filter banks instead of individual filters. Each filter can be thought of as computing a different feature channel, characterizing a particular statistical property of the input image.
162 |
To see how to define and use a filter bank, create a bank of three filters as follows:
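% Concatenate three filters in a bank
w1 = single([
   0 -1  0
  -1  4 -1
   0 -1  0]) ;

w2 = single([
  -1 0 +1
  -1 0 +1
  -1 0 +1]) ;

w3 = single([
  -1 -1 -1
   0  0  0
  +1 +1 +1]) ;

wbank = cat(4, w1, w2, w3) ;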
The first filter $\bw_1$ is the Laplacian operator seen above; two additional filters $\bw_2$ and $\bw_3$ are horizontal and vertical image derivatives, respectively. The command vl_nnconv(x, wbank, []) then applies all the filters in the bank to the input image x. Note that the output y is not just a matrix, but a 3D array (often called a tensor in the CNN jargon). This tensor has dimensions $H \times W \times K$, where $K$ is the number of feature channels.
183 |
184 |
Question: What is the number of feature channels $K$ in this example? Why?
185 |
Task: Run the code above and visualize the individual feature channels in the tensor y by using the provided function showFeatureChannels(). Do the channel responses make sense given the filter used to generate them?
186 |
187 |
In a CNN, not only the output tensor, but also the input tensor x and the filters wbank can have multiple feature channels. In this case, the convolution formula becomes:
188 | $$
189 | y_{ijk} = \sum_{uvp} w_{uvpk}\ x_{i+u,\ j+v,\ p}
190 | $$
191 |
192 |
Questions:
193 |
194 |
If the input tensor $\bx$ has $C$ feature channels, what should be the third dimension of $\bw$?
195 |
In the code above, the command wbank = cat(4, w1, w2, w3) concatenates the tensors w1, w2, and w3 along the fourth dimension. Why is that given that filters should have three dimensions?
196 |
197 |
198 |
Part 1.1.3: Convolving a batch of images
199 |
Finally, in training CNNs it is often important to be able to work efficiently with batches of data. MatConvNet allows packing more than one instance of the tensor $\bx$ in a single MATLAB array x by stacking the different instances along the fourth dimension of the array:
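x1 = im2single(rgb2gray(imread('data/ray.jpg'))) ;
x2 = im2single(rgb2gray(imread('data/crab.jpg'))) ;
x = cat(4, x1, x2) ;

y = vl_nnconv(x, wbank, []) ;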
Task: Run the code above and visualize the result. Convince yourself that each filter is applied to each image.
209 |
210 |
Part 1.2: Non-linear activation (ReLU)
211 |
CNNs are obtained by composing several operators, individually called layers. In addition to convolution and other linear layers, CNNs should contain non-linear layers as well.
212 |
213 |
Question: What happens if all layers are linear?
214 |
215 |
The simplest non-linearity is given by scalar activation functions, which are applied independently to each element of a tensor. Perhaps the simplest and one of the most useful examples is the Rectified Linear Unit (ReLU) operator:
216 | $$
217 | y_{ijk} = \max \{0, x_{ijk}\}
218 | $$
219 | which simply cuts off any negative value in the data.
220 |
In MatConvNet, ReLU is implemented by the vl_nnrelu function. To demonstrate its use, we convolve the test image with the negated Laplacian, and then apply ReLU to the result:
221 |
% Convolve with the negated Laplacian
222 | y = vl_nnconv(x, - w, []) ;
223 |
224 | % Apply the ReLU operator
225 | z = vl_nnrelu(y) ;
226 |
227 |
228 |
229 |
Task: Run this code and visualize images x, y, and z.
230 |
Questions:
231 |
232 |
Which kind of image structures are preferred by this filter?
233 |
Why did we negate the Laplacian?
234 |
235 |
236 |
ReLU has a very important effect as it implicitly sets to zero the majority of the filter responses. In a certain sense, ReLU works as a detector, with the implicit convention that a certain pattern is detected when a corresponding filter response is large enough (greater than zero).
237 |
In practice, while signals are usually centered and therefore a threshold of zero is reasonable, there is no particular reason why this should always be appropriate. For this reason, the convolution operator allows specifying a bias term for each filter response. Let us use this term to make the response of ReLU more selective:
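bias = single(- 0.2) ;
y = vl_nnconv(x, - w, bias) ;
z = vl_nnrelu(y) ;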
There is only one bias term because there is only one filter in the bank (note that, as for the rest of the data, bias is a single precision quantity). The bias is applied after convolution, effectively subtracting 0.2 from the filter responses. Hence, a response now survives the subsequent ReLU operator only if it is at least 0.2 after convolution.
244 |
245 |
Task: Run this code and visualize images x, y, and z.
246 |
Question: Is the response now more selective?
247 |
Remark: There are many other building blocks used in CNNs, the most important of which is perhaps max pooling. However, convolution and ReLU can already solve many problems, as we will see in the remainder of the practical.
248 |
249 |
Part 2: Backpropagation
250 |
Training CNNs is normally done using a gradient-based optimization method. The CNN $f$ is the composition of $L$ layers $f_l$ each with parameters $\bw_l$, which in the simplest case of a chain looks like:
251 | $$
252 | \bx_0
253 | \longrightarrow
254 | \underset{\displaystyle\underset{\displaystyle\bw_1}{\uparrow}}{\boxed{f_1}}
255 | \longrightarrow
256 | \bx_1
257 | \longrightarrow
258 | \underset{\displaystyle\underset{\displaystyle\bw_2}{\uparrow}}{\boxed{f_2}}
259 | \longrightarrow
260 | \bx_2
261 | \longrightarrow
262 | \dots
263 | \longrightarrow
264 | \bx_{L-1}
265 | \longrightarrow
266 | \underset{\displaystyle\underset{\displaystyle\bw_L}{\uparrow}}{\boxed{f_L}}
267 | \longrightarrow
268 | \bx_L
269 | $$
270 | During learning, the last layer of the network is the loss function that should be minimized. Hence, the output $\bx_L = x_L$ of the network is a scalar quantity (a single number).
271 |
The gradient is easily computed using the chain rule. If all network variables and parameters are scalar, this is given by[2]:
272 | $$
273 | \frac{\partial f}{\partial w_l}(x_0;w_1,\dots,w_L)
274 | =
275 | \frac{\partial f_L}{\partial x_{L-1}}(x_{L-1};w_L) \times
276 | \cdots
277 | \times
278 | \frac{\partial f_{l+1}}{\partial x_l}(x_l;w_{l+1}) \times
279 | \frac{\partial f_{l}}{\partial w_l}(x_{l-1};w_l)
280 | $$
281 | With tensors, however, there are some complications. Consider for instance the derivative of a function $\by=f(\bx)$ where both $\by$ and $\bx$ are tensors; this is formed by taking the derivative of each scalar element in the output $\by$ with respect to each scalar element in the input $\bx$. If $\bx$ has dimensions $H \times W \times C$ and $\by$ has dimensions $H' \times W' \times C'$, then the derivative contains $HWCH'W'C'$ elements, which is often unmanageable (in the order of several GBs of memory for a single derivative).
282 |
Note that all intermediate derivatives in the chain rule may be affected by this size explosion except for the derivative of the network output that, being the loss, is a scalar.
283 |
284 |
Question: The output derivatives have the same size as the parameters in the network. Why?
285 |
286 |
Back-propagation allows computing the output derivatives in a memory-efficient manner. To see how, the first step is to generalize the equation above to tensors using a matrix notation. This is done by converting tensors into vectors by using the $\vv$ (stacking)[3] operator:
287 | $$
288 | \frac{\partial \vv f}{\partial \vv^\top \bw_l}
289 | =
290 | \frac{\partial \vv f_L}{\partial \vv^\top \bx_{L-1}} \times
291 | \cdots
292 | \times
293 | \frac{\partial \vv f_{l+1}}{\partial \vv^\top \bx_l} \times
294 | \frac{\partial \vv f_{l}}{\partial \vv^\top \bw_l}
295 | $$
296 | In order to make this computation memory efficient, we project the derivative with respect to a tensor $\bp_L = 1$ as follows:
297 | $$
298 | (\vv \bp_L)^\top \times \frac{\partial \vv f}{\partial \vv^\top \bw_l}
299 | =
300 | (\vv \bp_L)^\top
301 | \times
302 | \frac{\partial \vv f_L}{\partial \vv^\top \bx_{L-1}} \times
303 | \cdots
304 | \times
305 | \frac{\partial \vv f_{l+1}}{\partial \vv^\top \bx_l} \times
306 | \frac{\partial \vv f_{l}}{\partial \vv^\top \bw_l}
307 | $$
Note that $\bp_L=1$ has the same dimension as $\bx_L$ (the scalar loss) and, being equal to 1, multiplying it to the left of the expression does not change anything. Things are more interesting when products are evaluated from left to right, i.e. backward from the output to the input of the CNN. The first such factor is given by:
309 | \begin{equation}
310 | \label{e:factor}
311 | (\vv \bp_{L-1})^\top = (\vv \bp_L)^\top
312 | \times
313 | \frac{\partial \vv f_L}{\partial \vv^\top \bx_{L-1}}
314 | \end{equation}
315 | This results in a new projection vector $\bp_{L-1}$, which can then be multiplied from the left to obtain $\bp_{L-2}$ and so on. The last projection $\bp_l$ is the desired derivative. Crucially, each projection $\bp_q$ takes as much memory as the corresponding variable $\bx_q$.
316 |
Some might have noticed that, while projections remain small, each factor \eqref{e:factor} does contain one of the large derivatives that we cannot compute explicitly. The trick is that CNN toolboxes contain code that can compute the projected derivatives without explicitly computing this large factor. In particular, for any building block function $\by=f(\bx;\bw)$, a toolbox such as MatConvNet will implement:
317 |
318 |
A forward mode computing the function $\by=f(\bx;\bw)$.
319 |
A backward mode computing the derivatives of the projected function $\langle \bp, f(\bx;\bw) \rangle$ with respect to the input $\bx$ and parameter $\bw$:
y = vl_nnrelu(x) ;
334 | p = randn(size(y), 'single') ;
335 | dx = vl_nnrelu(x,p) ;
336 |
337 |
338 |
Part 2.1: Backward mode verification
339 |
Implementing new layers in a network is conceptually simple, but error prone. A simple way of testing a layer is to check whether the derivatives computed using the backward mode approximately match the derivatives computed numerically using the forward mode. The next example, contained in the file exercise2.m, shows how to do this:
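The following is a sketch of such a check (the input sizes are illustrative assumptions; see exercise2.m for the actual code):

% Create a test input and filter (sizes are assumptions)
x = randn(5, 5, 1, 1, 'single') ;
w = randn(3, 3, 1, 1, 'single') ;

% Forward mode: evaluate the convolution
y = vl_nnconv(x, w, []) ;

% Pick a random projection tensor p
p = randn(size(y), 'single') ;

% Backward mode: projected derivative with respect to x
dx = vl_nnconv(x, w, [], p) ;

% Compare dx to the numerical derivative of the projected function
err = checkDerivativeNumerically(@(x) proj(p, vl_nnconv(x, w, [])), x, dx) ;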
Recall that the derivative of a function $y=f(x)$ is given by
359 | $$
360 | \frac{\partial f}{\partial x}(x) = \lim_{\delta\rightarrow 0} \frac{f(x+\delta) - f(x)}{\delta}
361 | $$
362 | Open the file checkDerivativeNumerically.m. Can you identify the lines in this file that use this expression?
363 |
Note that checkDerivativeNumerically() is applied to the function @(x) proj(p, vl_nnconv(x, w, [])). This syntax defines a function on the fly (an anonymous closure to be more precise). In this case, the purpose of the closure is to evaluate the expression for a variable x and a fixed value of w. Furthermore, the closure projects the output of vl_nnconv() onto p by calling the proj() function. Why?
364 |
365 |
Tasks:
366 |
367 |
Run the code, visualizing the results. Convince yourself that the numerical and analytical derivatives are nearly identical.
368 |
Modify the code to compute the derivative of the first element of the output tensor $\by$ with respect to all the elements of the input tensor $\bx$. Hint: it suffices to change the value of $\bp$.
369 |
Modify the code to compute the derivative with respect to the convolution parameters $\bw$ instead of the convolution input $\bx$.
370 |
371 |
372 |
Part 2.2: Backpropagation
373 |
Next, we use the backward mode of convolution and ReLU to implement backpropagation in a network that consists of two layers:
374 |
% Forward mode: evaluate conv followed by ReLU
375 | y = vl_nnconv(x, w, []) ;
376 | z = vl_nnrelu(y) ;
377 |
378 | % Pick a random projection tensor
379 | p = randn(size(z), 'single') ;
380 |
381 | % Backward mode: projected derivatives
382 | dy = vl_nnrelu(z, p) ;
383 | [dx,dw] = vl_nnconv(x, w, [], dy) ;
384 |
385 |
386 |
387 |
Question (important): In the code above, in the backward mode the projection p is fed to the vl_nnrelu operator, whereas the vl_nnconv operator receives dy as its projection. Why?
388 |
Tasks:
389 |
390 |
Run the code and use checkDerivativeNumerically() to compare the analytical and numerical derivatives. Do they differ?
391 |
(Optional) Modify the code above to a chain of three layers: conv + ReLU + conv.
392 |
393 |
394 |
Part 2.3: Design and verify your own layer
395 |
Creating new layers is a common task when experimenting with novel CNN architectures. MatConvNet makes this particularly easy, since you can use all standard MATLAB operators and functions. The same code also works on the GPU.
396 |
In this part we will show how to implement a layer computing the Euclidean distance between a tensor x and a reference tensor r; your goal will then be to implement the absolute difference (L1) loss. This layer will be used later to learn a CNN from data.
397 |
The first step is to write the forward mode. This is contained in the l2LossForward.m function. Open the file and check its content:
398 |
function y = l2LossForward(x,r)
399 | delta = x - r ;
400 | y = sum(delta(:).^2) ;
401 |
402 |
403 |
The function computes the difference x - r, squares the individual elements (.^2), and then sums the results. The vectorization delta(:) just turns the tensor into a vector by stacking, so that the sum is carried across all elements (by default sum operates only along the first dimension). The overall result is a scalar y, which is the sum of the squared Euclidean distances between x and r, for all data instances.
404 |
Next, we need to implement the backward mode:
405 |
function dx = l2LossBackward(x,r,p)
406 | dx = 2 * p * (x - r) ;
407 |
408 |
409 |
Note that the backward mode takes the projection tensor p as an additional argument. Let us show that this code is correct. Recall that the goal of the backward mode is to compute the derivative of the projected function:
$$
\langle \bp, f(\bx) \rangle = p \sum_{lmnt} (x_{lmnt} - r_{lmnt})^2.
$$
Here the subscript $t$ indexes the data instance in the batch; note that, since this function computes the sum of Euclidean distances for all tensor instances, the output $f(\bx)$ is a scalar, and so is the projection $\bp = p$.
415 |
In order to see how to implement the backward mode, compute the derivative with respect to each input element $x_{ijkt}$ (note that $p$ is constant):
$$
\frac{\partial}{\partial x_{ijkt}}
\langle \bp, f(\bx) \rangle
=
\frac{\partial}{\partial x_{ijkt}}
\left[
p \sum_{lmnt} (x_{lmnt} - r_{lmnt})^2
\right]
= 2 p \,(x_{ijkt} - r_{ijkt}),
$$
which is exactly what l2LossBackward() computes.
Verify that the forward and backward functions are correct by computing the derivatives numerically using checkDerivativeNumerically() (a sketch is given after this list).
425 |
Implement the l1LossForward.m and l1LossBackward.m to compute the L1 distance (sum of absolute differences):
426 | $$
427 | f(\bx) = \sum_{lmnt} \lvert x_{lmnt} - r_{lmnt} \rvert.
428 | $$
429 | In order to implement the backward pass, you need to find
430 | $$
431 | \frac{\partial}{\partial x_{ijkt}}
432 | \langle \bp, f(\bx) \rangle
433 | =
434 | \frac{\partial}{\partial x_{ijkt} }
435 | \left[
436 | p \sum_{lmnt} \lvert x_{lmnt} - r_{lmnt} \rvert
437 | \right].
438 | $$
439 | Recall that for ${v} \neq 0$:
440 | $$
441 | \frac{\partial |v|}{\partial v} = \begin{cases} -1 & v<0 \\ 1 & v>0 \end{cases}.
442 | $$
443 |
Make sure that both the forward and backward modes are correctly modified by verifying the result numerically once more. What happens for the components of $\bx$ that are zero or very close to zero?
444 |
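A sketch of the numerical verification requested in the first task above (the input sizes and the random reference r are illustrative assumptions):

% Pick a test input and a reference tensor
x = randn(4, 4, 1, 2, 'single') ;
r = randn(size(x), 'single') ;

% The loss is a scalar, so the projection p is a scalar too
p = 1 ;

% Forward and backward modes of the L2 loss
y = l2LossForward(x, r) ;
dx = l2LossBackward(x, r, p) ;

% Compare dx to the numerical derivative of the projected function
err = checkDerivativeNumerically(@(x) p * l2LossForward(x, r), x, dx) ;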
445 |
446 |
Part 3: Learning a CNN for text deblurring
447 |
By now you should be familiar with two basic CNN layers, convolution and ReLU, as well as with the idea of backpropagation. In this part, we will build on such concepts to learn a CNN model.
448 |
CNNs are often used for classification; however, they are much more general than that. In order to demonstrate their flexibility, here we will design a CNN that takes an image as input and produces an image as output (instead of a class label).
449 |
We will consider in particular the problem of deblurring images of text, as in the following example:
450 |
451 |
Part 3.1: Preparing the data
452 |
The first task is to load the training and validation data and to understand its format. Start by opening exercise3.m in your MATLAB editor. The code responsible for loading the data is
453 |
imdb = load('data/text_imdb.mat') ;
454 |
455 |
456 |
The variable imdb is a structure containing $n$ images, which will be used for training and validation. The structure has the following fields:
457 |
458 |
imdb.images.data: a $64 \times 64 \times 1 \times n$ array of grayscale blurred images.
459 |
imdb.images.label: a $64 \times 64 \times 1 \times n$ array of grayscale sharp images.
460 |
imdb.images.set: a $1 \times n$ vector containing a 1 for training images and a 2 for validation images. 75% of the images are used for training and 25% for validation.
461 |
462 |
Run the following code, which displays the first image in the dataset and its label:
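A sketch of such a visualization (the figure number and titles are assumptions):

figure(31) ; clf ; colormap gray ;
set(gcf, 'name', 'Part 3.1: Data') ;
subplot(1,2,1) ; imagesc(imdb.images.data(:,:,:,1)) ;
axis image off ; title('Input (blurred)') ;
subplot(1,2,2) ; imagesc(imdb.images.label(:,:,:,1)) ;
axis image off ; title('Desired output (sharp)') ;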
Task: make sure you understand the format of imdb. Use MATLAB to find out the number of training and validation images as well as the resolution (size) of each image.
476 |
477 |
It is often important to center the data to better condition the learning problem. This is usually obtained by subtracting the mean pixel intensity (computed from the training set) from each pixel. Here, however, pixels are rescaled and shifted to have values in the interval $[-1, 0]$.
478 |
479 |
Question: why was the interval $[-1, 0]$ chosen? Hint: what intensity corresponds to 'white'? What does the convolution operator do near the image boundaries?
480 |
481 |
Part 3.2: Defining a CNN architecture
482 |
Next we define a CNN net and initialize its weights randomly. A CNN is simply a collection of interlinked layers. While these can be assembled 'manually' as you did in Part 2, it is usually more convenient to use a wrapper.
483 |
MatConvNet contains two wrappers, SimpleNN and DagNN. SimpleNN is suitable for simple networks that are a chain of layers (as opposed to a more general graph). We will use SimpleNN here.
484 |
This wrapper defines the CNN as a structure net containing a cell-array layers listed in order of execution. Open initializeSmallCNN.m and find this code:
485 |
net.layers = { } ;
486 |
487 |
488 |
The first layer of the network is a convolution block, whose fields are described next (a code sketch follows the list):
name specifies a name for the layer, useful for debugging but otherwise arbitrary.
502 |
503 |
504 |
type specifies the layer type, in this case convolution.
505 |
506 |
507 |
weights is a cell array containing the layer parameters, in this case two tensors for the filters and the biases. The filters are initialized using the xavier() function to have dimensions $3 \times 3 \times 1 \times 32$ ($3\times 3$ spatial support, 1 input feature channel, and 32 filters). xavier() also initializes the biases to be zero.
508 |
509 |
510 |
pad specifies the amount of zero padding to apply to the layer input. By using a padding of one pixel and a $3\times 3$ filter support, the output of the convolution will have exactly the same height and width as the input.
511 |
512 |
513 |
learningRate contains two layer-specific multipliers to adjust the learning rate for the filters and the biases.
514 |
515 |
516 |
weightDecay contains two layer-specific multipliers to adjust the weight decay (regularization strength) for the layer filters and biases. Note that weight decay is not applied to the biases.
517 |
518 |
519 |
520 |
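A sketch of this block (the exact signature of xavier() and some option values are assumptions based on the description above):

% Hypothetical sketch of the first convolutional block
[w, b] = xavier(3, 3, 1, 32) ;      % assumed signature: 3x3x1x32 filters, zero biases
net.layers{end+1} = struct(...
  'name', 'conv1', ...
  'type', 'conv', ...
  'weights', {{w, b}}, ...          % filters and biases
  'pad', 1, ...                     % preserve the input height and width
  'stride', 1, ...
  'learningRate', [1 1], ...
  'weightDecay', [1 0]) ;           % no weight decay on the biases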
Question: what would happen if pad was set to zero?
521 |
522 |
The convolution layer is followed by ReLU, which is given simply by:
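A minimal sketch of this layer:

net.layers{end+1} = struct(...
  'name', 'relu1', ...
  'type', 'relu') ;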
This pattern is repeated (possibly varying the number and dimensions of filters) for a total of three convolutional layers separated by ReLUs.
529 |
530 |
Question: The last layer, generating the output image, is convolutional and is not followed by ReLU. Why?
531 |
532 |
The command vl_simplenn_display() can be used to print information about the network. Here is a subset of this information:
533 |
layer     |     0 |     1 |     2 |     3 |     4 |          5 |      6
----------+-------+-------+-------+-------+-------+------------+-------
type      | input |  conv |  relu |  conv |  relu |       conv | custom
name      |   n/a | conv1 | relu1 | conv2 | relu2 | prediction |   loss
support   |   n/a |     3 |     1 |     3 |     1 |          3 |      1
filt dim  |   n/a |     1 |   n/a |    32 |   n/a |         32 |    n/a
num filts |   n/a |    32 |   n/a |    32 |   n/a |          1 |    n/a
stride    |   n/a |     1 |     1 |     1 |     1 |          1 |      1
pad       |   n/a |     1 |     0 |     1 |     0 |          1 |      0
rf size   |   n/a |     3 |     3 |     5 |     5 |          7 |      7
Questions: Look carefully at the generated table and answer the following questions:
631 |
632 |
How many layers are in this network?
633 |
What is the support (height and width) and depth (number of feature channels) of each intermediate tensor?
634 |
How is the number of feature channels related to the dimensions of the filters?
636 |
637 |
638 |
The last row reports the receptive field size for each layer. This is the size (in pixels) of the local image region that affects a particular element in a feature map.
639 |
640 |
Question: what is the receptive field size of a pixel in the output image (generated by the prediction layer)? Discuss whether a larger receptive field size might be preferable for this problem and how this might be obtained.
641 |
642 |
Part 3.3: Learning the network
643 |
In this part we will use SGD to learn the CNN from the available training data. As noted above, the CNN must however terminate in a loss layer. We add one such layer as follows:
644 |
% Add a loss (using our custom layer)
645 | net = addCustomLossLayer(net, @l2LossForward, @l2LossBackward) ;
646 |
647 |
648 |
The function addCustomLossLayer() creates a layer structure compatible with SimpleNN and adds it as the last layer of the network. This structure contains handles to the functions defined in Part 2, namely l2LossForward() and l2LossBackward().
Learning is configured through a structure trainOpts of options (see exercise3.m); the most important options are described next, and a sketch of typical values follows the list. expDir specifies a directory to store intermediate data (snapshots and figures) as well as the final model. Note that the code resumes execution from the last snapshot; therefore change this directory or clear it if you want to start learning from scratch.
663 |
664 |
665 |
gpus contains a list of GPU IDs to use. For now, do not use any.
666 |
667 |
668 |
batchSize specifies how many images to include in a batch. Here we use 16.
669 |
670 |
671 |
learningRate is the learning rate in SGD.
672 |
673 |
674 |
plotDiagnostics can be used to plot statistics during training. This is slow, but can help in setting a reasonable learning rate. Leave it off for now.
675 |
676 |
677 |
numEpochs is the number of epochs (passes through the training data) to perform before SGD stops.
678 |
679 |
680 |
errorFunction is set so as to disable the default error functions, which are suitable for classification but not for our problem.
681 |
682 |
683 |
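A hypothetical configuration matching the options above (all values other than batchSize and numEpochs are illustrative assumptions):

trainOpts.expDir = 'data/text-small' ;  % assumed; matches the snapshot path below
trainOpts.gpus = [] ;                   % do not use a GPU for now
trainOpts.batchSize = 16 ;
trainOpts.learningRate = 0.01 ;         % assumed value
trainOpts.plotDiagnostics = false ;
trainOpts.numEpochs = 16 ;
trainOpts.errorFunction = 'none' ;      % assumed value; disables the classification errors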
Finally, we can invoke the learning code:
684 |
net = cnn_train(net, imdb, @getBatch, trainOpts) ;
685 |
686 |
687 |
The getBatch() function, passed as a handle, is particularly important. The training script cnn_train uses getBatch() to extract the images and corresponding labels for a certain batch, as follows:
688 |
function [im, label] = getBatch(imdb, batch)
689 | im = imdb.images.data(:,:,:,batch) ;
690 | label = imdb.images.label(:,:,:,batch) ;
691 |
692 |
693 |
The function takes as input the imdb structure defined above and a list batch of image indexes that should be returned for training. In this case, this amounts to simply extracting and copying some data; in general, however, getBatch can be used to, e.g., read images from disk or apply transformations to them on the fly.
694 |
695 |
Task: run the training code and wait for learning to complete. Note that the model is saved in data/text-small/net-epoch-16.mat, where 16 is the number of the last epoch.
696 |
697 |
Part 3.4: Evaluate the model
698 |
The network is evaluated on the validation set during training. The validation error (in our case the average squared difference between the predicted output pixels and the desired ones) is a good indicator of how well the network is doing (in practice, one should ultimately evaluate the network on a held-out test set).
699 |
In our example it is also informative to evaluate the qualitative result of the model. This can be done as follows:
700 |
train = find(imdb.images.set == 1) ;
701 | val = find(imdb.images.set == 2) ;
702 |
703 | figure(33) ; set(gcf, 'name', 'Part 3.4: Results on the training set') ;
704 | showDeblurringResult(net, imdb, train(1:30:151)) ;
705 |
706 | figure(34) ; set(gcf, 'name', 'Part 3.4: Results on the validation set') ;
707 | showDeblurringResult(net, imdb, val(1:30:151)) ;
708 |
709 |
710 |
Since the CNN is convolutional, it can be applied to arbitrarily-sized images. imdb.examples contains a few larger examples too. The following code shows one:
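A hypothetical sketch of this step (the fields of imdb.examples and the figure number are assumptions):

% Pick a larger example image (field name is an assumption)
x = imdb.examples.blurred{1} ;

% Drop the loss layer before evaluating the network
net_ = net ;
net_.layers(end) = [] ;

% Run the network and show the input next to its prediction
res = vl_simplenn(net_, x) ;
figure(35) ; clf ; colormap gray ;
subplot(1,2,1) ; imagesc(x) ; axis image off ; title('Blurred input') ;
subplot(1,2,2) ; imagesc(res(end).x) ; axis image off ; title('Network prediction') ;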
Question: Is there any obvious difference between training and validation performance?
728 |
729 |
730 |
Part 3.5: Learning a larger model using the GPU
731 |
So far, we have trained a single small network to solve this problem. Here, we will experiment with several variants to try to improve the performance as much as possible.
732 |
Before we experiment further, however, it is beneficial to switch to using a GPU. If you have a GPU and the MATLAB Parallel Computing Toolbox installed, you can try running the code above on the GPU by changing a single switch. To prepare MatConvNet to use the GPU, change the first line of the script from setup to:
733 |
setup('useGpu', true) ;
734 |
735 |
736 |
Assuming that the GPU has index 1 (which is always the case if there is a single CUDA-compatible GPU in your machine), modify the training options to tell MatConvNet to use that GPU:
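A minimal sketch of that change:

trainOpts.gpus = [1] ;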
Try increasing the receptive field size by increasing the filter support (do not forget to adjust the padding).
762 |
Try sequences of rank-1 filters, such as $7 \times 1$ followed by $1 \times 7$ to increase the receptive field size while maintaining efficiency.
763 |
764 |
And, of course, make sure to beat the other students.
765 |
766 |
Remark: You can see the relative change of the network weights by setting trainOpts.plotDiagnostics = true ;
767 |
768 |
Links and further work
769 |
770 |
771 |
The code for this practical is written using the software package MatConvNet. This is a software library written in MATLAB, C++, and CUDA and its source code is freely available.
772 |
773 |
774 |
MatConvNet can train complex computer vision models, such as VGG VD and Inception. Several of these models, including a few cool demos, are available for download.
775 |
776 |
777 |
Many more computer vision practicals are available here.
778 |
779 |
780 |
Acknowledgements
781 |
782 |
We thank NVIDIA and MathWorks for their valuable help in setting up the development environment for this tutorial.
783 |
784 |
785 |
iV&L Summer School instructions
786 |
Connect here to Qwiklabs as you have been instructed. Press the Select button for the MatConvNet Lab:
787 |
788 |
Press the Start Lab button:
789 |
790 |
Wait for the progress bar to finish (this may take one or two minutes):
791 |
792 |
Click the lab instructions link and follow the rest of the instructions:
By default, MATLAB on Linux systems uses Emacs-style shortcuts. To change to the more familiar Windows-style shortcuts:
797 |
798 |
Type preferences in MATLAB Command Window.
799 |
Pick Keyboard -> Shortcuts
800 |
Change Active settings to Windows Default Set
801 |
802 |
Click OK to apply and close the settings window
803 |
804 |
If the screen is too small or too large
805 |
If you are running the practical through a VNC-based remote desktop connection, you can try adjusting the resolution by changing the setting in the OS (click on the big Ubuntu button on the top left and search for Displays).
806 |
History
807 |
808 |
Used in the iV&L Net summer school, Malta, 2016.
809 |
810 |
811 |
812 |
813 |
814 |
[1] If you are familiar with convolution as defined in mathematics and signal processing, you might expect to find the index $i-u$ instead of $i+u$ in this expression. The convention $i+u$, often used in CNNs, is usually referred to as correlation.
815 |
816 |
817 |
[2] The derivative is computed with respect to a certain assignment $x_0$ and $(w_1,\dots,w_L)$ to the network input and parameters; furthermore, the intermediate derivatives are computed at points $x_1,\dots,x_L$ obtained by evaluating the network at $x_0$.
818 |
819 |
820 |
[3] The stacking operator $\vv$ simply unfolds a tensor in a vector by stacking its elements in some pre-defined order. For example:
821 | $$
822 | \vv\begin{bmatrix}
823 | 1 & 3 & 5\\
824 | 2 & 4 & 6
825 | \end{bmatrix}=\begin{bmatrix}
826 | 1\\2\\3\\4\\5\\6
827 | \end{bmatrix}
828 | $$
852 |
853 |
854 |
855 |
856 |
--------------------------------------------------------------------------------
/doc/instructions.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # VGG CNN Practical: Image Regression
6 |
7 | *By Andrea Vedaldi, Karel Lenc, and Joao Henriques*
8 |
9 | This is an [Oxford Visual Geometry Group](http://www.robots.ox.ac.uk/~vgg) computer vision practical (Release 2016a).
10 |
11 |
12 |
13 | *Convolutional neural networks* are an important class of learnable representations applicable, among others, to numerous computer vision problems. Deep CNNs, in particular, are composed of several layers of processing, each involving linear as well as non-linear operators, that are learned jointly, in an end-to-end manner, to solve a particular task. These methods are now the dominant approach for feature extraction from audiovisual and textual data.
14 |
15 | This practical explores the basics of learning (deep) CNNs. The first part introduces typical CNN building blocks, such as ReLU units and linear filters. The second part explores backpropagation, including designing custom layers and verifying them numerically. The last part demonstrates learning a CNN for text deblurring; this differs from the usual problem of image classification and demonstrates the flexibility of these techniques.
16 |
17 | This practical is based on MATLAB and the [MatConvNet](http://www.vlfeat.org/matconvnet) library. The practical demonstrates how easy it is to use this environment to prototype new network components and architectures. By only using familiar MATLAB syntax, you will be able to implement new layers and take advantage of the GPU for faster computation.
18 |
19 | [TOC]
20 |
21 | $$
22 | \newcommand{\bx}{\mathbf{x}}
23 | \newcommand{\by}{\mathbf{y}}
24 | \newcommand{\bz}{\mathbf{z}}
25 | \newcommand{\bw}{\mathbf{w}}
26 | \newcommand{\bp}{\mathbf{p}}
27 | \newcommand{\cP}{\mathcal{P}}
28 | \newcommand{\cN}{\mathcal{N}}
29 | \newcommand{\vc}{\operatorname{vec}}
30 | \newcommand{\vv}{\operatorname{vec}}
31 | $$
32 |
33 | ## Installation
34 |
35 | > If you are running this in the iV&L Summer School, please refer to the [instructions](#ivl) at the end of the document.
36 |
37 | Read and understand the [requirements and installation instructions](../overview/index.html#installation). The download links for this practical are:
38 |
39 | * Code and data: [practical-cnn-reg-2016a.tar.gz](http://www.robots.ox.ac.uk/~vgg/share/practical-cnn-reg-2016a.tar.gz)
40 | * Code only: [practical-cnn-reg-2016a-code-only.tar.gz](http://www.robots.ox.ac.uk/~vgg/share/practical-cnn-reg-2016a-code-only.tar.gz)
41 | * Data only: [practical-cnn-reg-2016a-data-only.tar.gz](http://www.robots.ox.ac.uk/~vgg/share/practical-cnn-reg-2016a-data-only.tar.gz)
42 | * [Git repository](https://github.com/vedaldi/practical-cnn-reg) (for lab setters and developers)
43 |
44 | You can either unpack the archive manually, or use the following MATLAB one-liner:
45 |
46 | ```.language-matlab
47 | untar('http://www.robots.ox.ac.uk/~vgg/share/practical-cnn-reg-2016a.tar.gz')
48 | cd practical-cnn-reg-2016a
49 | ```
50 |
51 |
52 |
53 | ## Getting started
54 |
55 | After the installation is complete, open and edit the script `exercise1.m` in the MATLAB editor. The script contains commented code and a description for all steps of this exercise, for [Part I](#part1) of this document. You can cut and paste this code into the MATLAB window to run it, or use the shortcut `Ctrl+Enter` to run a code section. You will need to modify it as you go through the session. The other files, `exercise2.m` and `exercise3.m`, are given for [Part II](#part2) and [III](#part3).
56 |
57 | Each part contains several **Questions** (that may require pen and paper) and **Tasks** (that require experimentation or coding) to be answered/completed before proceeding further in the practical.
58 |
59 | ## Part 1: CNN building blocks {#part1}
60 |
61 | In this part we will explore two fundamental building blocks of CNNs, linear convolution and non-linear activation functions. Open `exercise1.m` and run up to the `setup()` command, which initializes the MATLAB environment to use MatConvNet.
62 |
63 | ### Part 1.1: Convolution {#part1.1}
64 |
65 | A *convolutional neural network* (CNN) is a sequence of linear and non-linear convolution-like operators. The most important example of such operators is *linear convolution*. In this part, we will explore linear convolution and see how to use it in MatConvNet.
66 |
67 | Recall that linear convolution applies one (or more) filters $\bw$ to an image $\bx$ as follows:
68 |
69 |
70 |
71 | #### Part 1.1.1: Convolution by a single filter {#part1.1.1}
72 |
73 | Start by identifying and then running the following code fragment in `exercise1.m`:
74 |
75 | ```.language-matlab
76 | % Load an image and convert it to gray scale and single precision
77 | x = im2single(rgb2gray(imread('data/ray.jpg'))) ;
78 |
79 | % Define a filter
80 | w = single([
81 | 0 -1 -0
82 | -1 4 -1
83 | 0 -1 0]) ;
84 |
85 | % Apply the filter to the image
86 | y = vl_nnconv(x, w, []) ;
87 | ```
88 |
89 | The code loads the image `data/ray.jpg` and applies to it a linear filter using the linear convolution operator. The latter is implemented by the MatConvNet function `vl_nnconv()`. Note that all variables `x`, `w`, and `y` are in single precision; while MatConvNet supports double precision arithmetic too, single precision is usually preferred in applications as memory is often a bottleneck. The result can be visualized as follows:
90 |
91 | ```.language-matlab
92 | % Visualize the results
93 | figure(11) ; clf ; colormap gray ;
94 | set(gcf, 'name', 'Part 1.1: convolution') ;
95 |
96 | subplot(2,2,1) ;
97 | imagesc(x) ;
98 | axis off image ;
99 | title('Input image x') ;
100 |
101 | subplot(2,2,2) ;
102 | imagesc(w) ;
103 | axis off image ;
104 | title('Filter w') ;
105 |
106 | subplot(2,2,3) ;
107 | imagesc(y) ;
108 | axis off image ;
109 | title('Output image y') ;
110 | ```
111 |
112 | > **Task:** Run the code above and examine the result, which should look like the following image:
113 |
114 |
115 |
116 | The input $\bx$ is an $M \times N$ matrix, which can be interpreted as a gray scale image. The filter $\bw$ is the $3 \times 3$ matrix
117 | $$
118 | \bw =
119 | \begin{bmatrix}
120 | 0 & -1 & 0 \\
121 | -1 & 4 & -1 \\
122 | 0 & -1 & 0 \\
123 | \end{bmatrix}
124 | $$
125 | The output of the convolution is a new matrix $\by$ given by[^convolution]
126 | $$
127 | y_{ij} = \sum_{uv} w_{uv}\ x_{i+u,\ j+v}
128 | $$
129 |
130 |
131 | > **Questions:**
132 | >
133 | > 1. If $H \times W$ is the size of the input image, $H' \times W'$ the size of the filter, what is the size $H'' \times W''$ of the output image?
134 | > 2. The filter $\bw$ given above is a discretized Laplacian operator. Which type of visual structures (corners, bars, ...) do you think may excite this filter the most?
135 |
136 | #### Part 1.1.2: Convolution by a filter bank {#part1.1.2}
137 |
138 | In neural networks, one usually operates with *filter banks* instead of individual filters. Each filter can be thought of as computing a different *feature channel*, characterizing a particular statistical property of the input image.
139 |
140 | To see how to define and use a filter bank, create a bank of three filters as follows:
141 |
142 | ```.language-matlab
143 | % Concatenate three filters in a bank
144 | w1 = single([
145 | 0 -1 0
146 | -1 4 -1
147 | 0 -1 0]) ;
148 |
149 | w2 = single([
150 | -1 0 +1
151 | -1 0 +1
152 | -1 0 +1]) ;
153 |
154 | w3 = single([
155 | -1 -1 -1
156 | 0 0 0
157 | +1 +1 +1]) ;
158 |
159 | wbank = cat(4, w1, w2, w3) ;
160 | ```
161 |
162 | The first filter $\bw_1$ is the Laplacian operator seen above; two additional filters $\bw_2$ and $\bw_3$ are horizontal and vertical image derivatives, respectively. The command `vl_nnconv(x, wbank, [])` then applies all the filters in the bank to the input image `x`. Note that the output `y` is not just a matrix, but a 3D array (often called a *tensor* in the CNN jargon). This tensor has dimensions $H \times W \times K$, where $K$ is the number of *feature channels*.
163 |
164 | > **Question:** What is the number of feature channels $K$ in this example? Why?
165 |
166 | > **Task:** Run the code above and visualize the individual feature channels in the tensor `y` by using the provided function `showFeatureChannels()`. Do the channel responses make sense given the filter used to generate them?
167 |
168 | In a CNN, not only the output tensor, but also the input tensor `x` and the filters `wbank` can have multiple feature channels. In this case, the convolution formula becomes:
169 | $$
170 | y_{ijk} = \sum_{uvp} w_{uvpk}\ x_{i+u,\ j+v,\ p}
171 | $$
172 |
173 | > **Questions:**
174 | >
175 | > * If the input tensor $\bx$ has $C$ feature channels, what should be the third dimension of $\bw$?
176 | > * In the code above, the command `wbank = cat(4, w1, w2, w3)` concatenates the tensors `w1`, `w2`, and `w3` along the *fourth dimension*. Why is that given that filters should have three dimensions?
177 |
178 | #### Part 1.1.3: Convolving a batch of images {#part1.1.3}
179 |
180 | Finally, in training CNNs it is often important to be able to work efficiently with *batches* of data. MatConvNet allows packing more than one instance of the tensor $\bx$ in a single MATLAB array `x` by stacking the different instances along the *fourth dimension* of the array:
181 |
182 | ```.language-matlab
183 | x1 = im2single(rgb2gray(imread('data/ray.jpg'))) ;
184 | x2 = im2single(rgb2gray(imread('data/crab.jpg'))) ;
185 | x = cat(4, x1, x2) ;
186 |
187 | y = vl_nnconv(x, wbank, []) ;
188 | ```
189 |
190 | > **Task:** Run the code above and visualize the result. Convince yourself that each filter is applied to each image.
191 |
192 | ### Part 1.2: Non-linear activation (ReLU) {#part1.2}
193 |
194 | CNNs are obtained by composing several operators, individually called *layers*. In addition to convolution and other linear layers, CNNs should contain non-linear layers as well.
195 |
196 | > **Question:** What happens if all layers are linear?
197 |
198 | The simplest non-linearity is given by scalar activation functions, which are applied independently to each element of a tensor. Perhaps the simplest and one of the most useful examples is the *Rectified Linear Unit* (ReLU) operator:
199 | $$
200 | y_{ijk} = \max \{0, x_{ijk}\}
201 | $$
202 | which simply cuts off any negative value in the data.
203 |
204 | In MatConvNet, ReLU is implemented by the `vl_nnrelu` function. To demonstrate its use, we convolve the test image with the negated Laplacian, and then apply ReLU to the result:
205 |
206 | ```.language-matlab
207 | % Convolve with the negated Laplacian
208 | y = vl_nnconv(x, - w, []) ;
209 |
210 | % Apply the ReLU operator
211 | z = vl_nnrelu(y) ;
212 | ```
213 |
214 | > **Task:** Run this code and visualize images `x`, `y`, and `z`.
215 |
216 | > **Questions:**
217 | >
218 | > * Which kind of image structures are preferred by this filter?
219 | > * Why did we negate the Laplacian?
220 |
221 | ReLU has a very important effect as it implicitly sets to zero the majority of the filter responses. In a certain sense, ReLU works as a detector, with the implicit convention that a certain pattern is detected when a corresponding filter response is large enough (greater than zero).
222 |
223 | In practice, while signals are usually centered and therefore a threshold of zero is reasonable, there is no particular reason why this should always be appropriate. For this reason, the convolution operator allows specifying *a bias term* for each filter response. Let us use this term to make the response of ReLU more selective:
224 |
225 | ```.language-matlab
226 | bias = single(- 0.2) ;
227 | y = vl_nnconv(x, - w, bias) ;
228 | z = vl_nnrelu(y) ;
229 | ```
230 |
231 | There is only one `bias` term because there is only one filter in the bank (note that, as for the rest of the data, `bias` is a single precision quantity). The bias is applied after convolution, effectively subtracting 0.2 from the filter responses. Hence, a response now survives the subsequent ReLU operator only if it is at least 0.2 after convolution.
232 |
233 | > **Task:** Run this code and visualize images `x`, `y`, and `z`.
234 |
235 | > **Question:** Is the response now more selective?
236 |
237 | > **Remark:** There are many other building blocks used in CNNs, the most important of which is perhaps max pooling. However, convolution and ReLU can already solve many problems, as we will see in the remainder of the practical.
238 |
239 | ## Part 2: Backpropagation {#part2}
240 |
241 | Training CNNs is normally done using a gradient-based optimization method. The CNN $f$ is the composition of $L$ layers $f_l$ each with parameters $\bw_l$, which in the simplest case of a chain looks like:
242 | $$
243 | \bx_0
244 | \longrightarrow
245 | \underset{\displaystyle\underset{\displaystyle\bw_1}{\uparrow}}{\boxed{f_1}}
246 | \longrightarrow
247 | \bx_1
248 | \longrightarrow
249 | \underset{\displaystyle\underset{\displaystyle\bw_2}{\uparrow}}{\boxed{f_2}}
250 | \longrightarrow
251 | \bx_2
252 | \longrightarrow
253 | \dots
254 | \longrightarrow
255 | \bx_{L-1}
256 | \longrightarrow
257 | \underset{\displaystyle\underset{\displaystyle\bw_L}{\uparrow}}{\boxed{f_L}}
258 | \longrightarrow
259 | \bx_L
260 | $$
261 | During learning, the last layer of the network is the *loss function* that should be minimized. Hence, the output $\bx_L = x_L$ of the network is a **scalar** quantity (a single number).
262 |
263 | The gradient is easily computed using the **chain rule**. If *all* network variables and parameters are scalar, this is given by[^derivative]:
264 | $$
265 | \frac{\partial f}{\partial w_l}(x_0;w_1,\dots,w_L)
266 | =
267 | \frac{\partial f_L}{\partial x_{L-1}}(x_{L-1};w_L) \times
268 | \cdots
269 | \times
270 | \frac{\partial f_{l+1}}{\partial x_l}(x_l;w_{l+1}) \times
271 | \frac{\partial f_{l}}{\partial w_l}(x_{l-1};w_l)
272 | $$
273 | With tensors, however, there are some complications. Consider for instance the derivative of a function $\by=f(\bx)$ where both $\by$ and $\bx$ are tensors; this is formed by taking the derivative of each scalar element in the output $\by$ with respect to each scalar element in the input $\bx$. If $\bx$ has dimensions $H \times W \times C$ and $\by$ has dimensions $H' \times W' \times C'$, then the derivative contains $HWCH'W'C'$ elements, which is often unmanageable (on the order of several GBs of memory for a single derivative).
274 |
275 | Note that all intermediate derivatives in the chain rule may be affected by this size explosion except for the derivative of the network output that, being the loss, is a scalar.
276 |
277 | > **Question:** The output derivatives have the same size as the parameters in the network. Why?
278 |
279 | **Back-propagation** allows computing the output derivatives in a memory-efficient manner. To see how, the first step is to generalize the equation above to tensors using a matrix notation. This is done by converting tensors into vectors by using the $\vv$ (stacking)[^stacking] operator:
280 | $$
281 | \frac{\partial \vv f}{\partial \vv^\top \bw_l}
282 | =
283 | \frac{\partial \vv f_L}{\partial \vv^\top \bx_{L-1}} \times
284 | \cdots
285 | \times
286 | \frac{\partial \vv f_{l+1}}{\partial \vv^\top \bx_l} \times
287 | \frac{\partial \vv f_{l}}{\partial \vv^\top \bw_l}
288 | $$
289 | In order to make this computation memory efficient, we *project* the derivative onto a tensor $\bp_L = 1$ as follows:
290 | $$
291 | (\vv \bp_L)^\top \times \frac{\partial \vv f}{\partial \vv^\top \bw_l}
292 | =
293 | (\vv \bp_L)^\top
294 | \times
295 | \frac{\partial \vv f_L}{\partial \vv^\top \bx_{L-1}} \times
296 | \cdots
297 | \times
298 | \frac{\partial \vv f_{l+1}}{\partial \vv^\top \bx_l} \times
299 | \frac{\partial \vv f_{l}}{\partial \vv^\top \bw_l}
300 | $$
301 | Note that $\bp_L=1$ has the same dimension as $\bx_L$ (the scalar loss) and, being equal to 1, multiplying by it on the left does not change anything. Things become more interesting when the products are evaluated from left to right, i.e. *backward from the output to the input* of the CNN. The first such factor is given by:
302 | \begin{equation}
303 | \label{e:factor}
304 | (\vv \bp_{L-1})^\top = (\vv \bp_L)^\top
305 | \times
306 | \frac{\partial \vv f_L}{\partial \vv^\top \bx_{L-1}}
307 | \end{equation}
308 | This results in a new projection vector $\bp_{L-1}$, which can then be multiplied from the left to obtain $\bp_{L-2}$ and so on. Multiplying the last projection $\bp_l$ by the remaining factor $\partial \vv f_l / \partial \vv^\top \bw_l$ yields the desired derivative. Crucially, each projection $\bp_q$ takes only as much memory as the corresponding variable $\bx_q$.
309 |
310 | Some might have noticed that, while the projections remain small, each factor in \eqref{e:factor} still contains one of the large derivatives that we cannot compute explicitly. The trick is that CNN toolboxes contain code that computes the projected derivatives without explicitly forming this large factor. In particular, for any building block function $\by=f(\bx;\bw)$, a toolbox such as MatConvNet will implement:
311 |
312 | * A **forward mode** computing the function $\by=f(\bx;\bw)$.
313 | * A **backward mode** computing the derivatives of the projected function $\langle \bp, f(\bx;\bw) \rangle$ with respect to the input $\bx$ and parameter $\bw$:
314 |
315 | $$
316 | \frac{\partial}{\partial \bx} \left\langle \bp, f(\bx;\bw) \right\rangle,
317 | \qquad
318 | \frac{\partial}{\partial \bw} \left\langle \bp, f(\bx;\bw) \right\rangle.
319 | $$
320 |
321 | For example, this is how this looks for the convolution operator:
322 |
323 | ```.language-matlab
324 | y = vl_nnconv(x,w,b) ; % forward mode (get output)
325 | p = randn(size(y), 'single') ; % projection tensor (arbitrary)
326 | [dx,dw,db] = vl_nnconv(x,w,b,p) ; % backward mode (get projected derivatives)
327 | ```
328 |
329 | and this is how it looks for the ReLU operator:
330 |
331 | ```.language-matlab
332 | y = vl_nnrelu(x) ;
333 | p = randn(size(y), 'single') ;
334 | dx = vl_nnrelu(x,p) ;
335 | ```
336 |
337 | ### Part 2.1: Backward mode verification {#part2.1}
338 |
339 | Implementing new layers in a network is conceptually simple, but error prone. A simple way of testing a layer is to check whether the derivatives computed using the backward mode approximately match the derivatives computed numerically using the forward mode. The next example, contained in the file `exercise2.m`, shows how to do this:
340 |
341 | ```.language-matlab
342 | % Forward mode: evaluate the convolution
343 | y = vl_nnconv(x, w, []) ;
344 |
345 | % Pick a random projection tensor
346 | p = randn(size(y), 'single') ;
347 |
348 | % Backward mode: projected derivatives
349 | [dx,dw] = vl_nnconv(x, w, [], p) ;
350 |
351 | % Check the derivative numerically
352 | figure(21) ; clf('reset') ;
353 | set(gcf,'name','Part 2.1: single layer backprop') ;
354 | checkDerivativeNumerically(@(x) proj(p, vl_nnconv(x, w, [])), x, dx) ;
355 | ```
356 |
357 | > **Questions:**
358 | >
359 | > 1. Recall that the derivative of a function $y=f(x)$ is given by
360 | > $$
361 | > \frac{\partial f}{\partial x}(x) = \lim_{\delta\rightarrow 0} \frac{f(x+\delta) - f(x)}{\delta}
362 | > $$
363 | > Open the file `checkDerivativeNumerically.m`. Can you identify the lines in that file that use this expression? (A standalone sketch of the idea follows the tasks below.)
364 | > 2. Note that `checkDerivativeNumerically()` is applied to the function `@(x) proj(p, vl_nnconv(x, w, []))`. This syntax defines a function on the fly (more precisely, an anonymous function that acts as a closure). In this case, the purpose of the closure is to evaluate the expression for a variable `x` and a fixed value of `w`. Furthermore, the closure projects the output of `vl_nnconv()` onto `p` by calling the `proj()` function. Why?
365 |
366 | > **Tasks:**
367 | >
368 | > 1. Run the code, visualizing the results. Convince yourself that the numerical and analytical derivatives are nearly identical.
369 | > 2. Modify the code to compute the derivative of the *first element* of the output tensor $\by$ with respect to *all the elements* of the input tensor $\bx$. **Hint:** it suffices to change the value of $\bp$.
370 | > 3. Modify the code to compute the derivative with respect to the convolution parameters $\bw$ instead of the convolution input $\bx$.
371 |
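372 | Returning to the definition in Question 1, the following standalone sketch (it is *not* the actual implementation of `checkDerivativeNumerically()`) approximates the projected derivative with respect to a single element of `x` by a forward difference:
373 |
374 | ```.language-matlab
375 | delta = 1e-3 ;
376 | f = @(x) proj(p, vl_nnconv(x, w, [])) ;
377 | xp = x ; xp(1) = xp(1) + delta ;  % perturb the first element of x
378 | dnum = (f(xp) - f(x)) / delta ;   % numerical approximation
379 | disp([dnum dx(1)]) ;              % compare with the analytical value dx(1)
380 | ```
381 |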
372 | ### Part 2.2: Backpropagation {#part2.2}
373 |
374 | Next, we use the backward mode of convolution and ReLU to implement backpropagation in a network that consists of two layers:
375 |
376 | ```.language-matlab
377 | % Forward mode: evaluate conv followed by ReLU
378 | y = vl_nnconv(x, w, []) ;
379 | z = vl_nnrelu(y) ;
380 |
381 | % Pick a random projection tensor
382 | p = randn(size(z), 'single') ;
383 |
384 | % Backward mode: projected derivatives
385 | dy = vl_nnrelu(z, p) ;
386 | [dx,dw] = vl_nnconv(x, w, [], dy) ;
387 | ```
388 |
389 | > **Question (important):** In the code above, in backward mode the projection `p` is fed to the `vl_nnrelu` operator. However, the `vl_nnconv` operator now receives `dy` as the projection. Why?
390 |
391 | > **Tasks:**
392 | >
393 | > 1. Run the code and use `checkDerivativeNumerically()` to compare the analytical and numerical derivatives. Do they differ?
394 | > 2. (Optional) Modify the code above to a chain of three layers: conv + ReLU + conv.
395 |
396 | ### Part 2.3: Design and verify your own layer {#part2.3}
397 |
398 | Creating new layers is a common task when experimenting with novel CNN architectures. MatConvNet makes this particularly easy, since you can use all standard MATLAB operators and functions. The same code also works on the GPU.
399 |
400 | In this part we will show how to implement a layer computing the (squared) Euclidean distance between a tensor `x` and a reference tensor `r`; your goal will then be to implement the absolute difference (L1) loss. This layer will be used later to learn a CNN from data.
401 |
402 | The first step is to write the forward mode. This is contained in the `l2LossForward.m` function. Open the file and check its content:
403 |
404 | ```.language-matlab
405 | function y = l2LossForward(x,r)
406 | delta = x - r ;
407 | y = sum(delta(:).^2) ;
408 | y = y / (size(x,1) * size(x,2)) ; % normalize by image size
408 | ```
409 |
410 | The function computes the difference `x - r`, squares the individual elements (`.^2`), and then sums the results. The vectorization `delta(:)` simply turns the tensor into a vector by stacking, so that the sum runs over all elements (by default `sum` operates only along the first dimension). The result is then divided by the number of spatial locations `size(x,1) * size(x,2)` to normalize it by the image size; since this is a constant factor, it is omitted from the derivation below. The overall result is a scalar `y`, the sum of the squared Euclidean distances between `x` and `r` over all data instances.
411 |
412 | Next, we need to implement the backward mode:
413 |
414 | ```.language-matlab
415 | function dx = l2LossBackward(x,r,p)
416 | dx = 2 * p * (x - r) ;
417 | dx = dx / (size(x,1) * size(x,2)) ; % normalize by image size
417 | ```
418 |
419 | Note that the backward mode takes the projection tensor `p` as an additional argument. Let us show that this code is correct. Recall that the goal of the backward mode is to compute the derivative of the projected function:
420 |
421 | $$
422 | \langle \bp, f(\bx) \rangle
423 | = p \sum_{lmnt} (x_{lmnt} - r_{lmnt})^2.
424 | $$
425 |
426 | Here the subscript $t$ indexes the data instance in the batch; note that, since this function sums the squared Euclidean distances over all tensor instances, the output $f(\bx)$ is a scalar, and so is the projection $\bp = p$.
427 |
428 | In order to see how to implement the backward mode, compute the derivative with respect to each input element $x_{ijkt}$ (note that $p$ is constant):
429 |
430 | $$
431 | \frac{\partial}{\partial x_{ijkt}}
432 | \langle \bp, f(\bx) \rangle
433 | = 2 p (x_{ijkt} - r_{ijkt}).
434 | $$
435 |
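436 | The derivation can be checked numerically with the same tools used in Part 2.1; here is a minimal sketch (Part 2.3 of `exercise2.m` contains equivalent code):
437 |
438 | ```.language-matlab
439 | x = randn(10, 10, 1, 2, 'single') ;
440 | r = randn(size(x), 'single') ;  % reference tensor
441 | y = l2LossForward(x, r) ;       % scalar loss value
442 | p = randn(size(y), 'single') ;  % scalar projection
443 | dx = l2LossBackward(x, r, p) ;
444 | checkDerivativeNumerically(@(x) proj(p, l2LossForward(x, r)), x, dx) ;
445 | ```
446 |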
436 | > **Tasks:**
437 | >
438 | > 1. Verify that the forward and backward functions are correct by computing the derivatives numerically using `checkDerivativeNumerically()`.
439 | > 2. Implement `l1LossForward.m` and `l1LossBackward.m` to compute the L1 distance (sum of absolute differences):
440 | > $$
441 | > f(\bx) = \sum_{lmnt} \lvert x_{lmnt} - r_{lmnt} \rvert.
442 | > $$
443 | > In order to implement the backward pass, you need to find
444 | > $$
445 | > \frac{\partial}{\partial x_{ijkt}}
446 | > \langle \bp, f(\bx) \rangle
447 | > =
448 | > \frac{\partial}{\partial x_{ijkt} }
449 | > \left[
450 | > p \sum_{lmnt} \lvert x_{lmnt} - r_{lmnt} \rvert
451 | > \right].
452 | > $$
453 | > Recall that for $v \neq 0$:
454 | > $$
455 | > \frac{\partial |v|}{\partial v} = \begin{cases} -1 & v<0 \\ 1 & v>0 \end{cases}.
456 | > $$
457 | > 3. Make sure that both the forward and backward modes are correctly modified by verifying the result numerically once more. What happens for the components of $\bx$ that are zero or very close to zero?
458 |
459 | ## Part 3: Learning a CNN for text deblurring {#part3}
460 |
461 | By now you should be familiar with two basic CNN layers, convolution and ReLU, as well as with the idea of backpropagation. In this part, we will build on such concepts to learn a CNN model.
462 |
463 | CNNs are often used for classification; however, they are much more general than that. In order to demonstrate their flexibility, here we will design a CNN that takes an image as input and produces an image as output (instead of a class label).
464 |
465 | We will consider in particular the problem of *deblurring images of text*, as in the following example:
466 |
467 | 
468 |
469 | ### Part 3.1: Preparing the data {#part3.1}
470 |
471 | The first task is to load the training and validation data and to understand its format. Start by opening `exercise3.m` in your MATLAB editor. The code responsible for loading the data is
472 |
473 | ```.language-matlab
474 | imdb = load('data/text_imdb.mat') ;
475 | ```
476 |
477 | The variable `imdb` is a structure containing $n$ images, which will be used for training and validation. The structure has the following fields:
478 |
479 | * `imdb.images.data`: a $64 \times 64 \times 1 \times n$ array of grayscale blurred images.
480 | * `imdb.images.label`: a $64 \times 64 \times 1 \times n$ array of grayscale sharp images.
481 | * `imdb.images.set`: a $1 \times n$ vector containing a 1 for training images and a 2 for validation images. 75% of the images are used for training and 25% for validation.
482 |
483 | Run the following code, which displays the first image in the dataset and its label:
484 |
485 | ```.language-matlab
486 | figure(31) ; set(gcf, 'name', 'Part 3.1: Data') ; clf ;
487 |
488 | subplot(1,2,1) ; imagesc(imdb.images.data(:,:,:,1)) ;
489 | axis off image ; title('Input (blurred)') ;
490 |
491 | subplot(1,2,2) ; imagesc(imdb.images.label(:,:,:,1)) ;
492 | axis off image ; title('Desired output (sharp)') ;
493 |
494 | colormap gray ;
495 | ```
496 |
497 | > **Task:** Make sure you understand the format of `imdb`. Use MATLAB to find out the number of training and validation images, as well as the resolution (size) of each image.
498 |
499 | It is often important to center the data to better condition the learning problem. This is usually obtained by subtracting the mean pixel intensity (computed from the training set) from each pixel. Here, however, pixels are rescaled and shifted to have values in the interval $[-1, 0]$.
500 |
501 | > **Question:** Why was the interval $[-1, 0]$ chosen? **Hint:** What intensity corresponds to 'white'? What does the convolution operator do near the image boundaries?
502 |
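503 | Concretely, the shift is applied when the dataset is generated (see `extra/getBlurredImagesData.m`); in essence it amounts to:
504 |
505 | ```.language-matlab
506 | im = im2single(rgb2gray(imread('data/ray.jpg'))) ; % pixel values in [0, 1]
507 | im = im - 1 ;                                      % shift values to [-1, 0]
508 | ```
509 |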
503 | ### Part 3.2: Defining a CNN architecture
504 |
505 | Next we define a CNN `net` and initialize its weights randomly. A CNN is simply a collection of interlinked layers. While these can be assembled 'manually' as you did in Part 2, it is usually more convenient to use a **wrapper**.
506 |
507 | MatConvNet contains two wrappers, SimpleNN and DagNN. SimpleNN is suitable for simple networks that are a chain of layers (as opposed to a more general graph). We will use SimpleNN here.
508 |
509 | This wrapper defines the CNN as a structure `net` containing a cell array `layers`, which lists the layers in order of execution. Open `initializeSmallCNN.m` and find this code:
510 |
511 | ```.language-matlab
512 | net.layers = { } ;
513 | ```
514 |
515 | The first layer of the network is a convolution block:
516 |
517 | ```.language-matlab
518 | net.layers{end+1} = struct(...
519 | 'name', 'conv1', ...
520 | 'type', 'conv', ...
521 | 'weights', {xavier(3,3,1,32)}, ...
522 | 'pad', 1, ...
523 | 'learningRate', [1 1], ...
524 | 'weightDecay', [1 0]) ;
525 | ```
526 |
527 | The fields are as follows:
528 |
529 | * `name` specifies a name for the layer, useful for debugging but otherwise arbitrary.
530 |
531 | * `type` specifies the layer type, in this case convolution.
532 |
533 | * `weights` is a cell array containing the layer parameters, in this case two tensors for the filters and the biases. The filters are initialized using the `xavier()` function to have dimensions $3 \times 3 \times 1 \times 32$ ($3\times 3$ spatial support, 1 input feature channel, and 32 filters). `xavier()` also initializes the biases to zero.
534 |
535 | * `pad` specifies the amount of zero padding to apply to the layer input. By using a padding of one pixel and a $3\times 3$ filter support, the output of the convolution has exactly the same height and width as the input (see the formula after this list).
536 |
537 | * `learningRate` contains two layer-specific multipliers to adjust the learning rate for the filters and the biases.
538 |
539 | * `weightDecay` contains two layer-specific multipliers to adjust the weight decay (regularization strength) for the layer filters and biases. Note that weight decay is not applied to the biases.
540 |
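541 | For reference, a convolution with an $H_f \times W_f$ filter, stride 1, and $P$ pixels of zero padding on each side produces an output of height $H' = H - H_f + 2P + 1$ (and similarly for the width); with $H_f = 3$ and $P = 1$ this gives $H' = H$, as claimed.
542 |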
541 | > **Question:** What would happen if `pad` were set to zero?
542 |
543 | The convolution layer is followed by ReLU, which is given simply by:
544 |
545 | ```.language-matlab
546 | net.layers{end+1} = struct(...
547 | 'name', 'relu1', ...
548 | 'type', 'relu') ;
549 | ```
550 |
551 | This pattern is repeated (possibly varying the number and dimensions of filters) for a total of three convolutional layers separated by ReLUs.
552 |
553 | > **Question:** The last layer, generating the output image, is convolutional and is *not* followed by ReLU. Why?
554 |
555 | The command `vl_simplenn_display()` can be used to print information about the network. Here is a subset of this information:
556 |
557 | | layer| 0| 1| 2| 3| 4| 5| 6|
558 | |:---------:|:---:|:---:|:---:|:---:|:---:|:--------:|:----:|
559 | | type|input| conv| relu| conv| relu| conv|custom|
560 | | name| n/a|conv1|relu1|conv2|relu2|prediction| loss|
561 | | support| n/a| 3| 1| 3| 1| 3| 1|
562 | | filt dim| n/a| 1| n/a| 32| n/a| 32| n/a|
563 | | num filts| n/a| 32| n/a| 32| n/a| 1| n/a|
564 | | stride| n/a| 1| 1| 1| 1| 1| 1|
565 | | pad| n/a| 1| 0| 1| 0| 1| 0|
566 | | rf size| n/a| 3| 3| 5| 5| 7| 7|
567 |
568 | > **Questions:** Look carefully at the generated table and answer the following questions:
569 | >
570 | > 1. How many layers are in this network?
571 | > 2. What is the support (height and width) and depth (number of feature channels) of each intermediate tensor?
572 | > 3. How is the number of feature channels related to the
573 | > dimensions of the filters?
574 |
575 | The last row reports the *receptive field size* for the layer. This is the size (in pixels) of the local image region that affects a particular element in a feature map.
576 |
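577 | For a chain of stride-1 layers, each layer enlarges the receptive field of the previous one by $H_f - 1$ pixels, where $H_f$ is its filter support ($H_f = 1$ for ReLU, which therefore adds nothing): this reproduces the sequence 3, 3, 5, 5, 7, 7 in the table above.
578 |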
577 | > **Question:** What is the receptive field size of a pixel in the output image (generated by the prediction layer)? Discuss whether a larger receptive field size might be preferable for this problem, and how this might be obtained.
578 |
579 | ### Part 3.3: Learning the network {#part3.3}
580 |
581 | In this part we will use SGD to learn the CNN from the available training data. As noted above, the CNN must however terminate in a loss layer. We add one such layer as follows:
582 |
583 | ```.language-matlab
584 | % Add a loss (using our custom layer)
585 | net = addCustomLossLayer(net, @l2LossForward, @l2LossBackward) ;
586 | ```
587 | The function `addCustomLossLayer()` creates a `layer` structure compatible with SimpleNN and adds it as the last layer of the network. This structure contains handles to the functions defined in Part 2, namely `l2LossForward()` and `l2LossBackward()`.
588 |
589 | Next, set up the learning parameters:
590 |
591 | ```.language-matlab
592 | trainOpts.expDir = 'data/text-small' ;
593 | trainOpts.gpus = [] ;
594 | trainOpts.batchSize = 16 ;
595 | trainOpts.learningRate = 0.02 ;
596 | trainOpts.plotDiagnostics = false ;
597 | trainOpts.numEpochs = 20 ;
598 | trainOpts.errorFunction = 'none' ;
599 | ```
600 |
601 | The fields are as follows:
602 |
603 | * `expDir` specifies a directory to store intermediate data (snapshots and figures) as well as the final model. Note that the code resumes execution from the last snapshot; therefore change this directory, or clear it, if you want to start learning from scratch.
604 |
605 | * `gpus` contains a list of GPU IDs to use. For now, do not use any.
606 |
607 | * `batchSize` specifies how many images to include in a batch. Here we use 16.
608 |
609 | * `learningRate` is the learning rate in SGD.
610 |
611 | * `plotDiagnostics` can be used to plot statistics during training. This is slow, but can help in setting a reasonable learning rate. Leave it off for now.
612 |
613 | * `numEpochs` is the number of epochs (passes through the training data) to perform before SGD stops.
614 |
615 | * `errorFunction` set to `'none'` disables the default error metrics, which are suitable for classification but not for our regression problem.
616 |
617 | Finally, we can invoke the learning code:
618 |
619 | ```.language-matlab
620 | net = cnn_train(net, imdb, @getBatch, trainOpts) ;
621 | ```
622 |
623 | The `getBatch()` function, passed as a *handle*, is particularly important. The training script `cnn_train` uses `getBatch()` to extract the images and corresponding labels for a certain batch, as follows:
624 |
625 | ```.language-matlab
626 | function [im, label] = getBatch(imdb, batch)
627 | im = imdb.images.data(:,:,:,batch) ;
628 | label = imdb.images.label(:,:,:,batch) ;
629 | ```
630 |
631 | The function takes as input the `imdb` structure defined above and a list `batch` of image indexes that should be returned for training. In this case, this amounts to simply extracting and copying some data; in general, however, `getBatch()` can also, for example, read images from disk or apply transformations to them on the fly, as in the sketch below.
632 |
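633 | For example, a hypothetical variant of this function (not needed for this practical) could augment the data on the fly by mirroring the patches; note that inputs and labels must be transformed consistently:
634 |
635 | ```.language-matlab
636 | function [im, label] = getBatchWithFlips(imdb, batch)
637 | % Hypothetical getBatch() variant with on-the-fly augmentation
638 | im = imdb.images.data(:,:,:,batch) ;
639 | label = imdb.images.label(:,:,:,batch) ;
640 | if rand > 0.5
641 |   im = flip(im, 2) ;       % mirror the blurred inputs ...
642 |   label = flip(label, 2) ; % ... and the sharp labels consistently
643 | end
644 | ```
645 |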
633 | > **Task:** Run the training code and wait for learning to complete. Note that the model is saved in `data/text-small/net-epoch-20.mat`, where 20 is the number of the last epoch.
634 |
635 | ### Part 3.4: Evaluate the model
636 |
637 | The network is evaluated on the validation set during training. The validation error (which in our case is the average squared difference between the predicted output pixels and the desired ones) is a good indicator of how well the network is doing (in practice, one should ultimately evaluate the network on a held-out test set).
638 |
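639 | For instance, the average squared error on the validation set can be recomputed directly from the deployed network (a sketch, assuming the loss layer has already been removed as in `exercise3.m`, and up to the normalization used by the loss):
640 |
641 | ```.language-matlab
642 | val = find(imdb.images.set == 2) ;
643 | res = vl_simplenn(net, imdb.images.data(:,:,:,val)) ;
644 | pred = res(end).x ;                     % predicted sharp images
645 | labels = imdb.images.label(:,:,:,val) ; % desired sharp images
646 | avgErr = mean((pred(:) - labels(:)).^2) % average squared pixel error
647 | ```
648 |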
639 | In our example it is also informative to evaluate the *qualitative* result of the model. This can be done as follows:
640 |
641 | ```.language-matlab
642 | train = find(imdb.images.set == 1) ;
643 | val = find(imdb.images.set == 2) ;
644 |
645 | figure(33) ; set(gcf, 'name', 'Part 3.4: Results on the training set') ;
646 | showDeblurringResult(net, imdb, train(1:30:151)) ;
647 |
648 | figure(34) ; set(gcf, 'name', 'Part 3.4: Results on the validation set') ;
649 | showDeblurringResult(net, imdb, val(1:30:151)) ;
650 | ```
651 |
652 | Since the CNN is convolutional, it can be applied to arbitrarily sized images. `imdb.examples` contains a few larger examples too. The following code shows one:
653 |
654 | ```.language-matlab
655 | figure(35) ;
656 | set(gcf, 'name', 'Part 3.4: Larger example on the validation set') ;
657 | colormap gray ;
658 | subplot(1,2,1) ; imagesc(imdb.examples.blurred{1}, [-1 0]) ;
659 | axis image off ;
660 | title('CNN input') ;
661 | res = vl_simplenn(net, imdb.examples.blurred{1}) ;
662 | subplot(1,2,2) ; imagesc(res(end).x, [-1 0]) ;
663 | axis image off ;
664 | title('CNN output') ;
665 | ```
666 |
667 | > **Questions:**
668 | >
669 | > * Do you think the network is doing a good job?
670 | > * Is there any obvious difference between training and validation performance?
671 |
672 | ### Part 3.5: Learning a larger model using the GPU
673 |
674 | So far, we have trained a single small network to solve this problem. Here, we will experiment with several variants to try to improve the performance as much as possible.
675 |
676 | Before we experiment further, however, it is beneficial to switch to using a GPU. If you have a GPU and the MATLAB Parallel Computing Toolbox installed, you can run the code above on the GPU with two small changes. First, to prepare MatConvNet to use the GPU, change the first line of the script from `setup` to:
677 | ```.language-matlab
678 | setup('useGpu', true) ;
679 | ```
680 | Second, assuming that the GPU has index 1 (which is always the case if there is a single CUDA-compatible GPU in your machine), modify the training options to tell MatConvNet to use that GPU:
681 |
682 | ```.language-matlab
683 | trainOpts.expDir = 'data/text-small-gpu' ;
684 | trainOpts.gpus = [1] ;
685 | ```
686 |
687 | The code above also changes `expDir` in order to start a new experiment from scratch.
688 |
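689 | If you are unsure whether MATLAB can see a compatible GPU, the Parallel Computing Toolbox provides:
690 |
691 | ```.language-matlab
692 | gpuDeviceCount  % number of CUDA devices visible to MATLAB
693 | gpuDevice(1)    % select GPU 1 and display its properties
694 | ```
695 |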
689 | > **Task:** Test GPU-based training (if possible). How much faster does it run compared to CPU-based training?
690 |
691 | Now we are ready to experiment with different CNNs.
692 |
693 | > **Task:** Run a new experiment, this time using the `initializeLargeCNN()` function to construct a larger network.
694 |
695 | > **Questions:**
696 | >
697 | > 1. How much slower is this network compared to the small model?
698 | > 2. What about the quantitative performance on the validation set?
699 | > 3. What about the qualitative performance?
700 |
701 | ### Part 3.6: Challenge!
702 |
703 | You are now in control. Play around with the model definition and try to improve the performance as much as possible. For example:
704 |
705 | * Try adding more layers[^goingdeeper].
706 | * Try adding more filters.
707 | * Try a different loss function, such as $L^1$.
708 | * Try increasing the receptive field size by increasing the filter support (do not forget to adjust the padding).
709 | * Try sequences of rank-1 filters, such as $7 \times 1$ followed by $1 \times 7$ to increase the receptive field size while maintaining efficiency.
710 |
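711 | For the last suggestion, `initializeLargeCNN.m` already contains an instance of this pattern; the essential structure (with hypothetical layer names) is:
712 |
713 | ```.language-matlab
714 | net.layers{end+1} = struct(...
715 |   'name', 'conv_h', ...
716 |   'type', 'conv', ...
717 |   'weights', {xavier(1,7,32,32)}, ...
718 |   'pad', [0 0 3 3]) ;   % pad left/right only, for a 1 x 7 filter
719 | net.layers{end+1} = struct('name', 'relu_h', 'type', 'relu') ;
720 | net.layers{end+1} = struct(...
721 |   'name', 'conv_v', ...
722 |   'type', 'conv', ...
723 |   'weights', {xavier(7,1,32,32)}, ...
724 |   'pad', [3 3 0 0]) ;   % pad top/bottom only, for a 7 x 1 filter
725 | ```
726 |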
711 | And, of course, make sure to beat the other students.
712 |
713 | > **Remark:** You can see the relative change of the network weights by setting `trainOpts.plotDiagnostics = true ;`
714 |
715 | ## Links and further work
716 |
717 | * The code for this practical is written using the software package [MatConvNet](http://www.vlfeat.org/matconvnet). This is a software library written in MATLAB, C++, and CUDA, and its source code is freely available.
718 |
719 | * MatConvNet can train complex computer vision models, such as VGG VD and Inception. Several of these pre-trained models, along with a few cool demos, are available for download.
720 |
721 | * Many more computer vision practicals are available [here](https://www.robots.ox.ac.uk/~vgg/practicals/overview/index.html).
722 |
723 | ## Acknowledgements
724 |
725 | * NVIDIA and MathWorks for valuable help setting up the development environment for this tutorial.
726 |
727 |
728 | ## iV&L Summer School instructions
729 |
730 | Connect to Qwiklabs as you have been instructed. Press the `Select` button for the MatConvNet Lab:
731 |
732 | 
733 |
734 | Press the `Start Lab` button:
735 |
736 | 
737 |
738 | Wait for the progress bar to finish (this may take one or two minutes):
739 |
740 | 
741 |
742 | Click the `lab instructions` link and follow the rest of the instructions:
743 |
744 | 
745 |
746 | Once MATLAB is started, continue from [the top](#getting).
747 |
748 | ### If shortcuts in MATLAB do not work properly
749 | By default, MATLAB on Linux systems uses Emacs-style keyboard shortcuts. To change to the more familiar Windows-style shortcuts:
750 |
751 | * Type `preferences` in MATLAB Command Window.
752 | * Pick `Keyboard -> Shortcuts`
753 | * Change `Active settings` to `Windows Default Set`
754 | 
755 | * Click `OK` to apply and close the settings window
756 |
757 | ### If the screen is too small or too large
758 |
759 | If you are running the practical through a VNC-based remote desktop connection, you can try adjusting the resolution by changing the setting in the OS (click on the big Ubuntu button on the top left and search for `Displays`).
760 |
761 | ## History
762 |
763 | * Used in the [IV & L Net](http://ivl-net.eu/ivl-net-training-school-2016/) summer school, Malta, 2016.
764 |
765 | [^convolution]: If you are familiar with convolution as defined in mathematics and signal processing, you might expect to find the index $i-u$ instead of $i+u$ in this expression. The convention $i+u$, commonly used in CNNs, is usually called *correlation* instead.
766 |
767 | [^derivative]: The derivative is computed with respect to a certain assignment $x_0$ and $(w_1,\dots,w_L)$ to the network input and parameters; furthermore, the intermediate derivatives are computed at points $x_1,\dots,x_L$ obtained by evaluating the network at $x_0$.
768 |
769 | [^stacking]: The stacking operator $\vv$ simply unfolds a tensor in a vector by stacking its elements in some pre-defined order. For example:
770 | $$
771 | \vv\begin{bmatrix}
772 | 1 & 3 & 5\\
773 | 2 & 4 & 6
774 | \end{bmatrix}=\begin{bmatrix}
775 | 1\\2\\3\\4\\5\\6
776 | \end{bmatrix}
777 | $$
778 |
779 | [^goingdeeper]: Deeper networks, obtained by stacking more layers, have generally been found to perform better in many vision applications.
780 |
--------------------------------------------------------------------------------
/exercise1.m:
--------------------------------------------------------------------------------
1 | setup() ;
2 |
3 | %% Part 1.1: convolution
4 |
5 | %% Part 1.1.1: convolution by a single filter
6 |
7 | % Load an image and convert it to gray scale and single precision
8 | x = im2single(rgb2gray(imread('data/ray.jpg'))) ;
9 |
10 | % Define a filter
11 | w = single([
12 | 0 -1 0
13 | -1 4 -1
14 | 0 -1 0]) ;
15 |
16 | % Apply the filter to the image
17 | y = vl_nnconv(x, w, []) ;
18 |
19 | % Visualize the results
20 | figure(11) ; clf ; colormap gray ;
21 | set(gcf, 'name', 'Part 1.1: convolution') ;
22 |
23 | subplot(2,2,1) ;
24 | imagesc(x) ;
25 | axis off image ;
26 | title('Input image x') ;
27 |
28 | subplot(2,2,2) ;
29 | imagesc(w) ;
30 | axis off image ;
31 | title('Filter w') ;
32 |
33 | subplot(2,2,3) ;
34 | imagesc(y) ;
35 | axis off image ;
36 | title('Output image y') ;
37 |
38 | %% Part 1.1.2: convolution by a bank of filters
39 |
40 | % Concatenate three filters in a bank
41 | w1 = single([
42 | 0 -1 0
43 | -1 4 -1
44 | 0 -1 0]) ;
45 |
46 | w2 = single([
47 | -1 0 +1
48 | -1 0 +1
49 | -1 0 +1]) ;
50 |
51 | w3 = single([
52 | -1 -1 -1
53 | 0 0 0
54 | +1 +1 +1]) ;
55 |
56 | wbank = cat(4, w1, w2, w3) ;
57 |
58 | % Apply convolution
59 | y = vl_nnconv(x, wbank, []) ;
60 |
61 | % Show feature channels
62 | figure(12) ; clf('reset') ;
63 | set(gcf, 'name', 'Part 1.1.2: channels') ;
64 | colormap gray ;
65 | showFeatureChannels(y) ;
66 |
67 | %% Part 1.1.3: convolving a batch of images
68 |
69 | x1 = im2single(rgb2gray(imread('data/ray.jpg'))) ;
70 | x2 = im2single(rgb2gray(imread('data/crab.jpg'))) ;
71 | x = cat(4, x1, x2) ;
72 |
73 | y = vl_nnconv(x, wbank, []) ;
74 |
75 | figure(13) ; clf('reset') ; colormap gray ;
76 | set(gcf, 'name', 'Part 1.1.3: filtering a batch') ;
77 |
78 | subplot(4,2,1) ; imagesc(x1) ; axis off image ;
79 | subplot(4,2,3) ; imagesc(y(:,:,1,1)) ; axis off image ;
80 | subplot(4,2,5) ; imagesc(y(:,:,2,1)) ; axis off image ;
81 | subplot(4,2,7) ; imagesc(y(:,:,3,1)) ; axis off image ;
82 |
83 | subplot(4,2,2) ; imagesc(x2) ; axis off image ;
84 | subplot(4,2,4) ; imagesc(y(:,:,1,2)) ; axis off image ;
85 | subplot(4,2,6) ; imagesc(y(:,:,2,2)) ; axis off image ;
86 | subplot(4,2,8) ; imagesc(y(:,:,3,2)) ; axis off image ;
87 |
88 | %% Part 1.2: non-linear activation functions (ReLU)
89 |
90 | %% Part 1.2.1: Laplacian and ReLU
91 | x = im2single(rgb2gray(imread('data/ray.jpg'))) ;
92 |
93 | % Convolve with the negated Laplacian
94 | y = vl_nnconv(x, - w, []) ;
95 |
96 | % Apply the ReLU operator
97 | z = vl_nnrelu(y) ;
98 |
99 | figure(14) ; clf ; set(gcf, 'name', 'Part 1.2.1: Laplacian and ReLU') ;
100 | colormap gray ;
101 | subplot(2,2,1); imagesc(x) ; axis off image ; title('Image x') ;
102 | subplot(2,2,2); imagesc(y) ; axis off image ; title('Laplacian y') ;
103 | subplot(2,2,3); imagesc(z) ; axis off image ; title('ReLU z') ;
104 |
105 | %% Part 1.2.2: effect of adding a bias
106 |
107 | bias = single(- 0.2) ;
108 | y = vl_nnconv(x, - w, bias) ;
109 | z = vl_nnrelu(y) ;
110 |
111 | figure(15) ; clf ; set(gcf, 'name', 'Part 1.2.2: adding a bias') ;
112 | colormap gray ;
113 | subplot(2,2,1); imagesc(x) ; axis off image ; title('Image x') ;
114 | subplot(2,2,2); imagesc(y) ; axis off image ; title('Laplacian y with bias') ;
115 | subplot(2,2,3); imagesc(z) ; axis off image ; title('ReLU z') ;
116 |
--------------------------------------------------------------------------------
/exercise2.m:
--------------------------------------------------------------------------------
1 | setup() ;
2 |
3 | %% Part 2.1: Backward mode verification
4 |
5 | % Create a random input image batch
6 | x = randn(10, 10, 1, 2, 'single') ;
7 |
8 | % Define a filter
9 | w = single([
10 | 0 -1 0
11 | -1 4 -1
12 | 0 -1 0]) ;
13 |
14 | % Forward mode: evaluate the convolution
15 | y = vl_nnconv(x, w, []) ;
16 |
17 | % Pick a random projection tensor
18 | p = randn(size(y), 'single') ;
19 |
20 | % Backward mode: projected derivatives
21 | [dx,dw] = vl_nnconv(x, w, [], p) ;
22 |
23 | % Check the derivative numerically
24 | figure(21) ; clf('reset') ;
25 | set(gcf, 'name', 'Part 2.1: single layer backprop') ;
26 | checkDerivativeNumerically(@(x) proj(p, vl_nnconv(x, w, [])), x, dx) ;
27 |
28 | %% Part 2.2: Backpropagation
29 |
30 | % Create a random input image batch
31 | x = randn(10, 10, 1, 2, 'single') ;
32 |
33 | % Forward mode: evaluate the conv followed by ReLU
34 | y = vl_nnconv(x, w, []) ;
35 | z = vl_nnrelu(y) ;
36 |
37 | % Pick a random projection tensor
38 | p = randn(size(z), 'single') ;
39 |
40 | % Backward mode: projected derivatives
41 | dy = vl_nnrelu(z, p) ;
42 | [dx,dw] = vl_nnconv(x, w, [], dy) ;
43 |
44 | % Check the derivative numerically
45 | figure(22) ; clf('reset') ;
46 | set(gcf, 'name', 'Part 2.2: two layers backprop') ;
47 | func = @(x) proj(p, vl_nnrelu(vl_nnconv(x, w, []))) ;
48 | checkDerivativeNumerically(func, x, dx) ;
49 |
50 | %% Part 2.3: Design and verify your own layer
51 |
52 | x0 = randn(size(x), 'single') ;
53 |
54 | forward = @l2LossForward; backward = @l2LossBackward ;
55 |
56 | % Uncomment the following line to test your L1 loss implementation
57 | % forward = @l1LossForward; backward = @l1LossBackward ;
58 |
59 | y = forward(x, x0) ;
60 |
61 | p = randn(size(y), 'single') ;
62 | dx = backward(x, x0, p) ;
63 |
64 | % Check the derivative numerically
65 | figure(23) ; clf('reset') ;
66 | set(gcf, 'name', 'Part 2.3: custom loss layer') ;
67 | func = @(x) proj(p, forward(x, x0)) ;
68 | checkDerivativeNumerically(func, x, dx) ;
69 |
--------------------------------------------------------------------------------
/exercise3.m:
--------------------------------------------------------------------------------
1 | setup() ;
2 | % setup('useGpu', true) ; % Uncomment to initialize with GPU support
3 |
4 | %% Part 3.1: Prepare the data
5 |
6 | % Load a database of blurred images to train from
7 | imdb = load('data/text_imdb.mat') ;
8 |
9 | % Visualize the first image in the database
10 | figure(31) ; set(gcf, 'name', 'Part 3.1: Data') ; clf ;
11 |
12 | subplot(1,2,1) ; imagesc(imdb.images.data(:,:,:,1)) ;
13 | axis off image ; title('Input (blurred)') ;
14 |
15 | subplot(1,2,2) ; imagesc(imdb.images.label(:,:,:,1)) ;
16 | axis off image ; title('Desired output (sharp)') ;
17 |
18 | colormap gray ;
19 |
20 | %% Part 3.2: Create a network architecture
21 | %
22 | % The expected input size (a single 64 x 64 x 1 image patch). This is
23 | % used for visualization purposes.
24 |
25 | net = initializeSmallCNN() ;
26 | %net = initializeLargeCNN() ;
27 |
28 | % Display network
29 | vl_simplenn_display(net) ;
30 |
31 | % Evaluate network on an image
32 | res = vl_simplenn(net, imdb.images.data(:,:,:,1)) ;
33 |
34 | figure(32) ; clf ; colormap gray ;
35 | set(gcf,'name', 'Part 3.2: network input and output') ;
36 | subplot(1,2,1) ;
37 | imagesc(res(1).x) ; axis image off ;
38 | title('CNN input') ;
39 |
41 | subplot(1,2,2) ;
42 | imagesc(res(end).x) ; axis image off ;
43 | title('CNN output (not trained yet)') ;
44 |
45 | %% Part 3.3: learn the model
46 |
47 | % Add a loss (using a custom layer)
48 | net = addCustomLossLayer(net, @l2LossForward, @l2LossBackward) ;
49 |
50 | % Extra: uncomment the following line to use your implementation
51 | % of the L1 loss
52 | %net = addCustomLossLayer(net, @l1LossForward, @l1LossBackward) ;
53 |
54 | % Train
55 | trainOpts.expDir = 'data/text-small' ;
56 | trainOpts.gpus = [] ;
57 | % Uncomment for GPU training:
58 | %trainOpts.expDir = 'data/text-small-gpu' ;
59 | %trainOpts.gpus = [1] ;
60 | trainOpts.batchSize = 16 ;
61 | trainOpts.learningRate = 0.02 ;
62 | trainOpts.plotDiagnostics = false ;
63 | %trainOpts.plotDiagnostics = true ; % Uncomment to plot diagnostics
64 | trainOpts.numEpochs = 20 ;
65 | trainOpts.errorFunction = 'none' ;
66 |
67 | net = cnn_train(net, imdb, @getBatch, trainOpts) ;
68 |
69 | % Deploy: remove loss
70 | net.layers(end) = [] ;
71 |
72 | %% Part 3.4: evaluate the model
73 |
74 | train = find(imdb.images.set == 1) ;
75 | val = find(imdb.images.set == 2) ;
76 |
77 | figure(33) ; set(gcf, 'name', 'Part 3.4: Results on the training set') ;
78 | showDeblurringResult(net, imdb, train(1:30:151)) ;
79 |
80 | figure(34) ; set(gcf, 'name', 'Part 3.4: Results on the validation set') ;
81 | showDeblurringResult(net, imdb, val(1:30:151)) ;
82 |
83 | figure(35) ;
84 | set(gcf, 'name', 'Part 3.4: Larger example on the validation set') ;
85 | colormap gray ;
86 | subplot(1,2,1) ; imagesc(imdb.examples.blurred{1}, [-1, 0]) ;
87 | axis image off ;
88 | title('CNN input') ;
89 | res = vl_simplenn(net, imdb.examples.blurred{1}) ;
90 | subplot(1,2,2) ; imagesc(res(end).x, [-1, 0]) ;
91 | axis image off ;
92 | title('CNN output') ;
93 |
--------------------------------------------------------------------------------
/extra/Makefile:
--------------------------------------------------------------------------------
1 | # Build practical
2 |
3 | name ?= practical-cnn-reg
4 | ver ?= 2016a
5 |
6 | code=\
7 | checkDerivativeNumerically.m \
8 | l1LossBackward.m \
9 | l1LossForward.m \
10 | l2LossBackward.m \
11 | l2LossForward.m \
12 | exercise1.m \
13 | exercise2.m \
14 | exercise3.m \
15 | getBatch.m \
16 | addCustomLossLayer.m \
17 | initializeLargeCNN.m \
18 | initializeSmallCNN.m \
19 | proj.m \
20 | setup.m \
21 | showDeblurringResult.m \
22 | showFeatureChannels.m \
23 | xavier.m \
24 | README.md \
25 | matconvnet
26 |
27 | doc=\
28 | doc/images \
29 | doc/instructions.html \
30 | doc/base.css \
31 | doc/prism.css \
32 | doc/prism.js
33 |
34 | data=\
35 | data/text_imdb.mat \
36 | data/ray.jpg \
37 | data/crab.jpg
38 |
39 | include extra/practical/Makefile
40 |
41 | BINPACK_URL=http://www.robots.ox.ac.uk/~karel/project/matconvnet/bin
42 |
43 | %-bin: $(TMPDIR)/matconvnet-%-bin.tar.gz
44 | tar xzvf $< -C ./matconvnet
45 |
46 | $(TMPDIR)/matconvnet-%-bin.tar.gz:
47 | wget $(BINPACK_URL)/$(notdir $@) -O $@
48 |
49 | # Preprocessing
50 | .PHONY: preproc bins
51 |
52 | papers_url=http://www.robots.ox.ac.uk/~vedaldi/assets/pubs
53 | papers=\
54 | blaschko10simultaneous.pdf \
55 | chatfield11devil.pdf \
56 | chatfield14return.pdf \
57 | cimpoi14describing.pdf \
58 | cimpoi15deep.pdf \
59 | cimpoi16deep.pdf \
60 | fulkerson08localizing.pdf \
61 | fulkerson09class.pdf \
62 | jaderberg14deep.pdf \
63 | jaderberg14speeding.pdf \
64 | jaderberg14synthetic.pdf \
65 | jaderberg15reading.pdf \
66 | jones07inertial.pdf \
67 | juneja13blocks.pdf \
68 | lempitsky11pylon.pdf \
69 | lenc15rcnn.pdf \
70 | lenc15understanding.pdf \
71 | mahendran15understanding.pdf \
72 | parizi15automatic.pdf \
73 | parkhi11truth.pdf \
74 | parkhi12cat.pdf \
75 | parkhi12spotting.pdf \
76 | parkhi14compact.pdf \
77 | parkhi15deep.pdf \
78 | pedersoli11coarse.pdf \
79 | pedersoli14coarse.pdf \
80 | rabinovich07objects.pdf \
81 | simonyan12descriptor.pdf \
82 | simonyan13deep.pdf \
83 | simonyan13fisher.pdf \
84 | simonyan14deep.pdf \
85 | simonyan14learning.pdf \
86 | sreekanth10generalized.pdf \
87 | vedaldi05features.pdf \
88 | vedaldi05kalmansac.pdf \
89 | vedaldi05TRviewpoint.pdf \
90 | vedaldi06local.pdf \
91 | vedaldi06viewpoint.pdf \
92 | vedaldi07boosting.pdf \
93 | vedaldi07complexity.pdf \
94 | vedaldi07moving-orig.pdf \
95 | vedaldi07moving.pdf \
96 | vedaldi07open.pdf \
97 | vedaldi08joint.pdf \
98 | vedaldi08quick.pdf \
99 | vedaldi08relaxed.pdf \
100 | vedaldi09multiple.pdf \
101 | vedaldi09structured.pdf \
102 | vedaldi10efficient.pdf \
103 | vedaldi10knowing.pdf \
104 | vedaldi10vlfeat.pdf \
105 | vedaldi11efficient.pdf \
106 | vedaldi11learning.pdf \
107 | vedaldi12self.pdf \
108 | vedaldi12sparse.pdf \
109 | vedaldi14understanding.pdf \
110 | vedaldi15matconvnet.pdf
111 |
112 | papers:=$(addprefix data/text/,$(papers))
113 | papers_crops=$(papers:%.pdf=%.png)
114 |
115 | preproc: $(papers) $(papers_crops)
116 |
117 | bins: win7-bin maci64-bin glnxa64-bin
118 | rm -f matlab/mex/*.mex*
119 |
120 | ec2bins: ec2-bin
121 | rm -f matlab/mex/*.mex*
122 |
123 | data/text/%.pdf:
124 | mkdir -p data/text ; \
125 | wget "$(papers_url)/$(*).pdf" -O "$(@)"
126 |
127 | %.png : %.pdf extra/Makefile
128 | convert \
129 | -verbose \
130 | -units PixelsPerInch \
131 | -density 300 \
132 | "$(<)"[2] \
133 | -colorspace 'rgb' \
134 | -flatten \
135 | -resize 'x1536' \
136 | -gravity center -crop 512x512+0+0 \
137 | "$(@)"
138 |
139 | info:
140 | @echo Images: "$(papers)"
141 |
--------------------------------------------------------------------------------
/extra/getBlurredImagesData.m:
--------------------------------------------------------------------------------
1 | function imdb = getBlurredImagesData(dataDir)
2 | %GETBLURREDIMAGESDATA Get the data for the text deblurring exercise
3 | % IMDB = GETBLURREDIMAGESDATA(DATADIR) reads a directory of PNG
4 | % images DATADIR and returns a corresponding IMDB structure.
5 |
6 | imdb.images.id = {} ;
7 | imdb.images.data = {} ;
8 | imdb.images.set = {} ;
9 | imdb.images.label = {} ;
10 |
11 | names = dir(fullfile(dataDir, '*.png')) ;
12 | names = {names.name} ;
13 |
14 | numCollected = 0 ;
15 |
16 | for i = 1:numel(names)
17 | im = imread(fullfile(dataDir, names{i})) ;
18 | im = im2single(im) ;
19 | if size(im,3) > 1, im = rgb2gray(im) ; end
20 | im = im - 1 ; % make white = 0
21 | label = im ;
22 |
23 | G = fspecial('gaussian', [5 5], 2);
24 | im = imfilter(label,G,'same') ;
25 | s = 1 + (i > numel(names)*.75) ; % last 25% of the images go to the validation set (set = 2)
26 |
27 | if s == 2 && numCollected < 10
28 | numCollected = numCollected + 1 ;
29 | imdb.examples.sharp{numCollected} = label ;
30 | imdb.examples.blurred{numCollected} = im ;
31 | end
32 |
33 | % further break each image into 64 x 64 tiles
34 | % (tile indices ti, tj are kept distinct from the outer image index i)
35 | for ti = 0:7
36 | for tj = 0:7
37 | si = ti*64 + (1:64) ;
38 | sj = tj*64 + (1:64) ;
38 | im_ = im(si,sj) ;
39 | label_ = label(si,sj) ;
40 | % drop if nothing in the patch
41 | if std(im_(:)) < 0.05, continue ; end
42 | imdb.images.id{end+1} = numel(imdb.images.id) + 1 ;
43 | imdb.images.set{end+1} = s ;
44 | imdb.images.label{end+1} = label_ ;
45 | imdb.images.data{end+1} = im_ ;
46 | end
47 | end
48 | end
49 |
50 | imdb.images.id = horzcat(imdb.images.id{:}) ;
51 | imdb.images.set = horzcat(imdb.images.set{:}) ;
52 | imdb.images.label = cat(4, imdb.images.label{:}) ;
53 | imdb.images.data = cat(4, imdb.images.data{:}) ;
54 |
--------------------------------------------------------------------------------
/extra/post.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # A version with standard binaries
4 | make -f extra/Makefile distclean
5 | make -f extra/Makefile bins
6 | make -f extra/Makefile pack post post-doc
7 |
8 | # A version with EC2 binaries
9 | name=practical-cnn-reg-ec2 make -f extra/Makefile distclean
10 | name=practical-cnn-reg-ec2 make -f extra/Makefile ec2bins
11 | name=practical-cnn-reg-ec2 make -f extra/Makefile pack post
12 |
--------------------------------------------------------------------------------
/extra/preprocess.m:
--------------------------------------------------------------------------------
1 | function preprocess()
2 | % Run the Makefile first
3 |
4 | opts.dataDir = 'data/text/' ;
5 | opts.imdbPath = 'data/text_imdb.mat' ;
6 |
7 | setup() ;
8 |
9 | if ~exist(opts.imdbPath, 'file')
10 | imdb = getBlurredImagesData(opts.dataDir) ;
11 | save(opts.imdbPath, '-struct', 'imdb') ;
12 | end
13 |
14 |
--------------------------------------------------------------------------------
/getBatch.m:
--------------------------------------------------------------------------------
1 | function [im, label] = getBatch(imdb, batch)
2 | %GETBATCH Get a batch of training data
3 | % [IM, LABEL] = GETBATCH(IMDB, BATCH) extracts the images IM
4 | % and labels LABEL from IMDB according to the list of images
5 | % BATCH.
6 |
7 | im = imdb.images.data(:,:,:,batch) ;
8 | label = imdb.images.label(:,:,:,batch) ;
9 |
--------------------------------------------------------------------------------
/initializeLargeCNN.m:
--------------------------------------------------------------------------------
1 | function net = initializeLargeCNN()
2 | %INITIALIZELARGECNN Initialize a large CNN for text deblurring
3 | % NET = INITIALIZELARGECNN() returns the SimpleNN model NET.
4 |
5 | net.meta.inputSize = [64 64 1 1] ;
6 |
7 | net.layers = { } ;
8 |
9 | net.layers{end+1} = struct(...
10 | 'name', 'conv1', ...
11 | 'type', 'conv', ...
12 | 'weights', {xavier(5,5,1,32)}, ...
13 | 'pad', 2, ...
14 | 'learningRate', [1 1], ...
15 | 'weightDecay', [1 0]) ;
16 |
17 | net.layers{end+1} = struct(...
18 | 'name', 'relu1', ...
19 | 'type', 'relu') ;
20 |
21 | net.layers{end+1} = struct(...
22 | 'name', 'conv2', ...
23 | 'type', 'conv', ...
24 | 'weights', {xavier(3,3,32,32)}, ...
25 | 'pad', 1, ...
26 | 'learningRate', [1 1], ...
27 | 'weightDecay', [1 0]) ;
28 |
29 | net.layers{end+1} = struct(...
30 | 'name', 'relu2', ...
31 | 'type', 'relu') ;
32 |
33 | net.layers{end+1} = struct(...
34 | 'name', 'conv3', ...
35 | 'type', 'conv', ...
36 | 'weights', {xavier(1,7,32,32)}, ...
37 | 'pad', [0 0 3 3], ...
38 | 'learningRate', [1 1], ...
39 | 'weightDecay', [1 0]) ;
40 |
41 | net.layers{end+1} = struct(...
42 | 'name', 'relu3', ...
43 | 'type', 'relu') ;
44 |
45 | net.layers{end+1} = struct(...
46 | 'name', 'conv4', ...
47 | 'type', 'conv', ...
48 | 'weights', {xavier(7,1,32,32)}, ...
49 | 'pad', [3 3 0 0], ...
50 | 'learningRate', [1 1], ...
51 | 'weightDecay', [1 0]) ;
52 |
53 | net.layers{end+1} = struct(...
54 | 'name', 'relu4', ...
55 | 'type', 'relu') ;
56 |
57 | net.layers{end+1} = struct(...
58 | 'name', 'prediction', ...
59 | 'type', 'conv', ...
60 | 'weights', {xavier(3,3,32,1)}, ...
61 | 'pad', 1, ...
62 | 'stride', 1, ...
63 | 'learningRate', [1 .001], ...
64 | 'weightDecay', [1 0]) ;
65 |
66 | % Consolidate the network, fixing any missing option
67 | % in the specification above.
68 |
69 | net = vl_simplenn_tidy(net) ;
70 |
--------------------------------------------------------------------------------
/initializeSmallCNN.m:
--------------------------------------------------------------------------------
1 | function net = initializeSmallCNN()
2 | %INITIALIZESMALLCNN Initialize a small CNN for text deblurring
3 | % NET = INITIALIZESMALLCNN() returns the SimpleNN model NET.
4 |
5 | net.meta.inputSize = [64 64 1 1] ;
6 |
7 | net.layers = { } ;
8 |
9 | net.layers{end+1} = struct(...
10 | 'name', 'conv1', ...
11 | 'type', 'conv', ...
12 | 'weights', {xavier(3,3,1,32)}, ...
13 | 'pad', 1, ...
14 | 'stride', 1, ...
15 | 'learningRate', [1 1], ...
16 | 'weightDecay', [1 0]) ;
17 |
18 | net.layers{end+1} = struct(...
19 | 'name', 'relu1', ...
20 | 'type', 'relu') ;
21 |
22 | net.layers{end+1} = struct(...
23 | 'name', 'conv2', ...
24 | 'type', 'conv', ...
25 | 'weights', {xavier(3,3,32,32)}, ...
26 | 'pad', 1, ...
27 | 'stride', 1, ...
28 | 'learningRate', [1 1], ...
29 | 'weightDecay', [1 0]) ;
30 |
31 | net.layers{end+1} = struct(...
32 | 'name', 'relu2', ...
33 | 'type', 'relu') ;
34 |
35 | net.layers{end+1} = struct(...
36 | 'name', 'prediction', ...
37 | 'type', 'conv', ...
38 | 'weights', {xavier(3,3,32,1)}, ...
39 | 'pad', 1, ...
40 | 'stride', 1, ...
41 | 'learningRate', [1 1], ...
42 | 'weightDecay', [1 0]) ;
43 |
44 | % Consolidate the network, fixing any missing option
45 | % in the specification above
46 |
47 | net = vl_simplenn_tidy(net) ;
--------------------------------------------------------------------------------
/l1LossBackward.m:
--------------------------------------------------------------------------------
1 | function dx = l1LossBackward(x,r,p)
2 | % TODO: Replace the following line with your implementation
3 | dx = rand(size(x), 'like', x) ;
4 |
5 | dx = dx / (size(x,1) * size(x,2)) ; % normalize by image size
6 |
--------------------------------------------------------------------------------
/l1LossForward.m:
--------------------------------------------------------------------------------
1 | function y = l1LossForward(x,r)
2 | % TODO: Replace the following line with your implementation
3 | y = rand(size(x), 'like', x) ;
4 |
5 | y = y / (size(x,1) * size(x,2)) ; % normalize by image size
6 |
--------------------------------------------------------------------------------
/l2LossBackward.m:
--------------------------------------------------------------------------------
1 | function dx = l2LossBackward(x,r,p)
2 | dx = 2 * p * (x - r) ;
3 | dx = dx / (size(x,1) * size(x,2)) ; % normalize by image size
4 |
--------------------------------------------------------------------------------
/l2LossForward.m:
--------------------------------------------------------------------------------
1 | function y = l2LossForward(x,r)
2 | delta = x - r ;
3 | y = sum(delta(:).^2) ;
4 | y = y / (size(x,1) * size(x,2)) ; % normalize by image size
5 |
--------------------------------------------------------------------------------
/proj.m:
--------------------------------------------------------------------------------
1 | function z = proj(x,p)
2 | %PROJ Project a tensor onto another
3 | % Z = PROJ(X,P) computes the projection Z of tensor X onto P.
4 | %
5 | % Remark: if X and P contain multiple tensor instances
6 | % (concatenated along the fourth dimension), their individual
7 | % projections are accumulated into the single scalar Z.
8 |
9 | prods = x .* p ;
10 | z = sum(prods(:)) ;
11 |
--------------------------------------------------------------------------------
/setup.m:
--------------------------------------------------------------------------------
1 | function setup(varargin)
2 | %SETUP() Initialize the practical
3 | % SETUP() initializes the practical. SETUP('useGpu', true) does
4 | % the same, but compiles the GPU support as well.
5 |
6 | base = fileparts(mfilename('fullpath')) ;
7 | run(fullfile(base, 'matconvnet', 'matlab', 'vl_setupnn')) ;
8 |
9 | opts.useGpu = false ;
10 | opts.verbose = false ;
11 | opts = vl_argparse(opts, varargin) ;
12 |
13 | addpath(fullfile(base, 'matconvnet', 'examples')) ;
14 |
15 | try
16 | vl_nnconv(single(1),single(1),[]) ;
17 | catch
18 | warning('VL_NNCONV() does not seem to be compiled. Trying to compile it now.') ;
19 | vl_compilenn('enableGpu', opts.useGpu, 'verbose', opts.verbose, ...
20 | 'enableImreadJpeg', false) ;
21 | end
22 |
23 | if opts.useGpu
24 | try
25 | vl_nnconv(gpuArray(single(1)),gpuArray(single(1)),[]) ;
26 | catch
27 | warning('GPU support does not seem to be compiled in MatConvNet. Trying to compile it now.') ;
28 | vl_compilenn('enableGpu', opts.useGpu, 'verbose', opts.verbose, ...
29 | 'enableImreadJpeg', false) ;
30 | end
31 | end
32 |
33 | if verLessThan('matlab','7.12')
34 | % MATLAB R2010b did not have rng()
35 | randn('state',0) ;
36 | rand('state',0) ;
37 | else
38 | rng(0) ;
39 | end
40 |
41 | % The EC2 has incorrect screen size which
42 | % leads to a tiny font in figures
43 |
44 | [~, hostname] = system('hostname') ;
45 | if strcmp(hostname(1:3), 'ip-')
46 | set(0, 'DefaultAxesFontSize', 30) ;
47 | end
48 |
49 |
50 |
--------------------------------------------------------------------------------
/showDeblurringResult.m:
--------------------------------------------------------------------------------
1 | function showDeblurringResult(net, imdb, subset)
2 | %SHOWDEBLURRINGRESULT Show a few examples of deblurred images
3 | % SHOWDEBLURRINGRESULT(NET, IMDB, SUBSET) uses the CNN NET to
4 | % deblur a few images in the IMDB database and visualize the result
5 | % in a figure. SUBSET is a vector of image indexes to display.
6 |
7 | % Evaluate the CNN to obtain deblurring results
8 | res = vl_simplenn(net, imdb.images.data(:,:,:,subset)) ;
9 | preds = res(end).x ;
10 |
11 | % Visualize the results in a figure
12 | clf ;
13 | n = numel(subset) ;
14 | for i = 1 : n
15 | j = subset(i) ;
16 | subplot(n,3,1+3*(i-1)) ;
17 | imagesc(imdb.images.data(:,:,:,j),[-1 0]) ;
18 | axis off image ; title('original') ;
19 | subplot(n,3,2+3*(i-1)) ;
20 | imagesc(imdb.images.label(:,:,:,j),[-1 0]) ;
21 | axis off image ; title('expected') ;
22 | subplot(n,3,3+3*(i-1)) ;
23 | imagesc(preds(:,:,:,i),[-1 0]) ;
24 | axis off image ; title('achieved') ;
25 | end
26 | colormap gray ;
27 |
--------------------------------------------------------------------------------
/showFeatureChannels.m:
--------------------------------------------------------------------------------
1 | function showFeatureChannels(x)
2 | %SHOWFEATURECHANNELS Display the feature channels in the tensor x
3 |
4 | k = size(x,3) ;
5 | n = ceil(sqrt(k)) ;
6 | m = ceil(k/n) ;
7 |
8 | for i = 1:k
9 | subplot(m,n,i) ; imagesc(x(:,:,i)) ;
10 | title(sprintf('feature channel %d',i)) ; axis image ;
11 | end
--------------------------------------------------------------------------------
/xavier.m:
--------------------------------------------------------------------------------
1 | function weights = xavier(varargin)
2 | %XAVIER Xavier filter initialization.
3 | % WEIGHTS = XAVIER(H, W, C, N) initializes N filters of support H x
4 | % W and C channels using the Xavier method. WEIGHTS = {FILTERS, BIASES} is
5 | % a cell array containing both filters and biases.
6 | %
7 | % See also:
8 | % Glorot, Xavier, and Yoshua Bengio.
9 | % "Understanding the difficulty of training deep feedforward neural networks."
10 | % International conference on artificial intelligence and statistics. 2010.
11 |
12 | filterSize = [varargin{:}] ;
13 | scale = sqrt(2/prod(filterSize(1:3))) ; % scale inversely with sqrt of the filter fan-in (H*W*C)
14 | filters = randn(filterSize, 'single') * scale ;
15 | biases = zeros(filterSize(4),1,'single') ;
16 | weights = {filters, biases} ;
17 |
--------------------------------------------------------------------------------