├── .gitignore
├── .gitmodules
├── README.md
├── addCustomLossLayer.m
├── checkDerivativeNumerically.m
├── data
│   ├── crab.jpg
│   └── ray.jpg
├── doc
│   ├── images
│   │   ├── conv.png
│   │   ├── conv.svg
│   │   ├── cover.png
│   │   ├── mathworks_logo.png
│   │   ├── matlab_set_shortcuts.svg
│   │   ├── nvidia_logo.svg
│   │   ├── oxford.png
│   │   ├── step1.png
│   │   ├── step2.png
│   │   ├── step3.png
│   │   ├── step4.png
│   │   └── text.png
│   ├── instructions.html
│   └── instructions.md
├── exercise1.m
├── exercise2.m
├── exercise3.m
├── extra
│   ├── Makefile
│   ├── getBlurredImagesData.m
│   ├── post.sh
│   └── preprocess.m
├── getBatch.m
├── initializeLargeCNN.m
├── initializeSmallCNN.m
├── l1LossBackward.m
├── l1LossForward.m
├── l2LossBackward.m
├── l2LossForward.m
├── proj.m
├── setup.m
├── showDeblurringResult.m
├── showFeatureChannels.m
└── xavier.m

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
data
data/
local
local/
base.css
doc/prism.css
doc/prism.js

--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "extra/practical"]
	path = extra/practical
	url = git@github.com:vedaldi/practical.git
[submodule "matconvnet"]
	path = matconvnet
	url = git@github.com:vlfeat/matconvnet.git

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Convolutional neural network practical (2)
==========================================

A computer vision practical by the Oxford Visual Geometry group,
authored by Andrea Vedaldi, Karel Lenc, and Joao Henriques.

Start from `doc/instructions.html`.

> Note that this practical requires compiling the (included)
> MatConvNet library. This should happen automatically (see the
> `setup.m` script), but make sure that the compilation succeeds on
> the laboratory computers.

Package contents
----------------

The practical consists of three exercises, organized in the following
files:

* `exercise1.m` -- Part 1: Building blocks: convolution and ReLU
* `exercise2.m` -- Part 2: Derivatives and backpropagation
* `exercise3.m` -- Part 3: Learning a CNN for text deblurring

The practical runs in MATLAB and uses
[MatConvNet](http://www.vlfeat.org/matconvnet). This package contains
the following MATLAB functions:

* `addCustomLossLayer.m`: add a custom loss layer to a network in SimpleNN format.
* `checkDerivativeNumerically.m`: check a layer's derivatives numerically.
* `getBatch.m`: get a batch of images for training.
* `l1LossForward.m` and `l1LossBackward.m`: code (partially) implementing the L1 loss layer.
* `l2LossForward.m` and `l2LossBackward.m`: the L2 loss layer.
* `initializeSmallCNN.m` and `initializeLargeCNN.m`: initialize CNN models for text deblurring.
* `proj.m`: project one tensor onto another.
* `setup.m`: set up the MATLAB environment.
* `showDeblurringResult.m`: show results for the deblurring network.
* `showFeatureChannels.m`: show the feature channels in a tensor.
* `xavier.m`: Xavier initialization of the network weights.

Appendix: Installing from scratch
---------------------------------

The practical requires both VLFeat and MatConvNet. VLFeat comes with
pre-built binaries, but MatConvNet does not.

0. Set the current directory to the practical base directory.
1. From Bash:
   1. Run `git submodule update -i` to download the submodules.
   2. Run `make -f ./extra/Makefile preproc`. This will create a copy
      of the data for the practical.
2. From MATLAB run `addpath extra ; preprocess ;`. This will create
   `data/text_imdb.mat`.
3. Test the practical: from MATLAB run all the exercises in order.

Changes
-------

* *2016a* - Initial edition

License
-------

Copyright (c) 2016 Andrea Vedaldi, Karel Lenc, and Joao Henriques

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use, copy,
modify, merge, publish, distribute, sublicense, and/or sell copies
of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------
/addCustomLossLayer.m:
--------------------------------------------------------------------------------
function net = addCustomLossLayer(net, fwfun, bwfun)
%ADDCUSTOMLOSSLAYER Add a custom loss layer to a network
%   NET = ADDCUSTOMLOSSLAYER(NET, FWFUN, BWFUN) adds a custom loss
%   layer to the network NET using FWFUN for the forward pass and
%   BWFUN for the backward pass.

layer.name = 'loss' ;
layer.type = 'custom' ;
layer.forward = @forward ;
layer.backward = @backward ;
layer.class = [] ;

% Make sure that the loss layer is not added multiple times
if strcmp(net.layers{end}.name, layer.name)
  net.layers{end} = layer ;
else
  net.layers{end+1} = layer ;
end

  % Forward mode: evaluate the loss of res.x against the ground truth
  % stored in layer.class
  function res_ = forward(layer, res, res_)
    res_.x = fwfun(res.x, layer.class) ;
  end

  % Backward mode: evaluate the projected derivative with respect to the
  % layer input, given the projection res_.dzdx
  function res = backward(layer, res, res_)
    res.dzdx = bwfun(res.x, layer.class, res_.dzdx) ;
  end
end

--------------------------------------------------------------------------------
/checkDerivativeNumerically.m:
--------------------------------------------------------------------------------
function err = checkDerivativeNumerically(f, x, dx)
%CHECKDERIVATIVENUMERICALLY Check a layer's derivative numerically
%   ERR = CHECKDERIVATIVENUMERICALLY(F, X, DX) takes the scalar function F,
%   its tensor input X and its derivative DX at X and compares DX to
%   a numerical approximation of the derivative, returning their
%   difference ERR.

y = f(x) ;
dx_numerical = zeros(size(dx), 'single') ;
delta = 0.01 ;

% Approximate each entry of the derivative using finite differences
for n = 1:size(x,4)
  for k = 1:size(x,3)
    for j = 1:size(x,2)
      for i = 1:size(x,1)
        xp = x ;
        xp(i,j,k,n) = xp(i,j,k,n) + delta ;
        yp = f(xp) ;
        dx_numerical(i,j,k,n) = (yp - y) / delta ;
      end
    end
  end
end
err = dx_numerical - dx ;

% Plot the given and numerical derivatives side by side
range = max(abs(dx(:))) * [-1 1] ;
T = size(x,4) ;
for t = 1:size(x,4)
  subplot(T,3,1+(t-1)*3) ; bar3(dx(:,:,1,t)) ; zlim(range) ;
  title(sprintf('dx(:,:,1,%d) (given)',t)) ;
  subplot(T,3,2+(t-1)*3) ; bar3(dx_numerical(:,:,1,t)) ; zlim(range) ;
  title(sprintf('dx(:,:,1,%d) (numerical)',t)) ;
  subplot(T,3,3+(t-1)*3) ; bar3(abs(err(:,:,1,t))) ; zlim(range) ;
  title('absolute difference') ;
end

--------------------------------------------------------------------------------
/data/crab.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/data/crab.jpg

--------------------------------------------------------------------------------
/data/ray.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/data/ray.jpg

--------------------------------------------------------------------------------
/doc/images/conv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/conv.png

--------------------------------------------------------------------------------
/doc/images/conv.svg:
--------------------------------------------------------------------------------
[SVG figure: convolution diagram showing an H x W input x and a filter w combined (sum) into the output y]

--------------------------------------------------------------------------------
/doc/images/cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/cover.png

--------------------------------------------------------------------------------
/doc/images/mathworks_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/mathworks_logo.png

--------------------------------------------------------------------------------
/doc/images/matlab_set_shortcuts.svg:
--------------------------------------------------------------------------------
[SVG figure: annotated MATLAB preferences screenshot, with callouts "1." and "2." marking the steps for changing the keyboard shortcut set]
--------------------------------------------------------------------------------
/doc/images/nvidia_logo.svg:
--------------------------------------------------------------------------------
[SVG figure: NVIDIA logo, generated by pstoedit version 3.44 from NVBadge_2D.eps]

--------------------------------------------------------------------------------
/doc/images/oxford.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/oxford.png

--------------------------------------------------------------------------------
/doc/images/step1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/step1.png

--------------------------------------------------------------------------------
/doc/images/step2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/step2.png

--------------------------------------------------------------------------------
/doc/images/step3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/step3.png

--------------------------------------------------------------------------------
/doc/images/step4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/step4.png

--------------------------------------------------------------------------------
/doc/images/text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vedaldi/practical-cnn-reg/6772125bed475ba8564e1660a0bcdc477ab1b11c/doc/images/text.png

--------------------------------------------------------------------------------
/doc/instructions.html:
--------------------------------------------------------------------------------
VGG Practical

Oxford logo 12 | MathWorks logo 13 | NVIDIA logo

14 |

VGG CNN Practical: Image Regression

15 |

By Andrea Vedaldi, Karel Lenc, and Joao Henriques

16 |

This is an Oxford Visual Geometry Group computer vision practical (Release 2016a).

17 |

cover

18 |

Convolutional neural networks are an important class of learnable representations applicable, among others, to numerous computer vision problems. Deep CNNs, in particular, are composed of several layers of processing, each involving linear as well as non-linear operators, that are learned jointly, in an end-to-end manner, to solve a particular task. These methods are now the dominant approach for feature extraction from audiovisual and textual data.

19 |

This practical explores the basics of learning (deep) CNNs. The first part introduces typical CNN building blocks, such as ReLU units and linear filters. The second part explores backpropagation, including designing custom layers and verifying them numerically. The last part demonstrates learning a CNN for text deblurring; this differs from the usual problem of image classification and demonstrates the flexibility of these techniques.

20 |

This practical is based on MATLAB and the MatConvNet library. The practical demonstrates how easy it is to use this environment to prototype new network components and architectures. By only using familiar MATLAB syntax, you will be able to implement new layers and take advantage of the GPU for faster computation.

21 |
22 | 62 |
63 |

$$ 64 | \newcommand{\bx}{\mathbf{x}} 65 | \newcommand{\by}{\mathbf{y}} 66 | \newcommand{\bz}{\mathbf{z}} 67 | \newcommand{\bw}{\mathbf{w}} 68 | \newcommand{\bp}{\mathbf{p}} 69 | \newcommand{\cP}{\mathcal{P}} 70 | \newcommand{\cN}{\mathcal{N}} 71 | \newcommand{\vc}{\operatorname{vec}} 72 | \newcommand{\vv}{\operatorname{vec}} 73 | $$

74 |

Installation

75 |
76 |

If you are running this in the iV&L Summer School, please refer to the instructions at the end of the document.

77 |
78 |

Read and understand the requirements and installation instructions. The download links for this practical are:

* Code and data: practical-cnn-reg-2016a.tar.gz (http://www.robots.ox.ac.uk/~vgg/share/practical-cnn-reg-2016a.tar.gz)
* Code only: practical-cnn-reg-2016a-code-only.tar.gz (http://www.robots.ox.ac.uk/~vgg/share/practical-cnn-reg-2016a-code-only.tar.gz)
* Data only: practical-cnn-reg-2016a-data-only.tar.gz (http://www.robots.ox.ac.uk/~vgg/share/practical-cnn-reg-2016a-data-only.tar.gz)
* Git repository (for lab setters and developers): https://github.com/vedaldi/practical-cnn-reg

You can either unpack the archive manually, or use the following MATLAB one-liner:

86 |
untar('http://www.robots.ox.ac.uk/~vgg/share/practical-cnn-reg-2016a.tar.gz')
 87 | cd practical-cnn-reg-2016a
 88 | 
89 | 90 |

91 |

Getting started

92 |

After the installation is complete, open and edit the script exercise1.m in the MATLAB editor. The script contains commented code and a description for all steps of this exercise, for Part I of this document. You can cut and paste this code into the MATLAB window to run it, or use the shortcut Ctrl+Enter to run a code section. You will need to modify it as you go through the session. The other files, exercise2.m and exercise3.m, are given for Parts II and III.

93 |

Each part contains several Questions (that may require pen and paper) and Tasks (that require experimentation or coding) to be answered/completed before proceeding further in the practical.

94 |

Part 1: CNN building blocks

95 |

In this part we will explore two fundamental building blocks of CNNs, linear convolution and non-linear activation functions. Open exercise1.m and run up to the setup() command, which initializes the MATLAB environment to use MatConvNet.

96 |

Part 1.1: Convolution

97 |

A convolutional neural network (CNN) is a sequence of linear and non-linear convolution-like operators. The most important example of such operators is linear convolution. In this part, we will explore linear convolution and see how to use it in MatConvNet.

98 |

Recall that linear convolution applies one (or more) filters $\bw$ to an image $\bx$ as follows:

99 |

conv

100 |

Part 1.1.1: Convolution by a single filter

101 |

Start by identifying and then running the following code fragment in exercise1.m:

102 |
% Load an image and convert it to gray scale and single precision
103 | x = im2single(rgb2gray(imread('data/ray.jpg'))) ;
104 | 
105 | % Define a filter
106 | w = single([
107 |    0 -1  0
108 |   -1  4 -1
109 |    0 -1  0]) ;
110 | 
111 | % Apply the filter to the image
112 | y = vl_nnconv(x, w, []) ;
113 | 
114 | 115 |

The code loads the image data/ray.jpg and applies a linear filter to it using the linear convolution operator. The latter is implemented by the MatConvNet function vl_nnconv(). Note that all variables x, w, and y are in single precision; while MatConvNet supports double precision arithmetic too, single precision is usually preferred in applications as memory is often a bottleneck. The result can be visualized as follows:

116 |
% Visualize the results
117 | figure(11) ; clf ; colormap gray ;
118 | set(gcf, 'name', 'Part 1.1: convolution') ;
119 | 
120 | subplot(2,2,1) ;
121 | imagesc(x) ;
122 | axis off image ;
123 | title('Input image x') ;
124 | 
125 | subplot(2,2,2) ;
126 | imagesc(w) ;
127 | axis off image ;
128 | title('Filter w') ;
129 | 
130 | subplot(2,2,3) ;
131 | imagesc(y) ;
132 | axis off image ;
133 | title('Output image y') ;
134 | 
135 | 136 |
137 |

Task: Run the code above and examine the result, which should look like the following image:

138 |
139 |

cover

140 |

The input $\bx$ is an $M \times N$ matrix, which can be interpreted as a gray scale image. The filter $\bw$ is the $3 \times 3$ matrix 141 | $$ 142 | \bw = 143 | \begin{bmatrix} 144 | 0 & -1 & 0 \\ 145 | -1 & 4 & -1 \\ 146 | 0 & -1 & 0 \\ 147 | \end{bmatrix} 148 | $$ 149 | The output of the convolution is a new matrix $\by$ given by1 150 | $$ 151 | y_{ij} = \sum_{uv} w_{uv}\ x_{i+u,\ j+v} 152 | $$

153 |
154 |

Questions:

155 |
    156 |
  1. If $H \times W$ is the size of the input image, $H' \times W'$ the size of the filter, what is the size $H'' \times W''$ of the output image?
  2. 157 |
  3. The filter $\bw$ given above is a discretized Laplacian operator. Which type of visual structures (corners, bars, ...) do you think may excite this filter the most?
  4. 158 |
159 |
160 |

Part 1.1.2: Convolution by a filter bank

161 |

In neural networks, one usually operates with filter banks instead of individual filters. Each filter can be thought of as computing a different feature channel, characterizing a particular statistical property of the input image.

162 |

To see how to define and use a filter bank, create a bank of three filters as follows:

163 |
% Concatenate three filters in a bank
164 | w1 = single([
165 |    0 -1  0
166 |   -1  4 -1
167 |    0 -1  0]) ;
168 | 
169 | w2 = single([
170 |   -1 0 +1
171 |   -1 0 +1
172 |   -1 0 +1]) ;
173 | 
174 | w3 = single([
175 |   -1 -1 -1
176 |    0  0  0
177 |   +1 +1 +1]) ;
178 | 
179 | wbank = cat(4, w1, w2, w3) ;
180 | 
181 | 182 |

The first filter $\bw_1$ is the Laplacian operator seen above; two additional filters $\bw_2$ and $\bw_3$ are horizontal and vertical image derivatives, respectively. The command vl_nnconv(x, wbank, []) then applies all the filters in the bank to the input image x. Note that the output y is not just a matrix, but a 3D array (often called a tensor in the CNN jargon). This tensor has dimensions $H \times W \times K$, where $K$ is the number of feature channels.
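For example, you can check the output dimensions directly (this uses the image x and the bank wbank defined above):

y = vl_nnconv(x, wbank, []) ;
size(y)   % for an H x W input and 3 x 3 filters: (H-2) x (W-2) x 3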

183 |
184 |

Question: What is the number of feature channels $K$ in this example? Why?

185 |

Task: Run the code above and visualize the individual feature channels in the tensor y by using the provided function showFeatureChannels(). Do the channel responses make sense given the filter used to generate them?

186 |
187 |

In a CNN, not only the output tensor, but also the input tensor x and the filters wbank can have multiple feature channels. In this case, the convolution formula becomes: 188 | $$ 189 | y_{ijk} = \sum_{uvp} w_{uvpk}\ x_{i+u,\ j+v,\ p} 190 | $$

191 |
192 |

Questions:

* If the input tensor $\bx$ has $C$ feature channels, what should be the third dimension of $\bw$?
* In the code above, the command wbank = cat(4, w1, w2, w3) concatenates the tensors w1, w2, and w3 along the fourth dimension. Why is that given that filters should have three dimensions?
198 |

Part 1.1.3: Convolving a batch of images

199 |

Finally, in training CNNs it is often important to be able to work efficiently with batches of data. MatConvNet allows packing more than one instance of the tensor $\bx$ in a single MATLAB array x by stacking the different instances along the fourth dimension of the array:

200 |
x1 = im2single(rgb2gray(imread('data/ray.jpg'))) ;
201 | x2 = im2single(rgb2gray(imread('data/crab.jpg'))) ;
202 | x = cat(4, x1, x2) ;
203 | 
204 | y = vl_nnconv(x, wbank, []) ;
205 | 
206 | 207 |
208 |

Task: Run the code above and visualize the result. Convince yourself that each filter is applied to each image.
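One way to do this is to show each image in the batch in its own figure (a sketch; it assumes showFeatureChannels() accepts a single H x W x K tensor, as in Part 1.1.2):

for n = 1:size(y,4)
  figure(n) ; clf ; colormap gray ;
  showFeatureChannels(y(:,:,:,n)) ;
end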

209 |
210 |

Part 1.2: Non-linear activation (ReLU)

211 |

CNNs are obtained by composing several operators, individually called layers. In addition to convolution and other linear layers, CNNs should contain non-linear layers as well.

212 |
213 |

Question: What happens if all layers are linear?

214 |
215 |

The simplest non-linearity is given by scalar activation functions, which are applied independently to each element of a tensor. Perhaps the simplest and one of the most useful examples is the Rectified Linear Unit (ReLU) operator: 216 | $$ 217 | y_{ijk} = \max \{0, x_{ijk}\} 218 | $$ 219 | which simply cuts off any negative value in the data.

220 |

In MatConvNet, ReLU is implemented by the vl_nnrelu function. To demonstrate its use, we convolve the test image with the negated Laplacian, and then apply ReLU to the result:

221 |
% Convolve with the negated Laplacian
222 | y = vl_nnconv(x, - w, []) ;
223 | 
224 | % Apply the ReLU operator
225 | z = vl_nnrelu(y) ;
226 | 
227 | 228 |
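Note that, in plain MATLAB, the ReLU operator is simply an elementwise maximum:

z = max(y, 0) ;   % equivalent to vl_nnrelu(y)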
229 |

Task: Run this code and visualize images x, y, and z.

230 |

Questions:

231 | 235 |
236 |

ReLU has a very important effect as it implicitly sets to zero the majority of the filter responses. In a certain sense, ReLU works as a detector, with the implicit convention that a certain pattern is detected when a corresponding filter response is large enough (greater than zero).
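You can quantify this sparsifying effect directly:

nnz(z == 0) / numel(z)   % fraction of responses set to zero by the ReLU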

237 |

In practice, while signals are usually centered and therefore a threshold of zero is reasonable, there is no particular reason why this should always be appropriate. For this reason, the convolution operator allows specifying a bias term for each filter response. Let us use this term to make the response of ReLU more selective:

238 |
bias = single(- 0.2) ;
239 | y = vl_nnconv(x, - w, bias) ;
240 | z = vl_nnrelu(y) ;
241 | 
242 | 243 |

There is only one bias term because there is only one filter in the bank (note that, as for the rest of the data, bias is a single precision quantity). The bias is applied after convolution, effectively subtracting 0.2 from the filter responses. Hence, a response now survives the subsequent ReLU operator only if it is at least 0.2 after convolution.
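A quick check confirms that the bias simply shifts every filter response before the ReLU is applied:

y0 = vl_nnconv(x, - w, []) ;
y1 = vl_nnconv(x, - w, bias) ;
max(abs(y1(:) - (y0(:) + bias)))   % zero, up to numerical precision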

244 |
245 |

Task: Run this code and visualize images x, y, and z.

246 |

Question: Is the response now more selective?

247 |

Remark: There are many other building blocks used in CNNs, the most important of which is perhaps max pooling. However, convolution and ReLU can solve already many problems, as we will see in the remainder of the practical.

248 |
249 |

Part 2: Backpropagation

250 |

Training CNNs is normally done using a gradient-based optimization method. The CNN $f$ is the composition of $L$ layers $f_l$ each with parameters $\bw_l$, which in the simplest case of a chain looks like: 251 | $$ 252 | \bx_0 253 | \longrightarrow 254 | \underset{\displaystyle\underset{\displaystyle\bw_1}{\uparrow}}{\boxed{f_1}} 255 | \longrightarrow 256 | \bx_1 257 | \longrightarrow 258 | \underset{\displaystyle\underset{\displaystyle\bw_2}{\uparrow}}{\boxed{f_2}} 259 | \longrightarrow 260 | \bx_2 261 | \longrightarrow 262 | \dots 263 | \longrightarrow 264 | \bx_{L-1} 265 | \longrightarrow 266 | \underset{\displaystyle\underset{\displaystyle\bw_L}{\uparrow}}{\boxed{f_L}} 267 | \longrightarrow 268 | \bx_L 269 | $$ 270 | During learning, the last layer of the network is the loss function that should be minimized. Hence, the output $\bx_L = x_L$ of the network is a scalar quantity (a single number).

271 |

The gradient is easily computed using the chain rule. If all network variables and parameters are scalar, this is given by2:
272 | $$
273 | \frac{\partial f}{\partial w_l}(x_0;w_1,\dots,w_L)
274 | =
275 | \frac{\partial f_L}{\partial x_{L-1}}(x_{L-1};w_L) \times
276 | \cdots
277 | \times
278 | \frac{\partial f_{l+1}}{\partial x_l}(x_l;w_{l+1}) \times
279 | \frac{\partial f_{l}}{\partial w_l}(x_{l-1};w_l)
280 | $$
281 | With tensors, however, there are some complications. Consider for instance the derivative of a function $\by=f(\bx)$ where both $\by$ and $\bx$ are tensors; this is formed by taking the derivative of each scalar element in the output $\by$ with respect to each scalar element in the input $\bx$. If $\bx$ has dimensions $H \times W \times C$ and $\by$ has dimensions $H' \times W' \times C'$, then the derivative contains $HWCH'W'C'$ elements, which is often unmanageable (in the order of several GBs of memory for a single derivative).

282 |

Note that all intermediate derivatives in the chain rule may be affected by this size explosion except for the derivative of the network output that, being the loss, is a scalar.

283 |
284 |

Question: The output derivatives have the same size as the parameters in the network. Why?

285 |
286 |

Back-propagation allows computing the output derivatives in a memory-efficient manner. To see how, the first step is to generalize the equation above to tensors using a matrix notation. This is done by converting tensors into vectors by using the $\vv$ (stacking)3 operator:
287 | $$
288 | \frac{\partial \vv f}{\partial \vv^\top \bw_l}
289 | =
290 | \frac{\partial \vv f_L}{\partial \vv^\top \bx_{L-1}} \times
291 | \cdots
292 | \times
293 | \frac{\partial \vv f_{l+1}}{\partial \vv^\top \bx_l} \times
294 | \frac{\partial \vv f_{l}}{\partial \vv^\top \bw_l}
295 | $$
296 | In order to make this computation memory efficient, we project the derivative with respect to a tensor $\bp_L = 1$ as follows:
297 | $$
298 | (\vv \bp_L)^\top \times \frac{\partial \vv f}{\partial \vv^\top \bw_l}
299 | =
300 | (\vv \bp_L)^\top
301 | \times
302 | \frac{\partial \vv f_L}{\partial \vv^\top \bx_{L-1}} \times
303 | \cdots
304 | \times
305 | \frac{\partial \vv f_{l+1}}{\partial \vv^\top \bx_l} \times
306 | \frac{\partial \vv f_{l}}{\partial \vv^\top \bw_l}
307 | $$
308 | Note that $\bp_L=1$ has the same dimension as $\bx_L$ (the scalar loss) and, being equal to 1, multiplying it to the left of the expression does not change anything. Things are more interesting when products are evaluated from the left to the right, i.e. backward from the output to the input of the CNN. The first such factor is given by:
309 | \begin{equation}
310 | \label{e:factor}
311 | (\vv \bp_{L-1})^\top = (\vv \bp_L)^\top
312 | \times
313 | \frac{\partial \vv f_L}{\partial \vv^\top \bx_{L-1}}
314 | \end{equation}
315 | This results in a new projection vector $\bp_{L-1}$, which can then be multiplied from the left to obtain $\bp_{L-2}$ and so on. The last projection $\bp_l$ is the desired derivative. Crucially, each projection $\bp_q$ takes as much memory as the corresponding variable $\bx_q$.

316 |

Some might have noticed that, while projections remain small, each factor \eqref{e:factor} does contain one of the large derivatives that we cannot compute explicitly. The trick is that CNN toolboxes contain code that can compute the projected derivatives without explicitly computing this large factor. In particular, for any building block function $\by=f(\bx;\bw)$, a toolbox such as MatConvNet will implement:

* the forward mode, computing the layer output $\by = f(\bx;\bw)$;
* the backward mode, computing the projected derivatives with respect to the input and to the parameters, given a projection tensor $\bp$ of the same size as $\by$:

$$ 322 | \frac{\partial}{\partial \bx} \left\langle \bp, f(\bx;\bw) \right\rangle, 323 | \qquad 324 | \frac{\partial}{\partial \bw} \left\langle \bp, f(\bx;\bw) \right\rangle. 325 | $$

326 |

For example, this is how this looks for the convolution operator:

327 |
y = vl_nnconv(x,w,b) ; % forward mode (get output)
328 | p = randn(size(y), 'single') ; % projection tensor (arbitrary)
329 | [dx,dw,db] = vl_nnconv(x,w,b,p) ; % backward mode (get projected derivatives)
330 | 
331 | 332 |
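As a sanity check, each projected derivative has the same size as the corresponding input:

assert(isequal(size(dx), size(x))) ;
assert(isequal(size(dw), size(w))) ;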

and this is how it looks for the ReLU operator:

333 |
y = vl_nnrelu(x) ;
334 | p = randn(size(y), 'single') ;
335 | dx = vl_nnrelu(x,p) ;
336 | 
337 | 338 |

Part 2.1: Backward mode verification

339 |

Implementing new layers in a network is conceptually simple, but error prone. A simple way of testing a layer is to check whether the derivatives computed using the backward mode approximately match the derivatives computed numerically using the forward mode. The next example, contained in the file exercise2.m, shows how to do this:

340 |
% Forward mode: evaluate the convolution
341 | y = vl_nnconv(x, w, []) ;
342 | 
343 | % Pick a random projection tensor
344 | p = randn(size(y), 'single') ;
345 | 
346 | % Backward mode: projected derivatives
347 | [dx,dw] = vl_nnconv(x, w, [], p) ;
348 | 
349 | % Check the derivative numerically
350 | figure(21) ; clf('reset') ;
351 | set(gcf,'name','Part 2.1: single layer backprop') ;
352 | checkDerivativeNumerically(@(x) proj(p, vl_nnconv(x, w, [])), x, dx) ;
353 | 
354 | 355 |
356 |

Questions:

357 |
    358 |
  1. Recall that the derivative of a function $y=f(x)$ is given by 359 | $$ 360 | \frac{\partial f}{\partial x}(x) = \lim_{\delta\rightarrow 0} \frac{f(x+\delta) - f(x)}{\delta} 361 | $$ 362 | Open the file checkDerivativeNumerically.m. Can you identify the lines in the code above that use this expression?
  2. 363 |
  3. Note that checkDerivativeNumerically() is applied to the function @(x) proj(p, vl_nnconv(x, w, [])). This syntax defines a function on the fly (an anonymous closure to be more precise). In this case, the purpose of the closure is to evaluate the expression for a variable x and a fixed value of w. Furthermore, the closure projects the output of vl_nnconv() onto p by calling the proj() function. Why?
  4. 364 |
365 |

Tasks:

366 |
    367 |
  1. Run the code, visualizing the results. Convince yourself that the numerical and analytical derivatives are nearly identical.
  2. 368 |
  3. Modify the code to compute the derivative of the first element of the output tensor $\by$ with respect to all the elements of the input tensor $\bx$. Hint: it suffices to change the value of $\bp$.
  4. 369 |
  5. Modify the code to compute the derivative with respect to the convolution parameters $\bw$ instead of the convolution input $\bx$.
  6. 370 |
371 |
372 |

Part 2.2: Backpropagation

373 |

Next, we use the backward mode of convolution and ReLU to implement backpropagation in a network that consists of two layers:

374 |
% Forward mode: evaluate conv followed by ReLU
375 | y = vl_nnconv(x, w, []) ;
376 | z = vl_nnrelu(y) ;
377 | 
378 | % Pick a random projection tensor
379 | p = randn(size(z), 'single') ;
380 | 
381 | % Backward mode: projected derivatives
382 | dy = vl_nnrelu(z, p) ;
383 | [dx,dw] = vl_nnconv(x, w, [], dy) ;
384 | 
385 | 386 |
387 |

Question (important): In the code above, in backward mode the projection p is fed to the vl_nnrelu operator. However, the vl_nnconv operator now receives dy as projection. Why?

388 |

Tasks:

389 |
    390 |
  1. Run the code and use checkDerivativeNumerically() to compare the analytical and numerical derivatives. Do they differ?
  2. 391 |
  3. (Optional) Modify the code above to a chain of three layers: conv + ReLU + conv.
  4. 392 |
393 |
394 |

Part 2.3: Design and verify your own layer

395 |

Creating new layers is a common task when experimenting with novel CNN architectures. MatConvNet makes this particularly easy, since you can use all standard MATLAB operators and functions. The same code also works on the GPU.

396 |

In this part we will show how to implement a layer computing the squared Euclidean distance between a tensor x and a reference tensor r; your goal will then be to implement the absolute difference (L1) loss. This layer will be used later to learn a CNN from data.

397 |

The first step is to write the forward mode. This is contained in the l2LossForward.m function. Open the file and check its content:

398 |
function y = l2LossForward(x,r)
399 | delta = x - r ;
400 | y = sum(delta(:).^2) ;
401 | 
402 | 403 |

The function computes the difference x - r, squares the individual elements (.^2), and then sums the results. The vectorization delta(:) just turns the tensor into a vector by stacking, so that the sum is carried across all elements (by default sum operates only along the first dimension). The overall result is a scalar y, which is the sum of the squared Euclidean distances between x and r, for all data instances.

404 |

Next, we need to implement the backward mode:

405 |
function dx = l2LossBackward(x,r,p)
406 | dx = 2 * p * (x - r) ;
407 | 
408 | 409 |

Note that the backward mode takes the projection tensor p as an additional argument. Let us show that this code is correct. Recall that the goal of the backward mode is to compute the derivative of the projected function:

410 |

$$ 411 | \langle \bp, f(\bx) \rangle 412 | = p \sum_{lmnt} (x_{lmnt} - r_{lmnt})^2. 413 | $$

414 |

Here the subscript $t$ indexes the data instance in the batch; note that, since this function computes the sum of the squared Euclidean distances over all tensor instances, the output $f(\bx)$ is a scalar, and so is the projection $\bp = p$.

415 |

In order to see how to implement the backward mode, compute the derivative with respect to each input element $x_{ijkt}$ (note that $p$ is constant):

416 |

$$ 417 | \frac{\partial}{\partial x_{ijkt}} 418 | \langle \bp, f(\bx) \rangle 419 | = 2 p (x_{ijkt} - r_{ijkt}). 420 | $$
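Putting the forward and backward functions together, a numerical verification might look as follows (a sketch; here x and r are arbitrary small test tensors):

x = randn(4, 4, 1, 2, 'single') ;
r = randn(size(x), 'single') ;
dx = l2LossBackward(x, r, 1) ;   % the loss is a scalar, so the projection p = 1 is too
checkDerivativeNumerically(@(x) l2LossForward(x, r), x, dx) ;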

421 |
422 |

Tasks:

423 |
    424 |
  1. Verify that the forward and backward functions are correct by computing the derivatives numerically using checkDerivativeNumerically().
  2. 425 |
  3. Implement the l1LossForward.m and l1LossBackward.m to compute the L1 distance (sum of absolute differences): 426 | $$ 427 | f(\bx) = \sum_{lmnt} \lvert x_{lmnt} - r_{lmnt} \rvert. 428 | $$ 429 | In order to implement the backward pass, you need to find 430 | $$ 431 | \frac{\partial}{\partial x_{ijkt}} 432 | \langle \bp, f(\bx) \rangle 433 | = 434 | \frac{\partial}{\partial x_{ijkt} } 435 | \left[ 436 | p \sum_{lmnt} \lvert x_{lmnt} - r_{lmnt} \rvert 437 | \right]. 438 | $$ 439 | Recall that for ${v} \neq 0$: 440 | $$ 441 | \frac{\partial |v|}{\partial v} = \begin{cases} -1 & v<0 \\ 1 & v>0 \end{cases}. 442 | $$
  4. 443 |
  5. Make sure that both the forward and backward modes are correctly modified by verifying the result numerically once more. What happens for the components of $\bx$ that are zero or very close to zero?
  6. 444 |
445 |
446 |

Part 3: Learning a CNN for text deblurring

447 |

By now you should be familiar with two basic CNN layers, convolution and ReLU, as well as with the idea of backpropagation. In this part, we will build on such concepts to learn a CNN model.

448 |

CNNs are often used for classification; however, they are much more general than that. In order to demonstrate their flexibility, here we will design a CNN that takes an image as input and produces an image as output (instead of a class label).

449 |

We will consider in particular the problem of deblurring images of text, as in the following example:

450 |

Data example

451 |

Part 3.1: Preparing the data

452 |

The first task is to load the training and validation data and to understand its format. Start by opening exercise3.m in your MATLAB editor. The code responsible for loading the data is

453 |
imdb = load('data/text_imdb.mat') ;
454 | 
455 | 456 |

The variable imdb is a structure containing $n$ images, which will be used for training and validation. The structure has the following fields:

* imdb.images.data: the blurred input images, stored as a four-dimensional array (the fourth dimension indexes the image).
* imdb.images.label: the corresponding sharp images, i.e. the desired network outputs.
* imdb.images.set: a flag for each image, equal to 1 for training images and to 2 for validation images.
* imdb.examples: a few larger example images, used in Part 3.4.

Run the following code, which displays the first image in the dataset and its label:

463 |
figure(31) ; set(gcf, 'name', 'Part 3.1: Data') ; clf ;
464 | 
465 | subplot(1,2,1) ; imagesc(imdb.images.data(:,:,:,1)) ;
466 | axis off image ; title('Input (blurred)') ;
467 | 
468 | subplot(1,2,2) ; imagesc(imdb.images.label(:,:,:,1)) ;
469 | axis off image ; title('Desired output (sharp)') ;
470 | 
471 | colormap gray ;
472 | 
473 | 474 |
475 |

Task: make sure you understand the format of imdb. Use MATLAB to find out the number of training and validation images as well as the resolution (size) of each image.
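The following probes may help:

size(imdb.images.data)      % image size and total number of images
sum(imdb.images.set == 1)   % number of training images
sum(imdb.images.set == 2)   % number of validation images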

476 |
477 |

It is often important to center the data to better condition the learning problem. This is usually obtained by subtracting the mean pixel intensity (computed from the training set) from each pixel. Here, however, pixels are rescaled and shifted to have values in the interval $[-1, 0]$.

478 |
479 |

Question: why was the interval $[-1, 0]$ chosen? Hint: what intensity corresponds to 'white'? What does the convolution operator do near the image boundaries?

480 |
481 |

Part 3.2: Defining a CNN architecture

482 |

Next we define a CNN net and initialize its weights randomly. A CNN is simply a collection of interlinked layers. While these can be assembled 'manually' as you did in Part 2, it is usually more convenient to use a wrapper.

483 |

MatConvNet contains two wrappers, SimpleNN and DagNN. SimpleNN is suitable for simple networks that are a chain of layers (as opposed to a more general graph). We will use SimpleNN here.

484 |

This wrapper defines the CNN as a structure net containing a cell-array layers listed in order of execution. Open initializeSmallCNN.m and find this code:

485 |
net.layers = { } ;
486 | 
487 | 488 |

The first layer of the network is a convolution block:

489 |
net.layers{end+1} = struct(...
490 |   'name', 'conv1', ...
491 |   'type', 'conv', ...
492 |   'weights', {xavier(3,3,1,32)}, ...
493 |   'pad', 1, ...
494 |   'learningRate', [1 1], ...
495 |   'weightDecay', [1 0]) ;
496 | 
497 | 498 |

The fields are as follows:

* name: an arbitrary label for the layer, used to identify it (e.g. when displaying the network).
* type: the layer type, here 'conv' for convolution.
* weights: a cell array containing the filter bank and the biases; here it is initialized by the provided xavier() function, with xavier(3,3,1,32) generating 32 filters of size 3 x 3 x 1.
* pad: the amount of zero padding applied to the input before convolution.
* learningRate: multipliers for the global learning rate, one for the filters and one for the biases.
* weightDecay: multipliers for the global weight decay; [1 0] regularizes the filters but not the biases.
520 |

Question: what would happen if pad was set to zero?

521 |
522 |

The convolution layer is followed by ReLU, which is given simply by:

523 |
net.layers{end+1} = struct(...
524 |   'name', 'relu1', ...
525 |   'type', 'relu') ;
526 | 
527 | 528 |

This pattern is repeated (possibly varying the number and dimensions of filters) for a total of three convolutional layers separated by ReLUs.
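For example, the second convolution-ReLU pair has the following form (a sketch consistent with the layer table below; see initializeSmallCNN.m for the exact code):

net.layers{end+1} = struct(...
  'name', 'conv2', ...
  'type', 'conv', ...
  'weights', {xavier(3,3,32,32)}, ...
  'pad', 1, ...
  'learningRate', [1 1], ...
  'weightDecay', [1 0]) ;

net.layers{end+1} = struct(...
  'name', 'relu2', ...
  'type', 'relu') ;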

529 |
530 |

Question: The last layer, generating the output image, is convolutional and is not followed by ReLU. Why?

531 |
532 |

The command vl_simplenn_display() can be used to print information about the network. Here is a subset of this information:

layer     |   0   |   1   |   2   |   3   |   4   |     5      |   6
type      | input | conv  | relu  | conv  | relu  |    conv    | custom
name      |  n/a  | conv1 | relu1 | conv2 | relu2 | prediction |  loss
support   |  n/a  |   3   |   1   |   3   |   1   |     3      |   1
filt dim  |  n/a  |   1   |  n/a  |  32   |  n/a  |     32     |  n/a
num filts |  n/a  |  32   |  n/a  |  32   |  n/a  |     1      |  n/a
stride    |  n/a  |   1   |   1   |   1   |   1   |     1      |   1
pad       |  n/a  |   1   |   0   |   1   |   0   |     1      |   0
rf size   |  n/a  |   3   |   3   |   5   |   5   |     7      |   7
629 |
630 |

Questions: Look carefully at the generated table and answer the following questions:

631 |
    632 |
  1. How many layers are in this network?
  2. 633 |
  3. What is the support (height and width) and depth (number of feature channels) of each intermediate tensor?
  4. 634 |
  5. How is the number of feature channels related to the 635 | dimensions of the filters?
  6. 636 |
637 |
638 |

The last row reports the receptive field size for the layer. This is the size (in pixels) of the local image region that affects a particular element in a feature map.

639 |
640 |

Question: what is the receptive field size of the pixel in the output image (generated by the prediction layer)? Discuss whether a larger receptive field size might be preferable for this problem and how this might be obtained.

641 |
642 |

Part 3.3: Learning the network

643 |

In this part we will use SGD to learn the CNN from the available training data. As noted above, the CNN must however terminate in a loss layer. We add one such layer as follows:

644 |
% Add a loss (using our custom layer)
645 | net = addCustomLossLayer(net, @l2LossForward, @l2LossBackward) ;
646 | 
647 | 648 |

The function addCustomLossLayer() creates a layer structure compatible with SimpleNN and adds it as the last layer of the network. This structure contains handles to the functions defined in Part 2, namely l2LossForward() and l2LossBackward().

649 |

Next, set up the learning parameters:

650 |
trainOpts.expDir = 'data/text-small' ;
651 | trainOpts.gpus = [] ;
652 | trainOpts.batchSize = 16 ;
653 | trainOpts.learningRate = 0.02 ;
654 | trainOpts.plotDiagnostics = false ;
655 | trainOpts.numEpochs = 20 ;
656 | trainOpts.errorFunction = 'none' ;
657 | 
658 | 659 |

The fields are as follows:

* expDir: the directory where intermediate models and figures are saved; delete it to restart the experiment from scratch.
* gpus: the list of GPU indices to use for training (empty for CPU-only training).
* batchSize: the number of images in each SGD batch.
* learningRate: the SGD learning rate.
* plotDiagnostics: whether to plot additional per-layer diagnostic figures during training.
* numEpochs: the number of passes over the training data.
* errorFunction: which error metric to print during training; set to 'none' here, as the loss itself is monitored.

Finally, we can invoke the learning code:

684 |
net = cnn_train(net, imdb, @getBatch, trainOpts) ;
685 | 
686 | 687 |

The getBatch() function, passed as a handle, is particularly important. The training script cnn_train uses getBatch() to extract the images and corresponding labels for a certain batch, as follows:

688 |
function [im, label] = getBatch(imdb, batch)
689 | im = imdb.images.data(:,:,:,batch) ;
690 | label = imdb.images.label(:,:,:,batch) ;
691 | 
692 | 693 |

The function takes as input the imdb structure defined above and a list batch of image indexes that should be returned for training. In this case, this amounts to simply extracting and copying some data; however, in general getBatch() can be used, for example, to read images from disk or to apply transformations to them on the fly.
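For example, a hypothetical variant that augments the data with random horizontal flips on the fly could look like this (a sketch, not part of the provided code; note that the same flip must be applied to the input and to the target):

function [im, label] = getBatchWithFlips(imdb, batch)
im = imdb.images.data(:,:,:,batch) ;
label = imdb.images.label(:,:,:,batch) ;
if rand > 0.5
  im = flip(im, 2) ;       % flip along the width dimension
  label = flip(label, 2) ;
end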

694 |
695 |

Task: run the training code and wait for learning to be complete. Note that the model is saved in data/text-small/net-epoch-20.mat, where 20 is the number of the last epoch.

696 |
697 |

Part 3.4: Evaluate the model

698 |

The network is evaluated on the validation set during training. The validation error (which in our case is the average squared difference between the predicted output pixels and the desired ones) is a good indicator of how well the network is doing (in practice, one should ultimately evaluate the network on a held-out test set).

699 |

In our example it is also informative to evaluate the qualitative result of the model. This can be done as follows:

700 |
train = find(imdb.images.set == 1) ;
701 | val = find(imdb.images.set == 2) ;
702 | 
703 | figure(33) ; set(gcf, 'name', 'Part 3.4: Results on the training set') ;
704 | showDeblurringResult(net, imdb, train(1:30:151)) ;
705 | 
706 | figure(34) ; set(gcf, 'name', 'Part 3.4: Results on the validation set') ;
707 | showDeblurringResult(net, imdb, val(1:30:151)) ;
708 | 
709 | 710 |

Since the CNN is convolutional, it can be applied to arbitrarily-sized images. imdb.examples contains a few larger examples too. The following code shows one:

711 |
figure(35) ;
712 | set(gcf, 'name', 'Part 3.4: Larger example on the validation set') ;
713 | colormap gray ;
714 | subplot(1,2,1) ; imagesc(imdb.examples.blurred{1}, [-1 0]) ;
715 | axis image off ;
716 | title('CNN input') ;
717 | res = vl_simplenn(net, imdb.examples.blurred{1}) ;
718 | subplot(1,2,2) ; imagesc(res(end).x, [-1 0]) ;
719 | axis image off ;
720 | title('CNN output') ;
721 | 
722 | 723 |
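If you want a single number for comparing models, you can also compute the average squared error over the validation images yourself (a sketch mirroring the evaluation code above; evaluating all validation images at once may require a fair amount of memory):

res = vl_simplenn(net, imdb.images.data(:,:,:,val)) ;
pred = res(end).x ;   % network output, as above
mean((pred(:) - reshape(imdb.images.label(:,:,:,val), [], 1)).^2)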
724 |

Questions:

725 | 729 |
730 |

Part 3.5: Learning a larger model using the GPU

731 |

So far, we have trained a single small network to solve this problem. Here, we will experiment with several variants to try to improve the performance as much as possible.

732 |

Before we experiment further, however, it is beneficial to switch to using a GPU. If you have a GPU and the MATLAB Parallel Computing Toolbox installed, you can try running the code above on the GPU by changing a single switch. To prepare MatConvNet to use the GPU, change the setup() call at the top of the script to:

733 |
setup('useGpu', true) ;
734 | 
735 | 736 |

Assuming that the GPU has index 1 (which is always the case if there is a single CUDA-compatible GPU in your machine), modify the training options to tell MatConvNet to use that GPU:

737 |
trainOpts.expDir = 'data/text-small-gpu' ;
738 | trainOpts.gpus = [1] ;
739 | 
740 | 741 |

The code above also changes expDir in order to start a new experiment from scratch.
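You can verify that MATLAB sees a CUDA device with the following command (it requires the Parallel Computing Toolbox):

gpuDevice()   % describes the selected GPU, or errors if none is available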

742 |
743 |

Task: Test GPU-based training (if possible). How much faster does it run compared to CPU-based training?

744 |
745 |

Now we are ready to experiment with different CNNs.

746 |
747 |

Task: Run a new experiment, this time using the initializeLargeCNN() function to construct a larger network.

748 |

Questions:

749 |
    750 |
  1. How much slower is this network compared to the small model?
  2. 751 |
  3. What about the quantitative performance on the validation set?
  4. 752 |
  5. What about the qualitative performance?
  6. 753 |
754 |
755 |

Part 3.6: Challenge!

756 |

You are now in control. Play around with the model definition and try to improve the performance as much as possible. For example:

757 | 764 |

And, of course, make sure to beat the other students.

765 |
766 |

Remark: You can see the relative change of the network weights by setting trainOpts.plotDiagnostics = true ;

767 |
768 | 769 | 780 |

Acknowledgements

781 | 784 |

785 |

iV&L Summer School instructions

786 |

Connect here to Qwick labs as you have been instructed. Press the Select button for the MatConvNet Lab:

787 |

step1

788 |

Press the Start Lab button:

789 |

step2

790 |

Wait for the progress bar to finish (this may take one or two minutes):

791 |

step3

792 |

Click the lab instructions link and follow the rest of the instructions:

793 |

step4

794 |

Once MATLAB is started, continue from the top.

795 |

If shortcuts in MATLAB do not work properly

796 |

By default, MATLAB on Linux systems uses Emacs-style keyboard shortcuts. To change to the more familiar Windows-style shortcuts:

1. Open the MATLAB Preferences.
2. Select Keyboard > Shortcuts and change the active settings to the Windows default set.

If the screen is too small or too large

805 |

If you are running the practical through a VNC-based remote desktop connection, you can try adjusting the resolution by changing the setting in the OS (click on the big Ubuntu button on the top left and search for Displays).

806 |

History

807 | 810 |
811 |
812 |
    813 |
  1. 814 |

If you are familiar with convolution as defined in mathematics and signal processing, you might expect to find the index $i-u$ instead of $i+u$ in this expression. The convention $i+u$, which is commonly used in CNNs, is more properly referred to as correlation. 

    815 |
  2. 816 |
  3. 817 |

The derivative is computed with respect to a certain assignment $x_0$ and $(w_1,\dots,w_L)$ to the network input and parameters; furthermore, the intermediate derivatives are computed at the points $x_1,\dots,x_L$ obtained by evaluating the network at $x_0$. 

    818 |
  4. 819 |
  5. 820 |

    The stacking operator $\vv$ simply unfolds a tensor in a vector by stacking its elements in some pre-defined order. For example: 821 | $$ 822 | \vv\begin{bmatrix} 823 | 1 & 3 & 5\\ 824 | 2 & 4 & 6 825 | \end{bmatrix}=\begin{bmatrix} 826 | 1\\2\\3\\4\\5\\6 827 | \end{bmatrix} 828 | $$ 

    829 |
  6. 830 |
  7. 831 |

    Like for example

    832 |
  8. 833 |
834 |
--------------------------------------------------------------------------------
/doc/instructions.md:
--------------------------------------------------------------------------------
Oxford logo
MathWorks logo
NVIDIA logo

# VGG CNN Practical: Image Regression

*By Andrea Vedaldi, Karel Lenc, and Joao Henriques*

This is an [Oxford Visual Geometry Group](http://www.robots.ox.ac.uk/~vgg) computer vision practical (Release 2016a).

cover

*Convolutional neural networks* are an important class of learnable representations applicable, among others, to numerous computer vision problems. Deep CNNs, in particular, are composed of several layers of processing, each involving linear as well as non-linear operators, that are learned jointly, in an end-to-end manner, to solve a particular task. These methods are now the dominant approach for feature extraction from audiovisual and textual data.

This practical explores the basics of learning (deep) CNNs. The first part introduces typical CNN building blocks, such as ReLU units and linear filters. The second part explores backpropagation, including designing custom layers and verifying them numerically. The last part demonstrates learning a CNN for text deblurring; this differs from the usual problem of image classification and demonstrates the flexibility of these techniques.

This practical is based on MATLAB and the [MatConvNet](http://www.vlfeat.org/matconvnet) library. The practical demonstrates how easy it is to use this environment to prototype new network components and architectures. By only using familiar MATLAB syntax, you will be able to implement new layers and take advantage of the GPU for faster computation.

[TOC]

$$
\newcommand{\bx}{\mathbf{x}}
\newcommand{\by}{\mathbf{y}}
\newcommand{\bz}{\mathbf{z}}
\newcommand{\bw}{\mathbf{w}}
\newcommand{\bp}{\mathbf{p}}
\newcommand{\cP}{\mathcal{P}}
\newcommand{\cN}{\mathcal{N}}
\newcommand{\vc}{\operatorname{vec}}
\newcommand{\vv}{\operatorname{vec}}
$$

## Installation

> If you are running this in the iV&L Summer School, please refer to the [instructions](#ivl) at the end of the document.

Read and understand the [requirements and installation instructions](../overview/index.html#installation). The download links for this practical are:

* Code and data: [practical-cnn-reg-2016a.tar.gz](http://www.robots.ox.ac.uk/~vgg/share/practical-cnn-reg-2016a.tar.gz)
* Code only: [practical-cnn-reg-2016a-code-only.tar.gz](http://www.robots.ox.ac.uk/~vgg/share/practical-cnn-reg-2016a-code-only.tar.gz)
* Data only: [practical-cnn-reg-2016a-data-only.tar.gz](http://www.robots.ox.ac.uk/~vgg/share/practical-cnn-reg-2016a-data-only.tar.gz)
* [Git repository](https://github.com/vedaldi/practical-cnn-reg) (for lab setters and developers)

You can either unpack the archive manually, or use the following MATLAB one-liner:

```.language-matlab
untar('http://www.robots.ox.ac.uk/~vgg/share/practical-cnn-reg-2016a.tar.gz')
cd practical-cnn-reg-2016a
```

## Getting started

After the installation is complete, open and edit the script `exercise1.m` in the MATLAB editor. The script contains commented code and a description for all steps of this exercise, for [Part I](#part1) of this document.
You can cut and paste this code into the MATLAB window to run it, or use the shortcut `Ctrl+Enter` to run a code section. You will need to modify it as you go through the session. The other files, `exercise2.m` and `exercise3.m`, are given for [Part II](#part2) and [Part III](#part3).

Each part contains several **Questions** (that may require pen and paper) and **Tasks** (that require experimentation or coding) to be answered/completed before proceeding further in the practical.

## Part 1: CNN building blocks {#part1}

In this part we will explore two fundamental building blocks of CNNs, linear convolution and non-linear activation functions. Open `exercise1.m` and run up to the `setup()` command, which initializes the MATLAB environment to use MatConvNet.

### Part 1.1: Convolution {#part1.1}

A *convolutional neural network* (CNN) is a sequence of linear and non-linear convolution-like operators. The most important example of such operators is *linear convolution*. In this part, we will explore linear convolution and see how to use it in MatConvNet.

Recall that linear convolution applies one (or more) filters $\bw$ to an image $\bx$ as follows:

conv

#### Part 1.1.1: Convolution by a single filter {#part1.1.1}

Start by identifying and then running the following code fragment in `exercise1.m`:

```.language-matlab
% Load an image and convert it to gray scale and single precision
x = im2single(rgb2gray(imread('data/ray.jpg'))) ;

% Define a filter
w = single([
   0 -1  0
  -1  4 -1
   0 -1  0]) ;

% Apply the filter to the image
y = vl_nnconv(x, w, []) ;
```

The code loads the image `data/ray.jpg` and applies a linear filter to it using the linear convolution operator. The latter is implemented by the MatConvNet function `vl_nnconv()`. Note that all variables `x`, `w`, and `y` are in single precision; while MatConvNet supports double precision arithmetic too, single precision is usually preferred in applications as memory is often a bottleneck. The result can be visualized as follows:

```.language-matlab
% Visualize the results
figure(11) ; clf ; colormap gray ;
set(gcf, 'name', 'Part 1.1: convolution') ;

subplot(2,2,1) ;
imagesc(x) ;
axis off image ;
title('Input image x') ;

subplot(2,2,2) ;
imagesc(w) ;
axis off image ;
title('Filter w') ;

subplot(2,2,3) ;
imagesc(y) ;
axis off image ;
title('Output image y') ;
```

> **Task:** Run the code above and examine the result, which should look like the following image:

cover

The input $\bx$ is an $M \times N$ matrix, which can be interpreted as a gray scale image. The filter $\bw$ is the $3 \times 3$ matrix
$$
\bw =
\begin{bmatrix}
0 & -1 & 0 \\
-1 & 4 & -1 \\
0 & -1 & 0 \\
\end{bmatrix}
$$
The output of the convolution is a new matrix $\by$ given by[^convolution]
$$
y_{ij} = \sum_{uv} w_{uv}\ x_{i+u,\ j+v}
$$

> **Questions:**
>
> 1. If $H \times W$ is the size of the input image, $H' \times W'$ the size of the filter, what is the size $H'' \times W''$ of the output image?
> 2. The filter $\bw$ given above is a discretized Laplacian operator. Which type of visual structures (corners, bars, ...) do you think may excite this filter the most?
#### Part 1.1.2: Convolution by a filter bank {#part1.1.2}

In neural networks, one usually operates with *filter banks* instead of individual filters. Each filter can be thought of as computing a different *feature channel*, characterizing a particular statistical property of the input image.

To see how to define and use a filter bank, create a bank of three filters as follows:

```.language-matlab
% Concatenate three filters in a bank
w1 = single([
   0 -1  0
  -1  4 -1
   0 -1  0]) ;

w2 = single([
  -1 0 +1
  -1 0 +1
  -1 0 +1]) ;

w3 = single([
  -1 -1 -1
   0  0  0
  +1 +1 +1]) ;

wbank = cat(4, w1, w2, w3) ;
```

The first filter $\bw_1$ is the Laplacian operator seen above; two additional filters $\bw_2$ and $\bw_3$ are horizontal and vertical image derivatives, respectively. The command `vl_nnconv(x, wbank, [])` then applies all the filters in the bank to the input image `x`. Note that the output `y` is not just a matrix, but a 3D array (often called a *tensor* in the CNN jargon). This tensor has dimensions $H \times W \times K$, where $K$ is the number of *feature channels*.

> **Question:** What is the number of feature channels $K$ in this example? Why?

> **Task:** Run the code above and visualize the individual feature channels in the tensor `y` by using the provided function `showFeatureChannels()`. Do the channel responses make sense given the filter used to generate them?

In a CNN, not only the output tensor, but also the input tensor `x` and the filters `wbank` can have multiple feature channels. In this case, the convolution formula becomes:
$$
y_{ijk} = \sum_{uvp} w_{uvpk}\ x_{i+u,\ j+v,\ p}
$$

> **Questions:**
>
> * If the input tensor $\bx$ has $C$ feature channels, what should be the third dimension of $\bw$?
> * In the code above, the command `wbank = cat(4, w1, w2, w3)` concatenates the tensors `w1`, `w2`, and `w3` along the *fourth dimension*. Why is that given that filters should have three dimensions?

#### Part 1.1.3: Convolving a batch of images {#part1.1.3}

Finally, in training CNNs it is often important to be able to work efficiently with *batches* of data. MatConvNet allows packing more than one instance of the tensor $\bx$ in a single MATLAB array `x` by stacking the different instances along the *fourth dimension* of the array:

```.language-matlab
x1 = im2single(rgb2gray(imread('data/ray.jpg'))) ;
x2 = im2single(rgb2gray(imread('data/crab.jpg'))) ;
x = cat(4, x1, x2) ;

y = vl_nnconv(x, wbank, []) ;
```

> **Task:** Run the code above and visualize the result. Convince yourself that each filter is applied to each image.

### Part 1.2: Non-linear activation (ReLU) {#part1.2}

CNNs are obtained by composing several operators, individually called *layers*. In addition to convolution and other linear layers, CNNs should contain non-linear layers as well.

> **Question:** What happens if all layers are linear?

The simplest non-linearity is given by scalar activation functions, which are applied independently to each element of a tensor.
Perhaps the simplest and one of the most useful examples is the *Rectified Linear Unit* (ReLU) operator: 199 | $$ 200 | y_{ijk} = \max \{0, x_{ijk}\} 201 | $$ 202 | which simply cuts off any negative value in the data. 203 | 204 | In MatConvNet, ReLU is implemented by the `vl_nnrelu` function. To demonstrate its use, we convolve the test image with the negated Laplacian, and then apply ReLU to the result: 205 | 206 | ```.language-matlab 207 | % Convolve with the negated Laplacian 208 | y = vl_nnconv(x, - w, []) ; 209 | 210 | % Apply the ReLU operator 211 | z = vl_nnrelu(y) ; 212 | ``` 213 | 214 | > **Task:** Run this code and visualize images `x`, `y`, and `z`. 215 | 216 | > **Questions:** 217 | > 218 | > * Which kind of image structures are preferred by this filter? 219 | > * Why did we negate the Laplacian? 220 | 221 | ReLU has a very important effect, as it implicitly sets to zero the majority of the filter responses. In a certain sense, ReLU works as a detector, with the implicit convention that a certain pattern is detected when a corresponding filter response is large enough (greater than zero). 222 | 223 | In practice, while signals are usually centered and therefore a threshold of zero is reasonable, there is no particular reason why this should always be appropriate. For this reason, the convolution operator allows specifying a *bias term* for each filter response. Let us use this term to make the response of ReLU more selective: 224 | 225 | ```.language-matlab 226 | bias = single(- 0.2) ; 227 | y = vl_nnconv(x, - w, bias) ; 228 | z = vl_nnrelu(y) ; 229 | ``` 230 | 231 | There is only one `bias` term because there is only one filter in the bank (note that, as for the rest of the data, `bias` is a single precision quantity). The bias is applied after convolution, effectively subtracting 0.2 from the filter responses. Hence, a response now survives the subsequent ReLU operator only if it is at least 0.2 after convolution. 232 | 233 | > **Task:** Run this code and visualize images `x`, `y`, and `z`. 234 | 235 | > **Question:** Is the response now more selective? 236 | 237 | > **Remark:** There are many other building blocks used in CNNs, the most important of which is perhaps max pooling. However, convolution and ReLU can already solve many problems, as we will see in the remainder of the practical. 238 | 239 | ## Part 2: Backpropagation {#part2} 240 | 241 | Training CNNs is normally done using a gradient-based optimization method. The CNN $f$ is the composition of $L$ layers $f_l$, each with parameters $\bw_l$; in the simplest case of a chain, the composition looks like: 242 | $$ 243 | \bx_0 244 | \longrightarrow 245 | \underset{\displaystyle\underset{\displaystyle\bw_1}{\uparrow}}{\boxed{f_1}} 246 | \longrightarrow 247 | \bx_1 248 | \longrightarrow 249 | \underset{\displaystyle\underset{\displaystyle\bw_2}{\uparrow}}{\boxed{f_2}} 250 | \longrightarrow 251 | \bx_2 252 | \longrightarrow 253 | \dots 254 | \longrightarrow 255 | \bx_{L-1} 256 | \longrightarrow 257 | \underset{\displaystyle\underset{\displaystyle\bw_L}{\uparrow}}{\boxed{f_L}} 258 | \longrightarrow 259 | \bx_L 260 | $$ 261 | During learning, the last layer of the network is the *loss function* that should be minimized. Hence, the output $\bx_L = x_L$ of the network is a **scalar** quantity (a single number). 262 | 263 | The gradient is easily computed using the **chain rule**.
If *all* network variables and parameters are scalar, this is given by[^derivative]: 264 | $$ 265 | \frac{\partial f}{\partial w_l}(x_0;w_1,\dots,w_L) 266 | = 267 | \frac{\partial f_L}{\partial x_{L-1}}(x_{L-1};w_L) \times 268 | \cdots 269 | \times 270 | \frac{\partial f_{l+1}}{\partial x_l}(x_l;w_{l+1}) \times 271 | \frac{\partial f_{l}}{\partial w_l}(x_{l-1};w_l) 272 | $$ 273 | With tensors, however, there are some complications. Consider for instance the derivative of a function $\by=f(\bx)$ where both $\by$ and $\bx$ are tensors; this is formed by taking the derivative of each scalar element in the output $\by$ with respect to each scalar element in the input $\bx$. If $\bx$ has dimensions $H \times W \times C$ and $\by$ has dimensions $H' \times W' \times C'$, then the derivative contains $HWCH'W'C'$ elements, which is often unmanageable (on the order of several GB of memory for a single derivative). 274 | 275 | Note that all intermediate derivatives in the chain rule may be affected by this size explosion, except for the derivative of the network output, which, being the loss, is a scalar. 276 | 277 | > **Question:** The output derivatives have the same size as the parameters in the network. Why? 278 | 279 | **Back-propagation** allows computing the output derivatives in a memory-efficient manner. To see how, the first step is to generalize the equation above to tensors using a matrix notation. This is done by converting tensors into vectors by using the $\vv$ (stacking)[^stacking] operator: 280 | $$ 281 | \frac{\partial \vv f}{\partial \vv^\top \bw_l} 282 | = 283 | \frac{\partial \vv f_L}{\partial \vv^\top \bx_{L-1}} \times 284 | \cdots 285 | \times 286 | \frac{\partial \vv f_{l+1}}{\partial \vv^\top \bx_l} \times 287 | \frac{\partial \vv f_{l}}{\partial \vv^\top \bw_l} 288 | $$ 289 | In order to make this computation memory efficient, we *project* the derivative onto a tensor $\bp_L = 1$ as follows: 290 | $$ 291 | (\vv \bp_L)^\top \times \frac{\partial \vv f}{\partial \vv^\top \bw_l} 292 | = 293 | (\vv \bp_L)^\top 294 | \times 295 | \frac{\partial \vv f_L}{\partial \vv^\top \bx_{L-1}} \times 296 | \cdots 297 | \times 298 | \frac{\partial \vv f_{l+1}}{\partial \vv^\top \bx_l} \times 299 | \frac{\partial \vv f_{l}}{\partial \vv^\top \bw_l} 300 | $$ 301 | Note that $\bp_L=1$ has the same dimension as $\bx_L$ (the scalar loss) and, being equal to 1, multiplying the expression by it on the left does not change anything. Things are more interesting when the products are evaluated from the left to the right, i.e. *backward from the output to the input* of the CNN. The first such factor is given by: 302 | \begin{equation} 303 | \label{e:factor} 304 | (\vv \bp_{L-1})^\top = (\vv \bp_L)^\top 305 | \times 306 | \frac{\partial \vv f_L}{\partial \vv^\top \bx_{L-1}} 307 | \end{equation} 308 | This results in a new projection vector $\bp_{L-1}$, which can then be multiplied from the left to obtain $\bp_{L-2}$ and so on, down to $\bp_l$; multiplying $\bp_l$ by the last factor $\partial \vv f_{l} / \partial \vv^\top \bw_l$ yields the desired derivative. Crucially, each projection $\bp_q$ takes only as much memory as the corresponding variable $\bx_q$. 309 | 310 | Some might have noticed that, while the projections remain small, each factor \eqref{e:factor} does contain one of the large derivatives that we cannot compute explicitly. The trick is that CNN toolboxes contain code that can compute the projected derivatives without explicitly computing this large factor.
In particular, for any building block function $\by=f(\bx;\bw)$, a toolbox such as MatConvNet will implement: 311 | 312 | * A **forward mode** computing the function $\by=f(\bx;\bw)$. 313 | * A **backward mode** computing the derivatives of the projected function $\langle \bp, f(\bx;\bw) \rangle$ with respect to the input $\bx$ and parameter $\bw$: 314 | 315 | $$ 316 | \frac{\partial}{\partial \bx} \left\langle \bp, f(\bx;\bw) \right\rangle, 317 | \qquad 318 | \frac{\partial}{\partial \bw} \left\langle \bp, f(\bx;\bw) \right\rangle. 319 | $$ 320 | 321 | For example, this is how it looks for the convolution operator: 322 | 323 | ```.language-matlab 324 | y = vl_nnconv(x,w,b) ; % forward mode (get output) 325 | p = randn(size(y), 'single') ; % projection tensor (arbitrary) 326 | [dx,dw,db] = vl_nnconv(x,w,b,p) ; % backward mode (get projected derivatives) 327 | ``` 328 | 329 | and this is how it looks for the ReLU operator: 330 | 331 | ```.language-matlab 332 | y = vl_nnrelu(x) ; 333 | p = randn(size(y), 'single') ; 334 | dx = vl_nnrelu(x,p) ; 335 | ``` 336 | 337 | ### Part 2.1: Backward mode verification {#part2.1} 338 | 339 | Implementing new layers in a network is conceptually simple, but error prone. A simple way of testing a layer is to check whether the derivatives computed using the backward mode approximately match the derivatives computed numerically using the forward mode. The next example, contained in the file `exercise2.m`, shows how to do this: 340 | 341 | ```.language-matlab 342 | % Forward mode: evaluate the convolution 343 | y = vl_nnconv(x, w, []) ; 344 | 345 | % Pick a random projection tensor 346 | p = randn(size(y), 'single') ; 347 | 348 | % Backward mode: projected derivatives 349 | [dx,dw] = vl_nnconv(x, w, [], p) ; 350 | 351 | % Check the derivative numerically 352 | figure(21) ; clf('reset') ; 353 | set(gcf,'name','Part 2.1: single layer backprop') ; 354 | checkDerivativeNumerically(@(x) proj(p, vl_nnconv(x, w, [])), x, dx) ; 355 | ``` 356 | 357 | > **Questions:** 358 | > 359 | > 1. Recall that the derivative of a function $y=f(x)$ is given by 360 | > $$ 361 | > \frac{\partial f}{\partial x}(x) = \lim_{\delta\rightarrow 0} \frac{f(x+\delta) - f(x)}{\delta} 362 | > $$ 363 | > Open the file `checkDerivativeNumerically.m`. Can you identify the lines in that function that use this expression? 364 | > 2. Note that `checkDerivativeNumerically()` is applied to the function `@(x) proj(p, vl_nnconv(x, w, []))`. This syntax defines a function on the fly (an anonymous closure, to be more precise). In this case, the purpose of the closure is to evaluate the expression for a variable `x` and a fixed value of `w`. Furthermore, the closure projects the output of `vl_nnconv()` onto `p` by calling the `proj()` function. Why? 365 | 366 | > **Tasks:** 367 | > 368 | > 1. Run the code, visualizing the results. Convince yourself that the numerical and analytical derivatives are nearly identical. 369 | > 2. Modify the code to compute the derivative of the *first element* of the output tensor $\by$ with respect to *all the elements* of the input tensor $\bx$. **Hint:** it suffices to change the value of $\bp$. 370 | > 3. Modify the code to compute the derivative with respect to the convolution parameters $\bw$ instead of the convolution input $\bx$.
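To make the connection with the limit definition above concrete, here is a minimal sketch of the finite-difference approximation for a single input element (it assumes the variables `x`, `w`, `p`, and `dx` from the fragment above are in the workspace; `checkDerivativeNumerically()` essentially repeats this for every element of `x`):

```.language-matlab
% Perturb one input element by a small delta and observe the change in
% the projected output; this approximates the first element of dx
delta = 0.01 ;
f = @(x) proj(p, vl_nnconv(x, w, [])) ;
xp = x ; xp(1) = xp(1) + delta ;
dx1 = (f(xp) - f(x)) / delta ;
fprintf('analytical: %g, numerical: %g\n', dx(1), dx1) ;
```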
371 | 372 | ### Part 2.2: Backpropagation {#part2.2} 373 | 374 | Next, we use the backward mode of convolution and ReLU to implement backpropagation in a network that consists of two layers: 375 | 376 | ```.language-matlab 377 | % Forward mode: evaluate conv followed by ReLU 378 | y = vl_nnconv(x, w, []) ; 379 | z = vl_nnrelu(y) ; 380 | 381 | % Pick a random projection tensor 382 | p = randn(size(z), 'single') ; 383 | 384 | % Backward mode: projected derivatives 385 | dy = vl_nnrelu(z, p) ; 386 | [dx,dw] = vl_nnconv(x, w, [], dy) ; 387 | ``` 388 | 389 | > **Question (important):** In the code above, in backward mode the projection `p` is fed to the `vl_nnrelu` operator. However, the `vl_nnconv` operator now receives `dy` as the projection. Why? 390 | 391 | > **Tasks:** 392 | > 393 | > 1. Run the code and use `checkDerivativeNumerically()` to compare the analytical and numerical derivatives. Do they differ? 394 | > 2. (Optional) Extend the code above to a chain of three layers: conv + ReLU + conv. 395 | 396 | ### Part 2.3: Design and verify your own layer {#part2.3} 397 | 398 | Creating new layers is a common task when experimenting with novel CNN architectures. MatConvNet makes this particularly easy, since you can use all standard MATLAB operators and functions. The same code also works on the GPU. 399 | 400 | In this part we will show how to implement a layer computing the squared Euclidean distance between a tensor `x` and a reference tensor `r`; your goal will then be to implement the absolute difference (L1) loss. This layer will be used later to learn a CNN from data. 401 | 402 | The first step is to write the forward mode. This is contained in the `l2LossForward.m` function. Open the file and check its content: 403 | 404 | ```.language-matlab 405 | function y = l2LossForward(x,r) 406 | delta = x - r ; 407 | y = sum(delta(:).^2) ; 408 | ``` 409 | 410 | The function computes the difference `x - r`, squares the individual elements (`.^2`), and then sums the results. The vectorization `delta(:)` just turns the tensor into a vector by stacking, so that the sum is carried across all elements (by default `sum` operates only along the first dimension). The overall result is a scalar `y`, which is the sum of the squared Euclidean distances between `x` and `r`, for all data instances. 411 | 412 | Next, we need to implement the backward mode: 413 | 414 | ```.language-matlab 415 | function dx = l2LossBackward(x,r,p) 416 | dx = 2 * p * (x - r) ; 417 | ``` 418 | 419 | Note that the backward mode takes the projection tensor `p` as an additional argument. Let us show that this code is correct. Recall that the goal of the backward mode is to compute the derivative of the projected function: 420 | 421 | $$ 422 | \langle \bp, f(\bx) \rangle 423 | = p \sum_{lmnt} (x_{lmnt} - r_{lmnt})^2. 424 | $$ 425 | 426 | Here the subscript $t$ indexes the data instance in the batch; note that, since this function computes the sum of squared Euclidean distances for all tensor instances, the output $f(\bx)$ is a scalar, and so is the projection $\bp = p$. 427 | 428 | In order to see how to implement the backward mode, compute the derivative with respect to each input element $x_{ijkt}$ (note that $p$ is constant): 429 | 430 | $$ 431 | \frac{\partial}{\partial x_{ijkt}} 432 | \langle \bp, f(\bx) \rangle 433 | = 2 p (x_{ijkt} - r_{ijkt}). 434 | $$ 435 | 436 | > **Tasks:** 437 | > 438 | > 1. Verify that the forward and backward functions are correct by computing the derivatives numerically using `checkDerivativeNumerically()`. 439 | > 2.
Implement `l1LossForward.m` and `l1LossBackward.m` to compute the L1 distance (sum of absolute differences): 440 | $$ 441 | f(\bx) = \sum_{lmnt} \lvert x_{lmnt} - r_{lmnt} \rvert. 442 | $$ 443 | In order to implement the backward pass, you need to find 444 | $$ 445 | \frac{\partial}{\partial x_{ijkt}} 446 | \langle \bp, f(\bx) \rangle 447 | = 448 | \frac{\partial}{\partial x_{ijkt} } 449 | \left[ 450 | p \sum_{lmnt} \lvert x_{lmnt} - r_{lmnt} \rvert 451 | \right]. 452 | $$ 453 | Recall that for ${v} \neq 0$: 454 | $$ 455 | \frac{\partial |v|}{\partial v} = \begin{cases} -1 & v<0 \\ 1 & v>0 \end{cases}. 456 | $$ 457 | > 3. Make sure that both the forward and backward modes are correctly modified by verifying the result numerically once more. What happens for the components of $\bx$ that are zero or very close to zero? 458 | 459 | ## Part 3: Learning a CNN for text deblurring {#part3} 460 | 461 | By now you should be familiar with two basic CNN layers, convolution and ReLU, as well as with the idea of backpropagation. In this part, we will build on these concepts to learn a CNN model. 462 | 463 | CNNs are often used for classification; however, they are much more general than that. In order to demonstrate their flexibility, here we will design a CNN that takes an image as input and produces an image as output (instead of a class label). 464 | 465 | We will consider in particular the problem of *deblurring images of text*, as in the following example: 466 | 467 | ![Data example](images/text.png) 468 | 469 | ### Part 3.1: Preparing the data {#part3.1} 470 | 471 | The first task is to load the training and validation data and to understand its format. Start by opening `exercise3.m` in your MATLAB editor. The code responsible for loading the data is 472 | 473 | ```.language-matlab 474 | imdb = load('data/text_imdb.mat') ; 475 | ``` 476 | 477 | The variable `imdb` is a structure containing $n$ images, which will be used for training and validation. The structure has the following fields: 478 | 479 | * `imdb.images.data`: a $64 \times 64 \times 1 \times n$ array of grayscale blurred images. 480 | * `imdb.images.label`: a $64 \times 64 \times 1 \times n$ array of grayscale sharp images. 481 | * `imdb.images.set`: a $1 \times n$ vector containing a 1 for training images and a 2 for validation images. 75% of the images are used for training and 25% for validation. 482 | 483 | Run the following code, which displays the first image in the dataset and its label: 484 | 485 | ```.language-matlab 486 | figure(31) ; set(gcf, 'name', 'Part 3.1: Data') ; clf ; 487 | 488 | subplot(1,2,1) ; imagesc(imdb.images.data(:,:,:,1)) ; 489 | axis off image ; title('Input (blurred)') ; 490 | 491 | subplot(1,2,2) ; imagesc(imdb.images.label(:,:,:,1)) ; 492 | axis off image ; title('Desired output (sharp)') ; 493 | 494 | colormap gray ; 495 | ``` 496 | 497 | > **Task:** Make sure you understand the format of `imdb`. Use MATLAB to find out the number of training and validation images as well as the resolution (size) of each image. 498 | 499 | It is often important to center the data to better condition the learning problem. This is usually obtained by subtracting the mean pixel intensity (computed from the training set) from each pixel. Here, however, pixels are rescaled and shifted to have values in the interval $[-1, 0]$. 500 | 501 | > **Question:** Why was the interval $[-1, 0]$ chosen? **Hint:** what intensity corresponds to 'white'? What does the convolution operator do near the image boundaries?
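As a quick sanity check of this preprocessing (a minimal sketch, assuming `imdb` has been loaded as above), you can verify the range of the data directly:

```.language-matlab
% Both the blurred inputs and the sharp labels should lie in [-1, 0],
% with 'white' mapped to 0
fprintf('data range:  [%.2f, %.2f]\n', min(imdb.images.data(:)), max(imdb.images.data(:))) ;
fprintf('label range: [%.2f, %.2f]\n', min(imdb.images.label(:)), max(imdb.images.label(:))) ;
```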
502 | 503 | ### Part 3.2: Defining a CNN architecture 504 | 505 | Next we define a CNN `net` and initialize its weights randomly. A CNN is simply a collection of interlinked layers. While these can be assembled 'manually' as you did in Part 2, it is usually more convenient to use a **wrapper**. 506 | 507 | MatConvNet contains two wrappers, SimpleNN and DagNN. SimpleNN is suitable for simple networks that are a chain of layers (as opposed to a more general graph). We will use SimpleNN here. 508 | 509 | This wrapper defines the CNN as a structure `net` containing a cell array `layers`, listed in order of execution. Open `initializeSmallCNN.m` and find this code: 510 | 511 | ```.language-matlab 512 | net.layers = { } ; 513 | ``` 514 | 515 | The first layer of the network is a convolution block: 516 | 517 | ```.language-matlab 518 | net.layers{end+1} = struct(... 519 | 'name', 'conv1', ... 520 | 'type', 'conv', ... 521 | 'weights', {xavier(3,3,1,32)}, ... 522 | 'pad', 1, ... 523 | 'learningRate', [1 1], ... 524 | 'weightDecay', [1 0]) ; 525 | ``` 526 | 527 | The fields are as follows: 528 | 529 | * `name` specifies a name for the layer, useful for debugging but otherwise arbitrary. 530 | 531 | * `type` specifies the layer type, in this case convolution. 532 | 533 | * `weights` is a cell array containing the layer parameters, in this case two tensors for the filters and the biases. The filters are initialized using the `xavier()` function to have dimensions $3 \times 3 \times 1 \times 32$ ($3\times 3$ spatial support, 1 input feature channel, and 32 filters). `xavier()` also initializes the biases to be zero. 534 | 535 | * `pad` specifies the amount of zero padding to apply to the layer input. By using a padding of one pixel and a $3\times 3$ filter support, the output of the convolution will have exactly the same height and width as the input. 536 | 537 | * `learningRate` contains two layer-specific multipliers to adjust the learning rate for the filters and the biases. 538 | 539 | * `weightDecay` contains two layer-specific multipliers to adjust the weight decay (regularization strength) for the layer filters and biases. Note that weight decay is not applied to the biases. 540 | 541 | > **Question:** What would happen if `pad` were set to zero? 542 | 543 | The convolution layer is followed by ReLU, which is given simply by: 544 | 545 | ```.language-matlab 546 | net.layers{end+1} = struct(... 547 | 'name', 'relu1', ... 548 | 'type', 'relu') ; 549 | ``` 550 | 551 | This pattern is repeated (possibly varying the number and dimensions of filters) for a total of three convolutional layers separated by ReLUs. 552 | 553 | > **Question:** The last layer, generating the output image, is convolutional and is *not* followed by ReLU. Why? 554 | 555 | The command `vl_simplenn_display()` can be used to print information about the network. Here is a subset of this information: 556 | 557 | | layer| 0| 1| 2| 3| 4| 5| 6| 558 | |:---------:|:---:|:---:|:---:|:---:|:---:|:--------:|:----:| 559 | | type|input| conv| relu| conv| relu| conv|custom| 560 | | name| n/a|conv1|relu1|conv2|relu2|prediction| loss| 561 | | support| n/a| 3| 1| 3| 1| 3| 1| 562 | | filt dim| n/a| 1| n/a| 32| n/a| 32| n/a| 563 | | num filts| n/a| 32| n/a| 32| n/a| 1| n/a| 564 | | stride| n/a| 1| 1| 1| 1| 1| 1| 565 | | pad| n/a| 1| 0| 1| 0| 1| 0| 566 | | rf size| n/a| 3| 3| 5| 5| 7| 7| 567 | 568 | > **Questions:** Look carefully at the generated table and answer the following questions: 569 | > 570 | > 1. How many layers are in this network?
571 | > 2. What is the support (height and width) and depth (number of feature channels) of each intermediate tensor? 572 | > 3. How is the number of feature channels related to the 573 | > dimensions of the filters? 574 | 575 | The last row reports the *receptive field size* for each layer. This is the size (in pixels) of the local image region that affects a particular element in a feature map. 576 | 577 | > **Question:** What is the receptive field size of a pixel in the output image (generated by the prediction layer)? Discuss whether a larger receptive field size might be preferable for this problem and how this might be obtained. 578 | 579 | ### Part 3.3: Learning the network {#part3.3} 580 | 581 | In this part we will use stochastic gradient descent (SGD) to learn the CNN from the available training data. As noted above, the CNN must however terminate in a loss layer. We add one such layer as follows: 582 | 583 | ```.language-matlab 584 | % Add a loss (using our custom layer) 585 | net = addCustomLossLayer(net, @l2LossForward, @l2LossBackward) ; 586 | ``` 587 | The function `addCustomLossLayer()` creates a `layer` structure compatible with SimpleNN and adds it as the last layer of the network. This structure contains handles to the functions defined in Part 2, namely `l2LossForward()` and `l2LossBackward()`. 588 | 589 | Next, set up the learning parameters: 590 | 591 | ```.language-matlab 592 | trainOpts.expDir = 'data/text-small' ; 593 | trainOpts.gpus = [] ; 594 | trainOpts.batchSize = 16 ; 595 | trainOpts.learningRate = 0.02 ; 596 | trainOpts.plotDiagnostics = false ; 597 | trainOpts.numEpochs = 20 ; 598 | trainOpts.errorFunction = 'none' ; 599 | ``` 600 | 601 | The fields are as follows: 602 | 603 | * `expDir` specifies a directory to store intermediate data (snapshots and figures) as well as the final model. Note that the code resumes execution from the last snapshot; therefore change this directory, or clear it, if you want to start learning from scratch. 604 | 605 | * `gpus` contains a list of GPU IDs to use. For now, do not use any. 606 | 607 | * `batchSize` specifies how many images to include in a batch. Here we use 16. 608 | 609 | * `learningRate` is the learning rate in SGD. 610 | 611 | * `plotDiagnostics` can be used to plot statistics during training. This is slow, but can help in setting a reasonable learning rate. Leave it off for now. 612 | 613 | * `numEpochs` is the number of epochs (passes through the training data) to perform before SGD stops. 614 | 615 | * `errorFunction`, set to `'none'`, disables plotting the default error metrics, which are suitable for classification but not for our problem. 616 | 617 | Finally, we can invoke the learning code: 618 | 619 | ```.language-matlab 620 | net = cnn_train(net, imdb, @getBatch, trainOpts) ; 621 | ``` 622 | 623 | The `getBatch()` function, passed as a *handle*, is particularly important. The training script `cnn_train` uses `getBatch()` to extract the images and corresponding labels for a certain batch, as follows: 624 | 625 | ```.language-matlab 626 | function [im, label] = getBatch(imdb, batch) 627 | im = imdb.images.data(:,:,:,batch) ; 628 | label = imdb.images.label(:,:,:,batch) ; 629 | ``` 630 | 631 | The function takes as input the `imdb` structure defined above and a list `batch` of image indexes that should be returned for training. In this case, this amounts simply to extracting and copying some data; however, in general `getBatch` can be used to e.g. read images from disk or apply transformations to them on the fly.
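You can convince yourself of what `getBatch()` returns by calling it manually from the MATLAB prompt (a small sketch; the sizes assume the $64 \times 64$ patches described above):

```.language-matlab
% Fetch a batch containing the first four examples
[im, label] = getBatch(imdb, 1:4) ;
size(im)    % expected: 64 64 1 4
size(label) % expected: 64 64 1 4
```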
632 | 633 | > **Task:** Run the training code and wait for learning to complete. Note that the model is saved in `data/text-small/net-epoch-20.mat`, where 20 is the number of the last epoch (as set by `trainOpts.numEpochs`). 634 | 635 | ### Part 3.4: Evaluate the model 636 | 637 | The network is evaluated on the validation set during training. The validation error (which in our case is the average squared difference between the predicted output pixels and the desired ones) is a good indicator of how well the network is doing (in practice, one should ultimately evaluate the network on a held-out test set). 638 | 639 | In our example it is also informative to evaluate the *qualitative* results of the model. This can be done as follows: 640 | 641 | ```.language-matlab 642 | train = find(imdb.images.set == 1) ; 643 | val = find(imdb.images.set == 2) ; 644 | 645 | figure(33) ; set(gcf, 'name', 'Part 3.4: Results on the training set') ; 646 | showDeblurringResult(net, imdb, train(1:30:151)) ; 647 | 648 | figure(34) ; set(gcf, 'name', 'Part 3.4: Results on the validation set') ; 649 | showDeblurringResult(net, imdb, val(1:30:151)) ; 650 | ``` 651 | 652 | Since the CNN is convolutional, it can be applied to arbitrarily sized images. `imdb.examples` contains a few larger examples too. The following code shows one: 653 | 654 | ```.language-matlab 655 | figure(35) ; 656 | set(gcf, 'name', 'Part 3.4: Larger example on the validation set') ; 657 | colormap gray ; 658 | subplot(1,2,1) ; imagesc(imdb.examples.blurred{1}, [-1 0]) ; 659 | axis image off ; 660 | title('CNN input') ; 661 | res = vl_simplenn(net, imdb.examples.blurred{1}) ; 662 | subplot(1,2,2) ; imagesc(res(end).x, [-1 0]) ; 663 | axis image off ; 664 | title('CNN output') ; 665 | ``` 666 | 667 | > **Questions:** 668 | > 669 | > * Do you think the network is doing a good job? 670 | > * Is there any obvious difference between training and validation performance? 671 | 672 | ### Part 3.5: Learning a larger model using the GPU 673 | 674 | So far, we have trained a single small network to solve this problem. Here, we will experiment with several variants to try to improve the performance as much as possible. 675 | 676 | Before we experiment further, however, it is beneficial to switch to using a GPU. If you have a GPU and the MATLAB Parallel Computing Toolbox installed, you can try running the code above on the GPU by changing a single switch. To prepare MatConvNet to use the GPU, change the first line of the script from `setup` to: 677 | ```.language-matlab 678 | setup('useGpu', true) ; 679 | ``` 680 | Assuming that the GPU has index 1 (which is always the case if there is a single CUDA-compatible GPU in your machine), modify the training options to tell MatConvNet to use that GPU: 681 | 682 | ```.language-matlab 683 | trainOpts.expDir = 'data/text-small-gpu' ; 684 | trainOpts.gpus = [1] ; 685 | ``` 686 | 687 | The code above also changes `expDir` in order to start a new experiment from scratch. 688 | 689 | > **Task:** Test GPU-based training (if possible). How much faster does it run compared to CPU-based training? 690 | 691 | Now we are ready to experiment with different CNNs. 692 | 693 | > **Task:** Run a new experiment, this time using the `initializeLargeCNN()` function to construct a larger network. 694 | 695 | > **Questions:** 696 | > 697 | > 1. How much slower is this network compared to the small model? 698 | > 2. What about the quantitative performance on the validation set? 699 | > 3. What about the qualitative performance? 700 | 701 | ### Part 3.6: Challenge!
702 | 703 | You are now in control. Play around with the model definition and try to improve the performance as much as possible. For example: 704 | 705 | * Try adding more layers[^goingdeeper]. 706 | * Try adding more filters. 707 | * Try a different loss function, such as $L^1$. 708 | * Try increasing the receptive field size by increasing the filter support (do not forget to adjust the padding). 709 | * Try sequences of rank-1 filters, such as $7 \times 1$ followed by $1 \times 7$, to increase the receptive field size while maintaining efficiency. 710 | 711 | And, of course, make sure to beat the other students. 712 | 713 | > **Remark:** You can see the relative change of the network weights by setting `trainOpts.plotDiagnostics = true ;` 714 | 715 | ## Links and further work 716 | 717 | * The code for this practical is written using the software package [MatConvNet](http://www.vlfeat.org/matconvnet). This is a software library written in MATLAB, C++, and CUDA, and its source code is freely available. 718 | 719 | * MatConvNet can train complex computer vision models, such as VGG VD and Inception. Several of these models, including a few cool demos, are available for download. 720 | 721 | * Many more computer vision practicals are available [here](https://www.robots.ox.ac.uk/~vgg/practicals/overview/index.html). 722 | 723 | ## Acknowledgements 724 | 725 | * We thank NVIDIA and MathWorks for their valuable help in setting up the development environment for this tutorial. 726 | 727 | 728 | ## iV&L Summer School instructions 729 | 730 | Connect to Qwiklabs as you have been instructed. Press the `Select` button for the MatConvNet Lab: 731 | 732 | ![Step 1](images/step1.png) 733 | 734 | Press the `Start Lab` button: 735 | 736 | ![Step 2](images/step2.png) 737 | 738 | Wait for the progress bar to finish (this may take one or two minutes): 739 | 740 | ![Step 3](images/step3.png) 741 | 742 | Click the `lab instructions` link and follow the rest of the instructions: 743 | 744 | ![Step 4](images/step4.png) 745 | 746 | Once MATLAB is started, continue from [the top](#getting). 747 | 748 | ### If shortcuts in MATLAB do not work properly 749 | By default, MATLAB on Linux systems has Emacs-style shortcuts. To change to the more familiar Windows-style shortcuts: 750 | 751 | * Type `preferences` in the MATLAB Command Window. 752 | * Pick `Keyboard -> Shortcuts`. 753 | * Change `Active settings` to `Windows Default Set`: 754 | ![Setting Windows-style shortcuts](images/matlab_set_shortcuts.svg) 755 | * Click `OK` to apply and close the settings window. 756 | 757 | ### If the screen is too small or too large 758 | 759 | If you are running the practical through a VNC-based remote desktop connection, you can try adjusting the resolution by changing the setting in the OS (click on the big Ubuntu button on the top left and search for `Displays`). 760 | 761 | ## History 762 | 763 | * Used in the [IV & L Net](http://ivl-net.eu/ivl-net-training-school-2016/) summer school, Malta, 2016. 764 | 765 | [^convolution]: If you are familiar with convolution as defined in mathematics and signal processing, you might expect to find the index $i-u$ instead of $i+u$ in this expression. The convention $i+u$, often used in CNNs, is usually referred to as *correlation*. 766 | 767 | [^derivative]: The derivative is computed with respect to a certain assignment $x_0$ and $(w_1,\dots,w_L)$ to the network input and parameters; furthermore, the intermediate derivatives are computed at points $x_1,\dots,x_L$ obtained by evaluating the network at $x_0$. 768 | 769 | [^stacking]: The stacking operator $\vv$ simply unfolds a tensor into a vector by stacking its elements in some pre-defined order.
For example: 770 | $$ 771 | \vv\begin{bmatrix} 772 | 1 & 3 & 5\\ 773 | 2 & 4 & 6 774 | \end{bmatrix}=\begin{bmatrix} 775 | 1\\2\\3\\4\\5\\6 776 | \end{bmatrix} 777 | $$ 778 | 779 | [^goingdeeper]: Like for example. 780 | -------------------------------------------------------------------------------- /exercise1.m: -------------------------------------------------------------------------------- 1 | setup() ; 2 | 3 | %% Part 1.1: convolution 4 | 5 | %% Part 1.1.1: convolution by a single filter 6 | 7 | % Load an image and convert it to gray scale and single precision 8 | x = im2single(rgb2gray(imread('data/ray.jpg'))) ; 9 | 10 | % Define a filter 11 | w = single([ 12 | 0 -1 0 13 | -1 4 -1 14 | 0 -1 0]) ; 15 | 16 | % Apply the filter to the image 17 | y = vl_nnconv(x, w, []) ; 18 | 19 | % Visualize the results 20 | figure(11) ; clf ; colormap gray ; 21 | set(gcf, 'name', 'Part 1.1: convolution') ; 22 | 23 | subplot(2,2,1) ; 24 | imagesc(x) ; 25 | axis off image ; 26 | title('Input image x') ; 27 | 28 | subplot(2,2,2) ; 29 | imagesc(w) ; 30 | axis off image ; 31 | title('Filter w') ; 32 | 33 | subplot(2,2,3) ; 34 | imagesc(y) ; 35 | axis off image ; 36 | title('Output image y') ; 37 | 38 | %% Part 1.1.2: convolution by a bank of filters 39 | 40 | % Concatenate three filters in a bank 41 | w1 = single([ 42 | 0 -1 0 43 | -1 4 -1 44 | 0 -1 0]) ; 45 | 46 | w2 = single([ 47 | -1 0 +1 48 | -1 0 +1 49 | -1 0 +1]) ; 50 | 51 | w3 = single([ 52 | -1 -1 -1 53 | 0 0 0 54 | +1 +1 +1]) ; 55 | 56 | wbank = cat(4, w1, w2, w3) ; 57 | 58 | % Apply convolution 59 | y = vl_nnconv(x, wbank, []) ; 60 | 61 | % Show feature channels 62 | figure(12) ; clf('reset') ; 63 | set(gcf, 'name', 'Part 1.1.2: channels') ; 64 | colormap gray ; 65 | showFeatureChannels(y) ; 66 | 67 | %% Part 1.1.3: convolving a batch of images 68 | 69 | x1 = im2single(rgb2gray(imread('data/ray.jpg'))) ; 70 | x2 = im2single(rgb2gray(imread('data/crab.jpg'))) ; 71 | x = cat(4, x1, x2) ; 72 | 73 | y = vl_nnconv(x, wbank, []) ; 74 | 75 | figure(13) ; clf('reset') ; colormap gray ; 76 | set(gcf, 'name', 'Part 1.1.3: filtering a batch') ; 77 | 78 | subplot(4,2,1) ; imagesc(x1) ; axis off image ; 79 | subplot(4,2,3) ; imagesc(y(:,:,1,1)) ; axis off image ; 80 | subplot(4,2,5) ; imagesc(y(:,:,2,1)) ; axis off image ; 81 | subplot(4,2,7) ; imagesc(y(:,:,3,1)) ; axis off image ; 82 | 83 | subplot(4,2,2) ; imagesc(x2) ; axis off image ; 84 | subplot(4,2,4) ; imagesc(y(:,:,1,2)) ; axis off image ; 85 | subplot(4,2,6) ; imagesc(y(:,:,2,2)) ; axis off image ; 86 | subplot(4,2,8) ; imagesc(y(:,:,3,2)) ; axis off image ; 87 | 88 | %% Part 1.2: non-linear activation functions (ReLU) 89 | 90 | %% Part 1.2.1: Laplacian and ReLU 91 | x = im2single(rgb2gray(imread('data/ray.jpg'))) ; 92 | 93 | % Convolve with the negated Laplacian 94 | y = vl_nnconv(x, - w, []) ; 95 | 96 | % Apply the ReLU operator 97 | z = vl_nnrelu(y) ; 98 | 99 | figure(14) ; clf ; set(gcf, 'name', 'Part 1.2.1: Laplacian and ReLU') ; 100 | colormap gray ; 101 | subplot(2,2,1); imagesc(x) ; axis off image ; title('Image x') ; 102 | subplot(2,2,2); imagesc(y) ; axis off image ; title('Laplacian y') 103 | subplot(2,2,3); imagesc(z) ; axis off image ; title('ReLU z') ; 104 | 105 | %% Part 1.2.2: effect of adding a bias 106 | 107 | bias = single(- 0.2) ; 108 | y = vl_nnconv(x, - w, bias) ; 109 | z = vl_nnrelu(y) ; 110 | 111 | figure(15) ; clf ; set(gcf, 'name', 'Part 1.2.2: adding a bias') ; 112 | colormap gray ; 113 | subplot(2,2,1); imagesc(x) ; axis off image ; title('Image x') ; 114 | 
subplot(2,2,2); imagesc(y) ; axis off image ; title('Laplacian y with bias') ; 115 | subplot(2,2,3); imagesc(z) ; axis off image ; title('ReLU z') ; 116 | -------------------------------------------------------------------------------- /exercise2.m: -------------------------------------------------------------------------------- 1 | setup() ; 2 | 3 | %% Part 2.1: Backward mode verification 4 | 5 | % Create a random input image batch 6 | x = randn(10, 10, 1, 2, 'single') ; 7 | 8 | % Define a filter 9 | w = single([ 10 | 0 -1 0 11 | -1 4 -1 12 | 0 -1 0]) ; 13 | 14 | % Forward mode: evaluate the convolution 15 | y = vl_nnconv(x, w, []) ; 16 | 17 | % Pick a random projection tensor 18 | p = randn(size(y), 'single') ; 19 | 20 | % Backward mode: projected derivatives 21 | [dx,dw] = vl_nnconv(x, w, [], p) ; 22 | 23 | % Check the derivative numerically 24 | figure(21) ; clf('reset') ; 25 | set(gcf, 'name', 'Part 2.1: single layer backprop') ; 26 | checkDerivativeNumerically(@(x) proj(p, vl_nnconv(x, w, [])), x, dx) ; 27 | 28 | %% Part 2.2: Backpropagation 29 | 30 | % Create a random input image batch 31 | x = randn(10, 10, 1, 2, 'single') ; 32 | 33 | % Forward mode: evaluate the conv followed by ReLU 34 | y = vl_nnconv(x, w, []) ; 35 | z = vl_nnrelu(y) ; 36 | 37 | % Pick a random projection tensor 38 | p = randn(size(z), 'single') ; 39 | 40 | % Backward mode: projected derivatives 41 | dy = vl_nnrelu(z, p) ; 42 | [dx,dw] = vl_nnconv(x, w, [], dy) ; 43 | 44 | % Check the derivative numerically 45 | figure(22) ; clf('reset') ; 46 | set(gcf, 'name', 'Part 2.2: two layers backprop') ; 47 | func = @(x) proj(p, vl_nnrelu(vl_nnconv(x, w, []))) ; 48 | checkDerivativeNumerically(func, x, dx) ; 49 | 50 | %% Part 2.3: Design and verify your own layer 51 | 52 | x0 = randn(size(x), 'single') ; 53 | 54 | forward = @l2LossForward; backward = @l2LossBackward ; 55 | 56 | % Uncomment the following line to test your L1 loss implementation 57 | % forward = @l1LossForward; backward = @l1LossBackward ; 58 | 59 | y = forward(x, x0) ; 60 | 61 | p = randn(size(y), 'single') ; 62 | dx = backward(x, x0, p) ; 63 | 64 | % Check the derivative numerically 65 | figure(23) ; clf('reset') ; 66 | set(gcf, 'name', 'Part 2.3: custom loss layer') ; 67 | func = @(x) proj(p, forward(x, x0)) ; 68 | checkDerivativeNumerically(func, x, dx) ; 69 | -------------------------------------------------------------------------------- /exercise3.m: -------------------------------------------------------------------------------- 1 | setup() ; 2 | % setup('useGpu', true); % Uncomment to initialise with GPU support 3 | 4 | %% Part 3.1: Prepare the data 5 | 6 | % Load a database of blurred images to train from 7 | imdb = load('data/text_imdb.mat') ; 8 | 9 | % Visualize the first image in the database 10 | figure(31) ; set(gcf, 'name', 'Part 3.1: Data') ; clf ; 11 | 12 | subplot(1,2,1) ; imagesc(imdb.images.data(:,:,:,1)) ; 13 | axis off image ; title('Input (blurred)') ; 14 | 15 | subplot(1,2,2) ; imagesc(imdb.images.label(:,:,:,1)) ; 16 | axis off image ; title('Desired output (sharp)') ; 17 | 18 | colormap gray ; 19 | 20 | %% Part 3.2: Create a network architecture 21 | % 22 | % The expected input size (a single 64 x 64 x 1 image patch). This is 23 | % used for visualization purposes.
24 | 25 | net = initializeSmallCNN() ; 26 | %net = initializeLargeCNN() ; 27 | 28 | % Display network 29 | vl_simplenn_display(net) ; 30 | 31 | % Evaluate network on an image 32 | res = vl_simplenn(net, imdb.images.data(:,:,:,1)) ; 33 | 34 | figure(32) ; clf ; colormap gray ; 35 | set(gcf,'name', 'Part 3.2: network input') ; 36 | subplot(1,2,1) ; 37 | imagesc(res(1).x) ; axis image off ; 38 | title('CNN input') ; 39 | 40 | set(gcf,'name', 'Part 3.2: network output') ; 41 | subplot(1,2,2) ; 42 | imagesc(res(end).x) ; axis image off ; 43 | title('CNN output (not trained yet)') ; 44 | 45 | %% Part 3.3: learn the model 46 | 47 | % Add a loss (using a custom layer) 48 | net = addCustomLossLayer(net, @l2LossForward, @l2LossBackward) ; 49 | 50 | % Extra: uncomment the following line to use your implementation 51 | % of the L1 loss 52 | %net = addCustomLossLayer(net, @l1LossForward, @l1LossBackward) ; 53 | 54 | % Train 55 | trainOpts.expDir = 'data/text-small' ; 56 | trainOpts.gpus = [] ; 57 | % Uncomment for GPU training: 58 | %trainOpts.expDir = 'data/text-small-gpu' ; 59 | %trainOpts.gpus = [1] ; 60 | trainOpts.batchSize = 16 ; 61 | trainOpts.learningRate = 0.02 ; 62 | trainOpts.plotDiagnostics = false ; 63 | %trainOpts.plotDiagnostics = true ; % Uncomment to plot diagnostics 64 | trainOpts.numEpochs = 20 ; 65 | trainOpts.errorFunction = 'none' ; 66 | 67 | net = cnn_train(net, imdb, @getBatch, trainOpts) ; 68 | 69 | % Deploy: remove loss 70 | net.layers(end) = [] ; 71 | 72 | %% Part 3.4: evaluate the model 73 | 74 | train = find(imdb.images.set == 1) ; 75 | val = find(imdb.images.set == 2) ; 76 | 77 | figure(33) ; set(gcf, 'name', 'Part 3.4: Results on the training set') ; 78 | showDeblurringResult(net, imdb, train(1:30:151)) ; 79 | 80 | figure(34) ; set(gcf, 'name', 'Part 3.4: Results on the validation set') ; 81 | showDeblurringResult(net, imdb, val(1:30:151)) ; 82 | 83 | figure(35) ; 84 | set(gcf, 'name', 'Part 3.4: Larger example on the validation set') ; 85 | colormap gray ; 86 | subplot(1,2,1) ; imagesc(imdb.examples.blurred{1}, [-1, 0]) ; 87 | axis image off ; 88 | title('CNN input') ; 89 | res = vl_simplenn(net, imdb.examples.blurred{1}) ; 90 | subplot(1,2,2) ; imagesc(res(end).x, [-1, 0]) ; 91 | axis image off ; 92 | title('CNN output') ; 93 | -------------------------------------------------------------------------------- /extra/Makefile: -------------------------------------------------------------------------------- 1 | # Build practical 2 | 3 | name ?= practical-cnn-reg 4 | ver ?= 2016a 5 | 6 | code=\ 7 | checkDerivativeNumerically.m \ 8 | l1LossBackward.m \ 9 | l1LossForward.m \ 10 | l2LossBackward.m \ 11 | l2LossForward.m \ 12 | exercise1.m \ 13 | exercise2.m \ 14 | exercise3.m \ 15 | getBatch.m \ 16 | addCustomLossLayer.m \ 17 | initializeLargeCNN.m \ 18 | initializeSmallCNN.m \ 19 | proj.m \ 20 | setup.m \ 21 | showDeblurringResult.m \ 22 | showFeatureChannels.m \ 23 | xavier.m \ 24 | README.md \ 25 | matconvnet 26 | 27 | doc=\ 28 | doc/images \ 29 | doc/instructions.html \ 30 | doc/base.css \ 31 | doc/prism.css \ 32 | doc/prism.js 33 | 34 | data=\ 35 | data/text_imdb.mat \ 36 | data/ray.jpg \ 37 | data/crab.jpg 38 | 39 | include extra/practical/Makefile 40 | 41 | BINPACK_URL=http://www.robots.ox.ac.uk/~karel/project/matconvnet/bin 42 | 43 | %-bin: $(TMPDIR)/matconvnet-%-bin.tar.gz 44 | tar xzvf $< -C ./matconvnet 45 | 46 | $(TMPDIR)/matconvnet-%-bin.tar.gz: 47 | wget $(BINPACK_URL)/$(notdir $@) -O $@ 48 | 49 | # Preprocessing 50 | .PHONY: preproc bins 51 | 52 | 
papers_url=http://www.robots.ox.ac.uk/~vedaldi/assets/pubs 53 | papers=\ 54 | blaschko10simultaneous.pdf \ 55 | chatfield11devil.pdf \ 56 | chatfield14return.pdf \ 57 | cimpoi14describing.pdf \ 58 | cimpoi15deep.pdf \ 59 | cimpoi16deep.pdf \ 60 | fulkerson08localizing.pdf \ 61 | fulkerson09class.pdf \ 62 | jaderberg14deep.pdf \ 63 | jaderberg14speeding.pdf \ 64 | jaderberg14synthetic.pdf \ 65 | jaderberg15reading.pdf \ 66 | jones07inertial.pdf \ 67 | juneja13blocks.pdf \ 68 | lempitsky11pylon.pdf \ 69 | lenc15rcnn.pdf \ 70 | lenc15understanding.pdf \ 71 | mahendran15understanding.pdf \ 72 | parizi15automatic.pdf \ 73 | parkhi11truth.pdf \ 74 | parkhi12cat.pdf \ 75 | parkhi12spotting.pdf \ 76 | parkhi14compact.pdf \ 77 | parkhi15deep.pdf \ 78 | pedersoli11coarse.pdf \ 79 | pedersoli14coarse.pdf \ 80 | rabinovich07objects.pdf \ 81 | simonyan12descriptor.pdf \ 82 | simonyan13deep.pdf \ 83 | simonyan13fisher.pdf \ 84 | simonyan14deep.pdf \ 85 | simonyan14learning.pdf \ 86 | sreekanth10generalized.pdf \ 87 | vedaldi05features.pdf \ 88 | vedaldi05kalmansac.pdf \ 89 | vedaldi05TRviewpoint.pdf \ 90 | vedaldi06local.pdf \ 91 | vedaldi06viewpoint.pdf \ 92 | vedaldi07boosting.pdf \ 93 | vedaldi07complexity.pdf \ 94 | vedaldi07moving-orig.pdf \ 95 | vedaldi07moving.pdf \ 96 | vedaldi07open.pdf \ 97 | vedaldi08joint.pdf \ 98 | vedaldi08quick.pdf \ 99 | vedaldi08relaxed.pdf \ 100 | vedaldi09multiple.pdf \ 101 | vedaldi09structured.pdf \ 102 | vedaldi10efficient.pdf \ 103 | vedaldi10knowing.pdf \ 104 | vedaldi10vlfeat.pdf \ 105 | vedaldi11efficient.pdf \ 106 | vedaldi11learning.pdf \ 107 | vedaldi12self.pdf \ 108 | vedaldi12sparse.pdf \ 109 | vedaldi14understanding.pdf \ 110 | vedaldi15matconvnet.pdf 111 | 112 | papers:=$(addprefix data/text/,$(papers)) 113 | papers_crops=$(papers:%.pdf=%.png) 114 | 115 | preproc: $(papers) $(papers_crops) 116 | 117 | bins: win7-bin maci64-bin glnxa64-bin 118 | rm -f matlab/mex/*.mex* 119 | 120 | ec2bins: ec2-bin 121 | rm -f matlab/mex/*.mex* 122 | 123 | data/text/%.pdf: 124 | mkdir -p data/text ; \ 125 | wget "$(papers_url)/$(*).pdf" -O "$(@)" 126 | 127 | %.png : %.pdf extra/Makefile 128 | convert \ 129 | -verbose \ 130 | -units PixelsPerInch \ 131 | -density 300 \ 132 | "$(<)"[2] \ 133 | -colorspace 'rgb' \ 134 | -flatten \ 135 | -resize 'x1536' \ 136 | -gravity center -crop 512x512+0+0 \ 137 | "$(@)" 138 | 139 | info: 140 | @echo Images: "$(papers)" 141 | -------------------------------------------------------------------------------- /extra/getBlurredImagesData.m: -------------------------------------------------------------------------------- 1 | function imdb = getBlurredImagesData(dataDir) 2 | %GETBLURREDIMAGESDATA Get the data for the text deblurring exercise 3 | % IMDB = GETBLURREDIMAGESDATA(DATADIR) reads a directory of PNG 4 | % images DATADIR and returns a corresponding IMDB structure. 
5 | 6 | imdb.images.id = {} ; 7 | imdb.images.data = {} ; 8 | imdb.images.set = {} ; 9 | imdb.images.label = {} ; 10 | 11 | names = dir(fullfile(dataDir, '*.png')) ; 12 | names = {names.name} ; 13 | 14 | numCollected = 0 ; 15 | 16 | for i = 1:numel(names) 17 | im = imread(fullfile(dataDir, names{i})) ; 18 | im = im2single(im) ; 19 | if size(im,3) > 1, im = rgb2gray(im) ; end 20 | im = im - 1 ; % make white = 0 21 | label = im ; 22 | 23 | G = fspecial('gaussian', [5 5], 2); 24 | im = imfilter(label,G,'same') ; 25 | s = 1 + (i > numel(names)*.75) ; 26 | 27 | if s == 2 && numCollected < 10 28 | numCollected = numCollected + 1 ; 29 | imdb.examples.sharp{numCollected} = label ; 30 | imdb.examples.blurred{numCollected} = im ; 31 | end 32 | 33 | % further break each image into 64 x 64 tiles 34 | for ti = 0:7 35 | for tj = 0:7 36 | si = ti*64 + (1:64) ; 37 | sj = tj*64 + (1:64) ; 38 | im_ = im(si,sj) ; 39 | label_ = label(si,sj) ; 40 | % drop if nothing in the patch 41 | if std(im_(:)) < 0.05, continue ; end 42 | imdb.images.id{end+1} = numel(imdb.images.id) + 1 ; 43 | imdb.images.set{end+1} = s ; 44 | imdb.images.label{end+1} = label_ ; 45 | imdb.images.data{end+1} = im_ ; 46 | end 47 | end 48 | end 49 | 50 | imdb.images.id = horzcat(imdb.images.id{:}) ; 51 | imdb.images.set = horzcat(imdb.images.set{:}) ; 52 | imdb.images.label = cat(4, imdb.images.label{:}) ; 53 | imdb.images.data = cat(4, imdb.images.data{:}) ; 54 | -------------------------------------------------------------------------------- /extra/post.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # A version with standard binaries 4 | make -f extra/Makefile distclean 5 | make -f extra/Makefile bins 6 | make -f extra/Makefile pack post post-doc 7 | 8 | # A version with EC2 binaries 9 | name=practical-cnn-reg-ec2 make -f extra/Makefile distclean 10 | name=practical-cnn-reg-ec2 make -f extra/Makefile ec2bins 11 | name=practical-cnn-reg-ec2 make -f extra/Makefile pack post 12 | -------------------------------------------------------------------------------- /extra/preprocess.m: -------------------------------------------------------------------------------- 1 | function preprocess() 2 | % Run the Makefile first 3 | 4 | opts.dataDir = 'data/text/' ; 5 | opts.imdbPath = 'data/text_imdb.mat' ; 6 | 7 | setup() ; 8 | 9 | if ~exist(opts.imdbPath) 10 | imdb = getBlurredImagesData(opts.dataDir) ; 11 | save(opts.imdbPath, '-struct', 'imdb') ; 12 | end 13 | 14 | -------------------------------------------------------------------------------- /getBatch.m: -------------------------------------------------------------------------------- 1 | function [im, label] = getBatch(imdb, batch) 2 | %GETBATCH Get a batch of training data 3 | % [IM, LABEL] = GETBATCH(IMDB, BATCH) extracts the images IM 4 | % and labels LABEL from IMDB according to the list of images 5 | % BATCH. 6 | 7 | im = imdb.images.data(:,:,:,batch) ; 8 | label = imdb.images.label(:,:,:,batch) ; 9 | -------------------------------------------------------------------------------- /initializeLargeCNN.m: -------------------------------------------------------------------------------- 1 | function net = initializeLargeCNN() 2 | %INITIALIZELARGECNN Initialize a large CNN for text deblurring 3 | % NET = INITIALIZELARGECNN() returns the SimpleNN model NET. 4 | 5 | net.meta.inputSize = [64 64 1 1] ; 6 | 7 | net.layers = { } ; 8 | 9 | net.layers{end+1} = struct(... 10 | 'name', 'conv1', ... 11 | 'type', 'conv', ...
12 | 'weights', {xavier(5,5,1,32)}, ... 13 | 'pad', 2, ... 14 | 'learningRate', [1 1], ... 15 | 'weightDecay', [1 0]) ; 16 | 17 | net.layers{end+1} = struct(... 18 | 'name', 'relu1', ... 19 | 'type', 'relu') ; 20 | 21 | net.layers{end+1} = struct(... 22 | 'name', 'conv2', ... 23 | 'type', 'conv', ... 24 | 'weights', {xavier(3,3,32,32)}, ... 25 | 'pad', 1, ... 26 | 'learningRate', [1 1], ... 27 | 'weightDecay', [1 0]) ; 28 | 29 | net.layers{end+1} = struct(... 30 | 'name', 'relu2', ... 31 | 'type', 'relu') ; 32 | 33 | net.layers{end+1} = struct(... 34 | 'name', 'conv3', ... 35 | 'type', 'conv', ... 36 | 'weights', {xavier(1,7,32,32)}, ... 37 | 'pad', [0 0 3 3], ... 38 | 'learningRate', [1 1], ... 39 | 'weightDecay', [1 0]) ; 40 | 41 | net.layers{end+1} = struct(... 42 | 'name', 'relu3', ... 43 | 'type', 'relu') ; 44 | 45 | net.layers{end+1} = struct(... 46 | 'name', 'conv4', ... 47 | 'type', 'conv', ... 48 | 'weights', {xavier(7,1,32,32)}, ... 49 | 'pad', [3 3 0 0], ... 50 | 'learningRate', [1 1], ... 51 | 'weightDecay', [1 0]) ; 52 | 53 | net.layers{end+1} = struct(... 54 | 'name', 'relu4', ... 55 | 'type', 'relu') ; 56 | 57 | net.layers{end+1} = struct(... 58 | 'name', 'prediction', ... 59 | 'type', 'conv', ... 60 | 'weights', {xavier(3,3,32,1)}, ... 61 | 'pad', 1, ... 62 | 'stride', 1, ... 63 | 'learningRate', [1 .001], ... 64 | 'weightDecay', [1 0]) ; 65 | 66 | % Consolidate the network, fixing any missing option 67 | % in the specification above. 68 | 69 | net = vl_simplenn_tidy(net) ; 70 | -------------------------------------------------------------------------------- /initializeSmallCNN.m: -------------------------------------------------------------------------------- 1 | function net = initializeSmallCNN() 2 | %INITIALIZESMALLCNN Initialize a small CNN for text deblurring 3 | % NET = INITIALIZESMALLCNN() returns the SimpleNN model NET. 4 | 5 | net.meta.inputSize = [64 64 1 1] ; 6 | 7 | net.layers = { } ; 8 | 9 | net.layers{end+1} = struct(... 10 | 'name', 'conv1', ... 11 | 'type', 'conv', ... 12 | 'weights', {xavier(3,3,1,32)}, ... 13 | 'pad', 1, ... 14 | 'stride', 1, ... 15 | 'learningRate', [1 1], ... 16 | 'weightDecay', [1 0]) ; 17 | 18 | net.layers{end+1} = struct(... 19 | 'name', 'relu1', ... 20 | 'type', 'relu') ; 21 | 22 | net.layers{end+1} = struct(... 23 | 'name', 'conv2', ... 24 | 'type', 'conv', ... 25 | 'weights', {xavier(3,3,32,32)}, ... 26 | 'pad', 1, ... 27 | 'stride', 1, ... 28 | 'learningRate', [1 1], ... 29 | 'weightDecay', [1 0]) ; 30 | 31 | net.layers{end+1} = struct(... 32 | 'name', 'relu2', ... 33 | 'type', 'relu') ; 34 | 35 | net.layers{end+1} = struct(... 36 | 'name', 'prediction', ... 37 | 'type', 'conv', ... 38 | 'weights', {xavier(3,3,32,1)}, ... 39 | 'pad', 1, ... 40 | 'stride', 1, ... 41 | 'learningRate', [1 1], ... 
42 | 'weightDecay', [1 0]) ; 43 | 44 | % Consolidate the network, fixing any missing option 45 | % in the specification above 46 | 47 | net = vl_simplenn_tidy(net) ; -------------------------------------------------------------------------------- /l1LossBackward.m: -------------------------------------------------------------------------------- 1 | function dx = l1LossBackward(x,r,p) 2 | % TODO: Replace the following line with your implementation 3 | dx = rand(size(x), 'like', x) ; 4 | 5 | dx = dx / (size(x,1) * size(x,2)) ; % normalize by image size 6 | -------------------------------------------------------------------------------- /l1LossForward.m: -------------------------------------------------------------------------------- 1 | function y = l1LossForward(x,r) 2 | % TODO: Replace the following line with your implementation 3 | y = rand(size(x), 'like', x) ; 4 | 5 | y = y / (size(x,1) * size(x,2)) ; % normalize by image size 6 | -------------------------------------------------------------------------------- /l2LossBackward.m: -------------------------------------------------------------------------------- 1 | function dx = l2LossBackward(x,r,p) 2 | dx = 2 * p * (x - r) ; 3 | dx = dx / (size(x,1) * size(x,2)) ; % normalize by image size 4 | -------------------------------------------------------------------------------- /l2LossForward.m: -------------------------------------------------------------------------------- 1 | function y = l2LossForward(x,r) 2 | delta = x - r ; 3 | y = sum(delta(:).^2) ; 4 | y = y / (size(x,1) * size(x,2)) ; % normalize by image size 5 | -------------------------------------------------------------------------------- /proj.m: -------------------------------------------------------------------------------- 1 | function z = proj(x,p) 2 | %PROJ Project a tensor onto another 3 | % Z = PROJ(X,P) computes the projection Z of tensor X onto P. 4 | % 5 | % Remark: if X and P contain multiple tensor instances 6 | % (concatenated along the fourth dimension), then the 7 | % scalar projections of all instances are summed into Z. 8 | 9 | prods = x .* p ; 10 | z = sum(prods(:)) ; 11 | -------------------------------------------------------------------------------- /setup.m: -------------------------------------------------------------------------------- 1 | function setup(varargin) 2 | %SETUP() Initialize the practical 3 | % SETUP() initializes the practical. SETUP('useGpu', true) does 4 | % the same, but compiles the GPU support as well. 5 | 6 | base = fileparts(mfilename('fullpath')) ; 7 | run(fullfile(base, 'matconvnet', 'matlab', 'vl_setupnn')) ; 8 | 9 | opts.useGpu = false ; 10 | opts.verbose = false ; 11 | opts = vl_argparse(opts, varargin) ; 12 | 13 | addpath(fullfile(base, 'matconvnet', 'examples')) ; 14 | 15 | try 16 | vl_nnconv(single(1),single(1),[]) ; 17 | catch 18 | warning('VL_NNCONV() does not seem to be compiled. Trying to compile it now.') ; 19 | vl_compilenn('enableGpu', opts.useGpu, 'verbose', opts.verbose, ... 20 | 'enableImreadJpeg', false) ; 21 | end 22 | 23 | if opts.useGpu 24 | try 25 | vl_nnconv(gpuArray(single(1)),gpuArray(single(1)),[]) ; 26 | catch 27 | warning('GPU support does not seem to be compiled in MatConvNet. Trying to compile it now.') ; 28 | vl_compilenn('enableGpu', opts.useGpu, 'verbose', opts.verbose, ...
29 | 'enableImreadJpeg', false) ; 30 | end 31 | end 32 | 33 | if verLessThan('matlab','7.12') 34 | % MATLAB R2010b did not have rng() 35 | randn('state',0) ; 36 | rand('state',0) ; 37 | else 38 | rng(0) ; 39 | end 40 | 41 | % The EC2 instance reports an incorrect screen size, which 42 | % leads to a tiny font in figures 43 | 44 | [~, hostname] = system('hostname') ; 45 | if strcmp(hostname(1:3), 'ip-') 46 | set(0, 'DefaultAxesFontSize', 30) ; 47 | end 48 | 49 | 50 | -------------------------------------------------------------------------------- /showDeblurringResult.m: -------------------------------------------------------------------------------- 1 | function showDeblurringResult(net, imdb, subset) 2 | %SHOWDEBLURRINGRESULT Show a few examples of deblurred images 3 | % SHOWDEBLURRINGRESULT(NET, IMDB, SUBSET) uses the CNN NET to 4 | % deblur a few images in the IMDB database and visualize the result 5 | % in a figure. SUBSET is a vector of image indexes to display. 6 | 7 | % Evaluate the CNN to obtain deblurring results 8 | res = vl_simplenn(net, imdb.images.data(:,:,:,subset)) ; 9 | preds = res(end).x ; 10 | 11 | % Visualize the results in a figure 12 | clf ; 13 | n = numel(subset) ; 14 | for i = 1 : n 15 | j = subset(i) ; 16 | subplot(n,3,1+3*(i-1)) ; 17 | imagesc(imdb.images.data(:,:,:,j),[-1 0]) ; 18 | axis off image ; title('original') ; 19 | subplot(n,3,2+3*(i-1)) ; 20 | imagesc(imdb.images.label(:,:,:,j),[-1 0]) ; 21 | axis off image ; title('expected') ; 22 | subplot(n,3,3+3*(i-1)) ; 23 | imagesc(preds(:,:,:,i),[-1 0]) ; 24 | axis off image ; title('achieved') ; 25 | end 26 | colormap gray ; 27 | -------------------------------------------------------------------------------- /showFeatureChannels.m: -------------------------------------------------------------------------------- 1 | function showFeatureChannels(x) 2 | %SHOWFEATURECHANNELS Display the feature channels in the tensor x 3 | 4 | k = size(x,3) ; 5 | n = ceil(sqrt(k)) ; 6 | m = ceil(k/n) ; 7 | 8 | for i = 1:k 9 | subplot(m,n,i) ; imagesc(x(:,:,i)) ; 10 | title(sprintf('feature channel %d',i)) ; axis image ; 11 | end -------------------------------------------------------------------------------- /xavier.m: -------------------------------------------------------------------------------- 1 | function weights = xavier(varargin) 2 | %XAVIER Xavier filter initialization. 3 | % WEIGHTS = XAVIER(H, W, C, N) initializes N filters of support H x 4 | % W and C channels using the Xavier method. WEIGHTS = {FILTERS,BIASES} is 5 | % a cell array containing both filters and biases. 6 | % 7 | % See also: 8 | % Glorot, Xavier, and Yoshua Bengio. 9 | % "Understanding the difficulty of training deep feedforward neural networks." 10 | % International conference on artificial intelligence and statistics. 2010. 11 | 12 | filterSize = [varargin{:}] ; 13 | scale = sqrt(2/prod(filterSize(1:3))) ; 14 | filters = randn(filterSize, 'single') * scale ; 15 | biases = zeros(filterSize(4),1,'single') ; 16 | weights = {filters, biases} ; 17 | --------------------------------------------------------------------------------