├── .gitattributes
├── .gitignore
├── README.md
├── go
├── images
│   ├── elm_1E_3.png
│   ├── elmtest.png
│   ├── leastsq5.png
│   ├── leastsq8.png
│   ├── plotdat.png
│   ├── polyreg.png
│   ├── ridge4.png
│   ├── ridge5.png
│   └── ridge8.png
├── jelm.ijs
├── jelm.nw
├── jelm.pdf
└── jelm.tex

/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
*.swp
*.aux
jelm.log
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# jelm

Extreme Learning Machine in J

See the corresponding web page:
http://peportier.me/blog/201905_JELM
--------------------------------------------------------------------------------

/go:
--------------------------------------------------------------------------------
#!/bin/sh
# weave the literate source into LaTeX and build the PDF
noweave -x -delay jelm.nw > jelm.tex
pdflatex jelm
pdflatex jelm
# tangle the J program out of the literate source
notangle -Rjelm.ijs jelm.nw > jelm.ijs
--------------------------------------------------------------------------------

/images/elm_1E_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peportier/jelm/5f0447e2a9b5abb53cf423f830f43976f18147c6/images/elm_1E_3.png
--------------------------------------------------------------------------------

/images/elmtest.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peportier/jelm/5f0447e2a9b5abb53cf423f830f43976f18147c6/images/elmtest.png
--------------------------------------------------------------------------------

/images/leastsq5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peportier/jelm/5f0447e2a9b5abb53cf423f830f43976f18147c6/images/leastsq5.png
--------------------------------------------------------------------------------

/images/leastsq8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peportier/jelm/5f0447e2a9b5abb53cf423f830f43976f18147c6/images/leastsq8.png
--------------------------------------------------------------------------------

/images/plotdat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peportier/jelm/5f0447e2a9b5abb53cf423f830f43976f18147c6/images/plotdat.png
--------------------------------------------------------------------------------

/images/polyreg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peportier/jelm/5f0447e2a9b5abb53cf423f830f43976f18147c6/images/polyreg.png
--------------------------------------------------------------------------------

/images/ridge4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peportier/jelm/5f0447e2a9b5abb53cf423f830f43976f18147c6/images/ridge4.png
--------------------------------------------------------------------------------

/images/ridge5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peportier/jelm/5f0447e2a9b5abb53cf423f830f43976f18147c6/images/ridge5.png
--------------------------------------------------------------------------------

/images/ridge8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peportier/jelm/5f0447e2a9b5abb53cf423f830f43976f18147c6/images/ridge8.png
--------------------------------------------------------------------------------
/jelm.ijs:
--------------------------------------------------------------------------------
require'trig'
require'plot'
require'numeric'

NB. nudge a value up (resp. down) by 10% of its magnitude (plot padding)
pushup=: ] + 0.1 * |
pushdown=: ] - 0.1 * |

NB. locate the elements with values between {.x and {:x
sel=: (] >: {.@[) *. (] <: {:@[)

mean=: +/ % #
rmse=: [: %: [: mean ([: *: -)

mp=: +/ . * NB. matrix product

diag=: (<0 1)&|: : (([:(>:*i.)[:#])})
addDiag=: ([+diag@]) diag ] NB. add x to the diagonal of y

f=: 3 : '(^y) * cos 2*pi * sin pi * y'
noise=: 4 : 'y + -&x *&(+:x) ? (#y) # 0'

gendat=: 4 : 0
X=: ? y $ 0
Y=: x noise f X
minmaxX=: (<./ , >./) X
minmaxf=: (([: pushdown <./) , ([: pushup >./)) f steps 0 1 100
XT=: ? (>. 0.1 * y) $ 0
YT=: f XT
0
)

plotdatnoshow=: 3 : 0
pd 'reset'
pd 'color green'
pd 'type marker'
pd 'markersize 1'
pd 'markers circle'

pd X;Y
pd 'color red'
pd 'type line'
pd 'pensize 1'
pd (;f) steps 0 1 100
)
plotdat=: 3 : 0
plotdatnoshow 0
pd 'show'
)

plotpoly=: 3 : 0
plotdatnoshow 0
pd 'color blue'
xs=: (] #~ minmaxX"_ sel ]) /:~ X,steps 0 1 100
pval=: c&p. xs
crop=: minmaxf sel pval
pd (crop # xs);(crop # pval)
pd 'show'
)

polyreg=: 3 : 0
c=: Y %. X ^/ i.#X
YThat=: c&p. XT
plotpoly 0
)

gram=: 3 : 0
A=: X ^/ i.y
S=: (mp~ |:) A
)

leastsq=: 3 : 0
gram y
c=: ((|:A) mp Y) %. S
YThat=: c&p. XT
plotpoly 0
)

ridge=: 4 : 0
gram y
c=: ((|:A) mp Y) %. x addDiag S
YThat=: c&p. XT
plotpoly 0
)

plotelm=: 3 : 0
plotdatnoshow 0
pd 'type line'
pd 'color blue'
xs=: (] #~ minmaxX"_ sel ]) steps (<.<./X),(>.>./X),100
pd xs;(mkH ,. xs) mp c
pd 'show'
)

initelm=: 3 : 0
W=: _1 + 2 * ? (y,1) $ 0 NB. input weights
B=: ? y $ 0 NB. bias
H=: mkH ,. X
0 [ S=: (mp~ |:) H
)
mkH=: 3 : '0&>. B +"1 y mp"1/ W'

elm=: 3 : 0
c=: ((|:H) mp Y) %. y addDiag S
YThat=: (mkH ,. XT) mp c
plotelm 0
)

plottest=: 3 : 0
pd 'reset'
pd 'color green'
pd 'type marker'
pd 'markersize 1'
pd 'markers circle'

pd XT;YT
pd 'color magenta'
pd XT;YThat
pd 'color red'
pd 'type line'
pd 'pensize 1'
pd (;f) steps 0 1 100

pd 'show'
)

test=: 3 : 0
plottest 0
YT rmse YThat
)
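
NB. Example session (the calls shown in jelm.nw):
NB.   0.5 gendat 100   NB. 100 noisy training points drawn from f
NB.   leastsq 8        NB. unregularized fit, polynomial of degree 7
NB.   1E_4 ridge 8     NB. ridge-regularized fit
NB.   initelm 100      NB. hidden layer of 100 random neurons
NB.   elm 1E_3         NB. solve for the output weights c
NB.   test 0           NB. plot the predictions, return the RMSE on (XT,YT)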
--------------------------------------------------------------------------------

/jelm.nw:
--------------------------------------------------------------------------------
\documentclass[10pt]{article}
\usepackage[a4paper, total={6in, 8in}]{geometry}
\usepackage{graphicx}
\graphicspath{ {./images/} }
\usepackage{noweb}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{url}
\title{Extreme Learning Machine in J}
\author{Pierre-Edouard Portier}
\date{2019}
\renewcommand{\vec}[1]{\boldsymbol{#1}}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}

\begin{document}
\maketitle
\section{Regression}
$\vec{x}^{(1)}\dots\vec{x}^{(P)}$ are vectors of $\mathbb{R}^{n-1}$ with
associated values $y^{(1)}\dots y^{(P)}$ in $\mathbb{R}$.
We seek a function $f:\mathbb{R}^{n-1} \rightarrow \mathbb{R}$ that
models the observed relationship between $\vec{x}$ and $y$.
$f$ can have a fixed parameterized form. For example:
\[
f(\vec{x}) = a_0 + a_1 x_1 + a_2 x_2 + \dots + a_{n-1} x_{n-1}
\]

If $P=n$, the parameters $a_0 \dots a_{n-1}$ are found by solving a linear system.
\[
\begin{cases}
y^{(1)} &= a_0 + a_1 x_1^{(1)} + a_2 x_2^{(1)} + \dots + a_{n-1} x_{n-1}^{(1)} \\
\dots &= \dots \\
y^{(P)} &= a_0 + a_1 x_1^{(P)} + a_2 x_2^{(P)} + \dots + a_{n-1} x_{n-1}^{(P)} \\
\end{cases}
\]
This system can be written in matrix form.
\[
\left( \begin{array}{cccc}
1 & x^{(1)}_1 & \dots & x^{(1)}_{n-1} \\
1 & x^{(2)}_1 & \dots & x^{(2)}_{n-1} \\
\dots & \dots & \dots & \dots \\
1 & x^{(P)}_1 & \dots & x^{(P)}_{n-1}
\end{array} \right)
\left( \begin{array}{c}
a_0 \\ a_1 \\ \dots \\ a_{n-1}
\end{array} \right)
=
\left( \begin{array}{c}
y^{(1)} \\ y^{(2)} \\ \dots \\ y^{(P)}
\end{array} \right)
\]

Each row of the matrix on the left is a vector $\vec{x}^{(i)T}$ with an
additional constant coordinate that accounts for the parameter $a_0$.
Thus, naming this matrix $\vec{X}^T$, the linear system can also be written:
\[
\vec{X}^T \vec{a} = \vec{y}
\]

Consider the special case where $x$ is a number and $f$ is a polynomial of degree $n-1$:
\[
f(x) = a_0 + a_1 x + a_2 x^2 + \dots + a_{n-1}x^{n-1}
\]

With $P=n$ examples $\left(x^{(k)},y^{(k)}\right)$, the parameters are found by
solving the following linear system:
\begin{equation}
\left( \begin{array}{ccccc}
1 & x^{(1)} & (x^{(1)})^2 & \dots & (x^{(1)})^{n-1} \\
1 & x^{(2)} & (x^{(2)})^2 & \dots & (x^{(2)})^{n-1} \\
\dots & \dots & \dots & \dots & \dots \\
1 & x^{(P)} & (x^{(P)})^2 & \dots & (x^{(P)})^{n-1}
\end{array} \right)
\left( \begin{array}{c}
a_0 \\ a_1 \\ \dots \\ a_{n-1}
\end{array} \right)
=
\left( \begin{array}{c}
y^{(1)} \\ y^{(2)} \\ \dots \\ y^{(P)}
\end{array} \right)
\label{eqn:vandermonde}
\end{equation}
Incidentally, the matrix on the left is called the Vandermonde matrix.
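
In J, \eqref{eqn:vandermonde} is a one-liner: [[xp ^/ i.#xp]] builds the
Vandermonde matrix and matrix divide ([[%.]]) solves the system. A small
sketch with hypothetical sample points [[xp]] and [[yp]] taken on the
parabola $1+x^2$:

\begin{verbatim}
   xp=: 0 1 2
   yp=: 1 2 5
   xp ^/ i.#xp       NB. the Vandermonde matrix
1 0 0
1 1 1
1 2 4
   yp %. xp ^/ i.#xp NB. recovers (up to rounding) a0 a1 a2 = 1 0 1
\end{verbatim}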

\subsection{Experiment with a 1-dimensional synthetic dataset}
We define a nonlinear function [[f]] from which we generate a dataset.

<<dataset>>=
f=: 3 : '(^y) * cos 2*pi * sin pi * y'
<<noise>>
<<gendat>>

@
In traditional mathematical form, this function is:
\[f(x)=e^x \times \cos\left(2\pi \sin\left(\pi x\right)\right)\]

Function [[noise]] adds some random noise to the values of a vector. For
example, [[0.5 noise v]] adds random values uniformly drawn from the interval
$[-0.5,0.5]$ to the terms of the vector [[v]].

<<noise>>=
noise=: 4 : 'y + -&x *&(+:x) ? (#y) # 0'

@
[[0.5 gendat 10]] generates from [[f]] a dataset [[(X,Y)]] of 10 points, with
random noise in $[-0.5,0.5]$ added to [[Y]]. It also stores in [[minmaxX]] the
minimum and maximum values of [[X]]. It computes the pair [[minmaxf]],
whose first term is ten percent smaller than the minimum of [[f]] on the
interval $[0,1]$, and whose second term is ten percent bigger than the maximum
of [[f]] on the interval $[0,1]$. [[minmaxf]] is later used to crop the plots
so that extreme values are not visible.

A test set [[(XT,YT)]] is used to assess the capacity of the model to
generalize to unseen data. Its size is fixed at $10\%$ of the size of the
training set.

<<utils>>=
pushup=: ] + 0.1 * |
pushdown=: ] - 0.1 * |
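
@
[[pushup]] and [[pushdown]] nudge a value up (resp. down) by ten percent of
its magnitude; using the magnitude [[|]] makes this behave correctly for
negative values too:

\begin{verbatim}
   pushup 2 _2
2.2 _1.8
   pushdown 2 _2
1.8 _2.2
\end{verbatim}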

<<gendat>>=
gendat=: 4 : 0
X=: ? y $ 0
Y=: x noise f X
minmaxX=: (<./ , >./) X
minmaxf=: (([: pushdown <./) , ([: pushup >./)) f steps 0 1 100
XT=: ? (>. 0.1 * y) $ 0
YT=: f XT
0
)

@
[[plotdat 0]] plots the dataset.

<<plotdat>>=
plotdatnoshow=: 3 : 0
<<initplot>>
pd X;Y
<<plotf>>
)
plotdat=: 3 : 0
plotdatnoshow 0
pd 'show'
)

<<initplot>>=
pd 'reset'
pd 'color green'
pd 'type marker'
pd 'markersize 1'
pd 'markers circle'

<<plotf>>=
pd 'color red'
pd 'type line'
pd 'pensize 1'
pd (;f) steps 0 1 100

@
\noindent\begin{minipage}{0.5\textwidth}
\includegraphics[width=\linewidth]{plotdat}
\end{minipage}%
\hfill%
\begin{minipage}{0.4\textwidth}
\begin{verbatim}
0.5 gendat 10
plotdat 0
\end{verbatim}
\end{minipage}
\vskip.5\baselineskip

[[polyreg 0]] solves the linear system \eqref{eqn:vandermonde}, stores the
coefficients of the polynomial in the variable~[[c]], and computes [[YThat]],
the predictions on the test dataset.

<<polyreg>>=
polyreg=: 3 : 0
c=: Y %. X ^/ i.#X
YThat=: c&p. XT
plotpoly 0
)

<<utils>>=
NB. locate the elements with values between {.x and {:x
sel=: (] >: {.@[) *. (] <: {:@[)

<<plotpoly>>=
plotpoly=: 3 : 0
plotdatnoshow 0
pd 'color blue'
xs=: (] #~ minmaxX"_ sel ]) /:~ X,steps 0 1 100
pval=: c&p. xs
crop=: minmaxf sel pval
pd (crop # xs);(crop # pval)
pd 'show'
)

@
\noindent\begin{minipage}{0.5\textwidth}
\includegraphics[width=\linewidth]{polyreg}
\end{minipage}%
\hfill%
\begin{minipage}{0.4\textwidth}
\begin{verbatim}
polyreg 0
\end{verbatim}
\end{minipage}
\vskip.5\baselineskip

[[test 0]] returns the root mean square error (RMSE) on the test set, and a
plot of the predictions.

<<utils>>=
mean=: +/ % #
rmse=: [: %: [: mean ([: *: -)

<<test>>=
test=: 3 : 0
plottest 0
YT rmse YThat
)

<<plottest>>=
plottest=: 3 : 0
<<initplot>>
pd XT;YT
pd 'color magenta'
pd XT;YThat
<<plotf>>
pd 'show'
)

@
\subsection{Generalization to a function space}
Given a basis for a function space, we can try to express [[f]] as a
combination of basis functions.
\[
f(\vec{x}) = a_1 f_1(\vec{x}) + a_2 f_2(\vec{x}) + \dots + a_n f_n(\vec{x})
\]

Given a dataset of $n$ pairs $\bigl(\vec{x}^{(k)},y^{(k)}\bigr)$, the
coefficients $a_i$ are found by solving a linear system.

\[
\left( \begin{array}{cccc}
f_1(\vec{x}^{(1)}) & f_2(\vec{x}^{(1)}) & \dots & f_n(\vec{x}^{(1)}) \\
f_1(\vec{x}^{(2)}) & f_2(\vec{x}^{(2)}) & \dots & f_n(\vec{x}^{(2)}) \\
\dots & \dots & \dots & \dots \\
f_1(\vec{x}^{(n)}) & f_2(\vec{x}^{(n)}) & \dots & f_n(\vec{x}^{(n)})
\end{array} \right)
\left( \begin{array}{c}
a_1 \\ a_2 \\ \dots \\ a_{n}
\end{array} \right)
=
\left( \begin{array}{c}
y^{(1)} \\ y^{(2)} \\ \dots \\ y^{(n)}
\end{array} \right)
\]

Let us denote this linear system by $\vec{A}\vec{x}=\vec{b}$.

\subsection{Least squares}
The linear system $\vec{A}\vec{x}=\vec{b}$
(with $\vec{A} \in \mathbb{R}^{m \times n}$) doesn't necessarily have a
solution when there are more examples than basis functions (i.e., $m>n$).
Thus, we look for an approximate solution $\vec{A}\vec{x}\approx\vec{b}$
that minimizes the sum of the squared errors, $\norm{\vec{A}\vec{x}-\vec{b}}^2_2$.

\begin{align*}
& \norm{\vec{A}\vec{x}-\vec{b}}^2_2 \\
= \{ & \norm{\vec{x}}_2 = \sqrt{\vec{x}\cdot\vec{x}} \} \\
& \left(\vec{A}\vec{x}-\vec{b}\right) \cdot \left(\vec{A}\vec{x}-\vec{b}\right) \\
= \{ & \text{Euclidean scalar product} \} \\
& \left(\vec{A}\vec{x}-\vec{b}\right)^T \left(\vec{A}\vec{x}-\vec{b}\right) \\
= \{ & \text{property of transposition} \} \\
& \left(\vec{x}^T\vec{A}^T - \vec{b}^T \right) \left(\vec{A}\vec{x}-\vec{b}\right) \\
= \{ & \text{multiplication} \} \\
& \vec{x}^T\vec{A}^T\vec{A}\vec{x} - \vec{x}^T\vec{A}^T\vec{b} - \vec{b}^T\vec{A}\vec{x} + \vec{b}^T\vec{b} \\
= \{ & \text{since each element of the sum is a scalar, } \vec{b}^T\vec{A}\vec{x} = \left(\vec{b}^T\vec{A}\vec{x}\right)^T = \vec{x}^T\vec{A}^T\vec{b} \} \\
& \vec{x}^T\vec{A}^T\vec{A}\vec{x} - 2\vec{x}^T\vec{A}^T\vec{b} + \vec{b}^T\vec{b}
\end{align*}

This quadratic expression defines a convex surface. Its minimum is found by
setting its gradient with respect to $\vec{x}$ to zero.

\begin{align*}
& \vec{0} = 2\vec{A}^T\vec{A}\vec{x} - 2\vec{A}^T\vec{b} \\
= \{ & \text{rearranging} \} \\
& \vec{A}^T\vec{A}\vec{x} = \vec{A}^T\vec{b}
\end{align*}

Thus, when $m>n$, we solve $\vec{A}\vec{x}\approx\vec{b}$ by solving
$\vec{A}^T\vec{A}\vec{x} = \vec{A}^T\vec{b}$.
$\vec{A}^T\vec{A}$ is called the Gram matrix.
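
As a quick check: J's dyadic [[%.]] already computes a least-squares solution
when the system is overdetermined, so it should agree, up to rounding, with
solving the normal equations explicitly. A sketch on a hypothetical
$3 \times 2$ system with made-up [[A]] and [[b]]:

\begin{verbatim}
   A=: 3 2 $ 1 1 1 2 1 3
   b=: 1 2 2
   b %. A                               NB. least squares, built in
0.666667 0.5
   ((|:A) +/ . * b) %. (|:A) +/ . * A   NB. via the normal equations
0.666667 0.5
\end{verbatim}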

[[gram y]] computes the Gram matrix [[S]] for a polynomial basis of degree [[y-1]].

<<gram>>=
gram=: 3 : 0
A=: X ^/ i.y
S=: (mp~ |:) A
)

<<utils>>=
mp=: +/ . * NB. matrix product

@
[[leastsq y]] solves the overdetermined linear system by computing the Gram
matrix for a polynomial basis of degree [[y-1]].

<<gram>>=
leastsq=: 3 : 0
gram y
c=: ((|:A) mp Y) %. S
YThat=: c&p. XT
plotpoly 0
)

@
\noindent\begin{minipage}{0.5\textwidth}
\includegraphics[width=\linewidth]{leastsq5}
\end{minipage}%
\hfill%
\begin{minipage}{0.4\textwidth}
\begin{verbatim}
0.5 gendat 100
leastsq 5
\end{verbatim}
\end{minipage}

\noindent\begin{minipage}{0.5\textwidth}
\includegraphics[width=\linewidth]{leastsq8}
\end{minipage}%
\hfill%
\begin{minipage}{0.4\textwidth}
\begin{verbatim}
leastsq 8
\end{verbatim}
\end{minipage}
\vskip.5\baselineskip

\subsection{Tikhonov regularization}
With fewer examples than basis functions (i.e., $m<n$), the system is
underdetermined and the Gram matrix can be singular or ill-conditioned.
Tikhonov (ridge) regularization adds a penalty $\lambda\norm{\vec{x}}^2_2$ to
the least squares objective, which leads to the regularized normal equations:
\[
\left(\vec{A}^T\vec{A} + \lambda\vec{I}\right)\vec{x} = \vec{A}^T\vec{b}
\]

[[x ridge y]] solves this system for a polynomial basis of degree [[y-1]],
with the regularization coefficient [[x]] added to the diagonal of the Gram
matrix.

<<ridge>>=
ridge=: 4 : 0
gram y
c=: ((|:A) mp Y) %. x addDiag S
YThat=: c&p. XT
plotpoly 0
)

<<utils>>=
diag=: (<0 1)&|: : (([:(>:*i.)[:#])})
addDiag=: ([+diag@]) diag ] NB. add x to the diagonal of y

@
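A quick check of [[addDiag]] (a sketch, assuming the definitions above):
adding $1$ to the diagonal of a zero matrix should yield the identity matrix.

\begin{verbatim}
   1 addDiag 3 3 $ 0   NB. expected: the 3-by-3 identity matrix
\end{verbatim}
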
\noindent\begin{minipage}{0.5\textwidth}
\includegraphics[width=\linewidth]{ridge4}
\end{minipage}%
\hfill%
\begin{minipage}{0.4\textwidth}
\begin{verbatim}
1E_4 ridge 4
\end{verbatim}
\end{minipage}
\vskip.5\baselineskip

\noindent\begin{minipage}{0.5\textwidth}
\includegraphics[width=\linewidth]{ridge5}
\end{minipage}%
\hfill%
\begin{minipage}{0.4\textwidth}
\begin{verbatim}
1E_4 ridge 5
\end{verbatim}
\end{minipage}
\vskip.5\baselineskip

\noindent\begin{minipage}{0.5\textwidth}
\includegraphics[width=\linewidth]{ridge8}
\end{minipage}%
\hfill%
\begin{minipage}{0.4\textwidth}
\begin{verbatim}
1E_4 ridge 8
\end{verbatim}
\end{minipage}
\vskip.5\baselineskip

\subsection{Extreme Learning Machine}
The following parameterized form of $f$ corresponds to a neural network with a
single hidden layer.
\[
f(\vec{x}) = c_1 g(\vec{w_1}\cdot\vec{x}+b_1) + c_2 g(\vec{w_2}\cdot\vec{x}+b_2)
+ \dots + c_M g(\vec{w_M}\cdot\vec{x}+b_M)
\]
$g$ is a non-linear activation function. We use the rectified linear unit
(ReLU): $g(y)=\max(0,y)$.
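
In J, the ReLU is simply [[0&>.]], the larger of zero and the argument:

\begin{verbatim}
   0 >. _2 _1 0 1 2
0 0 0 1 2
\end{verbatim}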

If the vectors $\vec{w_1}\dots\vec{w_M}$ and the scalars $b_1\dots b_M$ are
initialized randomly and never modified (i.e., if they are not parameters), we
can solve a linear system $\vec{H}\vec{c}=\vec{y}$ of unknown $\vec{c}$.
\[
\vec{H}:
\left( \begin{array}{ccc}
g(\vec{w_1}\cdot\vec{x_1}+b_1) & \dots & g(\vec{w_M}\cdot\vec{x_1}+b_M) \\
\dots & \dots & \dots \\
g(\vec{w_1}\cdot\vec{x_N}+b_1) & \dots & g(\vec{w_M}\cdot\vec{x_N}+b_M)
\end{array} \right)
\]
\begin{align*}
\vec{c}^T &: \left(c_1 \dots c_M\right) \\
\vec{y}^T &: \left(y_1 \dots y_N\right)
\end{align*}

This approach is named \emph{Extreme Learning Machine}%
\footnote{\url{https://scholar.google.fr/scholar?q=extreme+learning+machine}}.

[[initelm 100]] randomly initializes the matrix [[H]] with $100$ neurons on
the hidden layer (i.e., $M=100$) and computes its Gram form [[S]].

<<elm>>=
initelm=: 3 : 0
W=: _1 + 2 * ? (y,1) $ 0 NB. input weights
B=: ? y $ 0 NB. bias
H=: mkH ,. X
0 [ S=: (mp~ |:) H
)
mkH=: 3 : '0&>. B +"1 y mp"1/ W'

@
[[elm 1E_4]] solves the extreme learning machine linear system with a Tikhonov
regularization coefficient of $10^{-4}$.

<<elm>>=
elm=: 3 : 0
c=: ((|:H) mp Y) %. y addDiag S
YThat=: (mkH ,. XT) mp c
plotelm 0
)

<<plotelm>>=
plotelm=: 3 : 0
plotdatnoshow 0
pd 'type line'
pd 'color blue'
xs=: (] #~ minmaxX"_ sel ]) steps (<.<./X),(>.>./X),100
pd xs;(mkH ,. xs) mp c
pd 'show'
)

@
\noindent\begin{minipage}{0.5\textwidth}
\includegraphics[width=\linewidth]{elm_1E_3}
\end{minipage}%
\hfill%
\begin{minipage}{0.4\textwidth}
\begin{verbatim}
initelm 100
elm 1E_3
\end{verbatim}
\end{minipage}
\vskip.5\baselineskip

\noindent\begin{minipage}{0.5\textwidth}
\includegraphics[width=\linewidth]{elmtest}
\end{minipage}%
\hfill%
\begin{minipage}{0.4\textwidth}
\begin{verbatim}
test 0
\end{verbatim}
\end{minipage}
\vskip.5\baselineskip

<<require>>=
require'trig'
require'plot'
require'numeric'

<<jelm.ijs>>=
<<require>>
<<utils>>
<<dataset>>
<<plotdat>>
<<plotpoly>>
<<polyreg>>
<<gram>>
<<ridge>>
<<plotelm>>
<<elm>>
<<plottest>>
<<test>>

@
\end{document}
--------------------------------------------------------------------------------

/jelm.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peportier/jelm/5f0447e2a9b5abb53cf423f830f43976f18147c6/jelm.pdf
--------------------------------------------------------------------------------