├── .clang-format ├── .gitignore ├── CMakeLists.txt ├── README.md ├── data ├── test │ ├── t10k-images-idx3-ubyte │ └── t10k-labels-idx1-ubyte └── train │ ├── train-images-idx3-ubyte │ └── train-labels-idx1-ubyte ├── doc ├── 219994768ec294ab99637a7747627aeebd998d41.svg ├── DOC.epub ├── DOC.html ├── DOC.md ├── DOC.pdf ├── DOC.tex ├── Makefile ├── cb34710dce878a77b3fd5f3e7e4746403aaaefcc.svg ├── plots │ ├── -1637788021081228918.png │ ├── -1637788021081228918.txt │ ├── -6767785830879840565.png │ ├── -6767785830879840565.txt │ ├── 6094492350593652429.png │ └── 6094492350593652429.txt └── tikz.lua └── src ├── CCELossNode.cpp ├── CCELossNode.hpp ├── CMakeLists.txt ├── Dual.hpp ├── FFNode.cpp ├── FFNode.hpp ├── GDOptimizer.cpp ├── GDOptimizer.hpp ├── MNIST.cpp ├── MNIST.hpp ├── Model.cpp ├── Model.hpp └── main.cpp /.clang-format: -------------------------------------------------------------------------------- 1 | AccessModifierOffset: -4 2 | AlignAfterOpenBracket: true 3 | AlignConsecutiveAssignments: true 4 | AlignConsecutiveDeclarations: false 5 | AlignEscapedNewlinesLeft: true 6 | AlignTrailingComments: true 7 | AllowAllParametersOfDeclarationOnNextLine: false 8 | AllowShortBlocksOnASingleLine: false 9 | AllowShortCaseLabelsOnASingleLine: false 10 | AllowShortFunctionsOnASingleLine: false 11 | AllowShortIfStatementsOnASingleLine: false 12 | AllowShortLoopsOnASingleLine: false 13 | AlwaysBreakAfterReturnType: None 14 | AlwaysBreakBeforeMultilineStrings: true 15 | AlwaysBreakTemplateDeclarations: true 16 | BinPackArguments: false 17 | BinPackParameters: false 18 | BreakBeforeBraces: Custom 19 | BraceWrapping: 20 | AfterClass: true 21 | AfterControlStatement: true 22 | AfterEnum: true 23 | AfterFunction: true 24 | AfterNamespace: true 25 | AfterObjCDeclaration: true 26 | AfterStruct: true 27 | AfterUnion: true 28 | AfterExternBlock: true 29 | BeforeCatch: true 30 | BeforeElse: true 31 | IndentBraces: false 32 | SplitEmptyFunction: false 33 | SplitEmptyRecord: false 34 | SplitEmptyNamespace: false 35 | BreakBeforeBinaryOperators: All 36 | BreakBeforeTernaryOperators: true 37 | BreakConstructorInitializers: BeforeComma 38 | BreakStringLiterals: true 39 | ColumnLimit: 80 40 | CommentPragmas: '' 41 | CompactNamespaces: false 42 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 43 | ConstructorInitializerIndentWidth: 4 44 | ContinuationIndentWidth: 4 45 | Cpp11BracedListStyle: true 46 | DerivePointerBinding: false 47 | FixNamespaceComments: true 48 | IndentCaseLabels: false 49 | IndentPPDirectives: AfterHash 50 | IndentWidth: 4 51 | IndentWrappedFunctionNames: false 52 | KeepEmptyLinesAtTheStartOfBlocks: false 53 | Language: Cpp 54 | MaxEmptyLinesToKeep: 1 55 | NamespaceIndentation: Inner 56 | PenaltyBreakBeforeFirstCallParameter: 0 57 | PenaltyBreakComment: 0 58 | PenaltyBreakFirstLessLess: 0 59 | PenaltyBreakString: 1 60 | PenaltyExcessCharacter: 10 61 | PenaltyReturnTypeOnItsOwnLine: 20 62 | PointerAlignment: Left 63 | SortIncludes: true 64 | SortUsingDeclarations: true 65 | SpaceAfterTemplateKeyword: true 66 | SpaceBeforeAssignmentOperators: true 67 | SpaceBeforeParens: ControlStatements 68 | SpaceInEmptyParentheses: false 69 | SpacesBeforeTrailingComments: 1 70 | SpacesInAngles: false 71 | SpacesInCStyleCastParentheses: false 72 | SpacesInContainerLiterals: false 73 | SpacesInParentheses: false 74 | SpacesInSquareBrackets: false 75 | Standard: C++11 76 | TabWidth: 4 77 | UseTab: Never 78 | -------------------------------------------------------------------------------- /.gitignore: 
--------------------------------------------------------------------------------
1 | # System
2 | .DS_Store
3 | 
4 | # IDE/Editor
5 | .ccls-cache
6 | .vscode
7 | 
8 | # Build
9 | build
10 | build-clang
11 | build-gcc
12 | build-release
13 | .cache
14 | 
15 | # Statically generated documentation
16 | site/
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.16)
2 | 
3 | project(nn_in_a_weekend LANGUAGES CXX)
4 | 
5 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
6 | 
7 | add_subdirectory(src)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # C++ Neural Network in a Weekend
2 | 
3 | This repository is the companion code to the article "Neural Network in a Weekend." Readers are welcome to clone the repository and use the code herein as a reference if following along with the article. Pull requests and issues filed for errors and bugs in the code and/or documentation are welcome and appreciated. However, pull requests that introduce new features are unlikely to be considered, as the ultimate goal of this code is to be tractable for a newer practitioner getting started with deep learning architectures.
4 | 
5 | [Article pdf link](https://github.com/jeremyong/cpp_nn_in_a_weekend/raw/master/doc/DOC.pdf)
6 | 
7 | ## Compilation and Usage
8 | 
9 |     mkdir build
10 |     cd build
11 |     # substitute Ninja for your preferred generator
12 |     cmake .. -G Ninja
13 |     ninja
14 |     # trains the network and writes the learned parameters to disk
15 |     ./src/nn train ../data/train
16 |     # evaluate the model loss and accuracy based on the trained parameters
17 |     ./src/nn evaluate ../data/test ./ff.params
18 | 
19 | Note that the actual location of the `nn` executable may depend on your build system and build type. For performance reasons, it is recommended to run the training itself with an optimized build, reverting to a development/debug build only when debugging is needed.
20 | 
21 | ## Conventions
22 | 
23 | 1. Member variables have a single underscore suffix (e.g. `member_variable_`)
24 | 2. The `F.T.R.` acronym stands for "For the reader" and precedes suggestions for experimentation, improvements, or alternative implementations
25 | 3. Throughout, you may see the type aliases `num_t` and `rne_t`. These aliases refer to `float` and `std::mt19937` respectively and are defined in `Model.hpp` to make it easy to experiment with alternative precisions and random number engines. The reader may wish to make these parameters changeable by other means.
26 | 
27 | ## General Code Structure
28 | 
29 | The neural network is modeled as a computational graph. The graph itself is the `Model` defined in `Model.hpp`. Nodes in the computational graph derive from the `Node` base class and must implement several methods describing how data flows through the node (forwards and backwards).
30 | 
31 | The fully-connected feedforward node in this example is implemented as `FFNode` in `FFNode.hpp`. The cross-entropy loss node is implemented as `CCELossNode` in `CCELossNode.hpp`. Together, these two nodes are all that is needed to train our example on the MNIST dataset.
32 | 
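To make the graph structure concrete, here is a minimal sketch of how the nodes are wired together. It mirrors `create_model` in `src/main.cpp` (the layer sizes 784 → 32 → 10 and the `batch_size` constant are the values used there), so treat it as an illustrative outline rather than a separate API.

    // Sketch: assembling the computational graph (see src/main.cpp for the full
    // version). `images` and `labels` are open std::ifstreams for the MNIST files.
    Model model{"ff"};
    MNIST& mnist      = model.add_node<MNIST>(images, labels);
    FFNode& hidden    = model.add_node<FFNode>("hidden", Activation::ReLU, 32, 784);
    FFNode& output    = model.add_node<FFNode>("output", Activation::Softmax, 10, 32);
    CCELossNode& loss = model.add_node<CCELossNode>("loss", 10, batch_size);
    loss.set_target(mnist.label());
    // Edges are created destination-first: create_edge(dst, src)
    model.create_edge(hidden, mnist);
    model.create_edge(output, hidden);
    model.create_edge(loss, output);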
33 | ## Data
34 | 
35 | For your convenience, the MNIST data used to train and test the network is provided uncompressed in the `data/` subdirectory. The data is structured like so:
36 | 
37 | ### Images
38 | 
39 | Image data can be parsed using code provided in the `MNIST.hpp` header, but the data is described here as well. Multi-byte integers are stored with the MSB first, meaning that on a little-endian architecture, the bytes must be flipped. Image pixel data is stored in row-major order and packed contiguously one after another.
40 | 
41 |     Bytes
42 |     [00-03] 0x00000803 (Magic Number: 2051)
43 |     [04-07] image count
44 |     [08-11] rows
45 |     [12-15] columns
46 |     [16]    pixel[0, 0]
47 |     [17]    pixel[0, 1]
48 |     ...
49 | 
50 | ### Labels
51 | 
52 | Label data is parsed according to the following byte layout:
53 | 
54 |     Bytes
55 |     [00-03] 0x00000801 (Magic Number: 2049)
56 |     [04-07] label count
57 |     [08]    label 1
58 |     [09]    label 2
59 |     ...
60 | 
61 | The parser provided by the `MNIST` input node validates the magic numbers to ensure the machine endianness is as expected, and also validates that the image data and label data sizes match.
62 | 
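For readers parsing the files by hand, the following is a minimal sketch of the big-endian read described above. It mirrors the `read_be` helper in `src/MNIST.cpp`; error handling is omitted.

    #include <cstdint>
    #include <fstream>
    #include <utility>

    // Read a 4-byte big-endian integer and byte-swap it into host order
    // (assuming a little-endian host, as noted above).
    void read_be(std::ifstream& in, uint32_t* out)
    {
        char* buf = reinterpret_cast<char*>(out);
        in.read(buf, 4);
        std::swap(buf[0], buf[3]);
        std::swap(buf[1], buf[2]);
    }

    // Reading the image-file header described above:
    //   uint32_t magic, count, rows, columns;
    //   read_be(images, &magic);   // expect 0x00000803 (2051)
    //   read_be(images, &count);
    //   read_be(images, &rows);    // 28
    //   read_be(images, &columns); // 28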
--------------------------------------------------------------------------------
/data/test/t10k-images-idx3-ubyte:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/data/test/t10k-images-idx3-ubyte
--------------------------------------------------------------------------------
/data/test/t10k-labels-idx1-ubyte:
--------------------------------------------------------------------------------
(binary file contents omitted)
--------------------------------------------------------------------------------
/data/train/train-images-idx3-ubyte:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/data/train/train-images-idx3-ubyte
--------------------------------------------------------------------------------
/data/train/train-labels-idx1-ubyte:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/data/train/train-labels-idx1-ubyte
--------------------------------------------------------------------------------
/doc/219994768ec294ab99637a7747627aeebd998d41.svg:
--------------------------------------------------------------------------------
(SVG image content omitted)
--------------------------------------------------------------------------------
/doc/DOC.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/doc/DOC.epub
--------------------------------------------------------------------------------
/doc/DOC.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/doc/DOC.pdf
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | all: pdf html
2 | 
3 | clean:
4 | 	rm -rf *.svg plots DOC.pdf DOC.html
5 | 
6 | tex:
7 | 	pandoc -F pandoc-plot -s DOC.md -o DOC.tex
8 | 
9 | pdf:
10 | 	pandoc -F pandoc-plot -s --katex DOC.md -o DOC.pdf
11 | 
12 | html:
13 | 	pandoc -L tikz.lua -F pandoc-plot -s --katex DOC.md -o DOC.html
14 | 
15 | epub:
16 | 	pandoc -t epub3 --webtex -L tikz.lua -F pandoc-plot -s DOC.md -o DOC.epub
17 | 
--------------------------------------------------------------------------------
/doc/cb34710dce878a77b3fd5f3e7e4746403aaaefcc.svg:
--------------------------------------------------------------------------------
(SVG image content omitted)
--------------------------------------------------------------------------------
/doc/plots/-1637788021081228918.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/doc/plots/-1637788021081228918.png
--------------------------------------------------------------------------------
/doc/plots/-1637788021081228918.txt:
--------------------------------------------------------------------------------
1 | # Generated by pandoc-plot 0.8.0.0
2 | 
3 | import matplotlib.pyplot as plt
4 | import array as arr
5 | import math as math
6 | 
7 | f = arr.array('f')
8 | f.append(0)
9 | f.append(0)
10 | f.append(1)
11 | x = arr.array('f')
12 | x.append(-1)
13 | x.append(-0)
14 | x.append(1)
15 | 
16 | plt.figure()
17 | plt.plot(x, f)
18 | plt.xlabel('$x$')
19 | plt.ylabel('$\max(0, x)$')
20 | plt.title('Rectifier function')
--------------------------------------------------------------------------------
/doc/plots/-6767785830879840565.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/doc/plots/-6767785830879840565.png
--------------------------------------------------------------------------------
/doc/plots/-6767785830879840565.txt:
--------------------------------------------------------------------------------
1 | # Generated by pandoc-plot 0.8.0.0
2 | 
3 | import matplotlib.pyplot as plt
4 | import array as arr
5 | import math as math
6 | 
7 | s = arr.array('f')
8 | h = arr.array('f')
9 | 
10 | last = 0
11 | n = 30
12 | for i in range(0, n):
13 |     last += 1 / (n + 1)
14 |     s.append(last)
15 |     h.append(-(1 - last) * math.log(last) - last * math.log(1 - last))
16 | 
17 | plt.figure()
18 | plt.plot(s, h)
19 | plt.xlabel('$S$')
20 | plt.ylabel('$-(1-S)\log S - S\log (1 - S)$')
21 | plt.title('Cross entropy with mismatched distribution')
--------------------------------------------------------------------------------
/doc/plots/6094492350593652429.png:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/doc/plots/6094492350593652429.png -------------------------------------------------------------------------------- /doc/plots/6094492350593652429.txt: -------------------------------------------------------------------------------- 1 | # Generated by pandoc-plot 0.8.0.0 2 | 3 | import matplotlib.pyplot as plt 4 | import array as arr 5 | import math as math 6 | 7 | s = arr.array('f') 8 | s.append(0) 9 | h = arr.array('f') 10 | h.append(0) 11 | 12 | last = 0 13 | n = 30 14 | for i in range(0, n): 15 | last += 1 / (n + 1) 16 | s.append(last) 17 | h.append(-last * math.log(last) - (1 - last) * math.log(1 - last)) 18 | 19 | s.append(1.0) 20 | h.append(0) 21 | 22 | plt.figure() 23 | plt.plot(s, h) 24 | plt.xlabel('$S$') 25 | plt.ylabel('$H(S) = -S\log S - (1 - S)\log (1 - S)$') 26 | plt.title('Binary Entropy') -------------------------------------------------------------------------------- /doc/tikz.lua: -------------------------------------------------------------------------------- 1 | local system = require 'pandoc.system' 2 | 3 | local tikz_doc_template = [[ 4 | \documentclass{standalone} 5 | \usepackage{xcolor} 6 | \usepackage{tikz} 7 | \usetikzlibrary{positioning,calc,arrows} 8 | \renewenvironment{center} {} {} 9 | \begin{document} 10 | \nopagecolor 11 | %s 12 | \end{document} 13 | ]] 14 | 15 | local function tikz2image(src, filetype, outfile) 16 | system.with_temporary_directory('tikz2image', function (tmpdir) 17 | system.with_working_directory(tmpdir, function() 18 | local f = io.open('tikz.tex', 'w') 19 | f:write(tikz_doc_template:format(src)) 20 | f:close() 21 | os.execute('pdflatex tikz.tex') 22 | if filetype == 'pdf' then 23 | os.rename('tikz.pdf', outfile) 24 | else 25 | os.execute('pdf2svg tikz.pdf ' .. outfile) 26 | end 27 | end) 28 | end) 29 | end 30 | 31 | extension_for = { 32 | html = 'svg', 33 | html4 = 'svg', 34 | html5 = 'svg', 35 | latex = 'pdf', 36 | beamer = 'pdf' } 37 | 38 | local function file_exists(name) 39 | local f = io.open(name, 'r') 40 | if f ~= nil then 41 | io.close(f) 42 | return true 43 | else 44 | return false 45 | end 46 | end 47 | 48 | local function starts_with(start, str) 49 | return str:sub(1, #start) == start 50 | end 51 | 52 | 53 | function RawBlock(el) 54 | if starts_with('\\begin{center}', el.text) then 55 | local filetype = extension_for[FORMAT] or 'svg' 56 | local fname = system.get_working_directory() .. '/' .. 57 | pandoc.sha1(el.text) .. '.' .. filetype 58 | if not file_exists(fname) then 59 | tikz2image(el.text, filetype, fname) 60 | end 61 | return pandoc.Para({pandoc.Image({}, fname)}) 62 | else 63 | return el 64 | end 65 | end 66 | -------------------------------------------------------------------------------- /src/CCELossNode.cpp: -------------------------------------------------------------------------------- 1 | #include "CCELossNode.hpp" 2 | #include 3 | 4 | CCELossNode::CCELossNode(Model& model, 5 | std::string name, 6 | uint16_t input_size, 7 | size_t batch_size) 8 | : Node{model, std::move(name)} 9 | , input_size_{input_size} 10 | , inv_batch_size_{num_t{1.0} / static_cast(batch_size)} 11 | { 12 | // When we deliver a gradient back, we deliver just the loss gradient with 13 | // respect to any input and the index that was "hot" in the second argument. 
14 |     gradients_.resize(input_size_);
15 | }
16 | 
17 | void CCELossNode::forward(num_t* data)
18 | {
19 |     // The cross-entropy categorical loss is defined as -\sum_i(q_i * log(p_i))
20 |     // where p_i is the predicted probability and q_i is the expected probability
21 |     //
22 |     // In information theory, by convention, lim_{x approaches 0}(x log(x)) = 0
23 | 
24 |     num_t max{0.0};
25 |     size_t max_index = 0;
26 | 
27 |     loss_ = num_t{0.0};
28 |     for (size_t i = 0; i != input_size_; ++i)
29 |     {
30 |         if (data[i] > max)
31 |         {
32 |             max_index = i;
33 |             max       = data[i];
34 |         }
35 | 
36 |         // Because the target vector is one-hot encoded, most of these terms
37 |         // will be zero, but we leave the full calculation here to be explicit
38 |         // and in the event we want to compute losses against probability
39 |         // distributions that aren't one-hot. In practice, a faster code path
40 |         // should be employed if the targets are known to be one-hot
41 |         // distributions.
42 |         loss_ -= target_[i]
43 |                  * std::log(
44 |                      // Prevent undefined results when taking the log of 0
45 |                      std::max(data[i], std::numeric_limits<num_t>::epsilon()));
46 | 
47 |         if (target_[i] != num_t{0.0})
48 |         {
49 |             active_ = i;
50 |         }
51 | 
52 |         // NOTE: The astute reader may notice that the gradients associated with
53 |         // many of the loss node's input signals will be zero because the
54 |         // cross-entropy is performed with respect to a one-hot vector.
55 |         // Fortunately, because the layer preceding the output layer is a
56 |         // softmax layer, the gradient from the single term contributing in the
57 |         // above expression has a dependency on *every* softmax output unit (all
58 |         // outputs show up in the summation in the softmax denominator).
59 |     }
60 | 
61 |     if (max_index == active_)
62 |     {
63 |         ++correct_;
64 |     }
65 |     else
66 |     {
67 |         ++incorrect_;
68 |     }
69 | 
70 |     cumulative_loss_ += loss_;
71 | 
72 |     // Store the data pointer to compute gradients later
73 |     last_input_ = data;
74 | }
75 | 
76 | void CCELossNode::reverse(num_t* data)
77 | {
78 |     // dJ/dq_i = d(-\sum_i(p_i log(q_i)))/dq_i = -1 / q_j where j is the index
79 |     // of the correct classification (loss gradient for a single sample).
80 |     //
81 |     // Note the normalization factor where we multiply by the inverse batch
82 |     // size. This ensures that losses computed by the network are similar in
83 |     // scale irrespective of batch size.
84 | 85 | for (size_t i = 0; i != input_size_; ++i) 86 | { 87 | gradients_[i] = -inv_batch_size_ * target_[i] / last_input_[i]; 88 | } 89 | 90 | for (Node* node : antecedents_) 91 | { 92 | node->reverse(gradients_.data()); 93 | } 94 | } 95 | 96 | void CCELossNode::print() const 97 | { 98 | std::printf("Avg Loss: %f\t%f%% correct\n", avg_loss(), accuracy() * 100.0); 99 | } 100 | 101 | num_t CCELossNode::accuracy() const 102 | { 103 | return static_cast(correct_) 104 | / static_cast(correct_ + incorrect_); 105 | } 106 | num_t CCELossNode::avg_loss() const 107 | { 108 | return cumulative_loss_ / static_cast(correct_ + incorrect_); 109 | } 110 | 111 | void CCELossNode::reset_score() 112 | { 113 | cumulative_loss_ = num_t{0.0}; 114 | correct_ = 0; 115 | incorrect_ = 0; 116 | } 117 | -------------------------------------------------------------------------------- /src/CCELossNode.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Model.hpp" 4 | 5 | // Categorical Cross-Entropy Loss Node 6 | // Assumes input data is "one-hot encoded," with size equal to the number of 7 | // possible classifications, where the "answer" has a single "1" (aka hot value) 8 | // in one of the classification positions and zero everywhere else. 9 | 10 | class CCELossNode : public Node 11 | { 12 | public: 13 | CCELossNode(Model& model, 14 | std::string name, 15 | uint16_t input_size, 16 | size_t batch_size); 17 | 18 | // No initialization is needed for this node 19 | void init(rne_t&) override 20 | {} 21 | 22 | void forward(num_t* inputs) override; 23 | // As a loss node, the argument to this method is ignored (the gradient of 24 | // the loss with respect to itself is unity) 25 | void reverse(num_t* gradients = nullptr) override; 26 | 27 | void print() const override; 28 | 29 | void set_target(num_t const* target) 30 | { 31 | target_ = target; 32 | } 33 | 34 | num_t accuracy() const; 35 | num_t avg_loss() const; 36 | void reset_score(); 37 | 38 | private: 39 | uint16_t input_size_; 40 | 41 | // We minimize the average loss, not the net loss so that the losses 42 | // produced do not scale with batch size (which allows us to keep training 43 | // parameters constant) 44 | num_t inv_batch_size_; 45 | num_t loss_; 46 | num_t const* target_; 47 | num_t* last_input_; 48 | // Stores the last active classification in the target one-hot encoding 49 | size_t active_; 50 | num_t cumulative_loss_{0.0}; 51 | // Store running counts of correct and incorrect predictions 52 | size_t correct_ = 0; 53 | size_t incorrect_ = 0; 54 | std::vector gradients_; 55 | }; 56 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable( 2 | nn 3 | main.cpp 4 | CCELossNode.cpp 5 | FFNode.cpp 6 | GDOptimizer.cpp 7 | MNIST.cpp 8 | Model.cpp 9 | ) 10 | 11 | target_compile_features(nn PUBLIC cxx_std_17) 12 | -------------------------------------------------------------------------------- /src/Dual.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | template 4 | struct Dual 5 | { 6 | T real_ = T{0.0}; 7 | T dual_ = T{1.0}; 8 | }; 9 | 10 | template 11 | [[nodiscard]] Dual operator+(Dual&& a, Dual&& b) noexcept 12 | { 13 | return {a.real_ + b.real_, a.dual_ + b.dual_}; 14 | } 15 | 16 | template 17 | [[nodiscard]] Dual operator-(Dual&& a, Dual&& b) noexcept 18 | { 19 | return {a.real_ - 
b.real_, a.dual_ - b.dual_}; 20 | } 21 | 22 | // (a + eb) * (c + ed) = ac + ebc + ead + e^2bd = ac + e(bc + ad) 23 | template 24 | [[nodiscard]] constexpr Dual operator*(Dual&& a, Dual&& b) noexcept 25 | { 26 | return { 27 | a.real_ * b.real_, 28 | a.real_ * b.dual_ + b.real_ * a.dual_, 29 | }; 30 | } 31 | -------------------------------------------------------------------------------- /src/FFNode.cpp: -------------------------------------------------------------------------------- 1 | #include "FFNode.hpp" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | FFNode::FFNode(Model& model, 9 | std::string name, 10 | Activation activation, 11 | uint16_t output_size, 12 | uint16_t input_size) 13 | : Node{model, std::move(name)} 14 | , activation_{activation} 15 | , output_size_{output_size} 16 | , input_size_{input_size} 17 | { 18 | std::printf("%s: %d -> %d\n", name_.c_str(), input_size_, output_size_); 19 | 20 | // The weight parameters of a FF-layer are an NxM matrix 21 | weights_.resize(output_size_ * input_size_); 22 | 23 | // Each node in this layer is assigned a bias (so that zero is not 24 | // necessarily mapped to zero) 25 | biases_.resize(output_size_); 26 | 27 | // The outputs of each neuron within the layer is an "activation" in 28 | // neuroscience parlance 29 | activations_.resize(output_size_); 30 | 31 | activation_gradients_.resize(output_size_); 32 | weight_gradients_.resize(output_size_ * input_size_); 33 | bias_gradients_.resize(output_size_); 34 | input_gradients_.resize(input_size_); 35 | } 36 | 37 | void FFNode::init(rne_t& rne) 38 | { 39 | num_t sigma; 40 | switch (activation_) 41 | { 42 | case Activation::ReLU: 43 | // Kaiming He, et. al. weight initialization for ReLU networks 44 | // https://arxiv.org/pdf/1502.01852.pdf 45 | // 46 | // Suggests using a normal distribution with variance := 2 / n_in 47 | sigma = std::sqrt(2.0 / static_cast(input_size_)); 48 | break; 49 | case Activation::Softmax: 50 | default: 51 | sigma = std::sqrt(1.0 / static_cast(input_size_)); 52 | break; 53 | } 54 | 55 | // NOTE: Unfortunately, the C++ standard does not guarantee that the results 56 | // obtained from a distribution function will be identical given the same 57 | // inputs across different compilers and platforms. A production ML 58 | // framework will likely implement its own distributions to provide 59 | // deterministic results. 60 | auto dist = std::normal_distribution{0.0, sigma}; 61 | 62 | for (num_t& w : weights_) 63 | { 64 | w = dist(rne); 65 | } 66 | 67 | // NOTE: Setting biases to zero is a common practice, as is initializing the 68 | // bias to a small value (e.g. on the order of 0.01). It is unclear if the 69 | // latter produces a consistent result over the former, but the thinking is 70 | // that a non-zero bias will ensure that the neuron always "fires" at the 71 | // beginning to produce a signal. 72 | // 73 | // Here, we initialize all biases to a small number, but the reader should 74 | // consider experimenting with other approaches. 
75 | for (num_t& b : biases_) 76 | { 77 | b = 0.01; 78 | } 79 | } 80 | 81 | void FFNode::forward(num_t* inputs) 82 | { 83 | // Remember the last input data for backpropagation later 84 | last_input_ = inputs; 85 | 86 | for (size_t i = 0; i != output_size_; ++i) 87 | { 88 | // For each output vector, compute the dot product of the input data 89 | // with the weight vector add the bias 90 | 91 | num_t z{0.0}; 92 | 93 | size_t offset = i * input_size_; 94 | 95 | for (size_t j = 0; j != input_size_; ++j) 96 | { 97 | z += weights_[offset + j] * inputs[j]; 98 | } 99 | // Add neuron bias 100 | z += biases_[i]; 101 | 102 | switch (activation_) 103 | { 104 | case Activation::ReLU: 105 | activations_[i] = std::max(z, num_t{0.0}); 106 | break; 107 | case Activation::Softmax: 108 | default: 109 | activations_[i] = std::exp(z); 110 | break; 111 | } 112 | } 113 | 114 | if (activation_ == Activation::Softmax) 115 | { 116 | // softmax(z)_i = exp(z_i) / \sum_j(exp(z_j)) 117 | num_t sum_exp_z{0.0}; 118 | for (size_t i = 0; i != output_size_; ++i) 119 | { 120 | // NOTE: with exploding gradients, it is quite easy for this 121 | // exponential function to overflow, which will result in NaNs 122 | // infecting the network. 123 | sum_exp_z += activations_[i]; 124 | } 125 | num_t inv_sum_exp_z = num_t{1.0} / sum_exp_z; 126 | for (size_t i = 0; i != output_size_; ++i) 127 | { 128 | activations_[i] *= inv_sum_exp_z; 129 | } 130 | } 131 | 132 | // Forward activation data to all subsequent nodes in the computational 133 | // graph 134 | for (Node* subsequent : subsequents_) 135 | { 136 | subsequent->forward(activations_.data()); 137 | } 138 | } 139 | 140 | void FFNode::reverse(num_t* gradients) 141 | { 142 | // We receive a vector of output_size_ gradients of the loss function with 143 | // respect to the activations of this node. 144 | 145 | // We need to compute the gradients of the loss function with respect to 146 | // each parameter in the node (all weights and biases). In addition, we need 147 | // to compute the gradients with respect to the inputs in order to propagate 148 | // the gradients further. 149 | 150 | // Notation: 151 | // 152 | // Subscripts on any of the following vector and matrix quantities are used 153 | // to specify a specific element of the vector or matrix. 154 | // 155 | // - I is the input vector 156 | // - W is the weight matrix 157 | // - B is the bias vector 158 | // - Z = W*I + B 159 | // - A is our activation function (ReLU or Softmax in this case) 160 | // - L is the total loss (cost) 161 | // 162 | // The gradient we receive from the subsequent is dJ/dg(Z) which we can use 163 | // to compute dJ/dW_{i, j}, dJ/dB_i, and dJ/dI_i 164 | 165 | // First, we compute dJ/dz as dJ/dg(z) * dg(z)/dz and store it in our 166 | // activations array 167 | for (size_t i = 0; i != output_size_; ++i) 168 | { 169 | // dg(z)/dz 170 | num_t activation_grad{0.0}; 171 | switch (activation_) 172 | { 173 | case Activation::ReLU: 174 | // For a ReLU function, the gradient is unity when the activation 175 | // exceeds 0.0, and 0.0 otherwise. Technically, the gradient is 176 | // undefined at 0, but in practice, defining the gradient at this 177 | // point to be 0 isn't an issue 178 | if (activations_[i] > num_t{0.0}) 179 | { 180 | activation_grad = num_t{1.0}; 181 | } 182 | else 183 | { 184 | activation_grad = num_t{0.0}; 185 | } 186 | // dJ/dz = dJ/dg(z) * dg(z)/dz 187 | activation_gradients_[i] = gradients[i] * activation_grad; 188 | break; 189 | case Activation::Softmax: 190 | default: 191 | // F.T.R. 
The implementation here correctly computes gradients for 192 | // the general softmax function accounting for all received 193 | // gradients. However, this step can be optimized significantly if 194 | // it is known that the softmax output is being compared to a 195 | // one-hot distribution. The softmax output of a given unit is 196 | // exp(z_i) / \sum_j exp(z_j). When the loss gradient with respect 197 | // to the softmax outputs is returned, a single i is selected from 198 | // among the softmax outputs in a 1-hot encoding, corresponding to 199 | // the correct classification for this training sample. Complete the 200 | // derivation for the gradient of the softmax assuming a one-hot 201 | // distribution and implement the optimized routine. 202 | 203 | for (size_t j = 0; j != output_size_; ++j) 204 | { 205 | if (i == j) 206 | { 207 | activation_grad += activations_[i] 208 | * (num_t{1.0} - activations_[i]) 209 | * gradients[j]; 210 | } 211 | else 212 | { 213 | activation_grad 214 | += -activations_[i] * activations_[j] * gradients[j]; 215 | } 216 | } 217 | 218 | activation_gradients_[i] = activation_grad; 219 | break; 220 | } 221 | } 222 | 223 | for (size_t i = 0; i != output_size_; ++i) 224 | { 225 | // Next, let's compute the partial dJ/db_i. If we hold all the weights 226 | // and inputs constant, it's clear that dz/db_i is just 1 (consider 227 | // differentiating the line mx + b with respect to b). Thus, dJ/db_i = 228 | // dJ/dg(z_i) * dg(z_i)/dz_i. 229 | bias_gradients_[i] += activation_gradients_[i]; 230 | } 231 | 232 | // CAREFUL! Unlike the other gradients, we reset input gradients to 0. These 233 | // values are used primarily as a subexpression in computing upstream 234 | // gradients and do not participate in the network optimization step (aka 235 | // Stochastic Gradient Descent) later. 236 | std::fill(input_gradients_.begin(), input_gradients_.end(), num_t{0.0}); 237 | 238 | // To compute dz/dI_i, recall that z_i = \sum_i W_i*I_i + B_i. That is, the 239 | // precursor to each activation is a dot-product between a weight vector an 240 | // the input plus a bias. Thus, dz/dI_i must be the sum of all weights that 241 | // were scaled by I_i during the forward pass. 242 | for (size_t i = 0; i != output_size_; ++i) 243 | { 244 | size_t offset = i * input_size_; 245 | for (size_t j = 0; j != input_size_; ++j) 246 | { 247 | input_gradients_[j] 248 | += weights_[offset + j] * activation_gradients_[i]; 249 | } 250 | } 251 | 252 | for (size_t i = 0; i != input_size_; ++i) 253 | { 254 | for (size_t j = 0; j != output_size_; ++j) 255 | { 256 | // Each individual weight shows up in the equation for z once and is 257 | // scaled by the corresponding input. Thus, dJ/dw_i = dJ/dg(z_i) * 258 | // dg(z_i)/dz_i * dz_i/d_w_ij where the last factor is equal to the 259 | // input scaled by w_ij. 260 | 261 | weight_gradients_[j * input_size_ + i] 262 | += last_input_[i] * activation_gradients_[j]; 263 | } 264 | } 265 | 266 | for (Node* node : antecedents_) 267 | { 268 | // Forward loss gradients with respect to the inputs to the previous 269 | // node. 270 | // 271 | // F.T.R. Technically, if the antecedent node has no learnable 272 | // parameters, there is no point forwarding gradients to that node. 273 | // Furthermore, if no antecedent nodes required any gradients, we could 274 | // have skipped computing the gradients for this node altogether. 
A 275 | // simple way to implement this is to add a `parameter_count` virtual 276 | // method on the Node interface leverage it to save some work whenever 277 | // possible here. 278 | node->reverse(input_gradients_.data()); 279 | } 280 | } 281 | 282 | // F.T.R. It is more efficient to store parameters contiguously so they can be 283 | // accessed without branching or arithmetic. 284 | num_t* FFNode::param(size_t index) 285 | { 286 | if (index < weights_.size()) 287 | { 288 | return &weights_[index]; 289 | } 290 | return &biases_[index - weights_.size()]; 291 | } 292 | 293 | num_t* FFNode::gradient(size_t index) 294 | { 295 | if (index < weights_.size()) 296 | { 297 | return &weight_gradients_[index]; 298 | } 299 | return &bias_gradients_[index - weights_.size()]; 300 | } 301 | 302 | void FFNode::print() const 303 | { 304 | std::printf("%s\n", name_.c_str()); 305 | 306 | // Consider the input samples as column vectors, and visualize the weights 307 | // as a matrix transforming vectors with input_size_ dimension to size_ 308 | // dimension 309 | std::printf("Weights (%d x %d)\n", output_size_, input_size_); 310 | for (size_t i = 0; i != output_size_; ++i) 311 | { 312 | size_t offset = i * input_size_; 313 | for (size_t j = 0; j != input_size_; ++j) 314 | { 315 | std::printf("\t[%zu]%f", offset + j, weights_[offset + j]); 316 | } 317 | std::printf("\n"); 318 | } 319 | std::printf("Biases (%d x 1)\n", output_size_); 320 | for (size_t i = 0; i != output_size_; ++i) 321 | { 322 | std::printf("\t%f\n", biases_[i]); 323 | } 324 | std::printf("\n"); 325 | } 326 | -------------------------------------------------------------------------------- /src/FFNode.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Model.hpp" 4 | 5 | #include 6 | #include 7 | 8 | // Fully-connected, feedforward Layer 9 | 10 | // A feedforward layer is parameterized by the number of neurons it posesses and 11 | // the number of neurons in the layer preceding it 12 | class FFNode : public Node 13 | { 14 | public: 15 | FFNode(Model& model, 16 | std::string name, 17 | Activation activation, 18 | uint16_t output_size, 19 | uint16_t input_size); 20 | 21 | // Initialize the parameters of the layer 22 | // F.T.R. 23 | // Experiment with alternative weight and bias initialization schemes: 24 | // 1. Try different distributions for the weight 25 | // 2. Try initializing all weights to zero (why is this suboptimal) 26 | // 3. 
Try initializing all the biases to zero 27 | void init(rne_t& rne) override; 28 | 29 | // The input vector should have size input_size_ 30 | void forward(num_t* inputs) override; 31 | // The output vector should have size output_size_ 32 | void reverse(num_t* gradients) override; 33 | 34 | size_t param_count() const noexcept override 35 | { 36 | // Weight matrix entries + bias entries 37 | return (input_size_ + 1) * output_size_; 38 | } 39 | 40 | num_t* param(size_t index); 41 | num_t* gradient(size_t index); 42 | 43 | void print() const override; 44 | 45 | private: 46 | Activation activation_; 47 | uint16_t output_size_; 48 | uint16_t input_size_; 49 | 50 | ///////////////////// 51 | // Node Parameters // 52 | ///////////////////// 53 | 54 | // weights_.size() := output_size_ * input_size_ 55 | std::vector weights_; 56 | // biases_.size() := output_size_ 57 | std::vector biases_; 58 | // activations_.size() := output_size_ 59 | std::vector activations_; 60 | 61 | //////////////////// 62 | // Loss Gradients // 63 | //////////////////// 64 | 65 | std::vector activation_gradients_; 66 | 67 | // During the training cycle, parameter loss gradients are accumulated in 68 | // the following buffers. 69 | std::vector weight_gradients_; 70 | std::vector bias_gradients_; 71 | 72 | // This buffer is used to store temporary gradients used in a SINGLE 73 | // backpropagation pass. Note that this does not accumulate like the weight 74 | // and bias gradients do. 75 | std::vector input_gradients_; 76 | 77 | // The last input is needed to compute loss gradients with respect to the 78 | // weights during backpropagation 79 | num_t* last_input_; 80 | }; 81 | -------------------------------------------------------------------------------- /src/GDOptimizer.cpp: -------------------------------------------------------------------------------- 1 | #include "GDOptimizer.hpp" 2 | #include "Model.hpp" 3 | #include 4 | 5 | GDOptimizer::GDOptimizer(num_t eta) 6 | : eta_{eta} 7 | {} 8 | 9 | void GDOptimizer::train(Node& node) 10 | { 11 | size_t param_count = node.param_count(); 12 | for (size_t i = 0; i != param_count; ++i) 13 | { 14 | num_t& param = *node.param(i); 15 | num_t& gradient = *node.gradient(i); 16 | 17 | param = param - eta_ * gradient; 18 | 19 | // Reset the gradient which will be accumulated again in the next 20 | // training epoch 21 | gradient = num_t{0.0}; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/GDOptimizer.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Model.hpp" 4 | 5 | // Note that this class defines the general gradient descent algorithm. It can 6 | // be used as part of the *Stochastic* gradient descent algorithm (aka SGD) by 7 | // invoking it after smaller batches of training data are evaluated. 8 | class GDOptimizer : public Optimizer 9 | { 10 | public: 11 | // "Eta" is the commonly accepted character used to denote the learning 12 | // rate. Given a loss gradient dL/dp for some parameter p, during gradient 13 | // descent, p will be adjusted such that p' = p - eta * dL/dp. 14 | GDOptimizer(num_t eta); 15 | 16 | // This should be invoked at the end of each batch's evaluation. The 17 | // interface technically permits the use of different optimizers for 18 | // different segments of the computational graph. 
19 | void train(Node& node) override; 20 | 21 | private: 22 | num_t eta_; 23 | }; 24 | -------------------------------------------------------------------------------- /src/MNIST.cpp: -------------------------------------------------------------------------------- 1 | #include "MNIST.hpp" 2 | 3 | #include 4 | #include 5 | 6 | // Read 4 bytes and reverse them to return an unsigned integer on LE 7 | // architectures 8 | void read_be(std::ifstream& in, uint32_t* out) 9 | { 10 | char* buf = reinterpret_cast(out); 11 | in.read(buf, 4); 12 | 13 | std::swap(buf[0], buf[3]); 14 | std::swap(buf[1], buf[2]); 15 | } 16 | 17 | MNIST::MNIST(Model& model, std::ifstream& images, std::ifstream& labels) 18 | : Node{model, "MNIST input"} 19 | , images_{images} 20 | , labels_{labels} 21 | { 22 | // Confirm that passed input file streams are well-formed MNIST data sets 23 | uint32_t image_magic; 24 | read_be(images, &image_magic); 25 | if (image_magic != 2051) 26 | { 27 | throw std::runtime_error{"Images file appears to be malformed"}; 28 | } 29 | read_be(images, &image_count_); 30 | 31 | uint32_t labels_magic; 32 | read_be(labels, &labels_magic); 33 | if (labels_magic != 2049) 34 | { 35 | throw std::runtime_error{"Labels file appears to be malformed"}; 36 | } 37 | 38 | uint32_t label_count; 39 | read_be(labels, &label_count); 40 | if (label_count != image_count_) 41 | { 42 | throw std::runtime_error( 43 | "Label count did not match the number of images supplied"); 44 | } 45 | 46 | uint32_t rows; 47 | uint32_t columns; 48 | read_be(images, &rows); 49 | read_be(images, &columns); 50 | if (rows != 28 || columns != 28) 51 | { 52 | throw std::runtime_error{ 53 | "Expected 28x28 images, non-MNIST data supplied"}; 54 | } 55 | 56 | printf("Loaded images file with %d entries\n", image_count_); 57 | } 58 | 59 | void MNIST::forward(num_t* data) 60 | { 61 | read_next(); 62 | for (Node* node : subsequents_) 63 | { 64 | node->forward(data_); 65 | } 66 | } 67 | 68 | void MNIST::print() const 69 | { 70 | // No learned parameters to display for an MNIST input node 71 | } 72 | 73 | void MNIST::read_next() 74 | { 75 | images_.read(buf_, DIM); 76 | num_t inv = num_t{1.0} / num_t{255.0}; 77 | for (size_t i = 0; i != DIM; ++i) 78 | { 79 | data_[i] = static_cast(buf_[i]) * inv; 80 | } 81 | 82 | char label; 83 | labels_.read(&label, 1); 84 | 85 | for (size_t i = 0; i != 10; ++i) 86 | { 87 | label_[i] = num_t{0.0}; 88 | } 89 | label_[static_cast(label)] = num_t{1.0}; 90 | } 91 | 92 | void MNIST::print_last() 93 | { 94 | for (size_t i = 0; i != 10; ++i) 95 | { 96 | if (label_[i] == num_t{1.0}) 97 | { 98 | printf("This is a %zu:\n", i); 99 | break; 100 | } 101 | } 102 | 103 | for (size_t i = 0; i != 28; ++i) 104 | { 105 | size_t offset = i * 28; 106 | for (size_t j = 0; j != 28; ++j) 107 | { 108 | if (data_[offset + j] > num_t{0.5}) 109 | { 110 | if (data_[offset + j] > num_t{0.9}) 111 | { 112 | printf("#"); 113 | } 114 | else if (data_[offset + j] > num_t{0.7}) 115 | { 116 | printf("*"); 117 | } 118 | else 119 | { 120 | printf("."); 121 | } 122 | } 123 | else 124 | { 125 | printf(" "); 126 | } 127 | } 128 | printf("\n"); 129 | } 130 | printf("\n"); 131 | } 132 | -------------------------------------------------------------------------------- /src/MNIST.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "Model.hpp" 4 | #include 5 | 6 | class MNIST : public Node 7 | { 8 | public: 9 | constexpr static size_t DIM = 28 * 28; 10 | 11 | MNIST(Model& model, std::ifstream& 
images, std::ifstream& labels); 12 | 13 | void init(rne_t&) override 14 | {} 15 | 16 | // As this is an input node, the argument to this function is ignored 17 | void forward(num_t* data = nullptr) override; 18 | // Backpropagation is a no-op for input nodes as there are no parameters to 19 | // update 20 | void reverse(num_t* data = nullptr) override 21 | {} 22 | 23 | // Parse the next image and label into memory 24 | void read_next(); 25 | 26 | void print() const override; 27 | 28 | [[nodiscard]] size_t size() const noexcept 29 | { 30 | return image_count_; 31 | } 32 | 33 | [[nodiscard]] num_t const* data() const noexcept 34 | { 35 | return data_; 36 | } 37 | 38 | [[nodiscard]] num_t* data() noexcept 39 | { 40 | return data_; 41 | } 42 | 43 | [[nodiscard]] num_t* label() noexcept 44 | { 45 | return label_; 46 | } 47 | 48 | [[nodiscard]] num_t const* label() const noexcept 49 | { 50 | return label_; 51 | } 52 | 53 | // Quick ASCII visualization of the last read image. For best results, 54 | // ensure that your terminal font is a monospace font. 55 | void print_last(); 56 | 57 | private: 58 | std::ifstream& images_; 59 | std::ifstream& labels_; 60 | uint32_t image_count_; 61 | // Data from the images file is read as one-byte unsigned values which are 62 | // converted to num_t after 63 | char buf_[DIM]; 64 | // All images are resized (with antialiasing) to a 28 x 28 row-major raster 65 | num_t data_[DIM]; 66 | // One-hot encoded label 67 | num_t label_[10]; 68 | }; 69 | -------------------------------------------------------------------------------- /src/Model.cpp: -------------------------------------------------------------------------------- 1 | #include "Model.hpp" 2 | 3 | Node::Node(Model& model, std::string name) 4 | : model_(model) 5 | , name_{std::move(name)} 6 | {} 7 | 8 | Model::Model(std::string name) 9 | : name_{std::move(name)} 10 | {} 11 | 12 | void Model::create_edge(Node& dst, Node& src) 13 | { 14 | // NOTE: No validation is done to ensure the edge doesn't already exist 15 | dst.antecedents_.push_back(&src); 16 | src.subsequents_.push_back(&dst); 17 | } 18 | 19 | rne_t::result_type Model::init(rne_t::result_type seed) 20 | { 21 | if (seed == 0) 22 | { 23 | // Generate a new random seed from the host random device 24 | std::random_device rd{}; 25 | seed = rd(); 26 | } 27 | std::printf("Initializing model parameters with seed: %u\n", seed); 28 | 29 | rne_t rne{seed}; 30 | 31 | for (auto& node : nodes_) 32 | { 33 | node->init(rne); 34 | } 35 | 36 | return seed; 37 | } 38 | 39 | void Model::train(Optimizer& optimizer) 40 | { 41 | for (auto&& node : nodes_) 42 | { 43 | optimizer.train(*node); 44 | } 45 | } 46 | 47 | void Model::print() const 48 | { 49 | // Invoke "print" on each node in the order added 50 | for (auto&& node : nodes_) 51 | { 52 | node->print(); 53 | } 54 | } 55 | 56 | void Model::save(std::ofstream& out) 57 | { 58 | // To save the model to disk, we employ a very simple scheme. All nodes are 59 | // looped through in the order they were added to the model. Then, all 60 | // advertised learnable parameters are serialized in host byte-order to the 61 | // supplied output stream. 62 | // 63 | // F.T.R. This simplistic method of saving the model to disk isn't very 64 | // robust or practical in the real world. For one thing, it contains no 65 | // reflection data about the topology of the model. Loading the data relies 66 | // on the model being constructed in the same manner it was trained on. 
67 | // Furthermore, the data will be parsed incorrectly if the program is 68 | // recompiled to operate with a different precision. Adopting a more 69 | // sensible serialization scheme is left as an exercise. 70 | for (auto& node : nodes_) 71 | { 72 | size_t param_count = node->param_count(); 73 | for (size_t i = 0; i != param_count; ++i) 74 | { 75 | out.write( 76 | reinterpret_cast(node->param(i)), sizeof(num_t)); 77 | } 78 | } 79 | } 80 | 81 | void Model::load(std::ifstream& in) 82 | { 83 | for (auto& node : nodes_) 84 | { 85 | size_t param_count = node->param_count(); 86 | for (size_t i = 0; i != param_count; ++i) 87 | { 88 | in.read(reinterpret_cast(node->param(i)), sizeof(num_t)); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/Model.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | // Default precision: single 11 | using num_t = float; 12 | // Default random number engine: 32-bit Mersenne Twister by Matsumoto and 13 | // Nishimura, 1998. For generating random numbers with double precision, the 14 | // 64-bit Mersenne Twister should be used. 15 | using rne_t = std::mt19937; 16 | 17 | enum class Activation 18 | { 19 | ReLU, 20 | Softmax 21 | }; 22 | 23 | class Model; 24 | 25 | // Base class of computational nodes in a model 26 | class Node 27 | { 28 | public: 29 | Node(Model& model, std::string name); 30 | virtual ~Node(){}; 31 | 32 | // Initialize the parameters of the node with a provided random number 33 | // engine. 34 | virtual void init(rne_t& rne) = 0; 35 | 36 | // Data is fed forward through the network using a simple generic interface. 37 | // We do this to avoid requiring an involved N-dimensional matrix 38 | // abstraction. Here, the "shape" of the data is dependent on the Node's 39 | // implementation and the way a given Node is initialized. 40 | // 41 | // In practice, this should be replaced with an actual type with a shape 42 | // defined by data to permit additional validation. It is also common for 43 | // the data object passed here to not contain the data directly (the data 44 | // may be located on a GPU for example) 45 | virtual void forward(num_t* inputs) = 0; 46 | 47 | // Expected inputs during the reverse accumulation phase are the loss 48 | // gradients with respect to each output 49 | // 50 | // The node is expected to compute the loss gradient with respect to each 51 | // parameter and update the parameter according to the model's optimizer, 52 | // after which, the gradients with respect to the node inputs are propagated 53 | // backwards again. 54 | virtual void reverse(num_t* gradients) = 0; 55 | 56 | // Returns the number of learnable parameters in this node. Nodes that are 57 | // input or loss nodes have no learnable parameters. 58 | virtual size_t param_count() const noexcept 59 | { 60 | return 0; 61 | } 62 | 63 | // Indexing operator for learnable parameters that are mutated during 64 | // training. Nodes without learnable parameters should keep this 65 | // unimplemented. 66 | virtual num_t* param(size_t index) 67 | { 68 | return nullptr; 69 | } 70 | 71 | // Indexing operator for the loss gradient with respect to a learnable 72 | // parameter. Used by an optimizer to adjust the corresponding parameter and 73 | // potentially for tracking gradient histories (done in more sophisticated 74 | // optimizers, e.g. 
AdaGrad) 75 | virtual num_t* gradient(size_t index) 76 | { 77 | return nullptr; 78 | } 79 | 80 | [[nodiscard]] std::string const& name() const noexcept 81 | { 82 | return name_; 83 | } 84 | 85 | // Generic function that displays the contents of the node in some fashion 86 | virtual void print() const = 0; 87 | 88 | protected: 89 | friend class Model; 90 | 91 | Model& model_; 92 | std::string name_; 93 | std::vector antecedents_; 94 | std::vector subsequents_; 95 | }; 96 | 97 | // Base class of optimizer used to train a model 98 | class Optimizer 99 | { 100 | public: 101 | virtual void train(Node& node) = 0; 102 | }; 103 | 104 | class Model 105 | { 106 | public: 107 | Model(std::string name); 108 | 109 | template 110 | Node_t& add_node(T&&... args) 111 | { 112 | nodes_.emplace_back( 113 | std::make_unique(*this, std::forward(args)...)); 114 | return reinterpret_cast(*nodes_.back()); 115 | } 116 | 117 | void create_edge(Node& dst, Node& src); 118 | 119 | // Initialize the parameters of all nodes with the provided seed. If the 120 | // seed is 0, a new random seed is chosen instead. Returns the seed used. 121 | rne_t::result_type init(rne_t::result_type seed = 0); 122 | 123 | void train(Optimizer& optimizer); 124 | 125 | [[nodiscard]] std::string const& name() const noexcept 126 | { 127 | return name_; 128 | } 129 | 130 | void print() const; 131 | 132 | void save(std::ofstream& out); 133 | void load(std::ifstream& in); 134 | 135 | private: 136 | friend class Node; 137 | 138 | std::string name_; 139 | std::vector> nodes_; 140 | }; 141 | -------------------------------------------------------------------------------- /src/main.cpp: -------------------------------------------------------------------------------- 1 | #include "CCELossNode.hpp" 2 | #include "FFNode.hpp" 3 | #include "GDOptimizer.hpp" 4 | #include "MNIST.hpp" 5 | #include "Model.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | static constexpr size_t batch_size = 80; 12 | 13 | Model create_model(std::ifstream& images, 14 | std::ifstream& labels, 15 | MNIST** mnist, 16 | CCELossNode** loss) 17 | { 18 | // Here we create a simple fully-connected feedforward neural network 19 | Model model{"ff"}; 20 | 21 | *mnist = &model.add_node(images, labels); 22 | 23 | FFNode& hidden = model.add_node("hidden", Activation::ReLU, 32, 784); 24 | 25 | FFNode& output 26 | = model.add_node("output", Activation::Softmax, 10, 32); 27 | 28 | *loss = &model.add_node("loss", 10, batch_size); 29 | (*loss)->set_target((*mnist)->label()); 30 | 31 | // F.T.R. The structure of our computational graph is completely sequential. 32 | // In fact, the fully connected node and loss node we've implemented here do 33 | // not support multiple inputs. 
Consider adding nodes that support "skip" 34 | // connections that forward outputs from earlier nodes to downstream nodes 35 | // that aren't directly adjacent (such skip nodes are used in the ResNet 36 | // architecture) 37 | model.create_edge(hidden, **mnist); 38 | model.create_edge(output, hidden); 39 | model.create_edge(**loss, output); 40 | return model; 41 | } 42 | 43 | void train(char* argv[]) 44 | { 45 | // Uncomment to debug floating point instability in the network 46 | // feenableexcept(FE_INVALID | FE_OVERFLOW); 47 | 48 | std::printf("Executing training routine\n"); 49 | 50 | std::ifstream images{ 51 | std::filesystem::path{argv[0]} / "train-images-idx3-ubyte", 52 | std::ios::binary}; 53 | 54 | std::ifstream labels{ 55 | std::filesystem::path{argv[0]} / "train-labels-idx1-ubyte", 56 | std::ios::binary}; 57 | 58 | MNIST* mnist; 59 | CCELossNode* loss; 60 | Model model = create_model(images, labels, &mnist, &loss); 61 | 62 | model.init(); 63 | 64 | // The gradient descent optimizer is stateless, but other optimizers may not 65 | // be. Some optimizers need to track "momentum" or gradient histories. 66 | // Others may slow the learning rate for each parameter at different rates 67 | // depending on various factors. 68 | // 69 | // F.T.R. Implement an alternative SGDOptimizer that decays the learning 70 | // rate over time and compare the results against this optimizer that learns 71 | // at a fixed rate. 72 | GDOptimizer optimizer{num_t{0.3}}; 73 | 74 | // F.T.R. Here, we've hardcoded the number of batches to train on. In 75 | // practice, training should halt when the average loss begins to 76 | // vascillate, indicating that the model is starting to overfit the data. 77 | // Implement some form of loss-improvement measure to determine when this 78 | // inflection point occurs and stop accordingly. 79 | size_t i = 0; 80 | for (; i != 256; ++i) 81 | { 82 | loss->reset_score(); 83 | 84 | for (size_t j = 0; j != batch_size; ++j) 85 | { 86 | mnist->forward(); 87 | loss->reverse(); 88 | } 89 | 90 | model.train(optimizer); 91 | } 92 | 93 | std::printf("Ran %zu batches (%zu samples each)\n", i, batch_size); 94 | 95 | // Print the average loss computed in the final batch 96 | loss->print(); 97 | 98 | std::ofstream out{ 99 | std::filesystem::current_path() / (model.name() + ".params"), 100 | std::ios::binary}; 101 | model.save(out); 102 | } 103 | 104 | void evaluate(char* argv[]) 105 | { 106 | std::printf("Executing evaluation routine\n"); 107 | 108 | std::ifstream images{ 109 | std::filesystem::path{argv[0]} / "t10k-images-idx3-ubyte", 110 | std::ios::binary}; 111 | 112 | std::ifstream labels{ 113 | std::filesystem::path{argv[0]} / "t10k-labels-idx1-ubyte", 114 | std::ios::binary}; 115 | 116 | MNIST* mnist; 117 | CCELossNode* loss; 118 | // For the data to be loaded properly, the model must be constructed in the 119 | // same manner as it was constructed during training. 120 | Model model = create_model(images, labels, &mnist, &loss); 121 | 122 | // Instead of initializing the parameters randomly, here we load it from 123 | // disk (saved from a previous training run). 
124 | std::ifstream params_file{std::filesystem::path{argv[1]}, std::ios::binary}; 125 | model.load(params_file); 126 | 127 | // Evaluate all 10000 images in the test set and compute the loss average 128 | for (size_t i = 0; i != mnist->size(); ++i) 129 | { 130 | mnist->forward(); 131 | } 132 | loss->print(); 133 | } 134 | 135 | int main(int argc, char* argv[]) 136 | { 137 | if (argc < 2) 138 | { 139 | std::printf("Supported commands include:\ntrain\nevaluate\n"); 140 | return 1; 141 | } 142 | 143 | if (strcmp(argv[1], "train") == 0) 144 | { 145 | train(argv + 2); 146 | } 147 | else if (strcmp(argv[1], "evaluate") == 0) 148 | { 149 | evaluate(argv + 2); 150 | } 151 | else 152 | { 153 | std::printf("Argument %s is an unrecognized directive.\n", argv[1]); 154 | } 155 | 156 | return 0; 157 | } 158 | --------------------------------------------------------------------------------