├── .clang-format
├── .gitignore
├── CMakeLists.txt
├── README.md
├── data
│   ├── test
│   │   ├── t10k-images-idx3-ubyte
│   │   └── t10k-labels-idx1-ubyte
│   └── train
│       ├── train-images-idx3-ubyte
│       └── train-labels-idx1-ubyte
├── doc
│   ├── 219994768ec294ab99637a7747627aeebd998d41.svg
│   ├── DOC.epub
│   ├── DOC.html
│   ├── DOC.md
│   ├── DOC.pdf
│   ├── DOC.tex
│   ├── Makefile
│   ├── cb34710dce878a77b3fd5f3e7e4746403aaaefcc.svg
│   ├── plots
│   │   ├── -1637788021081228918.png
│   │   ├── -1637788021081228918.txt
│   │   ├── -6767785830879840565.png
│   │   ├── -6767785830879840565.txt
│   │   ├── 6094492350593652429.png
│   │   └── 6094492350593652429.txt
│   └── tikz.lua
└── src
    ├── CCELossNode.cpp
    ├── CCELossNode.hpp
    ├── CMakeLists.txt
    ├── Dual.hpp
    ├── FFNode.cpp
    ├── FFNode.hpp
    ├── GDOptimizer.cpp
    ├── GDOptimizer.hpp
    ├── MNIST.cpp
    ├── MNIST.hpp
    ├── Model.cpp
    ├── Model.hpp
    └── main.cpp
/.clang-format:
--------------------------------------------------------------------------------
1 | AccessModifierOffset: -4
2 | AlignAfterOpenBracket: true
3 | AlignConsecutiveAssignments: true
4 | AlignConsecutiveDeclarations: false
5 | AlignEscapedNewlinesLeft: true
6 | AlignTrailingComments: true
7 | AllowAllParametersOfDeclarationOnNextLine: false
8 | AllowShortBlocksOnASingleLine: false
9 | AllowShortCaseLabelsOnASingleLine: false
10 | AllowShortFunctionsOnASingleLine: false
11 | AllowShortIfStatementsOnASingleLine: false
12 | AllowShortLoopsOnASingleLine: false
13 | AlwaysBreakAfterReturnType: None
14 | AlwaysBreakBeforeMultilineStrings: true
15 | AlwaysBreakTemplateDeclarations: true
16 | BinPackArguments: false
17 | BinPackParameters: false
18 | BreakBeforeBraces: Custom
19 | BraceWrapping:
20 | AfterClass: true
21 | AfterControlStatement: true
22 | AfterEnum: true
23 | AfterFunction: true
24 | AfterNamespace: true
25 | AfterObjCDeclaration: true
26 | AfterStruct: true
27 | AfterUnion: true
28 | AfterExternBlock: true
29 | BeforeCatch: true
30 | BeforeElse: true
31 | IndentBraces: false
32 | SplitEmptyFunction: false
33 | SplitEmptyRecord: false
34 | SplitEmptyNamespace: false
35 | BreakBeforeBinaryOperators: All
36 | BreakBeforeTernaryOperators: true
37 | BreakConstructorInitializers: BeforeComma
38 | BreakStringLiterals: true
39 | ColumnLimit: 80
40 | CommentPragmas: ''
41 | CompactNamespaces: false
42 | ConstructorInitializerAllOnOneLineOrOnePerLine: false
43 | ConstructorInitializerIndentWidth: 4
44 | ContinuationIndentWidth: 4
45 | Cpp11BracedListStyle: true
46 | DerivePointerBinding: false
47 | FixNamespaceComments: true
48 | IndentCaseLabels: false
49 | IndentPPDirectives: AfterHash
50 | IndentWidth: 4
51 | IndentWrappedFunctionNames: false
52 | KeepEmptyLinesAtTheStartOfBlocks: false
53 | Language: Cpp
54 | MaxEmptyLinesToKeep: 1
55 | NamespaceIndentation: Inner
56 | PenaltyBreakBeforeFirstCallParameter: 0
57 | PenaltyBreakComment: 0
58 | PenaltyBreakFirstLessLess: 0
59 | PenaltyBreakString: 1
60 | PenaltyExcessCharacter: 10
61 | PenaltyReturnTypeOnItsOwnLine: 20
62 | PointerAlignment: Left
63 | SortIncludes: true
64 | SortUsingDeclarations: true
65 | SpaceAfterTemplateKeyword: true
66 | SpaceBeforeAssignmentOperators: true
67 | SpaceBeforeParens: ControlStatements
68 | SpaceInEmptyParentheses: false
69 | SpacesBeforeTrailingComments: 1
70 | SpacesInAngles: false
71 | SpacesInCStyleCastParentheses: false
72 | SpacesInContainerLiterals: false
73 | SpacesInParentheses: false
74 | SpacesInSquareBrackets: false
75 | Standard: C++11
76 | TabWidth: 4
77 | UseTab: Never
78 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # System
2 | .DS_Store
3 |
4 | # IDE/Editor
5 | .ccls-cache
6 | .vscode
7 |
8 | # Build
9 | build
10 | build-clang
11 | build-gcc
12 | build-release
13 | .cache
14 |
15 | # Statically generated documentation
16 | site/
17 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.16)
2 |
3 | project(nn_in_a_weekend LANGUAGES CXX)
4 |
5 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
6 |
7 | add_subdirectory(src)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # C++ Neural Network in a Weekend
2 |
3 | This repository is the companion code to the article "Neural Network in a Weekend." Readers are welcome to clone the repository and use the code herein as a reference if following along with the article. Pull requests and issues filed for errors and bugs in the code and/or documentation are welcome and appreciated. However, pull requests that introduce new features are unlikely to be considered, as the ultimate goal of this code is to be tractable for a newer practitioner getting started with deep learning architectures.
4 |
5 | [Article pdf link](https://github.com/jeremyong/cpp_nn_in_a_weekend/raw/master/doc/DOC.pdf)
6 |
7 | ## Compilation and Usage
8 |
9 | mkdir build
10 | cd build
11 | # substitute Ninja for your preferred generator
12 | cmake .. -G Ninja
13 | ninja
14 | # trains the network and writes the learned parameters to disk
15 | ./src/nn train ../data/train
16 | # evaluate the model loss and accuracy based on the trained parameters
17 | ./src/nn evaluate ../data/test ./ff.params
18 |
19 | Note that the actual location of the `nn` executable may depend on your build system and build type. For performance reasons, it is recommended to run the training itself with an optimized build, reverting to a development/debug build only when debugging is needed.
20 |
21 | ## Conventions
22 |
23 | 1. Member variables have a single underscore suffix (e.g. `member_variable_`)
24 | 2. The `F.T.R.` acronym stands for "For the reader" and precedes suggestions for experimentation, improvements, or alternative implementations
25 | 3. Throughout, you may see the type aliases `num_t` and `rne_t`. These aliases refer to `float` and `std::mt19937` respectively and are defined in `Model.hpp` to easily experiment with alternative precisions and random number engines. The reader may wish to make these parameters changeable by other means.
26 |
27 | ## General Code Structure
28 |
29 | The neural network is modeled as a computational graph. The graph itself is the `Model` defined in `Model.hpp`. Nodes in the computational graph derive from the `Node` base class and must implement various methods to explain how data flows through the node (forwards and backwards).
30 |
31 | The fully-connected feedforward node in this example is implemented as `FFNode` in `FFNode.hpp`. The cross-entropy loss node is implemented in `CCELossNode.hpp`. Together, these two nodes are all that is needed to train our example on the MNIST dataset. A minimal skeleton of a custom node is sketched at the end of this README.
32 |
33 | ## Data
34 |
35 | For your convenience, the MNIST data used to train and test the network is provided uncompressed in the `data/` subdirectory. The data is structured like so:
36 |
37 | ### Images
38 |
39 | Image data can be parsed using code provided in the `MNIST.hpp` header, but the data is described here as well. Multi-byte integers are stored with the MSB first, meaning that on a little-endian architecture, the bytes must be flipped. Image pixel data is stored in row-major order and packed contiguously one after another.
40 |
41 | Bytes
42 | [00-03] 0x00000803 (Magic Number: 2051)
43 | [04-07] image count
44 | [08-11] rows
45 | [12-15] columns
46 | [16] pixel[0, 0]
47 | [17] pixel[0, 1]
48 | ...
49 |
50 | ### Labels
51 |
52 | Label data is parsed according to the following byte layout:
53 |
54 | Bytes
55 | [00-03] 0x00000801 (Magic Number: 2049)
56 | [04-07] label count
57 | [8] label 1
58 | [9] label 2
59 | ...
60 |
61 | The parser provided by the `MNIST` input node validates the magic numbers to ensure the machine endianness is as expected, and also validates that the image data and label data sizes match.
62 |
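63 | ## Example: Skeleton of a Custom Node
64 |
65 | As a sketch of the `Node` interface described under "General Code Structure", a new node type needs a constructor that forwards the `Model` reference and a name to `Node`, plus `init`, `forward`, `reverse`, and `print` overrides. The `ScaleNode` below is purely illustrative (it is not part of the repository's sources); it forwards its input scaled by a constant and back-propagates gradients accordingly. See `FFNode` and `CCELossNode` for complete, learnable implementations.
66 |
67 |     #include "Model.hpp"
68 |
69 |     // Hypothetical node: scales each of its inputs by a fixed constant
70 |     class ScaleNode : public Node
71 |     {
72 |     public:
73 |         ScaleNode(Model& model, std::string name, size_t size, num_t scale)
74 |             : Node{model, std::move(name)}
75 |             , size_{size}
76 |             , scale_{scale}
77 |         {
78 |             outputs_.resize(size_);
79 |             input_gradients_.resize(size_);
80 |         }
81 |
82 |         // No learnable parameters, so there is nothing to initialize
83 |         void init(rne_t&) override
84 |         {}
85 |
86 |         void forward(num_t* inputs) override
87 |         {
88 |             for (size_t i = 0; i != size_; ++i)
89 |             {
90 |                 outputs_[i] = scale_ * inputs[i];
91 |             }
92 |             for (Node* node : subsequents_)
93 |             {
94 |                 node->forward(outputs_.data());
95 |             }
96 |         }
97 |
98 |         void reverse(num_t* gradients) override
99 |         {
100 |             // d(scale * x)/dx = scale, so incoming gradients are scaled too
101 |             for (size_t i = 0; i != size_; ++i)
102 |             {
103 |                 input_gradients_[i] = scale_ * gradients[i];
104 |             }
105 |             for (Node* node : antecedents_)
106 |             {
107 |                 node->reverse(input_gradients_.data());
108 |             }
109 |         }
110 |
111 |         void print() const override
112 |         {}
113 |
114 |     private:
115 |         size_t size_;
116 |         num_t scale_;
117 |         std::vector<num_t> outputs_;
118 |         std::vector<num_t> input_gradients_;
119 |     };
120 |
121 | Such a node could be added with `model.add_node<ScaleNode>("scale", 784, num_t{0.5})` and wired in with `Model::create_edge`, exactly as the existing nodes are assembled in `src/main.cpp`.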
--------------------------------------------------------------------------------
/data/test/t10k-images-idx3-ubyte:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/data/test/t10k-images-idx3-ubyte
--------------------------------------------------------------------------------
/data/test/t10k-labels-idx1-ubyte:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/data/test/t10k-labels-idx1-ubyte
--------------------------------------------------------------------------------
/data/train/train-images-idx3-ubyte:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/data/train/train-images-idx3-ubyte
--------------------------------------------------------------------------------
/data/train/train-labels-idx1-ubyte:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/data/train/train-labels-idx1-ubyte
--------------------------------------------------------------------------------
/doc/219994768ec294ab99637a7747627aeebd998d41.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/doc/219994768ec294ab99637a7747627aeebd998d41.svg
--------------------------------------------------------------------------------
/doc/DOC.epub:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/doc/DOC.epub
--------------------------------------------------------------------------------
/doc/DOC.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/doc/DOC.pdf
--------------------------------------------------------------------------------
/doc/Makefile:
--------------------------------------------------------------------------------
1 | all: pdf html
2 |
3 | clean:
4 | rm -rf *.svg plots DOC.pdf DOC.html
5 |
6 | tex:
7 | pandoc -F pandoc-plot -s DOC.md -o DOC.tex
8 |
9 | pdf:
10 | pandoc -F pandoc-plot -s --katex DOC.md -o DOC.pdf
11 |
12 | html:
13 | pandoc -L tikz.lua -F pandoc-plot -s --katex DOC.md -o DOC.html
14 |
15 | epub:
16 | pandoc -t epub3 --webtex -L tikz.lua -F pandoc-plot -s DOC.md -o DOC.epub
17 |
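18 | # N.B. These targets assume pandoc and the pandoc-plot filter are installed;
19 | # the html and epub targets additionally run tikz.lua, which shells out to
20 | # pdflatex and pdf2svg to render the TikZ figures.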
--------------------------------------------------------------------------------
/doc/cb34710dce878a77b3fd5f3e7e4746403aaaefcc.svg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/doc/cb34710dce878a77b3fd5f3e7e4746403aaaefcc.svg
--------------------------------------------------------------------------------
/doc/plots/-1637788021081228918.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/doc/plots/-1637788021081228918.png
--------------------------------------------------------------------------------
/doc/plots/-1637788021081228918.txt:
--------------------------------------------------------------------------------
1 | # Generated by pandoc-plot 0.8.0.0
2 |
3 | import matplotlib.pyplot as plt
4 | import array as arr
5 | import math as math
6 |
7 | f = arr.array('f')
8 | f.append(0)
9 | f.append(0)
10 | f.append(1)
11 | x = arr.array('f')
12 | x.append(-1)
13 | x.append(-0)
14 | x.append(1)
15 |
16 | plt.figure()
17 | plt.plot(x, f)
18 | plt.xlabel('$x$')
19 | plt.ylabel('$\max(0, x)$')
20 | plt.title('Rectifier function')
--------------------------------------------------------------------------------
/doc/plots/-6767785830879840565.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/doc/plots/-6767785830879840565.png
--------------------------------------------------------------------------------
/doc/plots/-6767785830879840565.txt:
--------------------------------------------------------------------------------
1 | # Generated by pandoc-plot 0.8.0.0
2 |
3 | import matplotlib.pyplot as plt
4 | import array as arr
5 | import math as math
6 |
7 | s = arr.array('f')
8 | h = arr.array('f')
9 |
10 | last = 0
11 | n = 30
12 | for i in range(0, n):
13 | last += 1 / (n + 1)
14 | s.append(last)
15 | h.append(-(1 - last) * math.log(last) - last * math.log(1 - last))
16 |
17 | plt.figure()
18 | plt.plot(s, h)
19 | plt.xlabel('$S$')
20 | plt.ylabel('$-(1-S)\log S - S\log (1 - S)$')
21 | plt.title('Cross entropy with mismatched distribution')
--------------------------------------------------------------------------------
/doc/plots/6094492350593652429.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeremyong/cpp_nn_in_a_weekend/152e8cbd361161d8b526c021ef9818ea9dbfe034/doc/plots/6094492350593652429.png
--------------------------------------------------------------------------------
/doc/plots/6094492350593652429.txt:
--------------------------------------------------------------------------------
1 | # Generated by pandoc-plot 0.8.0.0
2 |
3 | import matplotlib.pyplot as plt
4 | import array as arr
5 | import math as math
6 |
7 | s = arr.array('f')
8 | s.append(0)
9 | h = arr.array('f')
10 | h.append(0)
11 |
12 | last = 0
13 | n = 30
14 | for i in range(0, n):
15 | last += 1 / (n + 1)
16 | s.append(last)
17 | h.append(-last * math.log(last) - (1 - last) * math.log(1 - last))
18 |
19 | s.append(1.0)
20 | h.append(0)
21 |
22 | plt.figure()
23 | plt.plot(s, h)
24 | plt.xlabel('$S$')
25 | plt.ylabel('$H(S) = -S\log S - (1 - S)\log (1 - S)$')
26 | plt.title('Binary Entropy')
--------------------------------------------------------------------------------
/doc/tikz.lua:
--------------------------------------------------------------------------------
1 | local system = require 'pandoc.system'
2 |
3 | local tikz_doc_template = [[
4 | \documentclass{standalone}
5 | \usepackage{xcolor}
6 | \usepackage{tikz}
7 | \usetikzlibrary{positioning,calc,arrows}
8 | \renewenvironment{center} {} {}
9 | \begin{document}
10 | \nopagecolor
11 | %s
12 | \end{document}
13 | ]]
14 |
15 | local function tikz2image(src, filetype, outfile)
16 | system.with_temporary_directory('tikz2image', function (tmpdir)
17 | system.with_working_directory(tmpdir, function()
18 | local f = io.open('tikz.tex', 'w')
19 | f:write(tikz_doc_template:format(src))
20 | f:close()
21 | os.execute('pdflatex tikz.tex')
22 | if filetype == 'pdf' then
23 | os.rename('tikz.pdf', outfile)
24 | else
25 | os.execute('pdf2svg tikz.pdf ' .. outfile)
26 | end
27 | end)
28 | end)
29 | end
30 |
31 | extension_for = {
32 | html = 'svg',
33 | html4 = 'svg',
34 | html5 = 'svg',
35 | latex = 'pdf',
36 | beamer = 'pdf' }
37 |
38 | local function file_exists(name)
39 | local f = io.open(name, 'r')
40 | if f ~= nil then
41 | io.close(f)
42 | return true
43 | else
44 | return false
45 | end
46 | end
47 |
48 | local function starts_with(start, str)
49 | return str:sub(1, #start) == start
50 | end
51 |
52 |
53 | function RawBlock(el)
54 | if starts_with('\\begin{center}', el.text) then
55 | local filetype = extension_for[FORMAT] or 'svg'
56 | local fname = system.get_working_directory() .. '/' ..
57 | pandoc.sha1(el.text) .. '.' .. filetype
58 | if not file_exists(fname) then
59 | tikz2image(el.text, filetype, fname)
60 | end
61 | return pandoc.Para({pandoc.Image({}, fname)})
62 | else
63 | return el
64 | end
65 | end
66 |
--------------------------------------------------------------------------------
/src/CCELossNode.cpp:
--------------------------------------------------------------------------------
1 | #include "CCELossNode.hpp"
2 | #include <limits>
3 |
4 | CCELossNode::CCELossNode(Model& model,
5 | std::string name,
6 | uint16_t input_size,
7 | size_t batch_size)
8 | : Node{model, std::move(name)}
9 | , input_size_{input_size}
10 | , inv_batch_size_{num_t{1.0} / static_cast<num_t>(batch_size)}
11 | {
12 | // When we deliver a gradient back, we deliver just the loss gradient with
13 | // respect to any input and the index that was "hot" in the second argument.
14 | gradients_.resize(input_size_);
15 | }
16 |
17 | void CCELossNode::forward(num_t* data)
18 | {
19 | // The cross-entropy categorical loss is defined as -\sum_i(q_i * log(p_i))
20 | // where p_i is the predicted probability and q_i is the expected probability
21 | //
22 | // In information theory, by convention, lim_{x approaches 0}(x log(x)) = 0
23 |
24 | num_t max{0.0};
25 | size_t max_index = 0;
26 |
27 | loss_ = num_t{0.0};
28 | for (size_t i = 0; i != input_size_; ++i)
29 | {
30 | if (data[i] > max)
31 | {
32 | max_index = i;
33 | max = data[i];
34 | }
35 |
36 | // Because the target vector is one-hot encoded, most of these terms
37 | // will be zero, but we leave the full calculation here to be explicit
38 | // and in the event we want to compute losses against probability
39 | // distributions that aren't one-hot. In practice, a faster code path
40 | // should be employed if the targets are known to be one-hot
41 | // distributions.
42 | loss_ -= target_[i]
43 | * std::log(
44 | // Prevent undefined results when taking the log of 0
45 | std::max(data[i], std::numeric_limits<num_t>::epsilon()));
46 |
47 | if (target_[i] != num_t{0.0})
48 | {
49 | active_ = i;
50 | }
51 |
52 | // NOTE: The astute reader may notice that the gradients associated with
53 | // many of the loss node's input signals will be zero because the
54 | // cross-entropy is performed with respect to a one-hot vector.
55 | // Fortunately, because the layer preceding the output layer is a
56 | // softmax layer, the gradient from the single term contributing in the
57 | // above expression has a dependency on *every* softmax output unit (all
58 | // outputs show up in the summation in the softmax denominator).
59 | }
60 |
61 | if (max_index == active_)
62 | {
63 | ++correct_;
64 | }
65 | else
66 | {
67 | ++incorrect_;
68 | }
69 |
70 | cumulative_loss_ += loss_;
71 |
72 | // Store the data pointer to compute gradients later
73 | last_input_ = data;
74 | }
75 |
76 | void CCELossNode::reverse(num_t* data)
77 | {
78 | // dJ/dp_i = d(-\sum_i(q_i log(p_i)))/dp_i = -q_i / p_i, i.e. -1 / p_j where
79 | // j is the index of the correct classification (loss gradient for a single sample).
80 | //
81 | // Note the normalization factor where we multiply by the inverse batch
82 | // size. This ensures that losses computed by the network are similar in
83 | // scale irrespective of batch size.
84 |
85 | for (size_t i = 0; i != input_size_; ++i)
86 | {
87 | gradients_[i] = -inv_batch_size_ * target_[i] / last_input_[i];
88 | }
89 |
90 | for (Node* node : antecedents_)
91 | {
92 | node->reverse(gradients_.data());
93 | }
94 | }
95 |
96 | void CCELossNode::print() const
97 | {
98 | std::printf("Avg Loss: %f\t%f%% correct\n", avg_loss(), accuracy() * 100.0);
99 | }
100 |
101 | num_t CCELossNode::accuracy() const
102 | {
103 | return static_cast<num_t>(correct_)
104 | / static_cast<num_t>(correct_ + incorrect_);
105 | }
106 | num_t CCELossNode::avg_loss() const
107 | {
108 | return cumulative_loss_ / static_cast<num_t>(correct_ + incorrect_);
109 | }
110 |
111 | void CCELossNode::reset_score()
112 | {
113 | cumulative_loss_ = num_t{0.0};
114 | correct_ = 0;
115 | incorrect_ = 0;
116 | }
117 |
--------------------------------------------------------------------------------
/src/CCELossNode.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "Model.hpp"
4 |
5 | // Categorical Cross-Entropy Loss Node
6 | // Assumes input data is "one-hot encoded," with size equal to the number of
7 | // possible classifications, where the "answer" has a single "1" (aka hot value)
8 | // in one of the classification positions and zero everywhere else.
9 |
10 | class CCELossNode : public Node
11 | {
12 | public:
13 | CCELossNode(Model& model,
14 | std::string name,
15 | uint16_t input_size,
16 | size_t batch_size);
17 |
18 | // No initialization is needed for this node
19 | void init(rne_t&) override
20 | {}
21 |
22 | void forward(num_t* inputs) override;
23 | // As a loss node, the argument to this method is ignored (the gradient of
24 | // the loss with respect to itself is unity)
25 | void reverse(num_t* gradients = nullptr) override;
26 |
27 | void print() const override;
28 |
29 | void set_target(num_t const* target)
30 | {
31 | target_ = target;
32 | }
33 |
34 | num_t accuracy() const;
35 | num_t avg_loss() const;
36 | void reset_score();
37 |
38 | private:
39 | uint16_t input_size_;
40 |
41 | // We minimize the average loss, not the net loss so that the losses
42 | // produced do not scale with batch size (which allows us to keep training
43 | // parameters constant)
44 | num_t inv_batch_size_;
45 | num_t loss_;
46 | num_t const* target_;
47 | num_t* last_input_;
48 | // Stores the last active classification in the target one-hot encoding
49 | size_t active_;
50 | num_t cumulative_loss_{0.0};
51 | // Store running counts of correct and incorrect predictions
52 | size_t correct_ = 0;
53 | size_t incorrect_ = 0;
54 | std::vector<num_t> gradients_;
55 | };
56 |
--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(
2 | nn
3 | main.cpp
4 | CCELossNode.cpp
5 | FFNode.cpp
6 | GDOptimizer.cpp
7 | MNIST.cpp
8 | Model.cpp
9 | )
10 |
11 | target_compile_features(nn PUBLIC cxx_std_17)
12 |
--------------------------------------------------------------------------------
/src/Dual.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | template <typename T>
4 | struct Dual
5 | {
6 | T real_ = T{0.0};
7 | T dual_ = T{1.0};
8 | };
9 |
10 | template <typename T>
11 | [[nodiscard]] Dual<T> operator+(Dual<T>&& a, Dual<T>&& b) noexcept
12 | {
13 | return {a.real_ + b.real_, a.dual_ + b.dual_};
14 | }
15 |
16 | template <typename T>
17 | [[nodiscard]] Dual<T> operator-(Dual<T>&& a, Dual<T>&& b) noexcept
18 | {
19 | return {a.real_ - b.real_, a.dual_ - b.dual_};
20 | }
21 |
22 | // (a + eb) * (c + ed) = ac + ebc + ead + e^2bd = ac + e(bc + ad)
23 | template <typename T>
24 | [[nodiscard]] constexpr Dual<T> operator*(Dual<T>&& a, Dual<T>&& b) noexcept
25 | {
26 | return {
27 | a.real_ * b.real_,
28 | a.real_ * b.dual_ + b.real_ * a.dual_,
29 | };
30 | }
31 |
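32 | // Usage sketch (illustrative): forward-mode differentiation of f(x) = x * x
33 | // at x = 3. Seeding the dual part with 1 carries df/dx alongside the value:
34 | //
35 | //   auto y = Dual<float>{3.0f, 1.0f} * Dual<float>{3.0f, 1.0f};
36 | //   // y.real_ == 9.0f (the value), y.dual_ == 6.0f (the derivative 2x = 6)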
--------------------------------------------------------------------------------
/src/FFNode.cpp:
--------------------------------------------------------------------------------
1 | #include "FFNode.hpp"
2 |
3 | #include <algorithm>
4 | #include <cmath>
5 | #include <cstdio>
6 | #include <random>
7 |
8 | FFNode::FFNode(Model& model,
9 | std::string name,
10 | Activation activation,
11 | uint16_t output_size,
12 | uint16_t input_size)
13 | : Node{model, std::move(name)}
14 | , activation_{activation}
15 | , output_size_{output_size}
16 | , input_size_{input_size}
17 | {
18 | std::printf("%s: %d -> %d\n", name_.c_str(), input_size_, output_size_);
19 |
20 | // The weight parameters of a FF-layer are an NxM matrix
21 | weights_.resize(output_size_ * input_size_);
22 |
23 | // Each node in this layer is assigned a bias (so that zero is not
24 | // necessarily mapped to zero)
25 | biases_.resize(output_size_);
26 |
27 | // The output of each neuron within the layer is an "activation" in
28 | // neuroscience parlance
29 | activations_.resize(output_size_);
30 |
31 | activation_gradients_.resize(output_size_);
32 | weight_gradients_.resize(output_size_ * input_size_);
33 | bias_gradients_.resize(output_size_);
34 | input_gradients_.resize(input_size_);
35 | }
36 |
37 | void FFNode::init(rne_t& rne)
38 | {
39 | num_t sigma;
40 | switch (activation_)
41 | {
42 | case Activation::ReLU:
43 | // Kaiming He, et. al. weight initialization for ReLU networks
44 | // https://arxiv.org/pdf/1502.01852.pdf
45 | //
46 | // Suggests using a normal distribution with variance := 2 / n_in
47 | sigma = std::sqrt(2.0 / static_cast<num_t>(input_size_));
48 | break;
49 | case Activation::Softmax:
50 | default:
51 | sigma = std::sqrt(1.0 / static_cast<num_t>(input_size_));
52 | break;
53 | }
54 |
55 | // NOTE: Unfortunately, the C++ standard does not guarantee that the results
56 | // obtained from a distribution function will be identical given the same
57 | // inputs across different compilers and platforms. A production ML
58 | // framework will likely implement its own distributions to provide
59 | // deterministic results.
60 | auto dist = std::normal_distribution<num_t>{0.0, sigma};
61 |
62 | for (num_t& w : weights_)
63 | {
64 | w = dist(rne);
65 | }
66 |
67 | // NOTE: Setting biases to zero is a common practice, as is initializing the
68 | // bias to a small value (e.g. on the order of 0.01). It is unclear if the
69 | // latter produces a consistent result over the former, but the thinking is
70 | // that a non-zero bias will ensure that the neuron always "fires" at the
71 | // beginning to produce a signal.
72 | //
73 | // Here, we initialize all biases to a small number, but the reader should
74 | // consider experimenting with other approaches.
75 | for (num_t& b : biases_)
76 | {
77 | b = 0.01;
78 | }
79 | }
80 |
81 | void FFNode::forward(num_t* inputs)
82 | {
83 | // Remember the last input data for backpropagation later
84 | last_input_ = inputs;
85 |
86 | for (size_t i = 0; i != output_size_; ++i)
87 | {
88 | // For each output vector, compute the dot product of the input data
89 | // with the weight vector and add the bias
90 |
91 | num_t z{0.0};
92 |
93 | size_t offset = i * input_size_;
94 |
95 | for (size_t j = 0; j != input_size_; ++j)
96 | {
97 | z += weights_[offset + j] * inputs[j];
98 | }
99 | // Add neuron bias
100 | z += biases_[i];
101 |
102 | switch (activation_)
103 | {
104 | case Activation::ReLU:
105 | activations_[i] = std::max(z, num_t{0.0});
106 | break;
107 | case Activation::Softmax:
108 | default:
109 | activations_[i] = std::exp(z);
110 | break;
111 | }
112 | }
113 |
114 | if (activation_ == Activation::Softmax)
115 | {
116 | // softmax(z)_i = exp(z_i) / \sum_j(exp(z_j))
117 | num_t sum_exp_z{0.0};
118 | for (size_t i = 0; i != output_size_; ++i)
119 | {
120 | // NOTE: with exploding gradients, it is quite easy for this
121 | // exponential function to overflow, which will result in NaNs
122 | // infecting the network.
123 | sum_exp_z += activations_[i];
124 | }
125 | num_t inv_sum_exp_z = num_t{1.0} / sum_exp_z;
126 | for (size_t i = 0; i != output_size_; ++i)
127 | {
128 | activations_[i] *= inv_sum_exp_z;
129 | }
130 | }
131 |
132 | // Forward activation data to all subsequent nodes in the computational
133 | // graph
134 | for (Node* subsequent : subsequents_)
135 | {
136 | subsequent->forward(activations_.data());
137 | }
138 | }
139 |
140 | void FFNode::reverse(num_t* gradients)
141 | {
142 | // We receive a vector of output_size_ gradients of the loss function with
143 | // respect to the activations of this node.
144 |
145 | // We need to compute the gradients of the loss function with respect to
146 | // each parameter in the node (all weights and biases). In addition, we need
147 | // to compute the gradients with respect to the inputs in order to propagate
148 | // the gradients further.
149 |
150 | // Notation:
151 | //
152 | // Subscripts on any of the following vector and matrix quantities are used
153 | // to specify a specific element of the vector or matrix.
154 | //
155 | // - I is the input vector
156 | // - W is the weight matrix
157 | // - B is the bias vector
158 | // - Z = W*I + B
159 | // - A is our activation function (ReLU or Softmax in this case)
160 | // - J is the total loss (cost)
161 | //
162 | // The gradient we receive from the subsequent is dJ/dg(Z) which we can use
163 | // to compute dJ/dW_{i, j}, dJ/dB_i, and dJ/dI_i
164 |
165 | // First, we compute dJ/dz as dJ/dg(z) * dg(z)/dz and store it in our
166 | // activations array
167 | for (size_t i = 0; i != output_size_; ++i)
168 | {
169 | // dg(z)/dz
170 | num_t activation_grad{0.0};
171 | switch (activation_)
172 | {
173 | case Activation::ReLU:
174 | // For a ReLU function, the gradient is unity when the activation
175 | // exceeds 0.0, and 0.0 otherwise. Technically, the gradient is
176 | // undefined at 0, but in practice, defining the gradient at this
177 | // point to be 0 isn't an issue
178 | if (activations_[i] > num_t{0.0})
179 | {
180 | activation_grad = num_t{1.0};
181 | }
182 | else
183 | {
184 | activation_grad = num_t{0.0};
185 | }
186 | // dJ/dz = dJ/dg(z) * dg(z)/dz
187 | activation_gradients_[i] = gradients[i] * activation_grad;
188 | break;
189 | case Activation::Softmax:
190 | default:
191 | // F.T.R. The implementation here correctly computes gradients for
192 | // the general softmax function accounting for all received
193 | // gradients. However, this step can be optimized significantly if
194 | // it is known that the softmax output is being compared to a
195 | // one-hot distribution. The softmax output of a given unit is
196 | // exp(z_i) / \sum_j exp(z_j). When the loss gradient with respect
197 | // to the softmax outputs is returned, a single i is selected from
198 | // among the softmax outputs in a 1-hot encoding, corresponding to
199 | // the correct classification for this training sample. Complete the
200 | // derivation for the gradient of the softmax assuming a one-hot
201 | // distribution and implement the optimized routine.
202 |
203 | for (size_t j = 0; j != output_size_; ++j)
204 | {
205 | if (i == j)
206 | {
207 | activation_grad += activations_[i]
208 | * (num_t{1.0} - activations_[i])
209 | * gradients[j];
210 | }
211 | else
212 | {
213 | activation_grad
214 | += -activations_[i] * activations_[j] * gradients[j];
215 | }
216 | }
217 |
218 | activation_gradients_[i] = activation_grad;
219 | break;
220 | }
221 | }
222 |
223 | for (size_t i = 0; i != output_size_; ++i)
224 | {
225 | // Next, let's compute the partial dJ/db_i. If we hold all the weights
226 | // and inputs constant, it's clear that dz/db_i is just 1 (consider
227 | // differentiating the line mx + b with respect to b). Thus, dJ/db_i =
228 | // dJ/dg(z_i) * dg(z_i)/dz_i.
229 | bias_gradients_[i] += activation_gradients_[i];
230 | }
231 |
232 | // CAREFUL! Unlike the other gradients, we reset input gradients to 0. These
233 | // values are used primarily as a subexpression in computing upstream
234 | // gradients and do not participate in the network optimization step (aka
235 | // Stochastic Gradient Descent) later.
236 | std::fill(input_gradients_.begin(), input_gradients_.end(), num_t{0.0});
237 |
238 | // To compute dz/dI_i, recall that z_i = \sum_j W_{ij}*I_j + B_i. That is, the
239 | // precursor to each activation is a dot-product between a weight vector and
240 | // the input plus a bias. Thus, dz/dI_i must be the sum of all weights that
241 | // were scaled by I_i during the forward pass.
242 | for (size_t i = 0; i != output_size_; ++i)
243 | {
244 | size_t offset = i * input_size_;
245 | for (size_t j = 0; j != input_size_; ++j)
246 | {
247 | input_gradients_[j]
248 | += weights_[offset + j] * activation_gradients_[i];
249 | }
250 | }
251 |
252 | for (size_t i = 0; i != input_size_; ++i)
253 | {
254 | for (size_t j = 0; j != output_size_; ++j)
255 | {
256 | // Each individual weight shows up in the equation for z once and is
257 | // scaled by the corresponding input. Thus, dJ/dw_ij = dJ/dg(z_i) *
258 | // dg(z_i)/dz_i * dz_i/dw_ij where the last factor is simply the
259 | // corresponding input I_j.
260 |
261 | weight_gradients_[j * input_size_ + i]
262 | += last_input_[i] * activation_gradients_[j];
263 | }
264 | }
265 |
266 | for (Node* node : antecedents_)
267 | {
268 | // Forward loss gradients with respect to the inputs to the previous
269 | // node.
270 | //
271 | // F.T.R. Technically, if the antecedent node has no learnable
272 | // parameters, there is no point forwarding gradients to that node.
273 | // Furthermore, if no antecedent nodes required any gradients, we could
274 | // have skipped computing the gradients for this node altogether. A
275 | // simple way to implement this is to add a `parameter_count` virtual
276 | // method on the Node interface and leverage it to save some work whenever
277 | // possible here.
278 | node->reverse(input_gradients_.data());
279 | }
280 | }
281 |
282 | // F.T.R. It is more efficient to store parameters contiguously so they can be
283 | // accessed without branching or arithmetic.
284 | num_t* FFNode::param(size_t index)
285 | {
286 | if (index < weights_.size())
287 | {
288 | return &weights_[index];
289 | }
290 | return &biases_[index - weights_.size()];
291 | }
292 |
293 | num_t* FFNode::gradient(size_t index)
294 | {
295 | if (index < weights_.size())
296 | {
297 | return &weight_gradients_[index];
298 | }
299 | return &bias_gradients_[index - weights_.size()];
300 | }
301 |
302 | void FFNode::print() const
303 | {
304 | std::printf("%s\n", name_.c_str());
305 |
306 | // Consider the input samples as column vectors, and visualize the weights
307 | // as a matrix transforming vectors with input_size_ dimension to output_size_
308 | // dimension
309 | std::printf("Weights (%d x %d)\n", output_size_, input_size_);
310 | for (size_t i = 0; i != output_size_; ++i)
311 | {
312 | size_t offset = i * input_size_;
313 | for (size_t j = 0; j != input_size_; ++j)
314 | {
315 | std::printf("\t[%zu]%f", offset + j, weights_[offset + j]);
316 | }
317 | std::printf("\n");
318 | }
319 | std::printf("Biases (%d x 1)\n", output_size_);
320 | for (size_t i = 0; i != output_size_; ++i)
321 | {
322 | std::printf("\t%f\n", biases_[i]);
323 | }
324 | std::printf("\n");
325 | }
326 |
--------------------------------------------------------------------------------
/src/FFNode.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "Model.hpp"
4 |
5 | #include <cstdint>
6 | #include <vector>
7 |
8 | // Fully-connected, feedforward Layer
9 |
10 | // A feedforward layer is parameterized by the number of neurons it possesses and
11 | // the number of neurons in the layer preceding it
12 | class FFNode : public Node
13 | {
14 | public:
15 | FFNode(Model& model,
16 | std::string name,
17 | Activation activation,
18 | uint16_t output_size,
19 | uint16_t input_size);
20 |
21 | // Initialize the parameters of the layer
22 | // F.T.R.
23 | // Experiment with alternative weight and bias initialization schemes:
24 | // 1. Try different distributions for the weights
25 | // 2. Try initializing all weights to zero (why is this suboptimal?)
26 | // 3. Try initializing all the biases to zero
27 | void init(rne_t& rne) override;
28 |
29 | // The input vector should have size input_size_
30 | void forward(num_t* inputs) override;
31 | // The output vector should have size output_size_
32 | void reverse(num_t* gradients) override;
33 |
34 | size_t param_count() const noexcept override
35 | {
36 | // Weight matrix entries + bias entries
37 | return (input_size_ + 1) * output_size_;
38 | }
39 |
40 | num_t* param(size_t index);
41 | num_t* gradient(size_t index);
42 |
43 | void print() const override;
44 |
45 | private:
46 | Activation activation_;
47 | uint16_t output_size_;
48 | uint16_t input_size_;
49 |
50 | /////////////////////
51 | // Node Parameters //
52 | /////////////////////
53 |
54 | // weights_.size() := output_size_ * input_size_
55 | std::vector<num_t> weights_;
56 | // biases_.size() := output_size_
57 | std::vector<num_t> biases_;
58 | // activations_.size() := output_size_
59 | std::vector<num_t> activations_;
60 |
61 | ////////////////////
62 | // Loss Gradients //
63 | ////////////////////
64 |
65 | std::vector<num_t> activation_gradients_;
66 |
67 | // During the training cycle, parameter loss gradients are accumulated in
68 | // the following buffers.
69 | std::vector<num_t> weight_gradients_;
70 | std::vector<num_t> bias_gradients_;
71 |
72 | // This buffer is used to store temporary gradients used in a SINGLE
73 | // backpropagation pass. Note that this does not accumulate like the weight
74 | // and bias gradients do.
75 | std::vector<num_t> input_gradients_;
76 |
77 | // The last input is needed to compute loss gradients with respect to the
78 | // weights during backpropagation
79 | num_t* last_input_;
80 | };
81 |
--------------------------------------------------------------------------------
/src/GDOptimizer.cpp:
--------------------------------------------------------------------------------
1 | #include "GDOptimizer.hpp"
2 | #include "Model.hpp"
3 | #include <cstddef>
4 |
5 | GDOptimizer::GDOptimizer(num_t eta)
6 | : eta_{eta}
7 | {}
8 |
9 | void GDOptimizer::train(Node& node)
10 | {
11 | size_t param_count = node.param_count();
12 | for (size_t i = 0; i != param_count; ++i)
13 | {
14 | num_t& param = *node.param(i);
15 | num_t& gradient = *node.gradient(i);
16 |
17 | param = param - eta_ * gradient;
18 |
19 | // Reset the gradient which will be accumulated again in the next
20 | // training epoch
21 | gradient = num_t{0.0};
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/src/GDOptimizer.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "Model.hpp"
4 |
5 | // Note that this class defines the general gradient descent algorithm. It can
6 | // be used as part of the *Stochastic* gradient descent algorithm (aka SGD) by
7 | // invoking it after smaller batches of training data are evaluated.
8 | class GDOptimizer : public Optimizer
9 | {
10 | public:
11 | // "Eta" is the commonly accepted character used to denote the learning
12 | // rate. Given a loss gradient dL/dp for some parameter p, during gradient
13 | // descent, p will be adjusted such that p' = p - eta * dL/dp.
14 | GDOptimizer(num_t eta);
15 |
16 | // This should be invoked at the end of each batch's evaluation. The
17 | // interface technically permits the use of different optimizers for
18 | // different segments of the computational graph.
19 | void train(Node& node) override;
20 |
21 | private:
22 | num_t eta_;
23 | };
24 |
--------------------------------------------------------------------------------
/src/MNIST.cpp:
--------------------------------------------------------------------------------
1 | #include "MNIST.hpp"
2 |
3 | #include <stdexcept>
4 | #include <utility>
5 |
6 | // Read 4 bytes and reverse them to return an unsigned integer on LE
7 | // architectures
8 | void read_be(std::ifstream& in, uint32_t* out)
9 | {
10 | char* buf = reinterpret_cast<char*>(out);
11 | in.read(buf, 4);
12 |
13 | std::swap(buf[0], buf[3]);
14 | std::swap(buf[1], buf[2]);
15 | }
16 |
17 | MNIST::MNIST(Model& model, std::ifstream& images, std::ifstream& labels)
18 | : Node{model, "MNIST input"}
19 | , images_{images}
20 | , labels_{labels}
21 | {
22 | // Confirm that passed input file streams are well-formed MNIST data sets
23 | uint32_t image_magic;
24 | read_be(images, &image_magic);
25 | if (image_magic != 2051)
26 | {
27 | throw std::runtime_error{"Images file appears to be malformed"};
28 | }
29 | read_be(images, &image_count_);
30 |
31 | uint32_t labels_magic;
32 | read_be(labels, &labels_magic);
33 | if (labels_magic != 2049)
34 | {
35 | throw std::runtime_error{"Labels file appears to be malformed"};
36 | }
37 |
38 | uint32_t label_count;
39 | read_be(labels, &label_count);
40 | if (label_count != image_count_)
41 | {
42 | throw std::runtime_error(
43 | "Label count did not match the number of images supplied");
44 | }
45 |
46 | uint32_t rows;
47 | uint32_t columns;
48 | read_be(images, &rows);
49 | read_be(images, &columns);
50 | if (rows != 28 || columns != 28)
51 | {
52 | throw std::runtime_error{
53 | "Expected 28x28 images, non-MNIST data supplied"};
54 | }
55 |
56 | printf("Loaded images file with %d entries\n", image_count_);
57 | }
58 |
59 | void MNIST::forward(num_t* data)
60 | {
61 | read_next();
62 | for (Node* node : subsequents_)
63 | {
64 | node->forward(data_);
65 | }
66 | }
67 |
68 | void MNIST::print() const
69 | {
70 | // No learned parameters to display for an MNIST input node
71 | }
72 |
73 | void MNIST::read_next()
74 | {
75 | images_.read(buf_, DIM);
76 | num_t inv = num_t{1.0} / num_t{255.0};
77 | for (size_t i = 0; i != DIM; ++i)
78 | {
79 | data_[i] = static_cast<uint8_t>(buf_[i]) * inv;
80 | }
81 |
82 | char label;
83 | labels_.read(&label, 1);
84 |
85 | for (size_t i = 0; i != 10; ++i)
86 | {
87 | label_[i] = num_t{0.0};
88 | }
89 | label_[static_cast<uint8_t>(label)] = num_t{1.0};
90 | }
91 |
92 | void MNIST::print_last()
93 | {
94 | for (size_t i = 0; i != 10; ++i)
95 | {
96 | if (label_[i] == num_t{1.0})
97 | {
98 | printf("This is a %zu:\n", i);
99 | break;
100 | }
101 | }
102 |
103 | for (size_t i = 0; i != 28; ++i)
104 | {
105 | size_t offset = i * 28;
106 | for (size_t j = 0; j != 28; ++j)
107 | {
108 | if (data_[offset + j] > num_t{0.5})
109 | {
110 | if (data_[offset + j] > num_t{0.9})
111 | {
112 | printf("#");
113 | }
114 | else if (data_[offset + j] > num_t{0.7})
115 | {
116 | printf("*");
117 | }
118 | else
119 | {
120 | printf(".");
121 | }
122 | }
123 | else
124 | {
125 | printf(" ");
126 | }
127 | }
128 | printf("\n");
129 | }
130 | printf("\n");
131 | }
132 |
--------------------------------------------------------------------------------
/src/MNIST.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "Model.hpp"
4 | #include <fstream>
5 |
6 | class MNIST : public Node
7 | {
8 | public:
9 | constexpr static size_t DIM = 28 * 28;
10 |
11 | MNIST(Model& model, std::ifstream& images, std::ifstream& labels);
12 |
13 | void init(rne_t&) override
14 | {}
15 |
16 | // As this is an input node, the argument to this function is ignored
17 | void forward(num_t* data = nullptr) override;
18 | // Backpropagation is a no-op for input nodes as there are no parameters to
19 | // update
20 | void reverse(num_t* data = nullptr) override
21 | {}
22 |
23 | // Parse the next image and label into memory
24 | void read_next();
25 |
26 | void print() const override;
27 |
28 | [[nodiscard]] size_t size() const noexcept
29 | {
30 | return image_count_;
31 | }
32 |
33 | [[nodiscard]] num_t const* data() const noexcept
34 | {
35 | return data_;
36 | }
37 |
38 | [[nodiscard]] num_t* data() noexcept
39 | {
40 | return data_;
41 | }
42 |
43 | [[nodiscard]] num_t* label() noexcept
44 | {
45 | return label_;
46 | }
47 |
48 | [[nodiscard]] num_t const* label() const noexcept
49 | {
50 | return label_;
51 | }
52 |
53 | // Quick ASCII visualization of the last read image. For best results,
54 | // ensure that your terminal font is a monospace font.
55 | void print_last();
56 |
57 | private:
58 | std::ifstream& images_;
59 | std::ifstream& labels_;
60 | uint32_t image_count_;
61 | // Data from the images file is read as one-byte unsigned values which are
62 | // converted to num_t after
63 | char buf_[DIM];
64 | // All images are resized (with antialiasing) to a 28 x 28 row-major raster
65 | num_t data_[DIM];
66 | // One-hot encoded label
67 | num_t label_[10];
68 | };
69 |
--------------------------------------------------------------------------------
/src/Model.cpp:
--------------------------------------------------------------------------------
1 | #include "Model.hpp"
2 |
3 | Node::Node(Model& model, std::string name)
4 | : model_(model)
5 | , name_{std::move(name)}
6 | {}
7 |
8 | Model::Model(std::string name)
9 | : name_{std::move(name)}
10 | {}
11 |
12 | void Model::create_edge(Node& dst, Node& src)
13 | {
14 | // NOTE: No validation is done to ensure the edge doesn't already exist
15 | dst.antecedents_.push_back(&src);
16 | src.subsequents_.push_back(&dst);
17 | }
18 |
19 | rne_t::result_type Model::init(rne_t::result_type seed)
20 | {
21 | if (seed == 0)
22 | {
23 | // Generate a new random seed from the host random device
24 | std::random_device rd{};
25 | seed = rd();
26 | }
27 | std::printf("Initializing model parameters with seed: %u\n", seed);
28 |
29 | rne_t rne{seed};
30 |
31 | for (auto& node : nodes_)
32 | {
33 | node->init(rne);
34 | }
35 |
36 | return seed;
37 | }
38 |
39 | void Model::train(Optimizer& optimizer)
40 | {
41 | for (auto&& node : nodes_)
42 | {
43 | optimizer.train(*node);
44 | }
45 | }
46 |
47 | void Model::print() const
48 | {
49 | // Invoke "print" on each node in the order added
50 | for (auto&& node : nodes_)
51 | {
52 | node->print();
53 | }
54 | }
55 |
56 | void Model::save(std::ofstream& out)
57 | {
58 | // To save the model to disk, we employ a very simple scheme. All nodes are
59 | // looped through in the order they were added to the model. Then, all
60 | // advertised learnable parameters are serialized in host byte-order to the
61 | // supplied output stream.
62 | //
63 | // F.T.R. This simplistic method of saving the model to disk isn't very
64 | // robust or practical in the real world. For one thing, it contains no
65 | // reflection data about the topology of the model. Loading the data relies
66 | // on the model being constructed in the same manner it was trained on.
67 | // Furthermore, the data will be parsed incorrectly if the program is
68 | // recompiled to operate with a different precision. Adopting a more
69 | // sensible serialization scheme is left as an exercise.
70 | for (auto& node : nodes_)
71 | {
72 | size_t param_count = node->param_count();
73 | for (size_t i = 0; i != param_count; ++i)
74 | {
75 | out.write(
76 | reinterpret_cast<char const*>(node->param(i)), sizeof(num_t));
77 | }
78 | }
79 | }
80 |
81 | void Model::load(std::ifstream& in)
82 | {
83 | for (auto& node : nodes_)
84 | {
85 | size_t param_count = node->param_count();
86 | for (size_t i = 0; i != param_count; ++i)
87 | {
88 | in.read(reinterpret_cast<char*>(node->param(i)), sizeof(num_t));
89 | }
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/src/Model.hpp:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <cstdint>
4 | #include <fstream>
5 | #include <memory>
6 | #include <random>
7 | #include <string>
8 | #include <vector>
9 |
10 | // Default precision: single
11 | using num_t = float;
12 | // Default random number engine: 32-bit Mersenne Twister by Matsumoto and
13 | // Nishimura, 1998. For generating random numbers with double precision, the
14 | // 64-bit Mersenne Twister should be used.
15 | using rne_t = std::mt19937;
16 |
17 | enum class Activation
18 | {
19 | ReLU,
20 | Softmax
21 | };
22 |
23 | class Model;
24 |
25 | // Base class of computational nodes in a model
26 | class Node
27 | {
28 | public:
29 | Node(Model& model, std::string name);
30 | virtual ~Node(){};
31 |
32 | // Initialize the parameters of the node with a provided random number
33 | // engine.
34 | virtual void init(rne_t& rne) = 0;
35 |
36 | // Data is fed forward through the network using a simple generic interface.
37 | // We do this to avoid requiring an involved N-dimensional matrix
38 | // abstraction. Here, the "shape" of the data is dependent on the Node's
39 | // implementation and the way a given Node is initialized.
40 | //
41 | // In practice, this should be replaced with an actual type with a shape
42 | // defined by data to permit additional validation. It is also common for
43 | // the data object passed here to not contain the data directly (the data
44 | // may be located on a GPU for example)
45 | virtual void forward(num_t* inputs) = 0;
46 |
47 | // Expected inputs during the reverse accumulation phase are the loss
48 | // gradients with respect to each output
49 | //
50 | // The node is expected to compute the loss gradient with respect to each
51 | // parameter and update the parameter according to the model's optimizer,
52 | // after which, the gradients with respect to the node inputs are propagated
53 | // backwards again.
54 | virtual void reverse(num_t* gradients) = 0;
55 |
56 | // Returns the number of learnable parameters in this node. Nodes that are
57 | // input or loss nodes have no learnable parameters.
58 | virtual size_t param_count() const noexcept
59 | {
60 | return 0;
61 | }
62 |
63 | // Indexing operator for learnable parameters that are mutated during
64 | // training. Nodes without learnable parameters should keep this
65 | // unimplemented.
66 | virtual num_t* param(size_t index)
67 | {
68 | return nullptr;
69 | }
70 |
71 | // Indexing operator for the loss gradient with respect to a learnable
72 | // parameter. Used by an optimizer to adjust the corresponding parameter and
73 | // potentially for tracking gradient histories (done in more sophisticated
74 | // optimizers, e.g. AdaGrad)
75 | virtual num_t* gradient(size_t index)
76 | {
77 | return nullptr;
78 | }
79 |
80 | [[nodiscard]] std::string const& name() const noexcept
81 | {
82 | return name_;
83 | }
84 |
85 | // Generic function that displays the contents of the node in some fashion
86 | virtual void print() const = 0;
87 |
88 | protected:
89 | friend class Model;
90 |
91 | Model& model_;
92 | std::string name_;
93 | std::vector<Node*> antecedents_;
94 | std::vector<Node*> subsequents_;
95 | };
96 |
97 | // Base class of optimizer used to train a model
98 | class Optimizer
99 | {
100 | public:
101 | virtual void train(Node& node) = 0;
102 | };
103 |
104 | class Model
105 | {
106 | public:
107 | Model(std::string name);
108 |
109 | template <typename Node_t, typename... T>
110 | Node_t& add_node(T&&... args)
111 | {
112 | nodes_.emplace_back(
113 | std::make_unique<Node_t>(*this, std::forward<T>(args)...));
114 | return reinterpret_cast<Node_t&>(*nodes_.back());
115 | }
116 |
117 | void create_edge(Node& dst, Node& src);
118 |
119 | // Initialize the parameters of all nodes with the provided seed. If the
120 | // seed is 0, a new random seed is chosen instead. Returns the seed used.
121 | rne_t::result_type init(rne_t::result_type seed = 0);
122 |
123 | void train(Optimizer& optimizer);
124 |
125 | [[nodiscard]] std::string const& name() const noexcept
126 | {
127 | return name_;
128 | }
129 |
130 | void print() const;
131 |
132 | void save(std::ofstream& out);
133 | void load(std::ifstream& in);
134 |
135 | private:
136 | friend class Node;
137 |
138 | std::string name_;
139 | std::vector<std::unique_ptr<Node>> nodes_;
140 | };
141 |
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
1 | #include "CCELossNode.hpp"
2 | #include "FFNode.hpp"
3 | #include "GDOptimizer.hpp"
4 | #include "MNIST.hpp"
5 | #include "Model.hpp"
6 | #include <cstdio>
7 | #include <cstring>
8 | #include <filesystem>
9 | #include <fstream>
10 |
11 | static constexpr size_t batch_size = 80;
12 |
13 | Model create_model(std::ifstream& images,
14 | std::ifstream& labels,
15 | MNIST** mnist,
16 | CCELossNode** loss)
17 | {
18 | // Here we create a simple fully-connected feedforward neural network
19 | Model model{"ff"};
20 |
21 | *mnist = &model.add_node<MNIST>(images, labels);
22 |
23 | FFNode& hidden = model.add_node<FFNode>("hidden", Activation::ReLU, 32, 784);
24 |
25 | FFNode& output
26 | = model.add_node<FFNode>("output", Activation::Softmax, 10, 32);
27 |
28 | *loss = &model.add_node<CCELossNode>("loss", 10, batch_size);
29 | (*loss)->set_target((*mnist)->label());
30 |
31 | // F.T.R. The structure of our computational graph is completely sequential.
32 | // In fact, the fully connected node and loss node we've implemented here do
33 | // not support multiple inputs. Consider adding nodes that support "skip"
34 | // connections that forward outputs from earlier nodes to downstream nodes
35 | // that aren't directly adjacent (such skip nodes are used in the ResNet
36 | // architecture)
37 | model.create_edge(hidden, **mnist);
38 | model.create_edge(output, hidden);
39 | model.create_edge(**loss, output);
40 | return model;
41 | }
42 |
43 | void train(char* argv[])
44 | {
45 | // Uncomment to debug floating point instability in the network
46 | // feenableexcept(FE_INVALID | FE_OVERFLOW);
47 |
48 | std::printf("Executing training routine\n");
49 |
50 | std::ifstream images{
51 | std::filesystem::path{argv[0]} / "train-images-idx3-ubyte",
52 | std::ios::binary};
53 |
54 | std::ifstream labels{
55 | std::filesystem::path{argv[0]} / "train-labels-idx1-ubyte",
56 | std::ios::binary};
57 |
58 | MNIST* mnist;
59 | CCELossNode* loss;
60 | Model model = create_model(images, labels, &mnist, &loss);
61 |
62 | model.init();
63 |
64 | // The gradient descent optimizer is stateless, but other optimizers may not
65 | // be. Some optimizers need to track "momentum" or gradient histories.
66 | // Others may slow the learning rate for each parameter at different rates
67 | // depending on various factors.
68 | //
69 | // F.T.R. Implement an alternative SGDOptimizer that decays the learning
70 | // rate over time and compare the results against this optimizer that learns
71 | // at a fixed rate.
72 | GDOptimizer optimizer{num_t{0.3}};
73 |
74 | // F.T.R. Here, we've hardcoded the number of batches to train on. In
75 | // practice, training should halt when the average loss begins to
76 | // vacillate, indicating that the model is starting to overfit the data.
77 | // Implement some form of loss-improvement measure to determine when this
78 | // inflection point occurs and stop accordingly.
79 | size_t i = 0;
80 | for (; i != 256; ++i)
81 | {
82 | loss->reset_score();
83 |
84 | for (size_t j = 0; j != batch_size; ++j)
85 | {
86 | mnist->forward();
87 | loss->reverse();
88 | }
89 |
90 | model.train(optimizer);
91 | }
92 |
93 | std::printf("Ran %zu batches (%zu samples each)\n", i, batch_size);
94 |
95 | // Print the average loss computed in the final batch
96 | loss->print();
97 |
98 | std::ofstream out{
99 | std::filesystem::current_path() / (model.name() + ".params"),
100 | std::ios::binary};
101 | model.save(out);
102 | }
103 |
104 | void evaluate(char* argv[])
105 | {
106 | std::printf("Executing evaluation routine\n");
107 |
108 | std::ifstream images{
109 | std::filesystem::path{argv[0]} / "t10k-images-idx3-ubyte",
110 | std::ios::binary};
111 |
112 | std::ifstream labels{
113 | std::filesystem::path{argv[0]} / "t10k-labels-idx1-ubyte",
114 | std::ios::binary};
115 |
116 | MNIST* mnist;
117 | CCELossNode* loss;
118 | // For the data to be loaded properly, the model must be constructed in the
119 | // same manner as it was constructed during training.
120 | Model model = create_model(images, labels, &mnist, &loss);
121 |
122 | // Instead of initializing the parameters randomly, here we load it from
123 | // disk (saved from a previous training run).
124 | std::ifstream params_file{std::filesystem::path{argv[1]}, std::ios::binary};
125 | model.load(params_file);
126 |
127 | // Evaluate all 10000 images in the test set and compute the loss average
128 | for (size_t i = 0; i != mnist->size(); ++i)
129 | {
130 | mnist->forward();
131 | }
132 | loss->print();
133 | }
134 |
135 | int main(int argc, char* argv[])
136 | {
137 | if (argc < 2)
138 | {
139 | std::printf("Supported commands include:\ntrain\nevaluate\n");
140 | return 1;
141 | }
142 |
143 | if (strcmp(argv[1], "train") == 0)
144 | {
145 | train(argv + 2);
146 | }
147 | else if (strcmp(argv[1], "evaluate") == 0)
148 | {
149 | evaluate(argv + 2);
150 | }
151 | else
152 | {
153 | std::printf("Argument %s is an unrecognized directive.\n", argv[1]);
154 | }
155 |
156 | return 0;
157 | }
158 |
--------------------------------------------------------------------------------