├── .github └── workflows │ └── cmake-multi-platform.yml ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── README.md ├── examples ├── CMakeLists.txt ├── binary-classifier.cpp ├── c-plus-equals-cycle.dot ├── c-plus-equals-cycle.svg ├── c-plus-equals-rewrite.svg ├── example-usage-cycle.cpp ├── example-usage.cpp ├── graph.cpp ├── graph.svg ├── loss.svg ├── mlp1.cpp ├── mlp1.svg ├── neuron.cpp ├── neuron.svg └── regression0.cpp ├── include ├── array.h ├── backprop.h ├── graph.h ├── loss.h ├── mac.h ├── nn.h ├── randomdata.h ├── tuple.h └── value.h └── tests ├── CMakeLists.txt ├── mac-test.cpp ├── multivariate-test.cpp └── value-test.cpp /.github/workflows/cmake-multi-platform.yml: -------------------------------------------------------------------------------- 1 | # This starter workflow is for a CMake project running on multiple platforms. There is a different starter workflow if you just want a single platform. 2 | # See: https://github.com/actions/starter-workflows/blob/main/ci/cmake-single-platform.yml 3 | name: CMake on multiple platforms 4 | 5 | on: 6 | push: 7 | branches: [ "main" ] 8 | pull_request: 9 | branches: [ "main" ] 10 | 11 | jobs: 12 | build: 13 | runs-on: ${{ matrix.os }} 14 | 15 | strategy: 16 | # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations. Consider changing this to true when your workflow is stable. 17 | fail-fast: false 18 | 19 | # Set up a matrix to run the following 3 configurations: 20 | # 1. 21 | # 2. 22 | # 3. 23 | # 24 | # To add more build types (Release, Debug, RelWithDebInfo, etc.) customize the build_type list. 25 | matrix: 26 | os: [ubuntu-latest, windows-latest] 27 | build_type: [Release] 28 | c_compiler: [gcc, cl] 29 | include: 30 | - os: windows-latest 31 | c_compiler: cl 32 | cpp_compiler: cl 33 | - os: ubuntu-latest 34 | c_compiler: gcc 35 | cpp_compiler: g++ 36 | exclude: 37 | - os: windows-latest 38 | c_compiler: gcc 39 | - os: ubuntu-latest 40 | c_compiler: cl 41 | 42 | steps: 43 | - uses: actions/checkout@v3 44 | 45 | - name: Set reusable strings 46 | # Turn repeated input strings (such as the build output directory) into step outputs. These step outputs can be used throughout the workflow file. 47 | id: strings 48 | shell: bash 49 | run: | 50 | echo "build-output-dir=${{ github.workspace }}/build" >> "$GITHUB_OUTPUT" 51 | 52 | - name: Configure CMake 53 | # Configure CMake in a 'build' subdirectory. `CMAKE_BUILD_TYPE` is only required if you are using a single-configuration generator such as make. 54 | # See https://cmake.org/cmake/help/latest/variable/CMAKE_BUILD_TYPE.html?highlight=cmake_build_type 55 | run: > 56 | cmake -B ${{ steps.strings.outputs.build-output-dir }} 57 | -DCMAKE_CXX_COMPILER=${{ matrix.cpp_compiler }} 58 | -DCMAKE_C_COMPILER=${{ matrix.c_compiler }} 59 | -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} 60 | -S ${{ github.workspace }} 61 | 62 | - name: Build 63 | # Build your program with the given configuration. Note that --config is needed because the default Windows generator is a multi-config generator (Visual Studio generator). 64 | run: cmake --build ${{ steps.strings.outputs.build-output-dir }} --config ${{ matrix.build_type }} 65 | 66 | - name: Test 67 | working-directory: ${{ steps.strings.outputs.build-output-dir }} 68 | # Execute tests defined by the CMake configuration. Note that --build-config is needed because the default Windows generator is a multi-config generator (Visual Studio generator). 
69 | # See https://cmake.org/cmake/help/latest/manual/ctest.1.html for more detail 70 | run: ctest --build-config ${{ matrix.build_type }} 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | *.swp 3 | 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | enable_testing() 4 | 5 | project(ai-play) 6 | 7 | set(CMAKE_CXX_STANDARD 23) 8 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 9 | set(CMAKE_CXX_EXTENSIONS OFF) 10 | 11 | # Global compiler flags 12 | if(CMAKE_CXX_COMPILER_ID MATCHES "GNU") 13 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -Wextra -Wpedantic -g") 14 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall -Wextra -O3 -march=native -mtune=native -mavx2 -ffast-math") 15 | endif() 16 | 17 | if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") 18 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -Wextra -Wpedantic -g") 19 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall -Wextra -O3 -march=native -mtune=native -mavx2 -ffast-math") 20 | endif() 21 | 22 | 23 | add_subdirectory(tests) 24 | 25 | add_subdirectory(examples) 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Conrad Parker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![CMake on multiple platforms](https://github.com/kfish/micrograd-cpp-2023/actions/workflows/cmake-multi-platform.yml/badge.svg)](https://github.com/kfish/micrograd-cpp-2023/actions/workflows/cmake-multi-platform.yml) 2 | 3 | # micrograd-cpp-2023 4 | 5 | A C++ implementation of 6 | [karpathy/micrograd](https://github.com/karpathy/micrograd). 7 | Each step of the first episode of *Neural Nets: Zero to Hero*: 8 | [The spelled-out intro to neural networks and backpropagation: building micrograd](https://youtu.be/VMj-3S1tku0) 9 | is included. 
10 | 11 | ![](https://i.ytimg.com/vi/VMj-3S1tku0/hqdefault.jpg) 12 | 13 | This roughly follows the flow of Karpathy's YouTube tutorial, with details specific to this C++ implementation: 14 | 15 | * [What is micrograd-cpp and why is it interesting?](#what-is-micrograd-cpp-and-why-is-it-interesting) 16 | - [Example usage](#example-usage) 17 | - [C++ implementation notes](#c-implementation-notes) 18 | * [Building out the Value object](#building-out-the-value-object) 19 | * [Visualizing the expression graph](#visualizing-the-expression-graph) 20 | * [Backpropagation](#backpropagation) 21 | * [Backpropagation through a neuron](#backpropagation-through-a-neuron) 22 | - [Activation function](#activation-function) 23 | - [Math operations](#math-operations) 24 | - [Multiply-Accumulate](#multiply-accumulate) 25 | - [randomValue, randomArray](#randomvalue-randomarray) 26 | * [Multi-Layer Perceptron](#multi-layer-perceptron) 27 | - [Layer](#layer) 28 | - [BuildLayers](#buildlayers) 29 | - [MLP](#mlp) 30 | - [MLP1](#mlp1) 31 | * [Loss function](#loss-function) 32 | - [MSELoss](#mseloss) 33 | * [Gradient descent](#gradient-descent) 34 | - [Adjusting parameters](#adjusting-parameters) 35 | - [CanBackProp](#canbackprop) 36 | - [BackProp](#backprop) 37 | - [Binary Classifier](#binary-classifier) 38 | 39 | with [References](#references) at the end for further reading about automatic differentiation and C++ implementations. 40 | 41 | See [kfish/makemore-cpp-2023](https://github.com/kfish/makemore-cpp-2023) for a continuation to other 42 | videos in the series, expanding the codebase to handle automatic differentiation of vectors and matrices. 43 | 44 | ## What is micrograd-cpp and why is it interesting? 45 | 46 | micrograd-cpp introduces some nuances of backpropagation (reverse-mode autodiff) and its 47 | implementation. At its core is an expression graph which can be evaluated forwards, where 48 | an expression like `a+b` is implemented using `operator+`, and differentiated in reverse 49 | using a `std::function` attached to each graph node to calculate its gradient. 50 | 51 | This implementation allows computations using `Value` objects to be written as 52 | normal-looking C++ code. It also includes generic classes for evaluation and learning for anything that can produce `Value`. 53 | 54 | Like micrograd, the point is still to be educational, with a focus on some implementation details 55 | and flexibility for exploring learning algorithms. 56 | 57 | ### Building 58 | 59 | For simplicity, there are no code dependencies. Data is manipulated using `std::array<>` and graphing and plotting is done 60 | with external tools like `dot` and `gnuplot`. 61 | 62 | Build with CMake, eg 63 | 64 | ```bash 65 | $ mkdir build 66 | $ cd build 67 | $ cmake .. 68 | $ make 69 | ``` 70 | 71 | ### Example usage 72 | 73 | This is [examples/example-usage.cpp](examples/example-usage.cpp): 74 | 75 | ```c++ 76 | #include 77 | 78 | #include "value.h" 79 | 80 | using namespace ai; 81 | 82 | int main(int argc, char *argv[]) 83 | { 84 | auto a = make_value(-4.0); 85 | auto b = make_value(2.0); 86 | 87 | auto c = a + b; 88 | auto d = a * b + pow(b, 3); 89 | 90 | c += c + 1; 91 | c += 1 + c + (-a); 92 | d += d * 2 + relu(b + a); 93 | d += 3 * d + relu(b - a); 94 | auto e = c - d; 95 | auto f = pow(e, 2); 96 | auto g = f / 2.0; 97 | g += 10.0 / f; 98 | printf("%.4f\n", g->data()); // prints 24.7041, the outcome of this forward pass 99 | backward(g); 100 | printf("%.4f\n", a->grad()); // prints 138.8338, i.e. 
the numerical value of dg/da 101 | printf("%.4f\n", b->grad()); // prints 645.5773, i.e. the numerical value of dg/db 102 | } 103 | ``` 104 | 105 | ### C++ implementation notes 106 | 107 | 1. Data type 108 | 109 | Neural nets generally don't require many bits of precision on individual node values, 110 | so let's not limit ourselves to `float` or `double` 111 | (or [FP8](https://github.com/opencomputeproject/FP8/blob/main/ofp8_references.pdf)). 112 | We template using `Value`. 113 | 114 | 2. Sharing 115 | 116 | Nodes may appear as inputs to multiple other nodes in the expression graph, 117 | especially for neural networks, so we use a `shared_ptr`: 118 | 119 | ```c++ 120 | using Value = std::shared_ptr>; 121 | ``` 122 | 123 | 3. Removal of cycles 124 | 125 | The expression `c += c + 1` refers to itself, so it contains a cycle. This cycle needs to be 126 | removed in order to implement backpropagation. 127 | 128 | ![c += c + 1](examples/c-plus-equals-cycle.svg) 129 | 130 | In Python, `x += y` usually translates to `x.__iadd__(y)` which modifies `x` in-place. 131 | However, the `Value` objects in `micrograd` don't implement `__iadd__`, so Python falls back to using `__add__` 132 | followed by assignment. That means `a += b` is roughly equivalent to `a = a + b`. Each time the + operator 133 | is invoked, a new Value object is created and the graph gets extended, so it is not modifying the existing 134 | objects in-place. 135 | 136 | In C++, `operator+=` requires an explicit implementation which modifies its value in-place. 137 | We create a copy of the old value and re-write all earlier references in the expression graph 138 | to point to the copy. 139 | 140 | ![c += c + 1](examples/c-plus-equals-rewrite.svg) 141 | 142 | Note that this aspect of the implementation is peculiar to the operational semantics of C++ 143 | and in-place assignment operators. It is straightforward to implement a neural network 144 | without calling these operators, so the overhead of node copying and graph rewriting could 145 | easily be removed. We include it here only for the translation of micrograd to C++. 146 | 147 | ## Building out the Value object 148 | 149 | > Neural nets are some pretty scary expressions. We need some data structures to maintain 150 | > these expressions. 151 | 152 | In order to handle basic expressions like: 153 | 154 | ```c++ 155 | auto a = make_value(2.0, "a"); 156 | auto b = make_value(-3.0, "b"); 157 | auto c = make_value(10.0, "c"); 158 | 159 | auto d = (a*b) + c; 160 | std::cout << d << std::endl; 161 | ``` 162 | 163 | we start sketching out the underlying `RawValue` class, implementing operators for `+` 164 | and `*`, and storing the inputs (children) of each for the evaluation graph. 165 | 166 | ```c++ 167 | template 168 | class RawValue { 169 | public: 170 | using ptr = std::shared_ptr>; 171 | 172 | private: 173 | RawValue(const T& data, const std::string& label="") 174 | : data_(data), label_(label) 175 | {} 176 | 177 | RawValue(const T& data, std::set& children, const std::string& op="") 178 | : data_(data), prev_(children), op_(op) 179 | {} 180 | 181 | public: 182 | template 183 | static ptr make(Args&&... 
args) { 184 | return ptr(new RawValue(std::forward(args)...)); 185 | } 186 | 187 | friend ptr operator+(const ptr& a, const ptr& b) { 188 | std::set children = {a, b}; 189 | return make(a->data() + b->data(), children, "+"); 190 | } 191 | 192 | friend ptr operator*(const ptr& a, const ptr& b) { 193 | std::set children = {a, b}; 194 | return make(a->data() * b->data(), children, "*"); 195 | } 196 | 197 | private: 198 | T data_; 199 | std::set prev_{}; 200 | std::string op_{""}; 201 | }; 202 | 203 | template 204 | static inline std::ostream& operator<<(std::ostream& os, const RawValue& value) { 205 | return os << "Value(" 206 | << "data=" << value.data() << ", " 207 | << "op=" << value.op() 208 | << ")"; 209 | } 210 | ``` 211 | 212 | In code we use `Value`, which is an alias for `shared_ptr>`: 213 | 214 | ```c++ 215 | template 216 | using Value = typename RawValue::ptr; 217 | 218 | template 219 | static Value make_value(const T& data, Args&&... args) { 220 | return RawValue::make(data, std::forward(args)...); 221 | } 222 | 223 | template 224 | static inline std::ostream& operator<<(std::ostream& os, const std::shared_ptr>& value) { 225 | return os << value.get() << "=&" << *value; 226 | } 227 | ``` 228 | 229 | ## Visualizing the expression graph 230 | 231 | We provide a `Graph` class that can wrap any `Value`. It has a custom `operator<<` that writes in `dot` 232 | language. The implementation is in [include/graph.h](include/graph.h). We also introduce a `label` to the `Value` 233 | object for labelling graph nodes, and an `expr` factory function for creating labelled expressions. 234 | 235 | We can pipe the output of a program to `dot -Tsvg` to produce an svg image, or to `xdot` to view it interactively: 236 | 237 | ```bash 238 | $ build/examples/graph | dot -Tsvg -o graph.svg 239 | $ build/examples/graph | xdot - 240 | ``` 241 | 242 | ![Example graph](examples/graph.svg) 243 | 244 | ## Backpropagation 245 | 246 | We add a member variable `grad_` that maintains the gradient with respect to the final output. 247 | 248 | How each operation affects the output is written as a lambda function, `backward_`. 249 | It copies the `Value` `shared_ptr`s of each node's children in order to increment their reference counts. 
250 | 251 | ```c++ 252 | friend ptr operator+(const ptr& a, const ptr& b) { 253 | auto out = make(a->data() + b->data(), children, "+"); 254 | 255 | out->backward_ = [=]() { 256 | a->grad_ += out->grad_; 257 | b->grad_ += out->grad_; 258 | }; 259 | 260 | return out; 261 | } 262 | 263 | friend ptr operator*(const ptr& a, const ptr& b) { 264 | std::set children = {a, b}; 265 | auto out = make(a->data() * b->data(), children, "*"); 266 | 267 | out->backward_ = [=]() { 268 | a->grad_ += b->data() * out->grad(); 269 | b->grad_ += a->data() * out->grad(); 270 | }; 271 | 272 | return out; 273 | } 274 | ``` 275 | 276 | We recursively apply the local derivatives using the chain rule backwards through the expression graph: 277 | 278 | ```c++ 279 | friend void backward(const ptr& node) { 280 | std::vector*> topo; 281 | std::set*> visited; 282 | 283 | std::function build_topo = [&](const ptr& v) { 284 | if (!visited.contains(v.get())) { 285 | visited.insert(v.get()); 286 | for (auto && c : v->children()) { 287 | build_topo(c); 288 | } 289 | topo.push_back(v.get()); 290 | } 291 | }; 292 | 293 | build_topo(node); 294 | 295 | for (auto & v : topo) { 296 | v->grad_ = 0.0; 297 | } 298 | 299 | node->grad_ = 1.0; 300 | 301 | for (auto it = topo.rbegin(); it != topo.rend(); ++it) { 302 | const RawValue* v = *it; 303 | auto f = v->backward_; 304 | if (f) f(); 305 | } 306 | } 307 | ``` 308 | 309 | ## Backpropagation through a neuron 310 | 311 | We begin the implementation of a neuron, in [include/nn.h](include/nn.h): 312 | 313 | ```c++ 314 | template 315 | class Neuron { 316 | public: 317 | Neuron() 318 | : weights_(randomArray()), bias_(randomValue()) 319 | { 320 | } 321 | 322 | Value operator()(const std::array, Nin>& x) const { 323 | Value y = mac(weights_, x, bias_); 324 | return expr(tanh(y), "n"); 325 | } 326 | 327 | ... 328 | }; 329 | ``` 330 | 331 | The resulting expression graph for a neuron with four inputs (code in [examples/neuron.cpp](examples/neuron.cpp)): 332 | 333 | ![Neuron graph](examples/neuron.svg) 334 | 335 | ### Activation function 336 | 337 | In general an activation function modifies the output of a neuron, perhaps so that all neurons have similar ranges of output value or to smooth or filter large and negative values. 338 | Whichever activation function we use, we need to implement a `backward_` function. 339 | This implementation includes `relu` (which just replaces any negative values with zero) and `tanh`, which squashes the output into the range ±1.0. `tanh` is used in the video and has an obvious and continuous effect on the gradient: 340 | 341 | ```c++ 342 | friend ptr tanh(const ptr& a) { 343 | std::set children = {a}; 344 | double x = a->data(); 345 | double e2x = exp(2.0*x); 346 | double t = (e2x-1)/(e2x+1); 347 | auto out = make(t, children, "tanh"); 348 | 349 | out->backward_ = [=]() { 350 | a->grad_ += (1.0 - t*t) * out->grad_; 351 | }; 352 | 353 | return out; 354 | } 355 | ``` 356 | 357 | ### Math operations 358 | 359 | We must implement all required math operations on `Value`, including pow, exp, and division, 360 | so that we can accumulate gradients and run backpropagation. 
361 | 362 | For convenience we also provide operator specializations where one operand is an arithmetic value, so that instead of 363 | writing `a * make_value(7.0)` you can write `a * 7.0` or `7.0 * a`: 364 | 365 | ```c++ 366 | template::value, int> = 0> 367 | friend ptr operator*(const ptr& a, N n) { return a * make(n); } 368 | 369 | template::value, int> = 0> 370 | friend ptr operator*(N n, const ptr& a) { return make(n) * a; } 371 | ``` 372 | 373 | ### Multiply-Accumulate 374 | 375 | A neuron takes a number of input values, applies a weight to each, and sums the result. We can abstract this out as a common multiply-accumulate function. 376 | It is usual to use a hardware-optimized, eg. GPU, implementation. 377 | In order to use our explicit `Value` object, we provide a generic implementation is in [include/mac.h](include/mac.h). 378 | This uses `std::execution` to allow the compiler to choose an optimized execution method, allowing parallel and vectorized execution: 379 | 380 | ```c++ 381 | template 382 | T mac(const std::array& a, const std::array& b, T init = T{}) { 383 | return std::transform_reduce( 384 | std::execution::par_unseq, // Use parallel and vectorized execution 385 | a.begin(), a.end(), // Range of first vector 386 | b.begin(), // Range of second vector 387 | init, //static_cast(0), // Initial value 388 | std::plus<>(), // Accumulate 389 | std::multiplies<>() // Multiply 390 | ); 391 | } 392 | ``` 393 | 394 | ### randomValue, randomArray 395 | 396 | We provide helper functions to create random values statically, in deterministic order. This helps with reproducibility for debugging. 397 | 398 | The implementation is in [include/random.h](include/random.h). 399 | 400 | ```c++ 401 | // Static inline function to generate a random T 402 | template 403 | static inline Value randomValue() { 404 | static unsigned int seed = 42; 405 | static thread_local std::mt19937 gen(seed++); 406 | std::uniform_real_distribution dist(-1.0, 1.0); 407 | seed = gen(); // update seed for next time 408 | return make_value(dist(gen)); 409 | } 410 | 411 | // Static inline function to generate a random std::array 412 | template 413 | static inline std::array, N> randomArray() { 414 | std::array, N> arr; 415 | for (auto& element : arr) { 416 | element = randomValue(); 417 | } 418 | return arr; 419 | } 420 | ``` 421 | 422 | ## Multi-Layer Perceptron 423 | 424 | We arrange neurons in a series of layers. Each layer is just an array of neurons. 425 | 426 | A layer `Layer` consists of `Nout` neurons, and is callable: 427 | * The same input (array of `Nin` values) is passed to each of the neurons 428 | * Each neuron produces a single output value 429 | * These output values are collected into an output array of `Nout` values. 430 | 431 | ### Layer 432 | 433 | ```c++ 434 | template 435 | class Layer { 436 | public: 437 | 438 | std::array, Nout> operator()(const std::array, Nin>& x) { 439 | std::array, Nout> output{}; 440 | std::transform(std::execution::par_unseq, neurons_.begin(), neurons_.end(), 441 | output.begin(), [&](const auto& n) { return n(x); }); 442 | return output; 443 | } 444 | 445 | private: 446 | std::array, Nout> neurons_{}; 447 | }; 448 | ``` 449 | 450 | ### BuildLayers 451 | 452 | We introduce a helper type that allows us to specify a sequence of layers of different sizes. 
453 | 454 | ```c++ 455 | template 456 | struct BuildLayers; 457 | 458 | template 459 | struct BuildLayers { 460 | using type = decltype(std::tuple_cat( 461 | std::tuple>{}, 462 | typename BuildLayers::type{} 463 | )); 464 | static constexpr size_t nout = BuildLayers::nout; 465 | }; 466 | 467 | template 468 | struct BuildLayers { 469 | using type = std::tuple>; 470 | static constexpr size_t nout = Last; 471 | }; 472 | ``` 473 | 474 | We make an alias for the type of such a sequence, like `Layers<3, 4, 4, 1>`: 475 | 476 | ```c++ 477 | template 478 | using Layers = typename BuildLayers::type; 479 | ``` 480 | 481 | and a helper to extract the final number of outputs, eg. `LayersNout<3, 4, 4, 1>` is 1: 482 | 483 | ```c++ 484 | template 485 | static constexpr size_t LayersNout = BuildLayers::nout; 486 | ``` 487 | 488 | ### MLP 489 | 490 | Finaly we use `Layers<>` in a class `MLP<>`, which: 491 | * Forwards its input to the first layer 492 | * Passes the output of each layer to the next layer, in turn 493 | * Returns the output of the last layer 494 | 495 | ```c++ 496 | template 497 | class MLP { 498 | public: 499 | static constexpr size_t Nout = LayersNout; 500 | 501 | std::array, Nout> operator()(const std::array, Nin>& input) { 502 | return forward<0, Nin, Nouts...>(input); 503 | } 504 | 505 | std::array, Nout> operator()(const std::array& input) { 506 | return this->operator()(value_array(input)); 507 | } 508 | 509 | private: 510 | template 511 | auto forward(const std::array, NinCurr>& input) -> decltype(auto) { 512 | auto & p = std::get(layers_); 513 | auto output = std::get(layers_)(input); 514 | if constexpr (sizeof...(NoutsRest) > 0) { 515 | return forward(output); 516 | } else { 517 | return output; 518 | } 519 | } 520 | 521 | private: 522 | Layers layers_; 523 | 524 | }; 525 | ``` 526 | 527 | ### MLP1 528 | 529 | If we want a single-valued output from our neural network, we create a wrapper class `MLP1<>` that returns only the first element 530 | of the output of the wrapped `MLP<>`: 531 | 532 | ```c++ 533 | template 534 | class MLP1 : public MLP 535 | { 536 | public: 537 | MLP1() 538 | : MLP() 539 | {} 540 | 541 | Value operator()(const std::array, Nin>& input) { 542 | return MLP::operator()(input)[0]; 543 | } 544 | 545 | Value operator()(const std::array& input) { 546 | return MLP::operator()(input)[0]; 547 | } 548 | }; 549 | ``` 550 | 551 | With the following code from [examples/mlp1.cpp](examples/mlp1.cpp) we can make a 3 layer neural net with 1 output, 552 | run backpropagation over it and show the resulting expression graph: 553 | 554 | ```c++ 555 | MLP1 n; 556 | 557 | std::array input = {{ 2.0, 3.0, -1.0 }}; 558 | auto output = n(input); 559 | 560 | backward(output); 561 | 562 | std::cout << Graph(output) << std::endl; 563 | ``` 564 | 565 | ![MLP1 graph](examples/mlp1.svg) 566 | 567 | ## Loss function 568 | 569 | Now that we can make a neural net, run it forwards to produce a value and backwards to calculate gradients, we can begin adjusting it to learn. 570 | 571 | We introduce generic evaluation and learning classes for anything that can produce `Value`. 572 | 573 | ### MSELoss 574 | 575 | We evaluate a prediction against a known "ground truth". The difference between these is the *Error*, and we 576 | take the square of the error to approximate distance. 577 | We average these out when considering an array of predictions and their ground truths. This is the Mean Squared Error. 578 | 579 | Implementation in [include/loss.h](include/loss.h). 
580 | 581 | ```c++ 582 | template 583 | Value mse_loss(const Value& predicted, const Value& ground_truth) { 584 | static_assert(std::is_arithmetic::value, "Type must be arithmetic"); 585 | return pow(predicted - ground_truth, 2); 586 | } 587 | ``` 588 | 589 | ```c++ 590 | template 591 | Value mse_loss(const std::array, N>& predictions, const std::array& ground_truth) { 592 | Value sum_squared_error = std::inner_product(predictions.begin(), predictions.end(), ground_truth.begin(), make_value(0), 593 | std::plus<>(), 594 | [](Value pred, T truth) { return pow(pred - truth, 2); } 595 | ); 596 | return sum_squared_error / make_value(N); 597 | } 598 | ``` 599 | 600 | We provide a wrapper class to calculate the Mean Squared Error of any `std::function(const Arg&)>`: 601 | 602 | ```c++ 603 | template 604 | class MSELoss { 605 | public: 606 | MSELoss(const std::function(const Arg&)>& func) 607 | : func_(func) 608 | { 609 | } 610 | 611 | Value operator()(std::array& input, const std::array& ground_truth, bool verbose=false) { 612 | if (verbose) std::cerr << "Predictions: "; 613 | for (size_t i = 0; i < N; ++i) { 614 | predictions_[i] = func_(input[i]); 615 | if (verbose) std::cerr << predictions_[i]->data() << " "; 616 | } 617 | if (verbose) std::cerr << '\n'; 618 | return mse_loss(predictions_, ground_truth); 619 | } 620 | 621 | private: 622 | const std::function(const Arg&)> func_; 623 | std::array, N> predictions_; 624 | }; 625 | ``` 626 | 627 | ## Gradient descent 628 | 629 | The gradients calculated by running `backward(loss)` annotate how each parameter contributes to the error loss. 630 | By adjusting each parameter down (against the gradient) we aim to minimize the error. 631 | 632 | ### Adjusting parameters 633 | 634 | We introduce an `adjust()` function in `Value` to modify `data_` according to the calculated gradient `grad_`. 635 | This takes a parameter `learning_rate`, usually in the range `[0.0 .. 1.0]` to scale of the adjustment: 636 | 637 | ```c++ 638 | void adjust(const T& learning_rate) { 639 | data_ += -learning_rate * grad_; 640 | } 641 | ``` 642 | 643 | We then provide `adjust()` functions to adjust all the parameters of an neural net: the weights and bias of a Neuron: 644 | 645 | ```c++ 646 | template 647 | class Neuron { 648 | ... 649 | const std::array, Nin>& weights() const { 650 | return weights_; 651 | } 652 | 653 | Value bias() const { 654 | return bias_; 655 | } 656 | 657 | void adjust_weights(const T& learning_rate) { 658 | for (const auto& w : weights_) { 659 | w->adjust(learning_rate); 660 | } 661 | } 662 | 663 | void adjust_bias(const T& learning_rate) { 664 | bias_->adjust(learning_rate); 665 | } 666 | 667 | void adjust(const T& learning_rate) { 668 | adjust_weights(learning_rate); 669 | adjust_bias(learning_rate); 670 | } 671 | ... 672 | }; 673 | ``` 674 | 675 | then adjust all the Neurons in a Layer: 676 | 677 | ```c++ 678 | template 679 | class Layer { 680 | ... 681 | void adjust(const T& learning_rate) { 682 | for (auto & n : neurons_) { 683 | n.adjust(learning_rate); 684 | } 685 | } 686 | ... 687 | }; 688 | ``` 689 | 690 | and all the Layers in an MLP: 691 | 692 | ```c++ 693 | template 694 | void layers_adjust(std::tuple& layers, const T& learning_rate) { 695 | std::apply([&learning_rate](auto&... layer) { 696 | // Use fold expression to call adjust on each layer 697 | (..., layer.adjust(learning_rate)); 698 | }, layers); 699 | } 700 | ``` 701 | 702 | ```c++ 703 | template 704 | class MLP { 705 | ... 
706 | void adjust(const T& learning_rate) { 707 | layers_adjust(layers_, learning_rate); 708 | } 709 | ... 710 | }; 711 | ``` 712 | 713 | ### CanBackProp 714 | 715 | We introduce a concept `CanBackProp` to describe any function that can be evaluated and adjusted: 716 | 717 | ```c++ 718 | template 719 | concept CanBackProp = requires(F f, Arg arg, T learning_rate) { 720 | { f(arg) } -> std::convertible_to>; 721 | { f.adjust(learning_rate) } -> std::convertible_to; 722 | }; 723 | ``` 724 | 725 | For example, `CanBackProp` is true for `MLP1`. 726 | 727 | ### BackProp 728 | 729 | We create a wrapper class for any function that matches the `CanBackProp` concept. This class is callable 730 | with input and ground truth arguments, which are used to iteratively: 731 | * make predictions 732 | * evaluate error loss against ground truth 733 | * adjust parameters to minimize loss 734 | 735 | The loss at each step is recorded in an output file `loss_path`. 736 | 737 | ```c++ 738 | template 739 | class BackPropImpl { 740 | public: 741 | BackPropImpl(const F& func, const std::string& loss_path) 742 | : func_(func), loss_output_(loss_path) 743 | { 744 | } 745 | 746 | MSELoss loss_function() const { 747 | return MSELoss(func_); 748 | } 749 | 750 | T operator()(std::array& input, const std::array& ground_truth, 751 | T learning_rate, int iterations, bool verbose=false) 752 | { 753 | auto loss_f = loss_function(); 754 | T result; 755 | 756 | for (int i=0; i < iterations; ++i) { 757 | Value loss = loss_f(input, ground_truth, verbose); 758 | 759 | result = loss->data(); 760 | loss_output_ << iter_ << '\t' << result << '\n'; 761 | 762 | if (verbose) { 763 | std::cerr << "Loss (" << iter_ << "):\t" << result << std::endl; 764 | } 765 | 766 | backward(loss); 767 | 768 | func_.adjust(learning_rate); 769 | 770 | ++iter_; 771 | } 772 | 773 | return result; 774 | } 775 | 776 | private: 777 | F func_; 778 | std::ofstream loss_output_; 779 | int iter_{0}; 780 | }; 781 | ``` 782 | 783 | The helper `BackProp` template allows us to instantiate without specifying the type of `F`, as the compiler can infer it from the constructor argument: 784 | 785 | ```c++ 786 | template 787 | requires CanBackProp 788 | auto BackProp(const F& func, const std::string& loss_path) 789 | { 790 | return BackPropImpl(func, loss_path); 791 | } 792 | ``` 793 | 794 | ### Binary Classifier 795 | 796 | Full example: [binary-classifier.cpp](examples/binary-classifier.cpp): 797 | 798 | ```c++ 799 | #include 800 | 801 | #include "backprop.h" 802 | #include "nn.h" 803 | 804 | using namespace ai; 805 | 806 | int main(int argc, char *argv[]) 807 | { 808 | // Define a neural net 809 | MLP1 n; 810 | std::cerr << n << std::endl; 811 | 812 | // A set of training inputs 813 | std::array, 4> input = {{ 814 | {2.0, 3.0, -1.0}, 815 | {3.0, -1.0, 0.5}, 816 | {0.5, 1.0, 1.0}, 817 | {1.0, 1.0, -1.0} 818 | }}; 819 | 820 | // Corresponding ground truth values for these inputs 821 | std::array y = {1.0, -1.0, -1.0, 1.0}; 822 | std::cerr << "y (gt):\t" << PrettyArray(y) << std::endl; 823 | 824 | double learning_rate = 0.9; 825 | 826 | auto backprop = BackProp, 4>(n, "loss.tsv"); 827 | 828 | // Run backprop for 20 iterations, verbose=true 829 | double loss = backprop(input, y, learning_rate, 20, true); 830 | } 831 | ``` 832 | 833 | This quickly converges close to the ground truth `y = {1.0, -1.0, -1.0, 1.0}`: 834 | 835 | ``` 836 | Predictions: 0.773488 0.796802 0.870344 0.736159 837 | Loss (0): 1.7119 838 | Predictions: -0.316783 -0.714319 -0.588441 -0.396673 839 | Loss 
(1): 0.983902 840 | Predictions: 0.997675 0.996129 0.996903 0.99767 841 | Loss (2): 1.99304 842 | Predictions: 0.997454 0.995671 0.996576 0.997448 843 | Loss (3): 1.99226 844 | Predictions: 0.997183 0.995088 0.996169 0.997177 845 | Loss (4): 1.99127 846 | Predictions: 0.996845 0.99432 0.995649 0.996838 847 | Loss (5): 1.98999 848 | Predictions: 0.996409 0.993259 0.99496 0.996403 849 | Loss (6): 1.98824 850 | Predictions: 0.995827 0.991692 0.994 0.995819 851 | Loss (7): 1.98573 852 | Predictions: 0.995001 0.989122 0.992564 0.994993 853 | Loss (8): 1.98174 854 | Predictions: 0.993729 0.984072 0.990152 0.993718 855 | Loss (9): 1.97433 856 | Predictions: 0.991462 0.969564 0.985135 0.991443 857 | Loss (10): 1.95502 858 | Predictions: 0.985916 0.862539 0.967274 0.985854 859 | Loss (11): 1.8349 860 | Predictions: 0.947625 -0.182752 0.497795 0.945489 861 | Loss (12): 0.72925 862 | Predictions: -0.0306315 -0.996451 -0.996155 -0.512822 863 | Loss (13): 0.837715 864 | Predictions: 0.999406 -0.977287 -0.542644 0.999393 865 | Loss (14): 0.0524229 866 | Predictions: 0.999027 -0.997707 -0.997583 0.998998 867 | Loss (15): 3.26197e-06 868 | Predictions: 0.999027 -0.997707 -0.997584 0.998998 869 | Loss (16): 3.26134e-06 870 | Predictions: 0.999027 -0.997708 -0.997584 0.998998 871 | Loss (17): 3.26072e-06 872 | Predictions: 0.999027 -0.997708 -0.997584 0.998998 873 | Loss (18): 3.26009e-06 874 | Predictions: 0.999027 -0.997708 -0.997585 0.998998 875 | Loss (19): 3.25946e-06 876 | ``` 877 | 878 | ## Conclusion 879 | 880 | We ported all the features of micrograd introduced in Karpathy's YouTube tutorial to C++, giving a different perspective on implementation details. 881 | We also considered some more generic aspects of model evaluation and iterative learning to develop re-usable C++ classes. 
882 | 883 | ## References 884 | 885 | ### Automatic differentiation in C++ 886 | * [Differentiable Programming in C++ - Vassil Vassilev & William Moses - CppCon 2021](https://www.youtube.com/watch?v=1QQj1mAV-eY) [YouTube] 887 | * [Automatic Differentiation in C++](https://compiler-research.org/assets/presentations/CladInROOT_15_02_2020.pdf)[PDF] 888 | * [FastAD: Expression Template-Based C++ Library for Fast and Memory-Efficient Automatic Differentiation](https://arxiv.org/abs/2102.03681) [PDF] 889 | 890 | ### Automatic differentiation 891 | * [ad](https://hackage.haskell.org/package/ad) 892 | 893 | ## Next up: [kfish/makemore-cpp-2023](https://github.com/kfish/makemore-cpp-2023) 894 | 895 | -------------------------------------------------------------------------------- /examples/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.10) 2 | 3 | set(CMAKE_CXX_STANDARD 23) 4 | set(CMAKE_CXX_FLAGS_RELEASE "-O3") 5 | 6 | enable_testing() 7 | 8 | project(examples) 9 | 10 | get_filename_component(PARENT_DIR ${PROJECT_SOURCE_DIR} DIRECTORY) 11 | 12 | include_directories( 13 | ${PARENT_DIR}/include 14 | ) 15 | 16 | add_executable(example-usage example-usage.cpp) 17 | add_test(NAME example-usage 18 | COMMAND example-usage) 19 | 20 | add_executable(example-usage-cycle example-usage-cycle.cpp) 21 | add_test(NAME example-usage-cycle 22 | COMMAND example-usage-cycle) 23 | 24 | add_executable(graph graph.cpp) 25 | add_test(NAME graph 26 | COMMAND graph) 27 | 28 | add_executable(neuron neuron.cpp) 29 | add_test(NAME neuron 30 | COMMAND neuron) 31 | 32 | add_executable(mlp1 mlp1.cpp) 33 | add_test(NAME mlp1 34 | COMMAND mlp1) 35 | 36 | add_executable(regression0 regression0.cpp) 37 | add_test(NAME regression0 38 | COMMAND regression0) 39 | 40 | add_executable(binary-classifier binary-classifier.cpp) 41 | add_test(NAME binary-classifier 42 | COMMAND binary-classifier) 43 | -------------------------------------------------------------------------------- /examples/binary-classifier.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "backprop.h" 4 | #include "nn.h" 5 | #include "graph.h" 6 | 7 | using namespace ai; 8 | 9 | int main(int argc, char *argv[]) 10 | { 11 | // Define a neural net 12 | MLP1 n; 13 | std::cerr << n << std::endl; 14 | 15 | // A set of training inputs 16 | std::array, 4> input = {{ 17 | {2.0, 3.0, -1.0}, 18 | {3.0, -1.0, 0.5}, 19 | {0.5, 1.0, 1.0}, 20 | {1.0, 1.0, -1.0} 21 | }}; 22 | 23 | // Corresponding ground truth values for these inputs 24 | std::array y = {1.0, -1.0, -1.0, 1.0}; 25 | std::cerr << "y (gt):\t" << PrettyArray(y) << std::endl; 26 | 27 | double learning_rate = 0.9; 28 | 29 | auto backprop = BackProp, 4>(n, "loss.tsv"); 30 | 31 | // Run backprop for 20 iterations, verbose=true 32 | double loss = backprop(input, y, learning_rate, 20, true); 33 | } 34 | 35 | -------------------------------------------------------------------------------- /examples/c-plus-equals-cycle.dot: -------------------------------------------------------------------------------- 1 | digraph G { 2 | rankdir = "LR"; 3 | "node0x55628c103eb0" [label = "{ | data=-4.0000 | grad=0.0000 }", shape="record"] 4 | "node0x55628c103fb0" [label = "{ | data=2.0000 | grad=0.0000 }", shape="record"] 5 | "node0x55628c104130" [label = "{ | data=-3.0000 | grad=0.0000 }", shape="record"] 6 | "node0x55628c104130+" [label = "+"] 7 | "node0x55628c104130+" -> "node0x55628c104130"; 8 
| "node0x55628c104930" [label = "{ | data=1.0000 | grad=0.0000 }", shape="record"] 9 | "node0x55628c104a30" [label = "{ | data=-1.0000 | grad=0.0000 }", shape="record"] 10 | "node0x55628c104a30+" [label = "+"] 11 | "node0x55628c104a30+" -> "node0x55628c104a30"; 12 | "node0x55628c103eb0" -> "node0x55628c104130+"; 13 | "node0x55628c103fb0" -> "node0x55628c104130+"; 14 | "node0x55628c104130" -> "node0x55628c104a30+"; 15 | "node0x55628c104930" -> "node0x55628c104a30+"; 16 | "node0x55628c104a30" -> "node0x55628c104130+"; 17 | } 18 | 19 | -------------------------------------------------------------------------------- /examples/c-plus-equals-cycle.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | G 11 | 12 | 13 | 14 | node0x55a7a4822eb0 15 | 16 | a 17 | 18 | data=-4.0000 19 | 20 | grad=0.0000 21 | 22 | 23 | 24 | node0x55a7a4823130+ 25 | 26 | + 27 | 28 | 29 | 30 | node0x55a7a4822eb0->node0x55a7a4823130+ 31 | 32 | 33 | 34 | 35 | 36 | node0x55a7a4822fb0 37 | 38 | b 39 | 40 | data=2.0000 41 | 42 | grad=0.0000 43 | 44 | 45 | 46 | node0x55a7a4822fb0->node0x55a7a4823130+ 47 | 48 | 49 | 50 | 51 | 52 | node0x55a7a4823130 53 | 54 | c 55 | 56 | data=-3.0000 57 | 58 | grad=0.0000 59 | 60 | 61 | 62 | node0x55a7a4823a30+ 63 | 64 | + 65 | 66 | 67 | 68 | node0x55a7a4823130->node0x55a7a4823a30+ 69 | 70 | 71 | 72 | 73 | 74 | node0x55a7a4823130+->node0x55a7a4823130 75 | 76 | 77 | 78 | 79 | 80 | node0x55a7a4823930 81 | 82 | 83 | 84 | data=1.0000 85 | 86 | grad=0.0000 87 | 88 | 89 | 90 | node0x55a7a4823930->node0x55a7a4823a30+ 91 | 92 | 93 | 94 | 95 | 96 | node0x55a7a4823a30 97 | 98 | 99 | 100 | data=-1.0000 101 | 102 | grad=0.0000 103 | 104 | 105 | 106 | node0x55a7a4823a30->node0x55a7a4823130+ 107 | 108 | 109 | 110 | 111 | 112 | node0x55a7a4823a30+->node0x55a7a4823a30 113 | 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /examples/c-plus-equals-rewrite.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | G 11 | 12 | 13 | 14 | node0x56434b372eb0 15 | 16 | a 17 | 18 | data=-4.0000 19 | 20 | grad=0.0000 21 | 22 | 23 | 24 | node0x56434b373bf0+ 25 | 26 | + 27 | 28 | 29 | 30 | node0x56434b372eb0->node0x56434b373bf0+ 31 | 32 | 33 | 34 | 35 | 36 | node0x56434b372fb0 37 | 38 | b 39 | 40 | data=2.0000 41 | 42 | grad=0.0000 43 | 44 | 45 | 46 | node0x56434b372fb0->node0x56434b373bf0+ 47 | 48 | 49 | 50 | 51 | 52 | node0x56434b373130 53 | 54 | 55 | 56 | data=-3.0000 57 | 58 | grad=0.0000 59 | 60 | 61 | 62 | node0x56434b373130+ 63 | 64 | + 65 | 66 | 67 | 68 | node0x56434b373130+->node0x56434b373130 69 | 70 | 71 | 72 | 73 | 74 | node0x56434b373930 75 | 76 | 77 | 78 | data=1.0000 79 | 80 | grad=0.0000 81 | 82 | 83 | 84 | node0x56434b373a30+ 85 | 86 | + 87 | 88 | 89 | 90 | node0x56434b373930->node0x56434b373a30+ 91 | 92 | 93 | 94 | 95 | 96 | node0x56434b373a30 97 | 98 | 99 | 100 | data=-1.0000 101 | 102 | grad=0.0000 103 | 104 | 105 | 106 | node0x56434b373a30->node0x56434b373130+ 107 | 108 | 109 | 110 | 111 | 112 | node0x56434b373a30+->node0x56434b373a30 113 | 114 | 115 | 116 | 117 | 118 | node0x56434b373bf0 119 | 120 | c 121 | 122 | data=-2.0000 123 | 124 | grad=0.0000 125 | 126 | 127 | 128 | node0x56434b373bf0->node0x56434b373130+ 129 | 130 | 131 | 132 | 133 | 134 | node0x56434b373bf0->node0x56434b373a30+ 135 | 136 | 137 | 138 | 139 | 140 | node0x56434b373bf0+->node0x56434b373bf0 141 | 142 | 143 | 144 | 145 | 146 | 
-------------------------------------------------------------------------------- /examples/example-usage-cycle.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "value.h" 4 | #include "graph.h" 5 | 6 | using namespace ai; 7 | 8 | int main(int argc, char *argv[]) 9 | { 10 | auto a = make_value(-4.0, "a"); 11 | auto b = make_value(2.0, "b"); 12 | 13 | auto c = expr(a + b, "c");; 14 | auto d = a * b + pow(b, 3); 15 | 16 | c += c + 1; 17 | 18 | std::cout << Graph(c) << std::endl; 19 | } 20 | -------------------------------------------------------------------------------- /examples/example-usage.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "value.h" 4 | 5 | using namespace ai; 6 | 7 | int main(int argc, char *argv[]) 8 | { 9 | auto a = make_value(-4.0); 10 | auto b = make_value(2.0); 11 | 12 | auto c = a + b; 13 | auto d = a * b + pow(b, 3); 14 | 15 | c += c + 1; 16 | c += 1 + c + (-a); 17 | d += d * 2 + relu(b + a); 18 | d += 3 * d + relu(b - a); 19 | auto e = c - d; 20 | auto f = pow(e, 2); 21 | auto g = f / 2.0; 22 | g += 10.0 / f; 23 | printf("%.4f\n", g->data()); // prints 24.7041, the outcome of this forward pass 24 | backward(g); 25 | printf("%.4f\n", a->grad()); // prints 138.8338, i.e. the numerical value of dg/da 26 | printf("%.4f\n", b->grad()); // prints 645.5773, i.e. the numerical value of dg/db 27 | } 28 | -------------------------------------------------------------------------------- /examples/graph.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "graph.h" 4 | 5 | using namespace ai; 6 | 7 | int main(int argc, char *argv[]) 8 | { 9 | auto a = make_value(2.0, "a"); 10 | auto b = make_value(-3.0, "b"); 11 | auto c = make_value(10.0, "c"); 12 | 13 | auto e = expr(a*b, "e"); 14 | auto d = expr(e+c, "d"); 15 | 16 | auto f = make_value(-2.0, "f"); 17 | auto L = expr(d * f, "L"); 18 | 19 | std::cout << Graph(L) << std::endl; 20 | } 21 | -------------------------------------------------------------------------------- /examples/graph.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | G 11 | 12 | 13 | 14 | node0x561d478f3eb0 15 | 16 | a 17 | 18 | data=2.0000 19 | 20 | grad=0.0000 21 | 22 | 23 | 24 | node0x561d478f4230* 25 | 26 | * 27 | 28 | 29 | 30 | node0x561d478f3eb0->node0x561d478f4230* 31 | 32 | 33 | 34 | 35 | 36 | node0x561d478f3fb0 37 | 38 | b 39 | 40 | data=-3.0000 41 | 42 | grad=0.0000 43 | 44 | 45 | 46 | node0x561d478f3fb0->node0x561d478f4230* 47 | 48 | 49 | 50 | 51 | 52 | node0x561d478f40b0 53 | 54 | c 55 | 56 | data=10.0000 57 | 58 | grad=0.0000 59 | 60 | 61 | 62 | node0x561d478f43f0+ 63 | 64 | + 65 | 66 | 67 | 68 | node0x561d478f40b0->node0x561d478f43f0+ 69 | 70 | 71 | 72 | 73 | 74 | node0x561d478f4230 75 | 76 | e 77 | 78 | data=-6.0000 79 | 80 | grad=0.0000 81 | 82 | 83 | 84 | node0x561d478f4230->node0x561d478f43f0+ 85 | 86 | 87 | 88 | 89 | 90 | node0x561d478f4230*->node0x561d478f4230 91 | 92 | 93 | 94 | 95 | 96 | node0x561d478f43f0 97 | 98 | d 99 | 100 | data=4.0000 101 | 102 | grad=0.0000 103 | 104 | 105 | 106 | node0x561d478f46b0* 107 | 108 | * 109 | 110 | 111 | 112 | node0x561d478f43f0->node0x561d478f46b0* 113 | 114 | 115 | 116 | 117 | 118 | node0x561d478f43f0+->node0x561d478f43f0 119 | 120 | 121 | 122 | 123 | 124 | node0x561d478f45b0 125 | 126 | f 127 | 128 | data=-2.0000 129 | 130 | grad=0.0000 131 | 132 
| 133 | 134 | node0x561d478f45b0->node0x561d478f46b0* 135 | 136 | 137 | 138 | 139 | 140 | node0x561d478f46b0 141 | 142 | L 143 | 144 | data=-8.0000 145 | 146 | grad=0.0000 147 | 148 | 149 | 150 | node0x561d478f46b0*->node0x561d478f46b0 151 | 152 | 153 | 154 | 155 | 156 | -------------------------------------------------------------------------------- /examples/mlp1.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "nn.h" 4 | #include "graph.h" 5 | 6 | using namespace ai; 7 | 8 | int main(int argc, char *argv[]) 9 | { 10 | // Define a neural net 11 | MLP1 n; 12 | 13 | std::array input = {{ 2.0, 3.0, -1.0 }}; 14 | auto output = n(input); 15 | 16 | backward(output); 17 | 18 | std::cout << Graph(output) << std::endl; 19 | } 20 | 21 | 22 | -------------------------------------------------------------------------------- /examples/neuron.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "graph.h" 4 | 5 | using namespace ai; 6 | 7 | int main(int argc, char *argv[]) 8 | { 9 | // Inputs x1, x2 10 | auto x1 = make_value(2.0, "x1"); 11 | auto x2 = make_value(0.0, "x2"); 12 | 13 | // Weights w1, w2 14 | auto w1 = make_value(-3.0, "w1"); 15 | auto w2 = make_value(1.0, "w2"); 16 | 17 | // Bias of the neuron 18 | auto b = make_value(6.8813735870195432, "b"); 19 | 20 | auto x1w1 = expr(x1*w1, "x1*w1"); 21 | auto x2w2 = expr(x2*w2, "x2*w2"); 22 | 23 | auto x1w1x2w2 = expr(x1w1 + x2w2, "x1w1+x2w2"); 24 | auto n = expr(x1w1x2w2 + b, "n"); 25 | 26 | auto o = expr(tanh(n), "o"); 27 | 28 | backward(o); 29 | 30 | std::cout << Graph(o) << std::endl; 31 | } 32 | -------------------------------------------------------------------------------- /examples/neuron.svg: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 9 | 10 | G 11 | 12 | 13 | 14 | node0x55fe2e941eb0 15 | 16 | x1 17 | 18 | data=2.0000 19 | 20 | grad=-1.5000 21 | 22 | 23 | 24 | node0x55fe2e942430* 25 | 26 | * 27 | 28 | 29 | 30 | node0x55fe2e941eb0->node0x55fe2e942430* 31 | 32 | 33 | 34 | 35 | 36 | node0x55fe2e941fb0 37 | 38 | x2 39 | 40 | data=0.0000 41 | 42 | grad=0.5000 43 | 44 | 45 | 46 | node0x55fe2e9425f0* 47 | 48 | * 49 | 50 | 51 | 52 | node0x55fe2e941fb0->node0x55fe2e9425f0* 53 | 54 | 55 | 56 | 57 | 58 | node0x55fe2e9420b0 59 | 60 | w1 61 | 62 | data=-3.0000 63 | 64 | grad=1.0000 65 | 66 | 67 | 68 | node0x55fe2e9420b0->node0x55fe2e942430* 69 | 70 | 71 | 72 | 73 | 74 | node0x55fe2e9421b0 75 | 76 | w2 77 | 78 | data=1.0000 79 | 80 | grad=0.0000 81 | 82 | 83 | 84 | node0x55fe2e9421b0->node0x55fe2e9425f0* 85 | 86 | 87 | 88 | 89 | 90 | node0x55fe2e9422b0 91 | 92 | b 93 | 94 | data=6.8814 95 | 96 | grad=0.5000 97 | 98 | 99 | 100 | node0x55fe2e942970+ 101 | 102 | + 103 | 104 | 105 | 106 | node0x55fe2e9422b0->node0x55fe2e942970+ 107 | 108 | 109 | 110 | 111 | 112 | node0x55fe2e942430 113 | 114 | x1*w1 115 | 116 | data=-6.0000 117 | 118 | grad=0.5000 119 | 120 | 121 | 122 | node0x55fe2e9427b0+ 123 | 124 | + 125 | 126 | 127 | 128 | node0x55fe2e942430->node0x55fe2e9427b0+ 129 | 130 | 131 | 132 | 133 | 134 | node0x55fe2e942430*->node0x55fe2e942430 135 | 136 | 137 | 138 | 139 | 140 | node0x55fe2e9425f0 141 | 142 | x2*w2 143 | 144 | data=0.0000 145 | 146 | grad=0.5000 147 | 148 | 149 | 150 | node0x55fe2e9425f0->node0x55fe2e9427b0+ 151 | 152 | 153 | 154 | 155 | 156 | node0x55fe2e9425f0*->node0x55fe2e9425f0 157 | 158 | 159 | 160 | 161 | 162 | node0x55fe2e9427b0 163 | 164 | x1w1+x2w2 
165 | 166 | data=-6.0000 167 | 168 | grad=0.5000 169 | 170 | 171 | 172 | node0x55fe2e9427b0->node0x55fe2e942970+ 173 | 174 | 175 | 176 | 177 | 178 | node0x55fe2e9427b0+->node0x55fe2e9427b0 179 | 180 | 181 | 182 | 183 | 184 | node0x55fe2e942970 185 | 186 | n 187 | 188 | data=0.8814 189 | 190 | grad=0.5000 191 | 192 | 193 | 194 | node0x55fe2e942b30tanh 195 | 196 | tanh 197 | 198 | 199 | 200 | node0x55fe2e942970->node0x55fe2e942b30tanh 201 | 202 | 203 | 204 | 205 | 206 | node0x55fe2e942970+->node0x55fe2e942970 207 | 208 | 209 | 210 | 211 | 212 | node0x55fe2e942b30 213 | 214 | o 215 | 216 | data=0.7071 217 | 218 | grad=1.0000 219 | 220 | 221 | 222 | node0x55fe2e942b30tanh->node0x55fe2e942b30 223 | 224 | 225 | 226 | 227 | 228 | -------------------------------------------------------------------------------- /examples/regression0.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "backprop.h" 4 | #include "nn.h" 5 | #include "graph.h" 6 | 7 | using namespace ai; 8 | 9 | class Regression0 { 10 | public: 11 | Regression0() 12 | : weight_(randomValue()) 13 | {} 14 | 15 | Value weight() const { 16 | return weight_; 17 | } 18 | 19 | Value operator()(const Value& x) const { 20 | return weight_ * x; 21 | } 22 | 23 | Value operator()(const double& x) const { 24 | return this->operator()(make_value(x)); 25 | } 26 | 27 | void adjust(const double& learning_rate) { 28 | weight_->adjust(learning_rate); 29 | } 30 | 31 | private: 32 | Value weight_; 33 | }; 34 | 35 | static inline std::ostream& operator<<(std::ostream& os, const Regression0& r) 36 | { 37 | return os << "Regression0{weight=" << r.weight() << "}"; 38 | } 39 | 40 | int main(int argc, char *argv[]) 41 | { 42 | Regression0 n; 43 | 44 | std::cerr << n << std::endl; 45 | 46 | std::array input = { 47 | {-7.0, -3.0, 1.0, 4.0}, 48 | }; 49 | 50 | std::array y = {-21.0, -9.0, 3.0, 12.0}; 51 | std::cerr << "y (gt):\t" << PrettyArray(y) << std::endl; 52 | 53 | // Run backprop 54 | double learning_rate = 0.01; 55 | 56 | auto backprop = BackProp(n, "loss.tsv"); 57 | 58 | double loss = backprop(input, y, learning_rate, 40, true); 59 | 60 | std::cout << Graph(backprop.loss_function()(input, y)) << std::endl; 61 | } 62 | 63 | -------------------------------------------------------------------------------- /include/array.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template 7 | class PrettyArray : public std::array { 8 | public: 9 | PrettyArray() = default; 10 | PrettyArray(const std::array& arr) : std::array(arr) {} 11 | 12 | friend std::ostream& operator<<(std::ostream& os, const PrettyArray& arr) { 13 | os << "[\n"; 14 | for (const auto& elem : arr) { 15 | os << '\t' << elem << '\n'; 16 | } 17 | os << ']'; 18 | return os; 19 | } 20 | }; 21 | -------------------------------------------------------------------------------- /include/backprop.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "loss.h" 9 | 10 | namespace ai { 11 | 12 | template 13 | concept CanBackProp = requires(F f, Arg arg, T learning_rate) { 14 | { f(arg) } -> std::convertible_to>; 15 | { f.adjust(learning_rate) } -> std::convertible_to; 16 | }; 17 | 18 | 19 | template 20 | class BackPropImpl { 21 | public: 22 | BackPropImpl(const F& func, const std::string& loss_path) 23 | : func_(func), loss_output_(loss_path) 24 | 
{ 25 | } 26 | 27 | MSELoss loss_function() const { 28 | return MSELoss(func_); 29 | } 30 | 31 | T operator()(std::array& input, const std::array& ground_truth, 32 | T learning_rate, int iterations, bool verbose=false) 33 | { 34 | auto loss_f = loss_function(); 35 | T result; 36 | 37 | for (int i=0; i < iterations; ++i) { 38 | Value loss = loss_f(input, ground_truth, verbose); 39 | 40 | result = loss->data(); 41 | loss_output_ << iter_ << '\t' << result << '\n'; 42 | 43 | if (verbose) { 44 | std::cerr << "Loss (" << iter_ << "):\t" << result << std::endl; 45 | } 46 | 47 | backward(loss); 48 | 49 | func_.adjust(learning_rate); 50 | 51 | ++iter_; 52 | } 53 | 54 | return result; 55 | } 56 | 57 | private: 58 | F func_; 59 | std::ofstream loss_output_; 60 | int iter_{0}; 61 | }; 62 | 63 | template 64 | requires CanBackProp 65 | auto BackProp(const F& func, const std::string& loss_path) 66 | { 67 | return BackPropImpl(func, loss_path); 68 | } 69 | 70 | } // namespace ai 71 | -------------------------------------------------------------------------------- /include/graph.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "value.h" 7 | 8 | namespace ai { 9 | 10 | template 11 | class Trace { 12 | public: 13 | Trace(const Value& root) 14 | { 15 | build(root); 16 | } 17 | 18 | const std::set*>& nodes() const { 19 | return nodes_; 20 | } 21 | 22 | const std::set*, RawValue*>> edges() const { 23 | return edges_; 24 | } 25 | 26 | private: 27 | void build(const Value& v) { 28 | if (!nodes_.contains(v.get())) { 29 | nodes_.insert(v.get()); 30 | for (auto && c : v->children()) { 31 | edges_.insert({c.get(), v.get()}); 32 | build(c); 33 | } 34 | } 35 | } 36 | 37 | private: 38 | std::set*> nodes_{}; 39 | std::set*, RawValue*>> edges_{}; 40 | }; 41 | 42 | template 43 | class NodeName { 44 | public: 45 | NodeName(const RawValue* ptr) 46 | : ptr_(ptr) 47 | {} 48 | 49 | const RawValue* get() const { 50 | return ptr_; 51 | } 52 | 53 | private: 54 | const RawValue* ptr_; 55 | }; 56 | 57 | template 58 | static inline std::ostream& operator<<(std::ostream& os, const NodeName& node) { 59 | return os << "\"node" << node.get() << "\""; 60 | } 61 | 62 | template 63 | class NodeOp { 64 | public: 65 | NodeOp(const RawValue* ptr) 66 | : ptr_(ptr) 67 | {} 68 | 69 | const RawValue* get() const { 70 | return ptr_; 71 | } 72 | 73 | private: 74 | const RawValue* ptr_; 75 | }; 76 | 77 | template 78 | static inline std::ostream& operator<<(std::ostream& os, const NodeOp& node) { 79 | return os << "\"node" << node.get() << node.get()->op() << "\""; 80 | } 81 | 82 | template 83 | class Graph { 84 | public: 85 | Graph(const std::shared_ptr>& root) 86 | : trace_(root) 87 | { 88 | } 89 | 90 | std::ostream& dump(std::ostream& os) const { 91 | auto old_precision = os.precision(); 92 | 93 | os << "digraph G {\n" 94 | << " rankdir = \"LR\";" 95 | << std::endl; 96 | 97 | os << std::fixed << std::setprecision(4); 98 | 99 | for (const RawValue* node : trace_.nodes()) { 100 | // For any value in the graph, create a rectangular ("record") node 101 | // for it 102 | os << " " << NodeName(node) 103 | << " [label = \"{ " << node->label() 104 | << " | data=" << node->data() 105 | << " | grad=" << node->grad() 106 | << " }\", shape=\"record\"]" 107 | << std::endl; 108 | 109 | if (!node->op().empty()) { 110 | // If this value is the result of an operation, create 111 | // an op node for it 112 | os << " " << NodeOp(node) 113 | << " [label = \"" << 
node->op() << "\"]" 114 | << std::endl; 115 | 116 | // And connect the op to it 117 | os << " " << NodeOp(node) 118 | << " -> " << NodeName(node) << ";" 119 | << std::endl; 120 | } 121 | } 122 | 123 | // Edges 124 | for (auto && [n1, n2] : trace_.edges()) { 125 | // Connect n1 to the op node of n2 126 | os << " " << NodeName(n1) << " -> " << NodeOp(n2) << ";" << std::endl; 127 | } 128 | 129 | os << "}" << std::endl; 130 | 131 | os << std::setprecision(old_precision); 132 | 133 | return os; 134 | } 135 | 136 | private: 137 | Trace trace_; 138 | }; 139 | 140 | template 141 | static inline std::ostream& operator<<(std::ostream& os, const Graph& graph) { 142 | return graph.dump(os); 143 | } 144 | 145 | } 146 | -------------------------------------------------------------------------------- /include/loss.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "value.h" 8 | 9 | namespace ai { 10 | 11 | // Template function for double 12 | template 13 | double mse_loss(const T& predicted, const T& ground_truth) { 14 | static_assert(std::is_arithmetic::value, "Type must be arithmetic"); 15 | return std::pow(predicted - ground_truth, 2); 16 | } 17 | 18 | // Overloaded function for Value 19 | template 20 | Value mse_loss(const Value& predicted, const T& ground_truth) { 21 | static_assert(std::is_arithmetic::value, "Type must be arithmetic"); 22 | return pow(predicted - ground_truth, 2); 23 | } 24 | 25 | template 26 | Value mse_loss(const Value& predicted, const Value& ground_truth) { 27 | static_assert(std::is_arithmetic::value, "Type must be arithmetic"); 28 | return pow(predicted - ground_truth, 2); 29 | } 30 | 31 | // Wrapper function for containers 32 | template