├── .gitignore
├── pic.jpg
├── SineExample
│   ├── model.h
│   ├── model_data.h
│   ├── SineExample.ino
│   ├── model.cpp
│   └── model_data.cpp
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
.ipynb_checkpoints

--------------------------------------------------------------------------------
/pic.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hollance/TinyML-HelloWorld-ArduinoUno/HEAD/pic.jpg
--------------------------------------------------------------------------------
/SineExample/model.h:
--------------------------------------------------------------------------------
#ifndef MODEL_H
#define MODEL_H

float predict(float x);

#endif // MODEL_H
--------------------------------------------------------------------------------
/SineExample/model_data.h:
--------------------------------------------------------------------------------
#ifndef MODEL_DATA_H
#define MODEL_DATA_H

#include <avr/pgmspace.h>

extern const float W1_data[];
extern const float b1_data[];
extern const float W2_data[];
extern const float b2_data[];
extern const float W3_data[];
extern const float b3_data[];

const int DENSE1_SIZE = 16;
const int DENSE2_SIZE = 16;

inline float W1(int i) {
  return pgm_read_float_near(W1_data + i);
}

inline float b1(int i) {
  return pgm_read_float_near(b1_data + i);
}

inline float W2(int i, int j) {
  return pgm_read_float_near(W2_data + i*DENSE1_SIZE + j);
}

inline float b2(int i) {
  return pgm_read_float_near(b2_data + i);
}

inline float W3(int i) {
  return pgm_read_float_near(W3_data + i);
}

inline float b3() {
  return pgm_read_float_near(b3_data);
}

#endif // MODEL_DATA_H
--------------------------------------------------------------------------------
/SineExample/SineExample.ino:
--------------------------------------------------------------------------------
#include "model.h"

const int LED_PIN = 11;

int inference_count = 0;
const int kInferencesPerCycle = 1000;
const float kXrange = 2.f * 3.14159265359f;

void setup() {
  Serial.begin(9600);
  pinMode(LED_PIN, OUTPUT);
}

void loop() {
  float position = static_cast<float>(inference_count) /
                   static_cast<float>(kInferencesPerCycle);
  float x_val = position * kXrange;

  float y_val = predict(x_val);

  // The model may actually predict values smaller than -1 or larger
  // than 1, so clamp the results.
  y_val = constrain(y_val, -1.0f, 1.0f);

  // Calculate the brightness of the LED such that y=-1 is fully off
  // and y=1 is fully on. The LED's brightness can range from 0-255.
  int brightness = (int)(127.5f * (y_val + 1.0f));

  analogWrite(LED_PIN, brightness);

  // Enable this to view the sine wave with the Serial Plotter.
  //Serial.println(brightness);

  inference_count += 1;
  if (inference_count >= kInferencesPerCycle) inference_count = 0;
}
--------------------------------------------------------------------------------
/SineExample/model.cpp:
--------------------------------------------------------------------------------
#include <math.h>
#include "model.h"
#include "model_data.h"

inline float relu(float x) {
  return fmaxf(0.0f, x);
}

float predict(float x) {
  // The activations of the first layer are small enough to store
  // on the stack (16 floats = 64 bytes).
  float h1[DENSE1_SIZE];

  // First dense layer. Since there is only one input neuron, we don't need
  // to perform a full-blown matrix multiply.
  for (int i = 0; i < DENSE1_SIZE; ++i) {
    h1[i] = relu(x * W1(i) + b1(i));
  }

  // Second dense layer.
  float y(0.0f);
  for (int i = 0; i < DENSE2_SIZE; ++i) {
    // Perform a dot product of the incoming activation vector with each
    // row of the W2 matrix.
    float h2(0.0f);
    for (int j = 0; j < DENSE1_SIZE; ++j) {
      h2 += h1[j] * W2(i, j);
    }
    h2 = relu(h2 + b2(i));

    // We don't actually need to store the activations of the second layer.
    // Since the last layer only has one neuron, we can immediately compute
    // how much each activation contributes to the final layer.
    y += h2 * W3(i);
  }

  // Final dense layer.
  return y + b3();
}
--------------------------------------------------------------------------------
/SineExample/model_data.cpp:
--------------------------------------------------------------------------------
#include "model_data.h"

const float W1_data[] PROGMEM = {
  -0.39788383f, 0.46116278f, 0.3715687f, -0.07777083f, -0.2472133f, 0.13155949f, 0.6120839f, -0.07711333f, 0.30813938f, 0.0954016f, 0.069017954f, 0.14382412f, 0.50789833f, 0.18803687f, -0.0057444884f, -0.5632218f
};

const float b1_data[] PROGMEM = {
  0.0f, 0.47112474f, -1.0349379f, 0.0f, 0.0f, 0.018007103f, -0.6069822f, 0.0f, -0.4051202f, 0.33553293f, 0.78297186f, 0.13247779f, 0.08387113f, -0.1520639f, 0.3705848f, 0.0f
};

const float W2_data[] PROGMEM = {
  0.008752942f, 0.10046681f, -0.5594067f, -0.38656723f, 0.026203513f, 0.3809035f, -0.27524522f, 0.22883019f, -0.54360247f, 0.30872577f, 0.2096457f, -0.2986594f, 0.16710472f, 0.08122665f, 0.49408376f, 0.26714543f, -0.04890293f, 0.23022875f, 1.0559002f, -0.13226247f, 0.36876526f, -0.4758394f, 0.014797875f, 0.19773671f, 0.14072222f, 0.09920495f, -0.28671607f, -0.2952227f, 0.22445521f, -0.1557873f, -0.44671398f, -0.26854938f, -0.079212666f, -0.061804853f, 0.20164743f, 0.033375174f, 0.41258594f, 0.37628275f, -0.47459936f, -0.0540981f, -0.0992337f, 0.17203633f, 0.42368558f, -0.11045875f, -0.6382502f, 0.07229512f, 0.65322745f, -0.00450325f, 0.42651084f, 0.14322363f, -0.06504886f, 0.23714831f, -0.09616521f, -0.012079281f, -0.03893903f, -0.12964973f, -0.080721244f, 0.18545523f, 0.5673849f, 0.35440624f, 0.40770048f, 0.2963744f, 0.23040722f, -0.40284377f, 0.1633878f, -0.17845818f, 0.41858193f, 0.03324738f, -0.07839513f, -0.24564163f, -0.41076794f, -0.011841029f, 0.07035999f, 0.014453096f, -0.09521273f, -0.38453805f, 0.059534963f, 0.31901756f, -0.0018180311f, -0.2622439f, -0.13349813f, 0.4687281f, -0.9387802f, 0.04901907f, 0.2085382f, -0.303582f, -0.048075523f, 0.21998444f, -0.09892515f, 0.29634193f, -0.03009107f, 0.33730397f, 0.17282778f, 0.010907181f, 0.5294982f, 0.3438234f, -0.05536762f, -0.33797836f, -0.06396872f, -0.40171507f, 0.011573434f, -0.33963677f, 0.28419265f, 0.089041024f, -0.14800817f, 0.017468125f, -0.1677407f, -0.3162433f, -0.09961668f, -0.17236248f, -0.40889624f, -0.042404294f, 0.087521404f, 0.3853618f, -0.17944188f, 0.18648282f, -0.2124529f, -0.0745753f, -0.18397354f, -0.084926695f, 0.1374913f, 0.36279848f, -0.1267633f, -0.06970617f, -0.26084384f, -0.5170918f, 0.5406603f, -0.16587976f, -0.037564486f, 0.29767224f, -0.7600049f, -0.38397318f, -0.35034558f, 0.2557612f, -0.45015487f, 0.15666643f, -0.41905773f, -0.4432149f, 0.5227524f, 0.051928133f, 0.06536075f, -0.20488113f, 0.93944883f, 0.22275141f, 0.21884283f, -0.20963274f, -0.28421858f, -0.33921123f, -0.4297228f, 0.33329493f, -0.3863936f, 0.3075181f, 0.06726721f, -0.4954743f, 0.4366714f, 0.081145726f, -0.6088066f, 0.36570713f, 0.49990937f, 0.12871823f, -0.27020234f, 0.3055169f, -0.4178655f, 0.13673994f, 0.3320376f, -0.24873519f, 0.14424029f, -0.2923328f, -0.06564156f, 0.30418956f, 0.25396284f, -0.062110756f, 0.34823626f, -0.35044038f, 0.22929129f, -0.26139465f, 0.042224765f, -0.29030794f, -0.11388543f, -0.22762935f, 0.366491f, 0.4184937f, -0.2679914f, 0.05284396f, 0.31304586f, 0.46836898f, 0.4662841f, -0.3168484f, -0.62202334f, 0.22289956f, 0.25762904f, -0.4111679f, 0.042361856f, 0.32926273f, 0.292555f, 0.2597312f, 0.25294778f, 0.26584086f, 0.39530355f, -0.18725905f, 0.4278392f, -0.42083043f, -0.662066f, 0.020050874f, 0.07275363f, 0.14569889f, -0.2032377f, 0.26153424f, 0.04289168f, 0.28540748f, -0.011160894f, 0.08113453f, -0.3503822f, 0.08603693f, 0.10026496f, -0.36113915f, -0.42191446f, -0.19292475f, 0.059979536f, -0.46330738f, -0.13543011f, -0.031109314f, -0.04798102f, 0.11116436f, -0.19061677f, 0.42657918f, -0.9026991f, -0.2057866f, 0.3852652f, 0.109483086f, -0.5143333f, 0.044659853f, -0.23391826f, 0.013950172f, 0.53942573f, 0.25642928f, -0.25812334f, -0.5175727f, 0.9448253f, -0.22992402f, -0.33865362f, 0.23385412f, -0.14182074f, 0.1301175f, 0.42415115f, 0.24237347f, 0.28174022f, 0.13199261f, -0.28549045f, -0.055565543f, -0.034443192f, -0.18449251f, -0.09318607f, -0.29925725f, 0.21919051f, 0.22934195f
};

const float b2_data[] PROGMEM = {
  0.26377532f, -0.45455533f, 0.42449933f, 0.27063936f, -0.005065765f, 0.332522f, 0.0f, 0.2209661f, 0.55025387f, 0.45371056f, 0.14686571f, 0.46175304f, -0.32032987f, 0.14935441f, 0.51105094f, 0.2417469f
};

const float W3_data[] PROGMEM = {
  0.8117f, 0.62595254f, -0.5416186f, -0.17542624f, 0.09408076f, -0.8181666f, -0.38303018f, 0.4873742f, 0.7816993f, -0.7960945f, -0.060853384f, -0.9514488f, 0.5558745f, 0.59897417f, 0.7961271f, -0.16341376f
};

const float b3_data[] PROGMEM = {
  -0.31720653f
};
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# TinyML "Hello World" on Arduino Uno v3

![](pic.jpg)

Recently I've become interested in **TinyML**, or running machine learning models on (ultra) low-power microcontrollers.

The book *[TinyML: Machine Learning with TensorFlow Lite on Arduino and Ultra-Low-Power Microcontrollers](https://tinymlbook.com)* by Pete Warden and Daniel Situnayake starts off with a very basic sine wave prediction model. This model is then used to control the brightness of an LED. You can find the original source code for this **hello_world** example in the [TensorFlow Lite for Microcontrollers repo](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/hello_world).

The book and repo demonstrate how to run the model on an Arduino Nano 33 BLE Sense or Arduino MKRZERO, but the only Arduino I have lying around here at the moment is an **Arduino Uno v3**. And TF Lite Micro does not appear to run on the Uno. That's not surprising, as even this very simple model already requires more RAM than the Uno has (only 2 KB).

But of course you don't *need* TF Lite to run machine learning models: you can also [implement the logic](SineExample/model.cpp) yourself. ;-)

## How to hook up the Arduino

Pretty simple. Just attach an LED to pin 11 (or another PWM output), in series with a small resistor:

```nohighlight
+---------+
| A       |
| r   ~11 |---- 220 Ω ----+
| d       |               |
| u       |              LED
| i       |               |
| n   GND |---------------+
| o       |
+---------+
```

The Arduino sketch will send a sine wave over pin 11, which makes the LED appear to pulsate. Rather than calling `sin()`, it uses a machine learning model to approximate the sine wave.

Note: The Uno also has a built-in LED on pin 13, but that pin doesn't support PWM. If you use pin 13, the LED will just blink on and off.
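For reference, the non-ML version of the same effect would just call `sin()` directly. A hypothetical sketch like this (not part of this repo) would pulse the LED the same way:

```cpp
// Hypothetical baseline sketch: pulse the LED with a real sine wave.
// This mirrors SineExample.ino, except predict() is replaced by sin().
const int LED_PIN = 11;

int inference_count = 0;
const int kInferencesPerCycle = 1000;
const float kXrange = 2.f * 3.14159265359f;

void setup() {
  pinMode(LED_PIN, OUTPUT);
}

void loop() {
  float x_val = kXrange * inference_count / kInferencesPerCycle;

  // No model involved: compute the sine directly.
  float y_val = sin(x_val);

  // Map y from [-1, 1] to a PWM duty cycle in [0, 255].
  analogWrite(LED_PIN, (int)(127.5f * (y_val + 1.0f)));

  inference_count += 1;
  if (inference_count >= kInferencesPerCycle) inference_count = 0;
}
```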
## Training the model

[This Jupyter notebook](train_hello_world_model.ipynb) shows how to train the model using TensorFlow 2. I took this notebook from the TF Lite repo and tweaked it a little. Notably, the model does not get exported as a .tflite file; instead, the notebook simply dumps the weights as C arrays that you can copy-paste into [model_data.cpp](SineExample/model_data.cpp).

## Model architecture

The model consists of the following layers:

1. input layer, 1 neuron
2. dense layer, 16 neurons, ReLU activation
3. dense layer, 16 neurons, ReLU activation
4. dense layer, 1 neuron, no activation

There are three weight matrices:

- **W1** shape (1, 16)
- **W2** shape (16, 16)
- **W3** shape (16, 1)

Of course, each layer also has bias parameters.

Notice that the weight matrices for the first and last layer are really vectors, because there is only one input neuron and one output neuron.

In total there are 321 parameters: 16 + 256 + 16 = 288 weights plus 16 + 16 + 1 = 33 biases. If we store those as 32-bit floats, they take up 1284 bytes. That's small enough to fit even in the Arduino Uno's memory.

## Inference

Since we're not using TF Lite, we'll have to implement the layer math ourselves. All the layers are dense / fully-connected layers, but because the input and output are scalars, the logic for the input and output layers can be simplified a little.

The model needs to perform these different functions:

1. scalar-vector multiplication for the first layer
2. vector-matrix multiplication for the middle layer
3. dot product for the last layer
4. (vector) addition for the bias terms
5. ReLU

Fortunately, these are pretty simple to implement. You can find the full source code in [model.cpp](SineExample/model.cpp). It's less code than you might think!
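To get a feel for how good the approximation is, you could print the model's output next to the real sine and compare them in the Serial Monitor or Plotter. A rough test sketch (my own addition, not part of the original example) might look like this:

```cpp
#include <math.h>
#include "model.h"

// Hypothetical test sketch: print the model's prediction next to the
// real sine, for one full cycle of 2*pi, so you can compare the two
// curves in the Serial Plotter.
void setup() {
  Serial.begin(9600);
  for (float x = 0.0f; x < 6.2832f; x += 0.1f) {
    Serial.print(predict(x));
    Serial.print("\t");
    Serial.println(sin(x));
  }
}

void loop() {}
```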
## Memory usage

The current version of the code works with 32-bit floats; no quantization is used. Therefore, every weight and every activation uses up 4 bytes of memory.

The Arduino Uno v3 only has 2 KB of RAM. All your global and local variables need to fit inside this tiny amount of memory!

The Uno also has 32 KB of read-only flash storage. As our model's weights already take up 1284 bytes, it makes sense to put them in flash memory. That leaves the RAM free for other things.

The final program uses about 5000 bytes of flash memory (out of 32 KB). Even better, it uses less than 200 bytes of RAM for its global variables. That's about 10% of the space we have available.

Note: Because we put the weights in flash memory (using `PROGMEM`), we also need to use special instructions such as `pgm_read_float_near()` to read them into RAM before we can use them. We read the weights from ROM into RAM one at a time, so this is still very memory-efficient. However, I suspect that reading from flash is slower than reading from RAM (I did not verify this).
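In isolation, the `PROGMEM` pattern looks like this (a minimal sketch of the same technique that [model_data.h](SineExample/model_data.h) uses; the `table` array is just a made-up example):

```cpp
#include <avr/pgmspace.h>

// Store the array in flash instead of RAM.
const float table[] PROGMEM = { 0.5f, 1.5f, 2.5f };

float read_table(int i) {
  // Copy a single float from flash into RAM on demand.
  return pgm_read_float_near(table + i);
}
```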
The implementation of the neural network logic only needs to store the 16 activations of the first layer. Normally you'd also need to store the 16 activations of the second layer, but because the output layer only has a single neuron, we can optimize this away. In total, running `predict()` uses 100 or so bytes on the stack.

## Is this approach better than using TF Lite?

It depends. :)

We're doing the same task as the TF Lite Micro model but with a lot less memory. There's overhead in loading a .tflite file and running it through an interpreter. By doing everything yourself, you can avoid that overhead. I'm sure this could be optimized much more still.

Does it always make sense to implement the neural network logic by hand? Probably not. For one thing, it's a lot less effort to use a framework such as TF Lite. Without such a framework, you have to hardcode the entire model, and if the model architecture changes, you end up rewriting your logic.

But if you're *really* resource-constrained, "rolling your own" might be the way to go.

## TODO

- Optimize the code? Right now it's just a bunch of `for` loops. I have no idea if the ATmega328P CPU has vector operations that would make the dot products go faster.

- Quantized version: use uint8 weights instead of floats (see the sketch below).
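For that quantization TODO, one possible direction is simple linear quantization with a per-array scale and zero point. This is only a sketch of the idea, not something this repo implements; the scale and zero point values are made up and would have to be computed in the training notebook:

```cpp
#include <avr/pgmspace.h>
#include <stdint.h>

// Hypothetical: W1 stored as uint8_t instead of float, which would shrink
// the weight storage to roughly a quarter. The real quantized values would
// be produced offline; this array is a placeholder.
const uint8_t W1_q[16] PROGMEM = {};
const float W1_scale = 0.0047f;  // made-up example value
const uint8_t W1_zero = 128;     // made-up example value

// Dequantize one weight on the fly: float = (q - zero_point) * scale.
inline float W1(int i) {
  uint8_t q = pgm_read_byte_near(W1_q + i);
  return ((int)q - (int)W1_zero) * W1_scale;
}
```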
## License

Apache License 2.0 (same as TensorFlow)
--------------------------------------------------------------------------------