├── Makefile
├── .gitignore
├── LICENSE
├── download_mnist.sh
├── include
│   ├── neon_ops.h
│   └── mnist_loader.h
├── src
│   ├── neon_ops.c
│   ├── mnist_loader.c
│   └── simple_mnist.c
└── README.md

/Makefile:
--------------------------------------------------------------------------------
1 | CC = gcc
2 | CFLAGS = -Wall -Wextra -O3 -march=native -Iinclude
3 | LDFLAGS = -lm
4 | 
5 | SRCS = src/simple_mnist.c src/mnist_loader.c src/neon_ops.c
6 | OBJS = $(SRCS:.c=.o)
7 | TARGET = simple_mnist
8 | 
9 | all: $(TARGET)
10 | 
11 | $(TARGET): $(OBJS)
12 | 	$(CC) $(OBJS) -o $(TARGET) $(LDFLAGS)
13 | 
14 | %.o: %.c
15 | 	$(CC) $(CFLAGS) -c $< -o $@
16 | 
17 | clean:
18 | 	rm -f $(OBJS) $(TARGET)
19 | 
20 | .PHONY: all clean
21 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled binaries and object files
2 | *.o
3 | *.out
4 | *.exe
5 | simple_mnist
6 | elastic_mnist
7 | model*.bin
8 | 
9 | # Elastic MNIST files (not ready for GitHub yet)
10 | src/elastic_mnist.c
11 | src/elastic_mnist_loader.c
12 | include/elastic_mnist_loader.h
13 | Makefile.elastic
14 | 
15 | # Build directories
16 | build/
17 | dist/
18 | bin/
19 | 
20 | # Editor-specific files
21 | .vscode/
22 | .idea/
23 | *.swp
24 | *.swo
25 | *~
26 | 
27 | # OS-specific files
28 | .DS_Store
29 | Thumbs.db
30 | .directory
31 | 
32 | # MNIST data files (users should download these themselves)
33 | mnist_data/
34 | 
35 | # Backup files
36 | backups/
37 | 
38 | # Log files
39 | *.log
40 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 tsotchke
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/download_mnist.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Download and prepare the MNIST dataset for SIMPLE-MNIST
3 | # This script downloads the MNIST dataset from Google Cloud Storage,
4 | # which is a reliable mirror of the original dataset.
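# For reference, the extracted IDX files should have these exact sizes
# (image files carry a 16-byte header, label files an 8-byte header):
#   train-images-idx3-ubyte : 16 + 60000*784 = 47,040,016 bytes
#   train-labels-idx1-ubyte :  8 + 60000     =     60,008 bytes
#   t10k-images-idx3-ubyte  : 16 + 10000*784 =  7,840,016 bytes
#   t10k-labels-idx1-ubyte  :  8 + 10000     =     10,008 bytes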
5 | 
6 | # Print colored status messages
7 | GREEN='\033[0;32m'
8 | RED='\033[0;31m'
9 | BLUE='\033[0;34m'
10 | NC='\033[0m' # No Color
11 | 
12 | # Create a directory for the dataset
13 | mkdir -p mnist_data
14 | cd mnist_data
15 | 
16 | # Download MNIST files directly from Google Cloud Storage
17 | echo -e "${BLUE}Downloading MNIST dataset from Google Cloud Storage...${NC}"
18 | echo "This may take a few moments depending on your internet connection."
19 | 
20 | files=(
21 |     "train-images-idx3-ubyte.gz"
22 |     "train-labels-idx1-ubyte.gz"
23 |     "t10k-images-idx3-ubyte.gz"
24 |     "t10k-labels-idx1-ubyte.gz"
25 | )
26 | 
27 | for file in "${files[@]}"; do
28 |     echo -e "Downloading ${file}..."
29 |     wget -q --show-progress https://storage.googleapis.com/cvdf-datasets/mnist/${file}
30 | 
31 |     # Check if download was successful
32 |     if [ $? -ne 0 ]; then
33 |         echo -e "${RED}Error: Failed to download ${file}${NC}"
34 |         echo "Please check your internet connection and try again."
35 |         exit 1
36 |     fi
37 | done
38 | 
39 | # Extract the files
40 | echo -e "\n${BLUE}Extracting files...${NC}"
41 | for file in "${files[@]}"; do
42 |     echo "Extracting ${file}..."
43 |     gunzip -f ${file}
44 | done
45 | 
46 | # Verify files
47 | missing_files=0
48 | for file in "train-images-idx3-ubyte" "train-labels-idx1-ubyte" "t10k-images-idx3-ubyte" "t10k-labels-idx1-ubyte"; do
49 |     if [ ! -f ${file} ]; then
50 |         echo -e "${RED}Error: ${file} is missing${NC}"
51 |         missing_files=$((missing_files + 1))
52 |     fi
53 | done
54 | 
55 | if [ ${missing_files} -gt 0 ]; then
56 |     echo -e "${RED}Error: Failed to extract ${missing_files} MNIST files${NC}"
57 |     exit 1
58 | fi
59 | 
60 | # Display file sizes and information
61 | echo -e "\n${BLUE}MNIST Dataset Information:${NC}"
62 | echo -e "File sizes:"
63 | ls -lh
64 | 
65 | # Calculate total size (the extracted files end in "-ubyte", not ".ubyte")
66 | total_size=$(du -ch *-ubyte | grep total | cut -f1)
67 | echo -e "\nTotal dataset size: ${total_size}"
68 | 
69 | # Print summary statistics
70 | train_images_size=$(stat -f%z "train-images-idx3-ubyte" 2>/dev/null || stat --format="%s" "train-images-idx3-ubyte")
71 | train_images_count=$((${train_images_size} - 16))
72 | train_images_count=$((${train_images_count} / 784))
73 | 
74 | test_images_size=$(stat -f%z "t10k-images-idx3-ubyte" 2>/dev/null || stat --format="%s" "t10k-images-idx3-ubyte")
75 | test_images_count=$((${test_images_size} - 16))
76 | test_images_count=$((${test_images_count} / 784))
77 | 
78 | echo -e "\nDataset contains:"
79 | echo "- ${train_images_count} training images (28x28 pixels)"
80 | echo "- ${test_images_count} test images (28x28 pixels)"
81 | 
82 | echo -e "\n${GREEN}MNIST dataset has been downloaded and prepared successfully${NC}"
83 | echo -e "Path to MNIST data: $(pwd)"
84 | echo -e "You can use this path as an argument to simple_mnist:"
85 | echo -e "${BLUE}./simple_mnist $(pwd)${NC}"
86 | echo -e "Or if you're in the build directory:"
87 | echo -e "${BLUE}./simple_mnist ./mnist_data${NC}"
88 | 
89 | exit 0
90 | 
--------------------------------------------------------------------------------
/include/neon_ops.h:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file neon_ops.h
3 |  * @brief ARM Neon optimized operations for neural network computations
4 |  *
5 |  * This file contains optimized implementations of common neural network
6 |  * operations using ARM Neon SIMD instructions, specifically targeting
7 |  * Apple Silicon (M1/M2/M3) processors.
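 * On non-ARM builds, every routine falls back to a portable scalar C
 * implementation (see src/neon_ops.c), so callers may use this API
 * unconditionally.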
8 |  */
9 | 
10 | #ifndef NEON_OPS_H
11 | #define NEON_OPS_H
12 | 
13 | #include <stdbool.h>
14 | #include <stdint.h>
15 | #include <stddef.h>
16 | 
17 | /**
18 |  * @brief Detect if ARM Neon is available at runtime
19 |  *
20 |  * @return true if ARM Neon is available, false otherwise
21 |  */
22 | bool neon_available(void);
23 | 
24 | /**
25 |  * @brief Matrix multiplication optimized with ARM Neon
26 |  *
27 |  * Computes C = A * B where:
28 |  * A is an M x K matrix
29 |  * B is a K x N matrix
30 |  * C is an M x N matrix
31 |  *
32 |  * @param A Pointer to matrix A (row-major)
33 |  * @param B Pointer to matrix B (row-major)
34 |  * @param C Pointer to output matrix C (row-major)
35 |  * @param M Number of rows in A and C
36 |  * @param N Number of columns in B and C
37 |  * @param K Number of columns in A and rows in B
38 |  */
39 | void neon_matrix_multiply(const float *A, const float *B, float *C, int M, int N, int K);
40 | 
41 | /**
42 |  * @brief Matrix-vector multiplication optimized with ARM Neon
43 |  *
44 |  * Computes y = A * x where:
45 |  * A is an M x N matrix
46 |  * x is a vector of length N
47 |  * y is a vector of length M
48 |  *
49 |  * @param A Pointer to matrix A (row-major)
50 |  * @param x Pointer to vector x
51 |  * @param y Pointer to output vector y
52 |  * @param M Number of rows in A
53 |  * @param N Number of columns in A
54 |  */
55 | void neon_matrix_vector_multiply(const float *A, const float *x, float *y, int M, int N);
56 | 
57 | /**
58 |  * @brief Apply ReLU activation function using ARM Neon
59 |  *
60 |  * Computes ReLU(x) element-wise on input vector
61 |  *
62 |  * @param x Pointer to input vector
63 |  * @param y Pointer to output vector (can be the same as x for in-place)
64 |  * @param n Length of vectors
65 |  */
66 | void neon_relu(const float *x, float *y, int n);
67 | 
68 | /**
69 |  * @brief Apply elementwise multiply using ARM Neon
70 |  *
71 |  * Computes z[i] = x[i] * y[i] for all i
72 |  *
73 |  * @param x Pointer to first input vector
74 |  * @param y Pointer to second input vector
75 |  * @param z Pointer to output vector
76 |  * @param n Length of vectors
77 |  */
78 | void neon_elementwise_multiply(const float *x, const float *y, float *z, int n);
79 | 
80 | /**
81 |  * @brief Compute exponential function using ARM Neon
82 |  *
83 |  * Computes y[i] = exp(x[i]) for all i
84 |  *
85 |  * @param x Pointer to input vector
86 |  * @param y Pointer to output vector
87 |  * @param n Length of vectors
88 |  */
89 | void neon_exp(const float *x, float *y, int n);
90 | 
91 | /**
92 |  * @brief Compute softmax function using ARM Neon
93 |  *
94 |  * Computes softmax of input vector x
95 |  *
96 |  * @param x Pointer to input vector
97 |  * @param y Pointer to output vector
98 |  * @param n Length of vectors
99 |  */
100 | void neon_softmax(const float *x, float *y, int n);
101 | 
102 | 
103 | #endif /* NEON_OPS_H */
104 | 
--------------------------------------------------------------------------------
/include/mnist_loader.h:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file mnist_loader.h
3 |  * @brief Functions for loading and preprocessing MNIST data
4 |  */
5 | 
6 | #ifndef MNIST_LOADER_H
7 | #define MNIST_LOADER_H
8 | 
9 | #include <stdbool.h>
10 | #include <stddef.h>
11 | #include <stdint.h>
12 | 
13 | // MNIST dataset constants
14 | #define MNIST_TRAIN_SIZE 60000
15 | #define MNIST_TEST_SIZE 10000
16 | #define MNIST_IMG_SIZE 784 // 28x28 pixels
17 | #define MNIST_NUM_CLASSES 10
18 | 
19 | /**
20 |  * @brief MNIST dataset structure
21 |  */
22 | typedef struct {
23 |     float *train_images;  // Training images [60000 x 784]
24 |     int *train_labels;    // Training labels [60000]
25 |     float *test_images;   // Test images [10000 x 784]
26 |     int *test_labels;     // Test labels [10000]
27 |     int num_train;        // Number of training samples
28 |     int num_test;         // Number of test samples
29 |     int image_size;       // Size of each image (pixels)
30 |     int num_classes;      // Number of classes
31 | } MNISTData;
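/*
 * Typical usage of this loader API (sketch; error handling elided):
 *
 *     MNISTData data;
 *     if (load_mnist("./mnist_data", &data)) {
 *         print_mnist_summary(&data);
 *         display_mnist_digit(data.train_images, data.train_labels[0]);
 *         // pixel j of training image i lives at train_images[i * 784 + j]
 *         free_mnist_data(&data);
 *     }
 */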
32 | 
33 | /**
34 |  * @brief Load MNIST dataset from binary files
35 |  *
36 |  * @param base_path Path to directory containing MNIST files
37 |  * @param data Pointer to MNISTData structure to fill
38 |  * @return true if loading was successful, false otherwise
39 |  */
40 | bool load_mnist(const char *base_path, MNISTData *data);
41 | 
42 | /**
43 |  * @brief Free memory allocated for MNIST data
44 |  *
45 |  * @param data Pointer to MNISTData structure
46 |  */
47 | void free_mnist_data(MNISTData *data);
48 | 
49 | /**
50 |  * @brief Preprocess MNIST images (normalize, enhance contrast, etc.)
51 |  *
52 |  * @param images Array of images
53 |  * @param num_images Number of images
54 |  * @param image_size Size of each image
55 |  * @param use_neon Whether to use ARM Neon optimizations
56 |  */
57 | void preprocess_mnist_images(float *images, int num_images, int image_size, bool use_neon);
58 | 
59 | /**
60 |  * @brief Create a mini-batch from MNIST training data
61 |  *
62 |  * @param data MNIST data structure
63 |  * @param batch_indices Array of indices for the batch
64 |  * @param batch_size Size of the batch
65 |  * @param batch_images Output array for batch images [batch_size x image_size]
66 |  * @param batch_labels Output array for batch labels [batch_size]
67 |  */
68 | void create_mnist_batch(const MNISTData *data, const int *batch_indices, int batch_size,
69 |                         float **batch_images, int *batch_labels);
70 | 
71 | /**
72 |  * @brief Helper function to display MNIST digit in ASCII art
73 |  *
74 |  * @param image Pointer to image data (784 pixels)
75 |  * @param label Label of the image
76 |  */
77 | void display_mnist_digit(const float *image, int label);
78 | 
79 | /**
80 |  * @brief Helper function to read IDX file format
81 |  *
82 |  * @param filename IDX file to read
83 |  * @param data Pointer to buffer to store data (must be pre-allocated)
84 |  * @param size Size of data to read
85 |  * @param magic_expected Expected magic number
86 |  * @return true if reading was successful, false otherwise
87 |  */
88 | bool read_idx_file(const char *filename, void *data, size_t size, uint32_t magic_expected);
89 | 
90 | /**
91 |  * @brief Display summary statistics for MNIST dataset
92 |  *
93 |  * @param data MNIST data structure
94 |  */
95 | void print_mnist_summary(const MNISTData *data);
96 | 
97 | /**
98 |  * @brief Shuffle indices for batch creation
99 |  *
100 |  * @param indices Array of indices to shuffle
101 |  * @param n Number of indices
102 |  */
103 | void shuffle_indices(int *indices, int n);
104 | 
105 | #endif /* MNIST_LOADER_H */
106 | 
--------------------------------------------------------------------------------
/src/neon_ops.c:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file neon_ops.c
3 |  * @brief Implementation of ARM Neon optimized operations
4 |  */
5 | 
6 | #include "neon_ops.h"
7 | #include <stdio.h>
8 | #include <stdlib.h>
9 | #include <string.h>
10 | #include <math.h>
11 | 
12 | // Check if we're on an ARM platform and include ARM Neon headers
13 | #if defined(__ARM_NEON) || defined(__ARM_NEON__)
14 | #include <arm_neon.h>
15 | #define HAVE_NEON 1
16 | #else
17 | #define HAVE_NEON 0
18 | #endif
19 | 
20 | bool neon_available(void) {
21 | #if HAVE_NEON
22 |     return true;
23 | #else
24 |     return false;
25 | #endif
26 | }
27 | 
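/*
 * Note: this "runtime" check is actually resolved at compile time -- the
 * binary reports whatever the target it was built for supports. A true
 * runtime probe would query the OS (for example getauxval(AT_HWCAP) on
 * Linux, or sysctlbyname("hw.optional.neon", ...) on macOS); that is left
 * out here to keep the code dependency-free.
 */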
28 | // Fallback implementation for non-ARM platforms
29 | static void __attribute__((unused)) fallback_matrix_multiply(const float *A, const float *B, float *C, int M, int N, int K) {
30 |     // Initialize C to zeros
31 |     memset(C, 0, M * N * sizeof(float));
32 | 
33 |     // Compute C = A * B
34 |     for (int i = 0; i < M; i++) {
35 |         for (int k = 0; k < K; k++) {
36 |             for (int j = 0; j < N; j++) {
37 |                 C[i * N + j] += A[i * K + k] * B[k * N + j];
38 |             }
39 |         }
40 |     }
41 | }
42 | 
43 | void neon_matrix_multiply(const float *A, const float *B, float *C, int M, int N, int K) {
44 | #if HAVE_NEON
45 |     // Initialize C to zeros
46 |     memset(C, 0, M * N * sizeof(float));
47 | 
48 |     // For each row of A, compute 4 columns of C at a time when possible
49 |     for (int i = 0; i < M; i++) {
50 |         for (int j = 0; j < N; j += 4) {
51 |             if (j + 4 <= N) {
52 |                 // 4 output values at a time
53 |                 float32x4_t c_val = vdupq_n_f32(0.0f);
54 | 
55 |                 for (int k = 0; k < K; k++) {
56 |                     float32x4_t b_val = vld1q_f32(&B[k * N + j]);
57 |                     float32x4_t a_val = vdupq_n_f32(A[i * K + k]);
58 |                     c_val = vmlaq_f32(c_val, a_val, b_val);
59 |                 }
60 | 
61 |                 vst1q_f32(&C[i * N + j], c_val);
62 |             } else {
63 |                 // Handle remaining columns (less than 4)
64 |                 for (int jj = j; jj < N; jj++) {
65 |                     float sum = 0.0f;
66 |                     for (int k = 0; k < K; k++) {
67 |                         sum += A[i * K + k] * B[k * N + jj];
68 |                     }
69 |                     C[i * N + jj] = sum;
70 |                 }
71 |             }
72 |         }
73 |     }
74 | #else
75 |     fallback_matrix_multiply(A, B, C, M, N, K);
76 | #endif
77 | }
78 | 
79 | void neon_matrix_vector_multiply(const float *A, const float *x, float *y, int M, int N) {
80 | #if HAVE_NEON
81 |     // Compute one dot product per row of A
82 |     for (int i = 0; i < M; i++) {
83 |         float32x4_t sum_vec = vdupq_n_f32(0.0f);
84 |         int j = 0;
85 | 
86 |         // Process 4 elements at a time
87 |         for (; j <= N - 4; j += 4) {
88 |             float32x4_t a_vec = vld1q_f32(&A[i * N + j]);
89 |             float32x4_t x_vec = vld1q_f32(&x[j]);
90 |             sum_vec = vmlaq_f32(sum_vec, a_vec, x_vec);
91 |         }
92 | 
93 |         // Extract the sum
94 |         float sum = vgetq_lane_f32(sum_vec, 0) + vgetq_lane_f32(sum_vec, 1) +
95 |                     vgetq_lane_f32(sum_vec, 2) + vgetq_lane_f32(sum_vec, 3);
96 | 
97 |         // Handle remaining elements
98 |         for (; j < N; j++) {
99 |             sum += A[i * N + j] * x[j];
100 |         }
101 | 
102 |         y[i] = sum;
103 |     }
104 | #else
105 |     // Fallback implementation
106 |     for (int i = 0; i < M; i++) {
107 |         float sum = 0.0f;
108 |         for (int j = 0; j < N; j++) {
109 |             sum += A[i * N + j] * x[j];
110 |         }
111 |         y[i] = sum;
112 |     }
113 | #endif
114 | }
115 | 
116 | void neon_relu(const float *x, float *y, int n) {
117 | #if HAVE_NEON
118 |     const float32x4_t zero = vdupq_n_f32(0.0f);
119 |     int i = 0;
120 | 
121 |     // Process 4 elements at a time
122 |     for (; i <= n - 4; i += 4) {
123 |         float32x4_t x_vec = vld1q_f32(&x[i]);
124 |         float32x4_t y_vec = vmaxq_f32(x_vec, zero);
125 |         vst1q_f32(&y[i], y_vec);
126 |     }
127 | 
128 |     // Handle remaining elements
129 |     for (; i < n; i++) {
130 |         y[i] = fmaxf(x[i], 0.0f);
131 |     }
132 | #else
133 |     // Fallback implementation
134 |     for (int i = 0; i < n; i++) {
135 |         y[i] = fmaxf(x[i], 0.0f);
136 |     }
137 | #endif
138 | }
139 | 
140 | void neon_elementwise_multiply(const float *x, const float *y, float *z, int n) {
141 | #if HAVE_NEON
142 |     int i = 0;
143 | 
144 |     // Process 4 elements at a time
145 |     for (; i <= n - 4; i += 4) {
146 |         float32x4_t x_vec = vld1q_f32(&x[i]);
147 |         float32x4_t y_vec = vld1q_f32(&y[i]);
148 |         float32x4_t z_vec = vmulq_f32(x_vec, y_vec);
149 |         vst1q_f32(&z[i], z_vec);
150 |     }
151 | 
152 |     // Handle remaining elements
153 |     for (; i < n; i++) {
154 |         z[i] = x[i] * y[i];
155 |     }
156 | #else
157 |     // Fallback implementation
158 |     for (int i = 0; i < n; i++) {
159 |         z[i] = x[i] * y[i];
160 |     }
161 | #endif
162 | }
163 | 
164 | #if HAVE_NEON
165 | // Fast approximation of exp function using ARM Neon
166 | static float32x4_t exp_ps(float32x4_t x) {
167 |     // exp(x) = 2^n * 2^p, where z = x * log2(e), n = trunc(z), p = z - n.
168 |     // 2^p is approximated on (-1, 1) by the degree-5 Taylor polynomial
169 |     // 1 + C1*p + C2*p^2 + C3*p^3 + C4*p^4 + C5*p^5, with Ck = ln(2)^k / k!.
170 |     const float32x4_t LOG2EF = vdupq_n_f32(1.442695040f);
171 |     const float32x4_t ONE = vdupq_n_f32(1.0f);
172 |     const float32x4_t C1 = vdupq_n_f32(0.693147182f);
173 |     const float32x4_t C2 = vdupq_n_f32(0.240226337f);
174 |     const float32x4_t C3 = vdupq_n_f32(0.055504110f);
175 |     const float32x4_t C4 = vdupq_n_f32(0.009618129f);
176 |     const float32x4_t C5 = vdupq_n_f32(0.001333355f);
177 | 
178 |     // Clamp input to avoid overflow
179 |     const float32x4_t max_val = vdupq_n_f32(88.3762626647949f);
180 |     const float32x4_t min_val = vdupq_n_f32(-88.3762626647949f);
181 |     x = vminq_f32(vmaxq_f32(x, min_val), max_val);
182 | 
183 |     // Scale by log2(e)
184 |     float32x4_t z = vmulq_f32(x, LOG2EF);
185 | 
186 |     // Integer part, truncated toward zero (the fraction p then stays in (-1, 1))
187 |     float32x4_t n = vcvtq_f32_s32(vcvtq_s32_f32(z));
188 |     float32x4_t p = vsubq_f32(z, n);
189 | 
190 |     // Evaluate the polynomial with Horner's scheme; vmlaq_f32(a, b, c) = a + b*c
191 |     float32x4_t result = vmlaq_f32(C4, C5, p);   // C4 + C5*p
192 |     result = vmlaq_f32(C3, result, p);
193 |     result = vmlaq_f32(C2, result, p);
194 |     result = vmlaq_f32(C1, result, p);
195 |     result = vmlaq_f32(ONE, result, p);          // 1 + p*(C1 + p*(...))
196 | 
197 |     // Reconstruct exp(x) = 2^n * 2^p by adding the integer n
198 |     // to the exponent bits of the float 1.0f (0x3f800000)
199 |     const int32x4_t pow2n = vshlq_n_s32(vcvtq_s32_f32(n), 23);
200 |     const float32x4_t pow2 = vreinterpretq_f32_s32(vaddq_s32(pow2n, vdupq_n_s32(0x3f800000)));
201 |     result = vmulq_f32(result, pow2);
202 | 
203 |     return result;
204 | }
205 | #endif
206 | 
207 | void neon_exp(const float *x, float *y, int n) {
208 | #if HAVE_NEON
209 |     int i = 0;
210 | 
211 |     // Process 4 elements at a time
212 |     for (; i <= n - 4; i += 4) {
213 |         float32x4_t x_vec = vld1q_f32(&x[i]);
214 |         float32x4_t y_vec = exp_ps(x_vec);
215 |         vst1q_f32(&y[i], y_vec);
216 |     }
217 | 
218 |     // Handle remaining elements with standard exp
219 |     for (; i < n; i++) {
220 |         y[i] = expf(x[i]);
221 |     }
222 | #else
223 |     // Fallback implementation
224 |     for (int i = 0; i < n; i++) {
225 |         y[i] = expf(x[i]);
226 |     }
227 | #endif
228 | }
229 | 
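/*
 * Quick numerical sanity check for the approximation (reference values
 * from expf): neon_exp of {0, 1, -1, 10} should land close to
 * {1.0, 2.71828, 0.36788, 22026.47}. With the truncation-based range
 * split the relative error stays roughly in the 1e-4 to 1e-3 range,
 * which is ample for the softmax normalization below.
 */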
230 | void neon_softmax(const float *x, float *y, int n) {
231 | #if HAVE_NEON
232 |     // Find max value for numerical stability
233 |     float max_val = x[0];
234 |     for (int i = 1; i < n; i++) {
235 |         if (x[i] > max_val) {
236 |             max_val = x[i];
237 |         }
238 |     }
239 | 
240 |     // Compute exp(x - max) and sum
241 |     float sum = 0.0f;
242 |     float *exp_values = (float *)malloc(n * sizeof(float));
243 | 
244 |     for (int i = 0; i < n; i++) {
245 |         exp_values[i] = x[i] - max_val;
246 |     }
247 | 
248 |     neon_exp(exp_values, exp_values, n);
249 | 
250 |     for (int i = 0; i < n; i++) {
251 |         sum += exp_values[i];
252 |     }
253 | 
254 |     // Normalize to get probabilities
255 |     float32x4_t sum_vec = vdupq_n_f32(sum);
256 |     int i = 0;
257 | 
258 |     // Process 4 elements at a time
259 |     for (; i <= n - 4; i += 4) {
260 |         float32x4_t exp_vec = vld1q_f32(&exp_values[i]);
261 |         float32x4_t y_vec = vdivq_f32(exp_vec, sum_vec);
262 |         vst1q_f32(&y[i], y_vec);
263 |     }
264 | 
265 |     // Handle remaining elements
266 |     for (; i < n; i++) {
267 |         y[i] = exp_values[i] / sum;
268 |     }
269 | 
270 |     free(exp_values);
271 | #else
272 |     // Fallback implementation
273 |     // Find max value for numerical stability
274 |     float max_val = x[0];
275 |     for (int i = 1; i < n; i++) {
276 |         if (x[i] > max_val) {
277 |             max_val = x[i];
278 |         }
279 |     }
280 | 
281 |     // Compute exp(x - max) and sum
282 |     float sum = 0.0f;
283 |     for (int i = 0; i < n; i++) {
284 |         y[i] = expf(x[i] - max_val);
285 |         sum += y[i];
286 |     }
287 | 
288 |     // Normalize to get probabilities
289 |     for (int i = 0; i < n; i++) {
290 |         y[i] /= sum;
291 |     }
292 | #endif
293 | }
294 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 🔢 Simple MNIST Neural Network
2 | 
3 | A sophisticated yet minimalist implementation of a pure feedforward neural network for MNIST digit recognition, written entirely in C. This implementation achieves a remarkable >99% accuracy on the MNIST test set without using convolutional layers, demonstrating the power of a carefully designed architecture and modern training techniques. This project is a testament to the elegance of fundamental neural network principles when applied with precision and care.
4 | 
5 | ## ✨ What Makes This Implementation Special
6 | 
7 | Most high-accuracy MNIST implementations today rely on convolutional neural networks (CNNs), which have built-in spatial inductive biases. This implementation takes a different approach, proving that sometimes the classics, when finely tuned, can rival modern approaches:
8 | 
9 | - **Pure Feedforward Architecture**: Achieves CNN-like performance using only fully-connected layers
10 | - **Optimized Neuron Allocation**: Strategic distribution of neurons across layers (784→512→256→10) balances capacity and generalization
11 | - **Modern Optimization Stack**: Incorporates research-backed techniques typically found in deep learning frameworks
12 |   - Conservative learning rate (0.01) with cosine annealing
13 |   - Effective regularization (L2 weight decay: 2e-5)
14 |   - Efficient batch processing (128 images per batch)
15 | - **Minimal Dependencies**: Relies only on standard C libraries and math.h
16 | - **Readable Implementation**: Clean, well-documented code that serves as an educational resource
17 | - **Performance-Focused**: Optional SIMD optimizations for ARM processors
18 | 
19 | ## 🔬 Technical Deep Dive
20 | 
21 | ### 🧩 Network Architecture
22 | 
23 | The network architecture has been meticulously designed to extract hierarchical features from digit images:
24 | 
25 | 1. **Input Layer (784 neurons)**
26 |    - Represents the flattened 28×28 pixel images
27 |    - Each neuron corresponds to a single pixel intensity value
28 | 
29 | 2. **First Hidden Layer (512 neurons)**
30 |    - Captures low-level features like edges, curves, and line segments
31 |    - Size carefully chosen to provide sufficient representational capacity without overfitting
32 |    - ReLU activation enables learning of non-linear patterns
33 |    - He initialization ensures proper gradient flow during early training
34 | 
35 | 3. **Second Hidden Layer (256 neurons)**
36 |    - Builds higher-level abstractions by combining low-level features
37 |    - Detects digit-specific patterns like loops, intersections, and stroke patterns
38 |    - Reduced size creates an information bottleneck that forces generalization
39 |    - ReLU activation maintains sparse representations
40 | 
41 | 4. **Output Layer (10 neurons)**
42 |    - One neuron per digit class (0-9)
43 |    - Softmax activation provides normalized probability distribution
44 |    - Xavier/Glorot initialization optimized for softmax outputs
45 | 
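To make the layer sizes concrete, here is the arithmetic behind this architecture (the constant names mirror those defined in `src/simple_mnist.c`):

```c
#define INPUT_SIZE   784   // 28 x 28 pixels
#define HIDDEN1_SIZE 512
#define HIDDEN2_SIZE 256
#define OUTPUT_SIZE  10

// Weights per layer (row-major [out x in]) plus one bias per output neuron:
//   784 * 512 = 401,408  (+ 512 biases)
//   512 * 256 = 131,072  (+ 256 biases)
//   256 * 10  =   2,560  (+  10 biases)
// Total: 535,040 weights + 778 biases = 535,818 trainable parameters
```
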
46 | ### 🚀 Advanced Training Techniques
47 | 
48 | The implementation incorporates numerous advanced techniques that are crucial for achieving high accuracy - the secret sauce that makes this network shine:
49 | 
50 | #### 🎯 Initialization Strategies
51 | - **He Initialization** for ReLU layers: Weights initialized with variance scaled by `sqrt(2/n_inputs)` to maintain gradient magnitude
52 | - **Xavier/Glorot Initialization** for output layer: Optimized for linear/softmax activations with variance scaled by `sqrt(2/(n_inputs + n_outputs))`
53 | 
54 | #### 🔥 Activation Functions
55 | - **ReLU (Rectified Linear Unit)**: `f(x) = max(0, x)` for hidden layers
56 |   - Sparse activation (typically 50-60% of neurons are active)
57 |   - Mitigates vanishing gradient problem
58 |   - Computationally efficient
59 | - **Softmax**: `σ(z)ᵢ = exp(zᵢ) / Σⱼ exp(zⱼ)` for output layer
60 |   - Numerically stable implementation with max subtraction
61 |   - Provides proper probability distribution
62 | 
63 | #### 🛡️ Regularization Techniques
64 | - **L2 Weight Decay**: Penalizes large weights with coefficient `2e-5`
65 |   - Encourages smoother decision boundaries
66 |   - Improves generalization by reducing model complexity
67 |   - Prevents overfitting with larger network capacity
68 | - **Data Augmentation**: Random shifts (±2 pixels) applied during training
69 |   - Increases effective training set size
70 |   - Improves robustness to translation variations
71 |   - Implemented efficiently with in-place operations
72 | 
73 | #### ⚡ Optimization Approach
74 | - **Mini-batch Gradient Descent**: Processes 128 images per batch
75 |   - Balances computational efficiency and update frequency
76 |   - Introduces beneficial noise for escaping local minima
77 | - **Cosine Learning Rate Annealing**: `lr = initial_lr * 0.5 * (1 + cos(π * epoch/max_epochs))`
78 |   - Starts with conservative steps (0.01) for stable convergence
79 |   - Gradually reduces step size for fine-tuning
80 |   - Smooth transition prevents oscillation near optima
81 | 
82 | #### 🔄 Backpropagation Implementation
83 | - **Efficient Gradient Computation**: Directly computes gradients without storing intermediate values
84 | - **Numerical Stability**: Careful implementation to avoid overflow/underflow
85 | - **Vectorized Operations**: Matrix-vector multiplications optimized for cache efficiency
86 | 
87 | ### 📈 Performance Characteristics
88 | 
89 | The network demonstrates impressive learning dynamics that would make many complex architectures envious:
90 | 
91 | - **Rapid Initial Learning**: ~96.8% accuracy after just one epoch
92 | - **Steady Improvement**: Reaches ~98% by epoch 3-5
93 | - **Fast Convergence**: >98.4% accuracy by epoch 7, exceeding 99% within 50 epochs
94 | - **Generalization**: Minimal gap between training and test accuracy (~0.3%)
95 | - **Stability**: Consistent improvement with minimal fluctuations
96 | 
97 | ### 💾 Memory Efficiency
98 | 
99 | The implementation is designed to be remarkably memory-efficient, making it suitable even for resource-constrained environments:
100 | 
101 | - **Total Parameters**: 535,040 weights + 778 biases (≈535,818 parameters)
102 |   - Input→Hidden1: 784×512 = 401,408 weights + 512 biases
103 |   - Hidden1→Hidden2: 512×256 = 131,072 weights + 256 biases
104 |   - Hidden2→Output: 256×10 = 2,560 weights + 10 biases
105 | - **Memory Footprint**: ~2.1MB during training
106 | - **Batch Processing**: Efficiently processes 128 images per batch
107 | 
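The footprint figure follows directly from the parameter count; a quick back-of-the-envelope check (assuming 4-byte `float` storage):

```c
// 535,818 parameters * sizeof(float) = 535,818 * 4 bytes
//                                    = 2,143,272 bytes ~ 2.04 MiB
// Activations add only (512 + 256 + 10) * 4 bytes ~ 3 KiB per sample.
```
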
108 | ### 🚄 SIMD Optimizations (Optional)
109 | 
110 | For ARM processors, the implementation includes NEON SIMD optimizations that turbocharge performance:
111 | 
112 | - **Vectorized Matrix Operations**: Process four single-precision floats per NEON instruction
113 | - **Optimized Activation Functions**: Parallel ReLU and softmax computations
114 | - **Efficient Data Preprocessing**: Per-image normalization structured for easy vectorization
115 | 
116 | ## 🔍 Comparison with CNNs
117 | 
118 | While convolutional neural networks (CNNs) are the standard approach for image classification today, this pure feedforward implementation boldly challenges that convention and demonstrates that:
119 | 
120 | 1. Well-designed fully-connected networks can achieve comparable performance on constrained problems
121 | 2. The inductive bias of CNNs can be partially compensated by:
122 |    - Proper regularization
123 |    - Data augmentation
124 |    - Careful architecture design
125 | 3. Feedforward networks can be more efficient for deployment in certain scenarios
126 | 
127 | ## 📋 Requirements
128 | 
129 | - C compiler (gcc/clang)
130 | - make
131 | - wget (for downloading MNIST dataset)
132 | - ~100MB disk space for MNIST dataset
133 | 
134 | ## 🚀 Quick Start
135 | 
136 | 1. Clone the repository:
137 | ```bash
138 | git clone https://github.com/tsotchke/simple_mnist.git
139 | cd simple_mnist
140 | ```
141 | 
142 | 2. Download the MNIST dataset:
143 | ```bash
144 | ./download_mnist.sh
145 | ```
146 | 
147 | 3. Build and run:
148 | ```bash
149 | make
150 | ./simple_mnist mnist_data
151 | ```
152 | 
153 | Add `--verbose` flag for detailed statistics and confusion matrix:
154 | ```bash
155 | ./simple_mnist mnist_data --verbose
156 | ```
157 | 
158 | ## 📁 Project Structure
159 | 
160 | ```
161 | simple_mnist/
162 | ├── include/              # Header files
163 | │   ├── mnist_loader.h    # MNIST dataset loading and preprocessing
164 | │   └── neon_ops.h        # SIMD optimizations for ARM processors
165 | ├── src/                  # Source files
166 | │   ├── mnist_loader.c    # Dataset handling implementation
167 | │   ├── neon_ops.c        # Optimized math operations
168 | │   └── simple_mnist.c    # Neural network implementation
169 | ├── mnist_data/           # MNIST dataset (after download)
170 | ├── Makefile              # Build configuration
171 | └── download_mnist.sh     # Dataset acquisition script
172 | ```
173 | 
174 | ## 💻 Implementation Highlights
175 | 
176 | ### ⚙️ Efficient Forward Pass
177 | 
178 | ```c
179 | void forward_pass(Network *network, const float *input, float *hidden1, float *hidden2, float *output) {
180 |     // First hidden layer: h1 = relu(W_h1 * x + b_h1)
181 |     for (int i = 0; i < HIDDEN1_SIZE; i++) {
182 |         hidden1[i] = 0.0f;
183 |         for (int j = 0; j < INPUT_SIZE; j++) {
184 |             hidden1[i] += network->hidden1_weights[i * INPUT_SIZE + j] * input[j];
185 |         }
186 |         hidden1[i] += network->hidden1_biases[i];
187 |         // Apply ReLU activation
188 |         hidden1[i] = (hidden1[i] > 0.0f) ? hidden1[i] : 0.0f;
189 |     }
190 | 
191 |     // Second hidden layer and output layer follow similar pattern...
192 | }
193 | ```
194 | 
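For completeness, the elided layers follow the same pattern (taken from `src/simple_mnist.c`; the softmax step is shown in the next snippet):

```c
// Second hidden layer: h2 = relu(W_h2 * h1 + b_h2)
for (int i = 0; i < HIDDEN2_SIZE; i++) {
    hidden2[i] = 0.0f;
    for (int j = 0; j < HIDDEN1_SIZE; j++) {
        hidden2[i] += network->hidden2_weights[i * HIDDEN1_SIZE + j] * hidden1[j];
    }
    hidden2[i] += network->hidden2_biases[i];
    hidden2[i] = (hidden2[i] > 0.0f) ? hidden2[i] : 0.0f;
}

// Output layer pre-activations: z = W_o * h2 + b_o (softmax applied afterwards)
for (int i = 0; i < OUTPUT_SIZE; i++) {
    output[i] = 0.0f;
    for (int j = 0; j < HIDDEN2_SIZE; j++) {
        output[i] += network->output_weights[i * HIDDEN2_SIZE + j] * hidden2[j];
    }
    output[i] += network->output_biases[i];
}
```
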
195 | ### 🧮 Numerically Stable Softmax
196 | 
197 | ```c
198 | // Find max value for numerical stability
199 | float max_val = output[0];
200 | for (int i = 1; i < OUTPUT_SIZE; i++) {
201 |     if (output[i] > max_val) {
202 |         max_val = output[i];
203 |     }
204 | }
205 | 
206 | // Compute exp(x - max) and sum
207 | float sum = 0.0f;
208 | for (int i = 0; i < OUTPUT_SIZE; i++) {
209 |     output[i] = expf(output[i] - max_val);
210 |     sum += output[i];
211 | }
212 | 
213 | // Normalize to get probabilities
214 | for (int i = 0; i < OUTPUT_SIZE; i++) {
215 |     output[i] /= sum;
216 | }
217 | ```
218 | 
219 | ### 🔄 Data Augmentation
220 | 
221 | ```c
222 | void augment_image(const float *input, float *output) {
223 |     // Apply small random shifts (up to 2 pixels in each direction)
224 |     int shift_x = rand() % 5 - 2; // -2 to 2
225 |     int shift_y = rand() % 5 - 2; // -2 to 2
226 | 
227 |     // Clear output
228 |     memset(output, 0, INPUT_SIZE * sizeof(float));
229 | 
230 |     // Apply shift
231 |     for (int y = 0; y < 28; y++) {
232 |         for (int x = 0; x < 28; x++) {
233 |             int new_y = y + shift_y;
234 |             int new_x = x + shift_x;
235 | 
236 |             // Check bounds
237 |             if (new_x >= 0 && new_x < 28 && new_y >= 0 && new_y < 28) {
238 |                 output[new_y * 28 + new_x] = input[y * 28 + x];
239 |             }
240 |         }
241 |     }
242 | }
243 | ```
244 | 
245 | ## 🛠️ Building from Source
246 | 
247 | The project uses a standard Makefile system for easy compilation:
248 | 
249 | ```bash
250 | make        # Build with optimizations
251 | make clean  # Remove build artifacts
252 | ```
253 | 
254 | ## 📄 License
255 | 
256 | MIT License - Copyright (c) 2025 tsotchke - See [LICENSE](LICENSE) for details
257 | 
258 | ## 📚 Citation
259 | 
260 | If you use this implementation in your research or projects, please cite it using the following BibTeX format:
261 | 
262 | ```bibtex
263 | @software{simple_mnist,
264 |   author = {tsotchke},
265 |   title = {Simple MNIST: A Pure Feedforward Neural Network Implementation},
266 |   year = {2025},
267 |   url = {https://github.com/tsotchke/simple_mnist}
268 | }
269 | ```
270 | 
271 | ## 🙏 Acknowledgments
272 | 
273 | - MNIST dataset by Yann LeCun and Corinna Cortes
274 | - Special thanks to the deep learning research community for optimization techniques
275 | - Inspired by the fundamental principles of neural networks that continue to drive innovation
276 | 
--------------------------------------------------------------------------------
/src/mnist_loader.c:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file mnist_loader.c
3 |  * @brief Implementation of MNIST dataset loading functions
4 |  */
5 | 
6 | #include "mnist_loader.h"
7 | #include "neon_ops.h"
8 | #include <stdio.h>
9 | #include <stdlib.h>
10 | #include <string.h>
11 | #include <math.h>
12 | #include <stdint.h>
13 | #include <stdbool.h>
14 | 
15 | // Helper function to swap endianness (MNIST data is big-endian)
16 | static uint32_t swap_endian(uint32_t val) {
17 |     return ((val & 0xFF) << 24) |
18 |            ((val & 0xFF00) << 8) |
19 |            ((val & 0xFF0000) >> 8) |
20 |            ((val & 0xFF000000) >> 24);
21 | }
22 | 
23 | bool read_idx_file(const char *filename, void *data, size_t size, uint32_t magic_expected) {
24 |     FILE *file = fopen(filename, "rb");
25 |     if (!file) {
26 |         fprintf(stderr, "Error: Failed to open %s\n", filename);
27 |         return false;
28 |     }
29 | 
30 |     // Read magic number
31 |     uint32_t magic;
32 |     if (fread(&magic, sizeof(uint32_t), 1, file) != 1) {
33 |         fprintf(stderr, "Error: Failed to read magic number from %s\n", filename);
34 |         fclose(file);
35 |         return false;
36 |     }
37 | 
38 |     magic = swap_endian(magic);
39 |     if (magic != magic_expected) {
40 |         fprintf(stderr, "Error: Invalid magic number in %s (expected 0x%08x, got 0x%08x)\n",
41 |                 filename, magic_expected, magic);
42 |         fclose(file);
43 |         return false;
44 |     }
45 | 
46 |     // Skip dimensions (we know the expected sizes)
47 |     uint32_t dimensions[3];
48 |     int dim_count = (magic_expected == 0x00000803) ? 3 : 1; // 3 dims for images, 1 for labels
49 | 
50 |     if (fread(dimensions, sizeof(uint32_t), dim_count, file) != (size_t)dim_count) {
51 |         fprintf(stderr, "Error: Failed to read dimensions from %s\n", filename);
52 |         fclose(file);
53 |         return false;
54 |     }
55 | 
56 |     // Read the data
57 |     if (fread(data, 1, size, file) != size) {
58 |         fprintf(stderr, "Error: Failed to read data from %s\n", filename);
59 |         fclose(file);
60 |         return false;
61 |     }
62 | 
63 |     fclose(file);
64 |     return true;
65 | }
66 | 
67 | bool load_mnist(const char *base_path, MNISTData *data) {
68 |     char path[512];
69 | 
70 |     // Default MNIST dataset dimensions
71 |     data->num_train = MNIST_TRAIN_SIZE;
72 |     data->num_test = MNIST_TEST_SIZE;
73 |     data->image_size = MNIST_IMG_SIZE;
74 |     data->num_classes = MNIST_NUM_CLASSES;
75 | 
76 |     // Allocate memory for MNIST data
77 |     data->train_images = (float *)malloc(data->num_train * data->image_size * sizeof(float));
78 |     data->train_labels = (int *)malloc(data->num_train * sizeof(int));
79 |     data->test_images = (float *)malloc(data->num_test * data->image_size * sizeof(float));
80 |     data->test_labels = (int *)malloc(data->num_test * sizeof(int));
81 | 
82 |     if (!data->train_images || !data->train_labels || !data->test_images || !data->test_labels) {
83 |         fprintf(stderr, "Error: Failed to allocate memory for MNIST data\n");
84 |         free_mnist_data(data);
85 |         return false;
86 |     }
87 | 
88 |     // Temporary buffers for raw data
89 |     unsigned char *train_images_raw = (unsigned char *)malloc(data->num_train * data->image_size);
90 |     unsigned char *test_images_raw = (unsigned char *)malloc(data->num_test * data->image_size);
91 |     unsigned char *train_labels_raw = (unsigned char *)malloc(data->num_train);
92 |     unsigned char *test_labels_raw = (unsigned char *)malloc(data->num_test);
93 | 
94 |     if (!train_images_raw || !test_images_raw || !train_labels_raw || !test_labels_raw) {
95 |         fprintf(stderr, "Error: Failed to allocate memory for raw MNIST data\n");
96 |         free(train_images_raw);
97 |         free(test_images_raw);
98 |         free(train_labels_raw);
99 |         free(test_labels_raw);
100 |         free_mnist_data(data);
101 |         return false;
102 |     }
103 | 
104 |     // Load training images
105 |     sprintf(path, "%s/train-images-idx3-ubyte", base_path);
106 |     if (!read_idx_file(path, train_images_raw, data->num_train * data->image_size, 0x00000803)) {
107 |         free(train_images_raw);
108 |         free(test_images_raw);
109 |         free(train_labels_raw);
110 |         free(test_labels_raw);
111 |         free_mnist_data(data);
112 |         return false;
113 |     }
114 | 
115 |     // Load training labels
116 |     sprintf(path, "%s/train-labels-idx1-ubyte", base_path);
117 |     if (!read_idx_file(path, train_labels_raw, data->num_train, 0x00000801)) {
118 |         free(train_images_raw);
119 |         free(test_images_raw);
120 |         free(train_labels_raw);
121 |         free(test_labels_raw);
122 |         free_mnist_data(data);
123 |         return false;
124 |     }
125 | 
126 |     // Load test images
127 |     sprintf(path, "%s/t10k-images-idx3-ubyte", base_path);
128 |     if (!read_idx_file(path, test_images_raw, data->num_test * data->image_size, 0x00000803)) {
129 |         free(train_images_raw);
130 |         free(test_images_raw);
131 |         free(train_labels_raw);
132 |         free(test_labels_raw);
133 |         free_mnist_data(data);
134 |         return false;
135 |     }
136 | 
137 |     // Load test labels
138 |     sprintf(path, "%s/t10k-labels-idx1-ubyte", base_path);
139 |     if (!read_idx_file(path, test_labels_raw, data->num_test, 0x00000801)) {
140 |         free(train_images_raw);
141 |         free(test_images_raw);
142 |         free(train_labels_raw);
143 |         free(test_labels_raw);
144 |         free_mnist_data(data);
145 |         return false;
146 |     }
147 | 
148 |     // Convert raw data to float/int
149 |     for (int i = 0; i < data->num_train * data->image_size; i++) {
150 |         data->train_images[i] = train_images_raw[i] / 255.0f;
151 |     }
152 | 
153 |     for (int i = 0; i < data->num_test * data->image_size; i++) {
154 |         data->test_images[i] = test_images_raw[i] / 255.0f;
155 |     }
156 | 
157 |     for (int i = 0; i < data->num_train; i++) {
158 |         data->train_labels[i] = train_labels_raw[i];
159 |     }
160 | 
161 |     for (int i = 0; i < data->num_test; i++) {
162 |         data->test_labels[i] = test_labels_raw[i];
163 |     }
164 | 
165 |     // Free raw data buffers
166 |     free(train_images_raw);
167 |     free(test_images_raw);
168 |     free(train_labels_raw);
169 |     free(test_labels_raw);
170 | 
171 |     // Apply preprocessing
172 |     preprocess_mnist_images(data->train_images, data->num_train, data->image_size, true);
173 |     preprocess_mnist_images(data->test_images, data->num_test, data->image_size, true);
174 | 
175 |     return true;
176 | }
177 | 
178 | void free_mnist_data(MNISTData *data) {
179 |     if (data) {
180 |         if (data->train_images) free(data->train_images);
181 |         if (data->train_labels) free(data->train_labels);
182 |         if (data->test_images) free(data->test_images);
183 |         if (data->test_labels) free(data->test_labels);
184 | 
185 |         data->train_images = NULL;
186 |         data->train_labels = NULL;
187 |         data->test_images = NULL;
188 |         data->test_labels = NULL;
189 |     }
190 | }
191 | 
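/*
 * The per-image mean/variance loops below are scalar even on the NEON
 * path. A possible NEON mean reduction for the first loop (sketch only,
 * AArch64 intrinsics, not wired in):
 *
 *     float32x4_t acc = vdupq_n_f32(0.0f);
 *     int j = 0;
 *     for (; j <= image_size - 4; j += 4)
 *         acc = vaddq_f32(acc, vld1q_f32(&img[j]));
 *     float mean = vaddvq_f32(acc);            // horizontal sum
 *     for (; j < image_size; j++) mean += img[j];
 *     mean /= image_size;
 */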
192 | void preprocess_mnist_images(float *images, int num_images, int image_size, bool use_neon) {
193 | #if defined(__ARM_NEON) || defined(__ARM_NEON__)
194 |     if (use_neon) {
195 |         // NEON code path (currently the same scalar loops as below; a vectorized version could slot in here)
196 |         for (int i = 0; i < num_images; i++) {
197 |             float *img = &images[i * image_size];
198 | 
199 |             // Step 1: Calculate mean
200 |             float mean = 0.0f;
201 |             for (int j = 0; j < image_size; j++) {
202 |                 mean += img[j];
203 |             }
204 |             mean /= image_size;
205 | 
206 |             // Step 2: Center the data
207 |             for (int j = 0; j < image_size; j++) {
208 |                 img[j] -= mean;
209 |             }
210 | 
211 |             // Step 3: Calculate standard deviation
212 |             float variance = 0.0f;
213 |             for (int j = 0; j < image_size; j++) {
214 |                 variance += img[j] * img[j];
215 |             }
216 |             variance /= image_size;
217 |             float std_dev = sqrtf(variance + 1e-8f); // Add epsilon for numerical stability
218 | 
219 |             // Step 4: Normalize
220 |             for (int j = 0; j < image_size; j++) {
221 |                 img[j] /= std_dev;
222 |             }
223 | 
224 |             // Step 5: Scale to [0, 1] range
225 |             float min_val = img[0], max_val = img[0];
226 |             for (int j = 1; j < image_size; j++) {
227 |                 if (img[j] < min_val) min_val = img[j];
228 |                 if (img[j] > max_val) max_val = img[j];
229 |             }
230 |             float range = max_val - min_val;
231 |             if (range > 1e-8f) {
232 |                 for (int j = 0; j < image_size; j++) {
233 |                     img[j] = (img[j] - min_val) / range;
234 |                 }
235 |             }
236 |         }
237 |     } else {
238 | #endif
239 |     // Standard preprocessing
240 |     for (int i = 0; i < num_images; i++) {
241 |         float *img = &images[i * image_size];
242 | 
243 |         // Calculate mean and standard deviation
244 |         float mean = 0.0f, variance = 0.0f;
245 |         for (int j = 0; j < image_size; j++) {
246 |             mean += img[j];
247 |         }
248 |         mean /= image_size;
249 | 
250 |         for (int j = 0; j < image_size; j++) {
251 |             float diff = img[j] - mean;
252 |             variance += diff * diff;
253 |         }
254 |         variance /= image_size;
255 |         float std_dev = sqrtf(variance + 1e-8f);
256 | 
257 |         // Normalize
258 |         for (int j = 0; j < image_size; j++) {
259 |             img[j] = (img[j] - mean) / std_dev;
260 |         }
261 | 
262 |         // Rescale to [0, 1]
263 |         float min_val = img[0], max_val = img[0];
264 |         for (int j = 1; j < image_size; j++) {
265 |             if (img[j] < min_val) min_val = img[j];
266 |             if (img[j] > max_val) max_val = img[j];
267 |         }
268 |         float range = max_val - min_val;
269 |         if (range > 1e-8f) {
270 |             for (int j = 0; j < image_size; j++) {
271 |                 img[j] = (img[j] - min_val) / range;
272 |             }
273 |         }
274 |     }
275 | #if defined(__ARM_NEON) || defined(__ARM_NEON__)
276 |     }
277 | #endif
278 | }
279 | 
280 | void create_mnist_batch(const MNISTData *data, const int *batch_indices, int batch_size,
281 |                         float **batch_images, int *batch_labels) {
282 |     for (int i = 0; i < batch_size; i++) {
283 |         int idx = batch_indices[i];
284 |         memcpy(batch_images[i], &data->train_images[idx * data->image_size],
285 |                data->image_size * sizeof(float));
286 |         batch_labels[i] = data->train_labels[idx];
287 |     }
288 | }
289 | 
290 | void display_mnist_digit(const float *image, int label) {
291 |     printf("Digit: %d\n", label);
292 |     for (int i = 0; i < 28; i++) {
293 |         for (int j = 0; j < 28; j++) {
294 |             float pixel = image[i * 28 + j];
295 |             if (pixel < 0.2f) {
296 |                 printf("  ");
297 |             } else if (pixel < 0.5f) {
298 |                 printf("· ");
299 |             } else if (pixel < 0.8f) {
300 |                 printf("o ");
301 |             } else {
302 |                 printf("@ ");
303 |             }
304 |         }
305 |         printf("\n");
306 |     }
307 | }
308 | 
309 | void print_mnist_summary(const MNISTData *data) {
310 |     printf("MNIST Dataset Summary\n");
311 |     printf("---------------------\n");
312 |     printf("Training samples: %d\n", data->num_train);
313 |     printf("Test samples: %d\n", data->num_test);
314 |     printf("Image size: %d pixels\n", data->image_size);
315 |     printf("Number of classes: %d\n", data->num_classes);
316 | 
317 |     // Count samples per class in training set
318 |     int train_counts[10] = {0};
319 |     for (int i = 0; i < data->num_train; i++) {
320 |         train_counts[data->train_labels[i]]++;
321 |     }
322 | 
323 |     // Count samples per class in test set
324 |     int test_counts[10] = {0};
325 |     for (int i = 0; i < data->num_test; i++) {
326 |         test_counts[data->test_labels[i]]++;
327 |     }
328 | 
329 |     printf("\nClass distribution:\n");
330 |     printf("Digit  Training  Test\n");
331 |     for (int i = 0; i < 10; i++) {
332 |         printf("  %d     %5d    %4d\n", i, train_counts[i], test_counts[i]);
333 |     }
334 | 
335 |     printf("\nSample digit:\n");
336 |     display_mnist_digit(data->train_images, data->train_labels[0]);
337 | }
338 | 
339 | void shuffle_indices(int *indices, int n) {
340 |     for (int i = n - 1; i > 0; i--) {
341 |         int j = rand() % (i + 1);
342 |         int temp = indices[i];
343 |         indices[i] = indices[j];
344 |         indices[j] = temp;
345 |     }
346 | }
347 | 
--------------------------------------------------------------------------------
/src/simple_mnist.c:
--------------------------------------------------------------------------------
1 | /**
2 |  * @file simple_mnist.c
3 |  * @brief Minimalist MNIST implementation in pure C with two hidden layers
4 |  */
5 | 
6 | #include <stdio.h>
7 | #include <stdlib.h>
8 | #include <string.h>
9 | #include <math.h>
10 | #include <time.h>
11 | #include "mnist_loader.h"
12 | 
13 | // Network architecture
14 | #define INPUT_SIZE 784            // 28x28 pixels
15 | #define HIDDEN1_SIZE 512          // First hidden layer (increased further)
16 | #define HIDDEN2_SIZE 256          // Second hidden layer (increased further)
17 | #define OUTPUT_SIZE 10            // 10 digits
18 | #define LEARNING_RATE 0.01f       // Lower learning rate for more stable convergence
19 | #define WEIGHT_DECAY 2e-5f        // Increased L2 regularization to prevent overfitting
20 | #define BATCH_SIZE 128            // Larger batch size for better gradient estimates
21 | #define EPOCHS 150                // More epochs
22 | #define USE_DATA_AUGMENTATION 1   // Enable data augmentation
23 | 
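/*
 * Note: the effective learning rate is not constant. train_network()
 * applies cosine annealing,
 *     lr(epoch) = LEARNING_RATE * 0.5f * (1.0f + cosf(M_PI * epoch / EPOCHS)),
 * so the step size decays smoothly from 0.01 toward 0 over the run.
 */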
24 | // Network structure
25 | typedef struct {
26 |     // First hidden layer
27 |     float *hidden1_weights;  // [HIDDEN1_SIZE x INPUT_SIZE]
28 |     float *hidden1_biases;   // [HIDDEN1_SIZE]
29 | 
30 |     // Second hidden layer
31 |     float *hidden2_weights;  // [HIDDEN2_SIZE x HIDDEN1_SIZE]
32 |     float *hidden2_biases;   // [HIDDEN2_SIZE]
33 | 
34 |     // Output layer
35 |     float *output_weights;   // [OUTPUT_SIZE x HIDDEN2_SIZE]
36 |     float *output_biases;    // [OUTPUT_SIZE]
37 | } Network;
38 | 
39 | // Function prototypes
40 | Network create_network();
41 | void free_network(Network *network);
42 | void forward_pass(Network *network, const float *input, float *hidden1, float *hidden2, float *output);
43 | void backward_pass(Network *network, const float *input, const float *hidden1, const float *hidden2,
44 |                    const float *output, int label, float learning_rate);
45 | float calculate_accuracy(Network *network, float *images, int *labels, int num_samples);
46 | void print_confusion_matrix(Network *network, float *images, int *labels, int num_samples);
47 | void train_network(Network *network, float *train_images, int *train_labels,
48 |                    float *test_images, int *test_labels, int epochs, int batch_size);
49 | 
50 | // Helper functions
51 | void apply_relu(float *x, int size);
52 | void apply_softmax(float *x, int size);
53 | void matrix_vector_multiply(const float *matrix, const float *vector, float *result,
54 |                             int rows, int cols);
55 | 
56 | int main(int argc, char *argv[]) {
57 |     // Check command line arguments
58 |     if (argc < 2) {
59 |         printf("Usage: %s <data_dir> [--verbose]\n", argv[0]);
60 |         return 1;
61 |     }
62 | 
63 |     // Parse command line arguments
64 |     char *data_dir = argv[1];
65 |     bool verbose = false;
66 | 
67 |     for (int i = 2; i < argc; i++) {
68 |         if (strcmp(argv[i], "--verbose") == 0) {
69 |             verbose = true;
70 |         }
71 |     }
72 | 
73 |     // Seed random number generator
74 |     srand(time(NULL));
75 | 
76 |     // Load MNIST dataset
77 |     printf("Loading MNIST dataset from %s...\n", data_dir);
78 |     MNISTData data;
79 |     if (!load_mnist(data_dir, &data)) {
80 |         fprintf(stderr, "Failed to load MNIST dataset\n");
81 |         return 1;
82 |     }
83 |     printf("MNIST dataset loaded successfully\n");
84 | 
85 |     // Print dataset summary
86 |     if (verbose) {
87 |         print_mnist_summary(&data);
88 |     }
89 | 
90 |     // Create network
91 |     Network network = create_network();
92 |     printf("Network created\n");
93 |     printf("Architecture: %d -> %d -> %d -> %d\n", INPUT_SIZE, HIDDEN1_SIZE, HIDDEN2_SIZE, OUTPUT_SIZE);
94 | 
95 |     // Train network
96 |     printf("Starting training...\n");
97 |     train_network(&network, data.train_images, data.train_labels,
98 |                   data.test_images, data.test_labels, EPOCHS, BATCH_SIZE);
99 | 
100 |     // Print final test accuracy
101 |     float accuracy = calculate_accuracy(&network, data.test_images, data.test_labels, MNIST_TEST_SIZE);
102 |     printf("Final test accuracy: %.4f%%\n", accuracy * 100.0f);
103 | 
104 |     // Print confusion matrix
105 |     if (verbose) {
106 |         print_confusion_matrix(&network, data.test_images, data.test_labels, MNIST_TEST_SIZE);
107 |     }
108 | 
109 |     // Free memory
110 |     free_network(&network);
111 |     free_mnist_data(&data);
112 | 
113 |     return 0;
114 | }
115 | 
116 | Network create_network() {
117 |     Network network;
118 | 
119 |     // Allocate memory for weights and biases
120 |     network.hidden1_weights = (float *)malloc(HIDDEN1_SIZE * INPUT_SIZE * sizeof(float));
121 |     network.hidden1_biases = (float *)malloc(HIDDEN1_SIZE * sizeof(float));
122 | 
123 |     network.hidden2_weights = (float *)malloc(HIDDEN2_SIZE * HIDDEN1_SIZE * sizeof(float));
124 |     network.hidden2_biases = (float *)malloc(HIDDEN2_SIZE * sizeof(float));
125 | 
126 |     network.output_weights = (float *)malloc(OUTPUT_SIZE * HIDDEN2_SIZE * sizeof(float));
127 |     network.output_biases = (float *)malloc(OUTPUT_SIZE * sizeof(float));
128 | 
129 |     // Initialize hidden1 layer weights with He initialization
130 |     float scale_hidden1 = sqrtf(2.0f / INPUT_SIZE);
131 |     for (int i = 0; i < HIDDEN1_SIZE; i++) {
132 |         for (int j = 0; j < INPUT_SIZE; j++) {
133 |             network.hidden1_weights[i * INPUT_SIZE + j] = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * scale_hidden1;
134 |         }
135 |         network.hidden1_biases[i] = 0.0f;
136 |     }
137 | 
138 |     // Initialize hidden2 layer weights with He initialization
139 |     float scale_hidden2 = sqrtf(2.0f / HIDDEN1_SIZE);
140 |     for (int i = 0; i < HIDDEN2_SIZE; i++) {
141 |         for (int j = 0; j < HIDDEN1_SIZE; j++) {
142 |             network.hidden2_weights[i * HIDDEN1_SIZE + j] = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * scale_hidden2;
143 |         }
144 |         network.hidden2_biases[i] = 0.0f;
145 |     }
146 | 
147 |     // Initialize output layer weights with Xavier/Glorot initialization
148 |     float scale_output = sqrtf(2.0f / (HIDDEN2_SIZE + OUTPUT_SIZE));
149 |     for (int i = 0; i < OUTPUT_SIZE; i++) {
150 |         for (int j = 0; j < HIDDEN2_SIZE; j++) {
151 |             network.output_weights[i * HIDDEN2_SIZE + j] = ((float)rand() / RAND_MAX * 2.0f - 1.0f) * scale_output;
152 |         }
153 |         network.output_biases[i] = 0.0f;
154 |     }
155 | 
156 |     return network;
157 | }
158 | 
159 | void free_network(Network *network) {
160 |     free(network->hidden1_weights);
161 |     free(network->hidden1_biases);
162 |     free(network->hidden2_weights);
163 |     free(network->hidden2_biases);
164 |     free(network->output_weights);
165 |     free(network->output_biases);
166 | }
167 | 
168 | void forward_pass(Network *network, const float *input, float *hidden1, float *hidden2, float *output) {
169 |     // First hidden layer: h1 = relu(W_h1 * x + b_h1)
170 |     for (int i = 0; i < HIDDEN1_SIZE; i++) {
171 |         hidden1[i] = 0.0f;
172 |         for (int j = 0; j < INPUT_SIZE; j++) {
173 |             hidden1[i] += network->hidden1_weights[i * INPUT_SIZE + j] * input[j];
174 |         }
175 |         hidden1[i] += network->hidden1_biases[i];
176 |         // Apply ReLU activation
177 |         hidden1[i] = (hidden1[i] > 0.0f) ? hidden1[i] : 0.0f;
178 |     }
179 | 
180 |     // Second hidden layer: h2 = relu(W_h2 * h1 + b_h2)
181 |     for (int i = 0; i < HIDDEN2_SIZE; i++) {
182 |         hidden2[i] = 0.0f;
183 |         for (int j = 0; j < HIDDEN1_SIZE; j++) {
184 |             hidden2[i] += network->hidden2_weights[i * HIDDEN1_SIZE + j] * hidden1[j];
185 |         }
186 |         hidden2[i] += network->hidden2_biases[i];
187 |         // Apply ReLU activation
188 |         hidden2[i] = (hidden2[i] > 0.0f) ? hidden2[i] : 0.0f;
189 |     }
190 | 
191 |     // Output layer: o = softmax(W_o * h2 + b_o)
192 |     for (int i = 0; i < OUTPUT_SIZE; i++) {
193 |         output[i] = 0.0f;
194 |         for (int j = 0; j < HIDDEN2_SIZE; j++) {
195 |             output[i] += network->output_weights[i * HIDDEN2_SIZE + j] * hidden2[j];
196 |         }
197 |         output[i] += network->output_biases[i];
198 |     }
199 | 
200 |     // Apply softmax activation
201 |     // Find max value for numerical stability
202 |     float max_val = output[0];
203 |     for (int i = 1; i < OUTPUT_SIZE; i++) {
204 |         if (output[i] > max_val) {
205 |             max_val = output[i];
206 |         }
207 |     }
208 | 
209 |     // Compute exp(x - max) and sum
210 |     float sum = 0.0f;
211 |     for (int i = 0; i < OUTPUT_SIZE; i++) {
212 |         output[i] = expf(output[i] - max_val);
213 |         sum += output[i];
214 |     }
215 | 
216 |     // Normalize to get probabilities
217 |     for (int i = 0; i < OUTPUT_SIZE; i++) {
218 |         output[i] /= sum;
219 |     }
220 | }
221 | 
222 | // Data augmentation function
223 | void augment_image(const float *input, float *output) {
224 |     // Apply small random shifts (up to 2 pixels in each direction)
225 |     int shift_x = rand() % 5 - 2; // -2 to 2
226 |     int shift_y = rand() % 5 - 2; // -2 to 2
227 | 
228 |     // Clear output
229 |     memset(output, 0, INPUT_SIZE * sizeof(float));
230 | 
231 |     // Apply shift
232 |     for (int y = 0; y < 28; y++) {
233 |         for (int x = 0; x < 28; x++) {
234 |             int new_y = y + shift_y;
235 |             int new_x = x + shift_x;
236 | 
237 |             // Check bounds
238 |             if (new_x >= 0 && new_x < 28 && new_y >= 0 && new_y < 28) {
239 |                 output[new_y * 28 + new_x] = input[y * 28 + x];
240 |             }
241 |         }
242 |     }
243 | }
244 | 
245 | void backward_pass(Network *network, const float *input, const float *hidden1, const float *hidden2,
246 |                    const float *output, int label, float learning_rate) {
247 |     // Compute output layer error (cross-entropy gradient with softmax)
248 |     float output_error[OUTPUT_SIZE];
249 |     for (int i = 0; i < OUTPUT_SIZE; i++) {
250 |         output_error[i] = output[i] - (i == label ? 1.0f : 0.0f);
251 |     }
252 | 
253 |     // Compute hidden2 layer error
254 |     float hidden2_error[HIDDEN2_SIZE] = {0};
255 |     for (int i = 0; i < HIDDEN2_SIZE; i++) {
256 |         for (int j = 0; j < OUTPUT_SIZE; j++) {
257 |             hidden2_error[i] += output_error[j] * network->output_weights[j * HIDDEN2_SIZE + i];
258 |         }
259 |         // Apply derivative of ReLU: 1 if activation > 0, 0 otherwise
260 |         hidden2_error[i] *= (hidden2[i] > 0.0f) ? 1.0f : 0.0f;
261 |     }
262 | 
263 |     // Compute hidden1 layer error
264 |     float hidden1_error[HIDDEN1_SIZE] = {0};
265 |     for (int i = 0; i < HIDDEN1_SIZE; i++) {
266 |         for (int j = 0; j < HIDDEN2_SIZE; j++) {
267 |             hidden1_error[i] += hidden2_error[j] * network->hidden2_weights[j * HIDDEN1_SIZE + i];
268 |         }
269 |         // Apply derivative of ReLU: 1 if activation > 0, 0 otherwise
270 |         hidden1_error[i] *= (hidden1[i] > 0.0f) ? 1.0f : 0.0f;
271 |     }
272 | 
273 |     // Update output layer weights and biases with L2 regularization
274 |     for (int i = 0; i < OUTPUT_SIZE; i++) {
275 |         for (int j = 0; j < HIDDEN2_SIZE; j++) {
276 |             // Add L2 regularization gradient (weight decay)
277 |             float l2_grad = WEIGHT_DECAY * network->output_weights[i * HIDDEN2_SIZE + j];
278 |             network->output_weights[i * HIDDEN2_SIZE + j] -= learning_rate * (output_error[i] * hidden2[j] + l2_grad);
279 |         }
280 |         network->output_biases[i] -= learning_rate * output_error[i];
281 |     }
282 | 
283 |     // Update hidden2 layer weights and biases with L2 regularization
284 |     for (int i = 0; i < HIDDEN2_SIZE; i++) {
285 |         for (int j = 0; j < HIDDEN1_SIZE; j++) {
286 |             // Add L2 regularization gradient (weight decay)
287 |             float l2_grad = WEIGHT_DECAY * network->hidden2_weights[i * HIDDEN1_SIZE + j];
288 |             network->hidden2_weights[i * HIDDEN1_SIZE + j] -= learning_rate * (hidden2_error[i] * hidden1[j] + l2_grad);
289 |         }
290 |         network->hidden2_biases[i] -= learning_rate * hidden2_error[i];
291 |     }
292 | 
293 |     // Update hidden1 layer weights and biases with L2 regularization
294 |     for (int i = 0; i < HIDDEN1_SIZE; i++) {
295 |         for (int j = 0; j < INPUT_SIZE; j++) {
296 |             // Add L2 regularization gradient (weight decay)
297 |             float l2_grad = WEIGHT_DECAY * network->hidden1_weights[i * INPUT_SIZE + j];
298 |             network->hidden1_weights[i * INPUT_SIZE + j] -= learning_rate * (hidden1_error[i] * input[j] + l2_grad);
299 |         }
300 |         network->hidden1_biases[i] -= learning_rate * hidden1_error[i];
301 |     }
302 | }
303 | 
304 | float calculate_accuracy(Network *network, float *images, int *labels, int num_samples) {
305 |     int correct = 0;
306 | 
307 |     // Allocate memory for activations
308 |     float *hidden1 = (float *)malloc(HIDDEN1_SIZE * sizeof(float));
309 |     float *hidden2 = (float *)malloc(HIDDEN2_SIZE * sizeof(float));
310 |     float *output = (float *)malloc(OUTPUT_SIZE * sizeof(float));
311 | 
312 |     // Evaluate each sample
313 |     for (int i = 0; i < num_samples; i++) {
314 |         // Forward pass
315 |         forward_pass(network, &images[i * INPUT_SIZE], hidden1, hidden2, output);
316 | 
317 |         // Find the predicted class (maximum probability)
318 |         int predicted = 0;
319 |         for (int j = 1; j < OUTPUT_SIZE; j++) {
320 |             if (output[j] > output[predicted]) {
321 |                 predicted = j;
322 |             }
323 |         }
324 | 
325 |         // Check if prediction is correct
326 |         if (predicted == labels[i]) {
327 |             correct++;
328 |         }
329 |     }
330 | 
331 |     // Free memory
332 |     free(hidden1);
333 |     free(hidden2);
334 |     free(output);
335 | 
336 |     // Return accuracy as a fraction
337 |     return (float)correct / num_samples;
338 | }
339 | 
340 | void print_confusion_matrix(Network *network, float *images, int *labels, int num_samples) {
341 |     int confusion_matrix[OUTPUT_SIZE][OUTPUT_SIZE] = {0};
342 | 
343 |     // Allocate memory for activations
344 |     float *hidden1 = (float *)malloc(HIDDEN1_SIZE * sizeof(float));
345 |     float *hidden2 = (float *)malloc(HIDDEN2_SIZE * sizeof(float));
346 |     float *output = (float *)malloc(OUTPUT_SIZE * sizeof(float));
347 | 
348 |     // Evaluate each sample
349 |     for (int i = 0; i < num_samples; i++) {
350 |         // Forward pass
351 |         forward_pass(network, &images[i * INPUT_SIZE], hidden1, hidden2, output);
352 | 
353 |         // Find the predicted class (maximum probability)
354 |         int predicted = 0;
355 |         for (int j = 1; j < OUTPUT_SIZE; j++) {
356 |             if (output[j] > output[predicted]) {
357 |                 predicted = j;
358 |             }
359 |         }
360 | 
361 |         // Update confusion matrix
362 |         confusion_matrix[labels[i]][predicted]++;
363 |     }
364 | 
365 |     // Print confusion matrix
366 |     printf("\nConfusion Matrix:\n");
367 |     printf("     ");
368 |     for (int i = 0; i < OUTPUT_SIZE; i++) {
369 |         printf("%4d", i);
370 |     }
371 |     printf("\n     ");
372 |     for (int i = 0; i < OUTPUT_SIZE; i++) {
373 |         printf("----");
374 |     }
375 |     printf("\n");
376 | 
377 |     for (int i = 0; i < OUTPUT_SIZE; i++) {
378 |         printf("%d | ", i);
379 |         for (int j = 0; j < OUTPUT_SIZE; j++) {
380 |             printf("%4d", confusion_matrix[i][j]);
381 |         }
382 |         printf("\n");
383 |     }
384 | 
385 |     // Free memory
386 |     free(hidden1);
387 |     free(hidden2);
388 |     free(output);
389 | }
390 | 
391 | void train_network(Network *network, float *train_images, int *train_labels,
392 |                    float *test_images, int *test_labels, int epochs, int batch_size) {
393 |     // Allocate memory for activations
394 |     float *hidden1 = (float *)malloc(HIDDEN1_SIZE * sizeof(float));
395 |     float *hidden2 = (float *)malloc(HIDDEN2_SIZE * sizeof(float));
396 |     float *output = (float *)malloc(OUTPUT_SIZE * sizeof(float));
397 | 
398 |     // Allocate memory for augmented image
399 |     float *augmented_image = (float *)malloc(INPUT_SIZE * sizeof(float));
400 | 
401 |     // Create array of indices for shuffling
402 |     int *indices = (int *)malloc(MNIST_TRAIN_SIZE * sizeof(int));
403 |     for (int i = 0; i < MNIST_TRAIN_SIZE; i++) {
404 |         indices[i] = i;
405 |     }
406 | 
407 |     // Training loop
408 |     int num_batches = MNIST_TRAIN_SIZE / batch_size;
409 |     float best_accuracy = 0.0f;
410 |     float learning_rate = LEARNING_RATE;
411 | 
412 |     for (int epoch = 0; epoch < epochs; epoch++) {
413 |         // Shuffle indices for randomized training
414 |         for (int i = MNIST_TRAIN_SIZE - 1; i > 0; i--) {
415 |             int j = rand() % (i + 1);
416 |             int temp = indices[i];
417 |             indices[i] = indices[j];
418 |             indices[j] = temp;
419 |         }
420 | 
421 |         // Process each batch
422 |         for (int batch = 0; batch < num_batches; batch++) {
423 |             for (int i = 0; i < batch_size; i++) {
424 |                 int idx = indices[batch * batch_size + i];
425 | 
426 |                 // Apply data augmentation if enabled
427 |                 const float *input_data;
428 |                 if (USE_DATA_AUGMENTATION && rand() % 2 == 0) { // 50% chance of augmentation
429 |                     augment_image(&train_images[idx * INPUT_SIZE], augmented_image);
430 |                     input_data = augmented_image;
431 |                 } else {
432 |                     input_data = &train_images[idx * INPUT_SIZE];
433 |                 }
434 | 
435 |                 // Forward pass
436 |                 forward_pass(network, input_data, hidden1, hidden2, output);
437 | 
438 |                 // Backward pass
439 |                 backward_pass(network, input_data, hidden1, hidden2, output,
440 |                               train_labels[idx], learning_rate);
441 |             }
442 | 
443 |             // Print progress
444 |             if (batch % 100 == 0) {
445 |                 printf("Epoch %d/%d - Batch %d/%d\r", epoch + 1, epochs, batch, num_batches);
446 |                 fflush(stdout);
447 |             }
448 |         }
449 | 
450 |         // Evaluate on test set
451 |         float accuracy = calculate_accuracy(network, test_images, test_labels, MNIST_TEST_SIZE);
452 |         printf("Epoch %d/%d - Test accuracy: %.4f%%\n", epoch + 1, epochs, accuracy * 100.0f);
453 | 
454 |         if (accuracy > best_accuracy) {
455 |             best_accuracy = accuracy;
456 |             printf("New best accuracy: %.4f%%\n", best_accuracy * 100.0f);
457 |         }
458 | 
459 |         // Adjust learning rate (cosine annealing)
460 |         float progress = (float)(epoch) / (float)(epochs);
461 |         learning_rate = LEARNING_RATE * 0.5f * (1.0f + cosf(M_PI * progress));
462 | 
463 |         // Check if we've reached the target accuracy
464 |         if (accuracy >= 0.99f) {
Stopping training.\n"); 466 | break; 467 | } 468 | } 469 | 470 | // Free memory 471 | free(hidden1); 472 | free(hidden2); 473 | free(output); 474 | free(augmented_image); 475 | free(indices); 476 | } 477 | 478 | void apply_relu(float *x, int size) { 479 | for (int i = 0; i < size; i++) { 480 | x[i] = (x[i] > 0.0f) ? x[i] : 0.0f; 481 | } 482 | } 483 | 484 | void apply_softmax(float *x, int size) { 485 | // Find max value for numerical stability 486 | float max_val = x[0]; 487 | for (int i = 1; i < size; i++) { 488 | if (x[i] > max_val) { 489 | max_val = x[i]; 490 | } 491 | } 492 | 493 | // Compute exp(x - max) and sum 494 | float sum = 0.0f; 495 | for (int i = 0; i < size; i++) { 496 | x[i] = expf(x[i] - max_val); 497 | sum += x[i]; 498 | } 499 | 500 | // Normalize to get probabilities 501 | for (int i = 0; i < size; i++) { 502 | x[i] /= sum; 503 | } 504 | } 505 | 506 | void matrix_vector_multiply(const float *matrix, const float *vector, float *result, 507 | int rows, int cols) { 508 | // Matrix is stored in row-major order: matrix[row * cols + col] 509 | for (int i = 0; i < rows; i++) { 510 | result[i] = 0.0f; 511 | for (int j = 0; j < cols; j++) { 512 | result[i] += matrix[i * cols + j] * vector[j]; 513 | } 514 | } 515 | } 516 | --------------------------------------------------------------------------------