├── .gitignore ├── CNN.rkt ├── CNN_IMPLEMENTATION.md ├── FNN.rkt ├── OPTIMIZATION_STRATEGY.md ├── RACOGRAD_IMPLEMENTATION_DETAILS.md ├── RACOGRAD_USER_GUIDE.md ├── README.md ├── autograd.rkt ├── benchmark.rkt ├── cnn_benchmark.rkt ├── cnn_ops.c ├── cnn_ops.rkt ├── cnn_test.rkt ├── compile_extensions.sh ├── cpu_benchmark.rkt ├── cuda_ops.c ├── debuggin_lib.rkt ├── demo.rkt ├── device.rkt ├── ffi_ops.rkt ├── gpu_acceleration.md ├── gpu_benchmark.rkt ├── hardware_detection.rkt ├── load-mnist.rkt ├── matrix_multiplication.c ├── matrix_opencl.c ├── mlx_cnn_ops.c ├── mlx_ops.c ├── mnist.rkt ├── mnist_benchmark.rkt ├── mnist_device.rkt ├── parallel_ops.c ├── regression-test.rkt ├── simd_ops.c ├── simple-test.rkt ├── tensor.rkt ├── tensor2.rkt ├── tensor_device.rkt ├── tensor_optimized.rkt ├── test.rkt └── visualization.rkt /.gitignore: -------------------------------------------------------------------------------- 1 | # Racket compiled files 2 | compiled/ 3 | *.zo 4 | *.dep 5 | *.rkt~ 6 | 7 | # Binary files 8 | *.dylib 9 | *.so 10 | *.dll 11 | 12 | # OS specific files 13 | .DS_Store 14 | 15 | # MNIST data files (large binaries) 16 | mnist-data/t10k-images.idx3-ubyte 17 | mnist-data/t10k-labels.idx1-ubyte 18 | mnist-data/train-images.idx3-ubyte 19 | mnist-data/train-labels.idx1-ubyte 20 | 21 | # IDE/Editor files 22 | *~ -------------------------------------------------------------------------------- /CNN_IMPLEMENTATION.md: -------------------------------------------------------------------------------- 1 | # Implementing a CNN with Optimized RacoGrad 2 | 3 | This guide shows how to implement a Convolutional Neural Network (CNN) using the optimized RacoGrad library components. 4 | 5 | ## Prerequisites 6 | 7 | - Compiled C extensions (`matrix_multiplication.so`, `simd_ops.so`, `parallel_ops.so`) 8 | - OpenCL setup (optional, for GPU acceleration) 9 | - Optimized tensor implementation (`tensor_optimized.rkt`) 10 | 11 | ## Step 1: Import Required Modules 12 | 13 | ```racket 14 | #lang racket 15 | 16 | (require "tensor_optimized.rkt" ; Optimized tensor operations 17 | "ffi_ops.rkt" ; C extension bindings 18 | "load-mnist.rkt") ; For MNIST dataset loading 19 | ``` 20 | 21 | ## Step 2: Implement Optimized Convolution Layer 22 | 23 | The convolution layer is critical for CNN performance. 
Here's an implementation using C extensions: 24 | 25 | ```racket 26 | ;; Forward pass through a convolution layer 27 | (define (conv2d-forward input-tensor filter-tensor stride padding) 28 | (let* ([input-shape (t-opt:shape input-tensor)] 29 | [filter-shape (t-opt:shape filter-tensor)] 30 | [batch-size (car input-shape)] 31 | [in-channels (cadr input-shape)] 32 | [in-height (caddr input-shape)] 33 | [in-width (cadddr input-shape)] 34 | [out-channels (car filter-shape)] 35 | [filter-height (caddr filter-shape)] 36 | [filter-width (cadddr filter-shape)] 37 | 38 | ;; Calculate output dimensions 39 | [out-height (add1 (quotient (- (+ in-height (* 2 padding)) filter-height) stride))] 40 | [out-width (add1 (quotient (- (+ in-width (* 2 padding)) filter-width) stride))] 41 | 42 | ;; Create output tensor 43 | [output-tensor (t-opt:create (list batch-size out-channels out-height out-width) 44 | (make-vector (* batch-size out-channels out-height out-width) 0.0))]) 45 | 46 | ;; Call the C extension for convolution 47 | (c:conv2d-forward 48 | batch-size in-channels in-height in-width 49 | out-channels filter-height filter-width 50 | stride padding 51 | (t-opt:data input-tensor) 52 | (t-opt:data filter-tensor) 53 | (t-opt:data output-tensor)) 54 | 55 | output-tensor)) 56 | ``` 57 | 58 | ## Step 3: Implement Pooling Layer 59 | 60 | Max pooling is another critical operation for CNNs: 61 | 62 | ```racket 63 | ;; Max pooling layer (2x2 pooling with stride 2) 64 | (define (max-pool-2x2 input-tensor) 65 | (let* ([input-shape (t-opt:shape input-tensor)] 66 | [batch-size (car input-shape)] 67 | [channels (cadr input-shape)] 68 | [in-height (caddr input-shape)] 69 | [in-width (cadddr input-shape)] 70 | 71 | ;; Output dimensions 72 | [out-height (quotient in-height 2)] 73 | [out-width (quotient in-width 2)] 74 | 75 | ;; Create output tensor 76 | [output-tensor (t-opt:create (list batch-size channels out-height out-width) 77 | (make-vector (* batch-size channels out-height out-width) 0.0))]) 78 | 79 | ;; Call the C extension for max pooling 80 | (c:max-pool-2x2 81 | batch-size channels in-height in-width 82 | (t-opt:data input-tensor) 83 | (t-opt:data output-tensor)) 84 | 85 | output-tensor)) 86 | ``` 87 | 88 | ## Step 4: Implement the Fully Connected Layer 89 | 90 | Using our optimized matrix multiplication: 91 | 92 | ```racket 93 | ;; Fully connected layer using optimized operations 94 | (define (fc-layer input-tensor weights-tensor bias-tensor activation-fn) 95 | (let* ([input-flat (if (> (length (t-opt:shape input-tensor)) 2) 96 | ;; Flatten input if it's coming from conv layer 97 | (flatten-tensor input-tensor) 98 | input-tensor)] 99 | [z (t-opt:add (t-opt:mul input-flat weights-tensor) bias-tensor)]) 100 | (activation-fn z))) 101 | ``` 102 | 103 | ## Step 5: Activation Functions 104 | 105 | ReLU is the most common activation in CNNs: 106 | 107 | ```racket 108 | ;; ReLU activation using C extension 109 | (define (relu-opt tensor) 110 | (let* ([shape (t-opt:shape tensor)] 111 | [size (apply * shape)] 112 | [result (make-f64vector size 0.0)]) 113 | (c:relu-forward size (t-opt:data tensor) result) 114 | (t-opt:create shape result))) 115 | ``` 116 | 117 | ## Step 6: Building the Complete CNN 118 | 119 | Put everything together to build a CNN for MNIST: 120 | 121 | ```racket 122 | ;; Define model parameters 123 | (define batch-size 64) 124 | (define learning-rate 0.01) 125 | 126 | ;; Create model (LeNet-5 architecture) 127 | (define (create-lenet) 128 | (let ([conv1-filters (t-opt:random (list 6 1 5 5) 0.1)] ; 6 
5x5 filters 129 | [conv1-bias (t-opt:random (list 6) 0.1)] 130 | 131 | [conv2-filters (t-opt:random (list 16 6 5 5) 0.1)] ; 16 5x5 filters 132 | [conv2-bias (t-opt:random (list 16) 0.1)] 133 | 134 | [fc1-weights (t-opt:random (list 400 120) 0.1)] ; 400 -> 120 135 | [fc1-bias (t-opt:random (list 120) 0.1)] 136 | 137 | [fc2-weights (t-opt:random (list 120 84) 0.1)] ; 120 -> 84 138 | [fc2-bias (t-opt:random (list 84) 0.1)] 139 | 140 | [fc3-weights (t-opt:random (list 84 10) 0.1)] ; 84 -> 10 141 | [fc3-bias (t-opt:random (list 10) 0.1)]) 142 | 143 | (values conv1-filters conv1-bias 144 | conv2-filters conv2-bias 145 | fc1-weights fc1-bias 146 | fc2-weights fc2-bias 147 | fc3-weights fc3-bias))) 148 | 149 | ;; Forward pass through the network 150 | (define (forward-pass x 151 | conv1-filters conv1-bias 152 | conv2-filters conv2-bias 153 | fc1-weights fc1-bias 154 | fc2-weights fc2-bias 155 | fc3-weights fc3-bias) 156 | 157 | ;; First conv layer + relu + pool 158 | (let* ([conv1 (conv2d-forward x conv1-filters 1 2)] 159 | [relu1 (relu-opt conv1)] 160 | [pool1 (max-pool-2x2 relu1)] 161 | 162 | ;; Second conv layer + relu + pool 163 | [conv2 (conv2d-forward pool1 conv2-filters 1 0)] 164 | [relu2 (relu-opt conv2)] 165 | [pool2 (max-pool-2x2 relu2)] 166 | 167 | ;; Flatten and fully connected layers 168 | [flatten (flatten-tensor pool2)] 169 | 170 | [fc1 (fc-layer flatten fc1-weights fc1-bias relu-opt)] 171 | [fc2 (fc-layer fc1 fc2-weights fc2-bias relu-opt)] 172 | [fc3 (fc-layer fc2 fc3-weights fc3-bias identity)]) 173 | 174 | fc3)) 175 | ``` 176 | 177 | ## Step 7: Using GPU Acceleration (Optional) 178 | 179 | If OpenCL is configured: 180 | 181 | ```racket 182 | ;; Check if GPU is available 183 | (define gpu-available? (c:check-opencl-available)) 184 | 185 | ;; Select appropriate implementation based on hardware 186 | (define conv2d-impl 187 | (if gpu-available? 188 | conv2d-forward-gpu ; Use GPU implementation 189 | conv2d-forward)) ; Use CPU implementation 190 | ``` 191 | 192 | ## Step 8: Training Loop 193 | 194 | An optimized training loop: 195 | 196 | ```racket 197 | (define (train-epoch! dataset 198 | conv1-filters conv1-bias 199 | conv2-filters conv2-bias 200 | fc1-weights fc1-bias 201 | fc2-weights fc2-bias 202 | fc3-weights fc3-bias) 203 | 204 | ;; Process mini-batches in parallel when possible 205 | (for ([batch (in-dataset-batches dataset batch-size)]) 206 | (let-values ([(inputs labels) (batch->tensors batch)]) 207 | ;; Forward pass 208 | (define outputs 209 | (forward-pass inputs 210 | conv1-filters conv1-bias 211 | conv2-filters conv2-bias 212 | fc1-weights fc1-bias 213 | fc2-weights fc2-bias 214 | fc3-weights fc3-bias)) 215 | 216 | ;; Compute loss (cross-entropy) 217 | (define loss (cross-entropy-loss outputs labels)) 218 | 219 | ;; Backward pass and update weights 220 | ;; ... (omitted for brevity) 221 | ))) 222 | ``` 223 | 224 | ## Performance Considerations 225 | 226 | 1. **Batch Processing**: Use the parallel C extensions to process batches 227 | 2. **Memory Reuse**: Use in-place operations when possible 228 | 3. 
**GPU Acceleration**: Identify operations that benefit most from GPU 229 | 230 | ## Example Usage 231 | 232 | ```racket 233 | ;; Load MNIST data 234 | (define mnist-data (load-mnist)) 235 | 236 | ;; Create model 237 | (let-values ([(conv1-filters conv1-bias 238 | conv2-filters conv2-bias 239 | fc1-weights fc1-bias 240 | fc2-weights fc2-bias 241 | fc3-weights fc3-bias) (create-lenet)]) 242 | 243 | ;; Train for multiple epochs 244 | (for ([epoch 10]) 245 | (printf "Epoch ~a\n" epoch) 246 | (train-epoch! mnist-data 247 | conv1-filters conv1-bias 248 | conv2-filters conv2-bias 249 | fc1-weights fc1-bias 250 | fc2-weights fc2-bias 251 | fc3-weights fc3-bias))) 252 | ``` -------------------------------------------------------------------------------- /FNN.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt" 4 | "autograd.rkt" 5 | "device.rkt") 6 | 7 | (provide (all-defined-out)) 8 | 9 | ;; Feedforward Neural Network for classification tasks 10 | 11 | ;; Initialize a multi-layer feedforward neural network 12 | ;; Example: (initialize-neural-network (list 784 256 128 10)) 13 | (define (initialize-neural-network layer-sizes [scale 0.01]) 14 | (let* ([num-layers (length layer-sizes)] 15 | [weights (for/list ([i (in-range (sub1 num-layers))]) 16 | (t:random (list (list-ref layer-sizes i) 17 | (list-ref layer-sizes (add1 i))) 18 | scale))] 19 | [biases (for/list ([i (in-range (sub1 num-layers))]) 20 | (t:random (list 1 (list-ref layer-sizes (add1 i))) 21 | scale))]) 22 | (values weights biases))) 23 | 24 | ;; Forward pass through the entire network 25 | (define (forward-pass input weights biases activation-fn [final-activation softmax]) 26 | (let* ([num-layers (add1 (length weights))] 27 | [layer-outputs '()] 28 | [layer-inputs '()] 29 | [current-activations input]) 30 | 31 | ;; Process each layer 32 | (for ([i (in-range (length weights))]) 33 | (let* ([w (list-ref weights i)] 34 | [b (list-ref biases i)] 35 | [is-final-layer? (= i (sub1 (length weights)))] 36 | [z (t:add (t:mul current-activations w) b)] 37 | [a (if is-final-layer? 38 | (final-activation z) 39 | (activation-fn z))]) 40 | (set! layer-inputs (cons z layer-inputs)) 41 | (set! layer-outputs (cons a layer-outputs)) 42 | (set! 
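;; this layer's activations become the next layer's input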
current-activations a))) 43 | 44 | ;; Return final output and intermediate values for backprop 45 | (values (car layer-outputs) ; Final output 46 | (reverse (cdr layer-outputs)) ; Hidden layer activations 47 | (reverse layer-inputs)))) ; Pre-activation values 48 | 49 | ;; Calculate cross-entropy loss for classification 50 | (define (cross-entropy-loss y-pred y-true) 51 | (let* ([m (car (t:shape y-true))] 52 | [epsilon 1e-15] 53 | [y-pred-clipped (t:create (t:shape y-pred) 54 | (for/list ([p (vector->list (t:data y-pred))]) 55 | (max (min p (- 1.0 epsilon)) epsilon)))] 56 | [loss-vec (for/list ([i (in-range (vector-length (t:data y-true)))]) 57 | (* (vector-ref (t:data y-true) i) 58 | (log (vector-ref (t:data y-pred-clipped) i))))]) 59 | (- (/ (apply + loss-vec) m)))) 60 | 61 | ;; Backpropagation and parameter update 62 | (define (backpropagation input y-true y-pred hidden-activations z-values weights biases learning-rate) 63 | (let* ([num-layers (length weights)] 64 | [batch-size (car (t:shape input))] 65 | 66 | ;; Calculate initial gradient (for output layer) 67 | [output-grad (t:sub y-pred y-true)] ; For softmax + cross-entropy, this is the gradient 68 | 69 | ;; Initialize lists to store gradients 70 | [weight-gradients '()] 71 | [bias-gradients '()] 72 | 73 | ;; Add input to start of activations for backprop calculations 74 | [all-activations (cons input hidden-activations)] 75 | 76 | ;; Backpropagate through the network 77 | [current-grad output-grad]) 78 | 79 | ;; Process each layer in reverse 80 | (for ([layer-idx (in-range (sub1 num-layers) -1 -1)]) 81 | (let* ([activation (list-ref all-activations layer-idx)] 82 | [z (list-ref z-values layer-idx)] 83 | [w (list-ref weights layer-idx)] 84 | 85 | ;; For hidden layers, apply activation derivative to gradients 86 | [gradient (if (= layer-idx (sub1 num-layers)) 87 | current-grad ; Output layer gradient 88 | ;; Hidden layer gradient 89 | (let ([grad-times-weights (t:mul current-grad (t:transpose (list-ref weights (add1 layer-idx))))] 90 | [activation-deriv (relu-derivative z)]) 91 | (t:mul grad-times-weights activation-deriv)))] 92 | 93 | ;; Calculate gradients for weights and biases 94 | [weight-grad (t:scale (t:mul (t:transpose activation) gradient) 95 | (/ 1.0 batch-size))] 96 | [bias-grad (t:create (t:shape (list-ref biases layer-idx)) 97 | (for/list ([j (in-range (cadr (t:shape gradient)))]) 98 | (/ (for/sum ([i (in-range batch-size)]) 99 | (vector-ref (t:data gradient) (+ (* i (cadr (t:shape gradient))) j))) 100 | batch-size)))]) 101 | 102 | ;; Store gradients 103 | (set! weight-gradients (cons weight-grad weight-gradients)) 104 | (set! bias-gradients (cons bias-grad bias-gradients)) 105 | 106 | ;; Update current gradient for next layer 107 | (set! 
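;; hand the freshly computed layer gradient back to the next-earlier layer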
current-grad gradient)))
108 | 
109 |   ;; Update weights and biases using gradients
110 |   (let ([new-weights (map (lambda (w gradient)
111 |                             (t:sub w (t:scale gradient learning-rate)))
112 |                           weights weight-gradients)]
113 |         [new-biases (map (lambda (b gradient)
114 |                            (t:sub b (t:scale gradient learning-rate)))
115 |                          biases bias-gradients)])
116 | 
117 |     (values new-weights new-biases))))
118 | 
119 | ;; Softmax function
120 | (define (softmax z)
121 |   (let* ([shape (t:shape z)]
122 |          [data (t:data z)]
123 |          [max-vals (for/vector ([i (in-range (car shape))])
124 |                      (apply max (for/list ([j (in-range (cadr shape))])
125 |                                   (vector-ref data (+ (* i (cadr shape)) j)))))]
126 |          [exp-vals (for/vector ([i (in-range (vector-length data))])
127 |                      (exp (- (vector-ref data i)
128 |                              (vector-ref max-vals (quotient i (cadr shape))))))]
129 |          [sum-vals (for/vector ([i (in-range (car shape))])
130 |                      (for/sum ([j (in-range (cadr shape))])
131 |                        (vector-ref exp-vals (+ (* i (cadr shape)) j))))])
132 |     (t:create shape
133 |               (for/vector ([i (in-range (vector-length exp-vals))])
134 |                 (/ (vector-ref exp-vals i)
135 |                    (vector-ref sum-vals (quotient i (cadr shape))))))))
136 | 
137 | ;; Predict classes from model output
138 | (define (predict output-tensor)
139 |   (let* ([shape (t:shape output-tensor)]
140 |          [batch-size (car shape)]
141 |          [num-classes (cadr shape)]
142 |          [data (t:data output-tensor)]
143 |          [predictions (for/list ([b (in-range batch-size)])
144 |                         (for/fold ([max-idx 0]
145 |                                    [max-val (vector-ref data (* b num-classes))] #:result max-idx) ; return only the winning index
146 |                                   ([c (in-range 1 num-classes)])
147 |                           (let ([val (vector-ref data (+ (* b num-classes) c))])
148 |                             (if (> val max-val)
149 |                                 (values c val)
150 |                                 (values max-idx max-val)))))])
151 |     predictions))
152 | 
153 | ;; Calculate accuracy
154 | (define (accuracy predictions y-true)
155 |   (let* ([num-samples (length predictions)]
156 |          [true-labels (let ([shape (t:shape y-true)]
157 |                             [data (t:data y-true)])
158 |                         (for/list ([i (in-range (car shape))])
159 |                           (for/first ([j (in-range (cadr shape))]
160 |                                       #:when (= 1.0 (vector-ref data (+ (* i (cadr shape)) j))))
161 |                             j)))]
162 |          [correct-count (for/sum ([i (in-range num-samples)])
163 |                           (if (= (list-ref predictions i) (list-ref true-labels i))
164 |                               1
165 |                               0))])
166 |     (/ correct-count num-samples)))
167 | 
168 | ;; Create batch data
169 | (define (get-batch X y indices)
170 |   (let* ([batch-size (length indices)]
171 |          [X-features (cadr (t:shape X))]
172 |          [y-classes (cadr (t:shape y))]
173 | 
174 |          [X-batch-data (make-vector (* batch-size X-features) 0.0)]
175 |          [y-batch-data (make-vector (* batch-size y-classes) 0.0)])
176 | 
177 |     ;; Copy data for each example in the batch
178 |     (for ([i (in-range batch-size)]
179 |           [idx (in-list indices)])
180 |       ;; Copy features
181 |       (for ([j (in-range X-features)])
182 |         (vector-set! X-batch-data
183 |                      (+ (* i X-features) j)
184 |                      (vector-ref (t:data X) (+ (* idx X-features) j))))
185 | 
186 |       ;; Copy labels
187 |       (for ([j (in-range y-classes)])
188 |         (vector-set! 
y-batch-data 189 | (+ (* i y-classes) j) 190 | (vector-ref (t:data y) (+ (* idx y-classes) j))))) 191 | 192 | (values (t:create (list batch-size X-features) X-batch-data) 193 | (t:create (list batch-size y-classes) y-batch-data)))) 194 | 195 | ;; Simple training function for demonstration 196 | (define (train-fnn X-train y-train X-val y-val 197 | [hidden-layers (list 128 64)] 198 | [learning-rate 0.01] 199 | [epochs 10] 200 | [batch-size 32]) 201 | (printf "Initializing neural network...~n") 202 | 203 | ;; Determine network architecture 204 | (define input-size (cadr (t:shape X-train))) 205 | (define output-size (cadr (t:shape y-train))) 206 | (define layer-sizes (append (list input-size) hidden-layers (list output-size))) 207 | 208 | ;; Initialize network parameters 209 | (define-values (weights biases) 210 | (initialize-neural-network layer-sizes)) 211 | 212 | ;; Early stopping parameters 213 | (define patience 3) 214 | (define best-val-accuracy 0.0) 215 | (define wait 0) 216 | (define best-weights weights) 217 | (define best-biases biases) 218 | 219 | (printf "Neural network architecture: ~a~n" layer-sizes) 220 | (printf "Starting training for ~a epochs...~n" epochs) 221 | 222 | ;; Training loop 223 | (for/fold ([stop? #f]) 224 | ([epoch (in-range epochs)] #:break stop?) 225 | (printf "Epoch ~a/~a:~n" (add1 epoch) epochs) 226 | 227 | ;; Shuffle training data 228 | (define train-size (car (t:shape X-train))) 229 | (define indices (shuffle (range train-size))) 230 | (define num-batches (quotient train-size batch-size)) 231 | 232 | ;; Initialize tracking variables 233 | (define epoch-losses '()) 234 | 235 | ;; Process each batch 236 | (for ([batch (in-range num-batches)]) 237 | (let* ([start-idx (* batch batch-size)] 238 | [end-idx (min (+ start-idx batch-size) train-size)] 239 | [batch-indices (take (drop indices start-idx) (- end-idx start-idx))]) 240 | 241 | ;; Get batch data 242 | (define-values (X-batch y-batch) 243 | (get-batch X-train y-train batch-indices)) 244 | 245 | ;; Forward pass 246 | (define-values (y-pred hidden-activations z-values) 247 | (forward-pass X-batch weights biases relu)) 248 | 249 | ;; Calculate loss 250 | (define loss (cross-entropy-loss y-pred y-batch)) 251 | (set! epoch-losses (cons loss epoch-losses)) 252 | 253 | ;; Backpropagation and parameter update 254 | (define-values (new-weights new-biases) 255 | (backpropagation X-batch y-batch y-pred 256 | hidden-activations z-values 257 | weights biases learning-rate)) 258 | 259 | ;; Update parameters 260 | (set! weights new-weights) 261 | (set! biases new-biases) 262 | 263 | ;; Print progress 264 | (when (= (modulo batch 10) 0) 265 | (printf " Batch ~a/~a: Loss = ~a~n" 266 | batch 267 | num-batches 268 | (real->decimal-string loss 4))))) 269 | 270 | ;; Evaluate on validation set 271 | (define-values (val-pred _ __) 272 | (forward-pass X-val weights biases relu)) 273 | 274 | (define val-predictions (predict val-pred)) 275 | (define val-accuracy (* 100.0 (accuracy val-predictions y-val))) 276 | 277 | ;; Calculate average loss for epoch 278 | (define avg-loss 279 | (/ (apply + epoch-losses) (length epoch-losses))) 280 | 281 | (printf "Epoch Summary:~n") 282 | (printf " Average Loss: ~a~n" (real->decimal-string avg-loss 4)) 283 | (printf " Validation Accuracy: ~a%~n" (real->decimal-string val-accuracy 2)) 284 | 285 | ;; Early stopping check 286 | (if (> val-accuracy best-val-accuracy) 287 | (begin 288 | (set! best-val-accuracy val-accuracy) 289 | (set! best-weights weights) 290 | (set! best-biases biases) 291 | (set! 
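;; reset the early-stopping patience counter after an improvement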
wait 0) 292 | (printf " New best model saved!~n")) 293 | (begin 294 | (set! wait (add1 wait)) 295 | (printf " No improvement for ~a epochs.~n" wait))) 296 | 297 | ;; Check if we should stop 298 | (if (>= wait patience) 299 | (begin 300 | (printf "Early stopping triggered after ~a epochs without improvement.~n" 301 | patience) 302 | #t) ; Stop training 303 | #f)) ; Continue training 304 | 305 | ;; Restore best model 306 | (set! weights best-weights) 307 | (set! biases best-biases) 308 | 309 | (printf "Training Complete!~n") 310 | (printf "Best Validation Accuracy: ~a%~n" 311 | (real->decimal-string best-val-accuracy 2)) 312 | 313 | ;; Return the trained model 314 | (values weights biases)) 315 | 316 | ;; Example usage in a main module 317 | (module+ main 318 | (require "mnist.rkt") 319 | 320 | (printf "Loading MNIST data...~n") 321 | 322 | ;; Data loading functions are in mnist.rkt 323 | (define-values (X-train-full y-train-full) (load-mnist-data "train")) 324 | (define-values (X-test y-test) (load-mnist-data "test")) 325 | 326 | ;; Normalize and one-hot encode 327 | (define X-train-norm (normalize X-train-full)) 328 | (define X-test-norm (normalize X-test)) 329 | (define y-train-onehot (one-hot y-train-full 10)) 330 | (define y-test-onehot (one-hot y-test 10)) 331 | 332 | ;; Create validation split 333 | (define train-size (car (t:shape X-train-norm))) 334 | (define validation-split 0.1) 335 | (define val-size (inexact->exact (floor (* train-size validation-split)))) 336 | (define train-size-actual (- train-size val-size)) 337 | 338 | ;; Shuffle indices 339 | (define all-indices (shuffle (range train-size))) 340 | (define train-indices (take all-indices train-size-actual)) 341 | (define val-indices (drop all-indices train-size-actual)) 342 | 343 | ;; Create training and validation sets 344 | (define-values (X-train y-train) (get-batch X-train-norm y-train-onehot train-indices)) 345 | (define-values (X-val y-val) (get-batch X-train-norm y-train-onehot val-indices)) 346 | 347 | (printf "Training FNN on MNIST...~n") 348 | (printf "Train set: ~a examples~n" (car (t:shape X-train))) 349 | (printf "Validation set: ~a examples~n" (car (t:shape X-val))) 350 | (printf "Test set: ~a examples~n" (car (t:shape X-test-norm))) 351 | 352 | ;; Train the model 353 | (define-values (trained-weights trained-biases) 354 | (train-fnn X-train y-train X-val y-val 355 | (list 128) ; One hidden layer with 128 units 356 | 0.01 ; Learning rate 357 | 5 ; Epochs 358 | 64)) ; Batch size 359 | 360 | ;; Evaluate on test set 361 | (define-values (test-pred _ __) 362 | (forward-pass X-test-norm trained-weights trained-biases relu)) 363 | 364 | (define test-predictions (predict test-pred)) 365 | (define test-accuracy (* 100.0 (accuracy test-predictions y-test-onehot))) 366 | 367 | (printf "Final Test Accuracy: ~a%~n" 368 | (real->decimal-string test-accuracy 2))) -------------------------------------------------------------------------------- /OPTIMIZATION_STRATEGY.md: -------------------------------------------------------------------------------- 1 | # RacoGrad Optimization Strategy 2 | 3 | This document outlines the comprehensive plan for optimizing the RacoGrad deep learning library to achieve exponential speedups. 4 | 5 | ## Performance Optimization Techniques 6 | 7 | ### 1. 
C Extensions (Immediate Wins) 8 | 9 | Native C implementations provide significant speedups for core operations: 10 | 11 | - **Matrix Multiplication**: 10-100x faster than pure Racket 12 | - **Element-wise Operations**: 5-20x faster 13 | - **Activation Functions**: 3-10x faster 14 | 15 | **Implementation Files**: 16 | - `matrix_multiplication.c`: Core C implementations 17 | - `ffi_ops.rkt`: Racket FFI interface 18 | 19 | **Usage**: 20 | ```racket 21 | (require "ffi_ops.rkt") 22 | ;; Use C-accelerated matrix multiply 23 | (c:matrix-multiply rows-a cols-a cols-b input-vec weights-vec output-vec) 24 | ``` 25 | 26 | ### 2. SIMD Vectorization (CPU Optimization) 27 | 28 | SIMD instructions leverage CPU vector units for parallel data processing: 29 | 30 | - **AVX/SSE Instructions**: Process 4/2 doubles simultaneously 31 | - **Key Operations**: Matrix multiplication, element-wise ops, activations 32 | 33 | **Implementation Files**: 34 | - `simd_ops.c`: SIMD-optimized implementations 35 | - `simd_ffi.rkt`: Racket FFI interface for SIMD ops 36 | 37 | **Expected Gains**: 38 | - 2-4x speedup over basic C implementations 39 | - Automatic fallback to non-SIMD code when unavailable 40 | 41 | ### 3. Memory Optimization (Reduced Overhead) 42 | 43 | Better memory management reduces allocation overhead: 44 | 45 | - **Contiguous Memory**: Using `f64vector` instead of Racket vectors 46 | - **In-place Operations**: Modifying tensors directly 47 | - **Memory Layout**: Cache-friendly data arrangement 48 | 49 | **Implementation Files**: 50 | - `tensor_optimized.rkt`: Memory-optimized tensor implementation 51 | 52 | **Usage Example**: 53 | ```racket 54 | ;; Out-of-place (returns new tensor) 55 | (t-opt:add tensor1 tensor2) 56 | ;; In-place (modifies tensor1) 57 | (t-opt:add! tensor1 tensor2) 58 | ``` 59 | 60 | ### 4. Parallelization (Multi-core Utilization) 61 | 62 | Multi-threading for batch processing and large operations: 63 | 64 | - **Batch Processing**: Split mini-batches across threads 65 | - **Parallel Matrix Operations**: Divide work among threads 66 | 67 | **Implementation Files**: 68 | - `parallel_ops.c`: Parallel implementations using pthreads 69 | - `parallel_ffi.rkt`: Racket FFI interface 70 | 71 | **Expected Gains**: 72 | - Near-linear scaling with number of cores 73 | - 2-8x speedup on typical multi-core systems 74 | 75 | ### 5. GPU Acceleration (Massive Parallelism) 76 | 77 | Leveraging GPU for massively parallel computations: 78 | 79 | - **OpenCL**: Cross-platform GPU acceleration 80 | - **ArrayFire/ONNX**: High-level GPU libraries 81 | - **Vulkan Compute**: Modern GPU API option 82 | 83 | **Implementation Files**: 84 | - `gpu_acceleration.md`: Implementation guide 85 | - `ocl_kernels.cl`: OpenCL kernel examples 86 | 87 | ## Integration Strategy 88 | 89 | ### Phase 1: C Extensions & Memory Optimization 90 | 91 | 1. Replace most performance-critical Racket functions with C implementations 92 | 2. Implement memory-optimized tensor structure with `f64vector` 93 | 3. Add in-place operation variants 94 | 4. Benchmark and identify remaining bottlenecks 95 | 96 | ### Phase 2: SIMD & Parallelization 97 | 98 | 1. Add SIMD optimizations to C code 99 | 2. Implement multi-threading for batch processing 100 | 3. Parallelize large matrix operations 101 | 4. Update Racket interfaces to expose thread count parameters 102 | 103 | ### Phase 3: GPU Acceleration 104 | 105 | 1. Start with OpenCL for broadest compatibility 106 | 2. Implement key kernels for matrix multiplication and convolution 107 | 3. 
Add automatic fallback to CPU when GPU is unavailable 108 | 4. Create tensor-to-GPU memory transfer utilities 109 | 110 | ## Benchmarking Framework 111 | 112 | Implement a comprehensive benchmarking system: 113 | 114 | ```racket 115 | (benchmark-ops 116 | (list 117 | (cons "Racket Matrix Multiply" (lambda () (t:mul A B))) 118 | (cons "C Matrix Multiply" (lambda () (t-opt:mul A B))) 119 | (cons "SIMD Matrix Multiply" (lambda () (t-simd:mul A B))) 120 | (cons "GPU Matrix Multiply" (lambda () (t-gpu:mul A B))))) 121 | ``` 122 | 123 | ## Implementation Priorities 124 | 125 | 1. **Matrix Multiplication**: Most impactful operation in neural networks 126 | 2. **Convolution Operations**: Critical for CNN performance 127 | 3. **Element-wise Operations**: Used throughout backpropagation 128 | 4. **Activation Functions**: Applied to every neuron 129 | 130 | ## Usage Examples 131 | 132 | ### Original Racket Code: 133 | ```racket 134 | (define output (dense-forward input weights biases relu)) 135 | ``` 136 | 137 | ### Optimized Version: 138 | ```racket 139 | ;; CPU optimized with SIMD 140 | (define output (dense-forward-opt input weights biases relu-simd)) 141 | 142 | ;; GPU accelerated 143 | (define output (dense-forward-gpu input weights biases relu-gpu)) 144 | ``` 145 | 146 | ## Deployment Considerations 147 | 148 | - Auto-detect available optimizations (SIMD, number of cores, GPU) 149 | - Provide unified interface with automatic backend selection 150 | - Maintain compatibility with existing RacoGrad code -------------------------------------------------------------------------------- /RACOGRAD_USER_GUIDE.md: -------------------------------------------------------------------------------- 1 | # RacoGrad: A Deep Learning Framework in Racket 2 | 3 | ## Overview 4 | 5 | RacoGrad is a deep learning framework implemented in Racket that provides functionality for creating, training, and evaluating neural networks. It includes support for: 6 | 7 | - Tensor operations 8 | - Automatic differentiation (autograd) 9 | - Convolutional Neural Networks (CNNs) 10 | - Device-aware computation (CPU, GPU via OpenCL, and Apple Silicon via MLX) 11 | - MNIST dataset handling 12 | 13 | ## Getting Started 14 | 15 | ### Installation 16 | 17 | 1. Make sure you have Racket installed on your system 18 | 2. Clone this repository 19 | 3. 
Compile the C/C++ extensions by running: 20 | ```bash 21 | ./compile_extensions.sh 22 | ``` 23 | 24 | ### Basic Usage 25 | 26 | #### Training a Logistic Regression Model on MNIST 27 | 28 | ```racket 29 | #lang racket 30 | (require "mnist.rkt") 31 | 32 | ;; The mnist.rkt module will automatically load and train a logistic regression model 33 | ;; on the MNIST dataset when required 34 | ``` 35 | 36 | #### Training a CNN on MNIST 37 | 38 | ```racket 39 | #lang racket 40 | (require "CNN.rkt") 41 | 42 | ;; To train a CNN on the default device (MLX if available): 43 | (train-cnn) 44 | 45 | ;; To specify the device: 46 | (train-cnn 'cpu) ; Use CPU 47 | (train-cnn 'mlx) ; Use MLX (Apple Silicon) 48 | (train-cnn 'gpu) ; Use GPU (via OpenCL) 49 | 50 | ;; To specify training parameters: 51 | (train-cnn 'cpu 10 64) ; 10 epochs, batch size 64 52 | ``` 53 | 54 | ## Core Components 55 | 56 | ### Tensor Operations 57 | 58 | The tensor operations are provided by "tensor.rkt" and "tensor_device.rkt": 59 | 60 | ```racket 61 | (require "tensor.rkt") ; Basic tensor operations 62 | (require "tensor_device.rkt") ; Device-aware tensor operations 63 | 64 | ;; Create a tensor 65 | (define t (t:create '(2 3) #(1 2 3 4 5 6))) 66 | 67 | ;; Create a device-aware tensor 68 | (require "device.rkt") 69 | (define dt (dt:create '(2 3) #(1 2 3 4 5 6) (cpu))) 70 | 71 | ;; Perform operations 72 | (t:add t1 t2) ; Add two tensors 73 | (dt:add dt1 dt2) ; Add two device tensors 74 | 75 | ;; Move tensor between devices 76 | (dt:to dt (gpu)) ; Move tensor to GPU 77 | ``` 78 | 79 | ### Automatic Differentiation 80 | 81 | The autograd functionality is provided by "autograd.rkt": 82 | 83 | ```racket 84 | (require "autograd.rkt") 85 | 86 | ;; Activation functions 87 | (relu x) ; ReLU activation 88 | (sigmoid x) ; Sigmoid activation 89 | 90 | ;; Forward pass 91 | (dense-forward input weights biases activation-fn) 92 | 93 | ;; Backward pass 94 | (dense-backward input weights biases output grad-output 95 | activation-derivative learning-rate) 96 | ``` 97 | 98 | ### CNN Operations 99 | 100 | CNN operations are provided by "cnn_ops.rkt": 101 | 102 | ```racket 103 | (require "cnn_ops.rkt") 104 | 105 | ;; These are lower-level C function bindings 106 | ;; Higher-level API is provided in CNN.rkt 107 | 108 | ;; Load optimal implementation for current device 109 | (load-optimal-ops (current-device)) 110 | ``` 111 | 112 | ## Hardware Support 113 | 114 | RacoGrad can utilize different hardware backends: 115 | 116 | - **CPU**: Default, works on all systems 117 | - **GPU**: Via OpenCL for systems with compatible GPUs 118 | - **MLX**: Accelerated operations on Apple Silicon (M1/M2/M3) 119 | 120 | To select a device: 121 | 122 | ```racket 123 | (require "device.rkt") 124 | (require "hardware_detection.rkt") 125 | 126 | ;; Set current device 127 | (set-current-device! (cpu)) ; Use CPU 128 | (set-current-device! (gpu)) ; Use GPU if available 129 | (set-current-device! (mlx)) ; Use MLX if on Apple Silicon 130 | 131 | ;; Check hardware availability 132 | (gpu-available?) ; Returns #t if GPU is available 133 | (device-available? 
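;; backends are named by symbol; cnn_benchmark.rkt also probes 'cuda and 'opencl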
'mlx)   ; Returns #t if MLX is available
134 | ```
135 | 
136 | ## Example: Custom Model
137 | 
138 | Here's how to create a custom neural network:
139 | 
140 | ```racket
141 | (require "tensor.rkt")
142 | (require "autograd.rkt")
143 | 
144 | ;; Define model parameters
145 | (define input-size 784)
146 | (define hidden-size 128)
147 | (define output-size 10)
148 | 
149 | ;; Initialize weights and biases
150 | (define W1 (t:random (list input-size hidden-size) 0.01))
151 | (define b1 (t:random (list 1 hidden-size) 0.01))
152 | (define W2 (t:random (list hidden-size output-size) 0.01))
153 | (define b2 (t:random (list 1 output-size) 0.01))
154 | 
155 | ;; Forward pass function
156 | (define (my-model input)
157 |   (let* ([hidden (relu (dense-forward input W1 b1 identity))]
158 |          [output (dense-forward hidden W2 b2 identity)])
159 |     output))
160 | 
161 | ;; Predict function: index of the largest output value
162 | (define (predict output-tensor)
163 |   (let ([v (t:data output-tensor)]) (argmax (lambda (i) (vector-ref v i)) (range (vector-length v)))))
164 | ```
165 | 
166 | ## Advanced Features
167 | 
168 | ### Model Saving/Loading
169 | 
170 | To save and load models, you can use Racket's built-in serialization:
171 | 
172 | ```racket
173 | (require racket/serialize)
174 | 
175 | ;; Save model parameters
176 | (define out (open-output-file "model.dat" #:exists 'replace))
177 | (write (serialize
178 |         (list (t:data weights) (t:shape weights)
179 |               (t:data bias) (t:shape bias)))
180 |        out)
181 | (close-output-port out)
182 | 
183 | ;; Load model parameters
184 | (define in (open-input-file "model.dat"))
185 | (define params (deserialize (read in)))
186 | (define loaded-weights
187 |   (t:create (second params) (first params)))
188 | (define loaded-bias
189 |   (t:create (fourth params) (third params)))
190 | (close-input-port in)
191 | ```
192 | 
193 | ## Troubleshooting
194 | 
195 | - If you encounter errors with the C extensions, make sure to run the `compile_extensions.sh` script.
196 | - Some operations require specific hardware. If not available, the framework will fall back to CPU.
197 | - MNIST data needs to be in the `mnist-data` directory. The scripts expect the standard MNIST files:
198 |   - `train-images.idx3-ubyte`
199 |   - `train-labels.idx1-ubyte`
200 |   - `t10k-images.idx3-ubyte`
201 |   - `t10k-labels.idx1-ubyte`
202 | 
203 | ## Performance Tips
204 | 
205 | - Use device-aware tensors (dt:*) rather than basic tensors (t:*) for better performance
206 | - When running on Apple Silicon, use the MLX device for best performance
207 | - Batch processing improves throughput significantly
208 | - For large datasets, use mini-batch training rather than full batch
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RacoGrad: A Deep Learning Framework in Racket
2 | 
3 | RacoGrad is a deep learning framework implemented in Racket, a dialect of the Lisp/Scheme family of programming languages. It provides a comprehensive set of tools for creating, training, and evaluating neural networks, with a focus on both educational value and practical functionality. 
4 | 
5 | ## Features
6 | 
7 | - **Tensor Operations**: Comprehensive set of tensor operations with shape checking and broadcasting
8 | - **Device-Aware Computation**: Support for CPU, GPU (via OpenCL), and MLX (Apple Silicon)
9 | - **Automatic Differentiation**: Backpropagation for gradient computation
10 | - **CNN Support**: Implementation of convolutional neural networks with LeNet-5 architecture
11 | - **MNIST Dataset Handling**: Tools for loading, preprocessing, and training on MNIST
12 | - **Early Stopping and Validation**: Techniques to prevent overfitting during training
13 | 
14 | ## Installation
15 | 
16 | 1. Clone this repository:
17 |    ```bash
18 |    git clone https://github.com/dev-null321/RacoGrad.git
19 |    cd RacoGrad
20 |    ```
21 | 
22 | 2. Download the MNIST dataset:
23 |    ```bash
24 |    mkdir -p mnist-data
25 |    cd mnist-data
26 |    ```
27 | 
28 |    Download the following files and place them in the `mnist-data` directory: `train-images.idx3-ubyte`, `train-labels.idx1-ubyte`, `t10k-images.idx3-ubyte`, and `t10k-labels.idx1-ubyte`.
29 | 
30 |    Note: I used Kaggle to download the dataset; download it and place it in the `mnist-data` directory.
31 | 
32 |    Note: Remember to gunzip these files after downloading.
33 | 
34 | 3. Compile the C extensions:
35 |    ```bash
36 |    ./compile_extensions.sh
37 |    ```
38 | 
39 | ## Usage
40 | 
41 | ### Training a Logistic Regression Model on MNIST
42 | 
43 | ```racket
44 | #lang racket
45 | (require "mnist.rkt")
46 | 
47 | ;; The mnist.rkt module will automatically load and train a logistic regression model
48 | ;; on the MNIST dataset when required
49 | ```
50 | 
51 | ### Training a CNN on MNIST
52 | 
53 | ```racket
54 | #lang racket
55 | (require "CNN.rkt")
56 | 
57 | ;; To train a CNN on the default device (MLX if available):
58 | (train-cnn)
59 | 
60 | ;; To specify the device:
61 | (train-cnn 'cpu)  ; Use CPU
62 | (train-cnn 'mlx)  ; Use MLX (Apple Silicon)
63 | (train-cnn 'gpu)  ; Use GPU (via OpenCL)
64 | 
65 | ;; To specify training parameters:
66 | (train-cnn 'cpu 10 64)  ; 10 epochs, batch size 64
67 | ```
68 | 
69 | ### Tensor Operations
70 | 
71 | ```racket
72 | (require "tensor.rkt")
73 | 
74 | ;; Create a tensor
75 | (define t (t:create '(2 3) #(1 2 3 4 5 6)))
76 | 
77 | ;; Basic operations
78 | (t:add t1 t2)       ; Add two tensors
79 | (t:mul t1 t2)       ; Matrix multiplication
80 | (t:scale t 2.0)     ; Scalar multiplication
81 | (t:transpose t)     ; Transpose tensor
82 | 
83 | ;; Device-aware tensors
84 | (require "tensor_device.rkt")
85 | (require "device.rkt")
86 | 
87 | ;; Create a device tensor on CPU
88 | (define dt (dt:create '(2 3) #(1 2 3 4 5 6) (cpu)))
89 | 
90 | ;; Move to GPU if available
91 | (dt:to dt (gpu))
92 | 
93 | ;; Operations automatically use the appropriate device
94 | (dt:add dt1 dt2)
95 | ```
96 | 
97 | ## Documentation
98 | 
99 | For detailed information about the implementation and usage, refer to the following documents:
100 | 
101 | - [User Guide](./RACOGRAD_USER_GUIDE.md): Basic usage guide for users
102 | - [Implementation Details](./RACOGRAD_IMPLEMENTATION_DETAILS.md): Technical details of the implementation
103 | - [Optimization Strategy](./OPTIMIZATION_STRATEGY.md): Performance optimization strategies
104 | - [GPU Acceleration](./gpu_acceleration.md): Details on GPU acceleration
105 | 
106 | ## New Additions
107 | 
108 | This updated version of RacoGrad includes several major enhancements:
109 | 
110 | - **Convolutional Neural Networks**: Full implementation of CNN with backpropagation
111 | - **Device-Aware Computing**: Abstraction layer for running on different hardware
112 | - **Hardware Acceleration**: Support for GPU via OpenCL and Apple Silicon via MLX
113 | - **Improved MNIST 
Training**: Added validation splits and early stopping 114 | - **Better Documentation**: Comprehensive documentation of the implementation 115 | 116 | ## Contributing 117 | 118 | Contributions are welcome! Please feel free to submit a Pull Request. 119 | 120 | ## License 121 | 122 | This project is open source and available under the MIT License. 123 | -------------------------------------------------------------------------------- /autograd.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt") 4 | 5 | (provide dense-forward 6 | mean-squared-error 7 | dense-backward 8 | relu 9 | relu-derivative 10 | initialize-fnn 11 | sigmoid 12 | sigmoid-derivative) 13 | 14 | ;; Activation functions 15 | (define (relu x) 16 | (t:create (t:shape x) 17 | (for/vector ([v (t:data x)]) (max 0 v)))) 18 | 19 | (define (relu-derivative x) 20 | (t:create (t:shape x) 21 | (for/vector ([v (t:data x)]) (if (> v 0) 1 0)))) 22 | 23 | (define (sigmoid x) 24 | (t:create (t:shape x) 25 | (for/vector ([v (t:data x)]) (/ 1 (+ 1 (exp (- v))))))) 26 | 27 | (define (sigmoid-derivative x) 28 | (let ([sig (sigmoid x)]) 29 | (t:create (t:shape x) 30 | (for/vector ([v (t:data sig)]) (* v (- 1 v)))))) 31 | 32 | (define (tanh x) 33 | (t:create (t:shape x) 34 | (for/vector ([v (t:data x)]) 35 | (let ([e^v (exp v)] 36 | [e^-v (exp (- v))]) 37 | (/ (- e^v e^-v) (+ e^v e^-v)))))) 38 | 39 | (define (tanh-derivative x) 40 | (let ([t (tanh x)]) 41 | (t:create (t:shape x) 42 | (for/vector ([v (t:data t)]) (- 1 (* v v)))))) 43 | 44 | ;; Forward pass through a dense layer 45 | (define (dense-forward input weights biases activation-fn) 46 | (let* ([mul-result (t:mul input weights)] 47 | [output-dim (cadr (t:shape mul-result))] 48 | [reshaped-biases (t:reshape biases (list output-dim))] 49 | [z (t:add mul-result reshaped-biases)] 50 | [activation-output (activation-fn z)]) 51 | activation-output)) 52 | 53 | ;; Mean Squared Error 54 | (define (mean-squared-error y-true y-pred) 55 | (let* ([diff (t:sub y-true y-pred)] 56 | [squared-diff (t:mul diff diff)] 57 | [sum (apply + (vector->list (t:data squared-diff)))]) 58 | (/ sum (length (vector->list (t:data y-true)))))) 59 | 60 | ;; Backward pass for a dense layer 61 | (define (dense-backward input weights biases output grad-output activation-derivative learning-rate) 62 | (let* ([grad-activation (activation-derivative output)] 63 | [grad-z (t:mul grad-output grad-activation)] 64 | [grad-weights (t:mul (t:transpose input) grad-z)] 65 | [bias-len (vector-length (t:data biases))] 66 | ;; Compute grad-biases by summing each column of grad-z 67 | [grad-biases (t:create (list bias-len) 68 | (for/vector ([j bias-len]) 69 | (apply + 70 | (for/list ([i (car (t:shape grad-z))]) 71 | (vector-ref (t:data grad-z) 72 | (+ (* i bias-len) j))))))] 73 | [grad-input (t:mul grad-z (t:transpose weights))]) 74 | (values grad-weights grad-biases grad-input))) 75 | 76 | ;; Initialize a fully-connected neural network layer (input-tensor, weights, biases) 77 | (define (initialize-fnn batch-size input-dim output-dim) 78 | (let* ([input-data (make-list (* batch-size input-dim) 0)] 79 | [input-tensor (t:create (list batch-size input-dim) input-data)] 80 | [weight-shape (list input-dim output-dim)] 81 | [bias-shape (list output-dim)] 82 | [weights (t:random weight-shape 1.0)] 83 | [biases (t:random bias-shape 1.0)]) 84 | (values input-tensor weights biases))) 85 | -------------------------------------------------------------------------------- 
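A minimal end-to-end usage sketch for the autograd.rkt interface above: one forward/backward step of a single dense layer trained with SGD on mean squared error. The layer sizes, the 0.1 learning rate, and the MSE gradient 2/n * (y-pred - y-true) are illustrative assumptions, not definitions from the library.

```racket
#lang racket
(require "tensor.rkt"
         "autograd.rkt")

;; A tiny layer: 4 inputs -> 3 outputs, batch of 2.
;; initialize-fnn also returns a zero input placeholder, which we ignore here.
(define-values (_ weights biases) (initialize-fnn 2 4 3))
(define x (t:random '(2 4) 1.0))      ; pretend batch of inputs
(define y-true (t:random '(2 3) 1.0)) ; pretend targets

;; Forward pass and loss
(define y-pred (dense-forward x weights biases sigmoid))
(printf "loss: ~a\n" (mean-squared-error y-true y-pred))

;; Gradient of MSE w.r.t. the predictions: 2/n * (y-pred - y-true),
;; matching the 1/n normalization used by mean-squared-error above.
(define n (apply * (t:shape y-pred)))
(define grad-output (t:scale (t:sub y-pred y-true) (/ 2.0 n)))

;; Backward pass returns weight, bias, and input gradients
(define-values (grad-w grad-b grad-in)
  (dense-backward x weights biases y-pred grad-output
                  sigmoid-derivative 0.1))

;; Plain SGD update (illustrative learning rate)
(define new-weights (t:sub weights (t:scale grad-w 0.1)))
(define new-biases (t:sub biases (t:scale grad-b 0.1)))
```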
/benchmark.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt" 4 | "tensor_optimized.rkt" 5 | "ffi_ops.rkt") 6 | 7 | (provide run-benchmarks) 8 | 9 | ;; Run the benchmarks automatically when this file is executed directly 10 | (module+ main 11 | (run-benchmarks)) 12 | 13 | ;; Utility to time function execution 14 | (define (time-execution func) 15 | (let ([start (current-inexact-milliseconds)]) 16 | (func) 17 | (- (current-inexact-milliseconds) start))) 18 | 19 | ;; Run a benchmark multiple times and return average 20 | (define (run-benchmark name func iterations) 21 | (printf "Running benchmark: ~a\n" name) 22 | (let* ([times (for/list ([i iterations]) 23 | (time-execution func))] 24 | [avg (/ (apply + times) iterations)]) 25 | (printf " Average time over ~a iterations: ~a ms\n" iterations avg) 26 | (cons name avg))) 27 | 28 | ;; Helper function to create random matrices of given dimensions 29 | (define (create-random-matrices rows-a cols-a cols-b) 30 | (let ([a (t:random (list rows-a cols-a) 1.0)] 31 | [b (t:random (list cols-a cols-b) 1.0)]) 32 | (values a b))) 33 | 34 | ;; Helper to create optimized random matrices 35 | (define (create-random-optimized-matrices rows-a cols-a cols-b) 36 | (let ([a (t-opt:random (list rows-a cols-a) 1.0)] 37 | [b (t-opt:random (list cols-a cols-b) 1.0)]) 38 | (values a b))) 39 | 40 | ;; Benchmark matrix multiplication 41 | (define (benchmark-matrix-mul iterations) 42 | (printf "\n=== Matrix Multiplication Benchmark ===\n") 43 | 44 | (define rows-a 100) 45 | (define cols-a 100) 46 | (define cols-b 100) 47 | 48 | (let-values ([(a b) (create-random-matrices rows-a cols-a cols-b)] 49 | [(a-opt b-opt) (create-random-optimized-matrices rows-a cols-a cols-b)]) 50 | 51 | ;; Standard Racket implementation 52 | (define racket-result 53 | (run-benchmark "Racket Matrix Multiply" 54 | (lambda () (t:mul a b)) 55 | iterations)) 56 | 57 | ;; C implementation via optimized tensors 58 | (define c-result 59 | (run-benchmark "C Matrix Multiply" 60 | (lambda () (t-opt:mul a-opt b-opt)) 61 | iterations)) 62 | 63 | ;; Print speedup 64 | (printf "\nSpeedup: ~a times faster\n" 65 | (exact->inexact (/ (cdr racket-result) (cdr c-result)))))) 66 | 67 | ;; Benchmark element-wise operations 68 | (define (benchmark-elementwise iterations) 69 | (printf "\n=== Element-wise Operations Benchmark ===\n") 70 | 71 | (define size 1000000) 72 | 73 | (let ([a (t:random (list size) 1.0)] 74 | [b (t:random (list size) 1.0)] 75 | [a-opt (t-opt:random (list size) 1.0)] 76 | [b-opt (t-opt:random (list size) 1.0)]) 77 | 78 | ;; Addition 79 | (define racket-add 80 | (run-benchmark "Racket Add" 81 | (lambda () (t:add a b)) 82 | iterations)) 83 | 84 | (define c-add 85 | (run-benchmark "C Add" 86 | (lambda () (t-opt:add a-opt b-opt)) 87 | iterations)) 88 | 89 | (printf "\nAdd Speedup: ~a times faster\n" 90 | (exact->inexact (/ (cdr racket-add) (cdr c-add)))) 91 | 92 | ;; Multiplication 93 | (define racket-mul 94 | (run-benchmark "Racket Element-wise Mul" 95 | (lambda () (t:mul a b)) 96 | iterations)) 97 | 98 | (define c-mul 99 | (run-benchmark "C Element-wise Mul" 100 | (lambda () (t-opt:mul a-opt b-opt)) 101 | iterations)) 102 | 103 | (printf "\nMultiply Speedup: ~a times faster\n" 104 | (exact->inexact (/ (cdr racket-mul) (cdr c-mul)))))) 105 | 106 | ;; Run all benchmarks 107 | (define (run-benchmarks [iterations 10]) 108 | (printf "\n====================================\n") 109 | (printf " RACOGRAD PERFORMANCE 
BENCHMARKS\n") 110 | (printf "====================================\n") 111 | 112 | (benchmark-matrix-mul iterations) 113 | (benchmark-elementwise iterations) 114 | 115 | (printf "\n====================================\n") 116 | (printf "Benchmarks complete!\n") 117 | (printf "See OPTIMIZATION_STRATEGY.md for more details on optimizations.\n") 118 | (printf "====================================\n")) -------------------------------------------------------------------------------- /cnn_benchmark.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "CNN.rkt" 4 | "device.rkt" 5 | "hardware_detection.rkt") 6 | 7 | ;; Benchmark CNN on different devices 8 | (define (benchmark-cnn) 9 | (printf "====================================\n") 10 | (printf " RACOGRAD CNN BENCHMARKS\n") 11 | (printf "====================================\n") 12 | 13 | (print-hardware-info) 14 | 15 | ;; Set parameters for quick benchmarking 16 | (define epochs 2) 17 | (define batch-size 32) 18 | 19 | ;; Run on CPU 20 | (printf "\n\nRunning CNN benchmark on CPU...\n") 21 | (define cpu-result (train-cnn 'cpu epochs batch-size)) 22 | 23 | (define results (list (cons 'cpu cpu-result))) 24 | 25 | ;; Run on MLX if available 26 | (when (device-available? 'mlx) 27 | (printf "\n\nRunning CNN benchmark on MLX (Apple Silicon)...\n") 28 | (define mlx-result (train-cnn 'mlx epochs batch-size)) 29 | (set! results (cons (cons 'mlx mlx-result) results))) 30 | 31 | ;; Run on CUDA if available 32 | (when (device-available? 'cuda) 33 | (printf "\n\nRunning CNN benchmark on CUDA (NVIDIA GPU)...\n") 34 | (define cuda-result (train-cnn 'cuda epochs batch-size)) 35 | (set! results (cons (cons 'cuda cuda-result) results))) 36 | 37 | ;; Run on OpenCL if available 38 | (when (device-available? 'opencl) 39 | (printf "\n\nRunning CNN benchmark on OpenCL...\n") 40 | (define opencl-result (train-cnn 'opencl epochs batch-size)) 41 | (set! results (cons (cons 'opencl opencl-result) results))) 42 | 43 | ;; Compare results 44 | (printf "\n\n====================================\n") 45 | (printf " PERFORMANCE COMPARISON\n") 46 | (printf "====================================\n") 47 | 48 | (define cpu-time (hash-ref (cdr (assoc 'cpu results)) 'time)) 49 | (define cpu-accuracy (hash-ref (cdr (assoc 'cpu results)) 'accuracy)) 50 | 51 | (printf "CPU Training Time: ~a seconds\n" (real->decimal-string cpu-time 2)) 52 | (printf "CPU Accuracy: ~a%\n\n" (real->decimal-string cpu-accuracy 2)) 53 | 54 | (for ([result (in-list results)]) 55 | (unless (eq? 
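;; the CPU entry is the baseline, so skip it in the per-device comparison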
(car result) 'cpu)
56 |       (let* ([device-type (car result)]
57 |              [device-result (cdr result)]
58 |              [device-time (hash-ref device-result 'time)]
59 |              [device-accuracy (hash-ref device-result 'accuracy)]
60 |              [speedup (/ cpu-time device-time)])
61 |         (printf "~a Training Time: ~a seconds\n" 
62 |                 device-type 
63 |                 (real->decimal-string device-time 2))
64 |         (printf "~a Accuracy: ~a%\n" 
65 |                 device-type 
66 |                 (real->decimal-string device-accuracy 2))
67 |         (printf "Speedup vs CPU: ~a times faster\n\n" 
68 |                 (real->decimal-string speedup 2)))))
69 | 
70 |   (printf "====================================\n"))
71 | 
72 | ;; Run the benchmark from the command line
73 | (module+ main
74 |   (benchmark-cnn))
--------------------------------------------------------------------------------
/cnn_ops.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <math.h>
5 | 
6 | // Convolution operation (naive implementation)
7 | void conv2d_forward(int batch_size, int in_channels, int in_height, int in_width,
8 |                     int out_channels, int filter_height, int filter_width,
9 |                     int stride, int padding,
10 |                     double* input, double* filters, double* output) {
11 | 
12 |     int out_height = 1 + (in_height + 2 * padding - filter_height) / stride;
13 |     int out_width = 1 + (in_width + 2 * padding - filter_width) / stride;
14 | 
15 |     // Initialize output to zeros
16 |     int output_size = batch_size * out_channels * out_height * out_width;
17 |     memset(output, 0, output_size * sizeof(double));
18 | 
19 |     // For each element in the batch
20 |     for (int b = 0; b < batch_size; b++) {
21 |         // For each output channel
22 |         for (int oc = 0; oc < out_channels; oc++) {
23 |             // For each input channel
24 |             for (int ic = 0; ic < in_channels; ic++) {
25 |                 // For each output position
26 |                 for (int oh = 0; oh < out_height; oh++) {
27 |                     for (int ow = 0; ow < out_width; ow++) {
28 |                         // Calculate output index
29 |                         int out_idx = ((b * out_channels + oc) * out_height + oh) * out_width + ow;
30 | 
31 |                         // For each filter position
32 |                         for (int fh = 0; fh < filter_height; fh++) {
33 |                             for (int fw = 0; fw < filter_width; fw++) {
34 |                                 // Calculate input position with padding
35 |                                 int ih = oh * stride + fh - padding;
36 |                                 int iw = ow * stride + fw - padding;
37 | 
38 |                                 // Skip if outside the input boundaries
39 |                                 if (ih < 0 || ih >= in_height || iw < 0 || iw >= in_width) {
40 |                                     continue;
41 |                                 }
42 | 
43 |                                 // Calculate input index
44 |                                 int in_idx = ((b * in_channels + ic) * in_height + ih) * in_width + iw;
45 | 
46 |                                 // Calculate filter index
47 |                                 int filter_idx = ((oc * in_channels + ic) * filter_height + fh) * filter_width + fw;
48 | 
49 |                                 // Add contribution to output
50 |                                 output[out_idx] += input[in_idx] * filters[filter_idx];
51 |                             }
52 |                         }
53 |                     }
54 |                 }
55 |             }
56 |         }
57 |     }
58 | }
59 | 
60 | // Max pooling 2x2 with stride 2
61 | void max_pool_2x2(int batch_size, int channels, int in_height, int in_width,
62 |                   double* input, double* output) {
63 | 
64 |     int out_height = in_height / 2;
65 |     int out_width = in_width / 2;
66 | 
67 |     // Initialize output to very negative values
68 |     int output_size = batch_size * channels * out_height * out_width;
69 |     for (int i = 0; i < output_size; i++) {
70 |         output[i] = -1e9;
71 |     }
72 | 
73 |     // For each element in the batch
74 |     for (int b = 0; b < batch_size; b++) {
75 |         // For each channel
76 |         for (int c = 0; c < channels; c++) {
77 |             // For each pooling region
78 |             for (int oh = 0; oh < out_height; oh++) {
79 |                 for (int ow = 0; ow < out_width; ow++) {
80 |                     // Calculate 
output index 81 | int out_idx = ((b * channels + c) * out_height + oh) * out_width + ow; 82 | 83 | // For each input in the pooling region 84 | for (int kh = 0; kh < 2; kh++) { 85 | for (int kw = 0; kw < 2; kw++) { 86 | // Calculate input index 87 | int ih = oh * 2 + kh; 88 | int iw = ow * 2 + kw; 89 | int in_idx = ((b * channels + c) * in_height + ih) * in_width + iw; 90 | 91 | // Update max value 92 | if (input[in_idx] > output[out_idx]) { 93 | output[out_idx] = input[in_idx]; 94 | } 95 | } 96 | } 97 | } 98 | } 99 | } 100 | } 101 | } 102 | 103 | // Flatten a 4D tensor (batch, channels, height, width) to a 2D tensor (batch, channels*height*width) 104 | void flatten_tensor(int batch_size, int channels, int height, int width, 105 | double* input, double* output) { 106 | 107 | int flat_size = channels * height * width; 108 | 109 | // For each batch element 110 | for (int b = 0; b < batch_size; b++) { 111 | // For each channel 112 | for (int c = 0; c < channels; c++) { 113 | // For each height 114 | for (int h = 0; h < height; h++) { 115 | // For each width 116 | for (int w = 0; w < width; w++) { 117 | // Calculate input index 118 | int in_idx = ((b * channels + c) * height + h) * width + w; 119 | 120 | // Calculate flat index 121 | int flat_idx = b * flat_size + (c * height * width + h * width + w); 122 | 123 | // Copy value 124 | output[flat_idx] = input[in_idx]; 125 | } 126 | } 127 | } 128 | } 129 | } 130 | 131 | // Softmax operation 132 | void softmax(int batch_size, int num_classes, double* input, double* output) { 133 | // For each batch 134 | for (int b = 0; b < batch_size; b++) { 135 | // Find max value for stability 136 | double max_val = -1e9; 137 | for (int c = 0; c < num_classes; c++) { 138 | if (input[b * num_classes + c] > max_val) { 139 | max_val = input[b * num_classes + c]; 140 | } 141 | } 142 | 143 | // Calculate exp and sum 144 | double sum = 0.0; 145 | for (int c = 0; c < num_classes; c++) { 146 | output[b * num_classes + c] = exp(input[b * num_classes + c] - max_val); 147 | sum += output[b * num_classes + c]; 148 | } 149 | 150 | // Normalize 151 | for (int c = 0; c < num_classes; c++) { 152 | output[b * num_classes + c] /= sum; 153 | } 154 | } 155 | } 156 | 157 | // Cross entropy loss 158 | double cross_entropy_loss(int batch_size, int num_classes, double* pred, double* target) { 159 | double loss = 0.0; 160 | double epsilon = 1e-15; // Small value for numerical stability 161 | 162 | for (int b = 0; b < batch_size; b++) { 163 | for (int c = 0; c < num_classes; c++) { 164 | // Clamp prediction for numerical stability 165 | double p = pred[b * num_classes + c]; 166 | if (p < epsilon) p = epsilon; 167 | if (p > 1.0 - epsilon) p = 1.0 - epsilon; 168 | 169 | // Add to loss if target is 1 170 | if (target[b * num_classes + c] > 0.5) { 171 | loss -= log(p); 172 | } 173 | } 174 | } 175 | 176 | return loss / batch_size; 177 | } -------------------------------------------------------------------------------- /cnn_ops.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require ffi/unsafe 4 | ffi/vector 5 | "device.rkt" 6 | "hardware_detection.rkt") 7 | 8 | ;; Provide CNN operations 9 | (provide 10 | ;; CNN operations 11 | c:conv2d-forward 12 | c:max-pool-2x2 13 | c:flatten-tensor 14 | c:softmax 15 | c:cross-entropy-loss 16 | 17 | ;; Load functions based on available hardware 18 | load-optimal-ops) 19 | 20 | ;; Load CPU implementations with error handling 21 | (define cnn-lib 22 | (with-handlers ([exn:fail? 
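;; loading fails if compile_extensions.sh has not produced the shared library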
(lambda (e) 23 | (printf "Warning: cnn_ops library not found. ~a~n" (exn-message e)) 24 | #f)]) 25 | (ffi-lib "cnn_ops" '("" "0")))) 26 | 27 | ;; Try to load MLX implementations if on Apple Silicon 28 | (define mlx-cnn-lib 29 | (with-handlers ([exn:fail? (lambda (e) #f)]) 30 | (ffi-lib "mlx_cnn_ops" '("" "0")))) 31 | 32 | ;; Dummy implementations when library is not available 33 | (define (dummy-conv2d batch-size in-channels in-height in-width 34 | out-channels filter-height filter-width 35 | stride padding 36 | input filters output) 37 | (printf "Using dummy conv2d implementation~n")) 38 | 39 | (define (dummy-max-pool batch-size channels in-height in-width 40 | input output) 41 | (printf "Using dummy max-pool implementation~n")) 42 | 43 | (define (dummy-flatten batch-size channels height width 44 | input output) 45 | (printf "Using dummy flatten implementation~n")) 46 | 47 | (define (dummy-softmax batch-size num-classes 48 | input output) 49 | (printf "Using dummy softmax implementation~n")) 50 | 51 | (define (dummy-cross-entropy batch-size num-classes 52 | predictions targets) 53 | (printf "Using dummy cross-entropy implementation~n") 54 | 0.1) ; Return a dummy loss 55 | 56 | ;; Default implementations using CPU (or dummy if not available) 57 | (define c:conv2d-forward 58 | (if cnn-lib 59 | (with-handlers ([exn:fail? (lambda (e) dummy-conv2d)]) 60 | (get-ffi-obj "conv2d_forward" cnn-lib 61 | (_fun _int _int _int _int ; batch_size, in_channels, in_height, in_width 62 | _int _int _int ; out_channels, filter_height, filter_width 63 | _int _int ; stride, padding 64 | _f64vector _f64vector _f64vector -> _void))) 65 | dummy-conv2d)) 66 | 67 | (define c:max-pool-2x2 68 | (if cnn-lib 69 | (with-handlers ([exn:fail? (lambda (e) dummy-max-pool)]) 70 | (get-ffi-obj "max_pool_2x2" cnn-lib 71 | (_fun _int _int _int _int ; batch_size, channels, in_height, in_width 72 | _f64vector _f64vector -> _void))) 73 | dummy-max-pool)) 74 | 75 | (define c:flatten-tensor 76 | (if cnn-lib 77 | (with-handlers ([exn:fail? (lambda (e) dummy-flatten)]) 78 | (get-ffi-obj "flatten_tensor" cnn-lib 79 | (_fun _int _int _int _int ; batch_size, channels, height, width 80 | _f64vector _f64vector -> _void))) 81 | dummy-flatten)) 82 | 83 | (define c:softmax 84 | (if cnn-lib 85 | (with-handlers ([exn:fail? (lambda (e) dummy-softmax)]) 86 | (get-ffi-obj "softmax" cnn-lib 87 | (_fun _int _int ; batch_size, num_classes 88 | _f64vector _f64vector -> _void))) 89 | dummy-softmax)) 90 | 91 | (define c:cross-entropy-loss 92 | (if cnn-lib 93 | (with-handlers ([exn:fail? (lambda (e) dummy-cross-entropy)]) 94 | (get-ffi-obj "cross_entropy_loss" cnn-lib 95 | (_fun _int _int ; batch_size, num_classes 96 | _f64vector _f64vector -> _double))) 97 | dummy-cross-entropy)) 98 | 99 | ;; MLX implementations (if available) 100 | (define mlx:conv2d-forward 101 | (if mlx-cnn-lib 102 | (with-handlers ([exn:fail? (lambda (e) 103 | (printf "MLX conv2d not available: ~a~n" (exn-message e)) 104 | #f)]) 105 | (get-ffi-obj "mlx_conv2d_forward" mlx-cnn-lib 106 | (_fun _int _int _int _int 107 | _int _int _int 108 | _int _int 109 | _f64vector _f64vector _f64vector -> _void))) 110 | ;; If MLX not available, use dummy or CPU implementation 111 | (if cnn-lib c:conv2d-forward dummy-conv2d))) 112 | 113 | (define mlx:max-pool-2x2 114 | (if mlx-cnn-lib 115 | (with-handlers ([exn:fail? 
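;; resolve the MLX symbol; on failure the handler yields #f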
(lambda (e) #f)]) 116 | (get-ffi-obj "mlx_max_pool_2x2" mlx-cnn-lib 117 | (_fun _int _int _int _int 118 | _f64vector _f64vector -> _void))) 119 | ;; If MLX not available, use dummy or CPU implementation 120 | (if cnn-lib c:max-pool-2x2 dummy-max-pool))) 121 | 122 | (define mlx:flatten-tensor 123 | (if mlx-cnn-lib 124 | (with-handlers ([exn:fail? (lambda (e) #f)]) 125 | (get-ffi-obj "mlx_flatten_tensor" mlx-cnn-lib 126 | (_fun _int _int _int _int 127 | _f64vector _f64vector -> _void))) 128 | ;; If MLX not available, use dummy or CPU implementation 129 | (if cnn-lib c:flatten-tensor dummy-flatten))) 130 | 131 | (define mlx:softmax 132 | (if mlx-cnn-lib 133 | (with-handlers ([exn:fail? (lambda (e) #f)]) 134 | (get-ffi-obj "mlx_softmax" mlx-cnn-lib 135 | (_fun _int _int 136 | _f64vector _f64vector -> _void))) 137 | ;; If MLX not available, use dummy or CPU implementation 138 | (if cnn-lib c:softmax dummy-softmax))) 139 | 140 | ;; Function to reload operations based on device 141 | (define (load-optimal-ops [dev (current-device)]) 142 | (cond 143 | [(mlx-device? dev) 144 | (printf "Loading MLX optimized CNN operations for Apple Silicon~n") 145 | (if mlx-cnn-lib 146 | (begin 147 | (set! c:conv2d-forward mlx:conv2d-forward) 148 | (set! c:max-pool-2x2 mlx:max-pool-2x2) 149 | (set! c:flatten-tensor mlx:flatten-tensor) 150 | (set! c:softmax mlx:softmax)) 151 | (printf "Warning: MLX library not available. Using CPU fallback.~n"))] 152 | 153 | [(gpu-device? dev) 154 | (printf "GPU acceleration not yet implemented, using CPU fallback~n")] 155 | 156 | [else 157 | (printf "Using standard CPU CNN operations~n") 158 | (when (not cnn-lib) 159 | (printf "Warning: CNN operations library not found. Using dummy implementations.~n"))])) -------------------------------------------------------------------------------- /cnn_test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt" 4 | "device.rkt" 5 | "tensor_device.rkt") 6 | 7 | ;; Simple test for CNN components 8 | ;; This will test the basic tensor operations needed for CNNs without 9 | ;; relying on the C libraries or MNIST data 10 | 11 | ;; Create a simple 4D tensor (batch_size, channels, height, width) 12 | (define (make-test-tensor [batch-size 2] [channels 3] [height 8] [width 8]) 13 | (let* ([data (make-vector (* batch-size channels height width) 0.0)]) 14 | ;; Fill with simple pattern 15 | (for ([b (in-range batch-size)] 16 | [b-val (in-list '(0.1 0.2))]) 17 | (for ([c (in-range channels)] 18 | [c-val (in-list '(0.01 0.02 0.03))]) 19 | (for ([h (in-range height)]) 20 | (for ([w (in-range width)]) 21 | (let ([idx (+ (* b channels height width) 22 | (* c height width) 23 | (* h width) 24 | w)]) 25 | (vector-set!
data idx (+ b-val c-val (* 0.1 h) (* 0.01 w)))))))) 26 | 27 | ;; Create tensor 28 | (dt:create (list batch-size channels height width) data (cpu)))) 29 | 30 | ;; Simple convolutional operation (without C library) 31 | (define (simple-conv2d input-tensor filter-tensor [stride 1] [padding 0]) 32 | (let* ([input-shape (dt:shape input-tensor)] 33 | [filter-shape (dt:shape filter-tensor)] 34 | 35 | [batch-size (car input-shape)] 36 | [in-channels (cadr input-shape)] 37 | [in-height (caddr input-shape)] 38 | [in-width (cadddr input-shape)] 39 | 40 | [out-channels (car filter-shape)] 41 | [filter-height (caddr filter-shape)] 42 | [filter-width (cadddr filter-shape)] 43 | 44 | ;; Calculate output dimensions 45 | [out-height (add1 (quotient (- (+ in-height (* 2 padding)) filter-height) stride))] 46 | [out-width (add1 (quotient (- (+ in-width (* 2 padding)) filter-width) stride))] 47 | 48 | [output-data (make-vector (* batch-size out-channels out-height out-width) 0.0)] 49 | [output-tensor (dt:create (list batch-size out-channels out-height out-width) 50 | output-data 51 | (dt:device input-tensor))] 52 | 53 | [input-data (dt:data input-tensor)] 54 | [filter-data (dt:data filter-tensor)]) 55 | 56 | ;; Perform convolution manually 57 | (for ([b (in-range batch-size)]) 58 | (for ([oc (in-range out-channels)]) 59 | (for ([oh (in-range out-height)]) 60 | (for ([ow (in-range out-width)]) 61 | ;; For each output position 62 | (let ([out-idx (+ (* b out-channels out-height out-width) 63 | (* oc out-height out-width) 64 | (* oh out-width) 65 | ow)] 66 | [sum 0.0]) 67 | 68 | ;; Sum over input channels and filter dimensions 69 | (for ([ic (in-range in-channels)]) 70 | (for ([fh (in-range filter-height)]) 71 | (for ([fw (in-range filter-width)]) 72 | (let* ([ih (+ (* oh stride) fh (- padding))] 73 | [iw (+ (* ow stride) fw (- padding))]) 74 | 75 | ;; Check if input position is valid 76 | (when (and (>= ih 0) (< ih in-height) 77 | (>= iw 0) (< iw in-width)) 78 | (let ([input-idx (+ (* b in-channels in-height in-width) 79 | (* ic in-height in-width) 80 | (* ih in-width) 81 | iw)] 82 | [filter-idx (+ (* oc in-channels filter-height filter-width) 83 | (* ic filter-height filter-width) 84 | (* fh filter-width) 85 | fw)]) 86 | (set! sum (+ sum (* (vector-ref input-data input-idx) 87 | (vector-ref filter-data filter-idx)))))))))) 88 | 89 | ;; Set output 90 | (vector-set! 
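;; write the accumulated dot product for this output position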
output-data out-idx sum))))) 91 | 92 | output-tensor)) 93 | 94 | ;; Simple max pooling (2x2 without C library) 95 | (define (simple-max-pool input-tensor) 96 | (let* ([input-shape (dt:shape input-tensor)] 97 | [batch-size (car input-shape)] 98 | [channels (cadr input-shape)] 99 | [in-height (caddr input-shape)] 100 | [in-width (cadddr input-shape)] 101 | 102 | [out-height (quotient in-height 2)] 103 | [out-width (quotient in-width 2)] 104 | 105 | [output-data (make-vector (* batch-size channels out-height out-width) 0.0)] 106 | [output-tensor (dt:create (list batch-size channels out-height out-width) 107 | output-data 108 | (dt:device input-tensor))] 109 | 110 | [input-data (dt:data input-tensor)]) 111 | 112 | ;; Perform max pooling 113 | (for ([b (in-range batch-size)]) 114 | (for ([c (in-range channels)]) 115 | (for ([oh (in-range out-height)]) 116 | (for ([ow (in-range out-width)]) 117 | (let ([out-idx (+ (* b channels out-height out-width) 118 | (* c out-height out-width) 119 | (* oh out-width) 120 | ow)] 121 | [max-val -inf.0]) 122 | 123 | ;; Find max in 2x2 region 124 | (for ([h (in-range (* oh 2) (+ (* oh 2) 2))]) 125 | (for ([w (in-range (* ow 2) (+ (* ow 2) 2))]) 126 | (let ([in-idx (+ (* b channels in-height in-width) 127 | (* c in-height in-width) 128 | (* h in-width) 129 | w)]) 130 | (set! max-val (max max-val (vector-ref input-data in-idx)))))) 131 | 132 | ;; Set output 133 | (vector-set! output-data out-idx max-val)))))) 134 | 135 | output-tensor)) 136 | 137 | ;; Simple ReLU activation 138 | (define (simple-relu input-tensor) 139 | (let* ([shape (dt:shape input-tensor)] 140 | [size (apply * shape)] 141 | [input-data (dt:data input-tensor)] 142 | [output-data (make-vector size 0.0)] 143 | [output-tensor (dt:create shape output-data (dt:device input-tensor))]) 144 | 145 | ;; Apply ReLU 146 | (for ([i (in-range size)]) 147 | (vector-set! output-data i (max 0.0 (vector-ref input-data i)))) 148 | 149 | output-tensor)) 150 | 151 | ;; Simple tensor flatten (4D to 2D) 152 | (define (simple-flatten input-tensor) 153 | (let* ([shape (dt:shape input-tensor)] 154 | [batch-size (car shape)] 155 | [channels (cadr shape)] 156 | [height (caddr shape)] 157 | [width (cadddr shape)] 158 | [flat-size (* channels height width)] 159 | 160 | [output-data (make-vector (* batch-size flat-size) 0.0)] 161 | [output-tensor (dt:create (list batch-size flat-size) 162 | output-data 163 | (dt:device input-tensor))] 164 | 165 | [input-data (dt:data input-tensor)]) 166 | 167 | ;; Flatten the tensor 168 | (for ([b (in-range batch-size)]) 169 | (for ([c (in-range channels)]) 170 | (for ([h (in-range height)]) 171 | (for ([w (in-range width)]) 172 | (let ([in-idx (+ (* b channels height width) 173 | (* c height width) 174 | (* h width) 175 | w)] 176 | [out-idx (+ (* b flat-size) 177 | (* c height width) 178 | (* h width) 179 | w)]) 180 | (vector-set! 
output-data out-idx (vector-ref input-data in-idx))))))) 181 | 182 | output-tensor)) 183 | 184 | ;; Fully connected layer 185 | (define (simple-fc input-tensor weights bias [activation-fn simple-relu]) 186 | (let* ([batch-size (car (dt:shape input-tensor))] 187 | [out-features (cadr (dt:shape weights))] 188 | 189 | [output-data (make-vector (* batch-size out-features) 0.0)] 190 | [z (dt:create (list batch-size out-features) 191 | output-data 192 | (dt:device input-tensor))] 193 | 194 | [input-data (dt:data input-tensor)] 195 | [weights-data (dt:data weights)] 196 | [bias-data (dt:data bias)]) 197 | 198 | ;; Matrix multiplication and bias addition 199 | (for ([b (in-range batch-size)]) 200 | (for ([o (in-range out-features)]) 201 | (let ([z-idx (+ (* b out-features) o)] 202 | [sum 0.0]) 203 | 204 | ;; Dot product 205 | (for ([i (in-range (cadr (dt:shape input-tensor)))]) 206 | (let ([in-idx (+ (* b (cadr (dt:shape input-tensor))) i)] 207 | [w-idx (+ (* i out-features) o)]) 208 | (set! sum (+ sum (* (vector-ref input-data in-idx) 209 | (vector-ref weights-data w-idx)))))) 210 | 211 | ;; Add bias 212 | (set! sum (+ sum (vector-ref bias-data o))) 213 | (vector-set! output-data z-idx sum)))) 214 | 215 | ;; Apply activation function if provided 216 | (if activation-fn 217 | (activation-fn z) 218 | z))) 219 | 220 | ;; Simple softmax function 221 | (define (simple-softmax z) 222 | (let* ([shape (dt:shape z)] 223 | [batch-size (car shape)] 224 | [features (cadr shape)] 225 | [z-data (dt:data z)] 226 | [output-data (make-vector (* batch-size features) 0.0)] 227 | [output (dt:create shape output-data (dt:device z))]) 228 | 229 | ;; Apply softmax: exp(x_i) / sum(exp(x_j)) 230 | (for ([b (in-range batch-size)]) 231 | (let* ([start-idx (* b features)] 232 | [end-idx (+ start-idx features)] 233 | 234 | ;; Find max value for numerical stability 235 | [max-val (for/fold ([max-val -inf.0]) 236 | ([i (in-range start-idx end-idx)]) 237 | (max max-val (vector-ref z-data i)))] 238 | 239 | ;; Compute exp(x_i - max) 240 | [exp-vals (for/vector ([i (in-range start-idx end-idx)]) 241 | (exp (- (vector-ref z-data i) max-val)))] 242 | 243 | ;; Compute sum of exp values 244 | [sum (for/sum ([i (in-range features)]) 245 | (vector-ref exp-vals i))]) 246 | 247 | ;; Normalize by sum 248 | (for ([i (in-range features)]) 249 | (vector-set! 
output-data (+ start-idx i) 250 | (/ (vector-ref exp-vals i) sum))))) 251 | 252 | output)) 253 | 254 | ;; Run a simple CNN test 255 | (define (run-cnn-test) 256 | (printf "Running basic CNN test...~n") 257 | 258 | ;; Create test input 259 | (printf "Creating test input tensor...~n") 260 | (define input (make-test-tensor 2 3 8 8)) 261 | (printf "Input shape: ~a~n" (dt:shape input)) 262 | 263 | ;; Create test filters 264 | (printf "Creating test filters...~n") 265 | (define filters1 (dt:random (list 6 3 3 3) 0.1)) 266 | (define bias1 (dt:random (list 1 6) 0.1)) 267 | (printf "First layer filter shape: ~a~n" (dt:shape filters1)) 268 | 269 | ;; First convolution layer 270 | (printf "Running first convolution layer...~n") 271 | (define conv1 (simple-conv2d input filters1 1 1)) 272 | (printf "Conv1 output shape: ~a~n" (dt:shape conv1)) 273 | 274 | ;; Add bias 275 | (printf "Adding bias...~n") 276 | (define conv1-with-bias conv1) ; Simplified for test 277 | 278 | ;; ReLU activation 279 | (printf "Applying ReLU...~n") 280 | (define relu1 (simple-relu conv1-with-bias)) 281 | 282 | ;; Max pooling 283 | (printf "Max pooling...~n") 284 | (define pool1 (simple-max-pool relu1)) 285 | (printf "Pool1 output shape: ~a~n" (dt:shape pool1)) 286 | 287 | ;; Second layer 288 | (printf "Creating second layer filters...~n") 289 | (define filters2 (dt:random (list 12 6 3 3) 0.1)) 290 | (define bias2 (dt:random (list 1 12) 0.1)) 291 | 292 | ;; Second convolution 293 | (printf "Running second convolution layer...~n") 294 | (define conv2 (simple-conv2d pool1 filters2 1 1)) 295 | (printf "Conv2 output shape: ~a~n" (dt:shape conv2)) 296 | 297 | ;; Add bias 298 | (define conv2-with-bias conv2) ; Simplified for test 299 | 300 | ;; ReLU activation 301 | (printf "Applying ReLU...~n") 302 | (define relu2 (simple-relu conv2-with-bias)) 303 | 304 | ;; Max pooling 305 | (printf "Max pooling...~n") 306 | (define pool2 (simple-max-pool relu2)) 307 | (printf "Pool2 output shape: ~a~n" (dt:shape pool2)) 308 | 309 | ;; Flatten 310 | (printf "Flattening tensor...~n") 311 | (define flat (simple-flatten pool2)) 312 | (printf "Flattened shape: ~a~n" (dt:shape flat)) 313 | 314 | ;; FC layers 315 | (printf "Creating fully connected layers...~n") 316 | (define flat-size (cadr (dt:shape flat))) 317 | (define fc1-weights (dt:random (list flat-size 84) 0.1)) 318 | (define fc1-bias (dt:random (list 1 84) 0.1)) 319 | 320 | (printf "Running FC layer 1...~n") 321 | (define fc1 (simple-fc flat fc1-weights fc1-bias)) 322 | (printf "FC1 output shape: ~a~n" (dt:shape fc1)) 323 | 324 | (define fc2-weights (dt:random (list 84 10) 0.1)) 325 | (define fc2-bias (dt:random (list 1 10) 0.1)) 326 | 327 | (printf "Running FC layer 2...~n") 328 | (define fc2 (simple-fc fc1 fc2-weights fc2-bias #f)) 329 | (printf "FC2 output shape: ~a~n" (dt:shape fc2)) 330 | 331 | ;; Softmax 332 | (printf "Applying softmax...~n") 333 | (define output (simple-softmax fc2)) 334 | (printf "Final output shape: ~a~n" (dt:shape output)) 335 | 336 | ;; Print sample of output 337 | (printf "Sample output probabilities (first example):~n") 338 | (for ([i (in-range 10)]) 339 | (printf " Class ~a: ~a~n" i (vector-ref (dt:data output) i))) 340 | 341 | (printf "CNN test completed successfully!~n") 342 | output) 343 | 344 | ;; Run the test when the file is executed directly 345 | (module+ main 346 | (run-cnn-test)) -------------------------------------------------------------------------------- /compile_extensions.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Detect OS 4 | if [[ "$OSTYPE" == "darwin"* ]]; then 5 | # macOS 6 | SHARED_EXT=".dylib" 7 | COMPILE_FLAGS="-dynamiclib -O3" 8 | if [[ $(sysctl -n machdep.cpu.features 2>/dev/null) == *"AVX"* ]]; then 9 | SIMD_FLAGS="-mavx" 10 | elif [[ $(sysctl -n machdep.cpu.features 2>/dev/null) == *"SSE"* ]]; then 11 | SIMD_FLAGS="-msse4.2" 12 | else 13 | SIMD_FLAGS="" 14 | fi 15 | else 16 | # Linux and others 17 | SHARED_EXT=".so" 18 | COMPILE_FLAGS="-shared -fPIC -O3" 19 | if grep -q avx /proc/cpuinfo; then 20 | SIMD_FLAGS="-mavx" 21 | elif grep -q sse /proc/cpuinfo; then 22 | SIMD_FLAGS="-msse4.2" 23 | else 24 | SIMD_FLAGS="" 25 | fi 26 | fi 27 | 28 | # Compile basic matrix operations 29 | echo "Compiling basic matrix operations..." 30 | cc $COMPILE_FLAGS -o matrix_multiplication$SHARED_EXT matrix_multiplication.c 31 | 32 | # Compile SIMD operations 33 | echo "Compiling SIMD optimized operations..." 34 | cc $COMPILE_FLAGS $SIMD_FLAGS -o simd_ops$SHARED_EXT simd_ops.c 35 | 36 | # Compile parallel operations 37 | echo "Compiling parallel operations..." 38 | cc $COMPILE_FLAGS -o parallel_ops$SHARED_EXT parallel_ops.c -lpthread 39 | 40 | # Compile OpenCL operations 41 | if [ "$(uname)" == "Darwin" ]; then 42 | echo "Compiling OpenCL operations (macOS)..." 43 | cc $COMPILE_FLAGS -o matrix_opencl$SHARED_EXT matrix_opencl.c -framework OpenCL 44 | else 45 | echo "Compiling OpenCL operations (Linux)..." 46 | cc $COMPILE_FLAGS -o matrix_opencl$SHARED_EXT matrix_opencl.c -lOpenCL 47 | fi 48 | 49 | # Compile MLX placeholder operations (for Apple Silicon) 50 | echo "Compiling MLX placeholder operations..." 51 | cc $COMPILE_FLAGS -o mlx_ops$SHARED_EXT mlx_ops.c 52 | 53 | # Compile CUDA placeholder operations 54 | echo "Compiling CUDA placeholder operations..." 55 | cc $COMPILE_FLAGS -o cuda_ops$SHARED_EXT cuda_ops.c 56 | 57 | # Compile CNN operations 58 | echo "Compiling CNN operations..." 59 | cc $COMPILE_FLAGS -o cnn_ops$SHARED_EXT cnn_ops.c 60 | 61 | # Compile MLX CNN operations (for Apple Silicon) 62 | echo "Compiling MLX CNN operations..." 63 | if [[ "$OSTYPE" == "darwin"* ]]; then 64 | cc $COMPILE_FLAGS -o mlx_cnn_ops$SHARED_EXT mlx_cnn_ops.c 65 | else 66 | cc $COMPILE_FLAGS -fopenmp -o mlx_cnn_ops$SHARED_EXT mlx_cnn_ops.c 67 | fi 68 | 69 | echo "Compilation complete!" 70 | echo 71 | echo "==================================================================" 72 | echo "OPTIMIZATION SUMMARY" 73 | echo "==================================================================" 74 | echo "1. C Extensions: Basic operations in C for improved performance" 75 | echo "2. SIMD Vectorization: Using CPU vector instructions when available" 76 | echo " SIMD flags used: $SIMD_FLAGS" 77 | echo "3. Parallel Processing: Multi-threaded operations for batch processing" 78 | echo "4. Memory Optimization: Better memory layout and in-place operations" 79 | echo "5. 
GPU Acceleration: See gpu_acceleration.md for implementation options" 80 | echo "==================================================================" -------------------------------------------------------------------------------- /cpu_benchmark.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt" 4 | "tensor_optimized.rkt" 5 | "hardware_detection.rkt") 6 | 7 | (define (benchmark-matrix-multiplication sizes [iterations 5]) 8 | (printf "===== Matrix Multiplication Benchmark =====~n") 9 | 10 | (for ([size sizes]) 11 | (printf "Matrix size: ~a x ~a~n" size size) 12 | 13 | ;; Create random matrices 14 | (define A (t:random (list size size) 1.0)) 15 | (define B (t:random (list size size) 1.0)) 16 | 17 | ;; Create optimized matrices 18 | (define A-opt (t-opt:random (list size size) 1.0)) 19 | (define B-opt (t-opt:random (list size size) 1.0)) 20 | 21 | ;; Time standard Racket multiplication 22 | (define racket-times 23 | (for/list ([i (in-range iterations)]) 24 | (let ([start (current-inexact-milliseconds)]) 25 | (t:mul A B) 26 | (- (current-inexact-milliseconds) start)))) 27 | 28 | (define avg-racket-time (/ (apply + racket-times) iterations)) 29 | (printf " Standard Racket time: ~a ms~n" (real->decimal-string avg-racket-time 2)) 30 | 31 | ;; Time optimized C multiplication 32 | (define c-times 33 | (for/list ([i (in-range iterations)]) 34 | (let ([start (current-inexact-milliseconds)]) 35 | (t-opt:mul A-opt B-opt) 36 | (- (current-inexact-milliseconds) start)))) 37 | 38 | (define avg-c-time (/ (apply + c-times) iterations)) 39 | (printf " Optimized C time: ~a ms~n" (real->decimal-string avg-c-time 2)) 40 | 41 | ;; Calculate speedup 42 | (define speedup (/ avg-racket-time avg-c-time)) 43 | (printf " Speedup: ~a times faster~n" (real->decimal-string speedup 2)) 44 | 45 | (printf "~n"))) 46 | 47 | (define (benchmark-elementwise-operations size [iterations 5]) 48 | (printf "===== Element-wise Operations Benchmark =====~n") 49 | (printf "Vector size: ~a~n" size) 50 | 51 | ;; Create random vectors 52 | (define A (t:random (list size) 1.0)) 53 | (define B (t:random (list size) 1.0)) 54 | 55 | ;; Create optimized vectors 56 | (define A-opt (t-opt:random (list size) 1.0)) 57 | (define B-opt (t-opt:random (list size) 1.0)) 58 | 59 | ;; Time standard Racket addition 60 | (define racket-add-times 61 | (for/list ([i (in-range iterations)]) 62 | (let ([start (current-inexact-milliseconds)]) 63 | (t:add A B) 64 | (- (current-inexact-milliseconds) start)))) 65 | 66 | (define avg-racket-add-time (/ (apply + racket-add-times) iterations)) 67 | (printf " Standard Racket add time: ~a ms~n" (real->decimal-string avg-racket-add-time 2)) 68 | 69 | ;; Time optimized C addition 70 | (define c-add-times 71 | (for/list ([i (in-range iterations)]) 72 | (let ([start (current-inexact-milliseconds)]) 73 | (t-opt:add A-opt B-opt) 74 | (- (current-inexact-milliseconds) start)))) 75 | 76 | (define avg-c-add-time (/ (apply + c-add-times) iterations)) 77 | (printf " Optimized C add time: ~a ms~n" (real->decimal-string avg-c-add-time 2)) 78 | 79 | ;; Calculate speedup 80 | (define add-speedup (/ avg-racket-add-time avg-c-add-time)) 81 | (printf " Add speedup: ~a times faster~n" (real->decimal-string add-speedup 2)) 82 | 83 | (printf "~n")) 84 | 85 | (define (run-all-benchmarks) 86 | (printf "~n======================================~n") 87 | (printf " RACOGRAD CPU PERFORMANCE BENCHMARKS~n") 88 | (printf "======================================~n") 89 | 
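;; The individual benchmarks can also be invoked directly (illustrative usage of the functions above):
;;   (benchmark-matrix-multiplication '(256) 3)   ; one 256x256 case, 3 iterations
;;   (benchmark-elementwise-operations 500000 10) ; 500k-element vectors, 10 iterations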
90 | (print-hardware-info) 91 | 92 | (benchmark-matrix-multiplication '(100 500 1000)) 93 | (benchmark-elementwise-operations 1000000) 94 | 95 | (printf "======================================~n") 96 | (printf "Benchmarks complete!~n") 97 | (printf "======================================~n")) 98 | 99 | ;; Run benchmarks when executed directly 100 | (module+ main 101 | (run-all-benchmarks)) -------------------------------------------------------------------------------- /cuda_ops.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <math.h> 5 | 6 | // #include <cuda_runtime.h> // Would be included in a real CUDA implementation 7 | 8 | // Placeholder function to check if CUDA is available 9 | int check_cuda_available() { 10 | // This would actually check for CUDA-capable devices 11 | // For now, just a placeholder that returns 0 (not available) 12 | return 0; 13 | } 14 | 15 | // Matrix multiplication using CUDA 16 | // This is a placeholder for actual CUDA implementation 17 | int matrix_multiply_cuda(int M, int N, int K, double* A, double* B, double* C) { 18 | printf("CUDA Matrix multiplication called (placeholder).\n"); 19 | printf("Dimensions: (%d x %d) * (%d x %d) = (%d x %d)\n", M, K, K, N, M, N); 20 | 21 | // Fall back to CPU implementation for now 22 | for (int i = 0; i < M; i++) { 23 | for (int j = 0; j < N; j++) { 24 | double sum = 0.0; 25 | for (int k = 0; k < K; k++) { 26 | sum += A[i * K + k] * B[k * N + j]; 27 | } 28 | C[i * N + j] = sum; 29 | } 30 | } 31 | 32 | return 0; 33 | } 34 | 35 | // These are all placeholder functions that would actually use CUDA 36 | // in a real implementation 37 | 38 | int tensor_add_cuda(int size, double* a, double* b, double* result) { 39 | printf("CUDA tensor add called (placeholder).\n"); 40 | for (int i = 0; i < size; i++) { 41 | result[i] = a[i] + b[i]; 42 | } 43 | return 0; 44 | } 45 | 46 | int tensor_sub_cuda(int size, double* a, double* b, double* result) { 47 | printf("CUDA tensor subtract called (placeholder).\n"); 48 | for (int i = 0; i < size; i++) { 49 | result[i] = a[i] - b[i]; 50 | } 51 | return 0; 52 | } 53 | 54 | int tensor_mul_elementwise_cuda(int size, double* a, double* b, double* result) { 55 | printf("CUDA tensor element-wise multiply called (placeholder).\n"); 56 | for (int i = 0; i < size; i++) { 57 | result[i] = a[i] * b[i]; 58 | } 59 | return 0; 60 | } 61 | 62 | int relu_forward_cuda(int size, double* input, double* output) { 63 | printf("CUDA ReLU forward called (placeholder).\n"); 64 | for (int i = 0; i < size; i++) { 65 | output[i] = input[i] > 0 ? input[i] : 0; 66 | } 67 | return 0; 68 | } 69 | 70 | int relu_backward_cuda(int size, double* input, double* output) { 71 | printf("CUDA ReLU backward called (placeholder).\n"); 72 | for (int i = 0; i < size; i++) { 73 | output[i] = input[i] > 0 ?
1 : 0; 74 | } 75 | return 0; 76 | } -------------------------------------------------------------------------------- /debuggin_lib.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt") 4 | (provide dense-forward mean-squared-error dense-backward relu relu-derivative initialize-fnn sigmoid sigmoid-derivative) 5 | 6 | (define (relu x) 7 | (tensor (tensor-shape x) (for/vector ([v (tensor-data x)]) (max 0 v)))) 8 | 9 | (define (relu-derivative x) 10 | (tensor (tensor-shape x) (for/vector ([v (tensor-data x)]) (if (> v 0) 1 0)))) 11 | 12 | (define (sigmoid x) 13 | (tensor (tensor-shape x) (for/vector ([v (tensor-data x)]) (/ 1 (+ 1 (exp (- v))))))) 14 | 15 | (define (sigmoid-derivative x) 16 | (let ([sig (sigmoid x)]) 17 | (tensor (tensor-shape x) (for/vector ([v (tensor-data sig)]) (* v (- 1 v)))))) 18 | 19 | (define (tanh x) 20 | (tensor (tensor-shape x) (for/vector ([v (tensor-data x)]) (/ (- (exp v) (exp (- v))) (+ (exp v) (exp (- v))))))) 21 | 22 | (define (tanh-derivative x) 23 | (let ([t (tanh x)]) 24 | (tensor (tensor-shape x) (for/vector ([v (tensor-data t)]) (- 1 (* v v)))))) 25 | 26 | (define (dense-forward input weights biases activation-fn) 27 | (let* ([mul-result (tensor-multiply input weights)] 28 | [mul-result-shape (tensor-shape mul-result)] 29 | [output-dim (cadr mul-result-shape)] 30 | [reshaped-biases (reshape-tensor biases (list output-dim))] 31 | [z (tensor-add mul-result reshaped-biases)] 32 | [activation-output (activation-fn z)]) 33 | activation-output)) 34 | 35 | (define (mean-squared-error y-true y-pred) 36 | (let* ([diff (tensor-subtract y-true y-pred)] 37 | [squared-diff (tensor-multiply diff diff)] 38 | [sum (apply + (vector->list (tensor-data squared-diff)))]) 39 | (/ sum (length (vector->list (tensor-data y-true)))))) 40 | 41 | (define (dense-backward input weights biases output grad-output activation-derivative learning-rate) 42 | (displayln "dense-backward: Starting") 43 | (displayln (string-append "Input shape: " (format "~a" (tensor-shape input)))) 44 | (displayln (string-append "Weights shape: " (format "~a" (tensor-shape weights)))) 45 | (displayln (string-append "Biases shape: " (format "~a" (tensor-shape biases)))) 46 | (displayln (string-append "Output shape: " (format "~a" (tensor-shape output)))) 47 | (displayln (string-append "Grad-output shape: " (format "~a" (tensor-shape grad-output)))) 48 | 49 | (let* ([grad-activation (activation-derivative output)] 50 | [_ (displayln (string-append "Grad-activation shape: " (format "~a" (tensor-shape grad-activation))))] 51 | [grad-z (tensor-multiply grad-output grad-activation)] 52 | [_ (displayln (string-append "Grad-z shape: " (format "~a" (tensor-shape grad-z))))] 53 | [grad-weights (tensor-multiply (transpose input) grad-z)] 54 | [_ (displayln (string-append "Grad-weights shape: " (format "~a" (tensor-shape grad-weights))))] 55 | [grad-biases (tensor (list (vector-length (tensor-data biases))) 56 | (for/vector ([j (vector-length (tensor-data biases))]) 57 | (apply + (for/list ([i (car (tensor-shape grad-z))]) 58 | (vector-ref (tensor-data grad-z) (+ (* i (vector-length (tensor-data biases))) j))))))] 59 | [_ (displayln (string-append "Grad-biases shape: " (format "~a" (tensor-shape grad-biases))))] 60 | [grad-input (tensor-multiply grad-z (transpose weights))] 61 | [_ (displayln (string-append "Grad-input shape: " (format "~a" (tensor-shape grad-input))))]) 62 | (displayln "dense-backward: Finished") 63 | (values grad-weights 
grad-biases grad-input))) 64 | 65 | (define (initialize-fnn batch-size input-dim output-dim) 66 | (let* ([input-data (make-list (* batch-size input-dim) 0)] 67 | [input-tensor (create-tensor (list batch-size input-dim) input-data)] 68 | [weight-shape (list input-dim output-dim)] 69 | [bias-shape (list output-dim)] ; Ensure same size as output dim 70 | [weights (random-tensor weight-shape 1.0)] 71 | [biases (random-tensor bias-shape 1.0)]) 72 | (values input-tensor weights biases))) 73 | -------------------------------------------------------------------------------- /demo.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "mnist.rkt" 4 | "CNN.rkt" 5 | "visualization.rkt" 6 | "tensor.rkt" 7 | "device.rkt" 8 | "hardware_detection.rkt") 9 | 10 | (provide run-mnist-demo run-cnn-demo run-full-demo) 11 | 12 | ;; Function to run MNIST demo 13 | (define (run-mnist-demo [epochs 5] [batch-size 64]) 14 | (printf "Running MNIST Logistic Regression Demo~n") 15 | (printf "=====================================~n") 16 | 17 | ;; Here we would customize mnist.rkt parameters and run it 18 | ;; For demo purposes, we'll just use the default behavior 19 | (system "racket -t mnist.rkt -m") 20 | 21 | (printf "MNIST Demo Complete!~n~n")) 22 | 23 | ;; Function to run CNN demo 24 | (define (run-cnn-demo [device 'cpu] [epochs 3] [batch-size 32]) 25 | (printf "Running CNN Demo on device: ~a~n" device) 26 | (printf "==================================~n") 27 | 28 | ;; Run CNN training with specified parameters 29 | (let ([results (train-cnn device epochs batch-size)]) 30 | (printf "CNN Training Results:~n") 31 | (printf " Accuracy: ~a%~n" (hash-ref results 'accuracy)) 32 | (printf " Training Time: ~a seconds~n" (hash-ref results 'time)) 33 | (printf " Device: ~a~n" (hash-ref results 'device)) 34 | 35 | results)) 36 | 37 | ;; Function to run hardware detection demo 38 | (define (run-hardware-demo) 39 | (printf "Hardware Detection Demo~n") 40 | (printf "======================~n") 41 | 42 | (printf "CPU: Available~n") 43 | 44 | (printf "GPU: ~a~n" 45 | (if (gpu-available?) "Available" "Not available")) 46 | 47 | (printf "MLX (Apple Silicon): ~a~n" 48 | (if (device-available? 'mlx) "Available" "Not available")) 49 | 50 | (printf "Current device: ~a~n" (get-device-type (current-device))) 51 | 52 | (printf "Hardware Demo Complete!~n~n")) 53 | 54 | ;; Function to run full demo 55 | (define (run-full-demo) 56 | (printf "RacoGrad Full Demo~n") 57 | (printf "=================~n~n") 58 | 59 | ;; Run hardware detection 60 | (run-hardware-demo) 61 | 62 | ;; Run MNIST demo 63 | (run-mnist-demo 3 64) 64 | 65 | ;; Run CNN demo on best available device 66 | (let ([device (cond 67 | [(device-available? 'mlx) 'mlx] 68 | [(gpu-available?) 
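;; gpu-available? is true for any backend: MLX, CUDA, or OpenCL (see device.rkt)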
'gpu] 69 | [else 'cpu])]) 70 | (run-cnn-demo device 2 32)) 71 | 72 | (printf "Full Demo Complete!~n")) 73 | 74 | ;; Run the demo when this file is executed directly 75 | (module+ main 76 | (define mode (if (> (vector-length (current-command-line-arguments)) 0) 77 | (string->symbol (vector-ref (current-command-line-arguments) 0)) 78 | 'full)) 79 | 80 | (case mode 81 | [(mnist) (run-mnist-demo)] 82 | [(cnn) (run-cnn-demo)] 83 | [(hardware) (run-hardware-demo)] 84 | [else (run-full-demo)])) -------------------------------------------------------------------------------- /device.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "hardware_detection.rkt") 4 | 5 | (provide my-device? 6 | cpu-device? 7 | gpu-device? 8 | mlx-device? 9 | cuda-device? 10 | opencl-device? 11 | 12 | make-device 13 | cpu 14 | gpu 15 | mlx 16 | cuda 17 | opencl 18 | 19 | current-device 20 | set-current-device! 21 | device-synchronize 22 | 23 | get-device-type 24 | device-available? 25 | gpu-available?) 26 | 27 | ;; Define device struct 28 | (struct device (type properties) #:transparent #:mutable) 29 | 30 | ;; Define device types 31 | (define CPU 'cpu) 32 | (define GPU 'gpu) 33 | (define MLX 'mlx) 34 | (define CUDA 'cuda) 35 | (define OPENCL 'opencl) 36 | 37 | ;; Helper predicates 38 | (define (my-device? obj) 39 | (device? obj)) 40 | 41 | (define (cpu-device? dev) 42 | (and (my-device? dev) (eq? (device-type dev) CPU))) 43 | 44 | (define (gpu-device? dev) 45 | (and (my-device? dev) 46 | (or (eq? (device-type dev) GPU) 47 | (eq? (device-type dev) MLX) 48 | (eq? (device-type dev) CUDA) 49 | (eq? (device-type dev) OPENCL)))) 50 | 51 | (define (mlx-device? dev) 52 | (and (my-device? dev) (eq? (device-type dev) MLX))) 53 | 54 | (define (cuda-device? dev) 55 | (and (my-device? dev) (eq? (device-type dev) CUDA))) 56 | 57 | (define (opencl-device? dev) 58 | (and (my-device? dev) (eq? (device-type dev) OPENCL))) 59 | 60 | ;; Default devices 61 | (define cpu-device (device CPU (hash 'cores (get-optimal-num-threads)))) 62 | 63 | ;; Choose the best available GPU device 64 | (define gpu-device 65 | (cond 66 | [(has-mlx-support?) 67 | (device MLX (hash 'backend 'mlx))] 68 | [(has-cuda-support?) 69 | (device CUDA (hash 'backend 'cuda))] 70 | [(has-opencl?) 71 | (device OPENCL (hash 'backend 'opencl))] 72 | [else #f])) 73 | 74 | ;; Specific GPU type devices 75 | (define mlx-device 76 | (if (has-mlx-support?) 77 | (device MLX (hash 'backend 'mlx)) 78 | #f)) 79 | 80 | (define cuda-device 81 | (if (has-cuda-support?) 82 | (device CUDA (hash 'backend 'cuda)) 83 | #f)) 84 | 85 | (define opencl-device 86 | (if (has-opencl?) 
87 | (device OPENCL (hash 'backend 'opencl)) 88 | #f)) 89 | 90 | ;; Factory function 91 | (define (make-device type [properties (hash)]) 92 | (device type properties)) 93 | 94 | ;; Device accessors 95 | (define (cpu) 96 | cpu-device) 97 | 98 | (define (gpu) 99 | (if gpu-device 100 | gpu-device 101 | (error "No GPU device available on this system"))) 102 | 103 | (define (mlx) 104 | (if mlx-device 105 | mlx-device 106 | (error "MLX not available - requires Apple Silicon"))) 107 | 108 | (define (cuda) 109 | (if cuda-device 110 | cuda-device 111 | (error "CUDA not available - requires NVIDIA GPU"))) 112 | 113 | (define (opencl) 114 | (if opencl-device 115 | opencl-device 116 | (error "OpenCL not available on this system"))) 117 | 118 | ;; Current device management 119 | (define current-device-param (make-parameter cpu-device)) 120 | 121 | (define (current-device) 122 | (current-device-param)) 123 | 124 | (define (set-current-device! dev) 125 | (when (not (my-device? dev)) 126 | (error 'set-current-device! "expected a device, got ~a" dev)) 127 | (current-device-param dev)) 128 | 129 | ;; Device synchronization (no-op for CPU, will wait for GPU operations to complete) 130 | (define (device-synchronize [dev (current-device)]) 131 | (when (gpu-device? dev) 132 | (printf "Synchronizing GPU operations~n"))) 133 | 134 | ;; Get device type 135 | (define (get-device-type [dev (current-device)]) 136 | (device-type dev)) 137 | 138 | ;; Check if a device is available 139 | (define (device-available? type) 140 | (case type 141 | [(cpu) #t] 142 | [(gpu) (or (has-mlx-support?) (has-cuda-support?) (has-opencl?))] 143 | [(mlx) (has-mlx-support?)] 144 | [(cuda) (has-cuda-support?)] 145 | [(opencl) (has-opencl?)] 146 | [else #f])) 147 | 148 | ;; Helper to check if any GPU is available 149 | (define (gpu-available?) 150 | (or (has-mlx-support?) (has-cuda-support?)
(has-opencl?))) 151 | 152 | ;; Print hardware info on load 153 | (print-hardware-info) -------------------------------------------------------------------------------- /ffi_ops.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require ffi/unsafe 4 | ffi/vector) 5 | 6 | (provide c:matrix-multiply 7 | c:tensor-add 8 | c:tensor-sub 9 | c:tensor-mul-elementwise 10 | c:tensor-scale 11 | c:relu-forward 12 | c:relu-backward 13 | c:sigmoid-forward 14 | c:sigmoid-backward) 15 | 16 | ;; Load the shared library 17 | (define matrix-lib (ffi-lib "matrix_multiplication")) 18 | 19 | ;; Define the FFI functions 20 | (define c:matrix-multiply 21 | (get-ffi-obj "matrix_multiply" matrix-lib 22 | (_fun _int _int _int _f64vector _f64vector _f64vector -> _void))) 23 | 24 | (define c:tensor-add 25 | (get-ffi-obj "tensor_add" matrix-lib 26 | (_fun _int _f64vector _f64vector _f64vector -> _void))) 27 | 28 | (define c:tensor-sub 29 | (get-ffi-obj "tensor_sub" matrix-lib 30 | (_fun _int _f64vector _f64vector _f64vector -> _void))) 31 | 32 | (define c:tensor-mul-elementwise 33 | (get-ffi-obj "tensor_mul_elementwise" matrix-lib 34 | (_fun _int _f64vector _f64vector _f64vector -> _void))) 35 | 36 | (define c:tensor-scale 37 | (get-ffi-obj "tensor_scale" matrix-lib 38 | (_fun _int _f64vector _double _f64vector -> _void))) 39 | 40 | (define c:relu-forward 41 | (get-ffi-obj "relu_forward" matrix-lib 42 | (_fun _int _f64vector _f64vector -> _void))) 43 | 44 | (define c:relu-backward 45 | (get-ffi-obj "relu_backward" matrix-lib 46 | (_fun _int _f64vector _f64vector -> _void))) 47 | 48 | (define c:sigmoid-forward 49 | (get-ffi-obj "sigmoid_forward" matrix-lib 50 | (_fun _int _f64vector _f64vector -> _void))) 51 | 52 | (define c:sigmoid-backward 53 | (get-ffi-obj "sigmoid_backward" matrix-lib 54 | (_fun _int _f64vector _f64vector -> _void))) -------------------------------------------------------------------------------- /gpu_acceleration.md: -------------------------------------------------------------------------------- 1 | # GPU Acceleration Guide for RacoGrad 2 | 3 | This document outlines multiple approaches to add GPU acceleration to RacoGrad without requiring deep CUDA knowledge. 4 | 5 | ## 1. OpenCL Integration (Recommended for beginners) 6 | 7 | OpenCL is more accessible than CUDA and works across different GPU vendors (AMD, NVIDIA, Intel). 8 | 9 | ### Setup Steps: 10 | 11 | 1. Install OpenCL development kit: 12 | - macOS: Already included in the OS 13 | - Linux: `sudo apt install opencl-headers ocl-icd-opencl-dev` 14 | - Windows: Install vendor-specific OpenCL SDK 15 | 16 | 2. Create OpenCL kernels for key operations (matrix multiplication example): 17 | 18 | ```c 19 | // matrix_mul.cl 20 | __kernel void matrix_multiply( 21 | const int M, const int N, const int K, 22 | __global const float* A, 23 | __global const float* B, 24 | __global float* C) 25 | { 26 | // Get global position in Y direction 27 | int row = get_global_id(0); 28 | // Get global position in X direction 29 | int col = get_global_id(1); 30 | 31 | float sum = 0.0f; 32 | for (int i = 0; i < K; i++) { 33 | sum += A[row * K + i] * B[i * N + col]; 34 | } 35 | 36 | C[row * N + col] = sum; 37 | } 38 | ``` 39 | 40 | 3. Racket FFI to OpenCL: 41 | 42 | ```racket 43 | ;; Basic OpenCL FFI setup (simplified) 44 | (define opencl-lib (ffi-lib "OpenCL")) 45 | (define clCreateProgramWithSource 46 | (get-ffi-obj "clCreateProgramWithSource" opencl-lib (_fun ...))) 47 | ;; ... 
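;; A speculative continuation of this sketch (the _fun signature below is an
;; illustrative assumption, not a verified binding of the OpenCL API):
;; (define clBuildProgram
;;   (get-ffi-obj "clBuildProgram" opencl-lib
;;                (_fun _pointer _uint _pointer _string _pointer _pointer -> _int)))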
;; more FFI definitions 48 | ``` 49 | 50 | ## 2. Use Existing GPU Libraries 51 | 52 | Instead of writing OpenCL code directly, leverage existing libraries: 53 | 54 | ### ArrayFire 55 | 56 | ArrayFire is a high-performance library with GPU support: 57 | 58 | ``` 59 | # Install ArrayFire 60 | brew install arrayfire # macOS 61 | ``` 62 | 63 | Connect via FFI to ArrayFire's C API for common operations. 64 | 65 | ### ONNX Runtime 66 | 67 | ONNX Runtime provides GPU acceleration: 68 | 69 | 1. Export your models to ONNX format 70 | 2. Use ONNX Runtime for inference 71 | 3. Connect via FFI to ONNX Runtime's C API 72 | 73 | ### TensorFlow or PyTorch C++ APIs 74 | 75 | Both frameworks offer C++ APIs that can be connected through FFI. 76 | 77 | ## 3. Vulkan Compute 78 | 79 | For newer GPUs, Vulkan Compute is another option: 80 | 81 | 1. Vulkan SDK installation 82 | 2. Kompute library for simplified Vulkan Compute 83 | 84 | ## Implementation Strategy 85 | 86 | 1. Start with the simplest option: OpenCL for critical operations 87 | 2. Develop a fallback CPU path for compatibility 88 | 3. Profile to identify bottlenecks 89 | 4. Gradually replace more operations with GPU implementations 90 | 91 | ## Compilation Instructions 92 | 93 | For OpenCL on macOS: 94 | ``` 95 | clang -framework OpenCL -o matrix_opencl matrix_opencl.c 96 | ``` 97 | 98 | For OpenCL on Linux: 99 | ``` 100 | gcc -o matrix_opencl matrix_opencl.c -lOpenCL 101 | ``` -------------------------------------------------------------------------------- /gpu_benchmark.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor_device.rkt" 4 | "device.rkt" 5 | "hardware_detection.rkt") 6 | 7 | (define (benchmark-matrix-multiplication sizes [iterations 5]) 8 | (printf "===== Matrix Multiplication Benchmark =====~n") 9 | 10 | (for ([size sizes]) 11 | (printf "Matrix size: ~a x ~a~n" size size) 12 | 13 | ;; Create random matrices on CPU 14 | (define A-cpu (dt:random (list size size) 1.0 (cpu))) 15 | (define B-cpu (dt:random (list size size) 1.0 (cpu))) 16 | 17 | ;; Time CPU multiplication 18 | (define cpu-times 19 | (for/list ([i (in-range iterations)]) 20 | (let ([start (current-inexact-milliseconds)]) 21 | (dt:mul A-cpu B-cpu) 22 | (- (current-inexact-milliseconds) start)))) 23 | 24 | (define avg-cpu-time (/ (apply + cpu-times) iterations)) 25 | (printf " CPU time: ~a ms~n" (real->decimal-string avg-cpu-time 2)) 26 | 27 | ;; If GPU is available, benchmark on GPU 28 | (when (gpu-available?)
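;; dt:to is used here to place copies of the tensors on the GPU device;
;; the timings below then measure the device code path of dt:mul and dt:add.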
29 | ;; Move matrices to GPU 30 | (define A-gpu (dt:to A-cpu (gpu))) 31 | (define B-gpu (dt:to B-cpu (gpu))) 32 | 33 | ;; Time GPU multiplication 34 | (define gpu-times 35 | (for/list ([i (in-range iterations)]) 36 | (let ([start (current-inexact-milliseconds)]) 37 | (dt:mul A-gpu B-gpu) 38 | (- (current-inexact-milliseconds) start)))) 39 | 40 | (define avg-gpu-time (/ (apply + gpu-times) iterations)) 41 | (printf " GPU time: ~a ms~n" (real->decimal-string avg-gpu-time 2)) 42 | 43 | ;; Calculate speedup 44 | (define speedup (/ avg-cpu-time avg-gpu-time)) 45 | (printf " Speedup: ~a times faster~n" (real->decimal-string speedup 2))) 46 | 47 | (printf "~n"))) 48 | 49 | (define (benchmark-elementwise-operations size [iterations 5]) 50 | (printf "===== Element-wise Operations Benchmark =====~n") 51 | (printf "Vector size: ~a~n" size) 52 | 53 | ;; Create random vectors on CPU 54 | (define A-cpu (dt:random (list size) 1.0 (cpu))) 55 | (define B-cpu (dt:random (list size) 1.0 (cpu))) 56 | 57 | ;; Time CPU addition 58 | (define cpu-add-times 59 | (for/list ([i (in-range iterations)]) 60 | (let ([start (current-inexact-milliseconds)]) 61 | (dt:add A-cpu B-cpu) 62 | (- (current-inexact-milliseconds) start)))) 63 | 64 | (define avg-cpu-add-time (/ (apply + cpu-add-times) iterations)) 65 | (printf " CPU add time: ~a ms~n" (real->decimal-string avg-cpu-add-time 2)) 66 | 67 | ;; If GPU is available, benchmark on GPU 68 | (when (gpu-available?) 69 | ;; Move vectors to GPU 70 | (define A-gpu (dt:to A-cpu (gpu))) 71 | (define B-gpu (dt:to B-cpu (gpu))) 72 | 73 | ;; Time GPU addition 74 | (define gpu-add-times 75 | (for/list ([i (in-range iterations)]) 76 | (let ([start (current-inexact-milliseconds)]) 77 | (dt:add A-gpu B-gpu) 78 | (- (current-inexact-milliseconds) start)))) 79 | 80 | (define avg-gpu-add-time (/ (apply + gpu-add-times) iterations)) 81 | (printf " GPU add time: ~a ms~n" (real->decimal-string avg-gpu-add-time 2)) 82 | 83 | ;; Calculate speedup 84 | (define add-speedup (/ avg-cpu-add-time avg-gpu-add-time)) 85 | (printf " Add speedup: ~a times faster~n" (real->decimal-string add-speedup 2))) 86 | 87 | (printf "~n")) 88 | 89 | (define (run-all-benchmarks) 90 | (printf "~n======================================~n") 91 | (printf " RACOGRAD GPU PERFORMANCE BENCHMARKS~n") 92 | (printf "======================================~n") 93 | 94 | (print-hardware-info) 95 | 96 | (benchmark-matrix-multiplication '(100 500 1000)) 97 | (benchmark-elementwise-operations 1000000) 98 | 99 | (printf "======================================~n") 100 | (printf "Benchmarks complete!~n") 101 | (printf "======================================~n")) 102 | 103 | ;; Run benchmarks when executed directly 104 | (module+ main 105 | (printf "Checking GPU availability...~n") 106 | (if (gpu-available?) 107 | (run-all-benchmarks) 108 | (begin 109 | (printf "GPU acceleration not available on this system.~n") 110 | (printf "Running CPU-only benchmarks...~n") 111 | (benchmark-matrix-multiplication '(100 500 1000))))) -------------------------------------------------------------------------------- /hardware_detection.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require ffi/unsafe) 4 | 5 | (provide detect-hardware-capabilities 6 | get-optimal-num-threads 7 | has-avx? 8 | has-sse? 9 | has-opencl? 10 | is-apple-silicon? 11 | has-mlx-support? 12 | has-cuda-support? 
13 | print-hardware-info) 14 | 15 | ;; Safely try to load the libraries with proper error handling 16 | (define libcheck 17 | (with-handlers ([exn:fail? (lambda (e) #f)]) 18 | (ffi-lib "simd_ops" '("" "0")))) 19 | 20 | ;; Safely try to load the libraries and their availability checks 21 | (define opencl-lib 22 | (with-handlers ([exn:fail? (lambda (e) #f)]) 23 | (ffi-lib "matrix_opencl" '("" "0")))) 24 | 25 | (define mlx-lib 26 | (with-handlers ([exn:fail? (lambda (e) #f)]) 27 | (ffi-lib "mlx_ops" '("" "0")))) 28 | 29 | (define cuda-lib 30 | (with-handlers ([exn:fail? (lambda (e) #f)]) 31 | (ffi-lib "cuda_ops" '("" "0")))) 32 | 33 | ;; Import availability check functions if libraries are available 34 | (define check-opencl-available 35 | (if opencl-lib 36 | (with-handlers ([exn:fail? (lambda (e) (lambda () 0))]) 37 | (get-ffi-obj "check_opencl_available" opencl-lib (_fun -> _int))) 38 | (lambda () 0))) 39 | 40 | (define check-mlx-available 41 | (if mlx-lib 42 | (with-handlers ([exn:fail? (lambda (e) (lambda () 0))]) 43 | (get-ffi-obj "check_mlx_available" mlx-lib (_fun -> _int))) 44 | (lambda () 0))) 45 | 46 | (define check-cuda-available 47 | (if cuda-lib 48 | (with-handlers ([exn:fail? (lambda (e) (lambda () 0))]) 49 | (get-ffi-obj "check_cuda_available" cuda-lib (_fun -> _int))) 50 | (lambda () 0))) 51 | 52 | ;; Number of physical cores detection 53 | (define (detect-num-cores) 54 | (cond 55 | [(equal? (system-type 'os) 'unix) 56 | (with-handlers ([exn:fail? (lambda (_) 4)]) ; Default to 4 if detection fails 57 | (let* ([output (with-output-to-string 58 | (lambda () (system "sysctl -n hw.physicalcpu 2>/dev/null || 59 | grep -c ^processor /proc/cpuinfo 2>/dev/null || 60 | echo 4")))] 61 | [num (string->number (string-trim output))]) 62 | (if num num 4)))] 63 | [(equal? (system-type 'os) 'windows) 64 | (with-handlers ([exn:fail? (lambda (_) 4)]) 65 | (let* ([output (with-output-to-string 66 | (lambda () (system "echo %NUMBER_OF_PROCESSORS%")))] 67 | [num (string->number (string-trim output))]) 68 | (if num num 4)))] 69 | [else 4])) ; Default value 70 | 71 | ;; Check for SIMD instruction support 72 | (define (has-simd-support? type) 73 | (cond 74 | [(equal? (system-type 'os) 'unix) 75 | (case type 76 | [(avx) (with-handlers ([exn:fail? (lambda (_) #f)]) 77 | (let ([output (with-output-to-string 78 | (lambda () (system "grep -q avx /proc/cpuinfo && echo yes || 79 | sysctl -n machdep.cpu.features 2>/dev/null | grep -q AVX && echo yes")))]) 80 | (string-contains? (string-trim output) "yes")))] 81 | [(sse) (with-handlers ([exn:fail? (lambda (_) #f)]) 82 | (let ([output (with-output-to-string 83 | (lambda () (system "grep -q sse /proc/cpuinfo && echo yes || 84 | sysctl -n machdep.cpu.features 2>/dev/null | grep -q SSE && echo yes")))]) 85 | (string-contains? (string-trim output) "yes")))] 86 | [else #f])] 87 | [else #f])) ; Simplified for other OSes 88 | 89 | ;; Check for OpenCL support 90 | (define (has-opencl-support?) 91 | (with-handlers ([exn:fail? (lambda (e) #f)]) 92 | (and opencl-lib 93 | (procedure? check-opencl-available) 94 | (= (check-opencl-available) 1)))) 95 | 96 | ;; Check for Apple Silicon (M1/M2/M3) 97 | (define (detect-apple-silicon) 98 | (and (equal? (system-type 'os) 'macosx) 99 | (or (string-contains? (with-output-to-string 100 | (lambda () (system "uname -m"))) 101 | "arm64") 102 | (string-contains? 
(with-output-to-string 103 | (lambda () (system "sysctl -n machdep.cpu.brand_string 2>/dev/null"))) 104 | "Apple")))) 105 | 106 | ;; Check for MLX availability on Apple Silicon 107 | (define (has-mlx-support?) 108 | (with-handlers ([exn:fail? (lambda (e) #f)]) 109 | (and (detect-apple-silicon) 110 | ;; Check for MLX library in standard locations 111 | (or mlx-lib 112 | (with-handlers ([exn:fail? (lambda (e) #f)]) 113 | (file-exists? "/opt/homebrew/lib/libmlx.dylib")) 114 | (with-handlers ([exn:fail? (lambda (e) #f)]) 115 | (file-exists? "/usr/local/lib/libmlx.dylib")) 116 | (with-handlers ([exn:fail? (lambda (e) #f)]) 117 | (file-exists? "/usr/lib/libmlx.dylib"))) 118 | ;; Fall back to environment variable check 119 | (or (and (procedure? check-mlx-available) 120 | (= (check-mlx-available) 1)) 121 | (getenv "MLX_AVAILABLE"))))) 122 | 123 | ;; Check for CUDA availability 124 | (define (has-cuda-support?) 125 | (with-handlers ([exn:fail? (lambda (e) #f)]) 126 | (and cuda-lib 127 | (procedure? check-cuda-available) 128 | (= (check-cuda-available) 1)))) 129 | 130 | ;; Memory size detection (in GB) 131 | (define (detect-memory-size) 132 | (cond 133 | [(equal? (system-type 'os) 'unix) 134 | (with-handlers ([exn:fail? (lambda (_) 8.0)]) ; Default to 8GB 135 | (let* ([output (with-output-to-string 136 | (lambda () (system "sysctl -n hw.memsize 2>/dev/null || 137 | grep MemTotal /proc/meminfo 2>/dev/null | awk '{print $2}' || 138 | echo 8589934592")))] 139 | [num (string->number (string-trim output))]) 140 | (if num 141 | (if (< num 1000000) ; If returned in KB (Linux) 142 | (/ num 1024 1024) 143 | (/ num 1024 1024 1024)) 144 | 8.0)))] 145 | [else 8.0])) ; Default for other OSes 146 | 147 | ;; Main hardware detection function 148 | (define (detect-hardware-capabilities) 149 | (let ([cores (detect-num-cores)] 150 | [avx (has-simd-support? 'avx)] 151 | [sse (has-simd-support? 'sse)] 152 | [opencl (has-opencl-support?)] 153 | [apple-silicon (detect-apple-silicon)] 154 | [mlx (has-mlx-support?)] 155 | [cuda (has-cuda-support?)] 156 | [memory (detect-memory-size)]) 157 | (hash 'cores cores 158 | 'avx avx 159 | 'sse sse 160 | 'opencl opencl 161 | 'apple-silicon apple-silicon 162 | 'mlx mlx 163 | 'cuda cuda 164 | 'memory memory))) 165 | 166 | ;; Cache the detection results 167 | (define hardware-info (detect-hardware-capabilities)) 168 | 169 | ;; Accessor functions 170 | (define (get-optimal-num-threads) 171 | (hash-ref hardware-info 'cores)) 172 | 173 | (define (has-avx?) 174 | (hash-ref hardware-info 'avx)) 175 | 176 | (define (has-sse?) 177 | (hash-ref hardware-info 'sse)) 178 | 179 | (define (has-opencl?) 180 | (hash-ref hardware-info 'opencl)) 181 | 182 | (define (is-apple-silicon?) 183 | (hash-ref hardware-info 'apple-silicon)) 184 | 185 | ;; Print hardware information with error handling 186 | (define (print-hardware-info) 187 | (with-handlers ([exn:fail? 
(lambda (e) 188 | (printf "======================================\n") 189 | (printf "Error detecting hardware capabilities: ~a\n" (exn-message e)) 190 | (printf "======================================\n"))]) 191 | (define info hardware-info) 192 | (printf "======================================\n") 193 | (printf "Hardware Capabilities for RacoGrad\n") 194 | (printf "======================================\n") 195 | (printf "CPU Cores: ~a\n" (hash-ref info 'cores 4)) 196 | (printf "AVX Support: ~a\n" (hash-ref info 'avx #f)) 197 | (printf "SSE Support: ~a\n" (hash-ref info 'sse #f)) 198 | (printf "OpenCL Available: ~a\n" (hash-ref info 'opencl #f)) 199 | (printf "Apple Silicon: ~a\n" (hash-ref info 'apple-silicon #f)) 200 | (printf "MLX Support: ~a\n" (hash-ref info 'mlx #f)) 201 | (printf "CUDA Support: ~a\n" (hash-ref info 'cuda #f)) 202 | (printf "System Memory: ~a GB\n" (hash-ref info 'memory 8.0)) 203 | (printf "--------------------------------------\n") 204 | (printf "Recommended Configuration:\n") 205 | (cond 206 | [(hash-ref info 'mlx #f) 207 | (printf "- Use MLX acceleration for Apple Silicon\n")] 208 | [(hash-ref info 'cuda #f) 209 | (printf "- Use CUDA acceleration for NVIDIA GPUs\n")] 210 | [(hash-ref info 'opencl #f) 211 | (printf "- Use OpenCL acceleration for cross-platform GPU support\n")] 212 | [(hash-ref info 'avx #f) 213 | (printf "- Use AVX-optimized functions\n")] 214 | [(hash-ref info 'sse #f) 215 | (printf "- Use SSE-optimized functions\n")] 216 | [else 217 | (printf "- Use basic C optimizations\n")]) 218 | (printf "- Use ~a threads for parallel operations\n" (hash-ref info 'cores 4)) 219 | (printf "======================================\n"))) -------------------------------------------------------------------------------- /load-mnist.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (provide load-mnist-data) 3 | (require "tensor.rkt") 4 | 5 | (define (read-idx-file filename) 6 | (let* ([data (file->bytes filename)]) 7 | ;; Parse IDX file format: magic number is 4 bytes, big-endian (0, 0, type code, #dims) 8 | (let* ([magic-number (integer-bytes->integer data #f #t 0 4)] 9 | [data-type (bitwise-and (arithmetic-shift magic-number -8) #xFF)] ; byte 2 holds the element type code 10 | [dimension-count (bitwise-and magic-number #xFF)] ; byte 3 holds the number of dimensions 11 | [dimensions (for/list ([i (in-range dimension-count)]) 12 | (integer-bytes->integer data #f #t (+ 4 (* i 4)) (+ 8 (* i 4))))] 13 | [data-start (+ 4 (* dimension-count 4))] 14 | [data-length (- (bytes-length data) data-start)]) 15 | (cons dimensions (subbytes data data-start))))) ; return (dims . raw-bytes) as a single value 16 | 17 | (define (load-mnist-data type) 18 | (let* ([images-file (build-path "mnist-data" 19 | (format "~a-images.idx3-ubyte" type))] 20 | [labels-file (build-path "mnist-data" 21 | (format "~a-labels.idx1-ubyte" type))] 22 | [images-dims+data (read-idx-file images-file)] 23 | [labels-dims+data (read-idx-file labels-file)]) 24 | ;; Convert to tensors and return 25 | (values 26 | (t:create (list (car (car images-dims+data)) 784) 27 | (bytes->list (cdr images-dims+data))) 28 | (t:create (list (car (car labels-dims+data))) 29 | (bytes->list (cdr labels-dims+data)))))) -------------------------------------------------------------------------------- /matrix_multiplication.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <math.h> 5 | 6 | // Basic matrix multiplication 7 | void matrix_multiply(int rows_a, int cols_a, int cols_b, double *a, double *b, double *c) { 8 | for (int i = 0; i < rows_a; ++i) { 9 | for (int j = 0; j < cols_b;
++j) { 10 | double sum = 0.0; 11 | for (int k = 0; k < cols_a; ++k) { 12 | sum += a[i * cols_a + k] * b[k * cols_b + j]; 13 | } 14 | c[i * cols_b + j] = sum; 15 | } 16 | } 17 | } 18 | 19 | // Add element-wise tensor operations 20 | void tensor_add(int size, double *a, double *b, double *result) { 21 | for (int i = 0; i < size; i++) { 22 | result[i] = a[i] + b[i]; 23 | } 24 | } 25 | 26 | void tensor_sub(int size, double *a, double *b, double *result) { 27 | for (int i = 0; i < size; i++) { 28 | result[i] = a[i] - b[i]; 29 | } 30 | } 31 | 32 | void tensor_mul_elementwise(int size, double *a, double *b, double *result) { 33 | for (int i = 0; i < size; i++) { 34 | result[i] = a[i] * b[i]; 35 | } 36 | } 37 | 38 | void tensor_scale(int size, double *a, double scalar, double *result) { 39 | for (int i = 0; i < size; i++) { 40 | result[i] = a[i] * scalar; 41 | } 42 | } 43 | 44 | // Activation functions in C for speed 45 | void relu_forward(int size, double *input, double *output) { 46 | for (int i = 0; i < size; i++) { 47 | output[i] = input[i] > 0 ? input[i] : 0; 48 | } 49 | } 50 | 51 | void relu_backward(int size, double *input, double *output) { 52 | for (int i = 0; i < size; i++) { 53 | output[i] = input[i] > 0 ? 1 : 0; 54 | } 55 | } 56 | 57 | void sigmoid_forward(int size, double *input, double *output) { 58 | for (int i = 0; i < size; i++) { 59 | output[i] = 1.0 / (1.0 + exp(-input[i])); 60 | } 61 | } 62 | 63 | void sigmoid_backward(int size, double *input, double *output) { 64 | for (int i = 0; i < size; i++) { 65 | double sigmoid_val = 1.0 / (1.0 + exp(-input[i])); 66 | output[i] = sigmoid_val * (1.0 - sigmoid_val); 67 | } 68 | } -------------------------------------------------------------------------------- /matrix_opencl.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | 5 | #ifdef __APPLE__ 6 | #include <OpenCL/opencl.h> 7 | #else 8 | #include <CL/cl.h> 9 | #endif 10 | 11 | // Matrix multiplication kernel as a string 12 | const char* matrixMulKernelSource = 13 | "__kernel void matrixMul(\n" 14 | " const int M, const int N, const int K,\n" 15 | " __global const double* A,\n" 16 | " __global const double* B,\n" 17 | " __global double* C) {\n" 18 | " \n" 19 | " // Get global position in the grid\n" 20 | " const int row = get_global_id(0);\n" 21 | " const int col = get_global_id(1);\n" 22 | " \n" 23 | " // Ensure we don't go out of bounds\n" 24 | " if (row < M && col < N) {\n" 25 | " double sum = 0.0;\n" 26 | " for (int k = 0; k < K; k++) {\n" 27 | " sum += A[row * K + k] * B[k * N + col];\n" 28 | " }\n" 29 | " C[row * N + col] = sum;\n" 30 | " }\n" 31 | "}\n"; 32 | 33 | // OpenCL context and related variables 34 | cl_context context = NULL; 35 | cl_command_queue queue = NULL; 36 | cl_program program = NULL; 37 | cl_kernel kernel = NULL; 38 | cl_int err; 39 | cl_bool is_opencl_initialized = CL_FALSE; 40 | 41 | // Initialize OpenCL 42 | cl_bool initialize_opencl() { 43 | if (is_opencl_initialized) return CL_TRUE; 44 | 45 | // Get platform and device information 46 | cl_platform_id platform_id = NULL; 47 | cl_device_id device_id = NULL; 48 | cl_uint ret_num_devices; 49 | cl_uint ret_num_platforms; 50 | 51 | err = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); 52 | if (err != CL_SUCCESS) { 53 | printf("Failed to get platform ID: %d\n", err); 54 | return CL_FALSE; 55 | } 56 | 57 | err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices); 58 | if (err != CL_SUCCESS) { 59 | // Try CPU if GPU is not available 60
| err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices); 61 | if (err != CL_SUCCESS) { 62 | printf("Failed to get device ID: %d\n", err); 63 | return CL_FALSE; 64 | } 65 | printf("Using CPU device (GPU not available)\n"); 66 | } 67 | 68 | // Create an OpenCL context 69 | context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err); 70 | if (err != CL_SUCCESS) { 71 | printf("Failed to create context: %d\n", err); 72 | return CL_FALSE; 73 | } 74 | 75 | // Create a command queue 76 | queue = clCreateCommandQueue(context, device_id, 0, &err); 77 | if (err != CL_SUCCESS) { 78 | printf("Failed to create command queue: %d\n", err); 79 | clReleaseContext(context); 80 | return CL_FALSE; 81 | } 82 | 83 | // Create the compute program from the source buffer 84 | program = clCreateProgramWithSource(context, 1, (const char **)&matrixMulKernelSource, NULL, &err); 85 | if (err != CL_SUCCESS) { 86 | printf("Failed to create program: %d\n", err); 87 | clReleaseCommandQueue(queue); 88 | clReleaseContext(context); 89 | return CL_FALSE; 90 | } 91 | 92 | // Build the program executable 93 | err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); 94 | if (err != CL_SUCCESS) { 95 | size_t len; 96 | char buffer[2048]; 97 | 98 | printf("Failed to build program executable: %d\n", err); 99 | clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); 100 | printf("%s\n", buffer); 101 | 102 | clReleaseProgram(program); 103 | clReleaseCommandQueue(queue); 104 | clReleaseContext(context); 105 | return CL_FALSE; 106 | } 107 | 108 | // Create the compute kernel 109 | kernel = clCreateKernel(program, "matrixMul", &err); 110 | if (err != CL_SUCCESS) { 111 | printf("Failed to create kernel: %d\n", err); 112 | clReleaseProgram(program); 113 | clReleaseCommandQueue(queue); 114 | clReleaseContext(context); 115 | return CL_FALSE; 116 | } 117 | 118 | is_opencl_initialized = CL_TRUE; 119 | return CL_TRUE; 120 | } 121 | 122 | // Clean up OpenCL resources 123 | void cleanup_opencl() { 124 | if (kernel) clReleaseKernel(kernel); 125 | if (program) clReleaseProgram(program); 126 | if (queue) clReleaseCommandQueue(queue); 127 | if (context) clReleaseContext(context); 128 | is_opencl_initialized = CL_FALSE; 129 | } 130 | 131 | // Matrix multiplication using OpenCL 132 | int matrix_multiply_opencl(int M, int N, int K, double* A, double* B, double* C) { 133 | cl_mem d_A, d_B, d_C; 134 | size_t global_work_size[2]; 135 | const size_t A_size = M * K * sizeof(double); 136 | const size_t B_size = K * N * sizeof(double); 137 | const size_t C_size = M * N * sizeof(double); 138 | 139 | // Initialize OpenCL if not already done 140 | if (!initialize_opencl()) { 141 | return -1; 142 | } 143 | 144 | // Create memory buffers on the device 145 | d_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, A_size, A, &err); 146 | if (err != CL_SUCCESS) { 147 | printf("Failed to create buffer for A: %d\n", err); 148 | return -1; 149 | } 150 | 151 | d_B = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, B_size, B, &err); 152 | if (err != CL_SUCCESS) { 153 | printf("Failed to create buffer for B: %d\n", err); 154 | clReleaseMemObject(d_A); 155 | return -1; 156 | } 157 | 158 | d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, C_size, NULL, &err); 159 | if (err != CL_SUCCESS) { 160 | printf("Failed to create buffer for C: %d\n", err); 161 | clReleaseMemObject(d_A); 162 | clReleaseMemObject(d_B); 163 | return -1; 164 | } 165 | 166 | // Set the arguments 
of the kernel
167 |     err = clSetKernelArg(kernel, 0, sizeof(int), (void *)&M);
168 |     err |= clSetKernelArg(kernel, 1, sizeof(int), (void *)&N);
169 |     err |= clSetKernelArg(kernel, 2, sizeof(int), (void *)&K);
170 |     err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&d_A);
171 |     err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&d_B);
172 |     err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&d_C);
173 | 
174 |     if (err != CL_SUCCESS) {
175 |         printf("Failed to set kernel arguments: %d\n", err);
176 |         clReleaseMemObject(d_A);
177 |         clReleaseMemObject(d_B);
178 |         clReleaseMemObject(d_C);
179 |         return -1;
180 |     }
181 | 
182 |     // Define work group size and execute the kernel
183 |     global_work_size[0] = M;
184 |     global_work_size[1] = N;
185 | 
186 |     err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
187 |     if (err != CL_SUCCESS) {
188 |         printf("Failed to execute kernel: %d\n", err);
189 |         clReleaseMemObject(d_A);
190 |         clReleaseMemObject(d_B);
191 |         clReleaseMemObject(d_C);
192 |         return -1;
193 |     }
194 | 
195 |     // Read back the results from the device
196 |     err = clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, C_size, C, 0, NULL, NULL);
197 |     if (err != CL_SUCCESS) {
198 |         printf("Failed to read output matrix: %d\n", err);
199 |         clReleaseMemObject(d_A);
200 |         clReleaseMemObject(d_B);
201 |         clReleaseMemObject(d_C);
202 |         return -1;
203 |     }
204 | 
205 |     // Release resources
206 |     clReleaseMemObject(d_A);
207 |     clReleaseMemObject(d_B);
208 |     clReleaseMemObject(d_C);
209 | 
210 |     return 0;
211 | }
212 | 
213 | // Expose a function that can be called to check if OpenCL is available
214 | int check_opencl_available() {
215 |     cl_bool result = initialize_opencl();
216 |     if (result == CL_TRUE) {
217 |         cleanup_opencl();
218 |         return 1;
219 |     }
220 |     return 0;
221 | }
--------------------------------------------------------------------------------
/mlx_cnn_ops.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <math.h>
5 | 
6 | #ifdef __APPLE__
7 | // To be replaced with actual MLX includes when available
8 | // #include
9 | #endif
10 | 
11 | // MLX version of convolution operation
12 | void mlx_conv2d_forward(int batch_size, int in_channels, int in_height, int in_width,
13 |                         int out_channels, int filter_height, int filter_width,
14 |                         int stride, int padding,
15 |                         double* input, double* filters, double* output) {
16 |     printf("MLX Convolution called - optimized for Apple Silicon\n");
17 | 
18 |     // For now, just call the CPU implementation
19 |     // In a real implementation, this would use MLX's optimized convolution
20 | 
21 |     int out_height = 1 + (in_height + 2 * padding - filter_height) / stride;
22 |     int out_width = 1 + (in_width + 2 * padding - filter_width) / stride;
23 | 
24 |     // Initialize output to zeros
25 |     int output_size = batch_size * out_channels * out_height * out_width;
26 |     memset(output, 0, output_size * sizeof(double));
27 | 
28 |     // Optimized for Apple Silicon - would use MLX operations
29 |     // Current implementation falls back to a somewhat optimized CPU version
30 | 
31 |     // Cache-efficient implementation
32 |     for (int b = 0; b < batch_size; b++) {
33 |         for (int oc = 0; oc < out_channels; oc++) {
34 |             for (int oh = 0; oh < out_height; oh++) {
35 |                 for (int ow = 0; ow < out_width; ow++) {
36 |                     int out_idx = ((b * out_channels + oc) * out_height + oh) * out_width + ow;
37 |                     double sum = 0.0;
38 | 
39 |                     for (int ic = 0; ic < in_channels; ic++) {
40 |                         for (int fh = 0; fh < 
filter_height; fh++) { 41 | int ih = oh * stride + fh - padding; 42 | if (ih < 0 || ih >= in_height) continue; 43 | 44 | for (int fw = 0; fw < filter_width; fw++) { 45 | int iw = ow * stride + fw - padding; 46 | if (iw < 0 || iw >= in_width) continue; 47 | 48 | int in_idx = ((b * in_channels + ic) * in_height + ih) * in_width + iw; 49 | int filter_idx = ((oc * in_channels + ic) * filter_height + fh) * filter_width + fw; 50 | 51 | sum += input[in_idx] * filters[filter_idx]; 52 | } 53 | } 54 | } 55 | 56 | output[out_idx] = sum; 57 | } 58 | } 59 | } 60 | } 61 | } 62 | 63 | // MLX version of max pooling 64 | void mlx_max_pool_2x2(int batch_size, int channels, int in_height, int in_width, 65 | double* input, double* output) { 66 | printf("MLX Max Pooling called - optimized for Apple Silicon\n"); 67 | 68 | // Calculate output dimensions 69 | int out_height = in_height / 2; 70 | int out_width = in_width / 2; 71 | 72 | // Optimized max pooling for Apple Silicon 73 | // Removing OpenMP pragma on Mac 74 | //#pragma omp parallel for collapse(3) 75 | for (int b = 0; b < batch_size; b++) { 76 | for (int c = 0; c < channels; c++) { 77 | for (int oh = 0; oh < out_height; oh++) { 78 | for (int ow = 0; ow < out_width; ow++) { 79 | int out_idx = ((b * channels + c) * out_height + oh) * out_width + ow; 80 | 81 | // Find max in the 2x2 region 82 | double max_val = -INFINITY; 83 | for (int kh = 0; kh < 2; kh++) { 84 | for (int kw = 0; kw < 2; kw++) { 85 | int ih = oh * 2 + kh; 86 | int iw = ow * 2 + kw; 87 | int in_idx = ((b * channels + c) * in_height + ih) * in_width + iw; 88 | 89 | if (input[in_idx] > max_val) { 90 | max_val = input[in_idx]; 91 | } 92 | } 93 | } 94 | 95 | output[out_idx] = max_val; 96 | } 97 | } 98 | } 99 | } 100 | } 101 | 102 | // MLX version of tensor flattening 103 | void mlx_flatten_tensor(int batch_size, int channels, int height, int width, 104 | double* input, double* output) { 105 | printf("MLX Flatten called - optimized for Apple Silicon\n"); 106 | 107 | int flat_size = channels * height * width; 108 | 109 | // Simple memory copy - already efficient since we're just reshaping 110 | for (int b = 0; b < batch_size; b++) { 111 | memcpy(output + b * flat_size, input + b * flat_size, flat_size * sizeof(double)); 112 | } 113 | } 114 | 115 | // MLX version of softmax 116 | void mlx_softmax(int batch_size, int num_classes, double* input, double* output) { 117 | printf("MLX Softmax called - optimized for Apple Silicon\n"); 118 | 119 | // Optimized for Apple Silicon 120 | // Removing OpenMP pragma on Mac 121 | //#pragma omp parallel for 122 | for (int b = 0; b < batch_size; b++) { 123 | // Find max value for numerical stability 124 | double max_val = -INFINITY; 125 | for (int c = 0; c < num_classes; c++) { 126 | if (input[b * num_classes + c] > max_val) { 127 | max_val = input[b * num_classes + c]; 128 | } 129 | } 130 | 131 | // Compute exponentials and sum 132 | double sum = 0.0; 133 | for (int c = 0; c < num_classes; c++) { 134 | output[b * num_classes + c] = exp(input[b * num_classes + c] - max_val); 135 | sum += output[b * num_classes + c]; 136 | } 137 | 138 | // Normalize 139 | for (int c = 0; c < num_classes; c++) { 140 | output[b * num_classes + c] /= sum; 141 | } 142 | } 143 | } 144 | 145 | // Check if MLX is available (placeholder) 146 | int check_mlx_available() { 147 | #ifdef __APPLE__ 148 | // On Apple Silicon this would check for MLX availability 149 | // For now, return true since we're on a Mac 150 | return 1; 151 | #else 152 | return 0; 153 | #endif 154 | } 
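The output-size arithmetic used by `mlx_conv2d_forward` is `out = 1 + (in + 2*padding - filter) / stride`. Below is a minimal, hypothetical driver (not part of the repo) that exercises the function on a 4x4 all-ones image with a single 3x3 all-ones filter, stride 1 and padding 0, so the output is 2x2 and every element is the sum of one 3x3 window, 9.0. It assumes compiling and linking against mlx_cnn_ops.c.

```c
#include <stdio.h>

/* Prototype copied from mlx_cnn_ops.c above (assumed linked in). */
void mlx_conv2d_forward(int batch_size, int in_channels, int in_height, int in_width,
                        int out_channels, int filter_height, int filter_width,
                        int stride, int padding,
                        double* input, double* filters, double* output);

int main(void) {
    double input[16], filter[9], output[4];  /* 1x1x4x4 in, 1x1x3x3 filter, 1x1x2x2 out */
    for (int i = 0; i < 16; i++) input[i] = 1.0;
    for (int i = 0; i < 9; i++)  filter[i] = 1.0;

    /* out_height = 1 + (4 + 2*0 - 3) / 1 = 2, and likewise out_width = 2 */
    mlx_conv2d_forward(1, 1, 4, 4, 1, 3, 3, 1, 0, input, filter, output);

    for (int i = 0; i < 4; i++)
        printf("%.1f ", output[i]);  /* expected: 9.0 9.0 9.0 9.0 */
    printf("\n");
    return 0;
}
```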
--------------------------------------------------------------------------------
/mlx_ops.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <math.h>
5 | 
6 | #ifdef __APPLE__
7 | // Check if MLX is available - this would actually import the MLX C API
8 | // #include // This would be the actual MLX C API header
9 | #endif
10 | 
11 | // Placeholder function to check if MLX is available
12 | int check_mlx_available() {
13 | #ifdef __APPLE__
14 |     // This would check if we're on Apple Silicon and MLX is installed
15 |     // For now, just a placeholder that returns 0 (not available)
16 |     return 0;
17 | #else
18 |     return 0;
19 | #endif
20 | }
21 | 
22 | // Matrix multiplication using MLX
23 | // This is a placeholder for the actual MLX implementation
24 | int matrix_multiply_mlx(int M, int N, int K, double* A, double* B, double* C) {
25 |     printf("MLX Matrix multiplication called (placeholder).\n");
26 |     printf("Dimensions: (%d x %d) * (%d x %d) = (%d x %d)\n", M, K, K, N, M, N);
27 | 
28 |     // Fall back to CPU implementation for now
29 |     for (int i = 0; i < M; i++) {
30 |         for (int j = 0; j < N; j++) {
31 |             double sum = 0.0;
32 |             for (int k = 0; k < K; k++) {
33 |                 sum += A[i * K + k] * B[k * N + j];
34 |             }
35 |             C[i * N + j] = sum;
36 |         }
37 |     }
38 | 
39 |     return 0;
40 | }
41 | 
42 | // These are all placeholder functions that would actually use MLX
43 | // in a real implementation
44 | 
45 | int tensor_add_mlx(int size, double* a, double* b, double* result) {
46 |     printf("MLX tensor add called (placeholder).\n");
47 |     for (int i = 0; i < size; i++) {
48 |         result[i] = a[i] + b[i];
49 |     }
50 |     return 0;
51 | }
52 | 
53 | int tensor_sub_mlx(int size, double* a, double* b, double* result) {
54 |     printf("MLX tensor subtract called (placeholder).\n");
55 |     for (int i = 0; i < size; i++) {
56 |         result[i] = a[i] - b[i];
57 |     }
58 |     return 0;
59 | }
60 | 
61 | int tensor_mul_elementwise_mlx(int size, double* a, double* b, double* result) {
62 |     printf("MLX tensor element-wise multiply called (placeholder).\n");
63 |     for (int i = 0; i < size; i++) {
64 |         result[i] = a[i] * b[i];
65 |     }
66 |     return 0;
67 | }
68 | 
69 | int relu_forward_mlx(int size, double* input, double* output) {
70 |     printf("MLX ReLU forward called (placeholder).\n");
71 |     for (int i = 0; i < size; i++) {
72 |         output[i] = input[i] > 0 ? input[i] : 0;
73 |     }
74 |     return 0;
75 | }
76 | 
77 | int relu_backward_mlx(int size, double* input, double* output) {
78 |     printf("MLX ReLU backward called (placeholder).\n");
79 |     for (int i = 0; i < size; i++) {
80 |         output[i] = input[i] > 0 ? 1 : 0;
81 |     }
82 |     return 0;
83 | }
--------------------------------------------------------------------------------
/mnist.rkt:
--------------------------------------------------------------------------------
1 | #lang racket
2 | 
3 | (require "tensor.rkt")
4 | (require "autograd.rkt")
5 | 
6 | ;; MNIST Loading functions
7 | (define (read-idx3-ubyte filename)
8 |   (let* ([p (open-input-file filename #:mode 'binary)]
9 |          [magic-number (integer-bytes->integer (read-bytes 4 p) #f #t)]
10 |          [num-images (integer-bytes->integer (read-bytes 4 p) #f #t)]
11 |          [num-rows (integer-bytes->integer (read-bytes 4 p) #f #t)]
12 |          [num-cols (integer-bytes->integer (read-bytes 4 p) #f #t)]
13 |          [data (make-vector (* num-images num-rows num-cols) 0)])
14 |     (for ([i (in-range (vector-length data))])
15 |       (vector-set! 
data i (read-byte p))) 16 | (close-input-port p) 17 | (t:create (list num-images (* num-rows num-cols)) data))) 18 | 19 | (define (read-idx1-ubyte filename) 20 | (let* ([p (open-input-file filename #:mode 'binary)] 21 | [magic-number (integer-bytes->integer (read-bytes 4 p) #f #t)] 22 | [num-items (integer-bytes->integer (read-bytes 4 p) #f #t)] 23 | [data (make-vector num-items 0)]) 24 | (for ([i (in-range num-items)]) 25 | (vector-set! data i (read-byte p))) 26 | (close-input-port p) 27 | (t:create (list num-items 1) data))) 28 | 29 | ;; Load MNIST data 30 | (define (load-mnist-data type) 31 | (let* ([base-path "/path to mnist/"] 32 | [images-file (string-append base-path 33 | (if (equal? type "train") 34 | "train-images.idx3-ubyte" 35 | "t10k-images.idx3-ubyte"))] 36 | [labels-file (string-append base-path 37 | (if (equal? type "train") 38 | "train-labels.idx1-ubyte" 39 | "t10k-labels.idx1-ubyte"))]) 40 | (printf "Looking for images-file at: ~a~n" images-file) 41 | (printf "Looking for labels-file at: ~a~n" labels-file) 42 | 43 | ;; Error checking 44 | (unless (file-exists? images-file) 45 | (error 'load-mnist-data 46 | (format "MNIST images file not found: ~a" images-file))) 47 | (unless (file-exists? labels-file) 48 | (error 'load-mnist-data 49 | (format "MNIST labels file not found: ~a" labels-file))) 50 | (values (read-idx3-ubyte images-file) 51 | (read-idx1-ubyte labels-file)))) 52 | 53 | ;; Normalize the data 54 | (define (normalize X) 55 | (t:scale X (/ 1.0 255.0))) 56 | 57 | ;; One-hot encode labels 58 | (define (one-hot y num-classes) 59 | (let* ([num-samples (car (t:shape y))] 60 | [encoded (make-vector (* num-samples num-classes) 0.0)]) 61 | (for ([i (in-range num-samples)]) 62 | (vector-set! encoded (+ (* i num-classes) (vector-ref (t:data y) i)) 1.0)) 63 | (t:create (list num-samples num-classes) encoded))) 64 | 65 | ;; Get a batch of data given indices 66 | (define (get-batch X indices) 67 | (let* ([batch-size (length indices)] 68 | [feature-size (cadr (t:shape X))] 69 | [batch-data (make-vector (* batch-size feature-size) 0.0)]) 70 | (for ([i (range batch-size)] 71 | [idx indices]) 72 | (for ([j (range feature-size)]) 73 | (vector-set! batch-data 74 | (+ (* i feature-size) j) 75 | (vector-ref (t:data X) (+ (* idx feature-size) j))))) 76 | (t:create (list batch-size feature-size) batch-data))) 77 | 78 | ;; Softmax function 79 | (define (softmax z) 80 | (let* ([shape (t:shape z)] 81 | [data (t:data z)] 82 | [max-vals (for/vector ([i (in-range (car shape))]) 83 | (apply max (for/list ([j (in-range (cadr shape))]) 84 | (vector-ref data (+ (* i (cadr shape)) j)))))] 85 | [exp-vals (for/vector ([i (in-range (vector-length data))]) 86 | (exp (- (vector-ref data i) 87 | (vector-ref max-vals (quotient i (cadr shape))))))] 88 | [sum-vals (for/vector ([i (in-range (car shape))]) 89 | (for/sum ([j (in-range (cadr shape))]) 90 | (vector-ref exp-vals (+ (* i (cadr shape)) j))))]) 91 | (t:create shape 92 | (for/vector ([i (in-range (vector-length exp-vals))]) 93 | (/ (vector-ref exp-vals i) 94 | (vector-ref sum-vals (quotient i (cadr shape)))))))) 95 | 96 | ;; Forward pass 97 | (define (broadcast-bias b batch-size) 98 | (let* ([bias-shape (t:shape b)] 99 | [num-classes (cadr bias-shape)] 100 | [expanded-data (make-vector (* batch-size num-classes) 0.0)]) 101 | (for ([i (in-range batch-size)]) 102 | (for ([j (in-range num-classes)]) 103 | (vector-set! 
expanded-data (+ (* i num-classes) j) 104 | (vector-ref (t:data b) j)))) 105 | (t:create (list batch-size num-classes) expanded-data))) 106 | 107 | (define (forward X w b) 108 | (let* ([z (t:add (t:mul X w) (broadcast-bias b (car (t:shape X))))]) ; Broadcast b 109 | (softmax z))) 110 | 111 | ;; Cross-entropy loss 112 | (define (cross-entropy y-pred y-true) 113 | (let* ([m (car (t:shape y-true))] 114 | [epsilon 1e-15] 115 | [y-pred-clipped (t:create (t:shape y-pred) 116 | (for/list ([p (vector->list (t:data y-pred))]) 117 | (max (min p (- 1.0 epsilon)) epsilon)))] 118 | [loss-vec (for/list ([i (in-range (vector-length (t:data y-true)))]) 119 | (* (vector-ref (t:data y-true) i) 120 | (log (vector-ref (t:data y-pred-clipped) i))))]) 121 | (- (/ (apply + loss-vec) m)))) 122 | 123 | ;; Initialize parameters 124 | (define input-size 784) ; 28x28 pixels 125 | (define num-classes 10) ; digits 0-9 126 | (define weights (t:random (list input-size num-classes) 0.01)) ; (input-size, num-classes) 127 | (define bias (t:random (list 1 num-classes) 0.01)) ; (1, num-classes) 128 | 129 | ;; Training hyperparameters 130 | (define learning-rate 0.1) 131 | (define epochs 3) ; Reduced for testing 132 | (define batch-size 64) 133 | 134 | ;; Load and initialize MNIST data 135 | (printf "Loading MNIST data...~n") 136 | (define-values (X-train y-train) (load-mnist-data "train")) ; Load training data 137 | (define-values (X-test y-test) (load-mnist-data "test")) ; Load test data 138 | 139 | (printf "Normalizing data...~n") 140 | (set! X-train (normalize X-train)) ; Normalize X-train 141 | (set! X-test (normalize X-test)) ; Normalize X-test 142 | 143 | (printf "One-hot encoding labels...~n") 144 | (set! y-train (one-hot y-train 10)) ; Convert y-train to one-hot 145 | (set! y-test (one-hot y-test 10)) ; Convert y-test to one-hot 146 | 147 | ;; Training loop using autograd 148 | (define (train-batch X-batch y-batch) 149 | (let* ([y-pred (forward X-batch weights bias)] 150 | [loss (cross-entropy y-pred y-batch)] 151 | 152 | ;; Compute gradient of loss with respect to predictions 153 | [batch-size (car (t:shape y-batch))] 154 | [dloss (t:sub y-pred y-batch)] ; for cross-entropy with softmax, gradient is (pred - true) 155 | 156 | ;; Use autograd to compute gradients 157 | ;; Since we don't have a full autograd system, we'll compute gradients manually 158 | ;; but structure it to use the autograd module's functions 159 | 160 | ;; For softmax regression, gradient for weights: X^T * (y_pred - y_true) / batch_size 161 | [gradient-w (t:scale (t:mul (t:transpose X-batch) dloss) 162 | (/ 1.0 batch-size))] 163 | 164 | ;; Gradient for bias: sum(y_pred - y_true, axis=0) / batch_size 165 | [gradient-b (t:create (t:shape bias) 166 | (for/list ([j (in-range num-classes)]) 167 | (/ (for/sum ([i (in-range batch-size)]) 168 | (vector-ref (t:data dloss) (+ (* i num-classes) j))) 169 | batch-size)))]) 170 | 171 | ;; Update parameters with gradients 172 | (set! weights (t:sub weights (t:scale gradient-w learning-rate))) 173 | (set! 
bias (t:sub bias (t:scale gradient-b learning-rate))) 174 | 175 | loss)) 176 | 177 | (define (get-test-accuracy X y) 178 | (let* ([predictions (forward X weights bias)] 179 | [num-samples (car (t:shape X))] 180 | [num-classes (cadr (t:shape y))] 181 | [pred-data (t:data predictions)] 182 | [true-data (t:data y)] 183 | [correct-count 0]) 184 | 185 | ; Count correct predictions 186 | (for ([i (range num-samples)]) 187 | (let* ([start-idx (* i num-classes)] 188 | ; Find predicted class (max probability index) 189 | [pred-vals (for/list ([j (range num-classes)]) 190 | (vector-ref pred-data (+ start-idx j)))] 191 | [pred-class (argmax (lambda (j) (list-ref pred-vals j)) 192 | (range num-classes))] 193 | ; Find true class 194 | [true-class (for/first ([j (range num-classes)] 195 | #:when (= 1.0 (vector-ref true-data (+ start-idx j)))) 196 | j)]) 197 | ; Increment counter if prediction matches truth 198 | (when (= pred-class true-class) 199 | (set! correct-count (add1 correct-count))))) 200 | 201 | ; Return accuracy as percentage 202 | (exact->inexact (* 100.0 (/ correct-count num-samples))))) 203 | 204 | ;; Create a validation split from training data 205 | (define validation-split 0.1) ; 10% for validation 206 | (define train-size (car (t:shape X-train))) 207 | (define validation-size (inexact->exact (floor (* train-size validation-split)))) 208 | (define actual-train-size (- train-size validation-size)) 209 | 210 | ;; Shuffle indices and split data 211 | (define all-indices (shuffle (range train-size))) 212 | (define train-indices (take all-indices actual-train-size)) 213 | (define validation-indices (drop all-indices actual-train-size)) 214 | 215 | ;; Create validation set 216 | (define X-val (get-batch X-train validation-indices)) 217 | (define y-val (get-batch y-train validation-indices)) 218 | 219 | ;; Update training set 220 | (define X-train-actual (get-batch X-train train-indices)) 221 | (define y-train-actual (get-batch y-train train-indices)) 222 | 223 | ;; Set as new training set 224 | (set! X-train X-train-actual) 225 | (set! y-train y-train-actual) 226 | 227 | ;; Define early stopping parameters 228 | (define patience 3) 229 | (define min-delta 0.001) 230 | (define wait 0) 231 | (define best-val-accuracy 0.0) 232 | (define best-weights weights) 233 | (define best-bias bias) 234 | 235 | ;; Main training loop with early stopping 236 | (printf "Starting training...~n") 237 | (printf "Training on ~a samples, validating on ~a samples, testing on ~a samples~n~n" 238 | (car (t:shape X-train)) 239 | (car (t:shape X-val)) 240 | (car (t:shape X-test))) 241 | 242 | (define best-test-accuracy 0.0) 243 | 244 | (for/fold ([stop? #f]) 245 | ([epoch (in-range epochs)] #:break stop?) 246 | (printf "Epoch ~a/~a:~n" (add1 epoch) epochs) 247 | 248 | (let* ([indices (shuffle (range (car (t:shape X-train))))] 249 | [num-batches (quotient (length indices) batch-size)] 250 | [epoch-losses '()]) 251 | 252 | ; Train on batches 253 | (for ([batch (in-range num-batches)]) 254 | (let* ([batch-indices (take (drop indices (* batch batch-size)) batch-size)] 255 | [X-batch (get-batch X-train batch-indices)] 256 | [y-batch (get-batch y-train batch-indices)] 257 | [loss (train-batch X-batch y-batch)]) 258 | 259 | (set! 
epoch-losses (cons loss epoch-losses)) 260 | 261 | (when (= (modulo batch 50) 0) 262 | (printf " Batch ~a/~a - Loss: ~a~n" 263 | batch 264 | num-batches 265 | (real->decimal-string loss 4))))) 266 | 267 | ; Epoch evaluation with validation 268 | (let* ([avg-loss (/ (apply + epoch-losses) (length epoch-losses))] 269 | [val-accuracy (get-test-accuracy X-val y-val)] 270 | [test-accuracy (get-test-accuracy X-test y-test)]) 271 | 272 | (printf "~nEpoch Summary:~n") 273 | (printf " Average Loss: ~a~n" (real->decimal-string avg-loss 4)) 274 | (printf " Validation Accuracy: ~a%~n" (real->decimal-string val-accuracy 2)) 275 | (printf " Test Accuracy: ~a%~n" (real->decimal-string test-accuracy 2)) 276 | 277 | ;; Save best model based on validation accuracy 278 | (if (> val-accuracy best-val-accuracy) 279 | (begin 280 | (set! best-val-accuracy val-accuracy) 281 | (set! best-weights (t:create (t:shape weights) (t:data weights))) 282 | (set! best-bias (t:create (t:shape bias) (t:data bias))) 283 | (set! wait 0) 284 | (printf " New best model saved!~n")) 285 | (begin 286 | (set! wait (add1 wait)) 287 | (printf " No improvement for ~a epochs.~n" wait))) 288 | 289 | ;; Save best test accuracy for reporting 290 | (when (> test-accuracy best-test-accuracy) 291 | (set! best-test-accuracy test-accuracy)) 292 | 293 | (printf " Best Validation Accuracy: ~a%~n" (real->decimal-string best-val-accuracy 2)) 294 | (printf " Best Test Accuracy So Far: ~a%~n~n" (real->decimal-string best-test-accuracy 2)) 295 | 296 | ;; Check early stopping condition 297 | (if (>= wait patience) 298 | (begin 299 | (printf "Early stopping triggered after ~a epochs without improvement.~n" patience) 300 | #t) ; Stop training 301 | #f)))) ; Continue training 302 | 303 | ;; Restore best model 304 | (set! weights best-weights) 305 | (set! bias best-bias) 306 | 307 | (printf "Training Complete!~n") 308 | (printf "Final Test Accuracy: ~a%~n" 309 | (real->decimal-string (get-test-accuracy X-test y-test) 2)) 310 | (printf "Best Validation Accuracy: ~a%~n" 311 | (real->decimal-string best-val-accuracy 2)) 312 | (printf "Best Test Accuracy: ~a%~n" 313 | (real->decimal-string best-test-accuracy 2)) 314 | -------------------------------------------------------------------------------- /mnist_benchmark.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "mnist_device.rkt" 4 | "device.rkt" 5 | "hardware_detection.rkt") 6 | 7 | ;; Benchmark on all available devices 8 | (define (benchmark-mnist epochs batch-size) 9 | (printf "====================================\n") 10 | (printf " RACOGRAD MNIST BENCHMARKS\n") 11 | (printf "====================================\n") 12 | 13 | (print-hardware-info) 14 | 15 | ;; Run on CPU 16 | (printf "\n\nRunning MNIST benchmark on CPU...\n") 17 | (define cpu-result (train-mnist 'cpu batch-size epochs)) 18 | 19 | (define results (list (cons 'cpu cpu-result))) 20 | 21 | ;; Run on MLX if available 22 | (when (has-mlx-support?) 23 | (printf "\n\nRunning MNIST benchmark on MLX (Apple Silicon)...\n") 24 | (define mlx-result (train-mnist 'mlx batch-size epochs)) 25 | (set! results (cons (cons 'mlx mlx-result) results))) 26 | 27 | ;; Run on CUDA if available 28 | (when (has-cuda-support?) 29 | (printf "\n\nRunning MNIST benchmark on CUDA (NVIDIA GPU)...\n") 30 | (define cuda-result (train-mnist 'cuda batch-size epochs)) 31 | (set! results (cons (cons 'cuda cuda-result) results))) 32 | 33 | ;; Run on OpenCL if available 34 | (when (has-opencl?) 
35 | (printf "\n\nRunning MNIST benchmark on OpenCL...\n") 36 | (define opencl-result (train-mnist 'opencl batch-size epochs)) 37 | (set! results (cons (cons 'opencl opencl-result) results))) 38 | 39 | ;; Compare results 40 | (printf "\n\n====================================\n") 41 | (printf " PERFORMANCE COMPARISON\n") 42 | (printf "====================================\n") 43 | 44 | (define cpu-time (hash-ref (cdr (assoc 'cpu results)) 'time)) 45 | (define cpu-accuracy (hash-ref (cdr (assoc 'cpu results)) 'accuracy)) 46 | 47 | (printf "CPU Training Time: ~a seconds\n" (real->decimal-string cpu-time 2)) 48 | (printf "CPU Accuracy: ~a%\n\n" (real->decimal-string cpu-accuracy 2)) 49 | 50 | (for ([result (in-list results)]) 51 | (unless (eq? (car result) 'cpu) 52 | (let* ([device-type (car result)] 53 | [device-result (cdr result)] 54 | [device-time (hash-ref device-result 'time)] 55 | [device-accuracy (hash-ref device-result 'accuracy)] 56 | [speedup (/ cpu-time device-time)]) 57 | (printf "~a Training Time: ~a seconds\n" 58 | device-type 59 | (real->decimal-string device-time 2)) 60 | (printf "~a Accuracy: ~a%\n" 61 | device-type 62 | (real->decimal-string device-accuracy 2)) 63 | (printf "Speedup vs CPU: ~a times faster\n\n" 64 | (real->decimal-string speedup 2))))) 65 | 66 | (printf "====================================\n")) 67 | 68 | ;; Run the benchmark when executed directly 69 | (module+ main 70 | ;; Use smaller values for quick testing 71 | (benchmark-mnist 2 128)) -------------------------------------------------------------------------------- /mnist_device.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor_device.rkt" 4 | "device.rkt" 5 | "autograd.rkt") 6 | 7 | ;; MNIST Loading functions 8 | (define (read-idx3-ubyte filename) 9 | (let* ([p (open-input-file filename #:mode 'binary)] 10 | [magic-number (integer-bytes->integer (read-bytes 4 p) #f #t)] 11 | [num-images (integer-bytes->integer (read-bytes 4 p) #f #t)] 12 | [num-rows (integer-bytes->integer (read-bytes 4 p) #f #t)] 13 | [num-cols (integer-bytes->integer (read-bytes 4 p) #f #t)] 14 | [data (make-vector (* num-images num-rows num-cols) 0)]) 15 | (for ([i (in-range (vector-length data))]) 16 | (vector-set! data i (read-byte p))) 17 | (close-input-port p) 18 | (dt:create (list num-images (* num-rows num-cols)) data))) 19 | 20 | (define (read-idx1-ubyte filename) 21 | (let* ([p (open-input-file filename #:mode 'binary)] 22 | [magic-number (integer-bytes->integer (read-bytes 4 p) #f #t)] 23 | [num-items (integer-bytes->integer (read-bytes 4 p) #f #t)] 24 | [data (make-vector num-items 0)]) 25 | (for ([i (in-range num-items)]) 26 | (vector-set! data i (read-byte p))) 27 | (close-input-port p) 28 | (dt:create (list num-items 1) data))) 29 | 30 | ;; Load MNIST data 31 | (define (load-mnist-data type) 32 | (let* ([base-path "/path to mnist/"] 33 | [images-file (string-append base-path 34 | (if (equal? type "train") 35 | "train-images.idx3-ubyte" 36 | "t10k-images.idx3-ubyte"))] 37 | [labels-file (string-append base-path 38 | (if (equal? type "train") 39 | "train-labels.idx1-ubyte" 40 | "t10k-labels.idx1-ubyte"))]) 41 | (printf "Looking for images-file at: ~a~n" images-file) 42 | (printf "Looking for labels-file at: ~a~n" labels-file) 43 | 44 | ;; Error checking 45 | (unless (file-exists? images-file) 46 | (error 'load-mnist-data 47 | (format "MNIST images file not found: ~a" images-file))) 48 | (unless (file-exists? 
labels-file) 49 | (error 'load-mnist-data 50 | (format "MNIST labels file not found: ~a" labels-file))) 51 | (values (read-idx3-ubyte images-file) 52 | (read-idx1-ubyte labels-file)))) 53 | 54 | ;; Normalize the data 55 | (define (normalize X) 56 | (dt:scale X (/ 1.0 255.0))) 57 | 58 | ;; One-hot encode labels 59 | (define (one-hot y num-classes) 60 | (let* ([num-samples (car (dt:shape y))] 61 | [encoded (make-vector (* num-samples num-classes) 0.0)]) 62 | (for ([i (in-range num-samples)]) 63 | (vector-set! encoded (+ (* i num-classes) (vector-ref (dt:data y) i)) 1.0)) 64 | (dt:create (list num-samples num-classes) encoded))) 65 | 66 | ;; Get a batch of data given indices 67 | (define (get-batch X indices) 68 | (let* ([batch-size (length indices)] 69 | [feature-size (cadr (dt:shape X))] 70 | [batch-data (make-vector (* batch-size feature-size) 0.0)]) 71 | (for ([i (range batch-size)] 72 | [idx indices]) 73 | (for ([j (range feature-size)]) 74 | (vector-set! batch-data 75 | (+ (* i feature-size) j) 76 | (vector-ref (dt:data X) (+ (* idx feature-size) j))))) 77 | (dt:create (list batch-size feature-size) batch-data))) 78 | 79 | ;; Softmax function 80 | (define (softmax z) 81 | (let* ([shape (dt:shape z)] 82 | [data (dt:data z)] 83 | [max-vals (for/vector ([i (in-range (car shape))]) 84 | (apply max (for/list ([j (in-range (cadr shape))]) 85 | (vector-ref data (+ (* i (cadr shape)) j)))))] 86 | [exp-vals (for/vector ([i (in-range (vector-length data))]) 87 | (exp (- (vector-ref data i) 88 | (vector-ref max-vals (quotient i (cadr shape))))))] 89 | [sum-vals (for/vector ([i (in-range (car shape))]) 90 | (for/sum ([j (in-range (cadr shape))]) 91 | (vector-ref exp-vals (+ (* i (cadr shape)) j))))]) 92 | (dt:create shape 93 | (for/vector ([i (in-range (vector-length exp-vals))]) 94 | (/ (vector-ref exp-vals i) 95 | (vector-ref sum-vals (quotient i (cadr shape)))))))) 96 | 97 | ;; Forward pass 98 | (define (broadcast-bias b batch-size) 99 | (let* ([bias-shape (dt:shape b)] 100 | [num-classes (cadr bias-shape)] 101 | [expanded-data (make-vector (* batch-size num-classes) 0.0)]) 102 | (for ([i (in-range batch-size)]) 103 | (for ([j (in-range num-classes)]) 104 | (vector-set! expanded-data (+ (* i num-classes) j) 105 | (vector-ref (dt:data b) j)))) 106 | (dt:create (list batch-size num-classes) expanded-data))) 107 | 108 | (define (forward X w b) 109 | (let* ([z (dt:add (dt:mul X w) (broadcast-bias b (car (dt:shape X))))]) ; Broadcast b 110 | (softmax z))) 111 | 112 | ;; Cross-entropy loss 113 | (define (cross-entropy y-pred y-true) 114 | (let* ([m (car (dt:shape y-true))] 115 | [epsilon 1e-15] 116 | [y-pred-clipped (dt:create (dt:shape y-pred) 117 | (for/list ([p (vector->list (dt:data y-pred))]) 118 | (max (min p (- 1.0 epsilon)) epsilon)))] 119 | [loss-vec (for/list ([i (in-range (vector-length (dt:data y-true)))]) 120 | (* (vector-ref (dt:data y-true) i) 121 | (log (vector-ref (dt:data y-pred-clipped) i))))]) 122 | (- (/ (apply + loss-vec) m)))) 123 | 124 | ;; Initialize parameters 125 | (define input-size 784) ; 28x28 pixels 126 | (define num-classes 10) ; digits 0-9 127 | 128 | ;; Run the training process with a specified device 129 | (provide train-mnist) 130 | 131 | (define (train-mnist device-type batch-size [epochs 10]) 132 | (printf "~nStarting MNIST training on device: ~a~n" device-type) 133 | 134 | ;; Set the device for computation 135 | (cond 136 | [(eq? device-type 'cpu) 137 | (set-current-device! (cpu))] 138 | [(eq? device-type 'mlx) 139 | (if (device-available? 
'mlx) 140 | (set-current-device! (mlx)) 141 | (begin 142 | (printf "MLX not available. Falling back to CPU.\n") 143 | (set-current-device! (cpu))))] 144 | [(eq? device-type 'cuda) 145 | (if (device-available? 'cuda) 146 | (set-current-device! (cuda)) 147 | (begin 148 | (printf "CUDA not available. Falling back to CPU.\n") 149 | (set-current-device! (cpu))))] 150 | [(eq? device-type 'opencl) 151 | (if (device-available? 'opencl) 152 | (set-current-device! (opencl)) 153 | (begin 154 | (printf "OpenCL not available. Falling back to CPU.\n") 155 | (set-current-device! (cpu))))] 156 | [(eq? device-type 'gpu) 157 | (if (gpu-available?) 158 | (set-current-device! (gpu)) 159 | (begin 160 | (printf "No GPU available. Falling back to CPU.\n") 161 | (set-current-device! (cpu))))] 162 | [else 163 | (printf "Unknown device type: ~a. Using CPU.\n" device-type) 164 | (set-current-device! (cpu))]) 165 | 166 | (printf "Using device: ~a~n" (get-device-type (current-device))) 167 | 168 | ;; Create model parameters on the selected device 169 | (define weights (dt:random (list input-size num-classes) 0.01)) ; (input-size, num-classes) 170 | (define bias (dt:random (list 1 num-classes) 0.01)) ; (1, num-classes) 171 | 172 | ;; Training hyperparameters 173 | (define learning-rate 0.1) 174 | 175 | ;; Load and initialize MNIST data 176 | (printf "Loading MNIST data...~n") 177 | (define-values (X-train y-train) (load-mnist-data "train")) ; Load training data 178 | (define-values (X-test y-test) (load-mnist-data "test")) ; Load test data 179 | 180 | (printf "Normalizing data...~n") 181 | (set! X-train (normalize X-train)) ; Normalize X-train 182 | (set! X-test (normalize X-test)) ; Normalize X-test 183 | 184 | (printf "One-hot encoding labels...~n") 185 | (set! y-train (one-hot y-train 10)) ; Convert y-train to one-hot 186 | (set! y-test (one-hot y-test 10)) ; Convert y-test to one-hot 187 | 188 | ;; Move data to device if using GPU 189 | (when (eq? device-type 'gpu) 190 | (printf "Moving data to GPU...~n") 191 | (set! X-train (dt:to X-train (current-device))) 192 | (set! y-train (dt:to y-train (current-device))) 193 | (set! X-test (dt:to X-test (current-device))) 194 | (set! y-test (dt:to y-test (current-device)))) 195 | 196 | ;; Training loop 197 | (define (train-batch X-batch y-batch) 198 | (let* ([y-pred (forward X-batch weights bias)] 199 | [loss (cross-entropy y-pred y-batch)] 200 | [error (dt:sub y-pred y-batch)] 201 | [gradient-w (dt:scale (dt:mul (dt:transpose X-batch) error) 202 | (/ 1.0 (car (dt:shape X-batch))))] 203 | [gradient-b (dt:create (dt:shape bias) 204 | (for/list ([j (in-range num-classes)]) 205 | (/ (for/sum ([i (in-range (car (dt:shape error)))]) 206 | (vector-ref (dt:data error) (+ (* i num-classes) j))) 207 | (car (dt:shape error)))))]) 208 | ;; Update parameters 209 | (set! weights (dt:sub weights (dt:scale gradient-w learning-rate))) 210 | (set! 
bias (dt:sub bias (dt:scale gradient-b learning-rate))) 211 | loss)) 212 | 213 | (define (get-test-accuracy X y) 214 | (let* ([predictions (forward X weights bias)] 215 | [num-samples (car (dt:shape X))] 216 | [num-classes (cadr (dt:shape y))] 217 | [pred-data (dt:data predictions)] 218 | [true-data (dt:data y)] 219 | [correct-count 0]) 220 | 221 | ; Count correct predictions 222 | (for ([i (range num-samples)]) 223 | (let* ([start-idx (* i num-classes)] 224 | ; Find predicted class (max probability index) 225 | [pred-vals (for/list ([j (range num-classes)]) 226 | (vector-ref pred-data (+ start-idx j)))] 227 | [pred-class (argmax (lambda (j) (list-ref pred-vals j)) 228 | (range num-classes))] 229 | ; Find true class 230 | [true-class (for/first ([j (range num-classes)] 231 | #:when (= 1.0 (vector-ref true-data (+ start-idx j)))) 232 | j)]) 233 | ; Increment counter if prediction matches truth 234 | (when (= pred-class true-class) 235 | (set! correct-count (add1 correct-count))))) 236 | 237 | ; Return accuracy as percentage 238 | (exact->inexact (* 100.0 (/ correct-count num-samples))))) 239 | 240 | ;; Track time for benchmarking 241 | (define start-time (current-inexact-milliseconds)) 242 | 243 | ;; Main training loop 244 | (printf "Starting training...~n") 245 | (printf "Training on ~a samples, validating on ~a samples~n~n" 246 | (car (dt:shape X-train)) 247 | (car (dt:shape X-test))) 248 | 249 | (define best-accuracy 0.0) 250 | 251 | (for ([epoch (in-range epochs)]) 252 | (printf "Epoch ~a/~a:~n" (add1 epoch) epochs) 253 | 254 | (let* ([indices (shuffle (range (car (dt:shape X-train))))] 255 | [num-batches (quotient (length indices) batch-size)] 256 | [epoch-losses '()]) 257 | 258 | ; Train on batches 259 | (for ([batch (in-range num-batches)]) 260 | (let* ([batch-indices (take (drop indices (* batch batch-size)) batch-size)] 261 | [X-batch (get-batch X-train batch-indices)] 262 | [y-batch (get-batch y-train batch-indices)] 263 | [loss (train-batch X-batch y-batch)]) 264 | 265 | (set! epoch-losses (cons loss epoch-losses)) 266 | 267 | (when (= (modulo batch 50) 0) 268 | (printf " Batch ~a/~a - Loss: ~a~n" 269 | batch 270 | num-batches 271 | (real->decimal-string loss 4))))) 272 | 273 | ; Epoch evaluation 274 | (let* ([avg-loss (/ (apply + epoch-losses) (length epoch-losses))] 275 | [test-accuracy (get-test-accuracy X-test y-test)]) 276 | 277 | (when (> test-accuracy best-accuracy) 278 | (set! best-accuracy test-accuracy)) 279 | 280 | (printf "~nEpoch Summary:~n") 281 | (printf " Average Loss: ~a~n" (real->decimal-string avg-loss 4)) 282 | (printf " Test Accuracy: ~a%~n" (real->decimal-string test-accuracy 2)) 283 | (printf " Best Accuracy So Far: ~a%~n~n" (real->decimal-string best-accuracy 2))))) 284 | 285 | (define end-time (current-inexact-milliseconds)) 286 | (define total-time (/ (- end-time start-time) 1000.0)) 287 | 288 | (printf "Training Complete!~n") 289 | (printf "Final Test Accuracy: ~a%~n" 290 | (real->decimal-string (get-test-accuracy X-test y-test) 2)) 291 | (printf "Total training time: ~a seconds~n" (real->decimal-string total-time 2)) 292 | 293 | ;; Return benchmark results 294 | (hash 'accuracy (get-test-accuracy X-test y-test) 295 | 'time total-time 296 | 'device device-type)) 297 | 298 | ;; When executed directly, run on both CPU and GPU (if available) 299 | (module+ main 300 | (printf "Starting CPU training...~n") 301 | (define cpu-result (train-mnist 'cpu 128 2)) 302 | 303 | (if (gpu-available?) 
304 |       (let ([gpu-result (train-mnist 'gpu 128 2)])
305 |         (printf "~n~n===== PERFORMANCE COMPARISON =====~n")
306 |         (printf "CPU Training Time: ~a seconds~n" (real->decimal-string (hash-ref cpu-result 'time) 2))
307 |         (printf "GPU Training Time: ~a seconds~n" (real->decimal-string (hash-ref gpu-result 'time) 2))
308 |         (printf "Speedup: ~a times faster~n"
309 |                 (real->decimal-string (/ (hash-ref cpu-result 'time) (hash-ref gpu-result 'time)) 2))
310 |         (printf "CPU Accuracy: ~a%~n" (real->decimal-string (hash-ref cpu-result 'accuracy) 2))
311 |         (printf "GPU Accuracy: ~a%~n" (real->decimal-string (hash-ref gpu-result 'accuracy) 2))
312 |         (printf "===============================~n"))
313 |       (printf "~nGPU acceleration not available on this system. Skipping GPU training.~n")))
314 | 
--------------------------------------------------------------------------------
/parallel_ops.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <pthread.h>
4 | 
5 | // Thread arguments for parallel matrix multiplication
6 | typedef struct {
7 |     int start_row;
8 |     int end_row;
9 |     int cols_a;
10 |     int cols_b;
11 |     double *a;
12 |     double *b;
13 |     double *c;
14 | } thread_data_t;
15 | 
16 | // Thread function for matrix multiplication
17 | void* matrix_multiply_thread(void *arg) {
18 |     thread_data_t *data = (thread_data_t*)arg;
19 | 
20 |     for (int i = data->start_row; i < data->end_row; i++) {
21 |         for (int j = 0; j < data->cols_b; j++) {
22 |             double sum = 0.0;
23 |             for (int k = 0; k < data->cols_a; k++) {
24 |                 sum += data->a[i * data->cols_a + k] * data->b[k * data->cols_b + j];
25 |             }
26 |             data->c[i * data->cols_b + j] = sum;
27 |         }
28 |     }
29 | 
30 |     pthread_exit(NULL);
31 | }
32 | 
33 | // Parallel matrix multiplication
34 | void matrix_multiply_parallel(int rows_a, int cols_a, int cols_b, double *a, double *b, double *c, int num_threads) {
35 |     pthread_t threads[num_threads];
36 |     thread_data_t thread_data[num_threads];
37 | 
38 |     int rows_per_thread = rows_a / num_threads;
39 |     int remainder = rows_a % num_threads;
40 | 
41 |     int start_row = 0;
42 |     for (int t = 0; t < num_threads; t++) {
43 |         int end_row = start_row + rows_per_thread + (t < remainder ? 1 : 0);
44 | 
45 |         thread_data[t].start_row = start_row;
46 |         thread_data[t].end_row = end_row;
47 |         thread_data[t].cols_a = cols_a;
48 |         thread_data[t].cols_b = cols_b;
49 |         thread_data[t].a = a;
50 |         thread_data[t].b = b;
51 |         thread_data[t].c = c;
52 | 
53 |         pthread_create(&threads[t], NULL, matrix_multiply_thread, &thread_data[t]);
54 | 
55 |         start_row = end_row;
56 |     }
57 | 
58 |     // Wait for all threads to complete
59 |     for (int t = 0; t < num_threads; t++) {
60 |         pthread_join(threads[t], NULL);
61 |     }
62 | }
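matrix_multiply_parallel partitions rows with `rows_per_thread + (t < remainder ? 1 : 0)`: each of the first `remainder` threads takes one extra row, so the chunks tile the whole range with no gaps or overlap even when `rows_a` is not divisible by `num_threads`. A standalone sketch of just that partitioning arithmetic (illustrative, not part of the repo):

```c
#include <stdio.h>

int main(void) {
    int rows_a = 10, num_threads = 4;
    int rows_per_thread = rows_a / num_threads;  /* 2 */
    int remainder = rows_a % num_threads;        /* 2 */

    int start_row = 0;
    for (int t = 0; t < num_threads; t++) {
        /* First `remainder` threads get one extra row: chunks of 3, 3, 2, 2 */
        int end_row = start_row + rows_per_thread + (t < remainder ? 1 : 0);
        printf("thread %d: rows [%d, %d)\n", t, start_row, end_row);
        start_row = end_row;
    }
    return 0;
}
```

The same scheme reappears below in tensor_elementwise_parallel and process_batch_parallel, with elements and samples in place of rows.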
63 | 
64 | // Thread function for elementwise operations
65 | typedef struct {
66 |     int start_idx;
67 |     int end_idx;
68 |     double *a;
69 |     double *b;
70 |     double *result;
71 |     int operation; // 0: add, 1: subtract, 2: multiply
72 | } elementwise_thread_data_t;
73 | 
74 | void* elementwise_operation_thread(void *arg) {
75 |     elementwise_thread_data_t *data = (elementwise_thread_data_t*)arg;
76 | 
77 |     switch(data->operation) {
78 |         case 0: // Add
79 |             for (int i = data->start_idx; i < data->end_idx; i++) {
80 |                 data->result[i] = data->a[i] + data->b[i];
81 |             }
82 |             break;
83 |         case 1: // Subtract
84 |             for (int i = data->start_idx; i < data->end_idx; i++) {
85 |                 data->result[i] = data->a[i] - data->b[i];
86 |             }
87 |             break;
88 |         case 2: // Multiply
89 |             for (int i = data->start_idx; i < data->end_idx; i++) {
90 |                 data->result[i] = data->a[i] * data->b[i];
91 |             }
92 |             break;
93 |     }
94 | 
95 |     pthread_exit(NULL);
96 | }
97 | 
98 | // Parallel elementwise operations
99 | void tensor_elementwise_parallel(int size, double *a, double *b, double *result, int operation, int num_threads) {
100 |     pthread_t threads[num_threads];
101 |     elementwise_thread_data_t thread_data[num_threads];
102 | 
103 |     int elems_per_thread = size / num_threads;
104 |     int remainder = size % num_threads;
105 | 
106 |     int start_idx = 0;
107 |     for (int t = 0; t < num_threads; t++) {
108 |         int end_idx = start_idx + elems_per_thread + (t < remainder ? 
1 : 0); 109 | 110 | thread_data[t].start_idx = start_idx; 111 | thread_data[t].end_idx = end_idx; 112 | thread_data[t].a = a; 113 | thread_data[t].b = b; 114 | thread_data[t].result = result; 115 | thread_data[t].operation = operation; 116 | 117 | pthread_create(&threads[t], NULL, elementwise_operation_thread, &thread_data[t]); 118 | 119 | start_idx = end_idx; 120 | } 121 | 122 | // Wait for all threads to complete 123 | for (int t = 0; t < num_threads; t++) { 124 | pthread_join(threads[t], NULL); 125 | } 126 | } 127 | 128 | // Wrapper functions 129 | void tensor_add_parallel(int size, double *a, double *b, double *result, int num_threads) { 130 | tensor_elementwise_parallel(size, a, b, result, 0, num_threads); 131 | } 132 | 133 | void tensor_sub_parallel(int size, double *a, double *b, double *result, int num_threads) { 134 | tensor_elementwise_parallel(size, a, b, result, 1, num_threads); 135 | } 136 | 137 | void tensor_mul_elementwise_parallel(int size, double *a, double *b, double *result, int num_threads) { 138 | tensor_elementwise_parallel(size, a, b, result, 2, num_threads); 139 | } 140 | 141 | // Batch processing with parallelism for neural networks 142 | typedef struct { 143 | int start_sample; 144 | int end_sample; 145 | int input_dim; 146 | int output_dim; 147 | double *inputs; 148 | double *weights; 149 | double *biases; 150 | double *outputs; 151 | } batch_thread_data_t; 152 | 153 | void* process_batch_thread(void *arg) { 154 | batch_thread_data_t *data = (batch_thread_data_t*)arg; 155 | 156 | for (int i = data->start_sample; i < data->end_sample; i++) { 157 | // For each sample in this thread's portion of the batch 158 | for (int j = 0; j < data->output_dim; j++) { 159 | double sum = data->biases[j]; 160 | for (int k = 0; k < data->input_dim; k++) { 161 | sum += data->inputs[i * data->input_dim + k] * data->weights[k * data->output_dim + j]; 162 | } 163 | // Apply ReLU 164 | data->outputs[i * data->output_dim + j] = sum > 0 ? sum : 0; 165 | } 166 | } 167 | 168 | pthread_exit(NULL); 169 | } 170 | 171 | // Process a batch of samples through a layer in parallel 172 | void process_batch_parallel(int batch_size, int input_dim, int output_dim, 173 | double *inputs, double *weights, double *biases, 174 | double *outputs, int num_threads) { 175 | pthread_t threads[num_threads]; 176 | batch_thread_data_t thread_data[num_threads]; 177 | 178 | int samples_per_thread = batch_size / num_threads; 179 | int remainder = batch_size % num_threads; 180 | 181 | int start_sample = 0; 182 | for (int t = 0; t < num_threads; t++) { 183 | int end_sample = start_sample + samples_per_thread + (t < remainder ? 
1 : 0); 184 | 185 | thread_data[t].start_sample = start_sample; 186 | thread_data[t].end_sample = end_sample; 187 | thread_data[t].input_dim = input_dim; 188 | thread_data[t].output_dim = output_dim; 189 | thread_data[t].inputs = inputs; 190 | thread_data[t].weights = weights; 191 | thread_data[t].biases = biases; 192 | thread_data[t].outputs = outputs; 193 | 194 | pthread_create(&threads[t], NULL, process_batch_thread, &thread_data[t]); 195 | 196 | start_sample = end_sample; 197 | } 198 | 199 | // Wait for all threads to complete 200 | for (int t = 0; t < num_threads; t++) { 201 | pthread_join(threads[t], NULL); 202 | } 203 | } -------------------------------------------------------------------------------- /regression-test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt") 4 | (require "deep_learn_library.rkt") 5 | 6 | ;; Create sample data 7 | ;; X: 2 features, 4 samples 8 | (define X (t:create '(4 2) '(1 2 9 | 2 3 10 | 3 4 11 | 4 5))) 12 | 13 | ;; y: binary labels (0 or 1) 14 | (define y (t:create '(4 1) '(0.0 0.0 1.0 1.0))) 15 | 16 | ;; Initialize weights and bias 17 | (define weights (t:random '(2 1) 1.0)) ; 2 features -> 1 output 18 | (define bias (t:random '(1) 1.0)) 19 | 20 | ;; Sigmoid function 21 | (define (sigmoid z) 22 | (t:create (t:shape z) 23 | (for/list ([x (vector->list (t:data z))]) 24 | (/ 1.0 (+ 1.0 (exp (- x))))))) 25 | 26 | ;; Forward pass 27 | (define (forward X w b) 28 | (let* ([z (t:add (t:mul X w) b)]) 29 | (sigmoid z))) 30 | 31 | ;; Binary Cross Entropy Loss 32 | (define (binary-cross-entropy y-pred y-true) 33 | (let* ([m (car (t:shape y-true))] 34 | [epsilon 1e-15] ; To avoid log(0) 35 | [y-pred-clipped (t:create (t:shape y-pred) 36 | (for/list ([p (vector->list (t:data y-pred))]) 37 | (max (min p (- 1.0 epsilon)) epsilon)))] 38 | [loss-vec (for/list ([p (vector->list (t:data y-pred-clipped))] 39 | [t (vector->list (t:data y-true))]) 40 | (+ (* t (log p)) 41 | (* (- 1 t) (log (- 1.0 p)))))]) 42 | (- (/ (apply + loss-vec) m)))) 43 | 44 | ;; Training loop 45 | (define learning-rate 0.1) 46 | (define epochs 100) 47 | 48 | (for ([epoch (in-range epochs)]) 49 | (let* ([;; Forward pass 50 | y-pred (forward X weights bias)] 51 | [;; Calculate loss 52 | loss (binary-cross-entropy y-pred y)]) 53 | 54 | ;; Print progress every 10 epochs 55 | (when (= (modulo epoch 10) 0) 56 | (printf "Epoch ~a, Loss: ~a~n" epoch loss)) 57 | 58 | ;; Calculate gradients (simplified for this example) 59 | (let* ([error (t:sub y-pred y)] 60 | [gradient-w (t:scale (t:mul (t:transpose X) error) (/ 1.0 (car (t:shape X))))] 61 | [gradient-b (t:create '(1) 62 | (list (/ (for/sum ([e (vector->list (t:data error))]) e) 63 | (car (t:shape X)))))]) 64 | 65 | ;; Update weights and bias 66 | (set! weights (t:sub weights (t:scale gradient-w learning-rate))) 67 | (set! 
bias (t:sub bias (t:scale gradient-b learning-rate))))))
68 | 
69 | ;; Test the model
70 | (printf "~nFinal weights:~n")
71 | (t:print weights)
72 | (printf "~nFinal bias:~n")
73 | (t:print bias)
74 | 
75 | ;; Make predictions
76 | (printf "~nFinal predictions:~n")
77 | (t:print (forward X weights bias))
--------------------------------------------------------------------------------
/simd_ops.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <math.h>
4 | 
5 | #ifdef __AVX__
6 | #include <immintrin.h>
7 | #endif
8 | 
9 | #ifdef __SSE__
10 | #include <emmintrin.h>
11 | #endif
12 | 
13 | // Matrix multiplication with SIMD optimization where available
14 | void matrix_multiply_simd(int rows_a, int cols_a, int cols_b, double *a, double *b, double *c) {
15 | #ifdef __AVX__
16 |     // AVX optimized version (processes 4 doubles at once)
17 |     for (int i = 0; i < rows_a; ++i) {
18 |         for (int j = 0; j < cols_b; ++j) {
19 |             __m256d sum_vec = _mm256_setzero_pd();
20 | 
21 |             // Process 4 elements at a time
22 |             int k = 0;
23 |             for (; k <= cols_a - 4; k += 4) {
24 |                 __m256d a_vec = _mm256_loadu_pd(&a[i * cols_a + k]);
25 | 
26 |                 // For each of the 4 elements, we need to multiply by b[k+n][j]
27 |                 __m256d b_vec = _mm256_set_pd(
28 |                     b[(k+3) * cols_b + j],
29 |                     b[(k+2) * cols_b + j],
30 |                     b[(k+1) * cols_b + j],
31 |                     b[k * cols_b + j]
32 |                 );
33 | 
34 |                 // Multiply and add
35 |                 sum_vec = _mm256_add_pd(sum_vec, _mm256_mul_pd(a_vec, b_vec));
36 |             }
37 | 
38 |             // Horizontal sum of the vector
39 |             double sum_array[4];
40 |             _mm256_storeu_pd(sum_array, sum_vec);
41 |             double sum = sum_array[0] + sum_array[1] + sum_array[2] + sum_array[3];
42 | 
43 |             // Handle remaining elements
44 |             for (; k < cols_a; ++k) {
45 |                 sum += a[i * cols_a + k] * b[k * cols_b + j];
46 |             }
47 | 
48 |             c[i * cols_b + j] = sum;
49 |         }
50 |     }
51 | #elif defined(__SSE__)
52 |     // SSE optimized version (processes 2 doubles at once)
53 |     for (int i = 0; i < rows_a; ++i) {
54 |         for (int j = 0; j < cols_b; ++j) {
55 |             __m128d sum_vec = _mm_setzero_pd();
56 | 
57 |             // Process 2 elements at a time
58 |             int k = 0;
59 |             for (; k <= cols_a - 2; k += 2) {
60 |                 __m128d a_vec = _mm_loadu_pd(&a[i * cols_a + k]);
61 |                 __m128d b_vec = _mm_set_pd(
62 |                     b[(k+1) * cols_b + j],
63 |                     b[k * cols_b + j]
64 |                 );
65 | 
66 |                 // Multiply and add
67 |                 sum_vec = _mm_add_pd(sum_vec, _mm_mul_pd(a_vec, b_vec));
68 |             }
69 | 
70 |             // Horizontal sum
71 |             double sum_array[2];
72 |             _mm_storeu_pd(sum_array, sum_vec);
73 |             double sum = sum_array[0] + sum_array[1];
74 | 
75 |             // Handle remaining elements
76 |             for (; k < cols_a; ++k) {
77 |                 sum += a[i * cols_a + k] * b[k * cols_b + j];
78 |             }
79 | 
80 |             c[i * cols_b + j] = sum;
81 |         }
82 |     }
83 | #else
84 |     // Fallback to standard implementation
85 |     for (int i = 0; i < rows_a; ++i) {
86 |         for (int j = 0; j < cols_b; ++j) {
87 |             double sum = 0.0;
88 |             for (int k = 0; k < cols_a; ++k) {
89 |                 sum += a[i * cols_a + k] * b[k * cols_b + j];
90 |             }
91 |             c[i * cols_b + j] = sum;
92 |         }
93 |     }
94 | #endif
95 | }
96 | 
97 | // Element-wise operations with SIMD
98 | void tensor_add_simd(int size, double *a, double *b, double *result) {
99 | #ifdef __AVX__
100 |     int i = 0;
101 |     // Process 4 doubles at a time with AVX
102 |     for (; i <= size - 4; i += 4) {
103 |         __m256d a_vec = _mm256_loadu_pd(&a[i]);
104 |         __m256d b_vec = _mm256_loadu_pd(&b[i]);
105 |         __m256d res_vec = _mm256_add_pd(a_vec, b_vec);
106 |         _mm256_storeu_pd(&result[i], res_vec);
107 |     }
108 |     // Handle remaining elements
109 |     for (; i < size; i++) {
110 | 
result[i] = a[i] + b[i]; 111 | } 112 | #elif defined(__SSE__) 113 | int i = 0; 114 | // Process 2 doubles at a time with SSE 115 | for (; i <= size - 2; i += 2) { 116 | __m128d a_vec = _mm_loadu_pd(&a[i]); 117 | __m128d b_vec = _mm_loadu_pd(&b[i]); 118 | __m128d res_vec = _mm_add_pd(a_vec, b_vec); 119 | _mm_storeu_pd(&result[i], res_vec); 120 | } 121 | // Handle remaining elements 122 | for (; i < size; i++) { 123 | result[i] = a[i] + b[i]; 124 | } 125 | #else 126 | // Fallback implementation 127 | for (int i = 0; i < size; i++) { 128 | result[i] = a[i] + b[i]; 129 | } 130 | #endif 131 | } 132 | 133 | // Other SIMD operations (similar structure to tensor_add_simd) 134 | void tensor_mul_elementwise_simd(int size, double *a, double *b, double *result) { 135 | #ifdef __AVX__ 136 | int i = 0; 137 | for (; i <= size - 4; i += 4) { 138 | __m256d a_vec = _mm256_loadu_pd(&a[i]); 139 | __m256d b_vec = _mm256_loadu_pd(&b[i]); 140 | __m256d res_vec = _mm256_mul_pd(a_vec, b_vec); 141 | _mm256_storeu_pd(&result[i], res_vec); 142 | } 143 | for (; i < size; i++) { 144 | result[i] = a[i] * b[i]; 145 | } 146 | #elif defined(__SSE__) 147 | int i = 0; 148 | for (; i <= size - 2; i += 2) { 149 | __m128d a_vec = _mm_loadu_pd(&a[i]); 150 | __m128d b_vec = _mm_loadu_pd(&b[i]); 151 | __m128d res_vec = _mm_mul_pd(a_vec, b_vec); 152 | _mm_storeu_pd(&result[i], res_vec); 153 | } 154 | for (; i < size; i++) { 155 | result[i] = a[i] * b[i]; 156 | } 157 | #else 158 | for (int i = 0; i < size; i++) { 159 | result[i] = a[i] * b[i]; 160 | } 161 | #endif 162 | } 163 | 164 | // SIMD optimized ReLU 165 | void relu_forward_simd(int size, double *input, double *output) { 166 | #ifdef __AVX__ 167 | int i = 0; 168 | __m256d zeros = _mm256_setzero_pd(); 169 | 170 | for (; i <= size - 4; i += 4) { 171 | __m256d in_vec = _mm256_loadu_pd(&input[i]); 172 | __m256d res_vec = _mm256_max_pd(in_vec, zeros); 173 | _mm256_storeu_pd(&output[i], res_vec); 174 | } 175 | 176 | for (; i < size; i++) { 177 | output[i] = input[i] > 0 ? input[i] : 0; 178 | } 179 | #elif defined(__SSE__) 180 | int i = 0; 181 | __m128d zeros = _mm_setzero_pd(); 182 | 183 | for (; i <= size - 2; i += 2) { 184 | __m128d in_vec = _mm_loadu_pd(&input[i]); 185 | __m128d res_vec = _mm_max_pd(in_vec, zeros); 186 | _mm_storeu_pd(&output[i], res_vec); 187 | } 188 | 189 | for (; i < size; i++) { 190 | output[i] = input[i] > 0 ? input[i] : 0; 191 | } 192 | #else 193 | for (int i = 0; i < size; i++) { 194 | output[i] = input[i] > 0 ? 
input[i] : 0; 195 | } 196 | #endif 197 | } -------------------------------------------------------------------------------- /simple-test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt") 4 | (require "grad.rkt") 5 | 6 | (define (test-fnn) 7 | (let* ([input-dim 3] 8 | [hidden-dim 4] 9 | [output-dim 2] 10 | [batch-size 2] 11 | [learning-rate 0.01] 12 | [num-epochs 20] 13 | [input-data (list (list 0.1 0.2 0.3) (list 0.4 0.5 0.6))] 14 | [output-data (list (list 0.7 0.8) (list 0.9 0.1))]) 15 | 16 | (define (create-data-tensor data) 17 | (let ([flattened-data (apply append data)]) 18 | (create-tensor (list (length data) (length (car data))) flattened-data))) 19 | 20 | (let* ([input-tensor (create-data-tensor input-data)] 21 | [output-tensor (create-data-tensor output-data)] 22 | [hidden-weights (random-tensor (list input-dim hidden-dim) 0.1)] 23 | [hidden-biases (random-tensor (list hidden-dim) 0.1)] 24 | [output-weights (random-tensor (list hidden-dim output-dim) 0.1)] 25 | [output-biases (random-tensor (list output-dim) 0.1)]) 26 | 27 | (displayln "Initial hidden weights:") 28 | (print-tensor hidden-weights) 29 | (displayln "Initial hidden biases:") 30 | (print-tensor hidden-biases) 31 | (displayln "Initial output weights:") 32 | (print-tensor output-weights) 33 | (displayln "Initial output biases:") 34 | (print-tensor output-biases) 35 | (newline) 36 | 37 | (for ([epoch (in-range num-epochs)]) 38 | (let* ([hidden-output (dense-forward input-tensor hidden-weights hidden-biases)] 39 | [output (dense-forward hidden-output output-weights output-biases)] 40 | [loss (mean-squared-error output-tensor output)] 41 | [output-grad (tensor-subtract output output-tensor)]) 42 | (displayln (string-append "Epoch: " (number->string epoch) ", Loss: " (number->string loss))) 43 | 44 | (let-values ([(output-grad-weights output-grad-biases output-grad-input) 45 | (dense-backward hidden-output output-weights output-biases output output-grad learning-rate)] 46 | [(hidden-grad-weights hidden-grad-biases _) 47 | (dense-backward input-tensor hidden-weights hidden-biases hidden-output 48 | (tensor-multiply output-grad (transpose output-weights)) 49 | learning-rate)]) 50 | (set! output-weights output-grad-weights) 51 | (set! output-biases output-grad-biases) 52 | (set! hidden-weights hidden-grad-weights) 53 | (set! 
hidden-biases hidden-grad-biases)))) 54 | 55 | (displayln "Final hidden weights:") 56 | (print-tensor hidden-weights) 57 | (displayln "Final hidden biases:") 58 | (print-tensor hidden-biases) 59 | (displayln "Final output weights:") 60 | (print-tensor output-weights) 61 | (displayln "Final output biases:") 62 | (print-tensor output-biases)))) 63 | 64 | (test-fnn) 65 | -------------------------------------------------------------------------------- /tensor.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (provide (struct-out tensor) ; Export the tensor struct and its accessors 4 | ;; core operations 5 | t:create ; creates a tensor 6 | t:random ; random tensor 7 | t:reshape 8 | t:print 9 | 10 | ;; math operations 11 | t:add 12 | t:sub 13 | t:mul ; Matrix multiplication and elementwise 14 | t:dot ; Dot Product 15 | t:scale ; Scalar Multiplication 16 | t:transpose 17 | 18 | ;; Accessors 19 | t:shape ; Get shape 20 | t:data ; Get Data 21 | t:ref 22 | ; t:at ; Uncomment if implemented 23 | ; t:slice ; Uncomment if implemented 24 | ) 25 | 26 | (struct tensor (shape data) #:transparent) 27 | 28 | ;; Wrapper accessors 29 | (define (t:shape t) 30 | (tensor-shape t)) 31 | 32 | (define (t:data t) 33 | (tensor-data t)) 34 | 35 | ;; Create a tensor 36 | (define (t:create shape data) 37 | (let ((vec-data (if (vector? data) data (list->vector data)))) 38 | (cond 39 | [(= (apply * shape) (vector-length vec-data)) 40 | (tensor shape vec-data)] 41 | [else 42 | (begin 43 | (println "Error: Data does not match, please check the size") 44 | #f)]))) 45 | 46 | ;; Print tensor 47 | (define (t:print t) 48 | (let ([shape (tensor-shape t)] 49 | [data (tensor-data t)]) 50 | (cond 51 | [(= (length shape) 1) 52 | (display "[") 53 | (for ([i (in-range (car shape))]) 54 | (display (vector-ref data i)) 55 | (display " ")) 56 | (display "]") 57 | (newline)] 58 | [(= (length shape) 2) 59 | (for ([i (in-range (car shape))]) 60 | (display "[") 61 | (for ([j (in-range (cadr shape))]) 62 | (display (vector-ref data (+ (* i (cadr shape)) j))) 63 | (display " ")) 64 | (display "]") 65 | (newline))] 66 | [else 67 | (error "t:print: unsupported tensor shape")]))) 68 | 69 | ;; Random tensor 70 | (define (t:random shape range) 71 | (let* ((size (apply * shape)) 72 | (max-value (inexact->exact (floor (* range 10000))))) 73 | (tensor shape 74 | (for/vector ([i size]) 75 | (/ (random max-value) 10000.0))))) 76 | 77 | ;; Reshape tensor 78 | (define (t:reshape t new-shape) 79 | (let ([original-size (apply * (tensor-shape t))] 80 | [new-size (apply * new-shape)]) 81 | (if (= original-size new-size) 82 | (tensor new-shape (tensor-data t)) 83 | (error "t:reshape: New shape must have the same number of elements as the original shape")))) 84 | 85 | ;; Add tensors 86 | (define (t:add t1 t2) 87 | (let ([shape1 (tensor-shape t1)] 88 | [shape2 (tensor-shape t2)]) 89 | (cond 90 | [(equal? 
shape1 shape2)
91 |        (tensor shape1
92 |                (for/vector ([i (vector-length (tensor-data t1))])
93 |                  (+ (vector-ref (tensor-data t1) i)
94 |                     (vector-ref (tensor-data t2) i))))]
95 |       ;; A length-1 tensor broadcasts as a scalar
96 |       [(equal? shape1 '(1))
97 |        (let ([scalar-val (vector-ref (tensor-data t1) 0)])
98 |          (tensor shape2
99 |                  (for/vector ([i (vector-length (tensor-data t2))])
100 |                    (+ scalar-val (vector-ref (tensor-data t2) i)))))]
101 |       [(equal? shape2 '(1))
102 |        (let ([scalar-val (vector-ref (tensor-data t2) 0)])
103 |          (tensor shape1
104 |                  (for/vector ([i (vector-length (tensor-data t1))])
105 |                    (+ (vector-ref (tensor-data t1) i) scalar-val))))]
106 |       ;; A 1D tensor whose length matches the columns of a 2D tensor broadcasts row-wise
107 |       [(and (= (length shape1) 2) (= (length shape2) 1) (= (cadr shape1) (car shape2)))
108 |        (let ([cols (cadr shape1)])
109 |          (tensor shape1
110 |                  (for/vector ([i (vector-length (tensor-data t1))])
111 |                    (+ (vector-ref (tensor-data t1) i)
112 |                       (vector-ref (tensor-data t2) (remainder i cols))))))]
113 |       [(and (= (length shape1) 1) (= (length shape2) 2) (= (car shape1) (cadr shape2)))
114 |        (t:add t2 t1)]
115 |       [else
116 |        (error "t:add: Tensors must have the same shape or be broadcastable for addition")])))
117 | 
118 | ;; Subtract tensors
119 | (define (t:sub t1 t2)
120 |   (let ([shape1 (tensor-shape t1)]
121 |         [shape2 (tensor-shape t2)])
122 |     (cond
123 |       [(equal? shape1 shape2)
124 |        (tensor shape1
125 |                (for/vector ([i (vector-length (tensor-data t1))])
126 |                  (- (vector-ref (tensor-data t1) i)
127 |                     (vector-ref (tensor-data t2) i))))]
128 |       [else
129 |        (error "t:sub: Tensors must have the same shape for subtraction")])))
130 | 
131 | ;; Multiply tensors (Matrix multiply or elementwise)
132 | (define (t:mul t1 t2)
133 |   (let ([shape1 (tensor-shape t1)]
134 |         [shape2 (tensor-shape t2)])
135 |     (cond
136 |       ;; Matrix multiplication: (A: MxN) * (B: NxP) -> (C: MxP)
137 |       [(and (= (length shape1) 2) (= (length shape2) 2) (= (cadr shape1) (car shape2)))
138 |        (let* ([rows-a (car shape1)]
139 |               [cols-a (cadr shape1)]
140 |               [cols-b (cadr shape2)]
141 |               [result (make-vector (* rows-a cols-b) 0.0)])
142 |          (for ([i (in-range rows-a)])
143 |            (for ([j (in-range cols-b)])
144 |              (for ([k (in-range cols-a)])
145 |                (vector-set! result (+ (* i cols-b) j)
146 |                             (+ (vector-ref result (+ (* i cols-b) j))
147 |                                (* (t:ref t1 i k) (t:ref t2 k j)))))))
148 |          (tensor (list rows-a cols-b) result))]
149 | 
150 |       ;; Vector (1D) * Matrix (2D) multiplication when shapes align
151 |       [(and (= (length shape1) 1) (= (length shape2) 2) (= (car shape1) (car shape2)))
152 |        (let* ([rows-b (car shape2)]
153 |               [cols-b (cadr shape2)]
154 |               [result (make-vector cols-b 0.0)])
155 |          (for ([j (in-range cols-b)])
156 |            (for ([i (in-range rows-b)])
157 |              (vector-set! result j
158 |                           (+ (vector-ref result j)
159 |                              (* (vector-ref (tensor-data t1) i)
160 |                                 (t:ref t2 i j))))))
161 |          (tensor (list cols-b) result))]
162 | 
163 |       ;; Elementwise multiplication if shapes match and are both 2D (or same dimension)
164 |       [(equal? shape1 shape2)
165 |        (tensor shape1 (vector-map * (tensor-data t1) (tensor-data t2)))]
166 | 
167 |       [else
168 |        (error "t:mul: Tensors must have compatible shapes for multiplication")])))
169 | 
170 | ;; Reference element at (i, j)
171 | (define (t:ref t i j)
172 |   (vector-ref (tensor-data t) (+ (* i (cadr (tensor-shape t))) j)))
173 | 
174 | ;; Transpose a matrix (2D only)
175 | (define (t:transpose t)
176 |   (let* ([shape (tensor-shape t)]
177 |          [rows (car shape)]
178 |          [cols (cadr shape)]
179 |          [data (tensor-data t)]
180 |          [new-data (make-vector (apply * (reverse shape)) 0)])
181 |     (for* ([i rows]
182 |            [j cols])
183 |       (vector-set! new-data (+ (* j rows) i) (vector-ref data (+ (* i cols) j))))
184 |     (tensor (reverse shape) new-data)))
185 | 
186 | ;; Scalar multiply a tensor
187 | (define (t:scale t scalar)
188 |   (let ([data (tensor-data t)])
189 |     (tensor (tensor-shape t)
190 |             (for/vector ([v data])
191 |               (* v scalar)))))
192 | 
193 | ;; Dot product (1D only; tensor data are vectors, so use vector operations)
194 | (define (t:dot t1 t2)
195 |   (let ([data1 (tensor-data t1)]
196 |         [data2 (tensor-data t2)])
197 |     (if (not (= (vector-length data1) (vector-length data2)))
198 |         (error "t:dot: Tensors must have the same length for dot product")
199 |         (for/sum ([a (in-vector data1)] [b (in-vector data2)]) (* a b)))))
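;; ---------------------------------------------------------------------------
;; Added usage sketch (not part of the original tensor.rkt): a minimal, hedged
;; demo of the t: API above. The shapes and literal values are illustrative
;; assumptions only, not fixtures from the RacoGrad test suite.
(module+ main
  (define A (t:create '(2 3) '(1.0 2.0 3.0 4.0 5.0 6.0)))
  (define B (t:create '(3 2) '(1.0 0.0 0.0 1.0 1.0 1.0)))
  ;; (2x3) x (3x2) -> (2x2) matrix product
  (t:print (t:mul A B))
  ;; Row-wise broadcast: add a length-3 bias to each row of A
  (t:print (t:add A (t:create '(3) '(0.1 0.2 0.3))))
  ;; Dot product of two 1D tensors => 32.0
  (displayln (t:dot (t:create '(3) '(1.0 2.0 3.0))
                    (t:create '(3) '(4.0 5.0 6.0)))))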
--------------------------------------------------------------------------------
/tensor2.rkt:
--------------------------------------------------------------------------------
1 | #lang racket
2 | 
3 | (require ffi/unsafe
4 |          ffi/unsafe/define
5 |          ffi/unsafe/cvector)
6 | 
7 | ; Define the C function
8 | (define-ffi-definer define-c (ffi-lib "./matrix_multiplication.so"))
9 | (define-c matrix_multiply (_fun _pointer _pointer _pointer _int _int _int -> _void))
10 | 
11 | (provide tensor create-tensor tensor-add tensor-subtract tensor-multiply dense-forward mean-squared-error dense-backward print-tensor random-tensor reshape-tensor
12 |          transpose scalar-multiply dot-product relu relu-derivative tensor-shape tensor-data initialize-fnn)
13 | 
14 | (struct tensor (shape data) #:transparent)
15 | 
16 | (define (create-tensor shape data)
17 |   (let ((vec-data (if (vector? data) data (list->vector data))))
18 |     (cond
19 |       [(= (apply * shape) (vector-length vec-data))
20 |        (tensor shape vec-data)]
21 |       [else
22 |        (begin
23 |          (println "Error: Data does not match, please check the size")
24 |          #f)])))
25 | 
26 | ; Print a tensor; 1D tensors (e.g. bias vectors) print as a single row
27 | (define (print-tensor t)
28 |   (let ([shape (tensor-shape t)]
29 |         [data (tensor-data t)])
30 |     (cond
31 |       [(= (length shape) 1)
32 |        (display "[")
33 |        (for ([i (in-range (car shape))])
34 |          (display (vector-ref data i))
35 |          (display " "))
36 |        (display "]")
37 |        (newline)]
38 |       [else
39 |        (for ([i (in-range (car shape))])
40 |          (display "[")
41 |          (for ([j (in-range (cadr shape))])
42 |            (display (vector-ref data (+ (* i (cadr shape)) j)))
43 |            (display " "))
44 |          (display "]")
45 |          (newline))])))
46 | 
47 | ; Create random tensor
48 | (define (random-tensor shape range)
49 |   (let* ((size (apply * shape))
50 |          (max-value (inexact->exact (floor (* range 10000)))))
51 |     (tensor shape
52 |             (for/vector ([i size])
53 |               (/ (random max-value) 10000.0)))))
54 | 
55 | ; Reshaping tensors
56 | (define (reshape-tensor t new-shape)
57 |   (let ([original-size (apply * (tensor-shape t))]
58 |         [new-size (apply * new-shape)])
59 |     (if (= original-size new-size)
60 |         (tensor new-shape (tensor-data t))
61 |         (error "New shape must have the same number of elements as the original shape"))))
62 | 
63 | ; Element-wise operations
64 | (define (tensor-add t1 t2)
65 |   (let ([shape1 (tensor-shape t1)]
66 |         [shape2 (tensor-shape t2)])
67 |     (displayln (string-append "Adding tensors with shapes: " (format "~a" shape1) " and " (format "~a" shape2)))
68 |     (cond
69 |       [(equal? shape1 shape2)
70 |        (tensor shape1 (for/vector ([i (vector-length (tensor-data t1))])
71 |                         (+ (vector-ref (tensor-data t1) i)
72 |                            (vector-ref (tensor-data t2) i))))]
73 |       ; Broadcast a 1D bias across the rows of a 2D tensor (needed by dense-forward)
74 |       [(and (= (length shape1) 2) (= (length shape2) 1) (= (cadr shape1) (car shape2)))
75 |        (let ([cols (cadr shape1)])
76 |          (tensor shape1 (for/vector ([i (vector-length (tensor-data t1))])
77 |                           (+ (vector-ref (tensor-data t1) i)
78 |                              (vector-ref (tensor-data t2) (remainder i cols))))))]
79 |       [else
80 |        (error "Tensors must have the same shape (or be row-broadcastable) for addition")])))
81 | 
82 | (define (tensor-subtract t1 t2)
83 |   (let ([shape1 (tensor-shape t1)]
84 |         [shape2 (tensor-shape t2)])
85 |     (displayln (string-append "Subtracting tensors with shapes: " (format "~a" shape1) " and " (format "~a" shape2)))
86 |     (cond
87 |       [(equal? shape1 shape2)
88 |        (tensor shape1 (for/vector ([i (vector-length (tensor-data t1))])
89 |                         (- (vector-ref (tensor-data t1) i)
90 |                            (vector-ref (tensor-data t2) i))))]
91 |       [else
92 |        (error "Tensors must have the same shape for subtraction")])))
93 | 
94 | (define (tensor-multiply t1 t2)
95 |   (let ([shape1 (tensor-shape t1)]
96 |         [shape2 (tensor-shape t2)])
97 |     (displayln (string-append "Multiplying tensors with shapes: " (format "~a" shape1) " and " (format "~a" shape2)))
98 |     (cond
99 |       [(and (= (length shape1) 2) (= (length shape2) 2) (= (cadr shape1) (car shape2)))
100 |        (let* ([rows-a (car shape1)]
101 |               [cols-a (cadr shape1)]
102 |               [cols-b (cadr shape2)]
103 |               [a (tensor-data t1)]
104 |               [b (tensor-data t2)]
105 |               ; list->cvector allocates real C buffers; _cvector alone is only a ctype
106 |               [cvec-a (list->cvector (vector->list a) _double)]
107 |               [cvec-b (list->cvector (vector->list b) _double)]
108 |               [cvec-c (list->cvector (make-list (* rows-a cols-b) 0.0) _double)])
109 |          (matrix_multiply (cvector-ptr cvec-a) (cvector-ptr cvec-b) (cvector-ptr cvec-c) rows-a cols-a cols-b)
110 |          (tensor (list rows-a cols-b) (list->vector (cvector->list cvec-c))))]
111 |       [else
112 |        (error "Tensors must have compatible shapes for multiplication")])))
113 | ; Activation function (ReLU); 0.0 keeps the data flonum for the FFI matmul path
114 | (define (relu x)
115 |   (tensor (tensor-shape x) (for/vector ([v (tensor-data x)]) (max 0.0 v))))
116 | 
117 | ; Derivative of ReLU
118 | (define (relu-derivative x)
119 |   (tensor (tensor-shape x) (for/vector ([v (tensor-data x)]) (if (> v 0) 1.0 0.0))))
120 | 
121 | ; Dense layer forward propagation
122 | (define (dense-forward input weights biases)
123 |   (let* ([z (tensor-add (tensor-multiply input weights) biases)]
124 |          [activation-output (relu z)])
125 |     activation-output))
126 | 
127 | ; Loss function: mean squared error over all elements (elementwise, not a matrix product)
128 | (define (mean-squared-error y-true y-pred)
129 |   (let* ([diff (tensor-subtract y-true y-pred)]
130 |          [data (tensor-data diff)]
131 |          [sum (for/sum ([v (in-vector data)]) (* v v))])
132 |     (/ sum (vector-length (tensor-data y-true)))))
133 | 
134 | ; Backpropagation for dense layer
135 | (define (dense-backward input weights biases output grad-output learning-rate)
136 |   (let* ([grad-activation (relu-derivative output)]
137 |          ; dL/dz = dL/da * relu'(z), elementwise
138 |          [grad-z (tensor (tensor-shape grad-output)
139 |                          (vector-map * (tensor-data grad-output) (tensor-data grad-activation)))]
140 |          ; bias gradient: sum grad-z over the batch dimension (column sums)
141 |          [grad-biases (let ([rows (car (tensor-shape grad-z))]
142 |                             [cols (cadr (tensor-shape grad-z))]
143 |                             [gz (tensor-data grad-z)])
144 |                         (tensor (tensor-shape biases)
145 |                                 (for/vector ([j (in-range cols)])
146 |                                   (for/sum ([i (in-range rows)])
147 |                                     (vector-ref gz (+ (* i cols) j))))))]
148 |          [grad-weights (tensor-multiply (transpose input) grad-z)]
149 |          [grad-input (tensor-multiply grad-z (transpose weights))])
150 |     (let* ([new-weights (tensor-subtract weights (scalar-multiply grad-weights learning-rate))]
151 |            [new-biases (tensor-subtract biases (scalar-multiply grad-biases learning-rate))])
152 |       (values new-weights new-biases grad-input))))
153 | 
154 | (define (transpose t)
155 |   (let* ([shape (tensor-shape t)]
156 |          [rows (car shape)]
157 |          [cols (cadr shape)]
158 |          [data (tensor-data t)]
159 |          [new-data (make-vector (apply * (reverse shape)) 0)])
160 |     (for* ([i rows]
161 |            [j cols])
162 |       (vector-set! new-data (+ (* j rows) i) (vector-ref data (+ (* i cols) j))))
163 |     (tensor (reverse shape) new-data)))
164 | 
165 | (define (scalar-multiply t scalar)
166 |   (tensor (tensor-shape t) (for/vector ([v (tensor-data t)]) (* v scalar))))
167 | 
168 | ; Example of initializing a simple FNN
169 | (define (initialize-fnn batch-size input-dim output-dim)
170 |   (let* ([input-data (make-list (* batch-size input-dim) 0.0)] ; flonums, for the FFI path
171 |          [input-tensor (create-tensor (list batch-size input-dim) input-data)]
172 |          [weight-shape (list input-dim output-dim)]
173 |          [bias-shape (list output-dim)]
174 |          [weights (random-tensor weight-shape 1.0)]
175 |          [biases (random-tensor bias-shape 1.0)])
176 |     (values input-tensor weights biases)))
177 | 
178 | ; Dot product (1D tensors; data are vectors, so use vector operations)
179 | (define (dot-product t1 t2)
180 |   (let ([data1 (tensor-data t1)]
181 |         [data2 (tensor-data t2)])
182 |     (if (not (= (vector-length data1) (vector-length data2)))
183 |         (error "Tensors must have the same length for dot product")
184 |         (for/sum ([a (in-vector data1)] [b (in-vector data2)]) (* a b)))))
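;; ---------------------------------------------------------------------------
;; Added usage sketch (not part of the original tensor2.rkt): a hedged,
;; end-to-end smoke test of the layer helpers above. It assumes
;; matrix_multiplication.so has already been built (see compile_extensions.sh);
;; the shapes and the random "target" are illustrative only.
(module+ main
  (define-values (input weights biases) (initialize-fnn 2 3 4))
  ;; Forward pass: (2x3 input) x (3x4 weights) + broadcast bias, then ReLU
  (define out (dense-forward input weights biases))
  (print-tensor out)
  ;; Compare against a random target just to exercise the loss
  (displayln (mean-squared-error (random-tensor '(2 4) 1.0) out)))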
--------------------------------------------------------------------------------
/tensor_device.rkt:
--------------------------------------------------------------------------------
1 | #lang racket
2 | 
3 | (require "tensor.rkt"
4 |          "device.rkt"
5 |          "hardware_detection.rkt"
6 |          ffi/unsafe
7 |          ffi/vector)
8 | 
9 | ;; Provide extended tensor operations with device support
10 | (provide
11 |  ;; Core operations
12 |  dt:create      ; Create a tensor on a specific device
13 |  dt:to          ; Move a tensor to a specific device
14 |  dt:random      ; Create a random tensor on a device
15 |  dt:print       ; Print a tensor
16 |  dt:reshape     ; Reshape a tensor
17 |  dt:device      ; Get the device of a tensor
18 | 
19 |  ;; Math operations
20 |  dt:add         ; Add tensors
21 |  dt:sub         ; Subtract tensors
22 |  dt:mul         ; Matrix multiplication or elementwise
23 |  dt:dot         ; Dot product
24 |  dt:scale       ; Scalar multiplication
25 |  dt:transpose   ; Transpose a tensor
26 | 
27 |  ;; Accessors (same as tensor.rkt)
28 |  dt:shape       ; Get shape
29 |  dt:data        ; Get data
30 |  dt:ref         ; Get value at index
31 |  )
32 | 
33 | ;; Define device-aware tensor structure as a separate struct
34 | ;; instead of extending tensor, which causes issues
35 | (struct dt:tensor (shape data device) #:transparent)
36 | 
37 | ;; Create tensor on device
38 | (define (dt:create shape data [dev (current-device)])
39 |   (let ([t (t:create shape data)])
40 |     (dt:tensor (t:shape t) (t:data t) dev)))
41 | 
42 | ;; Get device of tensor
43 | (define (dt:device t)
44 |   (dt:tensor-device t))
45 | 
46 | ;; Move tensor to device
47 | (define (dt:to t dev)
48 |   (cond
49 |     [(dt:tensor? t)
50 |      (let ([current-dev (dt:tensor-device t)])
51 |        (cond
52 |          ;; Same device - no-op
53 |          [(equal? current-dev dev) t]
54 | 
55 |          ;; Move to GPU from CPU
56 |          [(and (cpu-device? current-dev) (gpu-device? dev))
57 |           (printf "Moving tensor to GPU device~n")
58 |           (dt:tensor (dt:tensor-shape t) (dt:tensor-data t) dev)]
59 | 
60 |          ;; Move to CPU from GPU
61 |          [(and (gpu-device? current-dev) (cpu-device? dev))
62 |           (printf "Moving tensor to CPU device~n")
63 |           (dt:tensor (dt:tensor-shape t) (dt:tensor-data t) dev)]
64 | 
65 |          [else t]))]
66 | 
67 |     ;; Convert regular tensor to device tensor
68 |     [(tensor?
t) 69 | (dt:tensor (t:shape t) (t:data t) dev)] 70 | 71 | [else 72 | (error "dt:to: expected a tensor, got ~a" t)])) 73 | 74 | ;; Create random tensor on device 75 | (define (dt:random shape range [dev (current-device)]) 76 | (let ([t (t:random shape range)]) 77 | (dt:tensor (t:shape t) (t:data t) dev))) 78 | 79 | ;; Print tensor 80 | (define (dt:print t) 81 | (when (dt:tensor? t) 82 | ;; Create a regular tensor to print 83 | (let ([regular-tensor (t:create (dt:tensor-shape t) (dt:tensor-data t))]) 84 | (t:print regular-tensor) 85 | (printf "Device: ~a~n" (get-device-type (dt:tensor-device t)))))) 86 | 87 | ;; Reshape tensor 88 | (define (dt:reshape t new-shape) 89 | (when (dt:tensor? t) 90 | (let ([regular-tensor (t:create (dt:tensor-shape t) (dt:tensor-data t))] 91 | [dev (dt:tensor-device t)]) 92 | (let ([reshaped (t:reshape regular-tensor new-shape)]) 93 | (dt:tensor (t:shape reshaped) (t:data reshaped) dev))))) 94 | 95 | ;; Add tensors 96 | (define (dt:add t1 t2) 97 | (cond 98 | [(and (dt:tensor? t1) (dt:tensor? t2)) 99 | (let ([dev1 (dt:tensor-device t1)] 100 | [dev2 (dt:tensor-device t2)]) 101 | (cond 102 | ;; Both on CPU 103 | [(and (cpu-device? dev1) (cpu-device? dev2)) 104 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 105 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 106 | (let ([sum (t:add regular-t1 regular-t2)]) 107 | (dt:tensor (t:shape sum) (t:data sum) dev1)))] 108 | 109 | ;; Different devices - move to same device 110 | [(not (equal? dev1 dev2)) 111 | (let ([target-dev (if (gpu-device? dev1) dev1 dev2)]) 112 | (dt:add (dt:to t1 target-dev) (dt:to t2 target-dev)))] 113 | 114 | ;; Both on GPU - use OpenCL implementation if available 115 | [(and (gpu-device? dev1) (gpu-device? dev2) (has-opencl?)) 116 | (printf "Performing GPU tensor addition~n") 117 | ;; For now, fall back to CPU implementation 118 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 119 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 120 | (let ([sum (t:add regular-t1 regular-t2)]) 121 | (dt:tensor (t:shape sum) (t:data sum) dev1)))] 122 | 123 | [else 124 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 125 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 126 | (let ([sum (t:add regular-t1 regular-t2)]) 127 | (dt:tensor (t:shape sum) (t:data sum) dev1)))]))] 128 | 129 | ;; Handle regular tensors 130 | [else 131 | (let ([dev (current-device)]) 132 | (let ([dt1 (if (dt:tensor? t1) 133 | t1 134 | (dt:tensor (t:shape t1) (t:data t1) dev))] 135 | [dt2 (if (dt:tensor? t2) 136 | t2 137 | (dt:tensor (t:shape t2) (t:data t2) dev))]) 138 | (dt:add dt1 dt2)))])) 139 | 140 | ;; Subtract tensors 141 | (define (dt:sub t1 t2) 142 | (cond 143 | [(and (dt:tensor? t1) (dt:tensor? t2)) 144 | (let ([dev1 (dt:tensor-device t1)] 145 | [dev2 (dt:tensor-device t2)]) 146 | (cond 147 | ;; Both on CPU 148 | [(and (cpu-device? dev1) (cpu-device? dev2)) 149 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 150 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 151 | (let ([diff (t:sub regular-t1 regular-t2)]) 152 | (dt:tensor (t:shape diff) (t:data diff) dev1)))] 153 | 154 | ;; Different devices - move to same device 155 | [(not (equal? dev1 dev2)) 156 | (let ([target-dev (if (gpu-device? dev1) dev1 dev2)]) 157 | (dt:sub (dt:to t1 target-dev) (dt:to t2 target-dev)))] 158 | 159 | ;; Both on GPU - use OpenCL implementation if available 160 | [(and (gpu-device? 
dev1) (gpu-device? dev2) (has-opencl?)) 161 | (printf "Performing GPU tensor subtraction~n") 162 | ;; For now, fall back to CPU implementation 163 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 164 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 165 | (let ([diff (t:sub regular-t1 regular-t2)]) 166 | (dt:tensor (t:shape diff) (t:data diff) dev1)))] 167 | 168 | [else 169 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 170 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 171 | (let ([diff (t:sub regular-t1 regular-t2)]) 172 | (dt:tensor (t:shape diff) (t:data diff) dev1)))]))] 173 | 174 | ;; Handle regular tensors 175 | [else 176 | (let ([dev (current-device)]) 177 | (let ([dt1 (if (dt:tensor? t1) 178 | t1 179 | (dt:tensor (t:shape t1) (t:data t1) dev))] 180 | [dt2 (if (dt:tensor? t2) 181 | t2 182 | (dt:tensor (t:shape t2) (t:data t2) dev))]) 183 | (dt:sub dt1 dt2)))])) 184 | 185 | ;; Multiply tensors 186 | (define (dt:mul t1 t2) 187 | (cond 188 | [(and (dt:tensor? t1) (dt:tensor? t2)) 189 | (let ([dev1 (dt:tensor-device t1)] 190 | [dev2 (dt:tensor-device t2)]) 191 | (cond 192 | ;; Both on CPU 193 | [(and (cpu-device? dev1) (cpu-device? dev2)) 194 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 195 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 196 | (let ([prod (t:mul regular-t1 regular-t2)]) 197 | (dt:tensor (t:shape prod) (t:data prod) dev1)))] 198 | 199 | ;; Different devices - move to same device 200 | [(not (equal? dev1 dev2)) 201 | (let ([target-dev (if (gpu-device? dev1) dev1 dev2)]) 202 | (dt:mul (dt:to t1 target-dev) (dt:to t2 target-dev)))] 203 | 204 | ;; Both on GPU - use OpenCL implementation if available 205 | [(and (gpu-device? dev1) (gpu-device? dev2) (has-opencl?)) 206 | (printf "Performing GPU tensor multiplication~n") 207 | ;; TODO: Implement GPU matrix multiplication using OpenCL 208 | ;; For now, fall back to CPU implementation 209 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 210 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 211 | (let ([prod (t:mul regular-t1 regular-t2)]) 212 | (dt:tensor (t:shape prod) (t:data prod) dev1)))] 213 | 214 | [else 215 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 216 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 217 | (let ([prod (t:mul regular-t1 regular-t2)]) 218 | (dt:tensor (t:shape prod) (t:data prod) dev1)))]))] 219 | 220 | ;; Handle regular tensors 221 | [else 222 | (let ([dev (current-device)]) 223 | (let ([dt1 (if (dt:tensor? t1) 224 | t1 225 | (dt:tensor (t:shape t1) (t:data t1) dev))] 226 | [dt2 (if (dt:tensor? t2) 227 | t2 228 | (dt:tensor (t:shape t2) (t:data t2) dev))]) 229 | (dt:mul dt1 dt2)))])) 230 | 231 | ;; Scalar multiply 232 | (define (dt:scale t scalar) 233 | (if (dt:tensor? t) 234 | (let ([regular-tensor (t:create (dt:tensor-shape t) (dt:tensor-data t))] 235 | [dev (dt:tensor-device t)]) 236 | (let ([scaled (t:scale regular-tensor scalar)]) 237 | (dt:tensor (t:shape scaled) (t:data scaled) dev))) 238 | (let ([scaled (t:scale t scalar)]) 239 | (dt:tensor (t:shape scaled) (t:data scaled) (current-device))))) 240 | 241 | ;; Transpose 242 | (define (dt:transpose t) 243 | (if (dt:tensor? 
t) 244 | (let ([regular-tensor (t:create (dt:tensor-shape t) (dt:tensor-data t))] 245 | [dev (dt:tensor-device t)]) 246 | (let ([transposed (t:transpose regular-tensor)]) 247 | (dt:tensor (t:shape transposed) (t:data transposed) dev))) 248 | (let ([transposed (t:transpose t)]) 249 | (dt:tensor (t:shape transposed) (t:data transposed) (current-device))))) 250 | 251 | ;; Dot product 252 | (define (dt:dot t1 t2) 253 | (cond 254 | [(and (dt:tensor? t1) (dt:tensor? t2)) 255 | (let ([dev1 (dt:tensor-device t1)] 256 | [dev2 (dt:tensor-device t2)]) 257 | (cond 258 | ;; Different devices - move to same device 259 | [(not (equal? dev1 dev2)) 260 | (let ([target-dev (if (gpu-device? dev1) dev1 dev2)]) 261 | (dt:dot (dt:to t1 target-dev) (dt:to t2 target-dev)))] 262 | 263 | ;; Both on GPU - use OpenCL implementation if available 264 | [(and (gpu-device? dev1) (gpu-device? dev2) (has-opencl?)) 265 | (printf "Performing GPU dot product~n") 266 | ;; For now, fall back to CPU implementation 267 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 268 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 269 | (t:dot regular-t1 regular-t2))] 270 | 271 | [else 272 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 273 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 274 | (t:dot regular-t1 regular-t2))]))] 275 | 276 | ;; Handle regular tensors 277 | [else 278 | (let ([dev (current-device)]) 279 | (let ([regular-t1 (if (dt:tensor? t1) 280 | (t:create (dt:tensor-shape t1) (dt:tensor-data t1)) 281 | t1)] 282 | [regular-t2 (if (dt:tensor? t2) 283 | (t:create (dt:tensor-shape t2) (dt:tensor-data t2)) 284 | t2)]) 285 | (t:dot regular-t1 regular-t2)))])) 286 | 287 | ;; Accessors 288 | (define (dt:shape t) 289 | (if (dt:tensor? t) 290 | (dt:tensor-shape t) 291 | (t:shape t))) 292 | 293 | (define (dt:data t) 294 | (if (dt:tensor? t) 295 | (dt:tensor-data t) 296 | (t:data t))) 297 | 298 | (define (dt:ref t i j) 299 | (if (dt:tensor? t) 300 | (vector-ref (dt:tensor-data t) (+ (* i (cadr (dt:tensor-shape t))) j)) 301 | (t:ref t i j))) -------------------------------------------------------------------------------- /tensor_optimized.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "ffi_ops.rkt") 4 | (require ffi/vector) 5 | 6 | (provide (struct-out tensor-opt) 7 | t-opt:create 8 | t-opt:random 9 | t-opt:reshape 10 | t-opt:print 11 | t-opt:add 12 | t-opt:add! 13 | t-opt:sub 14 | t-opt:sub! 15 | t-opt:mul 16 | t-opt:transpose 17 | t-opt:shape 18 | t-opt:data 19 | t-opt:ref 20 | t-opt:scale 21 | t-opt:scale!) 22 | 23 | ;; Optimized tensor structure 24 | ;; Uses a f64vector for better FFI compatibility and memory layout 25 | (struct tensor-opt (shape data) #:transparent) 26 | 27 | ;; Accessors 28 | (define (t-opt:shape t) 29 | (tensor-opt-shape t)) 30 | 31 | (define (t-opt:data t) 32 | (tensor-opt-data t)) 33 | 34 | ;; Create tensor from data 35 | (define (t-opt:create shape data) 36 | (let ((vec-data (cond 37 | [(f64vector? data) data] 38 | [(vector? 
data) (list->f64vector (vector->list data))] 39 | [else (list->f64vector data)]))) 40 | (cond 41 | [(= (apply * shape) (f64vector-length vec-data)) 42 | (tensor-opt shape vec-data)] 43 | [else 44 | (error "t-opt:create: Data does not match shape")]))) 45 | 46 | ;; Random tensor 47 | (define (t-opt:random shape range) 48 | (let* ((size (apply * shape)) 49 | (max-value (inexact->exact (floor (* range 10000))))) 50 | (tensor-opt shape 51 | (list->f64vector 52 | (for/list ([i size]) 53 | (/ (random max-value) 10000.0)))))) 54 | 55 | ;; Reshape tensor 56 | (define (t-opt:reshape t new-shape) 57 | (let ([original-size (apply * (tensor-opt-shape t))] 58 | [new-size (apply * new-shape)]) 59 | (if (= original-size new-size) 60 | (tensor-opt new-shape (tensor-opt-data t)) 61 | (error "t-opt:reshape: New shape must have the same number of elements")))) 62 | 63 | ;; Print tensor 64 | (define (t-opt:print t) 65 | (let ([shape (tensor-opt-shape t)] 66 | [data (tensor-opt-data t)]) 67 | (cond 68 | [(= (length shape) 1) 69 | (display "[") 70 | (for ([i (in-range (car shape))]) 71 | (display (f64vector-ref data i)) 72 | (display " ")) 73 | (display "]") 74 | (newline)] 75 | [(= (length shape) 2) 76 | (for ([i (in-range (car shape))]) 77 | (display "[") 78 | (for ([j (in-range (cadr shape))]) 79 | (display (f64vector-ref data (+ (* i (cadr shape)) j))) 80 | (display " ")) 81 | (display "]") 82 | (newline))] 83 | [else (error "t-opt:print: Unsupported tensor shape")]))) 84 | 85 | ;; Add tensors - out of place 86 | (define (t-opt:add t1 t2) 87 | (let ([shape1 (tensor-opt-shape t1)] 88 | [shape2 (tensor-opt-shape t2)]) 89 | (cond 90 | [(equal? shape1 shape2) 91 | (let* ([size (apply * shape1)] 92 | [result (make-f64vector size 0.0)]) 93 | (c:tensor-add size 94 | (tensor-opt-data t1) 95 | (tensor-opt-data t2) 96 | result) 97 | (tensor-opt shape1 result))] 98 | [else 99 | (error "t-opt:add: Tensors must have the same shape")]))) 100 | 101 | ;; Add tensors - in-place version (t1 += t2) 102 | (define (t-opt:add! t1 t2) 103 | (let ([shape1 (tensor-opt-shape t1)] 104 | [shape2 (tensor-opt-shape t2)]) 105 | (cond 106 | [(equal? shape1 shape2) 107 | (let ([size (apply * shape1)]) 108 | (c:tensor-add size 109 | (tensor-opt-data t1) 110 | (tensor-opt-data t2) 111 | (tensor-opt-data t1)) 112 | t1)] 113 | [else 114 | (error "t-opt:add!: Tensors must have the same shape")]))) 115 | 116 | ;; Subtract tensors - out of place 117 | (define (t-opt:sub t1 t2) 118 | (let ([shape1 (tensor-opt-shape t1)] 119 | [shape2 (tensor-opt-shape t2)]) 120 | (cond 121 | [(equal? shape1 shape2) 122 | (let* ([size (apply * shape1)] 123 | [result (make-f64vector size 0.0)]) 124 | (c:tensor-sub size 125 | (tensor-opt-data t1) 126 | (tensor-opt-data t2) 127 | result) 128 | (tensor-opt shape1 result))] 129 | [else 130 | (error "t-opt:sub: Tensors must have the same shape")]))) 131 | 132 | ;; Subtract tensors - in-place version (t1 -= t2) 133 | (define (t-opt:sub! t1 t2) 134 | (let ([shape1 (tensor-opt-shape t1)] 135 | [shape2 (tensor-opt-shape t2)]) 136 | (cond 137 | [(equal? 
shape1 shape2) 138 | (let ([size (apply * shape1)]) 139 | (c:tensor-sub size 140 | (tensor-opt-data t1) 141 | (tensor-opt-data t2) 142 | (tensor-opt-data t1)) 143 | t1)] 144 | [else 145 | (error "t-opt:sub!: Tensors must have the same shape")]))) 146 | 147 | ;; Scale tensor - out of place 148 | (define (t-opt:scale t scalar) 149 | (let* ([shape (tensor-opt-shape t)] 150 | [size (apply * shape)] 151 | [result (make-f64vector size 0.0)]) 152 | (c:tensor-scale size 153 | (tensor-opt-data t) 154 | scalar 155 | result) 156 | (tensor-opt shape result))) 157 | 158 | ;; Scale tensor - in-place version (t *= scalar) 159 | (define (t-opt:scale! t scalar) 160 | (let* ([shape (tensor-opt-shape t)] 161 | [size (apply * shape)]) 162 | (c:tensor-scale size 163 | (tensor-opt-data t) 164 | scalar 165 | (tensor-opt-data t)) 166 | t)) 167 | 168 | ;; Matrix multiplication 169 | (define (t-opt:mul t1 t2) 170 | (let ([shape1 (tensor-opt-shape t1)] 171 | [shape2 (tensor-opt-shape t2)]) 172 | (cond 173 | ;; Matrix multiplication: (A: MxN) * (B: NxP) -> (C: MxP) 174 | [(and (= (length shape1) 2) (= (length shape2) 2) (= (cadr shape1) (car shape2))) 175 | (let* ([rows-a (car shape1)] 176 | [cols-a (cadr shape1)] 177 | [cols-b (cadr shape2)] 178 | [result (make-f64vector (* rows-a cols-b) 0.0)]) 179 | (c:matrix-multiply rows-a cols-a cols-b 180 | (tensor-opt-data t1) 181 | (tensor-opt-data t2) 182 | result) 183 | (tensor-opt (list rows-a cols-b) result))] 184 | 185 | ;; Elementwise multiplication if shapes match 186 | [(equal? shape1 shape2) 187 | (let* ([size (apply * shape1)] 188 | [result (make-f64vector size 0.0)]) 189 | (c:tensor-mul-elementwise size 190 | (tensor-opt-data t1) 191 | (tensor-opt-data t2) 192 | result) 193 | (tensor-opt shape1 result))] 194 | 195 | [else 196 | (error "t-opt:mul: Tensors must have compatible shapes")]))) 197 | 198 | ;; Reference element at (i, j) 199 | (define (t-opt:ref t i j) 200 | (f64vector-ref (tensor-opt-data t) (+ (* i (cadr (tensor-opt-shape t))) j))) 201 | 202 | ;; Transpose a matrix (2D only) 203 | (define (t-opt:transpose t) 204 | (let* ([shape (tensor-opt-shape t)] 205 | [rows (car shape)] 206 | [cols (cadr shape)] 207 | [data (tensor-opt-data t)] 208 | [new-data (make-f64vector (apply * (reverse shape)) 0.0)]) 209 | (for* ([i rows] 210 | [j cols]) 211 | (f64vector-set! 
new-data (+ (* j rows) i) (f64vector-ref data (+ (* i cols) j))))
212 |     (tensor-opt (reverse shape) new-data)))
--------------------------------------------------------------------------------
/test.rkt:
--------------------------------------------------------------------------------
1 | #lang racket
2 | 
3 | (printf "Starting RacoGrad test script...~n~n")
4 | 
5 | ;; Test MNIST
6 | (printf "Testing MNIST implementation (abbreviated run for quick testing)...~n")
7 | (printf "=============================================================~n")
8 | 
9 | ;; Set MNIST to run just a couple of epochs
10 | (parameterize ([current-command-line-arguments (vector "-m")])
11 |   (dynamic-require "mnist.rkt" #f))
12 | 
13 | ;; Wait a bit
14 | (sleep 2)
15 | 
16 | ;; Test CNN
17 | (printf "~n~nTesting CNN implementation (abbreviated run for quick testing)...~n")
18 | (printf "=================================================================~n")
19 | 
20 | ;; Run CNN with minimal epochs/examples for testing
21 | (parameterize ([current-command-line-arguments (vector "-m")])
22 |   (dynamic-require "CNN.rkt" #f))
23 | 
24 | (printf "~n~nTests completed!~n")
--------------------------------------------------------------------------------
/visualization.rkt:
--------------------------------------------------------------------------------
1 | #lang racket
2 | 
3 | (require plot
4 |          pict) ; vl-append and pict->bitmap come from pict
5 | 
6 | (provide plot-training-history)
7 | 
8 | ;; Function to plot training history
9 | ;; history: a list of entries in format (epoch loss accuracy val-accuracy)
10 | (define (plot-training-history history [filename "training_history.png"])
11 |   (let* ([epochs (map first history)]
12 |          [losses (map second history)]
13 |          [accuracies (map third history)]
14 |          [val-accuracies (map fourth history)])
15 | 
16 |     ;; Generate the loss plot (plot-pict returns a pict, which vl-append can combine)
17 |     (define loss-plot
18 |       (parameterize ([plot-x-label "Epoch"]
19 |                      [plot-y-label "Loss"]
20 |                      [plot-title "Training Loss"]
21 |                      [plot-font-size 12])
22 |         (plot-pict
23 |          (list
24 |           (points (map vector epochs losses)
25 |                   #:color 'blue
26 |                   #:sym 'circle
27 |                   #:size 6)
28 |           (lines (map vector epochs losses)
29 |                  #:color 'blue
30 |                  #:width 2
31 |                  #:style 'solid
32 |                  #:label "Training Loss"))
33 |          #:x-min 0
34 |          #:y-min 0
35 |          #:width 500
36 |          #:height 300)))
37 | 
38 |     ;; Generate the accuracy plot
39 |     (define accuracy-plot
40 |       (parameterize ([plot-x-label "Epoch"]
41 |                      [plot-y-label "Accuracy (%)"]
42 |                      [plot-title "Training and Validation Accuracy"]
43 |                      [plot-font-size 12])
44 |         (plot-pict
45 |          (list
46 |           (points (map vector epochs accuracies)
47 |                   #:color 'green
48 |                   #:sym 'circle
49 |                   #:size 6)
50 |           (lines (map vector epochs accuracies)
51 |                  #:color 'green
52 |                  #:width 2
53 |                  #:style 'solid
54 |                  #:label "Training Accuracy")
55 |           (points (map vector epochs val-accuracies)
56 |                   #:color 'red
57 |                   #:sym 'triangle
58 |                   #:size 6)
59 |           (lines (map vector epochs val-accuracies)
60 |                  #:color 'red
61 |                  #:width 2
62 |                  #:style 'long-dash
63 |                  #:label "Validation Accuracy"))
64 |          #:x-min 0
65 |          #:y-min 0
66 |          #:y-max 100
67 |          #:width 500
68 |          #:height 300)))
69 | 
70 |     ;; Create a combined plot
71 |     (define combined-plot
72 |       (vl-append 10 loss-plot accuracy-plot))
73 | 
74 |     ;; Save the combined pict as a PNG (plot itself provides no save-plot procedure)
75 |     (send (pict->bitmap combined-plot) save-file filename 'png)
76 | 
77 |     (printf "Training history plot saved to ~a~n" filename)))
78 | 
79 | ;; Example usage
80 | (module+ main
81 |   ;; Generate sample history data
82 |   (define sample-history
83 |     (for/list ([epoch (in-range 10)])
84 |       (let* ([loss (- 1.0 (* 0.09 epoch))]
85 |              [acc (* 10 epoch)]
86 |              [val-acc (max 0 (- (* 10 epoch) 5))]) ; clamp: accuracy cannot be negative
87 |         (list epoch loss acc val-acc))))
88 | 
89 |   ;; Plot the sample data
90 |   (plot-training-history sample-history))
--------------------------------------------------------------------------------
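;; ---------------------------------------------------------------------------
;; Added appendix (not part of the repository dump): a minimal, hedged sketch
;; of calling the SIMD kernels from simd_ops.c directly over Racket's FFI.
;; It assumes the shared library was built as simd_ops.so, e.g. with
;;   gcc -O3 -mavx -shared -fPIC simd_ops.c -o simd_ops.so
;; (compile_extensions.sh presumably does something similar), and that
;; tensor_add_simd takes the same (size, a, b, result) arguments as
;; tensor_mul_elementwise_simd above. The repository's real bindings live in
;; ffi_ops.rkt; this standalone script only illustrates the calling convention.
#lang racket
(require ffi/unsafe ffi/unsafe/define ffi/vector)

(define-ffi-definer define-simd (ffi-lib "./simd_ops.so"))

;; void tensor_add_simd(int size, double *a, double *b, double *result)
(define-simd tensor_add_simd
  (_fun _int _f64vector _f64vector _f64vector -> _void))

;; void relu_forward_simd(int size, double *input, double *output)
(define-simd relu_forward_simd
  (_fun _int _f64vector _f64vector -> _void))

(define a   (list->f64vector '(1.0 -2.0 3.0 -4.0)))
(define b   (list->f64vector '(0.5 0.5 0.5 0.5)))
(define out (make-f64vector 4 0.0))

(tensor_add_simd 4 a b out)       ; out = a + b  -> (1.5 -1.5 3.5 -3.5)
(relu_forward_simd 4 out out)     ; out = max(out, 0), in place
(displayln (f64vector->list out)) ; => (1.5 0.0 3.5 0.0)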