├── .gitignore ├── CNN.rkt ├── CNN_IMPLEMENTATION.md ├── FNN.rkt ├── OPTIMIZATION_STRATEGY.md ├── RACOGRAD_IMPLEMENTATION_DETAILS.md ├── RACOGRAD_USER_GUIDE.md ├── README.md ├── autograd.rkt ├── benchmark.rkt ├── cnn_benchmark.rkt ├── cnn_ops.c ├── cnn_ops.rkt ├── cnn_test.rkt ├── compile_extensions.sh ├── cpu_benchmark.rkt ├── cuda_ops.c ├── debuggin_lib.rkt ├── demo.rkt ├── device.rkt ├── ffi_ops.rkt ├── gpu_acceleration.md ├── gpu_benchmark.rkt ├── hardware_detection.rkt ├── load-mnist.rkt ├── matrix_multiplication.c ├── matrix_opencl.c ├── mlx_cnn_ops.c ├── mlx_ops.c ├── mnist.rkt ├── mnist_benchmark.rkt ├── mnist_device.rkt ├── parallel_ops.c ├── regression-test.rkt ├── simd_ops.c ├── simple-test.rkt ├── tensor.rkt ├── tensor2.rkt ├── tensor_device.rkt ├── tensor_optimized.rkt ├── test.rkt └── visualization.rkt /.gitignore: -------------------------------------------------------------------------------- 1 | # Racket compiled files 2 | compiled/ 3 | *.zo 4 | *.dep 5 | *.rkt~ 6 | 7 | # Binary files 8 | *.dylib 9 | *.so 10 | *.dll 11 | 12 | # OS specific files 13 | .DS_Store 14 | 15 | # MNIST data files (large binaries) 16 | mnist-data/t10k-images.idx3-ubyte 17 | mnist-data/t10k-labels.idx1-ubyte 18 | mnist-data/train-images.idx3-ubyte 19 | mnist-data/train-labels.idx1-ubyte 20 | 21 | # IDE/Editor files 22 | *~ -------------------------------------------------------------------------------- /CNN_IMPLEMENTATION.md: -------------------------------------------------------------------------------- 1 | # Implementing a CNN with Optimized RacoGrad 2 | 3 | This guide shows how to implement a Convolutional Neural Network (CNN) using the optimized RacoGrad library components. 4 | 5 | ## Prerequisites 6 | 7 | - Compiled C extensions (`matrix_multiplication.so`, `simd_ops.so`, `parallel_ops.so`) 8 | - OpenCL setup (optional, for GPU acceleration) 9 | - Optimized tensor implementation (`tensor_optimized.rkt`) 10 | 11 | ## Step 1: Import Required Modules 12 | 13 | ```racket 14 | #lang racket 15 | 16 | (require "tensor_optimized.rkt" ; Optimized tensor operations 17 | "ffi_ops.rkt" ; C extension bindings 18 | "load-mnist.rkt") ; For MNIST dataset loading 19 | ``` 20 | 21 | ## Step 2: Implement Optimized Convolution Layer 22 | 23 | The convolution layer is critical for CNN performance. 
Here's an implementation using C extensions: 24 | 25 | ```racket 26 | ;; Forward pass through a convolution layer 27 | (define (conv2d-forward input-tensor filter-tensor stride padding) 28 | (let* ([input-shape (t-opt:shape input-tensor)] 29 | [filter-shape (t-opt:shape filter-tensor)] 30 | [batch-size (car input-shape)] 31 | [in-channels (cadr input-shape)] 32 | [in-height (caddr input-shape)] 33 | [in-width (cadddr input-shape)] 34 | [out-channels (car filter-shape)] 35 | [filter-height (caddr filter-shape)] 36 | [filter-width (cadddr filter-shape)] 37 | 38 | ;; Calculate output dimensions 39 | [out-height (add1 (quotient (- (+ in-height (* 2 padding)) filter-height) stride))] 40 | [out-width (add1 (quotient (- (+ in-width (* 2 padding)) filter-width) stride))] 41 | 42 | ;; Create output tensor 43 | [output-tensor (t-opt:create (list batch-size out-channels out-height out-width) 44 | (make-vector (* batch-size out-channels out-height out-width) 0.0))]) 45 | 46 | ;; Call the C extension for convolution 47 | (c:conv2d-forward 48 | batch-size in-channels in-height in-width 49 | out-channels filter-height filter-width 50 | stride padding 51 | (t-opt:data input-tensor) 52 | (t-opt:data filter-tensor) 53 | (t-opt:data output-tensor)) 54 | 55 | output-tensor)) 56 | ``` 57 | 58 | ## Step 3: Implement Pooling Layer 59 | 60 | Max pooling is another critical operation for CNNs: 61 | 62 | ```racket 63 | ;; Max pooling layer (2x2 pooling with stride 2) 64 | (define (max-pool-2x2 input-tensor) 65 | (let* ([input-shape (t-opt:shape input-tensor)] 66 | [batch-size (car input-shape)] 67 | [channels (cadr input-shape)] 68 | [in-height (caddr input-shape)] 69 | [in-width (cadddr input-shape)] 70 | 71 | ;; Output dimensions 72 | [out-height (quotient in-height 2)] 73 | [out-width (quotient in-width 2)] 74 | 75 | ;; Create output tensor 76 | [output-tensor (t-opt:create (list batch-size channels out-height out-width) 77 | (make-vector (* batch-size channels out-height out-width) 0.0))]) 78 | 79 | ;; Call the C extension for max pooling 80 | (c:max-pool-2x2 81 | batch-size channels in-height in-width 82 | (t-opt:data input-tensor) 83 | (t-opt:data output-tensor)) 84 | 85 | output-tensor)) 86 | ``` 87 | 88 | ## Step 4: Implement the Fully Connected Layer 89 | 90 | Using our optimized matrix multiplication: 91 | 92 | ```racket 93 | ;; Fully connected layer using optimized operations 94 | (define (fc-layer input-tensor weights-tensor bias-tensor activation-fn) 95 | (let* ([input-flat (if (> (length (t-opt:shape input-tensor)) 2) 96 | ;; Flatten input if it's coming from conv layer 97 | (flatten-tensor input-tensor) 98 | input-tensor)] 99 | [z (t-opt:add (t-opt:mul input-flat weights-tensor) bias-tensor)]) 100 | (activation-fn z))) 101 | ``` 102 | 103 | ## Step 5: Activation Functions 104 | 105 | ReLU is the most common activation in CNNs: 106 | 107 | ```racket 108 | ;; ReLU activation using C extension 109 | (define (relu-opt tensor) 110 | (let* ([shape (t-opt:shape tensor)] 111 | [size (apply * shape)] 112 | [result (make-f64vector size 0.0)]) 113 | (c:relu-forward size (t-opt:data tensor) result) 114 | (t-opt:create shape result))) 115 | ``` 116 | 117 | ## Step 6: Building the Complete CNN 118 | 119 | Put everything together to build a CNN for MNIST: 120 | 121 | ```racket 122 | ;; Define model parameters 123 | (define batch-size 64) 124 | (define learning-rate 0.01) 125 | 126 | ;; Create model (LeNet-5 architecture) 127 | (define (create-lenet) 128 | (let ([conv1-filters (t-opt:random (list 6 1 5 5) 0.1)] ; 6 
5x5 filters 129 | [conv1-bias (t-opt:random (list 6) 0.1)] 130 | 131 | [conv2-filters (t-opt:random (list 16 6 5 5) 0.1)] ; 16 5x5 filters 132 | [conv2-bias (t-opt:random (list 16) 0.1)] 133 | 134 | [fc1-weights (t-opt:random (list 400 120) 0.1)] ; 400 -> 120 135 | [fc1-bias (t-opt:random (list 120) 0.1)] 136 | 137 | [fc2-weights (t-opt:random (list 120 84) 0.1)] ; 120 -> 84 138 | [fc2-bias (t-opt:random (list 84) 0.1)] 139 | 140 | [fc3-weights (t-opt:random (list 84 10) 0.1)] ; 84 -> 10 141 | [fc3-bias (t-opt:random (list 10) 0.1)]) 142 | 143 | (values conv1-filters conv1-bias 144 | conv2-filters conv2-bias 145 | fc1-weights fc1-bias 146 | fc2-weights fc2-bias 147 | fc3-weights fc3-bias))) 148 | 149 | ;; Forward pass through the network 150 | (define (forward-pass x 151 | conv1-filters conv1-bias 152 | conv2-filters conv2-bias 153 | fc1-weights fc1-bias 154 | fc2-weights fc2-bias 155 | fc3-weights fc3-bias) 156 | 157 | ;; First conv layer + relu + pool 158 | (let* ([conv1 (conv2d-forward x conv1-filters 1 2)] 159 | [relu1 (relu-opt conv1)] 160 | [pool1 (max-pool-2x2 relu1)] 161 | 162 | ;; Second conv layer + relu + pool 163 | [conv2 (conv2d-forward pool1 conv2-filters 1 0)] 164 | [relu2 (relu-opt conv2)] 165 | [pool2 (max-pool-2x2 relu2)] 166 | 167 | ;; Flatten and fully connected layers 168 | [flatten (flatten-tensor pool2)] 169 | 170 | [fc1 (fc-layer flatten fc1-weights fc1-bias relu-opt)] 171 | [fc2 (fc-layer fc1 fc2-weights fc2-bias relu-opt)] 172 | [fc3 (fc-layer fc2 fc3-weights fc3-bias identity)]) 173 | 174 | fc3)) 175 | ``` 176 | 177 | ## Step 7: Using GPU Acceleration (Optional) 178 | 179 | If OpenCL is configured: 180 | 181 | ```racket 182 | ;; Check if GPU is available 183 | (define gpu-available? (c:check-opencl-available)) 184 | 185 | ;; Select appropriate implementation based on hardware 186 | (define conv2d-impl 187 | (if gpu-available? 188 | conv2d-forward-gpu ; Use GPU implementation 189 | conv2d-forward)) ; Use CPU implementation 190 | ``` 191 | 192 | ## Step 8: Training Loop 193 | 194 | An optimized training loop: 195 | 196 | ```racket 197 | (define (train-epoch! dataset 198 | conv1-filters conv1-bias 199 | conv2-filters conv2-bias 200 | fc1-weights fc1-bias 201 | fc2-weights fc2-bias 202 | fc3-weights fc3-bias) 203 | 204 | ;; Process mini-batches in parallel when possible 205 | (for ([batch (in-dataset-batches dataset batch-size)]) 206 | (let-values ([(inputs labels) (batch->tensors batch)]) 207 | ;; Forward pass 208 | (define outputs 209 | (forward-pass inputs 210 | conv1-filters conv1-bias 211 | conv2-filters conv2-bias 212 | fc1-weights fc1-bias 213 | fc2-weights fc2-bias 214 | fc3-weights fc3-bias)) 215 | 216 | ;; Compute loss (cross-entropy) 217 | (define loss (cross-entropy-loss outputs labels)) 218 | 219 | ;; Backward pass and update weights 220 | ;; ... (omitted for brevity) 221 | ))) 222 | ``` 223 | 224 | ## Performance Considerations 225 | 226 | 1. **Batch Processing**: Use the parallel C extensions to process batches 227 | 2. **Memory Reuse**: Use in-place operations when possible 228 | 3. 
**GPU Acceleration**: Identify operations that benefit most from GPU 229 | 230 | ## Example Usage 231 | 232 | ```racket 233 | ;; Load MNIST data 234 | (define mnist-data (load-mnist)) 235 | 236 | ;; Create model 237 | (let-values ([(conv1-filters conv1-bias 238 | conv2-filters conv2-bias 239 | fc1-weights fc1-bias 240 | fc2-weights fc2-bias 241 | fc3-weights fc3-bias) (create-lenet)]) 242 | 243 | ;; Train for multiple epochs 244 | (for ([epoch 10]) 245 | (printf "Epoch ~a\n" epoch) 246 | (train-epoch! mnist-data 247 | conv1-filters conv1-bias 248 | conv2-filters conv2-bias 249 | fc1-weights fc1-bias 250 | fc2-weights fc2-bias 251 | fc3-weights fc3-bias))) 252 | ``` -------------------------------------------------------------------------------- /FNN.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt" 4 | "autograd.rkt" 5 | "device.rkt") 6 | 7 | (provide (all-defined-out)) 8 | 9 | ;; Feedforward Neural Network for classification tasks 10 | 11 | ;; Initialize a multi-layer feedforward neural network 12 | ;; Example: (initialize-neural-network (list 784 256 128 10)) 13 | (define (initialize-neural-network layer-sizes [scale 0.01]) 14 | (let* ([num-layers (length layer-sizes)] 15 | [weights (for/list ([i (in-range (sub1 num-layers))]) 16 | (t:random (list (list-ref layer-sizes i) 17 | (list-ref layer-sizes (add1 i))) 18 | scale))] 19 | [biases (for/list ([i (in-range (sub1 num-layers))]) 20 | (t:random (list 1 (list-ref layer-sizes (add1 i))) 21 | scale))]) 22 | (values weights biases))) 23 | 24 | ;; Forward pass through the entire network 25 | (define (forward-pass input weights biases activation-fn [final-activation softmax]) 26 | (let* ([num-layers (add1 (length weights))] 27 | [layer-outputs '()] 28 | [layer-inputs '()] 29 | [current-activations input]) 30 | 31 | ;; Process each layer 32 | (for ([i (in-range (length weights))]) 33 | (let* ([w (list-ref weights i)] 34 | [b (list-ref biases i)] 35 | [is-final-layer? (= i (sub1 (length weights)))] 36 | [z (t:add (t:mul current-activations w) b)] 37 | [a (if is-final-layer? 38 | (final-activation z) 39 | (activation-fn z))]) 40 | (set! layer-inputs (cons z layer-inputs)) 41 | (set! layer-outputs (cons a layer-outputs)) 42 | (set! 
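;; this layer's activations become the next layer's input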
current-activations a))) 43 | 44 | ;; Return final output and intermediate values for backprop 45 | (values (car layer-outputs) ; Final output 46 | (reverse (cdr layer-outputs)) ; Hidden layer activations 47 | (reverse layer-inputs)))) ; Pre-activation values 48 | 49 | ;; Calculate cross-entropy loss for classification 50 | (define (cross-entropy-loss y-pred y-true) 51 | (let* ([m (car (t:shape y-true))] 52 | [epsilon 1e-15] 53 | [y-pred-clipped (t:create (t:shape y-pred) 54 | (for/list ([p (vector->list (t:data y-pred))]) 55 | (max (min p (- 1.0 epsilon)) epsilon)))] 56 | [loss-vec (for/list ([i (in-range (vector-length (t:data y-true)))]) 57 | (* (vector-ref (t:data y-true) i) 58 | (log (vector-ref (t:data y-pred-clipped) i))))]) 59 | (- (/ (apply + loss-vec) m)))) 60 | 61 | ;; Backpropagation and parameter update 62 | (define (backpropagation input y-true y-pred hidden-activations z-values weights biases learning-rate) 63 | (let* ([num-layers (length weights)] 64 | [batch-size (car (t:shape input))] 65 | 66 | ;; Calculate initial gradient (for output layer) 67 | [output-grad (t:sub y-pred y-true)] ; For softmax + cross-entropy, this is the gradient 68 | 69 | ;; Initialize lists to store gradients 70 | [weight-gradients '()] 71 | [bias-gradients '()] 72 | 73 | ;; Add input to start of activations for backprop calculations 74 | [all-activations (cons input hidden-activations)] 75 | 76 | ;; Backpropagate through the network 77 | [current-grad output-grad]) 78 | 79 | ;; Process each layer in reverse 80 | (for ([layer-idx (in-range (sub1 num-layers) -1 -1)]) 81 | (let* ([activation (list-ref all-activations layer-idx)] 82 | [z (list-ref z-values layer-idx)] 83 | [w (list-ref weights layer-idx)] 84 | 85 | ;; For hidden layers, apply activation derivative to gradients 86 | [gradient (if (= layer-idx (sub1 num-layers)) 87 | current-grad ; Output layer gradient 88 | ;; Hidden layer gradient 89 | (let ([grad-times-weights (t:mul current-grad (t:transpose (list-ref weights (add1 layer-idx))))] 90 | [activation-deriv (relu-derivative z)]) 91 | (t:mul grad-times-weights activation-deriv)))] 92 | 93 | ;; Calculate gradients for weights and biases 94 | [weight-grad (t:scale (t:mul (t:transpose activation) gradient) 95 | (/ 1.0 batch-size))] 96 | [bias-grad (t:create (t:shape (list-ref biases layer-idx)) 97 | (for/list ([j (in-range (cadr (t:shape gradient)))]) 98 | (/ (for/sum ([i (in-range batch-size)]) 99 | (vector-ref (t:data gradient) (+ (* i (cadr (t:shape gradient))) j))) 100 | batch-size)))]) 101 | 102 | ;; Store gradients 103 | (set! weight-gradients (cons weight-grad weight-gradients)) 104 | (set! bias-gradients (cons bias-grad bias-gradients)) 105 | 106 | ;; Update current gradient for next layer 107 | (set! 
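;; hand the freshly computed layer gradient back to the next-earlier layer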
current-grad gradient)))
108 | 
109 |   ;; Update weights and biases using gradients
110 |   (let ([new-weights (map (lambda (w gradient)
111 |                             (t:sub w (t:scale gradient learning-rate)))
112 |                           weights weight-gradients)]
113 |         [new-biases (map (lambda (b gradient)
114 |                            (t:sub b (t:scale gradient learning-rate)))
115 |                          biases bias-gradients)])
116 | 
117 |     (values new-weights new-biases))))
118 | 
119 | ;; Softmax function
120 | (define (softmax z)
121 |   (let* ([shape (t:shape z)]
122 |          [data (t:data z)]
123 |          [max-vals (for/vector ([i (in-range (car shape))])
124 |                      (apply max (for/list ([j (in-range (cadr shape))])
125 |                                   (vector-ref data (+ (* i (cadr shape)) j)))))]
126 |          [exp-vals (for/vector ([i (in-range (vector-length data))])
127 |                      (exp (- (vector-ref data i)
128 |                              (vector-ref max-vals (quotient i (cadr shape))))))]
129 |          [sum-vals (for/vector ([i (in-range (car shape))])
130 |                      (for/sum ([j (in-range (cadr shape))])
131 |                        (vector-ref exp-vals (+ (* i (cadr shape)) j))))])
132 |     (t:create shape
133 |               (for/vector ([i (in-range (vector-length exp-vals))])
134 |                 (/ (vector-ref exp-vals i)
135 |                    (vector-ref sum-vals (quotient i (cadr shape))))))))
136 | 
137 | ;; Predict classes from model output
138 | (define (predict output-tensor)
139 |   (let* ([shape (t:shape output-tensor)]
140 |          [batch-size (car shape)]
141 |          [num-classes (cadr shape)]
142 |          [data (t:data output-tensor)]
143 |          [predictions (for/list ([b (in-range batch-size)])
144 |                         (for/fold ([max-idx 0]
145 |                                    [max-val (vector-ref data (* b num-classes))] #:result max-idx) ; return only the winning index
146 |                                   ([c (in-range 1 num-classes)])
147 |                           (let ([val (vector-ref data (+ (* b num-classes) c))])
148 |                             (if (> val max-val)
149 |                                 (values c val)
150 |                                 (values max-idx max-val)))))])
151 |     predictions))
152 | 
153 | ;; Calculate accuracy
154 | (define (accuracy predictions y-true)
155 |   (let* ([num-samples (length predictions)]
156 |          [true-labels (let ([shape (t:shape y-true)]
157 |                             [data (t:data y-true)])
158 |                         (for/list ([i (in-range (car shape))])
159 |                           (for/first ([j (in-range (cadr shape))]
160 |                                       #:when (= 1.0 (vector-ref data (+ (* i (cadr shape)) j))))
161 |                             j)))]
162 |          [correct-count (for/sum ([i (in-range num-samples)])
163 |                           (if (= (list-ref predictions i) (list-ref true-labels i))
164 |                               1
165 |                               0))])
166 |     (/ correct-count num-samples)))
167 | 
168 | ;; Create batch data
169 | (define (get-batch X y indices)
170 |   (let* ([batch-size (length indices)]
171 |          [X-features (cadr (t:shape X))]
172 |          [y-classes (cadr (t:shape y))]
173 | 
174 |          [X-batch-data (make-vector (* batch-size X-features) 0.0)]
175 |          [y-batch-data (make-vector (* batch-size y-classes) 0.0)])
176 | 
177 |     ;; Copy data for each example in the batch
178 |     (for ([i (in-range batch-size)]
179 |           [idx (in-list indices)])
180 |       ;; Copy features
181 |       (for ([j (in-range X-features)])
182 |         (vector-set! X-batch-data
183 |                      (+ (* i X-features) j)
184 |                      (vector-ref (t:data X) (+ (* idx X-features) j))))
185 | 
186 |       ;; Copy labels
187 |       (for ([j (in-range y-classes)])
188 |         (vector-set! 
y-batch-data 189 | (+ (* i y-classes) j) 190 | (vector-ref (t:data y) (+ (* idx y-classes) j))))) 191 | 192 | (values (t:create (list batch-size X-features) X-batch-data) 193 | (t:create (list batch-size y-classes) y-batch-data)))) 194 | 195 | ;; Simple training function for demonstration 196 | (define (train-fnn X-train y-train X-val y-val 197 | [hidden-layers (list 128 64)] 198 | [learning-rate 0.01] 199 | [epochs 10] 200 | [batch-size 32]) 201 | (printf "Initializing neural network...~n") 202 | 203 | ;; Determine network architecture 204 | (define input-size (cadr (t:shape X-train))) 205 | (define output-size (cadr (t:shape y-train))) 206 | (define layer-sizes (append (list input-size) hidden-layers (list output-size))) 207 | 208 | ;; Initialize network parameters 209 | (define-values (weights biases) 210 | (initialize-neural-network layer-sizes)) 211 | 212 | ;; Early stopping parameters 213 | (define patience 3) 214 | (define best-val-accuracy 0.0) 215 | (define wait 0) 216 | (define best-weights weights) 217 | (define best-biases biases) 218 | 219 | (printf "Neural network architecture: ~a~n" layer-sizes) 220 | (printf "Starting training for ~a epochs...~n" epochs) 221 | 222 | ;; Training loop 223 | (for/fold ([stop? #f]) 224 | ([epoch (in-range epochs)] #:break stop?) 225 | (printf "Epoch ~a/~a:~n" (add1 epoch) epochs) 226 | 227 | ;; Shuffle training data 228 | (define train-size (car (t:shape X-train))) 229 | (define indices (shuffle (range train-size))) 230 | (define num-batches (quotient train-size batch-size)) 231 | 232 | ;; Initialize tracking variables 233 | (define epoch-losses '()) 234 | 235 | ;; Process each batch 236 | (for ([batch (in-range num-batches)]) 237 | (let* ([start-idx (* batch batch-size)] 238 | [end-idx (min (+ start-idx batch-size) train-size)] 239 | [batch-indices (take (drop indices start-idx) (- end-idx start-idx))]) 240 | 241 | ;; Get batch data 242 | (define-values (X-batch y-batch) 243 | (get-batch X-train y-train batch-indices)) 244 | 245 | ;; Forward pass 246 | (define-values (y-pred hidden-activations z-values) 247 | (forward-pass X-batch weights biases relu)) 248 | 249 | ;; Calculate loss 250 | (define loss (cross-entropy-loss y-pred y-batch)) 251 | (set! epoch-losses (cons loss epoch-losses)) 252 | 253 | ;; Backpropagation and parameter update 254 | (define-values (new-weights new-biases) 255 | (backpropagation X-batch y-batch y-pred 256 | hidden-activations z-values 257 | weights biases learning-rate)) 258 | 259 | ;; Update parameters 260 | (set! weights new-weights) 261 | (set! biases new-biases) 262 | 263 | ;; Print progress 264 | (when (= (modulo batch 10) 0) 265 | (printf " Batch ~a/~a: Loss = ~a~n" 266 | batch 267 | num-batches 268 | (real->decimal-string loss 4))))) 269 | 270 | ;; Evaluate on validation set 271 | (define-values (val-pred _ __) 272 | (forward-pass X-val weights biases relu)) 273 | 274 | (define val-predictions (predict val-pred)) 275 | (define val-accuracy (* 100.0 (accuracy val-predictions y-val))) 276 | 277 | ;; Calculate average loss for epoch 278 | (define avg-loss 279 | (/ (apply + epoch-losses) (length epoch-losses))) 280 | 281 | (printf "Epoch Summary:~n") 282 | (printf " Average Loss: ~a~n" (real->decimal-string avg-loss 4)) 283 | (printf " Validation Accuracy: ~a%~n" (real->decimal-string val-accuracy 2)) 284 | 285 | ;; Early stopping check 286 | (if (> val-accuracy best-val-accuracy) 287 | (begin 288 | (set! best-val-accuracy val-accuracy) 289 | (set! best-weights weights) 290 | (set! best-biases biases) 291 | (set! 
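;; reset the early-stopping patience counter after an improvement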
wait 0) 292 | (printf " New best model saved!~n")) 293 | (begin 294 | (set! wait (add1 wait)) 295 | (printf " No improvement for ~a epochs.~n" wait))) 296 | 297 | ;; Check if we should stop 298 | (if (>= wait patience) 299 | (begin 300 | (printf "Early stopping triggered after ~a epochs without improvement.~n" 301 | patience) 302 | #t) ; Stop training 303 | #f)) ; Continue training 304 | 305 | ;; Restore best model 306 | (set! weights best-weights) 307 | (set! biases best-biases) 308 | 309 | (printf "Training Complete!~n") 310 | (printf "Best Validation Accuracy: ~a%~n" 311 | (real->decimal-string best-val-accuracy 2)) 312 | 313 | ;; Return the trained model 314 | (values weights biases)) 315 | 316 | ;; Example usage in a main module 317 | (module+ main 318 | (require "mnist.rkt") 319 | 320 | (printf "Loading MNIST data...~n") 321 | 322 | ;; Data loading functions are in mnist.rkt 323 | (define-values (X-train-full y-train-full) (load-mnist-data "train")) 324 | (define-values (X-test y-test) (load-mnist-data "test")) 325 | 326 | ;; Normalize and one-hot encode 327 | (define X-train-norm (normalize X-train-full)) 328 | (define X-test-norm (normalize X-test)) 329 | (define y-train-onehot (one-hot y-train-full 10)) 330 | (define y-test-onehot (one-hot y-test 10)) 331 | 332 | ;; Create validation split 333 | (define train-size (car (t:shape X-train-norm))) 334 | (define validation-split 0.1) 335 | (define val-size (inexact->exact (floor (* train-size validation-split)))) 336 | (define train-size-actual (- train-size val-size)) 337 | 338 | ;; Shuffle indices 339 | (define all-indices (shuffle (range train-size))) 340 | (define train-indices (take all-indices train-size-actual)) 341 | (define val-indices (drop all-indices train-size-actual)) 342 | 343 | ;; Create training and validation sets 344 | (define-values (X-train y-train) (get-batch X-train-norm y-train-onehot train-indices)) 345 | (define-values (X-val y-val) (get-batch X-train-norm y-train-onehot val-indices)) 346 | 347 | (printf "Training FNN on MNIST...~n") 348 | (printf "Train set: ~a examples~n" (car (t:shape X-train))) 349 | (printf "Validation set: ~a examples~n" (car (t:shape X-val))) 350 | (printf "Test set: ~a examples~n" (car (t:shape X-test-norm))) 351 | 352 | ;; Train the model 353 | (define-values (trained-weights trained-biases) 354 | (train-fnn X-train y-train X-val y-val 355 | (list 128) ; One hidden layer with 128 units 356 | 0.01 ; Learning rate 357 | 5 ; Epochs 358 | 64)) ; Batch size 359 | 360 | ;; Evaluate on test set 361 | (define-values (test-pred _ __) 362 | (forward-pass X-test-norm trained-weights trained-biases relu)) 363 | 364 | (define test-predictions (predict test-pred)) 365 | (define test-accuracy (* 100.0 (accuracy test-predictions y-test-onehot))) 366 | 367 | (printf "Final Test Accuracy: ~a%~n" 368 | (real->decimal-string test-accuracy 2))) -------------------------------------------------------------------------------- /OPTIMIZATION_STRATEGY.md: -------------------------------------------------------------------------------- 1 | # RacoGrad Optimization Strategy 2 | 3 | This document outlines the comprehensive plan for optimizing the RacoGrad deep learning library to achieve exponential speedups. 4 | 5 | ## Performance Optimization Techniques 6 | 7 | ### 1. 
C Extensions (Immediate Wins) 8 | 9 | Native C implementations provide significant speedups for core operations: 10 | 11 | - **Matrix Multiplication**: 10-100x faster than pure Racket 12 | - **Element-wise Operations**: 5-20x faster 13 | - **Activation Functions**: 3-10x faster 14 | 15 | **Implementation Files**: 16 | - `matrix_multiplication.c`: Core C implementations 17 | - `ffi_ops.rkt`: Racket FFI interface 18 | 19 | **Usage**: 20 | ```racket 21 | (require "ffi_ops.rkt") 22 | ;; Use C-accelerated matrix multiply 23 | (c:matrix-multiply rows-a cols-a cols-b input-vec weights-vec output-vec) 24 | ``` 25 | 26 | ### 2. SIMD Vectorization (CPU Optimization) 27 | 28 | SIMD instructions leverage CPU vector units for parallel data processing: 29 | 30 | - **AVX/SSE Instructions**: Process 4/2 doubles simultaneously 31 | - **Key Operations**: Matrix multiplication, element-wise ops, activations 32 | 33 | **Implementation Files**: 34 | - `simd_ops.c`: SIMD-optimized implementations 35 | - `simd_ffi.rkt`: Racket FFI interface for SIMD ops 36 | 37 | **Expected Gains**: 38 | - 2-4x speedup over basic C implementations 39 | - Automatic fallback to non-SIMD code when unavailable 40 | 41 | ### 3. Memory Optimization (Reduced Overhead) 42 | 43 | Better memory management reduces allocation overhead: 44 | 45 | - **Contiguous Memory**: Using `f64vector` instead of Racket vectors 46 | - **In-place Operations**: Modifying tensors directly 47 | - **Memory Layout**: Cache-friendly data arrangement 48 | 49 | **Implementation Files**: 50 | - `tensor_optimized.rkt`: Memory-optimized tensor implementation 51 | 52 | **Usage Example**: 53 | ```racket 54 | ;; Out-of-place (returns new tensor) 55 | (t-opt:add tensor1 tensor2) 56 | ;; In-place (modifies tensor1) 57 | (t-opt:add! tensor1 tensor2) 58 | ``` 59 | 60 | ### 4. Parallelization (Multi-core Utilization) 61 | 62 | Multi-threading for batch processing and large operations: 63 | 64 | - **Batch Processing**: Split mini-batches across threads 65 | - **Parallel Matrix Operations**: Divide work among threads 66 | 67 | **Implementation Files**: 68 | - `parallel_ops.c`: Parallel implementations using pthreads 69 | - `parallel_ffi.rkt`: Racket FFI interface 70 | 71 | **Expected Gains**: 72 | - Near-linear scaling with number of cores 73 | - 2-8x speedup on typical multi-core systems 74 | 75 | ### 5. GPU Acceleration (Massive Parallelism) 76 | 77 | Leveraging GPU for massively parallel computations: 78 | 79 | - **OpenCL**: Cross-platform GPU acceleration 80 | - **ArrayFire/ONNX**: High-level GPU libraries 81 | - **Vulkan Compute**: Modern GPU API option 82 | 83 | **Implementation Files**: 84 | - `gpu_acceleration.md`: Implementation guide 85 | - `ocl_kernels.cl`: OpenCL kernel examples 86 | 87 | ## Integration Strategy 88 | 89 | ### Phase 1: C Extensions & Memory Optimization 90 | 91 | 1. Replace most performance-critical Racket functions with C implementations 92 | 2. Implement memory-optimized tensor structure with `f64vector` 93 | 3. Add in-place operation variants 94 | 4. Benchmark and identify remaining bottlenecks 95 | 96 | ### Phase 2: SIMD & Parallelization 97 | 98 | 1. Add SIMD optimizations to C code 99 | 2. Implement multi-threading for batch processing 100 | 3. Parallelize large matrix operations 101 | 4. Update Racket interfaces to expose thread count parameters 102 | 103 | ### Phase 3: GPU Acceleration 104 | 105 | 1. Start with OpenCL for broadest compatibility 106 | 2. Implement key kernels for matrix multiplication and convolution 107 | 3. 
Add automatic fallback to CPU when GPU is unavailable 108 | 4. Create tensor-to-GPU memory transfer utilities 109 | 110 | ## Benchmarking Framework 111 | 112 | Implement a comprehensive benchmarking system: 113 | 114 | ```racket 115 | (benchmark-ops 116 | (list 117 | (cons "Racket Matrix Multiply" (lambda () (t:mul A B))) 118 | (cons "C Matrix Multiply" (lambda () (t-opt:mul A B))) 119 | (cons "SIMD Matrix Multiply" (lambda () (t-simd:mul A B))) 120 | (cons "GPU Matrix Multiply" (lambda () (t-gpu:mul A B))))) 121 | ``` 122 | 123 | ## Implementation Priorities 124 | 125 | 1. **Matrix Multiplication**: Most impactful operation in neural networks 126 | 2. **Convolution Operations**: Critical for CNN performance 127 | 3. **Element-wise Operations**: Used throughout backpropagation 128 | 4. **Activation Functions**: Applied to every neuron 129 | 130 | ## Usage Examples 131 | 132 | ### Original Racket Code: 133 | ```racket 134 | (define output (dense-forward input weights biases relu)) 135 | ``` 136 | 137 | ### Optimized Version: 138 | ```racket 139 | ;; CPU optimized with SIMD 140 | (define output (dense-forward-opt input weights biases relu-simd)) 141 | 142 | ;; GPU accelerated 143 | (define output (dense-forward-gpu input weights biases relu-gpu)) 144 | ``` 145 | 146 | ## Deployment Considerations 147 | 148 | - Auto-detect available optimizations (SIMD, number of cores, GPU) 149 | - Provide unified interface with automatic backend selection 150 | - Maintain compatibility with existing RacoGrad code -------------------------------------------------------------------------------- /RACOGRAD_USER_GUIDE.md: -------------------------------------------------------------------------------- 1 | # RacoGrad: A Deep Learning Framework in Racket 2 | 3 | ## Overview 4 | 5 | RacoGrad is a deep learning framework implemented in Racket that provides functionality for creating, training, and evaluating neural networks. It includes support for: 6 | 7 | - Tensor operations 8 | - Automatic differentiation (autograd) 9 | - Convolutional Neural Networks (CNNs) 10 | - Device-aware computation (CPU, GPU via OpenCL, and Apple Silicon via MLX) 11 | - MNIST dataset handling 12 | 13 | ## Getting Started 14 | 15 | ### Installation 16 | 17 | 1. Make sure you have Racket installed on your system 18 | 2. Clone this repository 19 | 3. 
Compile the C/C++ extensions by running: 20 | ```bash 21 | ./compile_extensions.sh 22 | ``` 23 | 24 | ### Basic Usage 25 | 26 | #### Training a Logistic Regression Model on MNIST 27 | 28 | ```racket 29 | #lang racket 30 | (require "mnist.rkt") 31 | 32 | ;; The mnist.rkt module will automatically load and train a logistic regression model 33 | ;; on the MNIST dataset when required 34 | ``` 35 | 36 | #### Training a CNN on MNIST 37 | 38 | ```racket 39 | #lang racket 40 | (require "CNN.rkt") 41 | 42 | ;; To train a CNN on the default device (MLX if available): 43 | (train-cnn) 44 | 45 | ;; To specify the device: 46 | (train-cnn 'cpu) ; Use CPU 47 | (train-cnn 'mlx) ; Use MLX (Apple Silicon) 48 | (train-cnn 'gpu) ; Use GPU (via OpenCL) 49 | 50 | ;; To specify training parameters: 51 | (train-cnn 'cpu 10 64) ; 10 epochs, batch size 64 52 | ``` 53 | 54 | ## Core Components 55 | 56 | ### Tensor Operations 57 | 58 | The tensor operations are provided by "tensor.rkt" and "tensor_device.rkt": 59 | 60 | ```racket 61 | (require "tensor.rkt") ; Basic tensor operations 62 | (require "tensor_device.rkt") ; Device-aware tensor operations 63 | 64 | ;; Create a tensor 65 | (define t (t:create '(2 3) #(1 2 3 4 5 6))) 66 | 67 | ;; Create a device-aware tensor 68 | (require "device.rkt") 69 | (define dt (dt:create '(2 3) #(1 2 3 4 5 6) (cpu))) 70 | 71 | ;; Perform operations 72 | (t:add t1 t2) ; Add two tensors 73 | (dt:add dt1 dt2) ; Add two device tensors 74 | 75 | ;; Move tensor between devices 76 | (dt:to dt (gpu)) ; Move tensor to GPU 77 | ``` 78 | 79 | ### Automatic Differentiation 80 | 81 | The autograd functionality is provided by "autograd.rkt": 82 | 83 | ```racket 84 | (require "autograd.rkt") 85 | 86 | ;; Activation functions 87 | (relu x) ; ReLU activation 88 | (sigmoid x) ; Sigmoid activation 89 | 90 | ;; Forward pass 91 | (dense-forward input weights biases activation-fn) 92 | 93 | ;; Backward pass 94 | (dense-backward input weights biases output grad-output 95 | activation-derivative learning-rate) 96 | ``` 97 | 98 | ### CNN Operations 99 | 100 | CNN operations are provided by "cnn_ops.rkt": 101 | 102 | ```racket 103 | (require "cnn_ops.rkt") 104 | 105 | ;; These are lower-level C function bindings 106 | ;; Higher-level API is provided in CNN.rkt 107 | 108 | ;; Load optimal implementation for current device 109 | (load-optimal-ops (current-device)) 110 | ``` 111 | 112 | ## Hardware Support 113 | 114 | RacoGrad can utilize different hardware backends: 115 | 116 | - **CPU**: Default, works on all systems 117 | - **GPU**: Via OpenCL for systems with compatible GPUs 118 | - **MLX**: Accelerated operations on Apple Silicon (M1/M2/M3) 119 | 120 | To select a device: 121 | 122 | ```racket 123 | (require "device.rkt") 124 | (require "hardware_detection.rkt") 125 | 126 | ;; Set current device 127 | (set-current-device! (cpu)) ; Use CPU 128 | (set-current-device! (gpu)) ; Use GPU if available 129 | (set-current-device! (mlx)) ; Use MLX if on Apple Silicon 130 | 131 | ;; Check hardware availability 132 | (gpu-available?) ; Returns #t if GPU is available 133 | (device-available? 
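;; backends are named by symbol; cnn_benchmark.rkt also probes 'cuda and 'opencl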
'mlx)   ; Returns #t if MLX is available
134 | ```
135 | 
136 | ## Example: Custom Model
137 | 
138 | Here's how to create a custom neural network:
139 | 
140 | ```racket
141 | (require "tensor.rkt")
142 | (require "autograd.rkt")
143 | 
144 | ;; Define model parameters
145 | (define input-size 784)
146 | (define hidden-size 128)
147 | (define output-size 10)
148 | 
149 | ;; Initialize weights and biases
150 | (define W1 (t:random (list input-size hidden-size) 0.01))
151 | (define b1 (t:random (list 1 hidden-size) 0.01))
152 | (define W2 (t:random (list hidden-size output-size) 0.01))
153 | (define b2 (t:random (list 1 output-size) 0.01))
154 | 
155 | ;; Forward pass function
156 | (define (my-model input)
157 |   (let* ([hidden (relu (dense-forward input W1 b1 identity))]
158 |          [output (dense-forward hidden W2 b2 identity)])
159 |     output))
160 | 
161 | ;; Predict function: index of the largest output value
162 | (define (predict output-tensor)
163 |   (let ([v (t:data output-tensor)]) (argmax (lambda (i) (vector-ref v i)) (range (vector-length v)))))
164 | ```
165 | 
166 | ## Advanced Features
167 | 
168 | ### Model Saving/Loading
169 | 
170 | To save and load models, you can use Racket's built-in serialization:
171 | 
172 | ```racket
173 | (require racket/serialize)
174 | 
175 | ;; Save model parameters
176 | (define out (open-output-file "model.dat" #:exists 'replace))
177 | (write (serialize
178 |         (list (t:data weights) (t:shape weights)
179 |               (t:data bias) (t:shape bias)))
180 |        out)
181 | (close-output-port out)
182 | 
183 | ;; Load model parameters
184 | (define in (open-input-file "model.dat"))
185 | (define params (deserialize (read in)))
186 | (define loaded-weights
187 |   (t:create (second params) (first params)))
188 | (define loaded-bias
189 |   (t:create (fourth params) (third params)))
190 | (close-input-port in)
191 | ```
192 | 
193 | ## Troubleshooting
194 | 
195 | - If you encounter errors with the C extensions, make sure to run the `compile_extensions.sh` script.
196 | - Some operations require specific hardware. If not available, the framework will fall back to CPU.
197 | - MNIST data needs to be in the `mnist-data` directory. The scripts expect the standard MNIST files:
198 |   - `train-images.idx3-ubyte`
199 |   - `train-labels.idx1-ubyte`
200 |   - `t10k-images.idx3-ubyte`
201 |   - `t10k-labels.idx1-ubyte`
202 | 
203 | ## Performance Tips
204 | 
205 | - Use device-aware tensors (dt:*) rather than basic tensors (t:*) for better performance
206 | - When running on Apple Silicon, use the MLX device for best performance
207 | - Batch processing improves throughput significantly
208 | - For large datasets, use mini-batch training rather than full batch
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RacoGrad: A Deep Learning Framework in Racket
2 | 
3 | RacoGrad is a deep learning framework implemented in Racket, a dialect of the Lisp/Scheme family of programming languages. It provides a comprehensive set of tools for creating, training, and evaluating neural networks, with a focus on both educational value and practical functionality. 
4 | 
5 | ## Features
6 | 
7 | - **Tensor Operations**: Comprehensive set of tensor operations with shape checking and broadcasting
8 | - **Device-Aware Computation**: Support for CPU, GPU (via OpenCL), and MLX (Apple Silicon)
9 | - **Automatic Differentiation**: Backpropagation for gradient computation
10 | - **CNN Support**: Implementation of convolutional neural networks with LeNet-5 architecture
11 | - **MNIST Dataset Handling**: Tools for loading, preprocessing, and training on MNIST
12 | - **Early Stopping and Validation**: Techniques to prevent overfitting during training
13 | 
14 | ## Installation
15 | 
16 | 1. Clone this repository:
17 |    ```bash
18 |    git clone https://github.com/dev-null321/RacoGrad.git
19 |    cd RacoGrad
20 |    ```
21 | 
22 | 2. Download the MNIST dataset:
23 |    ```bash
24 |    mkdir -p mnist-data
25 |    cd mnist-data
26 |    ```
27 | 
28 |    Download the following files and place them in the `mnist-data` directory: `train-images.idx3-ubyte`, `train-labels.idx1-ubyte`, `t10k-images.idx3-ubyte`, and `t10k-labels.idx1-ubyte`.
29 | 
30 |    Note: I used Kaggle to download the dataset; download it and place it in the `mnist-data` directory.
31 | 
32 |    Note: Remember to gunzip these files after downloading.
33 | 
34 | 3. Compile the C extensions:
35 |    ```bash
36 |    ./compile_extensions.sh
37 |    ```
38 | 
39 | ## Usage
40 | 
41 | ### Training a Logistic Regression Model on MNIST
42 | 
43 | ```racket
44 | #lang racket
45 | (require "mnist.rkt")
46 | 
47 | ;; The mnist.rkt module will automatically load and train a logistic regression model
48 | ;; on the MNIST dataset when required
49 | ```
50 | 
51 | ### Training a CNN on MNIST
52 | 
53 | ```racket
54 | #lang racket
55 | (require "CNN.rkt")
56 | 
57 | ;; To train a CNN on the default device (MLX if available):
58 | (train-cnn)
59 | 
60 | ;; To specify the device:
61 | (train-cnn 'cpu)  ; Use CPU
62 | (train-cnn 'mlx)  ; Use MLX (Apple Silicon)
63 | (train-cnn 'gpu)  ; Use GPU (via OpenCL)
64 | 
65 | ;; To specify training parameters:
66 | (train-cnn 'cpu 10 64)  ; 10 epochs, batch size 64
67 | ```
68 | 
69 | ### Tensor Operations
70 | 
71 | ```racket
72 | (require "tensor.rkt")
73 | 
74 | ;; Create a tensor
75 | (define t (t:create '(2 3) #(1 2 3 4 5 6)))
76 | 
77 | ;; Basic operations
78 | (t:add t1 t2)       ; Add two tensors
79 | (t:mul t1 t2)       ; Matrix multiplication
80 | (t:scale t 2.0)     ; Scalar multiplication
81 | (t:transpose t)     ; Transpose tensor
82 | 
83 | ;; Device-aware tensors
84 | (require "tensor_device.rkt")
85 | (require "device.rkt")
86 | 
87 | ;; Create a device tensor on CPU
88 | (define dt (dt:create '(2 3) #(1 2 3 4 5 6) (cpu)))
89 | 
90 | ;; Move to GPU if available
91 | (dt:to dt (gpu))
92 | 
93 | ;; Operations automatically use the appropriate device
94 | (dt:add dt1 dt2)
95 | ```
96 | 
97 | ## Documentation
98 | 
99 | For detailed information about the implementation and usage, refer to the following documents:
100 | 
101 | - [User Guide](./RACOGRAD_USER_GUIDE.md): Basic usage guide for users
102 | - [Implementation Details](./RACOGRAD_IMPLEMENTATION_DETAILS.md): Technical details of the implementation
103 | - [Optimization Strategy](./OPTIMIZATION_STRATEGY.md): Performance optimization strategies
104 | - [GPU Acceleration](./gpu_acceleration.md): Details on GPU acceleration
105 | 
106 | ## New Additions
107 | 
108 | This updated version of RacoGrad includes several major enhancements:
109 | 
110 | - **Convolutional Neural Networks**: Full implementation of CNN with backpropagation
111 | - **Device-Aware Computing**: Abstraction layer for running on different hardware
112 | - **Hardware Acceleration**: Support for GPU via OpenCL and Apple Silicon via MLX
113 | - **Improved MNIST 
Training**: Added validation splits and early stopping 114 | - **Better Documentation**: Comprehensive documentation of the implementation 115 | 116 | ## Contributing 117 | 118 | Contributions are welcome! Please feel free to submit a Pull Request. 119 | 120 | ## License 121 | 122 | This project is open source and available under the MIT License. 123 | -------------------------------------------------------------------------------- /autograd.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt") 4 | 5 | (provide dense-forward 6 | mean-squared-error 7 | dense-backward 8 | relu 9 | relu-derivative 10 | initialize-fnn 11 | sigmoid 12 | sigmoid-derivative) 13 | 14 | ;; Activation functions 15 | (define (relu x) 16 | (t:create (t:shape x) 17 | (for/vector ([v (t:data x)]) (max 0 v)))) 18 | 19 | (define (relu-derivative x) 20 | (t:create (t:shape x) 21 | (for/vector ([v (t:data x)]) (if (> v 0) 1 0)))) 22 | 23 | (define (sigmoid x) 24 | (t:create (t:shape x) 25 | (for/vector ([v (t:data x)]) (/ 1 (+ 1 (exp (- v))))))) 26 | 27 | (define (sigmoid-derivative x) 28 | (let ([sig (sigmoid x)]) 29 | (t:create (t:shape x) 30 | (for/vector ([v (t:data sig)]) (* v (- 1 v)))))) 31 | 32 | (define (tanh x) 33 | (t:create (t:shape x) 34 | (for/vector ([v (t:data x)]) 35 | (let ([e^v (exp v)] 36 | [e^-v (exp (- v))]) 37 | (/ (- e^v e^-v) (+ e^v e^-v)))))) 38 | 39 | (define (tanh-derivative x) 40 | (let ([t (tanh x)]) 41 | (t:create (t:shape x) 42 | (for/vector ([v (t:data t)]) (- 1 (* v v)))))) 43 | 44 | ;; Forward pass through a dense layer 45 | (define (dense-forward input weights biases activation-fn) 46 | (let* ([mul-result (t:mul input weights)] 47 | [output-dim (cadr (t:shape mul-result))] 48 | [reshaped-biases (t:reshape biases (list output-dim))] 49 | [z (t:add mul-result reshaped-biases)] 50 | [activation-output (activation-fn z)]) 51 | activation-output)) 52 | 53 | ;; Mean Squared Error 54 | (define (mean-squared-error y-true y-pred) 55 | (let* ([diff (t:sub y-true y-pred)] 56 | [squared-diff (t:mul diff diff)] 57 | [sum (apply + (vector->list (t:data squared-diff)))]) 58 | (/ sum (length (vector->list (t:data y-true)))))) 59 | 60 | ;; Backward pass for a dense layer 61 | (define (dense-backward input weights biases output grad-output activation-derivative learning-rate) 62 | (let* ([grad-activation (activation-derivative output)] 63 | [grad-z (t:mul grad-output grad-activation)] 64 | [grad-weights (t:mul (t:transpose input) grad-z)] 65 | [bias-len (vector-length (t:data biases))] 66 | ;; Compute grad-biases by summing each column of grad-z 67 | [grad-biases (t:create (list bias-len) 68 | (for/vector ([j bias-len]) 69 | (apply + 70 | (for/list ([i (car (t:shape grad-z))]) 71 | (vector-ref (t:data grad-z) 72 | (+ (* i bias-len) j))))))] 73 | [grad-input (t:mul grad-z (t:transpose weights))]) 74 | (values grad-weights grad-biases grad-input))) 75 | 76 | ;; Initialize a fully-connected neural network layer (input-tensor, weights, biases) 77 | (define (initialize-fnn batch-size input-dim output-dim) 78 | (let* ([input-data (make-list (* batch-size input-dim) 0)] 79 | [input-tensor (t:create (list batch-size input-dim) input-data)] 80 | [weight-shape (list input-dim output-dim)] 81 | [bias-shape (list output-dim)] 82 | [weights (t:random weight-shape 1.0)] 83 | [biases (t:random bias-shape 1.0)]) 84 | (values input-tensor weights biases))) 85 | -------------------------------------------------------------------------------- 
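A minimal end-to-end usage sketch for the autograd.rkt interface above: one forward/backward step of a single dense layer trained with SGD on mean squared error. The layer sizes, the 0.1 learning rate, and the MSE gradient 2/n * (y-pred - y-true) are illustrative assumptions, not definitions from the library.

```racket
#lang racket
(require "tensor.rkt"
         "autograd.rkt")

;; A tiny layer: 4 inputs -> 3 outputs, batch of 2.
;; initialize-fnn also returns a zero input placeholder, which we ignore here.
(define-values (_ weights biases) (initialize-fnn 2 4 3))
(define x (t:random '(2 4) 1.0))      ; pretend batch of inputs
(define y-true (t:random '(2 3) 1.0)) ; pretend targets

;; Forward pass and loss
(define y-pred (dense-forward x weights biases sigmoid))
(printf "loss: ~a\n" (mean-squared-error y-true y-pred))

;; Gradient of MSE w.r.t. the predictions: 2/n * (y-pred - y-true),
;; matching the 1/n normalization used by mean-squared-error above.
(define n (apply * (t:shape y-pred)))
(define grad-output (t:scale (t:sub y-pred y-true) (/ 2.0 n)))

;; Backward pass returns weight, bias, and input gradients
(define-values (grad-w grad-b grad-in)
  (dense-backward x weights biases y-pred grad-output
                  sigmoid-derivative 0.1))

;; Plain SGD update (illustrative learning rate)
(define new-weights (t:sub weights (t:scale grad-w 0.1)))
(define new-biases (t:sub biases (t:scale grad-b 0.1)))
```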
/benchmark.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt" 4 | "tensor_optimized.rkt" 5 | "ffi_ops.rkt") 6 | 7 | (provide run-benchmarks) 8 | 9 | ;; Run the benchmarks automatically when this file is executed directly 10 | (module+ main 11 | (run-benchmarks)) 12 | 13 | ;; Utility to time function execution 14 | (define (time-execution func) 15 | (let ([start (current-inexact-milliseconds)]) 16 | (func) 17 | (- (current-inexact-milliseconds) start))) 18 | 19 | ;; Run a benchmark multiple times and return average 20 | (define (run-benchmark name func iterations) 21 | (printf "Running benchmark: ~a\n" name) 22 | (let* ([times (for/list ([i iterations]) 23 | (time-execution func))] 24 | [avg (/ (apply + times) iterations)]) 25 | (printf " Average time over ~a iterations: ~a ms\n" iterations avg) 26 | (cons name avg))) 27 | 28 | ;; Helper function to create random matrices of given dimensions 29 | (define (create-random-matrices rows-a cols-a cols-b) 30 | (let ([a (t:random (list rows-a cols-a) 1.0)] 31 | [b (t:random (list cols-a cols-b) 1.0)]) 32 | (values a b))) 33 | 34 | ;; Helper to create optimized random matrices 35 | (define (create-random-optimized-matrices rows-a cols-a cols-b) 36 | (let ([a (t-opt:random (list rows-a cols-a) 1.0)] 37 | [b (t-opt:random (list cols-a cols-b) 1.0)]) 38 | (values a b))) 39 | 40 | ;; Benchmark matrix multiplication 41 | (define (benchmark-matrix-mul iterations) 42 | (printf "\n=== Matrix Multiplication Benchmark ===\n") 43 | 44 | (define rows-a 100) 45 | (define cols-a 100) 46 | (define cols-b 100) 47 | 48 | (let-values ([(a b) (create-random-matrices rows-a cols-a cols-b)] 49 | [(a-opt b-opt) (create-random-optimized-matrices rows-a cols-a cols-b)]) 50 | 51 | ;; Standard Racket implementation 52 | (define racket-result 53 | (run-benchmark "Racket Matrix Multiply" 54 | (lambda () (t:mul a b)) 55 | iterations)) 56 | 57 | ;; C implementation via optimized tensors 58 | (define c-result 59 | (run-benchmark "C Matrix Multiply" 60 | (lambda () (t-opt:mul a-opt b-opt)) 61 | iterations)) 62 | 63 | ;; Print speedup 64 | (printf "\nSpeedup: ~a times faster\n" 65 | (exact->inexact (/ (cdr racket-result) (cdr c-result)))))) 66 | 67 | ;; Benchmark element-wise operations 68 | (define (benchmark-elementwise iterations) 69 | (printf "\n=== Element-wise Operations Benchmark ===\n") 70 | 71 | (define size 1000000) 72 | 73 | (let ([a (t:random (list size) 1.0)] 74 | [b (t:random (list size) 1.0)] 75 | [a-opt (t-opt:random (list size) 1.0)] 76 | [b-opt (t-opt:random (list size) 1.0)]) 77 | 78 | ;; Addition 79 | (define racket-add 80 | (run-benchmark "Racket Add" 81 | (lambda () (t:add a b)) 82 | iterations)) 83 | 84 | (define c-add 85 | (run-benchmark "C Add" 86 | (lambda () (t-opt:add a-opt b-opt)) 87 | iterations)) 88 | 89 | (printf "\nAdd Speedup: ~a times faster\n" 90 | (exact->inexact (/ (cdr racket-add) (cdr c-add)))) 91 | 92 | ;; Multiplication 93 | (define racket-mul 94 | (run-benchmark "Racket Element-wise Mul" 95 | (lambda () (t:mul a b)) 96 | iterations)) 97 | 98 | (define c-mul 99 | (run-benchmark "C Element-wise Mul" 100 | (lambda () (t-opt:mul a-opt b-opt)) 101 | iterations)) 102 | 103 | (printf "\nMultiply Speedup: ~a times faster\n" 104 | (exact->inexact (/ (cdr racket-mul) (cdr c-mul)))))) 105 | 106 | ;; Run all benchmarks 107 | (define (run-benchmarks [iterations 10]) 108 | (printf "\n====================================\n") 109 | (printf " RACOGRAD PERFORMANCE 
BENCHMARKS\n") 110 | (printf "====================================\n") 111 | 112 | (benchmark-matrix-mul iterations) 113 | (benchmark-elementwise iterations) 114 | 115 | (printf "\n====================================\n") 116 | (printf "Benchmarks complete!\n") 117 | (printf "See OPTIMIZATION_STRATEGY.md for more details on optimizations.\n") 118 | (printf "====================================\n")) -------------------------------------------------------------------------------- /cnn_benchmark.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "CNN.rkt" 4 | "device.rkt" 5 | "hardware_detection.rkt") 6 | 7 | ;; Benchmark CNN on different devices 8 | (define (benchmark-cnn) 9 | (printf "====================================\n") 10 | (printf " RACOGRAD CNN BENCHMARKS\n") 11 | (printf "====================================\n") 12 | 13 | (print-hardware-info) 14 | 15 | ;; Set parameters for quick benchmarking 16 | (define epochs 2) 17 | (define batch-size 32) 18 | 19 | ;; Run on CPU 20 | (printf "\n\nRunning CNN benchmark on CPU...\n") 21 | (define cpu-result (train-cnn 'cpu epochs batch-size)) 22 | 23 | (define results (list (cons 'cpu cpu-result))) 24 | 25 | ;; Run on MLX if available 26 | (when (device-available? 'mlx) 27 | (printf "\n\nRunning CNN benchmark on MLX (Apple Silicon)...\n") 28 | (define mlx-result (train-cnn 'mlx epochs batch-size)) 29 | (set! results (cons (cons 'mlx mlx-result) results))) 30 | 31 | ;; Run on CUDA if available 32 | (when (device-available? 'cuda) 33 | (printf "\n\nRunning CNN benchmark on CUDA (NVIDIA GPU)...\n") 34 | (define cuda-result (train-cnn 'cuda epochs batch-size)) 35 | (set! results (cons (cons 'cuda cuda-result) results))) 36 | 37 | ;; Run on OpenCL if available 38 | (when (device-available? 'opencl) 39 | (printf "\n\nRunning CNN benchmark on OpenCL...\n") 40 | (define opencl-result (train-cnn 'opencl epochs batch-size)) 41 | (set! results (cons (cons 'opencl opencl-result) results))) 42 | 43 | ;; Compare results 44 | (printf "\n\n====================================\n") 45 | (printf " PERFORMANCE COMPARISON\n") 46 | (printf "====================================\n") 47 | 48 | (define cpu-time (hash-ref (cdr (assoc 'cpu results)) 'time)) 49 | (define cpu-accuracy (hash-ref (cdr (assoc 'cpu results)) 'accuracy)) 50 | 51 | (printf "CPU Training Time: ~a seconds\n" (real->decimal-string cpu-time 2)) 52 | (printf "CPU Accuracy: ~a%\n\n" (real->decimal-string cpu-accuracy 2)) 53 | 54 | (for ([result (in-list results)]) 55 | (unless (eq? 
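;; the CPU entry is the baseline, so skip it in the per-device comparison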
(car result) 'cpu)
56 |       (let* ([device-type (car result)]
57 |              [device-result (cdr result)]
58 |              [device-time (hash-ref device-result 'time)]
59 |              [device-accuracy (hash-ref device-result 'accuracy)]
60 |              [speedup (/ cpu-time device-time)])
61 |         (printf "~a Training Time: ~a seconds\n" 
62 |                 device-type 
63 |                 (real->decimal-string device-time 2))
64 |         (printf "~a Accuracy: ~a%\n" 
65 |                 device-type 
66 |                 (real->decimal-string device-accuracy 2))
67 |         (printf "Speedup vs CPU: ~a times faster\n\n" 
68 |                 (real->decimal-string speedup 2)))))
69 | 
70 |   (printf "====================================\n"))
71 | 
72 | ;; Run the benchmark from the command line
73 | (module+ main
74 |   (benchmark-cnn))
--------------------------------------------------------------------------------
/cnn_ops.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <math.h>
5 | 
6 | // Convolution operation (naive implementation)
7 | void conv2d_forward(int batch_size, int in_channels, int in_height, int in_width,
8 |                     int out_channels, int filter_height, int filter_width,
9 |                     int stride, int padding,
10 |                     double* input, double* filters, double* output) {
11 | 
12 |     int out_height = 1 + (in_height + 2 * padding - filter_height) / stride;
13 |     int out_width = 1 + (in_width + 2 * padding - filter_width) / stride;
14 | 
15 |     // Initialize output to zeros
16 |     int output_size = batch_size * out_channels * out_height * out_width;
17 |     memset(output, 0, output_size * sizeof(double));
18 | 
19 |     // For each element in the batch
20 |     for (int b = 0; b < batch_size; b++) {
21 |         // For each output channel
22 |         for (int oc = 0; oc < out_channels; oc++) {
23 |             // For each input channel
24 |             for (int ic = 0; ic < in_channels; ic++) {
25 |                 // For each output position
26 |                 for (int oh = 0; oh < out_height; oh++) {
27 |                     for (int ow = 0; ow < out_width; ow++) {
28 |                         // Calculate output index
29 |                         int out_idx = ((b * out_channels + oc) * out_height + oh) * out_width + ow;
30 | 
31 |                         // For each filter position
32 |                         for (int fh = 0; fh < filter_height; fh++) {
33 |                             for (int fw = 0; fw < filter_width; fw++) {
34 |                                 // Calculate input position with padding
35 |                                 int ih = oh * stride + fh - padding;
36 |                                 int iw = ow * stride + fw - padding;
37 | 
38 |                                 // Skip if outside the input boundaries
39 |                                 if (ih < 0 || ih >= in_height || iw < 0 || iw >= in_width) {
40 |                                     continue;
41 |                                 }
42 | 
43 |                                 // Calculate input index
44 |                                 int in_idx = ((b * in_channels + ic) * in_height + ih) * in_width + iw;
45 | 
46 |                                 // Calculate filter index
47 |                                 int filter_idx = ((oc * in_channels + ic) * filter_height + fh) * filter_width + fw;
48 | 
49 |                                 // Add contribution to output
50 |                                 output[out_idx] += input[in_idx] * filters[filter_idx];
51 |                             }
52 |                         }
53 |                     }
54 |                 }
55 |             }
56 |         }
57 |     }
58 | }
59 | 
60 | // Max pooling 2x2 with stride 2
61 | void max_pool_2x2(int batch_size, int channels, int in_height, int in_width,
62 |                   double* input, double* output) {
63 | 
64 |     int out_height = in_height / 2;
65 |     int out_width = in_width / 2;
66 | 
67 |     // Initialize output to very negative values
68 |     int output_size = batch_size * channels * out_height * out_width;
69 |     for (int i = 0; i < output_size; i++) {
70 |         output[i] = -1e9;
71 |     }
72 | 
73 |     // For each element in the batch
74 |     for (int b = 0; b < batch_size; b++) {
75 |         // For each channel
76 |         for (int c = 0; c < channels; c++) {
77 |             // For each pooling region
78 |             for (int oh = 0; oh < out_height; oh++) {
79 |                 for (int ow = 0; ow < out_width; ow++) {
80 |                     // Calculate 
output index 81 | int out_idx = ((b * channels + c) * out_height + oh) * out_width + ow; 82 | 83 | // For each input in the pooling region 84 | for (int kh = 0; kh < 2; kh++) { 85 | for (int kw = 0; kw < 2; kw++) { 86 | // Calculate input index 87 | int ih = oh * 2 + kh; 88 | int iw = ow * 2 + kw; 89 | int in_idx = ((b * channels + c) * in_height + ih) * in_width + iw; 90 | 91 | // Update max value 92 | if (input[in_idx] > output[out_idx]) { 93 | output[out_idx] = input[in_idx]; 94 | } 95 | } 96 | } 97 | } 98 | } 99 | } 100 | } 101 | } 102 | 103 | // Flatten a 4D tensor (batch, channels, height, width) to a 2D tensor (batch, channels*height*width) 104 | void flatten_tensor(int batch_size, int channels, int height, int width, 105 | double* input, double* output) { 106 | 107 | int flat_size = channels * height * width; 108 | 109 | // For each batch element 110 | for (int b = 0; b < batch_size; b++) { 111 | // For each channel 112 | for (int c = 0; c < channels; c++) { 113 | // For each height 114 | for (int h = 0; h < height; h++) { 115 | // For each width 116 | for (int w = 0; w < width; w++) { 117 | // Calculate input index 118 | int in_idx = ((b * channels + c) * height + h) * width + w; 119 | 120 | // Calculate flat index 121 | int flat_idx = b * flat_size + (c * height * width + h * width + w); 122 | 123 | // Copy value 124 | output[flat_idx] = input[in_idx]; 125 | } 126 | } 127 | } 128 | } 129 | } 130 | 131 | // Softmax operation 132 | void softmax(int batch_size, int num_classes, double* input, double* output) { 133 | // For each batch 134 | for (int b = 0; b < batch_size; b++) { 135 | // Find max value for stability 136 | double max_val = -1e9; 137 | for (int c = 0; c < num_classes; c++) { 138 | if (input[b * num_classes + c] > max_val) { 139 | max_val = input[b * num_classes + c]; 140 | } 141 | } 142 | 143 | // Calculate exp and sum 144 | double sum = 0.0; 145 | for (int c = 0; c < num_classes; c++) { 146 | output[b * num_classes + c] = exp(input[b * num_classes + c] - max_val); 147 | sum += output[b * num_classes + c]; 148 | } 149 | 150 | // Normalize 151 | for (int c = 0; c < num_classes; c++) { 152 | output[b * num_classes + c] /= sum; 153 | } 154 | } 155 | } 156 | 157 | // Cross entropy loss 158 | double cross_entropy_loss(int batch_size, int num_classes, double* pred, double* target) { 159 | double loss = 0.0; 160 | double epsilon = 1e-15; // Small value for numerical stability 161 | 162 | for (int b = 0; b < batch_size; b++) { 163 | for (int c = 0; c < num_classes; c++) { 164 | // Clamp prediction for numerical stability 165 | double p = pred[b * num_classes + c]; 166 | if (p < epsilon) p = epsilon; 167 | if (p > 1.0 - epsilon) p = 1.0 - epsilon; 168 | 169 | // Add to loss if target is 1 170 | if (target[b * num_classes + c] > 0.5) { 171 | loss -= log(p); 172 | } 173 | } 174 | } 175 | 176 | return loss / batch_size; 177 | } -------------------------------------------------------------------------------- /cnn_ops.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require ffi/unsafe 4 | ffi/vector 5 | "device.rkt" 6 | "hardware_detection.rkt") 7 | 8 | ;; Provide CNN operations 9 | (provide 10 | ;; CNN operations 11 | c:conv2d-forward 12 | c:max-pool-2x2 13 | c:flatten-tensor 14 | c:softmax 15 | c:cross-entropy-loss 16 | 17 | ;; Load functions based on available hardware 18 | load-optimal-ops) 19 | 20 | ;; Load CPU implementations with error handling 21 | (define cnn-lib 22 | (with-handlers ([exn:fail? 
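;; loading fails if compile_extensions.sh has not produced the shared library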
(lambda (e) 23 | (printf "Warning: cnn_ops library not found. ~a~n" (exn-message e)) 24 | #f)]) 25 | (ffi-lib "cnn_ops" '("" "0")))) 26 | 27 | ;; Try to load MLX implementations if on Apple Silicon 28 | (define mlx-cnn-lib 29 | (with-handlers ([exn:fail? (lambda (e) #f)]) 30 | (ffi-lib "mlx_cnn_ops" '("" "0")))) 31 | 32 | ;; Dummy implementations when library is not available 33 | (define (dummy-conv2d batch-size in-channels in-height in-width 34 | out-channels filter-height filter-width 35 | stride padding 36 | input filters output) 37 | (printf "Using dummy conv2d implementation~n")) 38 | 39 | (define (dummy-max-pool batch-size channels in-height in-width 40 | input output) 41 | (printf "Using dummy max-pool implementation~n")) 42 | 43 | (define (dummy-flatten batch-size channels height width 44 | input output) 45 | (printf "Using dummy flatten implementation~n")) 46 | 47 | (define (dummy-softmax batch-size num-classes 48 | input output) 49 | (printf "Using dummy softmax implementation~n")) 50 | 51 | (define (dummy-cross-entropy batch-size num-classes 52 | predictions targets) 53 | (printf "Using dummy cross-entropy implementation~n") 54 | 0.1) ; Return a dummy loss 55 | 56 | ;; Default implementations using CPU (or dummy if not available) 57 | (define c:conv2d-forward 58 | (if cnn-lib 59 | (with-handlers ([exn:fail? (lambda (e) dummy-conv2d)]) 60 | (get-ffi-obj "conv2d_forward" cnn-lib 61 | (_fun _int _int _int _int ; batch_size, in_channels, in_height, in_width 62 | _int _int _int ; out_channels, filter_height, filter_width 63 | _int _int ; stride, padding 64 | _f64vector _f64vector _f64vector -> _void))) 65 | dummy-conv2d)) 66 | 67 | (define c:max-pool-2x2 68 | (if cnn-lib 69 | (with-handlers ([exn:fail? (lambda (e) dummy-max-pool)]) 70 | (get-ffi-obj "max_pool_2x2" cnn-lib 71 | (_fun _int _int _int _int ; batch_size, channels, in_height, in_width 72 | _f64vector _f64vector -> _void))) 73 | dummy-max-pool)) 74 | 75 | (define c:flatten-tensor 76 | (if cnn-lib 77 | (with-handlers ([exn:fail? (lambda (e) dummy-flatten)]) 78 | (get-ffi-obj "flatten_tensor" cnn-lib 79 | (_fun _int _int _int _int ; batch_size, channels, height, width 80 | _f64vector _f64vector -> _void))) 81 | dummy-flatten)) 82 | 83 | (define c:softmax 84 | (if cnn-lib 85 | (with-handlers ([exn:fail? (lambda (e) dummy-softmax)]) 86 | (get-ffi-obj "softmax" cnn-lib 87 | (_fun _int _int ; batch_size, num_classes 88 | _f64vector _f64vector -> _void))) 89 | dummy-softmax)) 90 | 91 | (define c:cross-entropy-loss 92 | (if cnn-lib 93 | (with-handlers ([exn:fail? (lambda (e) dummy-cross-entropy)]) 94 | (get-ffi-obj "cross_entropy_loss" cnn-lib 95 | (_fun _int _int ; batch_size, num_classes 96 | _f64vector _f64vector -> _double))) 97 | dummy-cross-entropy)) 98 | 99 | ;; MLX implementations (if available) 100 | (define mlx:conv2d-forward 101 | (if mlx-cnn-lib 102 | (with-handlers ([exn:fail? (lambda (e) 103 | (printf "MLX conv2d not available: ~a~n" (exn-message e)) 104 | #f)]) 105 | (get-ffi-obj "mlx_conv2d_forward" mlx-cnn-lib 106 | (_fun _int _int _int _int 107 | _int _int _int 108 | _int _int 109 | _f64vector _f64vector _f64vector -> _void))) 110 | ;; If MLX not available, use dummy or CPU implementation 111 | (if cnn-lib c:conv2d-forward dummy-conv2d))) 112 | 113 | (define mlx:max-pool-2x2 114 | (if mlx-cnn-lib 115 | (with-handlers ([exn:fail? 
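;; resolve the MLX symbol; on failure the handler yields #f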
(lambda (e) #f)]) 116 | (get-ffi-obj "mlx_max_pool_2x2" mlx-cnn-lib 117 | (_fun _int _int _int _int 118 | _f64vector _f64vector -> _void))) 119 | ;; If MLX not available, use dummy or CPU implementation 120 | (if cnn-lib c:max-pool-2x2 dummy-max-pool))) 121 | 122 | (define mlx:flatten-tensor 123 | (if mlx-cnn-lib 124 | (with-handlers ([exn:fail? (lambda (e) #f)]) 125 | (get-ffi-obj "mlx_flatten_tensor" mlx-cnn-lib 126 | (_fun _int _int _int _int 127 | _f64vector _f64vector -> _void))) 128 | ;; If MLX not available, use dummy or CPU implementation 129 | (if cnn-lib c:flatten-tensor dummy-flatten))) 130 | 131 | (define mlx:softmax 132 | (if mlx-cnn-lib 133 | (with-handlers ([exn:fail? (lambda (e) #f)]) 134 | (get-ffi-obj "mlx_softmax" mlx-cnn-lib 135 | (_fun _int _int 136 | _f64vector _f64vector -> _void))) 137 | ;; If MLX not available, use dummy or CPU implementation 138 | (if cnn-lib c:softmax dummy-softmax))) 139 | 140 | ;; Function to reload operations based on device 141 | (define (load-optimal-ops [dev (current-device)]) 142 | (cond 143 | [(mlx-device? dev) 144 | (printf "Loading MLX optimized CNN operations for Apple Silicon~n") 145 | (if mlx-cnn-lib 146 | (begin 147 | (set! c:conv2d-forward mlx:conv2d-forward) 148 | (set! c:max-pool-2x2 mlx:max-pool-2x2) 149 | (set! c:flatten-tensor mlx:flatten-tensor) 150 | (set! c:softmax mlx:softmax)) 151 | (printf "Warning: MLX library not available. Using CPU fallback.~n"))] 152 | 153 | [(gpu-device? dev) 154 | (printf "GPU acceleration not yet implemented, using CPU fallback~n")] 155 | 156 | [else 157 | (printf "Using standard CPU CNN operations~n") 158 | (when (not cnn-lib) 159 | (printf "Warning: CNN operations library not found. Using dummy implementations.~n"))])) -------------------------------------------------------------------------------- /cnn_test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt" 4 | "device.rkt" 5 | "tensor_device.rkt") 6 | 7 | ;; Simple test for CNN components 8 | ;; This will test the basic tensor operations needed for CNNs without 9 | ;; relying on the C libraries or MNIST data 10 | 11 | ;; Create a simple 4D tensor (batch_size, channels, height, width) 12 | (define (make-test-tensor [batch-size 2] [channels 3] [height 8] [width 8]) 13 | (let* ([data (make-vector (* batch-size channels height width) 0.0)]) 14 | ;; Fill with simple pattern 15 | (for ([b (in-range batch-size)] 16 | [b-val (in-list '(0.1 0.2))]) 17 | (for ([c (in-range channels)] 18 | [c-val (in-list '(0.01 0.02 0.03))]) 19 | (for ([h (in-range height)]) 20 | (for ([w (in-range width)]) 21 | (let ([idx (+ (* b channels height width) 22 | (* c height width) 23 | (* h width) 24 | w)]) 25 | (vector-set!
data idx (+ b-val c-val (* 0.1 h) (* 0.01 w)))))))) 26 | 27 | ;; Create tensor 28 | (dt:create (list batch-size channels height width) data (cpu)))) 29 | 30 | ;; Simple convolutional operation (without C library) 31 | (define (simple-conv2d input-tensor filter-tensor [stride 1] [padding 0]) 32 | (let* ([input-shape (dt:shape input-tensor)] 33 | [filter-shape (dt:shape filter-tensor)] 34 | 35 | [batch-size (car input-shape)] 36 | [in-channels (cadr input-shape)] 37 | [in-height (caddr input-shape)] 38 | [in-width (cadddr input-shape)] 39 | 40 | [out-channels (car filter-shape)] 41 | [filter-height (caddr filter-shape)] 42 | [filter-width (cadddr filter-shape)] 43 | 44 | ;; Calculate output dimensions 45 | [out-height (add1 (quotient (- (+ in-height (* 2 padding)) filter-height) stride))] 46 | [out-width (add1 (quotient (- (+ in-width (* 2 padding)) filter-width) stride))] 47 | 48 | [output-data (make-vector (* batch-size out-channels out-height out-width) 0.0)] 49 | [output-tensor (dt:create (list batch-size out-channels out-height out-width) 50 | output-data 51 | (dt:device input-tensor))] 52 | 53 | [input-data (dt:data input-tensor)] 54 | [filter-data (dt:data filter-tensor)]) 55 | 56 | ;; Perform convolution manually 57 | (for ([b (in-range batch-size)]) 58 | (for ([oc (in-range out-channels)]) 59 | (for ([oh (in-range out-height)]) 60 | (for ([ow (in-range out-width)]) 61 | ;; For each output position 62 | (let ([out-idx (+ (* b out-channels out-height out-width) 63 | (* oc out-height out-width) 64 | (* oh out-width) 65 | ow)] 66 | [sum 0.0]) 67 | 68 | ;; Sum over input channels and filter dimensions 69 | (for ([ic (in-range in-channels)]) 70 | (for ([fh (in-range filter-height)]) 71 | (for ([fw (in-range filter-width)]) 72 | (let* ([ih (+ (* oh stride) fh (- padding))] 73 | [iw (+ (* ow stride) fw (- padding))]) 74 | 75 | ;; Check if input position is valid 76 | (when (and (>= ih 0) (< ih in-height) 77 | (>= iw 0) (< iw in-width)) 78 | (let ([input-idx (+ (* b in-channels in-height in-width) 79 | (* ic in-height in-width) 80 | (* ih in-width) 81 | iw)] 82 | [filter-idx (+ (* oc in-channels filter-height filter-width) 83 | (* ic filter-height filter-width) 84 | (* fh filter-width) 85 | fw)]) 86 | (set! sum (+ sum (* (vector-ref input-data input-idx) 87 | (vector-ref filter-data filter-idx)))))))))) 88 | 89 | ;; Set output 90 | (vector-set! 
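;; write the accumulated dot product for this output position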
output-data out-idx sum))))) 91 | 92 | output-tensor)) 93 | 94 | ;; Simple max pooling (2x2 without C library) 95 | (define (simple-max-pool input-tensor) 96 | (let* ([input-shape (dt:shape input-tensor)] 97 | [batch-size (car input-shape)] 98 | [channels (cadr input-shape)] 99 | [in-height (caddr input-shape)] 100 | [in-width (cadddr input-shape)] 101 | 102 | [out-height (quotient in-height 2)] 103 | [out-width (quotient in-width 2)] 104 | 105 | [output-data (make-vector (* batch-size channels out-height out-width) 0.0)] 106 | [output-tensor (dt:create (list batch-size channels out-height out-width) 107 | output-data 108 | (dt:device input-tensor))] 109 | 110 | [input-data (dt:data input-tensor)]) 111 | 112 | ;; Perform max pooling 113 | (for ([b (in-range batch-size)]) 114 | (for ([c (in-range channels)]) 115 | (for ([oh (in-range out-height)]) 116 | (for ([ow (in-range out-width)]) 117 | (let ([out-idx (+ (* b channels out-height out-width) 118 | (* c out-height out-width) 119 | (* oh out-width) 120 | ow)] 121 | [max-val -inf.0]) 122 | 123 | ;; Find max in 2x2 region 124 | (for ([h (in-range (* oh 2) (+ (* oh 2) 2))]) 125 | (for ([w (in-range (* ow 2) (+ (* ow 2) 2))]) 126 | (let ([in-idx (+ (* b channels in-height in-width) 127 | (* c in-height in-width) 128 | (* h in-width) 129 | w)]) 130 | (set! max-val (max max-val (vector-ref input-data in-idx)))))) 131 | 132 | ;; Set output 133 | (vector-set! output-data out-idx max-val)))))) 134 | 135 | output-tensor)) 136 | 137 | ;; Simple ReLU activation 138 | (define (simple-relu input-tensor) 139 | (let* ([shape (dt:shape input-tensor)] 140 | [size (apply * shape)] 141 | [input-data (dt:data input-tensor)] 142 | [output-data (make-vector size 0.0)] 143 | [output-tensor (dt:create shape output-data (dt:device input-tensor))]) 144 | 145 | ;; Apply ReLU 146 | (for ([i (in-range size)]) 147 | (vector-set! output-data i (max 0.0 (vector-ref input-data i)))) 148 | 149 | output-tensor)) 150 | 151 | ;; Simple tensor flatten (4D to 2D) 152 | (define (simple-flatten input-tensor) 153 | (let* ([shape (dt:shape input-tensor)] 154 | [batch-size (car shape)] 155 | [channels (cadr shape)] 156 | [height (caddr shape)] 157 | [width (cadddr shape)] 158 | [flat-size (* channels height width)] 159 | 160 | [output-data (make-vector (* batch-size flat-size) 0.0)] 161 | [output-tensor (dt:create (list batch-size flat-size) 162 | output-data 163 | (dt:device input-tensor))] 164 | 165 | [input-data (dt:data input-tensor)]) 166 | 167 | ;; Flatten the tensor 168 | (for ([b (in-range batch-size)]) 169 | (for ([c (in-range channels)]) 170 | (for ([h (in-range height)]) 171 | (for ([w (in-range width)]) 172 | (let ([in-idx (+ (* b channels height width) 173 | (* c height width) 174 | (* h width) 175 | w)] 176 | [out-idx (+ (* b flat-size) 177 | (* c height width) 178 | (* h width) 179 | w)]) 180 | (vector-set! 
output-data out-idx (vector-ref input-data in-idx))))))) 181 | 182 | output-tensor)) 183 | 184 | ;; Fully connected layer 185 | (define (simple-fc input-tensor weights bias [activation-fn simple-relu]) 186 | (let* ([batch-size (car (dt:shape input-tensor))] 187 | [out-features (cadr (dt:shape weights))] 188 | 189 | [output-data (make-vector (* batch-size out-features) 0.0)] 190 | [z (dt:create (list batch-size out-features) 191 | output-data 192 | (dt:device input-tensor))] 193 | 194 | [input-data (dt:data input-tensor)] 195 | [weights-data (dt:data weights)] 196 | [bias-data (dt:data bias)]) 197 | 198 | ;; Matrix multiplication and bias addition 199 | (for ([b (in-range batch-size)]) 200 | (for ([o (in-range out-features)]) 201 | (let ([z-idx (+ (* b out-features) o)] 202 | [sum 0.0]) 203 | 204 | ;; Dot product 205 | (for ([i (in-range (cadr (dt:shape input-tensor)))]) 206 | (let ([in-idx (+ (* b (cadr (dt:shape input-tensor))) i)] 207 | [w-idx (+ (* i out-features) o)]) 208 | (set! sum (+ sum (* (vector-ref input-data in-idx) 209 | (vector-ref weights-data w-idx)))))) 210 | 211 | ;; Add bias 212 | (set! sum (+ sum (vector-ref bias-data o))) 213 | (vector-set! output-data z-idx sum)))) 214 | 215 | ;; Apply activation function if provided 216 | (if activation-fn 217 | (activation-fn z) 218 | z))) 219 | 220 | ;; Simple softmax function 221 | (define (simple-softmax z) 222 | (let* ([shape (dt:shape z)] 223 | [batch-size (car shape)] 224 | [features (cadr shape)] 225 | [z-data (dt:data z)] 226 | [output-data (make-vector (* batch-size features) 0.0)] 227 | [output (dt:create shape output-data (dt:device z))]) 228 | 229 | ;; Apply softmax: exp(x_i) / sum(exp(x_j)) 230 | (for ([b (in-range batch-size)]) 231 | (let* ([start-idx (* b features)] 232 | [end-idx (+ start-idx features)] 233 | 234 | ;; Find max value for numerical stability 235 | [max-val (for/fold ([max-val -inf.0]) 236 | ([i (in-range start-idx end-idx)]) 237 | (max max-val (vector-ref z-data i)))] 238 | 239 | ;; Compute exp(x_i - max) 240 | [exp-vals (for/vector ([i (in-range start-idx end-idx)]) 241 | (exp (- (vector-ref z-data i) max-val)))] 242 | 243 | ;; Compute sum of exp values 244 | [sum (for/sum ([i (in-range features)]) 245 | (vector-ref exp-vals i))]) 246 | 247 | ;; Normalize by sum 248 | (for ([i (in-range features)]) 249 | (vector-set! 
output-data (+ start-idx i) 250 | (/ (vector-ref exp-vals i) sum))))) 251 | 252 | output)) 253 | 254 | ;; Run a simple CNN test 255 | (define (run-cnn-test) 256 | (printf "Running basic CNN test...~n") 257 | 258 | ;; Create test input 259 | (printf "Creating test input tensor...~n") 260 | (define input (make-test-tensor 2 3 8 8)) 261 | (printf "Input shape: ~a~n" (dt:shape input)) 262 | 263 | ;; Create test filters 264 | (printf "Creating test filters...~n") 265 | (define filters1 (dt:random (list 6 3 3 3) 0.1)) 266 | (define bias1 (dt:random (list 1 6) 0.1)) 267 | (printf "First layer filter shape: ~a~n" (dt:shape filters1)) 268 | 269 | ;; First convolution layer 270 | (printf "Running first convolution layer...~n") 271 | (define conv1 (simple-conv2d input filters1 1 1)) 272 | (printf "Conv1 output shape: ~a~n" (dt:shape conv1)) 273 | 274 | ;; Add bias 275 | (printf "Adding bias...~n") 276 | (define conv1-with-bias conv1) ; Simplified for test 277 | 278 | ;; ReLU activation 279 | (printf "Applying ReLU...~n") 280 | (define relu1 (simple-relu conv1-with-bias)) 281 | 282 | ;; Max pooling 283 | (printf "Max pooling...~n") 284 | (define pool1 (simple-max-pool relu1)) 285 | (printf "Pool1 output shape: ~a~n" (dt:shape pool1)) 286 | 287 | ;; Second layer 288 | (printf "Creating second layer filters...~n") 289 | (define filters2 (dt:random (list 12 6 3 3) 0.1)) 290 | (define bias2 (dt:random (list 1 12) 0.1)) 291 | 292 | ;; Second convolution 293 | (printf "Running second convolution layer...~n") 294 | (define conv2 (simple-conv2d pool1 filters2 1 1)) 295 | (printf "Conv2 output shape: ~a~n" (dt:shape conv2)) 296 | 297 | ;; Add bias 298 | (define conv2-with-bias conv2) ; Simplified for test 299 | 300 | ;; ReLU activation 301 | (printf "Applying ReLU...~n") 302 | (define relu2 (simple-relu conv2-with-bias)) 303 | 304 | ;; Max pooling 305 | (printf "Max pooling...~n") 306 | (define pool2 (simple-max-pool relu2)) 307 | (printf "Pool2 output shape: ~a~n" (dt:shape pool2)) 308 | 309 | ;; Flatten 310 | (printf "Flattening tensor...~n") 311 | (define flat (simple-flatten pool2)) 312 | (printf "Flattened shape: ~a~n" (dt:shape flat)) 313 | 314 | ;; FC layers 315 | (printf "Creating fully connected layers...~n") 316 | (define flat-size (cadr (dt:shape flat))) 317 | (define fc1-weights (dt:random (list flat-size 84) 0.1)) 318 | (define fc1-bias (dt:random (list 1 84) 0.1)) 319 | 320 | (printf "Running FC layer 1...~n") 321 | (define fc1 (simple-fc flat fc1-weights fc1-bias)) 322 | (printf "FC1 output shape: ~a~n" (dt:shape fc1)) 323 | 324 | (define fc2-weights (dt:random (list 84 10) 0.1)) 325 | (define fc2-bias (dt:random (list 1 10) 0.1)) 326 | 327 | (printf "Running FC layer 2...~n") 328 | (define fc2 (simple-fc fc1 fc2-weights fc2-bias #f)) 329 | (printf "FC2 output shape: ~a~n" (dt:shape fc2)) 330 | 331 | ;; Softmax 332 | (printf "Applying softmax...~n") 333 | (define output (simple-softmax fc2)) 334 | (printf "Final output shape: ~a~n" (dt:shape output)) 335 | 336 | ;; Print sample of output 337 | (printf "Sample output probabilities (first example):~n") 338 | (for ([i (in-range 10)]) 339 | (printf " Class ~a: ~a~n" i (vector-ref (dt:data output) i))) 340 | 341 | (printf "CNN test completed successfully!~n") 342 | output) 343 | 344 | ;; Run the test when the file is executed directly 345 | (module+ main 346 | (run-cnn-test)) -------------------------------------------------------------------------------- /compile_extensions.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Detect OS 4 | if [[ "$OSTYPE" == "darwin"* ]]; then 5 | # macOS 6 | SHARED_EXT=".dylib" 7 | COMPILE_FLAGS="-dynamiclib -O3" 8 | if [[ $(sysctl -n machdep.cpu.features 2>/dev/null) == *"AVX"* ]]; then 9 | SIMD_FLAGS="-mavx" 10 | elif [[ $(sysctl -n machdep.cpu.features 2>/dev/null) == *"SSE"* ]]; then 11 | SIMD_FLAGS="-msse4.2" 12 | else 13 | SIMD_FLAGS="" 14 | fi 15 | else 16 | # Linux and others 17 | SHARED_EXT=".so" 18 | COMPILE_FLAGS="-shared -fPIC -O3" 19 | if grep -q avx /proc/cpuinfo; then 20 | SIMD_FLAGS="-mavx" 21 | elif grep -q sse /proc/cpuinfo; then 22 | SIMD_FLAGS="-msse4.2" 23 | else 24 | SIMD_FLAGS="" 25 | fi 26 | fi 27 | 28 | # Compile basic matrix operations 29 | echo "Compiling basic matrix operations..." 30 | cc $COMPILE_FLAGS -o matrix_multiplication$SHARED_EXT matrix_multiplication.c 31 | 32 | # Compile SIMD operations 33 | echo "Compiling SIMD optimized operations..." 34 | cc $COMPILE_FLAGS $SIMD_FLAGS -o simd_ops$SHARED_EXT simd_ops.c 35 | 36 | # Compile parallel operations 37 | echo "Compiling parallel operations..." 38 | cc $COMPILE_FLAGS -o parallel_ops$SHARED_EXT parallel_ops.c -lpthread 39 | 40 | # Compile OpenCL operations 41 | if [ "$(uname)" == "Darwin" ]; then 42 | echo "Compiling OpenCL operations (macOS)..." 43 | cc $COMPILE_FLAGS -o matrix_opencl$SHARED_EXT matrix_opencl.c -framework OpenCL 44 | else 45 | echo "Compiling OpenCL operations (Linux)..." 46 | cc $COMPILE_FLAGS -o matrix_opencl$SHARED_EXT matrix_opencl.c -lOpenCL 47 | fi 48 | 49 | # Compile MLX placeholder operations (for Apple Silicon) 50 | echo "Compiling MLX placeholder operations..." 51 | cc $COMPILE_FLAGS -o mlx_ops$SHARED_EXT mlx_ops.c 52 | 53 | # Compile CUDA placeholder operations 54 | echo "Compiling CUDA placeholder operations..." 55 | cc $COMPILE_FLAGS -o cuda_ops$SHARED_EXT cuda_ops.c 56 | 57 | # Compile CNN operations 58 | echo "Compiling CNN operations..." 59 | cc $COMPILE_FLAGS -o cnn_ops$SHARED_EXT cnn_ops.c 60 | 61 | # Compile MLX CNN operations (for Apple Silicon) 62 | echo "Compiling MLX CNN operations..." 63 | if [[ "$OSTYPE" == "darwin"* ]]; then 64 | cc $COMPILE_FLAGS -o mlx_cnn_ops$SHARED_EXT mlx_cnn_ops.c 65 | else 66 | cc $COMPILE_FLAGS -fopenmp -o mlx_cnn_ops$SHARED_EXT mlx_cnn_ops.c 67 | fi 68 | 69 | echo "Compilation complete!" 70 | echo 71 | echo "==================================================================" 72 | echo "OPTIMIZATION SUMMARY" 73 | echo "==================================================================" 74 | echo "1. C Extensions: Basic operations in C for improved performance" 75 | echo "2. SIMD Vectorization: Using CPU vector instructions when available" 76 | echo " SIMD flags used: $SIMD_FLAGS" 77 | echo "3. Parallel Processing: Multi-threaded operations for batch processing" 78 | echo "4. Memory Optimization: Better memory layout and in-place operations" 79 | echo "5. 
GPU Acceleration: See gpu_acceleration.md for implementation options" 80 | echo "==================================================================" -------------------------------------------------------------------------------- /cpu_benchmark.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt" 4 | "tensor_optimized.rkt" 5 | "hardware_detection.rkt") 6 | 7 | (define (benchmark-matrix-multiplication sizes [iterations 5]) 8 | (printf "===== Matrix Multiplication Benchmark =====~n") 9 | 10 | (for ([size sizes]) 11 | (printf "Matrix size: ~a x ~a~n" size size) 12 | 13 | ;; Create random matrices 14 | (define A (t:random (list size size) 1.0)) 15 | (define B (t:random (list size size) 1.0)) 16 | 17 | ;; Create optimized matrices 18 | (define A-opt (t-opt:random (list size size) 1.0)) 19 | (define B-opt (t-opt:random (list size size) 1.0)) 20 | 21 | ;; Time standard Racket multiplication 22 | (define racket-times 23 | (for/list ([i (in-range iterations)]) 24 | (let ([start (current-inexact-milliseconds)]) 25 | (t:mul A B) 26 | (- (current-inexact-milliseconds) start)))) 27 | 28 | (define avg-racket-time (/ (apply + racket-times) iterations)) 29 | (printf " Standard Racket time: ~a ms~n" (real->decimal-string avg-racket-time 2)) 30 | 31 | ;; Time optimized C multiplication 32 | (define c-times 33 | (for/list ([i (in-range iterations)]) 34 | (let ([start (current-inexact-milliseconds)]) 35 | (t-opt:mul A-opt B-opt) 36 | (- (current-inexact-milliseconds) start)))) 37 | 38 | (define avg-c-time (/ (apply + c-times) iterations)) 39 | (printf " Optimized C time: ~a ms~n" (real->decimal-string avg-c-time 2)) 40 | 41 | ;; Calculate speedup 42 | (define speedup (/ avg-racket-time avg-c-time)) 43 | (printf " Speedup: ~a times faster~n" (real->decimal-string speedup 2)) 44 | 45 | (printf "~n"))) 46 | 47 | (define (benchmark-elementwise-operations size [iterations 5]) 48 | (printf "===== Element-wise Operations Benchmark =====~n") 49 | (printf "Vector size: ~a~n" size) 50 | 51 | ;; Create random vectors 52 | (define A (t:random (list size) 1.0)) 53 | (define B (t:random (list size) 1.0)) 54 | 55 | ;; Create optimized vectors 56 | (define A-opt (t-opt:random (list size) 1.0)) 57 | (define B-opt (t-opt:random (list size) 1.0)) 58 | 59 | ;; Time standard Racket addition 60 | (define racket-add-times 61 | (for/list ([i (in-range iterations)]) 62 | (let ([start (current-inexact-milliseconds)]) 63 | (t:add A B) 64 | (- (current-inexact-milliseconds) start)))) 65 | 66 | (define avg-racket-add-time (/ (apply + racket-add-times) iterations)) 67 | (printf " Standard Racket add time: ~a ms~n" (real->decimal-string avg-racket-add-time 2)) 68 | 69 | ;; Time optimized C addition 70 | (define c-add-times 71 | (for/list ([i (in-range iterations)]) 72 | (let ([start (current-inexact-milliseconds)]) 73 | (t-opt:add A-opt B-opt) 74 | (- (current-inexact-milliseconds) start)))) 75 | 76 | (define avg-c-add-time (/ (apply + c-add-times) iterations)) 77 | (printf " Optimized C add time: ~a ms~n" (real->decimal-string avg-c-add-time 2)) 78 | 79 | ;; Calculate speedup 80 | (define add-speedup (/ avg-racket-add-time avg-c-add-time)) 81 | (printf " Add speedup: ~a times faster~n" (real->decimal-string add-speedup 2)) 82 | 83 | (printf "~n")) 84 | 85 | (define (run-all-benchmarks) 86 | (printf "~n======================================~n") 87 | (printf " RACOGRAD CPU PERFORMANCE BENCHMARKS~n") 88 | (printf "======================================~n") 89 | 
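;; The individual benchmarks can also be invoked directly (illustrative usage of the functions above):
;;   (benchmark-matrix-multiplication '(256) 3)   ; one 256x256 case, 3 iterations
;;   (benchmark-elementwise-operations 500000 10) ; 500k-element vectors, 10 iterations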
90 | (print-hardware-info) 91 | 92 | (benchmark-matrix-multiplication '(100 500 1000)) 93 | (benchmark-elementwise-operations 1000000) 94 | 95 | (printf "======================================~n") 96 | (printf "Benchmarks complete!~n") 97 | (printf "======================================~n")) 98 | 99 | ;; Run benchmarks when executed directly 100 | (module+ main 101 | (run-all-benchmarks)) -------------------------------------------------------------------------------- /cuda_ops.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <math.h> 5 | 6 | // #include <cuda_runtime.h> // Would be included in a real CUDA implementation 7 | 8 | // Placeholder function to check if CUDA is available 9 | int check_cuda_available() { 10 | // This would actually check for CUDA-capable devices 11 | // For now, just a placeholder that returns 0 (not available) 12 | return 0; 13 | } 14 | 15 | // Matrix multiplication using CUDA 16 | // This is a placeholder for actual CUDA implementation 17 | int matrix_multiply_cuda(int M, int N, int K, double* A, double* B, double* C) { 18 | printf("CUDA Matrix multiplication called (placeholder).\n"); 19 | printf("Dimensions: (%d x %d) * (%d x %d) = (%d x %d)\n", M, K, K, N, M, N); 20 | 21 | // Fall back to CPU implementation for now 22 | for (int i = 0; i < M; i++) { 23 | for (int j = 0; j < N; j++) { 24 | double sum = 0.0; 25 | for (int k = 0; k < K; k++) { 26 | sum += A[i * K + k] * B[k * N + j]; 27 | } 28 | C[i * N + j] = sum; 29 | } 30 | } 31 | 32 | return 0; 33 | } 34 | 35 | // These are all placeholder functions that would actually use CUDA 36 | // in a real implementation 37 | 38 | int tensor_add_cuda(int size, double* a, double* b, double* result) { 39 | printf("CUDA tensor add called (placeholder).\n"); 40 | for (int i = 0; i < size; i++) { 41 | result[i] = a[i] + b[i]; 42 | } 43 | return 0; 44 | } 45 | 46 | int tensor_sub_cuda(int size, double* a, double* b, double* result) { 47 | printf("CUDA tensor subtract called (placeholder).\n"); 48 | for (int i = 0; i < size; i++) { 49 | result[i] = a[i] - b[i]; 50 | } 51 | return 0; 52 | } 53 | 54 | int tensor_mul_elementwise_cuda(int size, double* a, double* b, double* result) { 55 | printf("CUDA tensor element-wise multiply called (placeholder).\n"); 56 | for (int i = 0; i < size; i++) { 57 | result[i] = a[i] * b[i]; 58 | } 59 | return 0; 60 | } 61 | 62 | int relu_forward_cuda(int size, double* input, double* output) { 63 | printf("CUDA ReLU forward called (placeholder).\n"); 64 | for (int i = 0; i < size; i++) { 65 | output[i] = input[i] > 0 ? input[i] : 0; 66 | } 67 | return 0; 68 | } 69 | 70 | int relu_backward_cuda(int size, double* input, double* output) { 71 | printf("CUDA ReLU backward called (placeholder).\n"); 72 | for (int i = 0; i < size; i++) { 73 | output[i] = input[i] > 0 ?
1 : 0; 74 | } 75 | return 0; 76 | } -------------------------------------------------------------------------------- /debuggin_lib.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt") 4 | (provide dense-forward mean-squared-error dense-backward relu relu-derivative initialize-fnn sigmoid sigmoid-derivative) 5 | 6 | (define (relu x) 7 | (tensor (tensor-shape x) (for/vector ([v (tensor-data x)]) (max 0 v)))) 8 | 9 | (define (relu-derivative x) 10 | (tensor (tensor-shape x) (for/vector ([v (tensor-data x)]) (if (> v 0) 1 0)))) 11 | 12 | (define (sigmoid x) 13 | (tensor (tensor-shape x) (for/vector ([v (tensor-data x)]) (/ 1 (+ 1 (exp (- v))))))) 14 | 15 | (define (sigmoid-derivative x) 16 | (let ([sig (sigmoid x)]) 17 | (tensor (tensor-shape x) (for/vector ([v (tensor-data sig)]) (* v (- 1 v)))))) 18 | 19 | (define (tanh x) 20 | (tensor (tensor-shape x) (for/vector ([v (tensor-data x)]) (/ (- (exp v) (exp (- v))) (+ (exp v) (exp (- v))))))) 21 | 22 | (define (tanh-derivative x) 23 | (let ([t (tanh x)]) 24 | (tensor (tensor-shape x) (for/vector ([v (tensor-data t)]) (- 1 (* v v)))))) 25 | 26 | (define (dense-forward input weights biases activation-fn) 27 | (let* ([mul-result (tensor-multiply input weights)] 28 | [mul-result-shape (tensor-shape mul-result)] 29 | [output-dim (cadr mul-result-shape)] 30 | [reshaped-biases (reshape-tensor biases (list output-dim))] 31 | [z (tensor-add mul-result reshaped-biases)] 32 | [activation-output (activation-fn z)]) 33 | activation-output)) 34 | 35 | (define (mean-squared-error y-true y-pred) 36 | (let* ([diff (tensor-subtract y-true y-pred)] 37 | [squared-diff (tensor-multiply diff diff)] 38 | [sum (apply + (vector->list (tensor-data squared-diff)))]) 39 | (/ sum (length (vector->list (tensor-data y-true)))))) 40 | 41 | (define (dense-backward input weights biases output grad-output activation-derivative learning-rate) 42 | (displayln "dense-backward: Starting") 43 | (displayln (string-append "Input shape: " (format "~a" (tensor-shape input)))) 44 | (displayln (string-append "Weights shape: " (format "~a" (tensor-shape weights)))) 45 | (displayln (string-append "Biases shape: " (format "~a" (tensor-shape biases)))) 46 | (displayln (string-append "Output shape: " (format "~a" (tensor-shape output)))) 47 | (displayln (string-append "Grad-output shape: " (format "~a" (tensor-shape grad-output)))) 48 | 49 | (let* ([grad-activation (activation-derivative output)] 50 | [_ (displayln (string-append "Grad-activation shape: " (format "~a" (tensor-shape grad-activation))))] 51 | [grad-z (tensor-multiply grad-output grad-activation)] 52 | [_ (displayln (string-append "Grad-z shape: " (format "~a" (tensor-shape grad-z))))] 53 | [grad-weights (tensor-multiply (transpose input) grad-z)] 54 | [_ (displayln (string-append "Grad-weights shape: " (format "~a" (tensor-shape grad-weights))))] 55 | [grad-biases (tensor (list (vector-length (tensor-data biases))) 56 | (for/vector ([j (vector-length (tensor-data biases))]) 57 | (apply + (for/list ([i (car (tensor-shape grad-z))]) 58 | (vector-ref (tensor-data grad-z) (+ (* i (vector-length (tensor-data biases))) j))))))] 59 | [_ (displayln (string-append "Grad-biases shape: " (format "~a" (tensor-shape grad-biases))))] 60 | [grad-input (tensor-multiply grad-z (transpose weights))] 61 | [_ (displayln (string-append "Grad-input shape: " (format "~a" (tensor-shape grad-input))))]) 62 | (displayln "dense-backward: Finished") 63 | (values grad-weights 
grad-biases grad-input))) 64 | 65 | (define (initialize-fnn batch-size input-dim output-dim) 66 | (let* ([input-data (make-list (* batch-size input-dim) 0)] 67 | [input-tensor (create-tensor (list batch-size input-dim) input-data)] 68 | [weight-shape (list input-dim output-dim)] 69 | [bias-shape (list output-dim)] ; Ensure same size as output dim 70 | [weights (random-tensor weight-shape 1.0)] 71 | [biases (random-tensor bias-shape 1.0)]) 72 | (values input-tensor weights biases))) 73 | -------------------------------------------------------------------------------- /demo.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "mnist.rkt" 4 | "CNN.rkt" 5 | "visualization.rkt" 6 | "tensor.rkt" 7 | "device.rkt" 8 | "hardware_detection.rkt") 9 | 10 | (provide run-mnist-demo run-cnn-demo run-full-demo) 11 | 12 | ;; Function to run MNIST demo 13 | (define (run-mnist-demo [epochs 5] [batch-size 64]) 14 | (printf "Running MNIST Logistic Regression Demo~n") 15 | (printf "=====================================~n") 16 | 17 | ;; Here we would customize mnist.rkt parameters and run it 18 | ;; For demo purposes, we'll just use the default behavior 19 | (system "racket -t mnist.rkt -m") 20 | 21 | (printf "MNIST Demo Complete!~n~n")) 22 | 23 | ;; Function to run CNN demo 24 | (define (run-cnn-demo [device 'cpu] [epochs 3] [batch-size 32]) 25 | (printf "Running CNN Demo on device: ~a~n" device) 26 | (printf "==================================~n") 27 | 28 | ;; Run CNN training with specified parameters 29 | (let ([results (train-cnn device epochs batch-size)]) 30 | (printf "CNN Training Results:~n") 31 | (printf " Accuracy: ~a%~n" (hash-ref results 'accuracy)) 32 | (printf " Training Time: ~a seconds~n" (hash-ref results 'time)) 33 | (printf " Device: ~a~n" (hash-ref results 'device)) 34 | 35 | results)) 36 | 37 | ;; Function to run hardware detection demo 38 | (define (run-hardware-demo) 39 | (printf "Hardware Detection Demo~n") 40 | (printf "======================~n") 41 | 42 | (printf "CPU: Available~n") 43 | 44 | (printf "GPU: ~a~n" 45 | (if (gpu-available?) "Available" "Not available")) 46 | 47 | (printf "MLX (Apple Silicon): ~a~n" 48 | (if (device-available? 'mlx) "Available" "Not available")) 49 | 50 | (printf "Current device: ~a~n" (get-device-type (current-device))) 51 | 52 | (printf "Hardware Demo Complete!~n~n")) 53 | 54 | ;; Function to run full demo 55 | (define (run-full-demo) 56 | (printf "RacoGrad Full Demo~n") 57 | (printf "=================~n~n") 58 | 59 | ;; Run hardware detection 60 | (run-hardware-demo) 61 | 62 | ;; Run MNIST demo 63 | (run-mnist-demo 3 64) 64 | 65 | ;; Run CNN demo on best available device 66 | (let ([device (cond 67 | [(device-available? 'mlx) 'mlx] 68 | [(gpu-available?) 
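;; gpu-available? is true for any backend: MLX, CUDA, or OpenCL (see device.rkt)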
'gpu] 69 | [else 'cpu])]) 70 | (run-cnn-demo device 2 32)) 71 | 72 | (printf "Full Demo Complete!~n")) 73 | 74 | ;; Run the demo when this file is executed directly 75 | (module+ main 76 | (define mode (if (> (vector-length (current-command-line-arguments)) 0) 77 | (string->symbol (vector-ref (current-command-line-arguments) 0)) 78 | 'full)) 79 | 80 | (case mode 81 | [(mnist) (run-mnist-demo)] 82 | [(cnn) (run-cnn-demo)] 83 | [(hardware) (run-hardware-demo)] 84 | [else (run-full-demo)])) -------------------------------------------------------------------------------- /device.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "hardware_detection.rkt") 4 | 5 | (provide my-device? 6 | cpu-device? 7 | gpu-device? 8 | mlx-device? 9 | cuda-device? 10 | opencl-device? 11 | 12 | make-device 13 | cpu 14 | gpu 15 | mlx 16 | cuda 17 | opencl 18 | 19 | current-device 20 | set-current-device! 21 | device-synchronize 22 | 23 | get-device-type 24 | device-available? 25 | gpu-available?) 26 | 27 | ;; Define device struct 28 | (struct device (type properties) #:transparent #:mutable) 29 | 30 | ;; Define device types 31 | (define CPU 'cpu) 32 | (define GPU 'gpu) 33 | (define MLX 'mlx) 34 | (define CUDA 'cuda) 35 | (define OPENCL 'opencl) 36 | 37 | ;; Helper predicates 38 | (define (my-device? obj) 39 | (device? obj)) 40 | 41 | (define (cpu-device? dev) 42 | (and (my-device? dev) (eq? (device-type dev) CPU))) 43 | 44 | (define (gpu-device? dev) 45 | (and (my-device? dev) 46 | (or (eq? (device-type dev) GPU) 47 | (eq? (device-type dev) MLX) 48 | (eq? (device-type dev) CUDA) 49 | (eq? (device-type dev) OPENCL)))) 50 | 51 | (define (mlx-device? dev) 52 | (and (my-device? dev) (eq? (device-type dev) MLX))) 53 | 54 | (define (cuda-device? dev) 55 | (and (my-device? dev) (eq? (device-type dev) CUDA))) 56 | 57 | (define (opencl-device? dev) 58 | (and (my-device? dev) (eq? (device-type dev) OPENCL))) 59 | 60 | ;; Default devices 61 | (define cpu-device (device CPU (hash 'cores (get-optimal-num-threads)))) 62 | 63 | ;; Choose the best available GPU device 64 | (define gpu-device 65 | (cond 66 | [(has-mlx-support?) 67 | (device MLX (hash 'backend 'mlx))] 68 | [(has-cuda-support?) 69 | (device CUDA (hash 'backend 'cuda))] 70 | [(has-opencl?) 71 | (device OPENCL (hash 'backend 'opencl))] 72 | [else #f])) 73 | 74 | ;; Specific GPU type devices 75 | (define mlx-device 76 | (if (has-mlx-support?) 77 | (device MLX (hash 'backend 'mlx)) 78 | #f)) 79 | 80 | (define cuda-device 81 | (if (has-cuda-support?) 82 | (device CUDA (hash 'backend 'cuda)) 83 | #f)) 84 | 85 | (define opencl-device 86 | (if (has-opencl?) 
87 | (device OPENCL (hash 'backend 'opencl)) 88 | #f)) 89 | 90 | ;; Factory function 91 | (define (make-device type [properties (hash)]) 92 | (device type properties)) 93 | 94 | ;; Device accessors 95 | (define (cpu) 96 | cpu-device) 97 | 98 | (define (gpu) 99 | (if gpu-device 100 | gpu-device 101 | (error "No GPU device available on this system"))) 102 | 103 | (define (mlx) 104 | (if mlx-device 105 | mlx-device 106 | (error "MLX not available - requires Apple Silicon"))) 107 | 108 | (define (cuda) 109 | (if cuda-device 110 | cuda-device 111 | (error "CUDA not available - requires NVIDIA GPU"))) 112 | 113 | (define (opencl) 114 | (if opencl-device 115 | opencl-device 116 | (error "OpenCL not available on this system"))) 117 | 118 | ;; Current device management 119 | (define current-device-param (make-parameter cpu-device)) 120 | 121 | (define (current-device) 122 | (current-device-param)) 123 | 124 | (define (set-current-device! dev) 125 | (when (not (my-device? dev)) 126 | (error 'set-current-device! "expected a device, got ~a" dev)) 127 | (current-device-param dev)) 128 | 129 | ;; Device synchronization (no-op for CPU, will wait for GPU operations to complete) 130 | (define (device-synchronize [dev (current-device)]) 131 | (when (gpu-device? dev) 132 | (printf "Synchronizing GPU operations~n"))) 133 | 134 | ;; Get device type 135 | (define (get-device-type [dev (current-device)]) 136 | (device-type dev)) 137 | 138 | ;; Check if a device is available 139 | (define (device-available? type) 140 | (case type 141 | [(cpu) #t] 142 | [(gpu) (or (has-mlx-support?) (has-cuda-support?) (has-opencl?))] 143 | [(mlx) (has-mlx-support?)] 144 | [(cuda) (has-cuda-support?)] 145 | [(opencl) (has-opencl?)] 146 | [else #f])) 147 | 148 | ;; Helper to check if any GPU is available 149 | (define (gpu-available?) 150 | (or (has-mlx-support?) (has-cuda-support?)
(has-opencl?))) 151 | 152 | ;; Print hardware info on load 153 | (print-hardware-info) -------------------------------------------------------------------------------- /ffi_ops.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require ffi/unsafe 4 | ffi/vector) 5 | 6 | (provide c:matrix-multiply 7 | c:tensor-add 8 | c:tensor-sub 9 | c:tensor-mul-elementwise 10 | c:tensor-scale 11 | c:relu-forward 12 | c:relu-backward 13 | c:sigmoid-forward 14 | c:sigmoid-backward) 15 | 16 | ;; Load the shared library 17 | (define matrix-lib (ffi-lib "matrix_multiplication")) 18 | 19 | ;; Define the FFI functions 20 | (define c:matrix-multiply 21 | (get-ffi-obj "matrix_multiply" matrix-lib 22 | (_fun _int _int _int _f64vector _f64vector _f64vector -> _void))) 23 | 24 | (define c:tensor-add 25 | (get-ffi-obj "tensor_add" matrix-lib 26 | (_fun _int _f64vector _f64vector _f64vector -> _void))) 27 | 28 | (define c:tensor-sub 29 | (get-ffi-obj "tensor_sub" matrix-lib 30 | (_fun _int _f64vector _f64vector _f64vector -> _void))) 31 | 32 | (define c:tensor-mul-elementwise 33 | (get-ffi-obj "tensor_mul_elementwise" matrix-lib 34 | (_fun _int _f64vector _f64vector _f64vector -> _void))) 35 | 36 | (define c:tensor-scale 37 | (get-ffi-obj "tensor_scale" matrix-lib 38 | (_fun _int _f64vector _double _f64vector -> _void))) 39 | 40 | (define c:relu-forward 41 | (get-ffi-obj "relu_forward" matrix-lib 42 | (_fun _int _f64vector _f64vector -> _void))) 43 | 44 | (define c:relu-backward 45 | (get-ffi-obj "relu_backward" matrix-lib 46 | (_fun _int _f64vector _f64vector -> _void))) 47 | 48 | (define c:sigmoid-forward 49 | (get-ffi-obj "sigmoid_forward" matrix-lib 50 | (_fun _int _f64vector _f64vector -> _void))) 51 | 52 | (define c:sigmoid-backward 53 | (get-ffi-obj "sigmoid_backward" matrix-lib 54 | (_fun _int _f64vector _f64vector -> _void))) -------------------------------------------------------------------------------- /gpu_acceleration.md: -------------------------------------------------------------------------------- 1 | # GPU Acceleration Guide for RacoGrad 2 | 3 | This document outlines multiple approaches to add GPU acceleration to RacoGrad without requiring deep CUDA knowledge. 4 | 5 | ## 1. OpenCL Integration (Recommended for beginners) 6 | 7 | OpenCL is more accessible than CUDA and works across different GPU vendors (AMD, NVIDIA, Intel). 8 | 9 | ### Setup Steps: 10 | 11 | 1. Install OpenCL development kit: 12 | - macOS: Already included in the OS 13 | - Linux: `sudo apt install opencl-headers ocl-icd-opencl-dev` 14 | - Windows: Install vendor-specific OpenCL SDK 15 | 16 | 2. Create OpenCL kernels for key operations (matrix multiplication example): 17 | 18 | ```c 19 | // matrix_mul.cl 20 | __kernel void matrix_multiply( 21 | const int M, const int N, const int K, 22 | __global const float* A, 23 | __global const float* B, 24 | __global float* C) 25 | { 26 | // Get global position in Y direction 27 | int row = get_global_id(0); 28 | // Get global position in X direction 29 | int col = get_global_id(1); 30 | 31 | float sum = 0.0f; 32 | for (int i = 0; i < K; i++) { 33 | sum += A[row * K + i] * B[i * N + col]; 34 | } 35 | 36 | C[row * N + col] = sum; 37 | } 38 | ``` 39 | 40 | 3. Racket FFI to OpenCL: 41 | 42 | ```racket 43 | ;; Basic OpenCL FFI setup (simplified) 44 | (define opencl-lib (ffi-lib "OpenCL")) 45 | (define clCreateProgramWithSource 46 | (get-ffi-obj "clCreateProgramWithSource" opencl-lib (_fun ...))) 47 | ;; ... 
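;; A speculative continuation of this sketch (the _fun signature below is an
;; illustrative assumption, not a verified binding of the OpenCL API):
;; (define clBuildProgram
;;   (get-ffi-obj "clBuildProgram" opencl-lib
;;                (_fun _pointer _uint _pointer _string _pointer _pointer -> _int)))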
;; more FFI definitions 48 | ``` 49 | 50 | ## 2. Use Existing GPU Libraries 51 | 52 | Instead of writing OpenCL code directly, leverage existing libraries: 53 | 54 | ### ArrayFire 55 | 56 | ArrayFire is a high-performance library with GPU support: 57 | 58 | ``` 59 | # Install ArrayFire 60 | brew install arrayfire # macOS 61 | ``` 62 | 63 | Connect via FFI to ArrayFire's C API for common operations. 64 | 65 | ### ONNX Runtime 66 | 67 | ONNX Runtime provides GPU acceleration: 68 | 69 | 1. Export your models to ONNX format 70 | 2. Use ONNX Runtime for inference 71 | 3. Connect via FFI to ONNX Runtime's C API 72 | 73 | ### TensorFlow or PyTorch C++ APIs 74 | 75 | Both frameworks offer C++ APIs that can be connected through FFI. 76 | 77 | ## 3. Vulkan Compute 78 | 79 | For newer GPUs, Vulkan Compute is another option: 80 | 81 | 1. Vulkan SDK installation 82 | 2. Kompute library for simplified Vulkan Compute 83 | 84 | ## Implementation Strategy 85 | 86 | 1. Start with the simplest option: OpenCL for critical operations 87 | 2. Develop a fallback CPU path for compatibility 88 | 3. Profile to identify bottlenecks 89 | 4. Gradually replace more operations with GPU implementations 90 | 91 | ## Compilation Instructions 92 | 93 | For OpenCL on macOS: 94 | ``` 95 | clang -framework OpenCL -o matrix_opencl matrix_opencl.c 96 | ``` 97 | 98 | For OpenCL on Linux: 99 | ``` 100 | gcc -o matrix_opencl matrix_opencl.c -lOpenCL 101 | ``` -------------------------------------------------------------------------------- /gpu_benchmark.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor_device.rkt" 4 | "device.rkt" 5 | "hardware_detection.rkt") 6 | 7 | (define (benchmark-matrix-multiplication sizes [iterations 5]) 8 | (printf "===== Matrix Multiplication Benchmark =====~n") 9 | 10 | (for ([size sizes]) 11 | (printf "Matrix size: ~a x ~a~n" size size) 12 | 13 | ;; Create random matrices on CPU 14 | (define A-cpu (dt:random (list size size) 1.0 (cpu))) 15 | (define B-cpu (dt:random (list size size) 1.0 (cpu))) 16 | 17 | ;; Time CPU multiplication 18 | (define cpu-times 19 | (for/list ([i (in-range iterations)]) 20 | (let ([start (current-inexact-milliseconds)]) 21 | (dt:mul A-cpu B-cpu) 22 | (- (current-inexact-milliseconds) start)))) 23 | 24 | (define avg-cpu-time (/ (apply + cpu-times) iterations)) 25 | (printf " CPU time: ~a ms~n" (real->decimal-string avg-cpu-time 2)) 26 | 27 | ;; If GPU is available, benchmark on GPU 28 | (when (gpu-available?)
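;; dt:to is used here to place copies of the tensors on the GPU device;
;; the timings below then measure the device code path of dt:mul and dt:add.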
29 | ;; Move matrices to GPU 30 | (define A-gpu (dt:to A-cpu (gpu))) 31 | (define B-gpu (dt:to B-cpu (gpu))) 32 | 33 | ;; Time GPU multiplication 34 | (define gpu-times 35 | (for/list ([i (in-range iterations)]) 36 | (let ([start (current-inexact-milliseconds)]) 37 | (dt:mul A-gpu B-gpu) 38 | (- (current-inexact-milliseconds) start)))) 39 | 40 | (define avg-gpu-time (/ (apply + gpu-times) iterations)) 41 | (printf " GPU time: ~a ms~n" (real->decimal-string avg-gpu-time 2)) 42 | 43 | ;; Calculate speedup 44 | (define speedup (/ avg-cpu-time avg-gpu-time)) 45 | (printf " Speedup: ~a times faster~n" (real->decimal-string speedup 2))) 46 | 47 | (printf "~n"))) 48 | 49 | (define (benchmark-elementwise-operations size [iterations 5]) 50 | (printf "===== Element-wise Operations Benchmark =====~n") 51 | (printf "Vector size: ~a~n" size) 52 | 53 | ;; Create random vectors on CPU 54 | (define A-cpu (dt:random (list size) 1.0 (cpu))) 55 | (define B-cpu (dt:random (list size) 1.0 (cpu))) 56 | 57 | ;; Time CPU addition 58 | (define cpu-add-times 59 | (for/list ([i (in-range iterations)]) 60 | (let ([start (current-inexact-milliseconds)]) 61 | (dt:add A-cpu B-cpu) 62 | (- (current-inexact-milliseconds) start)))) 63 | 64 | (define avg-cpu-add-time (/ (apply + cpu-add-times) iterations)) 65 | (printf " CPU add time: ~a ms~n" (real->decimal-string avg-cpu-add-time 2)) 66 | 67 | ;; If GPU is available, benchmark on GPU 68 | (when (gpu-available?) 69 | ;; Move vectors to GPU 70 | (define A-gpu (dt:to A-cpu (gpu))) 71 | (define B-gpu (dt:to B-cpu (gpu))) 72 | 73 | ;; Time GPU addition 74 | (define gpu-add-times 75 | (for/list ([i (in-range iterations)]) 76 | (let ([start (current-inexact-milliseconds)]) 77 | (dt:add A-gpu B-gpu) 78 | (- (current-inexact-milliseconds) start)))) 79 | 80 | (define avg-gpu-add-time (/ (apply + gpu-add-times) iterations)) 81 | (printf " GPU add time: ~a ms~n" (real->decimal-string avg-gpu-add-time 2)) 82 | 83 | ;; Calculate speedup 84 | (define add-speedup (/ avg-cpu-add-time avg-gpu-add-time)) 85 | (printf " Add speedup: ~a times faster~n" (real->decimal-string add-speedup 2))) 86 | 87 | (printf "~n")) 88 | 89 | (define (run-all-benchmarks) 90 | (printf "~n======================================~n") 91 | (printf " RACOGRAD GPU PERFORMANCE BENCHMARKS~n") 92 | (printf "======================================~n") 93 | 94 | (print-hardware-info) 95 | 96 | (benchmark-matrix-multiplication '(100 500 1000)) 97 | (benchmark-elementwise-operations 1000000) 98 | 99 | (printf "======================================~n") 100 | (printf "Benchmarks complete!~n") 101 | (printf "======================================~n")) 102 | 103 | ;; Run benchmarks when executed directly 104 | (module+ main 105 | (printf "Checking GPU availability...~n") 106 | (if (gpu-available?) 107 | (run-all-benchmarks) 108 | (begin 109 | (printf "GPU acceleration not available on this system.~n") 110 | (printf "Running CPU-only benchmarks...~n") 111 | (benchmark-matrix-multiplication '(100 500 1000))))) -------------------------------------------------------------------------------- /hardware_detection.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require ffi/unsafe) 4 | 5 | (provide detect-hardware-capabilities 6 | get-optimal-num-threads 7 | has-avx? 8 | has-sse? 9 | has-opencl? 10 | is-apple-silicon? 11 | has-mlx-support? 12 | has-cuda-support? 
13 | print-hardware-info) 14 | 15 | ;; Safely try to load the libraries with proper error handling 16 | (define libcheck 17 | (with-handlers ([exn:fail? (lambda (e) #f)]) 18 | (ffi-lib "simd_ops" '("" "0")))) 19 | 20 | ;; Safely try to load the libraries and their availability checks 21 | (define opencl-lib 22 | (with-handlers ([exn:fail? (lambda (e) #f)]) 23 | (ffi-lib "matrix_opencl" '("" "0")))) 24 | 25 | (define mlx-lib 26 | (with-handlers ([exn:fail? (lambda (e) #f)]) 27 | (ffi-lib "mlx_ops" '("" "0")))) 28 | 29 | (define cuda-lib 30 | (with-handlers ([exn:fail? (lambda (e) #f)]) 31 | (ffi-lib "cuda_ops" '("" "0")))) 32 | 33 | ;; Import availability check functions if libraries are available 34 | (define check-opencl-available 35 | (if opencl-lib 36 | (with-handlers ([exn:fail? (lambda (e) (lambda () 0))]) 37 | (get-ffi-obj "check_opencl_available" opencl-lib (_fun -> _int))) 38 | (lambda () 0))) 39 | 40 | (define check-mlx-available 41 | (if mlx-lib 42 | (with-handlers ([exn:fail? (lambda (e) (lambda () 0))]) 43 | (get-ffi-obj "check_mlx_available" mlx-lib (_fun -> _int))) 44 | (lambda () 0))) 45 | 46 | (define check-cuda-available 47 | (if cuda-lib 48 | (with-handlers ([exn:fail? (lambda (e) (lambda () 0))]) 49 | (get-ffi-obj "check_cuda_available" cuda-lib (_fun -> _int))) 50 | (lambda () 0))) 51 | 52 | ;; Number of physical cores detection 53 | (define (detect-num-cores) 54 | (cond 55 | [(equal? (system-type 'os) 'unix) 56 | (with-handlers ([exn:fail? (lambda (_) 4)]) ; Default to 4 if detection fails 57 | (let* ([output (with-output-to-string 58 | (lambda () (system "sysctl -n hw.physicalcpu 2>/dev/null || 59 | grep -c ^processor /proc/cpuinfo 2>/dev/null || 60 | echo 4")))] 61 | [num (string->number (string-trim output))]) 62 | (if num num 4)))] 63 | [(equal? (system-type 'os) 'windows) 64 | (with-handlers ([exn:fail? (lambda (_) 4)]) 65 | (let* ([output (with-output-to-string 66 | (lambda () (system "echo %NUMBER_OF_PROCESSORS%")))] 67 | [num (string->number (string-trim output))]) 68 | (if num num 4)))] 69 | [else 4])) ; Default value 70 | 71 | ;; Check for SIMD instruction support 72 | (define (has-simd-support? type) 73 | (cond 74 | [(equal? (system-type 'os) 'unix) 75 | (case type 76 | [(avx) (with-handlers ([exn:fail? (lambda (_) #f)]) 77 | (let ([output (with-output-to-string 78 | (lambda () (system "grep -q avx /proc/cpuinfo && echo yes || 79 | sysctl -n machdep.cpu.features 2>/dev/null | grep -q AVX && echo yes")))]) 80 | (string-contains? (string-trim output) "yes")))] 81 | [(sse) (with-handlers ([exn:fail? (lambda (_) #f)]) 82 | (let ([output (with-output-to-string 83 | (lambda () (system "grep -q sse /proc/cpuinfo && echo yes || 84 | sysctl -n machdep.cpu.features 2>/dev/null | grep -q SSE && echo yes")))]) 85 | (string-contains? (string-trim output) "yes")))] 86 | [else #f])] 87 | [else #f])) ; Simplified for other OSes 88 | 89 | ;; Check for OpenCL support 90 | (define (has-opencl-support?) 91 | (with-handlers ([exn:fail? (lambda (e) #f)]) 92 | (and opencl-lib 93 | (procedure? check-opencl-available) 94 | (= (check-opencl-available) 1)))) 95 | 96 | ;; Check for Apple Silicon (M1/M2/M3) 97 | (define (detect-apple-silicon) 98 | (and (equal? (system-type 'os) 'macosx) 99 | (or (string-contains? (with-output-to-string 100 | (lambda () (system "uname -m"))) 101 | "arm64") 102 | (string-contains? 
(with-output-to-string 103 | (lambda () (system "sysctl -n machdep.cpu.brand_string 2>/dev/null"))) 104 | "Apple")))) 105 | 106 | ;; Check for MLX availability on Apple Silicon 107 | (define (has-mlx-support?) 108 | (with-handlers ([exn:fail? (lambda (e) #f)]) 109 | (and (detect-apple-silicon) 110 | ;; Check for MLX library in standard locations 111 | (or mlx-lib 112 | (with-handlers ([exn:fail? (lambda (e) #f)]) 113 | (file-exists? "/opt/homebrew/lib/libmlx.dylib")) 114 | (with-handlers ([exn:fail? (lambda (e) #f)]) 115 | (file-exists? "/usr/local/lib/libmlx.dylib")) 116 | (with-handlers ([exn:fail? (lambda (e) #f)]) 117 | (file-exists? "/usr/lib/libmlx.dylib"))) 118 | ;; Fall back to environment variable check 119 | (or (and (procedure? check-mlx-available) 120 | (= (check-mlx-available) 1)) 121 | (getenv "MLX_AVAILABLE"))))) 122 | 123 | ;; Check for CUDA availability 124 | (define (has-cuda-support?) 125 | (with-handlers ([exn:fail? (lambda (e) #f)]) 126 | (and cuda-lib 127 | (procedure? check-cuda-available) 128 | (= (check-cuda-available) 1)))) 129 | 130 | ;; Memory size detection (in GB) 131 | (define (detect-memory-size) 132 | (cond 133 | [(equal? (system-type 'os) 'unix) 134 | (with-handlers ([exn:fail? (lambda (_) 8.0)]) ; Default to 8GB 135 | (let* ([output (with-output-to-string 136 | (lambda () (system "sysctl -n hw.memsize 2>/dev/null || 137 | grep MemTotal /proc/meminfo 2>/dev/null | awk '{print $2}' || 138 | echo 8589934592")))] 139 | [num (string->number (string-trim output))]) 140 | (if num 141 | (if (< num 1000000) ; If returned in KB (Linux) 142 | (/ num 1024 1024) 143 | (/ num 1024 1024 1024)) 144 | 8.0)))] 145 | [else 8.0])) ; Default for other OSes 146 | 147 | ;; Main hardware detection function 148 | (define (detect-hardware-capabilities) 149 | (let ([cores (detect-num-cores)] 150 | [avx (has-simd-support? 'avx)] 151 | [sse (has-simd-support? 'sse)] 152 | [opencl (has-opencl-support?)] 153 | [apple-silicon (detect-apple-silicon)] 154 | [mlx (has-mlx-support?)] 155 | [cuda (has-cuda-support?)] 156 | [memory (detect-memory-size)]) 157 | (hash 'cores cores 158 | 'avx avx 159 | 'sse sse 160 | 'opencl opencl 161 | 'apple-silicon apple-silicon 162 | 'mlx mlx 163 | 'cuda cuda 164 | 'memory memory))) 165 | 166 | ;; Cache the detection results 167 | (define hardware-info (detect-hardware-capabilities)) 168 | 169 | ;; Accessor functions 170 | (define (get-optimal-num-threads) 171 | (hash-ref hardware-info 'cores)) 172 | 173 | (define (has-avx?) 174 | (hash-ref hardware-info 'avx)) 175 | 176 | (define (has-sse?) 177 | (hash-ref hardware-info 'sse)) 178 | 179 | (define (has-opencl?) 180 | (hash-ref hardware-info 'opencl)) 181 | 182 | (define (is-apple-silicon?) 183 | (hash-ref hardware-info 'apple-silicon)) 184 | 185 | ;; Print hardware information with error handling 186 | (define (print-hardware-info) 187 | (with-handlers ([exn:fail? 
(lambda (e) 188 | (printf "======================================\n") 189 | (printf "Error detecting hardware capabilities: ~a\n" (exn-message e)) 190 | (printf "======================================\n"))]) 191 | (define info hardware-info) 192 | (printf "======================================\n") 193 | (printf "Hardware Capabilities for RacoGrad\n") 194 | (printf "======================================\n") 195 | (printf "CPU Cores: ~a\n" (hash-ref info 'cores 4)) 196 | (printf "AVX Support: ~a\n" (hash-ref info 'avx #f)) 197 | (printf "SSE Support: ~a\n" (hash-ref info 'sse #f)) 198 | (printf "OpenCL Available: ~a\n" (hash-ref info 'opencl #f)) 199 | (printf "Apple Silicon: ~a\n" (hash-ref info 'apple-silicon #f)) 200 | (printf "MLX Support: ~a\n" (hash-ref info 'mlx #f)) 201 | (printf "CUDA Support: ~a\n" (hash-ref info 'cuda #f)) 202 | (printf "System Memory: ~a GB\n" (hash-ref info 'memory 8.0)) 203 | (printf "--------------------------------------\n") 204 | (printf "Recommended Configuration:\n") 205 | (cond 206 | [(hash-ref info 'mlx #f) 207 | (printf "- Use MLX acceleration for Apple Silicon\n")] 208 | [(hash-ref info 'cuda #f) 209 | (printf "- Use CUDA acceleration for NVIDIA GPUs\n")] 210 | [(hash-ref info 'opencl #f) 211 | (printf "- Use OpenCL acceleration for cross-platform GPU support\n")] 212 | [(hash-ref info 'avx #f) 213 | (printf "- Use AVX-optimized functions\n")] 214 | [(hash-ref info 'sse #f) 215 | (printf "- Use SSE-optimized functions\n")] 216 | [else 217 | (printf "- Use basic C optimizations\n")]) 218 | (printf "- Use ~a threads for parallel operations\n" (hash-ref info 'cores 4)) 219 | (printf "======================================\n"))) -------------------------------------------------------------------------------- /load-mnist.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | (provide load-mnist-data) 3 | (require "tensor.rkt") 4 | 5 | (define (read-idx-file filename) 6 | (let* ([data (file->bytes filename)]) 7 | ;; Parse IDX file format: magic number is 4 bytes, big-endian (0, 0, type code, #dims) 8 | (let* ([magic-number (integer-bytes->integer data #f #t 0 4)] 9 | [data-type (bitwise-and (arithmetic-shift magic-number -8) #xFF)] ; byte 2 holds the element type code 10 | [dimension-count (bitwise-and magic-number #xFF)] ; byte 3 holds the number of dimensions 11 | [dimensions (for/list ([i (in-range dimension-count)]) 12 | (integer-bytes->integer data #f #t (+ 4 (* i 4)) (+ 8 (* i 4))))] 13 | [data-start (+ 4 (* dimension-count 4))] 14 | [data-length (- (bytes-length data) data-start)]) 15 | (cons dimensions (subbytes data data-start))))) ; return (dims . raw-bytes) as a single value 16 | 17 | (define (load-mnist-data type) 18 | (let* ([images-file (build-path "mnist-data" 19 | (format "~a-images.idx3-ubyte" type))] 20 | [labels-file (build-path "mnist-data" 21 | (format "~a-labels.idx1-ubyte" type))] 22 | [images-dims+data (read-idx-file images-file)] 23 | [labels-dims+data (read-idx-file labels-file)]) 24 | ;; Convert to tensors and return 25 | (values 26 | (t:create (list (car (car images-dims+data)) 784) 27 | (bytes->list (cdr images-dims+data))) 28 | (t:create (list (car (car labels-dims+data))) 29 | (bytes->list (cdr labels-dims+data)))))) -------------------------------------------------------------------------------- /matrix_multiplication.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <math.h> 5 | 6 | // Basic matrix multiplication 7 | void matrix_multiply(int rows_a, int cols_a, int cols_b, double *a, double *b, double *c) { 8 | for (int i = 0; i < rows_a; ++i) { 9 | for (int j = 0; j < cols_b;
++j) { 10 | double sum = 0.0; 11 | for (int k = 0; k < cols_a; ++k) { 12 | sum += a[i * cols_a + k] * b[k * cols_b + j]; 13 | } 14 | c[i * cols_b + j] = sum; 15 | } 16 | } 17 | } 18 | 19 | // Add element-wise tensor operations 20 | void tensor_add(int size, double *a, double *b, double *result) { 21 | for (int i = 0; i < size; i++) { 22 | result[i] = a[i] + b[i]; 23 | } 24 | } 25 | 26 | void tensor_sub(int size, double *a, double *b, double *result) { 27 | for (int i = 0; i < size; i++) { 28 | result[i] = a[i] - b[i]; 29 | } 30 | } 31 | 32 | void tensor_mul_elementwise(int size, double *a, double *b, double *result) { 33 | for (int i = 0; i < size; i++) { 34 | result[i] = a[i] * b[i]; 35 | } 36 | } 37 | 38 | void tensor_scale(int size, double *a, double scalar, double *result) { 39 | for (int i = 0; i < size; i++) { 40 | result[i] = a[i] * scalar; 41 | } 42 | } 43 | 44 | // Activation functions in C for speed 45 | void relu_forward(int size, double *input, double *output) { 46 | for (int i = 0; i < size; i++) { 47 | output[i] = input[i] > 0 ? input[i] : 0; 48 | } 49 | } 50 | 51 | void relu_backward(int size, double *input, double *output) { 52 | for (int i = 0; i < size; i++) { 53 | output[i] = input[i] > 0 ? 1 : 0; 54 | } 55 | } 56 | 57 | void sigmoid_forward(int size, double *input, double *output) { 58 | for (int i = 0; i < size; i++) { 59 | output[i] = 1.0 / (1.0 + exp(-input[i])); 60 | } 61 | } 62 | 63 | void sigmoid_backward(int size, double *input, double *output) { 64 | for (int i = 0; i < size; i++) { 65 | double sigmoid_val = 1.0 / (1.0 + exp(-input[i])); 66 | output[i] = sigmoid_val * (1.0 - sigmoid_val); 67 | } 68 | } -------------------------------------------------------------------------------- /matrix_opencl.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | 5 | #ifdef __APPLE__ 6 | #include <OpenCL/opencl.h> 7 | #else 8 | #include <CL/cl.h> 9 | #endif 10 | 11 | // Matrix multiplication kernel as a string 12 | const char* matrixMulKernelSource = 13 | "__kernel void matrixMul(\n" 14 | " const int M, const int N, const int K,\n" 15 | " __global const double* A,\n" 16 | " __global const double* B,\n" 17 | " __global double* C) {\n" 18 | " \n" 19 | " // Get global position in the grid\n" 20 | " const int row = get_global_id(0);\n" 21 | " const int col = get_global_id(1);\n" 22 | " \n" 23 | " // Ensure we don't go out of bounds\n" 24 | " if (row < M && col < N) {\n" 25 | " double sum = 0.0;\n" 26 | " for (int k = 0; k < K; k++) {\n" 27 | " sum += A[row * K + k] * B[k * N + col];\n" 28 | " }\n" 29 | " C[row * N + col] = sum;\n" 30 | " }\n" 31 | "}\n"; 32 | 33 | // OpenCL context and related variables 34 | cl_context context = NULL; 35 | cl_command_queue queue = NULL; 36 | cl_program program = NULL; 37 | cl_kernel kernel = NULL; 38 | cl_int err; 39 | cl_bool is_opencl_initialized = CL_FALSE; 40 | 41 | // Initialize OpenCL 42 | cl_bool initialize_opencl() { 43 | if (is_opencl_initialized) return CL_TRUE; 44 | 45 | // Get platform and device information 46 | cl_platform_id platform_id = NULL; 47 | cl_device_id device_id = NULL; 48 | cl_uint ret_num_devices; 49 | cl_uint ret_num_platforms; 50 | 51 | err = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); 52 | if (err != CL_SUCCESS) { 53 | printf("Failed to get platform ID: %d\n", err); 54 | return CL_FALSE; 55 | } 56 | 57 | err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices); 58 | if (err != CL_SUCCESS) { 59 | // Try CPU if GPU is not available 60
| err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices); 61 | if (err != CL_SUCCESS) { 62 | printf("Failed to get device ID: %d\n", err); 63 | return CL_FALSE; 64 | } 65 | printf("Using CPU device (GPU not available)\n"); 66 | } 67 | 68 | // Create an OpenCL context 69 | context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err); 70 | if (err != CL_SUCCESS) { 71 | printf("Failed to create context: %d\n", err); 72 | return CL_FALSE; 73 | } 74 | 75 | // Create a command queue 76 | queue = clCreateCommandQueue(context, device_id, 0, &err); 77 | if (err != CL_SUCCESS) { 78 | printf("Failed to create command queue: %d\n", err); 79 | clReleaseContext(context); 80 | return CL_FALSE; 81 | } 82 | 83 | // Create the compute program from the source buffer 84 | program = clCreateProgramWithSource(context, 1, (const char **)&matrixMulKernelSource, NULL, &err); 85 | if (err != CL_SUCCESS) { 86 | printf("Failed to create program: %d\n", err); 87 | clReleaseCommandQueue(queue); 88 | clReleaseContext(context); 89 | return CL_FALSE; 90 | } 91 | 92 | // Build the program executable 93 | err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); 94 | if (err != CL_SUCCESS) { 95 | size_t len; 96 | char buffer[2048]; 97 | 98 | printf("Failed to build program executable: %d\n", err); 99 | clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); 100 | printf("%s\n", buffer); 101 | 102 | clReleaseProgram(program); 103 | clReleaseCommandQueue(queue); 104 | clReleaseContext(context); 105 | return CL_FALSE; 106 | } 107 | 108 | // Create the compute kernel 109 | kernel = clCreateKernel(program, "matrixMul", &err); 110 | if (err != CL_SUCCESS) { 111 | printf("Failed to create kernel: %d\n", err); 112 | clReleaseProgram(program); 113 | clReleaseCommandQueue(queue); 114 | clReleaseContext(context); 115 | return CL_FALSE; 116 | } 117 | 118 | is_opencl_initialized = CL_TRUE; 119 | return CL_TRUE; 120 | } 121 | 122 | // Clean up OpenCL resources 123 | void cleanup_opencl() { 124 | if (kernel) clReleaseKernel(kernel); 125 | if (program) clReleaseProgram(program); 126 | if (queue) clReleaseCommandQueue(queue); 127 | if (context) clReleaseContext(context); 128 | is_opencl_initialized = CL_FALSE; 129 | } 130 | 131 | // Matrix multiplication using OpenCL 132 | int matrix_multiply_opencl(int M, int N, int K, double* A, double* B, double* C) { 133 | cl_mem d_A, d_B, d_C; 134 | size_t global_work_size[2]; 135 | const size_t A_size = M * K * sizeof(double); 136 | const size_t B_size = K * N * sizeof(double); 137 | const size_t C_size = M * N * sizeof(double); 138 | 139 | // Initialize OpenCL if not already done 140 | if (!initialize_opencl()) { 141 | return -1; 142 | } 143 | 144 | // Create memory buffers on the device 145 | d_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, A_size, A, &err); 146 | if (err != CL_SUCCESS) { 147 | printf("Failed to create buffer for A: %d\n", err); 148 | return -1; 149 | } 150 | 151 | d_B = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, B_size, B, &err); 152 | if (err != CL_SUCCESS) { 153 | printf("Failed to create buffer for B: %d\n", err); 154 | clReleaseMemObject(d_A); 155 | return -1; 156 | } 157 | 158 | d_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, C_size, NULL, &err); 159 | if (err != CL_SUCCESS) { 160 | printf("Failed to create buffer for C: %d\n", err); 161 | clReleaseMemObject(d_A); 162 | clReleaseMemObject(d_B); 163 | return -1; 164 | } 165 | 166 | // Set the arguments 
of the kernel
167 |     err = clSetKernelArg(kernel, 0, sizeof(int), (void *)&M);
168 |     err |= clSetKernelArg(kernel, 1, sizeof(int), (void *)&N);
169 |     err |= clSetKernelArg(kernel, 2, sizeof(int), (void *)&K);
170 |     err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&d_A);
171 |     err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void *)&d_B);
172 |     err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&d_C);
173 | 
174 |     if (err != CL_SUCCESS) {
175 |         printf("Failed to set kernel arguments: %d\n", err);
176 |         clReleaseMemObject(d_A);
177 |         clReleaseMemObject(d_B);
178 |         clReleaseMemObject(d_C);
179 |         return -1;
180 |     }
181 | 
182 |     // Define work group size and execute the kernel
183 |     global_work_size[0] = M;
184 |     global_work_size[1] = N;
185 | 
186 |     err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL);
187 |     if (err != CL_SUCCESS) {
188 |         printf("Failed to execute kernel: %d\n", err);
189 |         clReleaseMemObject(d_A);
190 |         clReleaseMemObject(d_B);
191 |         clReleaseMemObject(d_C);
192 |         return -1;
193 |     }
194 | 
195 |     // Read back the results from the device
196 |     err = clEnqueueReadBuffer(queue, d_C, CL_TRUE, 0, C_size, C, 0, NULL, NULL);
197 |     if (err != CL_SUCCESS) {
198 |         printf("Failed to read output matrix: %d\n", err);
199 |         clReleaseMemObject(d_A);
200 |         clReleaseMemObject(d_B);
201 |         clReleaseMemObject(d_C);
202 |         return -1;
203 |     }
204 | 
205 |     // Release resources
206 |     clReleaseMemObject(d_A);
207 |     clReleaseMemObject(d_B);
208 |     clReleaseMemObject(d_C);
209 | 
210 |     return 0;
211 | }
212 | 
213 | // Expose a function that can be called to check if OpenCL is available
214 | int check_opencl_available() {
215 |     cl_bool result = initialize_opencl();
216 |     if (result == CL_TRUE) {
217 |         cleanup_opencl();
218 |         return 1;
219 |     }
220 |     return 0;
221 | }
--------------------------------------------------------------------------------
/mlx_cnn_ops.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <math.h>
5 | 
6 | #ifdef __APPLE__
7 | // To be replaced with actual MLX includes when available
8 | // #include
9 | #endif
10 | 
11 | // MLX version of convolution operation
12 | void mlx_conv2d_forward(int batch_size, int in_channels, int in_height, int in_width,
13 |                         int out_channels, int filter_height, int filter_width,
14 |                         int stride, int padding,
15 |                         double* input, double* filters, double* output) {
16 |     printf("MLX Convolution called - optimized for Apple Silicon\n");
17 | 
18 |     // For now, just call the CPU implementation
19 |     // In a real implementation, this would use MLX's optimized convolution
20 | 
21 |     int out_height = 1 + (in_height + 2 * padding - filter_height) / stride;
22 |     int out_width = 1 + (in_width + 2 * padding - filter_width) / stride;
23 | 
24 |     // Initialize output to zeros
25 |     int output_size = batch_size * out_channels * out_height * out_width;
26 |     memset(output, 0, output_size * sizeof(double));
27 | 
28 |     // Optimized for Apple Silicon - would use MLX operations
29 |     // Current implementation falls back to a somewhat optimized CPU version
30 | 
31 |     // Cache-efficient implementation
32 |     for (int b = 0; b < batch_size; b++) {
33 |         for (int oc = 0; oc < out_channels; oc++) {
34 |             for (int oh = 0; oh < out_height; oh++) {
35 |                 for (int ow = 0; ow < out_width; ow++) {
36 |                     int out_idx = ((b * out_channels + oc) * out_height + oh) * out_width + ow;
37 |                     double sum = 0.0;
38 | 
39 |                     for (int ic = 0; ic < in_channels; ic++) {
40 |                         for (int fh = 0; fh < 
filter_height; fh++) { 41 | int ih = oh * stride + fh - padding; 42 | if (ih < 0 || ih >= in_height) continue; 43 | 44 | for (int fw = 0; fw < filter_width; fw++) { 45 | int iw = ow * stride + fw - padding; 46 | if (iw < 0 || iw >= in_width) continue; 47 | 48 | int in_idx = ((b * in_channels + ic) * in_height + ih) * in_width + iw; 49 | int filter_idx = ((oc * in_channels + ic) * filter_height + fh) * filter_width + fw; 50 | 51 | sum += input[in_idx] * filters[filter_idx]; 52 | } 53 | } 54 | } 55 | 56 | output[out_idx] = sum; 57 | } 58 | } 59 | } 60 | } 61 | } 62 | 63 | // MLX version of max pooling 64 | void mlx_max_pool_2x2(int batch_size, int channels, int in_height, int in_width, 65 | double* input, double* output) { 66 | printf("MLX Max Pooling called - optimized for Apple Silicon\n"); 67 | 68 | // Calculate output dimensions 69 | int out_height = in_height / 2; 70 | int out_width = in_width / 2; 71 | 72 | // Optimized max pooling for Apple Silicon 73 | // Removing OpenMP pragma on Mac 74 | //#pragma omp parallel for collapse(3) 75 | for (int b = 0; b < batch_size; b++) { 76 | for (int c = 0; c < channels; c++) { 77 | for (int oh = 0; oh < out_height; oh++) { 78 | for (int ow = 0; ow < out_width; ow++) { 79 | int out_idx = ((b * channels + c) * out_height + oh) * out_width + ow; 80 | 81 | // Find max in the 2x2 region 82 | double max_val = -INFINITY; 83 | for (int kh = 0; kh < 2; kh++) { 84 | for (int kw = 0; kw < 2; kw++) { 85 | int ih = oh * 2 + kh; 86 | int iw = ow * 2 + kw; 87 | int in_idx = ((b * channels + c) * in_height + ih) * in_width + iw; 88 | 89 | if (input[in_idx] > max_val) { 90 | max_val = input[in_idx]; 91 | } 92 | } 93 | } 94 | 95 | output[out_idx] = max_val; 96 | } 97 | } 98 | } 99 | } 100 | } 101 | 102 | // MLX version of tensor flattening 103 | void mlx_flatten_tensor(int batch_size, int channels, int height, int width, 104 | double* input, double* output) { 105 | printf("MLX Flatten called - optimized for Apple Silicon\n"); 106 | 107 | int flat_size = channels * height * width; 108 | 109 | // Simple memory copy - already efficient since we're just reshaping 110 | for (int b = 0; b < batch_size; b++) { 111 | memcpy(output + b * flat_size, input + b * flat_size, flat_size * sizeof(double)); 112 | } 113 | } 114 | 115 | // MLX version of softmax 116 | void mlx_softmax(int batch_size, int num_classes, double* input, double* output) { 117 | printf("MLX Softmax called - optimized for Apple Silicon\n"); 118 | 119 | // Optimized for Apple Silicon 120 | // Removing OpenMP pragma on Mac 121 | //#pragma omp parallel for 122 | for (int b = 0; b < batch_size; b++) { 123 | // Find max value for numerical stability 124 | double max_val = -INFINITY; 125 | for (int c = 0; c < num_classes; c++) { 126 | if (input[b * num_classes + c] > max_val) { 127 | max_val = input[b * num_classes + c]; 128 | } 129 | } 130 | 131 | // Compute exponentials and sum 132 | double sum = 0.0; 133 | for (int c = 0; c < num_classes; c++) { 134 | output[b * num_classes + c] = exp(input[b * num_classes + c] - max_val); 135 | sum += output[b * num_classes + c]; 136 | } 137 | 138 | // Normalize 139 | for (int c = 0; c < num_classes; c++) { 140 | output[b * num_classes + c] /= sum; 141 | } 142 | } 143 | } 144 | 145 | // Check if MLX is available (placeholder) 146 | int check_mlx_available() { 147 | #ifdef __APPLE__ 148 | // On Apple Silicon this would check for MLX availability 149 | // For now, return true since we're on a Mac 150 | return 1; 151 | #else 152 | return 0; 153 | #endif 154 | } 
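The output-size arithmetic used by `mlx_conv2d_forward` is `out = 1 + (in + 2*padding - filter) / stride`. Below is a minimal, hypothetical driver (not part of the repo) that exercises the function on a 4x4 all-ones image with a single 3x3 all-ones filter, stride 1 and padding 0, so the output is 2x2 and every element is the sum of one 3x3 window, 9.0. It assumes compiling and linking against mlx_cnn_ops.c.

```c
#include <stdio.h>

/* Prototype copied from mlx_cnn_ops.c above (assumed linked in). */
void mlx_conv2d_forward(int batch_size, int in_channels, int in_height, int in_width,
                        int out_channels, int filter_height, int filter_width,
                        int stride, int padding,
                        double* input, double* filters, double* output);

int main(void) {
    double input[16], filter[9], output[4];  /* 1x1x4x4 in, 1x1x3x3 filter, 1x1x2x2 out */
    for (int i = 0; i < 16; i++) input[i] = 1.0;
    for (int i = 0; i < 9; i++)  filter[i] = 1.0;

    /* out_height = 1 + (4 + 2*0 - 3) / 1 = 2, and likewise out_width = 2 */
    mlx_conv2d_forward(1, 1, 4, 4, 1, 3, 3, 1, 0, input, filter, output);

    for (int i = 0; i < 4; i++)
        printf("%.1f ", output[i]);  /* expected: 9.0 9.0 9.0 9.0 */
    printf("\n");
    return 0;
}
```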
--------------------------------------------------------------------------------
/mlx_ops.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <string.h>
4 | #include <math.h>
5 | 
6 | #ifdef __APPLE__
7 | // Check if MLX is available - this would actually import the MLX C API
8 | // #include // This would be the actual MLX C API header
9 | #endif
10 | 
11 | // Placeholder function to check if MLX is available
12 | int check_mlx_available() {
13 | #ifdef __APPLE__
14 |     // This would check if we're on Apple Silicon and MLX is installed
15 |     // For now, just a placeholder that returns 0 (not available)
16 |     return 0;
17 | #else
18 |     return 0;
19 | #endif
20 | }
21 | 
22 | // Matrix multiplication using MLX
23 | // This is a placeholder for the actual MLX implementation
24 | int matrix_multiply_mlx(int M, int N, int K, double* A, double* B, double* C) {
25 |     printf("MLX Matrix multiplication called (placeholder).\n");
26 |     printf("Dimensions: (%d x %d) * (%d x %d) = (%d x %d)\n", M, K, K, N, M, N);
27 | 
28 |     // Fall back to CPU implementation for now
29 |     for (int i = 0; i < M; i++) {
30 |         for (int j = 0; j < N; j++) {
31 |             double sum = 0.0;
32 |             for (int k = 0; k < K; k++) {
33 |                 sum += A[i * K + k] * B[k * N + j];
34 |             }
35 |             C[i * N + j] = sum;
36 |         }
37 |     }
38 | 
39 |     return 0;
40 | }
41 | 
42 | // These are all placeholder functions that would actually use MLX
43 | // in a real implementation
44 | 
45 | int tensor_add_mlx(int size, double* a, double* b, double* result) {
46 |     printf("MLX tensor add called (placeholder).\n");
47 |     for (int i = 0; i < size; i++) {
48 |         result[i] = a[i] + b[i];
49 |     }
50 |     return 0;
51 | }
52 | 
53 | int tensor_sub_mlx(int size, double* a, double* b, double* result) {
54 |     printf("MLX tensor subtract called (placeholder).\n");
55 |     for (int i = 0; i < size; i++) {
56 |         result[i] = a[i] - b[i];
57 |     }
58 |     return 0;
59 | }
60 | 
61 | int tensor_mul_elementwise_mlx(int size, double* a, double* b, double* result) {
62 |     printf("MLX tensor element-wise multiply called (placeholder).\n");
63 |     for (int i = 0; i < size; i++) {
64 |         result[i] = a[i] * b[i];
65 |     }
66 |     return 0;
67 | }
68 | 
69 | int relu_forward_mlx(int size, double* input, double* output) {
70 |     printf("MLX ReLU forward called (placeholder).\n");
71 |     for (int i = 0; i < size; i++) {
72 |         output[i] = input[i] > 0 ? input[i] : 0;
73 |     }
74 |     return 0;
75 | }
76 | 
77 | int relu_backward_mlx(int size, double* input, double* output) {
78 |     printf("MLX ReLU backward called (placeholder).\n");
79 |     for (int i = 0; i < size; i++) {
80 |         output[i] = input[i] > 0 ? 1 : 0;
81 |     }
82 |     return 0;
83 | }
--------------------------------------------------------------------------------
/mnist.rkt:
--------------------------------------------------------------------------------
1 | #lang racket
2 | 
3 | (require "tensor.rkt")
4 | (require "autograd.rkt")
5 | 
6 | ;; MNIST Loading functions
7 | (define (read-idx3-ubyte filename)
8 |   (let* ([p (open-input-file filename #:mode 'binary)]
9 |          [magic-number (integer-bytes->integer (read-bytes 4 p) #f #t)]
10 |          [num-images (integer-bytes->integer (read-bytes 4 p) #f #t)]
11 |          [num-rows (integer-bytes->integer (read-bytes 4 p) #f #t)]
12 |          [num-cols (integer-bytes->integer (read-bytes 4 p) #f #t)]
13 |          [data (make-vector (* num-images num-rows num-cols) 0)])
14 |     (for ([i (in-range (vector-length data))])
15 |       (vector-set! 
data i (read-byte p))) 16 | (close-input-port p) 17 | (t:create (list num-images (* num-rows num-cols)) data))) 18 | 19 | (define (read-idx1-ubyte filename) 20 | (let* ([p (open-input-file filename #:mode 'binary)] 21 | [magic-number (integer-bytes->integer (read-bytes 4 p) #f #t)] 22 | [num-items (integer-bytes->integer (read-bytes 4 p) #f #t)] 23 | [data (make-vector num-items 0)]) 24 | (for ([i (in-range num-items)]) 25 | (vector-set! data i (read-byte p))) 26 | (close-input-port p) 27 | (t:create (list num-items 1) data))) 28 | 29 | ;; Load MNIST data 30 | (define (load-mnist-data type) 31 | (let* ([base-path "/path to mnist/"] 32 | [images-file (string-append base-path 33 | (if (equal? type "train") 34 | "train-images.idx3-ubyte" 35 | "t10k-images.idx3-ubyte"))] 36 | [labels-file (string-append base-path 37 | (if (equal? type "train") 38 | "train-labels.idx1-ubyte" 39 | "t10k-labels.idx1-ubyte"))]) 40 | (printf "Looking for images-file at: ~a~n" images-file) 41 | (printf "Looking for labels-file at: ~a~n" labels-file) 42 | 43 | ;; Error checking 44 | (unless (file-exists? images-file) 45 | (error 'load-mnist-data 46 | (format "MNIST images file not found: ~a" images-file))) 47 | (unless (file-exists? labels-file) 48 | (error 'load-mnist-data 49 | (format "MNIST labels file not found: ~a" labels-file))) 50 | (values (read-idx3-ubyte images-file) 51 | (read-idx1-ubyte labels-file)))) 52 | 53 | ;; Normalize the data 54 | (define (normalize X) 55 | (t:scale X (/ 1.0 255.0))) 56 | 57 | ;; One-hot encode labels 58 | (define (one-hot y num-classes) 59 | (let* ([num-samples (car (t:shape y))] 60 | [encoded (make-vector (* num-samples num-classes) 0.0)]) 61 | (for ([i (in-range num-samples)]) 62 | (vector-set! encoded (+ (* i num-classes) (vector-ref (t:data y) i)) 1.0)) 63 | (t:create (list num-samples num-classes) encoded))) 64 | 65 | ;; Get a batch of data given indices 66 | (define (get-batch X indices) 67 | (let* ([batch-size (length indices)] 68 | [feature-size (cadr (t:shape X))] 69 | [batch-data (make-vector (* batch-size feature-size) 0.0)]) 70 | (for ([i (range batch-size)] 71 | [idx indices]) 72 | (for ([j (range feature-size)]) 73 | (vector-set! batch-data 74 | (+ (* i feature-size) j) 75 | (vector-ref (t:data X) (+ (* idx feature-size) j))))) 76 | (t:create (list batch-size feature-size) batch-data))) 77 | 78 | ;; Softmax function 79 | (define (softmax z) 80 | (let* ([shape (t:shape z)] 81 | [data (t:data z)] 82 | [max-vals (for/vector ([i (in-range (car shape))]) 83 | (apply max (for/list ([j (in-range (cadr shape))]) 84 | (vector-ref data (+ (* i (cadr shape)) j)))))] 85 | [exp-vals (for/vector ([i (in-range (vector-length data))]) 86 | (exp (- (vector-ref data i) 87 | (vector-ref max-vals (quotient i (cadr shape))))))] 88 | [sum-vals (for/vector ([i (in-range (car shape))]) 89 | (for/sum ([j (in-range (cadr shape))]) 90 | (vector-ref exp-vals (+ (* i (cadr shape)) j))))]) 91 | (t:create shape 92 | (for/vector ([i (in-range (vector-length exp-vals))]) 93 | (/ (vector-ref exp-vals i) 94 | (vector-ref sum-vals (quotient i (cadr shape)))))))) 95 | 96 | ;; Forward pass 97 | (define (broadcast-bias b batch-size) 98 | (let* ([bias-shape (t:shape b)] 99 | [num-classes (cadr bias-shape)] 100 | [expanded-data (make-vector (* batch-size num-classes) 0.0)]) 101 | (for ([i (in-range batch-size)]) 102 | (for ([j (in-range num-classes)]) 103 | (vector-set! 
expanded-data (+ (* i num-classes) j) 104 | (vector-ref (t:data b) j)))) 105 | (t:create (list batch-size num-classes) expanded-data))) 106 | 107 | (define (forward X w b) 108 | (let* ([z (t:add (t:mul X w) (broadcast-bias b (car (t:shape X))))]) ; Broadcast b 109 | (softmax z))) 110 | 111 | ;; Cross-entropy loss 112 | (define (cross-entropy y-pred y-true) 113 | (let* ([m (car (t:shape y-true))] 114 | [epsilon 1e-15] 115 | [y-pred-clipped (t:create (t:shape y-pred) 116 | (for/list ([p (vector->list (t:data y-pred))]) 117 | (max (min p (- 1.0 epsilon)) epsilon)))] 118 | [loss-vec (for/list ([i (in-range (vector-length (t:data y-true)))]) 119 | (* (vector-ref (t:data y-true) i) 120 | (log (vector-ref (t:data y-pred-clipped) i))))]) 121 | (- (/ (apply + loss-vec) m)))) 122 | 123 | ;; Initialize parameters 124 | (define input-size 784) ; 28x28 pixels 125 | (define num-classes 10) ; digits 0-9 126 | (define weights (t:random (list input-size num-classes) 0.01)) ; (input-size, num-classes) 127 | (define bias (t:random (list 1 num-classes) 0.01)) ; (1, num-classes) 128 | 129 | ;; Training hyperparameters 130 | (define learning-rate 0.1) 131 | (define epochs 3) ; Reduced for testing 132 | (define batch-size 64) 133 | 134 | ;; Load and initialize MNIST data 135 | (printf "Loading MNIST data...~n") 136 | (define-values (X-train y-train) (load-mnist-data "train")) ; Load training data 137 | (define-values (X-test y-test) (load-mnist-data "test")) ; Load test data 138 | 139 | (printf "Normalizing data...~n") 140 | (set! X-train (normalize X-train)) ; Normalize X-train 141 | (set! X-test (normalize X-test)) ; Normalize X-test 142 | 143 | (printf "One-hot encoding labels...~n") 144 | (set! y-train (one-hot y-train 10)) ; Convert y-train to one-hot 145 | (set! y-test (one-hot y-test 10)) ; Convert y-test to one-hot 146 | 147 | ;; Training loop using autograd 148 | (define (train-batch X-batch y-batch) 149 | (let* ([y-pred (forward X-batch weights bias)] 150 | [loss (cross-entropy y-pred y-batch)] 151 | 152 | ;; Compute gradient of loss with respect to predictions 153 | [batch-size (car (t:shape y-batch))] 154 | [dloss (t:sub y-pred y-batch)] ; for cross-entropy with softmax, gradient is (pred - true) 155 | 156 | ;; Use autograd to compute gradients 157 | ;; Since we don't have a full autograd system, we'll compute gradients manually 158 | ;; but structure it to use the autograd module's functions 159 | 160 | ;; For softmax regression, gradient for weights: X^T * (y_pred - y_true) / batch_size 161 | [gradient-w (t:scale (t:mul (t:transpose X-batch) dloss) 162 | (/ 1.0 batch-size))] 163 | 164 | ;; Gradient for bias: sum(y_pred - y_true, axis=0) / batch_size 165 | [gradient-b (t:create (t:shape bias) 166 | (for/list ([j (in-range num-classes)]) 167 | (/ (for/sum ([i (in-range batch-size)]) 168 | (vector-ref (t:data dloss) (+ (* i num-classes) j))) 169 | batch-size)))]) 170 | 171 | ;; Update parameters with gradients 172 | (set! weights (t:sub weights (t:scale gradient-w learning-rate))) 173 | (set! 
bias (t:sub bias (t:scale gradient-b learning-rate))) 174 | 175 | loss)) 176 | 177 | (define (get-test-accuracy X y) 178 | (let* ([predictions (forward X weights bias)] 179 | [num-samples (car (t:shape X))] 180 | [num-classes (cadr (t:shape y))] 181 | [pred-data (t:data predictions)] 182 | [true-data (t:data y)] 183 | [correct-count 0]) 184 | 185 | ; Count correct predictions 186 | (for ([i (range num-samples)]) 187 | (let* ([start-idx (* i num-classes)] 188 | ; Find predicted class (max probability index) 189 | [pred-vals (for/list ([j (range num-classes)]) 190 | (vector-ref pred-data (+ start-idx j)))] 191 | [pred-class (argmax (lambda (j) (list-ref pred-vals j)) 192 | (range num-classes))] 193 | ; Find true class 194 | [true-class (for/first ([j (range num-classes)] 195 | #:when (= 1.0 (vector-ref true-data (+ start-idx j)))) 196 | j)]) 197 | ; Increment counter if prediction matches truth 198 | (when (= pred-class true-class) 199 | (set! correct-count (add1 correct-count))))) 200 | 201 | ; Return accuracy as percentage 202 | (exact->inexact (* 100.0 (/ correct-count num-samples))))) 203 | 204 | ;; Create a validation split from training data 205 | (define validation-split 0.1) ; 10% for validation 206 | (define train-size (car (t:shape X-train))) 207 | (define validation-size (inexact->exact (floor (* train-size validation-split)))) 208 | (define actual-train-size (- train-size validation-size)) 209 | 210 | ;; Shuffle indices and split data 211 | (define all-indices (shuffle (range train-size))) 212 | (define train-indices (take all-indices actual-train-size)) 213 | (define validation-indices (drop all-indices actual-train-size)) 214 | 215 | ;; Create validation set 216 | (define X-val (get-batch X-train validation-indices)) 217 | (define y-val (get-batch y-train validation-indices)) 218 | 219 | ;; Update training set 220 | (define X-train-actual (get-batch X-train train-indices)) 221 | (define y-train-actual (get-batch y-train train-indices)) 222 | 223 | ;; Set as new training set 224 | (set! X-train X-train-actual) 225 | (set! y-train y-train-actual) 226 | 227 | ;; Define early stopping parameters 228 | (define patience 3) 229 | (define min-delta 0.001) 230 | (define wait 0) 231 | (define best-val-accuracy 0.0) 232 | (define best-weights weights) 233 | (define best-bias bias) 234 | 235 | ;; Main training loop with early stopping 236 | (printf "Starting training...~n") 237 | (printf "Training on ~a samples, validating on ~a samples, testing on ~a samples~n~n" 238 | (car (t:shape X-train)) 239 | (car (t:shape X-val)) 240 | (car (t:shape X-test))) 241 | 242 | (define best-test-accuracy 0.0) 243 | 244 | (for/fold ([stop? #f]) 245 | ([epoch (in-range epochs)] #:break stop?) 246 | (printf "Epoch ~a/~a:~n" (add1 epoch) epochs) 247 | 248 | (let* ([indices (shuffle (range (car (t:shape X-train))))] 249 | [num-batches (quotient (length indices) batch-size)] 250 | [epoch-losses '()]) 251 | 252 | ; Train on batches 253 | (for ([batch (in-range num-batches)]) 254 | (let* ([batch-indices (take (drop indices (* batch batch-size)) batch-size)] 255 | [X-batch (get-batch X-train batch-indices)] 256 | [y-batch (get-batch y-train batch-indices)] 257 | [loss (train-batch X-batch y-batch)]) 258 | 259 | (set! 
epoch-losses (cons loss epoch-losses)) 260 | 261 | (when (= (modulo batch 50) 0) 262 | (printf " Batch ~a/~a - Loss: ~a~n" 263 | batch 264 | num-batches 265 | (real->decimal-string loss 4))))) 266 | 267 | ; Epoch evaluation with validation 268 | (let* ([avg-loss (/ (apply + epoch-losses) (length epoch-losses))] 269 | [val-accuracy (get-test-accuracy X-val y-val)] 270 | [test-accuracy (get-test-accuracy X-test y-test)]) 271 | 272 | (printf "~nEpoch Summary:~n") 273 | (printf " Average Loss: ~a~n" (real->decimal-string avg-loss 4)) 274 | (printf " Validation Accuracy: ~a%~n" (real->decimal-string val-accuracy 2)) 275 | (printf " Test Accuracy: ~a%~n" (real->decimal-string test-accuracy 2)) 276 | 277 | ;; Save best model based on validation accuracy 278 | (if (> val-accuracy best-val-accuracy) 279 | (begin 280 | (set! best-val-accuracy val-accuracy) 281 | (set! best-weights (t:create (t:shape weights) (t:data weights))) 282 | (set! best-bias (t:create (t:shape bias) (t:data bias))) 283 | (set! wait 0) 284 | (printf " New best model saved!~n")) 285 | (begin 286 | (set! wait (add1 wait)) 287 | (printf " No improvement for ~a epochs.~n" wait))) 288 | 289 | ;; Save best test accuracy for reporting 290 | (when (> test-accuracy best-test-accuracy) 291 | (set! best-test-accuracy test-accuracy)) 292 | 293 | (printf " Best Validation Accuracy: ~a%~n" (real->decimal-string best-val-accuracy 2)) 294 | (printf " Best Test Accuracy So Far: ~a%~n~n" (real->decimal-string best-test-accuracy 2)) 295 | 296 | ;; Check early stopping condition 297 | (if (>= wait patience) 298 | (begin 299 | (printf "Early stopping triggered after ~a epochs without improvement.~n" patience) 300 | #t) ; Stop training 301 | #f)))) ; Continue training 302 | 303 | ;; Restore best model 304 | (set! weights best-weights) 305 | (set! bias best-bias) 306 | 307 | (printf "Training Complete!~n") 308 | (printf "Final Test Accuracy: ~a%~n" 309 | (real->decimal-string (get-test-accuracy X-test y-test) 2)) 310 | (printf "Best Validation Accuracy: ~a%~n" 311 | (real->decimal-string best-val-accuracy 2)) 312 | (printf "Best Test Accuracy: ~a%~n" 313 | (real->decimal-string best-test-accuracy 2)) 314 | -------------------------------------------------------------------------------- /mnist_benchmark.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "mnist_device.rkt" 4 | "device.rkt" 5 | "hardware_detection.rkt") 6 | 7 | ;; Benchmark on all available devices 8 | (define (benchmark-mnist epochs batch-size) 9 | (printf "====================================\n") 10 | (printf " RACOGRAD MNIST BENCHMARKS\n") 11 | (printf "====================================\n") 12 | 13 | (print-hardware-info) 14 | 15 | ;; Run on CPU 16 | (printf "\n\nRunning MNIST benchmark on CPU...\n") 17 | (define cpu-result (train-mnist 'cpu batch-size epochs)) 18 | 19 | (define results (list (cons 'cpu cpu-result))) 20 | 21 | ;; Run on MLX if available 22 | (when (has-mlx-support?) 23 | (printf "\n\nRunning MNIST benchmark on MLX (Apple Silicon)...\n") 24 | (define mlx-result (train-mnist 'mlx batch-size epochs)) 25 | (set! results (cons (cons 'mlx mlx-result) results))) 26 | 27 | ;; Run on CUDA if available 28 | (when (has-cuda-support?) 29 | (printf "\n\nRunning MNIST benchmark on CUDA (NVIDIA GPU)...\n") 30 | (define cuda-result (train-mnist 'cuda batch-size epochs)) 31 | (set! results (cons (cons 'cuda cuda-result) results))) 32 | 33 | ;; Run on OpenCL if available 34 | (when (has-opencl?) 
35 | (printf "\n\nRunning MNIST benchmark on OpenCL...\n") 36 | (define opencl-result (train-mnist 'opencl batch-size epochs)) 37 | (set! results (cons (cons 'opencl opencl-result) results))) 38 | 39 | ;; Compare results 40 | (printf "\n\n====================================\n") 41 | (printf " PERFORMANCE COMPARISON\n") 42 | (printf "====================================\n") 43 | 44 | (define cpu-time (hash-ref (cdr (assoc 'cpu results)) 'time)) 45 | (define cpu-accuracy (hash-ref (cdr (assoc 'cpu results)) 'accuracy)) 46 | 47 | (printf "CPU Training Time: ~a seconds\n" (real->decimal-string cpu-time 2)) 48 | (printf "CPU Accuracy: ~a%\n\n" (real->decimal-string cpu-accuracy 2)) 49 | 50 | (for ([result (in-list results)]) 51 | (unless (eq? (car result) 'cpu) 52 | (let* ([device-type (car result)] 53 | [device-result (cdr result)] 54 | [device-time (hash-ref device-result 'time)] 55 | [device-accuracy (hash-ref device-result 'accuracy)] 56 | [speedup (/ cpu-time device-time)]) 57 | (printf "~a Training Time: ~a seconds\n" 58 | device-type 59 | (real->decimal-string device-time 2)) 60 | (printf "~a Accuracy: ~a%\n" 61 | device-type 62 | (real->decimal-string device-accuracy 2)) 63 | (printf "Speedup vs CPU: ~a times faster\n\n" 64 | (real->decimal-string speedup 2))))) 65 | 66 | (printf "====================================\n")) 67 | 68 | ;; Run the benchmark when executed directly 69 | (module+ main 70 | ;; Use smaller values for quick testing 71 | (benchmark-mnist 2 128)) -------------------------------------------------------------------------------- /mnist_device.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor_device.rkt" 4 | "device.rkt" 5 | "autograd.rkt") 6 | 7 | ;; MNIST Loading functions 8 | (define (read-idx3-ubyte filename) 9 | (let* ([p (open-input-file filename #:mode 'binary)] 10 | [magic-number (integer-bytes->integer (read-bytes 4 p) #f #t)] 11 | [num-images (integer-bytes->integer (read-bytes 4 p) #f #t)] 12 | [num-rows (integer-bytes->integer (read-bytes 4 p) #f #t)] 13 | [num-cols (integer-bytes->integer (read-bytes 4 p) #f #t)] 14 | [data (make-vector (* num-images num-rows num-cols) 0)]) 15 | (for ([i (in-range (vector-length data))]) 16 | (vector-set! data i (read-byte p))) 17 | (close-input-port p) 18 | (dt:create (list num-images (* num-rows num-cols)) data))) 19 | 20 | (define (read-idx1-ubyte filename) 21 | (let* ([p (open-input-file filename #:mode 'binary)] 22 | [magic-number (integer-bytes->integer (read-bytes 4 p) #f #t)] 23 | [num-items (integer-bytes->integer (read-bytes 4 p) #f #t)] 24 | [data (make-vector num-items 0)]) 25 | (for ([i (in-range num-items)]) 26 | (vector-set! data i (read-byte p))) 27 | (close-input-port p) 28 | (dt:create (list num-items 1) data))) 29 | 30 | ;; Load MNIST data 31 | (define (load-mnist-data type) 32 | (let* ([base-path "/path to mnist/"] 33 | [images-file (string-append base-path 34 | (if (equal? type "train") 35 | "train-images.idx3-ubyte" 36 | "t10k-images.idx3-ubyte"))] 37 | [labels-file (string-append base-path 38 | (if (equal? type "train") 39 | "train-labels.idx1-ubyte" 40 | "t10k-labels.idx1-ubyte"))]) 41 | (printf "Looking for images-file at: ~a~n" images-file) 42 | (printf "Looking for labels-file at: ~a~n" labels-file) 43 | 44 | ;; Error checking 45 | (unless (file-exists? images-file) 46 | (error 'load-mnist-data 47 | (format "MNIST images file not found: ~a" images-file))) 48 | (unless (file-exists? 
labels-file) 49 | (error 'load-mnist-data 50 | (format "MNIST labels file not found: ~a" labels-file))) 51 | (values (read-idx3-ubyte images-file) 52 | (read-idx1-ubyte labels-file)))) 53 | 54 | ;; Normalize the data 55 | (define (normalize X) 56 | (dt:scale X (/ 1.0 255.0))) 57 | 58 | ;; One-hot encode labels 59 | (define (one-hot y num-classes) 60 | (let* ([num-samples (car (dt:shape y))] 61 | [encoded (make-vector (* num-samples num-classes) 0.0)]) 62 | (for ([i (in-range num-samples)]) 63 | (vector-set! encoded (+ (* i num-classes) (vector-ref (dt:data y) i)) 1.0)) 64 | (dt:create (list num-samples num-classes) encoded))) 65 | 66 | ;; Get a batch of data given indices 67 | (define (get-batch X indices) 68 | (let* ([batch-size (length indices)] 69 | [feature-size (cadr (dt:shape X))] 70 | [batch-data (make-vector (* batch-size feature-size) 0.0)]) 71 | (for ([i (range batch-size)] 72 | [idx indices]) 73 | (for ([j (range feature-size)]) 74 | (vector-set! batch-data 75 | (+ (* i feature-size) j) 76 | (vector-ref (dt:data X) (+ (* idx feature-size) j))))) 77 | (dt:create (list batch-size feature-size) batch-data))) 78 | 79 | ;; Softmax function 80 | (define (softmax z) 81 | (let* ([shape (dt:shape z)] 82 | [data (dt:data z)] 83 | [max-vals (for/vector ([i (in-range (car shape))]) 84 | (apply max (for/list ([j (in-range (cadr shape))]) 85 | (vector-ref data (+ (* i (cadr shape)) j)))))] 86 | [exp-vals (for/vector ([i (in-range (vector-length data))]) 87 | (exp (- (vector-ref data i) 88 | (vector-ref max-vals (quotient i (cadr shape))))))] 89 | [sum-vals (for/vector ([i (in-range (car shape))]) 90 | (for/sum ([j (in-range (cadr shape))]) 91 | (vector-ref exp-vals (+ (* i (cadr shape)) j))))]) 92 | (dt:create shape 93 | (for/vector ([i (in-range (vector-length exp-vals))]) 94 | (/ (vector-ref exp-vals i) 95 | (vector-ref sum-vals (quotient i (cadr shape)))))))) 96 | 97 | ;; Forward pass 98 | (define (broadcast-bias b batch-size) 99 | (let* ([bias-shape (dt:shape b)] 100 | [num-classes (cadr bias-shape)] 101 | [expanded-data (make-vector (* batch-size num-classes) 0.0)]) 102 | (for ([i (in-range batch-size)]) 103 | (for ([j (in-range num-classes)]) 104 | (vector-set! expanded-data (+ (* i num-classes) j) 105 | (vector-ref (dt:data b) j)))) 106 | (dt:create (list batch-size num-classes) expanded-data))) 107 | 108 | (define (forward X w b) 109 | (let* ([z (dt:add (dt:mul X w) (broadcast-bias b (car (dt:shape X))))]) ; Broadcast b 110 | (softmax z))) 111 | 112 | ;; Cross-entropy loss 113 | (define (cross-entropy y-pred y-true) 114 | (let* ([m (car (dt:shape y-true))] 115 | [epsilon 1e-15] 116 | [y-pred-clipped (dt:create (dt:shape y-pred) 117 | (for/list ([p (vector->list (dt:data y-pred))]) 118 | (max (min p (- 1.0 epsilon)) epsilon)))] 119 | [loss-vec (for/list ([i (in-range (vector-length (dt:data y-true)))]) 120 | (* (vector-ref (dt:data y-true) i) 121 | (log (vector-ref (dt:data y-pred-clipped) i))))]) 122 | (- (/ (apply + loss-vec) m)))) 123 | 124 | ;; Initialize parameters 125 | (define input-size 784) ; 28x28 pixels 126 | (define num-classes 10) ; digits 0-9 127 | 128 | ;; Run the training process with a specified device 129 | (provide train-mnist) 130 | 131 | (define (train-mnist device-type batch-size [epochs 10]) 132 | (printf "~nStarting MNIST training on device: ~a~n" device-type) 133 | 134 | ;; Set the device for computation 135 | (cond 136 | [(eq? device-type 'cpu) 137 | (set-current-device! (cpu))] 138 | [(eq? device-type 'mlx) 139 | (if (device-available? 
'mlx) 140 | (set-current-device! (mlx)) 141 | (begin 142 | (printf "MLX not available. Falling back to CPU.\n") 143 | (set-current-device! (cpu))))] 144 | [(eq? device-type 'cuda) 145 | (if (device-available? 'cuda) 146 | (set-current-device! (cuda)) 147 | (begin 148 | (printf "CUDA not available. Falling back to CPU.\n") 149 | (set-current-device! (cpu))))] 150 | [(eq? device-type 'opencl) 151 | (if (device-available? 'opencl) 152 | (set-current-device! (opencl)) 153 | (begin 154 | (printf "OpenCL not available. Falling back to CPU.\n") 155 | (set-current-device! (cpu))))] 156 | [(eq? device-type 'gpu) 157 | (if (gpu-available?) 158 | (set-current-device! (gpu)) 159 | (begin 160 | (printf "No GPU available. Falling back to CPU.\n") 161 | (set-current-device! (cpu))))] 162 | [else 163 | (printf "Unknown device type: ~a. Using CPU.\n" device-type) 164 | (set-current-device! (cpu))]) 165 | 166 | (printf "Using device: ~a~n" (get-device-type (current-device))) 167 | 168 | ;; Create model parameters on the selected device 169 | (define weights (dt:random (list input-size num-classes) 0.01)) ; (input-size, num-classes) 170 | (define bias (dt:random (list 1 num-classes) 0.01)) ; (1, num-classes) 171 | 172 | ;; Training hyperparameters 173 | (define learning-rate 0.1) 174 | 175 | ;; Load and initialize MNIST data 176 | (printf "Loading MNIST data...~n") 177 | (define-values (X-train y-train) (load-mnist-data "train")) ; Load training data 178 | (define-values (X-test y-test) (load-mnist-data "test")) ; Load test data 179 | 180 | (printf "Normalizing data...~n") 181 | (set! X-train (normalize X-train)) ; Normalize X-train 182 | (set! X-test (normalize X-test)) ; Normalize X-test 183 | 184 | (printf "One-hot encoding labels...~n") 185 | (set! y-train (one-hot y-train 10)) ; Convert y-train to one-hot 186 | (set! y-test (one-hot y-test 10)) ; Convert y-test to one-hot 187 | 188 | ;; Move data to device if using GPU 189 | (when (eq? device-type 'gpu) 190 | (printf "Moving data to GPU...~n") 191 | (set! X-train (dt:to X-train (current-device))) 192 | (set! y-train (dt:to y-train (current-device))) 193 | (set! X-test (dt:to X-test (current-device))) 194 | (set! y-test (dt:to y-test (current-device)))) 195 | 196 | ;; Training loop 197 | (define (train-batch X-batch y-batch) 198 | (let* ([y-pred (forward X-batch weights bias)] 199 | [loss (cross-entropy y-pred y-batch)] 200 | [error (dt:sub y-pred y-batch)] 201 | [gradient-w (dt:scale (dt:mul (dt:transpose X-batch) error) 202 | (/ 1.0 (car (dt:shape X-batch))))] 203 | [gradient-b (dt:create (dt:shape bias) 204 | (for/list ([j (in-range num-classes)]) 205 | (/ (for/sum ([i (in-range (car (dt:shape error)))]) 206 | (vector-ref (dt:data error) (+ (* i num-classes) j))) 207 | (car (dt:shape error)))))]) 208 | ;; Update parameters 209 | (set! weights (dt:sub weights (dt:scale gradient-w learning-rate))) 210 | (set! 
bias (dt:sub bias (dt:scale gradient-b learning-rate))) 211 | loss)) 212 | 213 | (define (get-test-accuracy X y) 214 | (let* ([predictions (forward X weights bias)] 215 | [num-samples (car (dt:shape X))] 216 | [num-classes (cadr (dt:shape y))] 217 | [pred-data (dt:data predictions)] 218 | [true-data (dt:data y)] 219 | [correct-count 0]) 220 | 221 | ; Count correct predictions 222 | (for ([i (range num-samples)]) 223 | (let* ([start-idx (* i num-classes)] 224 | ; Find predicted class (max probability index) 225 | [pred-vals (for/list ([j (range num-classes)]) 226 | (vector-ref pred-data (+ start-idx j)))] 227 | [pred-class (argmax (lambda (j) (list-ref pred-vals j)) 228 | (range num-classes))] 229 | ; Find true class 230 | [true-class (for/first ([j (range num-classes)] 231 | #:when (= 1.0 (vector-ref true-data (+ start-idx j)))) 232 | j)]) 233 | ; Increment counter if prediction matches truth 234 | (when (= pred-class true-class) 235 | (set! correct-count (add1 correct-count))))) 236 | 237 | ; Return accuracy as percentage 238 | (exact->inexact (* 100.0 (/ correct-count num-samples))))) 239 | 240 | ;; Track time for benchmarking 241 | (define start-time (current-inexact-milliseconds)) 242 | 243 | ;; Main training loop 244 | (printf "Starting training...~n") 245 | (printf "Training on ~a samples, validating on ~a samples~n~n" 246 | (car (dt:shape X-train)) 247 | (car (dt:shape X-test))) 248 | 249 | (define best-accuracy 0.0) 250 | 251 | (for ([epoch (in-range epochs)]) 252 | (printf "Epoch ~a/~a:~n" (add1 epoch) epochs) 253 | 254 | (let* ([indices (shuffle (range (car (dt:shape X-train))))] 255 | [num-batches (quotient (length indices) batch-size)] 256 | [epoch-losses '()]) 257 | 258 | ; Train on batches 259 | (for ([batch (in-range num-batches)]) 260 | (let* ([batch-indices (take (drop indices (* batch batch-size)) batch-size)] 261 | [X-batch (get-batch X-train batch-indices)] 262 | [y-batch (get-batch y-train batch-indices)] 263 | [loss (train-batch X-batch y-batch)]) 264 | 265 | (set! epoch-losses (cons loss epoch-losses)) 266 | 267 | (when (= (modulo batch 50) 0) 268 | (printf " Batch ~a/~a - Loss: ~a~n" 269 | batch 270 | num-batches 271 | (real->decimal-string loss 4))))) 272 | 273 | ; Epoch evaluation 274 | (let* ([avg-loss (/ (apply + epoch-losses) (length epoch-losses))] 275 | [test-accuracy (get-test-accuracy X-test y-test)]) 276 | 277 | (when (> test-accuracy best-accuracy) 278 | (set! best-accuracy test-accuracy)) 279 | 280 | (printf "~nEpoch Summary:~n") 281 | (printf " Average Loss: ~a~n" (real->decimal-string avg-loss 4)) 282 | (printf " Test Accuracy: ~a%~n" (real->decimal-string test-accuracy 2)) 283 | (printf " Best Accuracy So Far: ~a%~n~n" (real->decimal-string best-accuracy 2))))) 284 | 285 | (define end-time (current-inexact-milliseconds)) 286 | (define total-time (/ (- end-time start-time) 1000.0)) 287 | 288 | (printf "Training Complete!~n") 289 | (printf "Final Test Accuracy: ~a%~n" 290 | (real->decimal-string (get-test-accuracy X-test y-test) 2)) 291 | (printf "Total training time: ~a seconds~n" (real->decimal-string total-time 2)) 292 | 293 | ;; Return benchmark results 294 | (hash 'accuracy (get-test-accuracy X-test y-test) 295 | 'time total-time 296 | 'device device-type)) 297 | 298 | ;; When executed directly, run on both CPU and GPU (if available) 299 | (module+ main 300 | (printf "Starting CPU training...~n") 301 | (define cpu-result (train-mnist 'cpu 128 2)) 302 | 303 | (if (gpu-available?) 
304 |       (let ([gpu-result (train-mnist 'gpu 128 2)])
305 |         (printf "~n~n===== PERFORMANCE COMPARISON =====~n")
306 |         (printf "CPU Training Time: ~a seconds~n" (real->decimal-string (hash-ref cpu-result 'time) 2))
307 |         (printf "GPU Training Time: ~a seconds~n" (real->decimal-string (hash-ref gpu-result 'time) 2))
308 |         (printf "Speedup: ~a times faster~n"
309 |                 (real->decimal-string (/ (hash-ref cpu-result 'time) (hash-ref gpu-result 'time)) 2))
310 |         (printf "CPU Accuracy: ~a%~n" (real->decimal-string (hash-ref cpu-result 'accuracy) 2))
311 |         (printf "GPU Accuracy: ~a%~n" (real->decimal-string (hash-ref gpu-result 'accuracy) 2))
312 |         (printf "===============================~n"))
313 |       (printf "~nGPU acceleration not available on this system. Skipping GPU training.~n")))
314 | 
--------------------------------------------------------------------------------
/parallel_ops.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <pthread.h>
4 | 
5 | // Thread arguments for parallel matrix multiplication
6 | typedef struct {
7 |     int start_row;
8 |     int end_row;
9 |     int cols_a;
10 |     int cols_b;
11 |     double *a;
12 |     double *b;
13 |     double *c;
14 | } thread_data_t;
15 | 
16 | // Thread function for matrix multiplication
17 | void* matrix_multiply_thread(void *arg) {
18 |     thread_data_t *data = (thread_data_t*)arg;
19 | 
20 |     for (int i = data->start_row; i < data->end_row; i++) {
21 |         for (int j = 0; j < data->cols_b; j++) {
22 |             double sum = 0.0;
23 |             for (int k = 0; k < data->cols_a; k++) {
24 |                 sum += data->a[i * data->cols_a + k] * data->b[k * data->cols_b + j];
25 |             }
26 |             data->c[i * data->cols_b + j] = sum;
27 |         }
28 |     }
29 | 
30 |     pthread_exit(NULL);
31 | }
32 | 
33 | // Parallel matrix multiplication
34 | void matrix_multiply_parallel(int rows_a, int cols_a, int cols_b, double *a, double *b, double *c, int num_threads) {
35 |     pthread_t threads[num_threads];
36 |     thread_data_t thread_data[num_threads];
37 | 
38 |     int rows_per_thread = rows_a / num_threads;
39 |     int remainder = rows_a % num_threads;
40 | 
41 |     int start_row = 0;
42 |     for (int t = 0; t < num_threads; t++) {
43 |         int end_row = start_row + rows_per_thread + (t < remainder ? 1 : 0);
44 | 
45 |         thread_data[t].start_row = start_row;
46 |         thread_data[t].end_row = end_row;
47 |         thread_data[t].cols_a = cols_a;
48 |         thread_data[t].cols_b = cols_b;
49 |         thread_data[t].a = a;
50 |         thread_data[t].b = b;
51 |         thread_data[t].c = c;
52 | 
53 |         pthread_create(&threads[t], NULL, matrix_multiply_thread, &thread_data[t]);
54 | 
55 |         start_row = end_row;
56 |     }
57 | 
58 |     // Wait for all threads to complete
59 |     for (int t = 0; t < num_threads; t++) {
60 |         pthread_join(threads[t], NULL);
61 |     }
62 | }
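matrix_multiply_parallel partitions rows with `rows_per_thread + (t < remainder ? 1 : 0)`: each of the first `remainder` threads takes one extra row, so the chunks tile the whole range with no gaps or overlap even when `rows_a` is not divisible by `num_threads`. A standalone sketch of just that partitioning arithmetic (illustrative, not part of the repo):

```c
#include <stdio.h>

int main(void) {
    int rows_a = 10, num_threads = 4;
    int rows_per_thread = rows_a / num_threads;  /* 2 */
    int remainder = rows_a % num_threads;        /* 2 */

    int start_row = 0;
    for (int t = 0; t < num_threads; t++) {
        /* First `remainder` threads get one extra row: chunks of 3, 3, 2, 2 */
        int end_row = start_row + rows_per_thread + (t < remainder ? 1 : 0);
        printf("thread %d: rows [%d, %d)\n", t, start_row, end_row);
        start_row = end_row;
    }
    return 0;
}
```

The same scheme reappears below in tensor_elementwise_parallel and process_batch_parallel, with elements and samples in place of rows.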
63 | 
64 | // Thread function for elementwise operations
65 | typedef struct {
66 |     int start_idx;
67 |     int end_idx;
68 |     double *a;
69 |     double *b;
70 |     double *result;
71 |     int operation; // 0: add, 1: subtract, 2: multiply
72 | } elementwise_thread_data_t;
73 | 
74 | void* elementwise_operation_thread(void *arg) {
75 |     elementwise_thread_data_t *data = (elementwise_thread_data_t*)arg;
76 | 
77 |     switch(data->operation) {
78 |         case 0: // Add
79 |             for (int i = data->start_idx; i < data->end_idx; i++) {
80 |                 data->result[i] = data->a[i] + data->b[i];
81 |             }
82 |             break;
83 |         case 1: // Subtract
84 |             for (int i = data->start_idx; i < data->end_idx; i++) {
85 |                 data->result[i] = data->a[i] - data->b[i];
86 |             }
87 |             break;
88 |         case 2: // Multiply
89 |             for (int i = data->start_idx; i < data->end_idx; i++) {
90 |                 data->result[i] = data->a[i] * data->b[i];
91 |             }
92 |             break;
93 |     }
94 | 
95 |     pthread_exit(NULL);
96 | }
97 | 
98 | // Parallel elementwise operations
99 | void tensor_elementwise_parallel(int size, double *a, double *b, double *result, int operation, int num_threads) {
100 |     pthread_t threads[num_threads];
101 |     elementwise_thread_data_t thread_data[num_threads];
102 | 
103 |     int elems_per_thread = size / num_threads;
104 |     int remainder = size % num_threads;
105 | 
106 |     int start_idx = 0;
107 |     for (int t = 0; t < num_threads; t++) {
108 |         int end_idx = start_idx + elems_per_thread + (t < remainder ? 
1 : 0); 109 | 110 | thread_data[t].start_idx = start_idx; 111 | thread_data[t].end_idx = end_idx; 112 | thread_data[t].a = a; 113 | thread_data[t].b = b; 114 | thread_data[t].result = result; 115 | thread_data[t].operation = operation; 116 | 117 | pthread_create(&threads[t], NULL, elementwise_operation_thread, &thread_data[t]); 118 | 119 | start_idx = end_idx; 120 | } 121 | 122 | // Wait for all threads to complete 123 | for (int t = 0; t < num_threads; t++) { 124 | pthread_join(threads[t], NULL); 125 | } 126 | } 127 | 128 | // Wrapper functions 129 | void tensor_add_parallel(int size, double *a, double *b, double *result, int num_threads) { 130 | tensor_elementwise_parallel(size, a, b, result, 0, num_threads); 131 | } 132 | 133 | void tensor_sub_parallel(int size, double *a, double *b, double *result, int num_threads) { 134 | tensor_elementwise_parallel(size, a, b, result, 1, num_threads); 135 | } 136 | 137 | void tensor_mul_elementwise_parallel(int size, double *a, double *b, double *result, int num_threads) { 138 | tensor_elementwise_parallel(size, a, b, result, 2, num_threads); 139 | } 140 | 141 | // Batch processing with parallelism for neural networks 142 | typedef struct { 143 | int start_sample; 144 | int end_sample; 145 | int input_dim; 146 | int output_dim; 147 | double *inputs; 148 | double *weights; 149 | double *biases; 150 | double *outputs; 151 | } batch_thread_data_t; 152 | 153 | void* process_batch_thread(void *arg) { 154 | batch_thread_data_t *data = (batch_thread_data_t*)arg; 155 | 156 | for (int i = data->start_sample; i < data->end_sample; i++) { 157 | // For each sample in this thread's portion of the batch 158 | for (int j = 0; j < data->output_dim; j++) { 159 | double sum = data->biases[j]; 160 | for (int k = 0; k < data->input_dim; k++) { 161 | sum += data->inputs[i * data->input_dim + k] * data->weights[k * data->output_dim + j]; 162 | } 163 | // Apply ReLU 164 | data->outputs[i * data->output_dim + j] = sum > 0 ? sum : 0; 165 | } 166 | } 167 | 168 | pthread_exit(NULL); 169 | } 170 | 171 | // Process a batch of samples through a layer in parallel 172 | void process_batch_parallel(int batch_size, int input_dim, int output_dim, 173 | double *inputs, double *weights, double *biases, 174 | double *outputs, int num_threads) { 175 | pthread_t threads[num_threads]; 176 | batch_thread_data_t thread_data[num_threads]; 177 | 178 | int samples_per_thread = batch_size / num_threads; 179 | int remainder = batch_size % num_threads; 180 | 181 | int start_sample = 0; 182 | for (int t = 0; t < num_threads; t++) { 183 | int end_sample = start_sample + samples_per_thread + (t < remainder ? 
1 : 0); 184 | 185 | thread_data[t].start_sample = start_sample; 186 | thread_data[t].end_sample = end_sample; 187 | thread_data[t].input_dim = input_dim; 188 | thread_data[t].output_dim = output_dim; 189 | thread_data[t].inputs = inputs; 190 | thread_data[t].weights = weights; 191 | thread_data[t].biases = biases; 192 | thread_data[t].outputs = outputs; 193 | 194 | pthread_create(&threads[t], NULL, process_batch_thread, &thread_data[t]); 195 | 196 | start_sample = end_sample; 197 | } 198 | 199 | // Wait for all threads to complete 200 | for (int t = 0; t < num_threads; t++) { 201 | pthread_join(threads[t], NULL); 202 | } 203 | } -------------------------------------------------------------------------------- /regression-test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt") 4 | (require "deep_learn_library.rkt") 5 | 6 | ;; Create sample data 7 | ;; X: 2 features, 4 samples 8 | (define X (t:create '(4 2) '(1 2 9 | 2 3 10 | 3 4 11 | 4 5))) 12 | 13 | ;; y: binary labels (0 or 1) 14 | (define y (t:create '(4 1) '(0.0 0.0 1.0 1.0))) 15 | 16 | ;; Initialize weights and bias 17 | (define weights (t:random '(2 1) 1.0)) ; 2 features -> 1 output 18 | (define bias (t:random '(1) 1.0)) 19 | 20 | ;; Sigmoid function 21 | (define (sigmoid z) 22 | (t:create (t:shape z) 23 | (for/list ([x (vector->list (t:data z))]) 24 | (/ 1.0 (+ 1.0 (exp (- x))))))) 25 | 26 | ;; Forward pass 27 | (define (forward X w b) 28 | (let* ([z (t:add (t:mul X w) b)]) 29 | (sigmoid z))) 30 | 31 | ;; Binary Cross Entropy Loss 32 | (define (binary-cross-entropy y-pred y-true) 33 | (let* ([m (car (t:shape y-true))] 34 | [epsilon 1e-15] ; To avoid log(0) 35 | [y-pred-clipped (t:create (t:shape y-pred) 36 | (for/list ([p (vector->list (t:data y-pred))]) 37 | (max (min p (- 1.0 epsilon)) epsilon)))] 38 | [loss-vec (for/list ([p (vector->list (t:data y-pred-clipped))] 39 | [t (vector->list (t:data y-true))]) 40 | (+ (* t (log p)) 41 | (* (- 1 t) (log (- 1.0 p)))))]) 42 | (- (/ (apply + loss-vec) m)))) 43 | 44 | ;; Training loop 45 | (define learning-rate 0.1) 46 | (define epochs 100) 47 | 48 | (for ([epoch (in-range epochs)]) 49 | (let* ([;; Forward pass 50 | y-pred (forward X weights bias)] 51 | [;; Calculate loss 52 | loss (binary-cross-entropy y-pred y)]) 53 | 54 | ;; Print progress every 10 epochs 55 | (when (= (modulo epoch 10) 0) 56 | (printf "Epoch ~a, Loss: ~a~n" epoch loss)) 57 | 58 | ;; Calculate gradients (simplified for this example) 59 | (let* ([error (t:sub y-pred y)] 60 | [gradient-w (t:scale (t:mul (t:transpose X) error) (/ 1.0 (car (t:shape X))))] 61 | [gradient-b (t:create '(1) 62 | (list (/ (for/sum ([e (vector->list (t:data error))]) e) 63 | (car (t:shape X)))))]) 64 | 65 | ;; Update weights and bias 66 | (set! weights (t:sub weights (t:scale gradient-w learning-rate))) 67 | (set! 
bias (t:sub bias (t:scale gradient-b learning-rate))))))
68 | 
69 | ;; Test the model
70 | (printf "~nFinal weights:~n")
71 | (t:print weights)
72 | (printf "~nFinal bias:~n")
73 | (t:print bias)
74 | 
75 | ;; Make predictions
76 | (printf "~nFinal predictions:~n")
77 | (t:print (forward X weights bias))
--------------------------------------------------------------------------------
/simd_ops.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <math.h>
4 | 
5 | #ifdef __AVX__
6 | #include <immintrin.h>
7 | #endif
8 | 
9 | #ifdef __SSE__
10 | #include <emmintrin.h>
11 | #endif
12 | 
13 | // Matrix multiplication with SIMD optimization where available
14 | void matrix_multiply_simd(int rows_a, int cols_a, int cols_b, double *a, double *b, double *c) {
15 | #ifdef __AVX__
16 |     // AVX optimized version (processes 4 doubles at once)
17 |     for (int i = 0; i < rows_a; ++i) {
18 |         for (int j = 0; j < cols_b; ++j) {
19 |             __m256d sum_vec = _mm256_setzero_pd();
20 | 
21 |             // Process 4 elements at a time
22 |             int k = 0;
23 |             for (; k <= cols_a - 4; k += 4) {
24 |                 __m256d a_vec = _mm256_loadu_pd(&a[i * cols_a + k]);
25 | 
26 |                 // For each of the 4 elements, we need to multiply by b[k+n][j]
27 |                 __m256d b_vec = _mm256_set_pd(
28 |                     b[(k+3) * cols_b + j],
29 |                     b[(k+2) * cols_b + j],
30 |                     b[(k+1) * cols_b + j],
31 |                     b[k * cols_b + j]
32 |                 );
33 | 
34 |                 // Multiply and add
35 |                 sum_vec = _mm256_add_pd(sum_vec, _mm256_mul_pd(a_vec, b_vec));
36 |             }
37 | 
38 |             // Horizontal sum of the vector
39 |             double sum_array[4];
40 |             _mm256_storeu_pd(sum_array, sum_vec);
41 |             double sum = sum_array[0] + sum_array[1] + sum_array[2] + sum_array[3];
42 | 
43 |             // Handle remaining elements
44 |             for (; k < cols_a; ++k) {
45 |                 sum += a[i * cols_a + k] * b[k * cols_b + j];
46 |             }
47 | 
48 |             c[i * cols_b + j] = sum;
49 |         }
50 |     }
51 | #elif defined(__SSE__)
52 |     // SSE optimized version (processes 2 doubles at once)
53 |     for (int i = 0; i < rows_a; ++i) {
54 |         for (int j = 0; j < cols_b; ++j) {
55 |             __m128d sum_vec = _mm_setzero_pd();
56 | 
57 |             // Process 2 elements at a time
58 |             int k = 0;
59 |             for (; k <= cols_a - 2; k += 2) {
60 |                 __m128d a_vec = _mm_loadu_pd(&a[i * cols_a + k]);
61 |                 __m128d b_vec = _mm_set_pd(
62 |                     b[(k+1) * cols_b + j],
63 |                     b[k * cols_b + j]
64 |                 );
65 | 
66 |                 // Multiply and add
67 |                 sum_vec = _mm_add_pd(sum_vec, _mm_mul_pd(a_vec, b_vec));
68 |             }
69 | 
70 |             // Horizontal sum
71 |             double sum_array[2];
72 |             _mm_storeu_pd(sum_array, sum_vec);
73 |             double sum = sum_array[0] + sum_array[1];
74 | 
75 |             // Handle remaining elements
76 |             for (; k < cols_a; ++k) {
77 |                 sum += a[i * cols_a + k] * b[k * cols_b + j];
78 |             }
79 | 
80 |             c[i * cols_b + j] = sum;
81 |         }
82 |     }
83 | #else
84 |     // Fallback to standard implementation
85 |     for (int i = 0; i < rows_a; ++i) {
86 |         for (int j = 0; j < cols_b; ++j) {
87 |             double sum = 0.0;
88 |             for (int k = 0; k < cols_a; ++k) {
89 |                 sum += a[i * cols_a + k] * b[k * cols_b + j];
90 |             }
91 |             c[i * cols_b + j] = sum;
92 |         }
93 |     }
94 | #endif
95 | }
96 | 
97 | // Element-wise operations with SIMD
98 | void tensor_add_simd(int size, double *a, double *b, double *result) {
99 | #ifdef __AVX__
100 |     int i = 0;
101 |     // Process 4 doubles at a time with AVX
102 |     for (; i <= size - 4; i += 4) {
103 |         __m256d a_vec = _mm256_loadu_pd(&a[i]);
104 |         __m256d b_vec = _mm256_loadu_pd(&b[i]);
105 |         __m256d res_vec = _mm256_add_pd(a_vec, b_vec);
106 |         _mm256_storeu_pd(&result[i], res_vec);
107 |     }
108 |     // Handle remaining elements
109 |     for (; i < size; i++) {
110 | 
result[i] = a[i] + b[i]; 111 | } 112 | #elif defined(__SSE__) 113 | int i = 0; 114 | // Process 2 doubles at a time with SSE 115 | for (; i <= size - 2; i += 2) { 116 | __m128d a_vec = _mm_loadu_pd(&a[i]); 117 | __m128d b_vec = _mm_loadu_pd(&b[i]); 118 | __m128d res_vec = _mm_add_pd(a_vec, b_vec); 119 | _mm_storeu_pd(&result[i], res_vec); 120 | } 121 | // Handle remaining elements 122 | for (; i < size; i++) { 123 | result[i] = a[i] + b[i]; 124 | } 125 | #else 126 | // Fallback implementation 127 | for (int i = 0; i < size; i++) { 128 | result[i] = a[i] + b[i]; 129 | } 130 | #endif 131 | } 132 | 133 | // Other SIMD operations (similar structure to tensor_add_simd) 134 | void tensor_mul_elementwise_simd(int size, double *a, double *b, double *result) { 135 | #ifdef __AVX__ 136 | int i = 0; 137 | for (; i <= size - 4; i += 4) { 138 | __m256d a_vec = _mm256_loadu_pd(&a[i]); 139 | __m256d b_vec = _mm256_loadu_pd(&b[i]); 140 | __m256d res_vec = _mm256_mul_pd(a_vec, b_vec); 141 | _mm256_storeu_pd(&result[i], res_vec); 142 | } 143 | for (; i < size; i++) { 144 | result[i] = a[i] * b[i]; 145 | } 146 | #elif defined(__SSE__) 147 | int i = 0; 148 | for (; i <= size - 2; i += 2) { 149 | __m128d a_vec = _mm_loadu_pd(&a[i]); 150 | __m128d b_vec = _mm_loadu_pd(&b[i]); 151 | __m128d res_vec = _mm_mul_pd(a_vec, b_vec); 152 | _mm_storeu_pd(&result[i], res_vec); 153 | } 154 | for (; i < size; i++) { 155 | result[i] = a[i] * b[i]; 156 | } 157 | #else 158 | for (int i = 0; i < size; i++) { 159 | result[i] = a[i] * b[i]; 160 | } 161 | #endif 162 | } 163 | 164 | // SIMD optimized ReLU 165 | void relu_forward_simd(int size, double *input, double *output) { 166 | #ifdef __AVX__ 167 | int i = 0; 168 | __m256d zeros = _mm256_setzero_pd(); 169 | 170 | for (; i <= size - 4; i += 4) { 171 | __m256d in_vec = _mm256_loadu_pd(&input[i]); 172 | __m256d res_vec = _mm256_max_pd(in_vec, zeros); 173 | _mm256_storeu_pd(&output[i], res_vec); 174 | } 175 | 176 | for (; i < size; i++) { 177 | output[i] = input[i] > 0 ? input[i] : 0; 178 | } 179 | #elif defined(__SSE__) 180 | int i = 0; 181 | __m128d zeros = _mm_setzero_pd(); 182 | 183 | for (; i <= size - 2; i += 2) { 184 | __m128d in_vec = _mm_loadu_pd(&input[i]); 185 | __m128d res_vec = _mm_max_pd(in_vec, zeros); 186 | _mm_storeu_pd(&output[i], res_vec); 187 | } 188 | 189 | for (; i < size; i++) { 190 | output[i] = input[i] > 0 ? input[i] : 0; 191 | } 192 | #else 193 | for (int i = 0; i < size; i++) { 194 | output[i] = input[i] > 0 ? 
input[i] : 0; 195 | } 196 | #endif 197 | } -------------------------------------------------------------------------------- /simple-test.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "tensor.rkt") 4 | (require "grad.rkt") 5 | 6 | (define (test-fnn) 7 | (let* ([input-dim 3] 8 | [hidden-dim 4] 9 | [output-dim 2] 10 | [batch-size 2] 11 | [learning-rate 0.01] 12 | [num-epochs 20] 13 | [input-data (list (list 0.1 0.2 0.3) (list 0.4 0.5 0.6))] 14 | [output-data (list (list 0.7 0.8) (list 0.9 0.1))]) 15 | 16 | (define (create-data-tensor data) 17 | (let ([flattened-data (apply append data)]) 18 | (create-tensor (list (length data) (length (car data))) flattened-data))) 19 | 20 | (let* ([input-tensor (create-data-tensor input-data)] 21 | [output-tensor (create-data-tensor output-data)] 22 | [hidden-weights (random-tensor (list input-dim hidden-dim) 0.1)] 23 | [hidden-biases (random-tensor (list hidden-dim) 0.1)] 24 | [output-weights (random-tensor (list hidden-dim output-dim) 0.1)] 25 | [output-biases (random-tensor (list output-dim) 0.1)]) 26 | 27 | (displayln "Initial hidden weights:") 28 | (print-tensor hidden-weights) 29 | (displayln "Initial hidden biases:") 30 | (print-tensor hidden-biases) 31 | (displayln "Initial output weights:") 32 | (print-tensor output-weights) 33 | (displayln "Initial output biases:") 34 | (print-tensor output-biases) 35 | (newline) 36 | 37 | (for ([epoch (in-range num-epochs)]) 38 | (let* ([hidden-output (dense-forward input-tensor hidden-weights hidden-biases)] 39 | [output (dense-forward hidden-output output-weights output-biases)] 40 | [loss (mean-squared-error output-tensor output)] 41 | [output-grad (tensor-subtract output output-tensor)]) 42 | (displayln (string-append "Epoch: " (number->string epoch) ", Loss: " (number->string loss))) 43 | 44 | (let-values ([(output-grad-weights output-grad-biases output-grad-input) 45 | (dense-backward hidden-output output-weights output-biases output output-grad learning-rate)] 46 | [(hidden-grad-weights hidden-grad-biases _) 47 | (dense-backward input-tensor hidden-weights hidden-biases hidden-output 48 | (tensor-multiply output-grad (transpose output-weights)) 49 | learning-rate)]) 50 | (set! output-weights output-grad-weights) 51 | (set! output-biases output-grad-biases) 52 | (set! hidden-weights hidden-grad-weights) 53 | (set! 
hidden-biases hidden-grad-biases)))) 54 | 55 | (displayln "Final hidden weights:") 56 | (print-tensor hidden-weights) 57 | (displayln "Final hidden biases:") 58 | (print-tensor hidden-biases) 59 | (displayln "Final output weights:") 60 | (print-tensor output-weights) 61 | (displayln "Final output biases:") 62 | (print-tensor output-biases)))) 63 | 64 | (test-fnn) 65 | -------------------------------------------------------------------------------- /tensor.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (provide (struct-out tensor) ; Export the tensor struct and its accessors 4 | ;; core operations 5 | t:create ; creates a tensor 6 | t:random ; random tensor 7 | t:reshape 8 | t:print 9 | 10 | ;; math operations 11 | t:add 12 | t:sub 13 | t:mul ; Matrix multiplication and elementwise 14 | t:dot ; Dot Product 15 | t:scale ; Scalar Multiplication 16 | t:transpose 17 | 18 | ;; Accessors 19 | t:shape ; Get shape 20 | t:data ; Get Data 21 | t:ref 22 | ; t:at ; Uncomment if implemented 23 | ; t:slice ; Uncomment if implemented 24 | ) 25 | 26 | (struct tensor (shape data) #:transparent) 27 | 28 | ;; Wrapper accessors 29 | (define (t:shape t) 30 | (tensor-shape t)) 31 | 32 | (define (t:data t) 33 | (tensor-data t)) 34 | 35 | ;; Create a tensor 36 | (define (t:create shape data) 37 | (let ((vec-data (if (vector? data) data (list->vector data)))) 38 | (cond 39 | [(= (apply * shape) (vector-length vec-data)) 40 | (tensor shape vec-data)] 41 | [else 42 | (begin 43 | (println "Error: Data does not match, please check the size") 44 | #f)]))) 45 | 46 | ;; Print tensor 47 | (define (t:print t) 48 | (let ([shape (tensor-shape t)] 49 | [data (tensor-data t)]) 50 | (cond 51 | [(= (length shape) 1) 52 | (display "[") 53 | (for ([i (in-range (car shape))]) 54 | (display (vector-ref data i)) 55 | (display " ")) 56 | (display "]") 57 | (newline)] 58 | [(= (length shape) 2) 59 | (for ([i (in-range (car shape))]) 60 | (display "[") 61 | (for ([j (in-range (cadr shape))]) 62 | (display (vector-ref data (+ (* i (cadr shape)) j))) 63 | (display " ")) 64 | (display "]") 65 | (newline))] 66 | [else 67 | (error "t:print: unsupported tensor shape")]))) 68 | 69 | ;; Random tensor 70 | (define (t:random shape range) 71 | (let* ((size (apply * shape)) 72 | (max-value (inexact->exact (floor (* range 10000))))) 73 | (tensor shape 74 | (for/vector ([i size]) 75 | (/ (random max-value) 10000.0))))) 76 | 77 | ;; Reshape tensor 78 | (define (t:reshape t new-shape) 79 | (let ([original-size (apply * (tensor-shape t))] 80 | [new-size (apply * new-shape)]) 81 | (if (= original-size new-size) 82 | (tensor new-shape (tensor-data t)) 83 | (error "t:reshape: New shape must have the same number of elements as the original shape")))) 84 | 85 | ;; Add tensors 86 | (define (t:add t1 t2) 87 | (let ([shape1 (tensor-shape t1)] 88 | [shape2 (tensor-shape t2)]) 89 | (cond 90 | [(equal? 
shape1 shape2)
91 |        (tensor shape1
92 |                (for/vector ([i (vector-length (tensor-data t1))])
93 |                  (+ (vector-ref (tensor-data t1) i)
94 |                     (vector-ref (tensor-data t2) i))))]
95 |       ;; A length-1 tensor broadcasts as a scalar
96 |       [(equal? shape1 '(1))
97 |        (let ([scalar-val (vector-ref (tensor-data t1) 0)])
98 |          (tensor shape2
99 |                  (for/vector ([i (vector-length (tensor-data t2))])
100 |                    (+ scalar-val (vector-ref (tensor-data t2) i)))))]
101 |       [(equal? shape2 '(1))
102 |        (let ([scalar-val (vector-ref (tensor-data t2) 0)])
103 |          (tensor shape1
104 |                  (for/vector ([i (vector-length (tensor-data t1))])
105 |                    (+ (vector-ref (tensor-data t1) i) scalar-val))))]
106 |       ;; A 1D tensor whose length matches the columns of a 2D tensor broadcasts row-wise
107 |       [(and (= (length shape1) 2) (= (length shape2) 1) (= (cadr shape1) (car shape2)))
108 |        (let ([cols (cadr shape1)])
109 |          (tensor shape1
110 |                  (for/vector ([i (vector-length (tensor-data t1))])
111 |                    (+ (vector-ref (tensor-data t1) i)
112 |                       (vector-ref (tensor-data t2) (remainder i cols))))))]
113 |       [(and (= (length shape1) 1) (= (length shape2) 2) (= (car shape1) (cadr shape2)))
114 |        (t:add t2 t1)]
115 |       [else
116 |        (error "t:add: Tensors must have the same shape or be broadcastable for addition")])))
117 | 
118 | ;; Subtract tensors
119 | (define (t:sub t1 t2)
120 |   (let ([shape1 (tensor-shape t1)]
121 |         [shape2 (tensor-shape t2)])
122 |     (cond
123 |       [(equal? shape1 shape2)
124 |        (tensor shape1
125 |                (for/vector ([i (vector-length (tensor-data t1))])
126 |                  (- (vector-ref (tensor-data t1) i)
127 |                     (vector-ref (tensor-data t2) i))))]
128 |       [else
129 |        (error "t:sub: Tensors must have the same shape for subtraction")])))
130 | 
131 | ;; Multiply tensors (Matrix multiply or elementwise)
132 | (define (t:mul t1 t2)
133 |   (let ([shape1 (tensor-shape t1)]
134 |         [shape2 (tensor-shape t2)])
135 |     (cond
136 |       ;; Matrix multiplication: (A: MxN) * (B: NxP) -> (C: MxP)
137 |       [(and (= (length shape1) 2) (= (length shape2) 2) (= (cadr shape1) (car shape2)))
138 |        (let* ([rows-a (car shape1)]
139 |               [cols-a (cadr shape1)]
140 |               [cols-b (cadr shape2)]
141 |               [result (make-vector (* rows-a cols-b) 0.0)])
142 |          (for ([i (in-range rows-a)])
143 |            (for ([j (in-range cols-b)])
144 |              (for ([k (in-range cols-a)])
145 |                (vector-set! result (+ (* i cols-b) j)
146 |                             (+ (vector-ref result (+ (* i cols-b) j))
147 |                                (* (t:ref t1 i k) (t:ref t2 k j)))))))
148 |          (tensor (list rows-a cols-b) result))]
149 | 
150 |       ;; Vector (1D) * Matrix (2D) multiplication when shapes align
151 |       [(and (= (length shape1) 1) (= (length shape2) 2) (= (car shape1) (car shape2)))
152 |        (let* ([rows-b (car shape2)]
153 |               [cols-b (cadr shape2)]
154 |               [result (make-vector cols-b 0.0)])
155 |          (for ([j (in-range cols-b)])
156 |            (for ([i (in-range rows-b)])
157 |              (vector-set! result j
158 |                           (+ (vector-ref result j)
159 |                              (* (vector-ref (tensor-data t1) i)
160 |                                 (t:ref t2 i j))))))
161 |          (tensor (list cols-b) result))]
162 | 
163 |       ;; Elementwise multiplication if shapes match and are both 2D (or same dimension)
164 |       [(equal? shape1 shape2)
165 |        (tensor shape1 (vector-map * (tensor-data t1) (tensor-data t2)))]
166 | 
167 |       [else
168 |        (error "t:mul: Tensors must have compatible shapes for multiplication")])))
169 | 
170 | ;; Reference element at (i, j)
171 | (define (t:ref t i j)
172 |   (vector-ref (tensor-data t) (+ (* i (cadr (tensor-shape t))) j)))
173 | 
174 | ;; Transpose a matrix (2D only)
175 | (define (t:transpose t)
176 |   (let* ([shape (tensor-shape t)]
177 |          [rows (car shape)]
178 |          [cols (cadr shape)]
179 |          [data (tensor-data t)]
180 |          [new-data (make-vector (apply * (reverse shape)) 0)])
181 |     (for* ([i rows]
182 |            [j cols])
183 |       (vector-set! new-data (+ (* j rows) i) (vector-ref data (+ (* i cols) j))))
184 |     (tensor (reverse shape) new-data)))
185 | 
186 | ;; Scalar multiply a tensor
187 | (define (t:scale t scalar)
188 |   (let ([data (tensor-data t)])
189 |     (tensor (tensor-shape t)
190 |             (for/vector ([v data])
191 |               (* v scalar)))))
192 | 
193 | ;; Dot product (1D only; tensor data are vectors, so use vector operations)
194 | (define (t:dot t1 t2)
195 |   (let ([data1 (tensor-data t1)]
196 |         [data2 (tensor-data t2)])
197 |     (if (not (= (vector-length data1) (vector-length data2)))
198 |         (error "t:dot: Tensors must have the same length for dot product")
199 |         (for/sum ([a (in-vector data1)] [b (in-vector data2)]) (* a b)))))
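;; ---------------------------------------------------------------------------
;; Added usage sketch (not part of the original tensor.rkt): a minimal, hedged
;; demo of the t: API above. The shapes and literal values are illustrative
;; assumptions only, not fixtures from the RacoGrad test suite.
(module+ main
  (define A (t:create '(2 3) '(1.0 2.0 3.0 4.0 5.0 6.0)))
  (define B (t:create '(3 2) '(1.0 0.0 0.0 1.0 1.0 1.0)))
  ;; (2x3) x (3x2) -> (2x2) matrix product
  (t:print (t:mul A B))
  ;; Row-wise broadcast: add a length-3 bias to each row of A
  (t:print (t:add A (t:create '(3) '(0.1 0.2 0.3))))
  ;; Dot product of two 1D tensors => 32.0
  (displayln (t:dot (t:create '(3) '(1.0 2.0 3.0))
                    (t:create '(3) '(4.0 5.0 6.0)))))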
--------------------------------------------------------------------------------
/tensor2.rkt:
--------------------------------------------------------------------------------
1 | #lang racket
2 | 
3 | (require ffi/unsafe
4 |          ffi/unsafe/define
5 |          ffi/unsafe/cvector)
6 | 
7 | ; Define the C function
8 | (define-ffi-definer define-c (ffi-lib "./matrix_multiplication.so"))
9 | (define-c matrix_multiply (_fun _pointer _pointer _pointer _int _int _int -> _void))
10 | 
11 | (provide tensor create-tensor tensor-add tensor-subtract tensor-multiply dense-forward mean-squared-error dense-backward print-tensor random-tensor reshape-tensor
12 |          transpose scalar-multiply dot-product relu relu-derivative tensor-shape tensor-data initialize-fnn)
13 | 
14 | (struct tensor (shape data) #:transparent)
15 | 
16 | (define (create-tensor shape data)
17 |   (let ((vec-data (if (vector? data) data (list->vector data))))
18 |     (cond
19 |       [(= (apply * shape) (vector-length vec-data))
20 |        (tensor shape vec-data)]
21 |       [else
22 |        (begin
23 |          (println "Error: Data does not match, please check the size")
24 |          #f)])))
25 | 
26 | ; Print a tensor; 1D tensors (e.g. bias vectors) print as a single row
27 | (define (print-tensor t)
28 |   (let ([shape (tensor-shape t)]
29 |         [data (tensor-data t)])
30 |     (cond
31 |       [(= (length shape) 1)
32 |        (display "[")
33 |        (for ([i (in-range (car shape))])
34 |          (display (vector-ref data i))
35 |          (display " "))
36 |        (display "]")
37 |        (newline)]
38 |       [else
39 |        (for ([i (in-range (car shape))])
40 |          (display "[")
41 |          (for ([j (in-range (cadr shape))])
42 |            (display (vector-ref data (+ (* i (cadr shape)) j)))
43 |            (display " "))
44 |          (display "]")
45 |          (newline))])))
46 | 
47 | ; Create random tensor
48 | (define (random-tensor shape range)
49 |   (let* ((size (apply * shape))
50 |          (max-value (inexact->exact (floor (* range 10000)))))
51 |     (tensor shape
52 |             (for/vector ([i size])
53 |               (/ (random max-value) 10000.0)))))
54 | 
55 | ; Reshaping tensors
56 | (define (reshape-tensor t new-shape)
57 |   (let ([original-size (apply * (tensor-shape t))]
58 |         [new-size (apply * new-shape)])
59 |     (if (= original-size new-size)
60 |         (tensor new-shape (tensor-data t))
61 |         (error "New shape must have the same number of elements as the original shape"))))
62 | 
63 | ; Element-wise operations
64 | (define (tensor-add t1 t2)
65 |   (let ([shape1 (tensor-shape t1)]
66 |         [shape2 (tensor-shape t2)])
67 |     (displayln (string-append "Adding tensors with shapes: " (format "~a" shape1) " and " (format "~a" shape2)))
68 |     (cond
69 |       [(equal? shape1 shape2)
70 |        (tensor shape1 (for/vector ([i (vector-length (tensor-data t1))])
71 |                         (+ (vector-ref (tensor-data t1) i)
72 |                            (vector-ref (tensor-data t2) i))))]
73 |       ; Broadcast a 1D bias across the rows of a 2D tensor (needed by dense-forward)
74 |       [(and (= (length shape1) 2) (= (length shape2) 1) (= (cadr shape1) (car shape2)))
75 |        (let ([cols (cadr shape1)])
76 |          (tensor shape1 (for/vector ([i (vector-length (tensor-data t1))])
77 |                           (+ (vector-ref (tensor-data t1) i)
78 |                              (vector-ref (tensor-data t2) (remainder i cols))))))]
79 |       [else
80 |        (error "Tensors must have the same shape (or be row-broadcastable) for addition")])))
81 | 
82 | (define (tensor-subtract t1 t2)
83 |   (let ([shape1 (tensor-shape t1)]
84 |         [shape2 (tensor-shape t2)])
85 |     (displayln (string-append "Subtracting tensors with shapes: " (format "~a" shape1) " and " (format "~a" shape2)))
86 |     (cond
87 |       [(equal? shape1 shape2)
88 |        (tensor shape1 (for/vector ([i (vector-length (tensor-data t1))])
89 |                         (- (vector-ref (tensor-data t1) i)
90 |                            (vector-ref (tensor-data t2) i))))]
91 |       [else
92 |        (error "Tensors must have the same shape for subtraction")])))
93 | 
94 | (define (tensor-multiply t1 t2)
95 |   (let ([shape1 (tensor-shape t1)]
96 |         [shape2 (tensor-shape t2)])
97 |     (displayln (string-append "Multiplying tensors with shapes: " (format "~a" shape1) " and " (format "~a" shape2)))
98 |     (cond
99 |       [(and (= (length shape1) 2) (= (length shape2) 2) (= (cadr shape1) (car shape2)))
100 |        (let* ([rows-a (car shape1)]
101 |               [cols-a (cadr shape1)]
102 |               [cols-b (cadr shape2)]
103 |               [a (tensor-data t1)]
104 |               [b (tensor-data t2)]
105 |               ; list->cvector allocates real C buffers; _cvector alone is only a ctype
106 |               [cvec-a (list->cvector (vector->list a) _double)]
107 |               [cvec-b (list->cvector (vector->list b) _double)]
108 |               [cvec-c (list->cvector (make-list (* rows-a cols-b) 0.0) _double)])
109 |          (matrix_multiply (cvector-ptr cvec-a) (cvector-ptr cvec-b) (cvector-ptr cvec-c) rows-a cols-a cols-b)
110 |          (tensor (list rows-a cols-b) (list->vector (cvector->list cvec-c))))]
111 |       [else
112 |        (error "Tensors must have compatible shapes for multiplication")])))
113 | ; Activation function (ReLU); 0.0 keeps the data flonum for the FFI matmul path
114 | (define (relu x)
115 |   (tensor (tensor-shape x) (for/vector ([v (tensor-data x)]) (max 0.0 v))))
116 | 
117 | ; Derivative of ReLU
118 | (define (relu-derivative x)
119 |   (tensor (tensor-shape x) (for/vector ([v (tensor-data x)]) (if (> v 0) 1.0 0.0))))
120 | 
121 | ; Dense layer forward propagation
122 | (define (dense-forward input weights biases)
123 |   (let* ([z (tensor-add (tensor-multiply input weights) biases)]
124 |          [activation-output (relu z)])
125 |     activation-output))
126 | 
127 | ; Loss function: mean squared error over all elements (elementwise, not a matrix product)
128 | (define (mean-squared-error y-true y-pred)
129 |   (let* ([diff (tensor-subtract y-true y-pred)]
130 |          [data (tensor-data diff)]
131 |          [sum (for/sum ([v (in-vector data)]) (* v v))])
132 |     (/ sum (vector-length (tensor-data y-true)))))
133 | 
134 | ; Backpropagation for dense layer
135 | (define (dense-backward input weights biases output grad-output learning-rate)
136 |   (let* ([grad-activation (relu-derivative output)]
137 |          ; dL/dz = dL/da * relu'(z), elementwise
138 |          [grad-z (tensor (tensor-shape grad-output)
139 |                          (vector-map * (tensor-data grad-output) (tensor-data grad-activation)))]
140 |          ; bias gradient: sum grad-z over the batch dimension (column sums)
141 |          [grad-biases (let ([rows (car (tensor-shape grad-z))]
142 |                             [cols (cadr (tensor-shape grad-z))]
143 |                             [gz (tensor-data grad-z)])
144 |                         (tensor (tensor-shape biases)
145 |                                 (for/vector ([j (in-range cols)])
146 |                                   (for/sum ([i (in-range rows)])
147 |                                     (vector-ref gz (+ (* i cols) j))))))]
148 |          [grad-weights (tensor-multiply (transpose input) grad-z)]
149 |          [grad-input (tensor-multiply grad-z (transpose weights))])
150 |     (let* ([new-weights (tensor-subtract weights (scalar-multiply grad-weights learning-rate))]
151 |            [new-biases (tensor-subtract biases (scalar-multiply grad-biases learning-rate))])
152 |       (values new-weights new-biases grad-input))))
153 | 
154 | (define (transpose t)
155 |   (let* ([shape (tensor-shape t)]
156 |          [rows (car shape)]
157 |          [cols (cadr shape)]
158 |          [data (tensor-data t)]
159 |          [new-data (make-vector (apply * (reverse shape)) 0)])
160 |     (for* ([i rows]
161 |            [j cols])
162 |       (vector-set! new-data (+ (* j rows) i) (vector-ref data (+ (* i cols) j))))
163 |     (tensor (reverse shape) new-data)))
164 | 
165 | (define (scalar-multiply t scalar)
166 |   (tensor (tensor-shape t) (for/vector ([v (tensor-data t)]) (* v scalar))))
167 | 
168 | ; Example of initializing a simple FNN
169 | (define (initialize-fnn batch-size input-dim output-dim)
170 |   (let* ([input-data (make-list (* batch-size input-dim) 0.0)] ; flonums, for the FFI path
171 |          [input-tensor (create-tensor (list batch-size input-dim) input-data)]
172 |          [weight-shape (list input-dim output-dim)]
173 |          [bias-shape (list output-dim)]
174 |          [weights (random-tensor weight-shape 1.0)]
175 |          [biases (random-tensor bias-shape 1.0)])
176 |     (values input-tensor weights biases)))
177 | 
178 | ; Dot product (1D tensors; data are vectors, so use vector operations)
179 | (define (dot-product t1 t2)
180 |   (let ([data1 (tensor-data t1)]
181 |         [data2 (tensor-data t2)])
182 |     (if (not (= (vector-length data1) (vector-length data2)))
183 |         (error "Tensors must have the same length for dot product")
184 |         (for/sum ([a (in-vector data1)] [b (in-vector data2)]) (* a b)))))
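;; ---------------------------------------------------------------------------
;; Added usage sketch (not part of the original tensor2.rkt): a hedged,
;; end-to-end smoke test of the layer helpers above. It assumes
;; matrix_multiplication.so has already been built (see compile_extensions.sh);
;; the shapes and the random "target" are illustrative only.
(module+ main
  (define-values (input weights biases) (initialize-fnn 2 3 4))
  ;; Forward pass: (2x3 input) x (3x4 weights) + broadcast bias, then ReLU
  (define out (dense-forward input weights biases))
  (print-tensor out)
  ;; Compare against a random target just to exercise the loss
  (displayln (mean-squared-error (random-tensor '(2 4) 1.0) out)))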
--------------------------------------------------------------------------------
/tensor_device.rkt:
--------------------------------------------------------------------------------
1 | #lang racket
2 | 
3 | (require "tensor.rkt"
4 |          "device.rkt"
5 |          "hardware_detection.rkt"
6 |          ffi/unsafe
7 |          ffi/vector)
8 | 
9 | ;; Provide extended tensor operations with device support
10 | (provide
11 |  ;; Core operations
12 |  dt:create      ; Create a tensor on a specific device
13 |  dt:to          ; Move a tensor to a specific device
14 |  dt:random      ; Create a random tensor on a device
15 |  dt:print       ; Print a tensor
16 |  dt:reshape     ; Reshape a tensor
17 |  dt:device      ; Get the device of a tensor
18 | 
19 |  ;; Math operations
20 |  dt:add         ; Add tensors
21 |  dt:sub         ; Subtract tensors
22 |  dt:mul         ; Matrix multiplication or elementwise
23 |  dt:dot         ; Dot product
24 |  dt:scale       ; Scalar multiplication
25 |  dt:transpose   ; Transpose a tensor
26 | 
27 |  ;; Accessors (same as tensor.rkt)
28 |  dt:shape       ; Get shape
29 |  dt:data        ; Get data
30 |  dt:ref         ; Get value at index
31 |  )
32 | 
33 | ;; Define device-aware tensor structure as a separate struct
34 | ;; instead of extending tensor, which causes issues
35 | (struct dt:tensor (shape data device) #:transparent)
36 | 
37 | ;; Create tensor on device
38 | (define (dt:create shape data [dev (current-device)])
39 |   (let ([t (t:create shape data)])
40 |     (dt:tensor (t:shape t) (t:data t) dev)))
41 | 
42 | ;; Get device of tensor
43 | (define (dt:device t)
44 |   (dt:tensor-device t))
45 | 
46 | ;; Move tensor to device
47 | (define (dt:to t dev)
48 |   (cond
49 |     [(dt:tensor? t)
50 |      (let ([current-dev (dt:tensor-device t)])
51 |        (cond
52 |          ;; Same device - no-op
53 |          [(equal? current-dev dev) t]
54 | 
55 |          ;; Move to GPU from CPU
56 |          [(and (cpu-device? current-dev) (gpu-device? dev))
57 |           (printf "Moving tensor to GPU device~n")
58 |           (dt:tensor (dt:tensor-shape t) (dt:tensor-data t) dev)]
59 | 
60 |          ;; Move to CPU from GPU
61 |          [(and (gpu-device? current-dev) (cpu-device? dev))
62 |           (printf "Moving tensor to CPU device~n")
63 |           (dt:tensor (dt:tensor-shape t) (dt:tensor-data t) dev)]
64 | 
65 |          [else t]))]
66 | 
67 |     ;; Convert regular tensor to device tensor
68 |     [(tensor?
t) 69 | (dt:tensor (t:shape t) (t:data t) dev)] 70 | 71 | [else 72 | (error "dt:to: expected a tensor, got ~a" t)])) 73 | 74 | ;; Create random tensor on device 75 | (define (dt:random shape range [dev (current-device)]) 76 | (let ([t (t:random shape range)]) 77 | (dt:tensor (t:shape t) (t:data t) dev))) 78 | 79 | ;; Print tensor 80 | (define (dt:print t) 81 | (when (dt:tensor? t) 82 | ;; Create a regular tensor to print 83 | (let ([regular-tensor (t:create (dt:tensor-shape t) (dt:tensor-data t))]) 84 | (t:print regular-tensor) 85 | (printf "Device: ~a~n" (get-device-type (dt:tensor-device t)))))) 86 | 87 | ;; Reshape tensor 88 | (define (dt:reshape t new-shape) 89 | (when (dt:tensor? t) 90 | (let ([regular-tensor (t:create (dt:tensor-shape t) (dt:tensor-data t))] 91 | [dev (dt:tensor-device t)]) 92 | (let ([reshaped (t:reshape regular-tensor new-shape)]) 93 | (dt:tensor (t:shape reshaped) (t:data reshaped) dev))))) 94 | 95 | ;; Add tensors 96 | (define (dt:add t1 t2) 97 | (cond 98 | [(and (dt:tensor? t1) (dt:tensor? t2)) 99 | (let ([dev1 (dt:tensor-device t1)] 100 | [dev2 (dt:tensor-device t2)]) 101 | (cond 102 | ;; Both on CPU 103 | [(and (cpu-device? dev1) (cpu-device? dev2)) 104 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 105 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 106 | (let ([sum (t:add regular-t1 regular-t2)]) 107 | (dt:tensor (t:shape sum) (t:data sum) dev1)))] 108 | 109 | ;; Different devices - move to same device 110 | [(not (equal? dev1 dev2)) 111 | (let ([target-dev (if (gpu-device? dev1) dev1 dev2)]) 112 | (dt:add (dt:to t1 target-dev) (dt:to t2 target-dev)))] 113 | 114 | ;; Both on GPU - use OpenCL implementation if available 115 | [(and (gpu-device? dev1) (gpu-device? dev2) (has-opencl?)) 116 | (printf "Performing GPU tensor addition~n") 117 | ;; For now, fall back to CPU implementation 118 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 119 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 120 | (let ([sum (t:add regular-t1 regular-t2)]) 121 | (dt:tensor (t:shape sum) (t:data sum) dev1)))] 122 | 123 | [else 124 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 125 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 126 | (let ([sum (t:add regular-t1 regular-t2)]) 127 | (dt:tensor (t:shape sum) (t:data sum) dev1)))]))] 128 | 129 | ;; Handle regular tensors 130 | [else 131 | (let ([dev (current-device)]) 132 | (let ([dt1 (if (dt:tensor? t1) 133 | t1 134 | (dt:tensor (t:shape t1) (t:data t1) dev))] 135 | [dt2 (if (dt:tensor? t2) 136 | t2 137 | (dt:tensor (t:shape t2) (t:data t2) dev))]) 138 | (dt:add dt1 dt2)))])) 139 | 140 | ;; Subtract tensors 141 | (define (dt:sub t1 t2) 142 | (cond 143 | [(and (dt:tensor? t1) (dt:tensor? t2)) 144 | (let ([dev1 (dt:tensor-device t1)] 145 | [dev2 (dt:tensor-device t2)]) 146 | (cond 147 | ;; Both on CPU 148 | [(and (cpu-device? dev1) (cpu-device? dev2)) 149 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 150 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 151 | (let ([diff (t:sub regular-t1 regular-t2)]) 152 | (dt:tensor (t:shape diff) (t:data diff) dev1)))] 153 | 154 | ;; Different devices - move to same device 155 | [(not (equal? dev1 dev2)) 156 | (let ([target-dev (if (gpu-device? dev1) dev1 dev2)]) 157 | (dt:sub (dt:to t1 target-dev) (dt:to t2 target-dev)))] 158 | 159 | ;; Both on GPU - use OpenCL implementation if available 160 | [(and (gpu-device? 
dev1) (gpu-device? dev2) (has-opencl?)) 161 | (printf "Performing GPU tensor subtraction~n") 162 | ;; For now, fall back to CPU implementation 163 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 164 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 165 | (let ([diff (t:sub regular-t1 regular-t2)]) 166 | (dt:tensor (t:shape diff) (t:data diff) dev1)))] 167 | 168 | [else 169 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 170 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 171 | (let ([diff (t:sub regular-t1 regular-t2)]) 172 | (dt:tensor (t:shape diff) (t:data diff) dev1)))]))] 173 | 174 | ;; Handle regular tensors 175 | [else 176 | (let ([dev (current-device)]) 177 | (let ([dt1 (if (dt:tensor? t1) 178 | t1 179 | (dt:tensor (t:shape t1) (t:data t1) dev))] 180 | [dt2 (if (dt:tensor? t2) 181 | t2 182 | (dt:tensor (t:shape t2) (t:data t2) dev))]) 183 | (dt:sub dt1 dt2)))])) 184 | 185 | ;; Multiply tensors 186 | (define (dt:mul t1 t2) 187 | (cond 188 | [(and (dt:tensor? t1) (dt:tensor? t2)) 189 | (let ([dev1 (dt:tensor-device t1)] 190 | [dev2 (dt:tensor-device t2)]) 191 | (cond 192 | ;; Both on CPU 193 | [(and (cpu-device? dev1) (cpu-device? dev2)) 194 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 195 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 196 | (let ([prod (t:mul regular-t1 regular-t2)]) 197 | (dt:tensor (t:shape prod) (t:data prod) dev1)))] 198 | 199 | ;; Different devices - move to same device 200 | [(not (equal? dev1 dev2)) 201 | (let ([target-dev (if (gpu-device? dev1) dev1 dev2)]) 202 | (dt:mul (dt:to t1 target-dev) (dt:to t2 target-dev)))] 203 | 204 | ;; Both on GPU - use OpenCL implementation if available 205 | [(and (gpu-device? dev1) (gpu-device? dev2) (has-opencl?)) 206 | (printf "Performing GPU tensor multiplication~n") 207 | ;; TODO: Implement GPU matrix multiplication using OpenCL 208 | ;; For now, fall back to CPU implementation 209 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 210 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 211 | (let ([prod (t:mul regular-t1 regular-t2)]) 212 | (dt:tensor (t:shape prod) (t:data prod) dev1)))] 213 | 214 | [else 215 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 216 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 217 | (let ([prod (t:mul regular-t1 regular-t2)]) 218 | (dt:tensor (t:shape prod) (t:data prod) dev1)))]))] 219 | 220 | ;; Handle regular tensors 221 | [else 222 | (let ([dev (current-device)]) 223 | (let ([dt1 (if (dt:tensor? t1) 224 | t1 225 | (dt:tensor (t:shape t1) (t:data t1) dev))] 226 | [dt2 (if (dt:tensor? t2) 227 | t2 228 | (dt:tensor (t:shape t2) (t:data t2) dev))]) 229 | (dt:mul dt1 dt2)))])) 230 | 231 | ;; Scalar multiply 232 | (define (dt:scale t scalar) 233 | (if (dt:tensor? t) 234 | (let ([regular-tensor (t:create (dt:tensor-shape t) (dt:tensor-data t))] 235 | [dev (dt:tensor-device t)]) 236 | (let ([scaled (t:scale regular-tensor scalar)]) 237 | (dt:tensor (t:shape scaled) (t:data scaled) dev))) 238 | (let ([scaled (t:scale t scalar)]) 239 | (dt:tensor (t:shape scaled) (t:data scaled) (current-device))))) 240 | 241 | ;; Transpose 242 | (define (dt:transpose t) 243 | (if (dt:tensor? 
t) 244 | (let ([regular-tensor (t:create (dt:tensor-shape t) (dt:tensor-data t))] 245 | [dev (dt:tensor-device t)]) 246 | (let ([transposed (t:transpose regular-tensor)]) 247 | (dt:tensor (t:shape transposed) (t:data transposed) dev))) 248 | (let ([transposed (t:transpose t)]) 249 | (dt:tensor (t:shape transposed) (t:data transposed) (current-device))))) 250 | 251 | ;; Dot product 252 | (define (dt:dot t1 t2) 253 | (cond 254 | [(and (dt:tensor? t1) (dt:tensor? t2)) 255 | (let ([dev1 (dt:tensor-device t1)] 256 | [dev2 (dt:tensor-device t2)]) 257 | (cond 258 | ;; Different devices - move to same device 259 | [(not (equal? dev1 dev2)) 260 | (let ([target-dev (if (gpu-device? dev1) dev1 dev2)]) 261 | (dt:dot (dt:to t1 target-dev) (dt:to t2 target-dev)))] 262 | 263 | ;; Both on GPU - use OpenCL implementation if available 264 | [(and (gpu-device? dev1) (gpu-device? dev2) (has-opencl?)) 265 | (printf "Performing GPU dot product~n") 266 | ;; For now, fall back to CPU implementation 267 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 268 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 269 | (t:dot regular-t1 regular-t2))] 270 | 271 | [else 272 | (let ([regular-t1 (t:create (dt:tensor-shape t1) (dt:tensor-data t1))] 273 | [regular-t2 (t:create (dt:tensor-shape t2) (dt:tensor-data t2))]) 274 | (t:dot regular-t1 regular-t2))]))] 275 | 276 | ;; Handle regular tensors 277 | [else 278 | (let ([dev (current-device)]) 279 | (let ([regular-t1 (if (dt:tensor? t1) 280 | (t:create (dt:tensor-shape t1) (dt:tensor-data t1)) 281 | t1)] 282 | [regular-t2 (if (dt:tensor? t2) 283 | (t:create (dt:tensor-shape t2) (dt:tensor-data t2)) 284 | t2)]) 285 | (t:dot regular-t1 regular-t2)))])) 286 | 287 | ;; Accessors 288 | (define (dt:shape t) 289 | (if (dt:tensor? t) 290 | (dt:tensor-shape t) 291 | (t:shape t))) 292 | 293 | (define (dt:data t) 294 | (if (dt:tensor? t) 295 | (dt:tensor-data t) 296 | (t:data t))) 297 | 298 | (define (dt:ref t i j) 299 | (if (dt:tensor? t) 300 | (vector-ref (dt:tensor-data t) (+ (* i (cadr (dt:tensor-shape t))) j)) 301 | (t:ref t i j))) -------------------------------------------------------------------------------- /tensor_optimized.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require "ffi_ops.rkt") 4 | (require ffi/vector) 5 | 6 | (provide (struct-out tensor-opt) 7 | t-opt:create 8 | t-opt:random 9 | t-opt:reshape 10 | t-opt:print 11 | t-opt:add 12 | t-opt:add! 13 | t-opt:sub 14 | t-opt:sub! 15 | t-opt:mul 16 | t-opt:transpose 17 | t-opt:shape 18 | t-opt:data 19 | t-opt:ref 20 | t-opt:scale 21 | t-opt:scale!) 22 | 23 | ;; Optimized tensor structure 24 | ;; Uses a f64vector for better FFI compatibility and memory layout 25 | (struct tensor-opt (shape data) #:transparent) 26 | 27 | ;; Accessors 28 | (define (t-opt:shape t) 29 | (tensor-opt-shape t)) 30 | 31 | (define (t-opt:data t) 32 | (tensor-opt-data t)) 33 | 34 | ;; Create tensor from data 35 | (define (t-opt:create shape data) 36 | (let ((vec-data (cond 37 | [(f64vector? data) data] 38 | [(vector? 
data) (list->f64vector (vector->list data))] 39 | [else (list->f64vector data)]))) 40 | (cond 41 | [(= (apply * shape) (f64vector-length vec-data)) 42 | (tensor-opt shape vec-data)] 43 | [else 44 | (error "t-opt:create: Data does not match shape")]))) 45 | 46 | ;; Random tensor 47 | (define (t-opt:random shape range) 48 | (let* ((size (apply * shape)) 49 | (max-value (inexact->exact (floor (* range 10000))))) 50 | (tensor-opt shape 51 | (list->f64vector 52 | (for/list ([i size]) 53 | (/ (random max-value) 10000.0)))))) 54 | 55 | ;; Reshape tensor 56 | (define (t-opt:reshape t new-shape) 57 | (let ([original-size (apply * (tensor-opt-shape t))] 58 | [new-size (apply * new-shape)]) 59 | (if (= original-size new-size) 60 | (tensor-opt new-shape (tensor-opt-data t)) 61 | (error "t-opt:reshape: New shape must have the same number of elements")))) 62 | 63 | ;; Print tensor 64 | (define (t-opt:print t) 65 | (let ([shape (tensor-opt-shape t)] 66 | [data (tensor-opt-data t)]) 67 | (cond 68 | [(= (length shape) 1) 69 | (display "[") 70 | (for ([i (in-range (car shape))]) 71 | (display (f64vector-ref data i)) 72 | (display " ")) 73 | (display "]") 74 | (newline)] 75 | [(= (length shape) 2) 76 | (for ([i (in-range (car shape))]) 77 | (display "[") 78 | (for ([j (in-range (cadr shape))]) 79 | (display (f64vector-ref data (+ (* i (cadr shape)) j))) 80 | (display " ")) 81 | (display "]") 82 | (newline))] 83 | [else (error "t-opt:print: Unsupported tensor shape")]))) 84 | 85 | ;; Add tensors - out of place 86 | (define (t-opt:add t1 t2) 87 | (let ([shape1 (tensor-opt-shape t1)] 88 | [shape2 (tensor-opt-shape t2)]) 89 | (cond 90 | [(equal? shape1 shape2) 91 | (let* ([size (apply * shape1)] 92 | [result (make-f64vector size 0.0)]) 93 | (c:tensor-add size 94 | (tensor-opt-data t1) 95 | (tensor-opt-data t2) 96 | result) 97 | (tensor-opt shape1 result))] 98 | [else 99 | (error "t-opt:add: Tensors must have the same shape")]))) 100 | 101 | ;; Add tensors - in-place version (t1 += t2) 102 | (define (t-opt:add! t1 t2) 103 | (let ([shape1 (tensor-opt-shape t1)] 104 | [shape2 (tensor-opt-shape t2)]) 105 | (cond 106 | [(equal? shape1 shape2) 107 | (let ([size (apply * shape1)]) 108 | (c:tensor-add size 109 | (tensor-opt-data t1) 110 | (tensor-opt-data t2) 111 | (tensor-opt-data t1)) 112 | t1)] 113 | [else 114 | (error "t-opt:add!: Tensors must have the same shape")]))) 115 | 116 | ;; Subtract tensors - out of place 117 | (define (t-opt:sub t1 t2) 118 | (let ([shape1 (tensor-opt-shape t1)] 119 | [shape2 (tensor-opt-shape t2)]) 120 | (cond 121 | [(equal? shape1 shape2) 122 | (let* ([size (apply * shape1)] 123 | [result (make-f64vector size 0.0)]) 124 | (c:tensor-sub size 125 | (tensor-opt-data t1) 126 | (tensor-opt-data t2) 127 | result) 128 | (tensor-opt shape1 result))] 129 | [else 130 | (error "t-opt:sub: Tensors must have the same shape")]))) 131 | 132 | ;; Subtract tensors - in-place version (t1 -= t2) 133 | (define (t-opt:sub! t1 t2) 134 | (let ([shape1 (tensor-opt-shape t1)] 135 | [shape2 (tensor-opt-shape t2)]) 136 | (cond 137 | [(equal? 
shape1 shape2) 138 | (let ([size (apply * shape1)]) 139 | (c:tensor-sub size 140 | (tensor-opt-data t1) 141 | (tensor-opt-data t2) 142 | (tensor-opt-data t1)) 143 | t1)] 144 | [else 145 | (error "t-opt:sub!: Tensors must have the same shape")]))) 146 | 147 | ;; Scale tensor - out of place 148 | (define (t-opt:scale t scalar) 149 | (let* ([shape (tensor-opt-shape t)] 150 | [size (apply * shape)] 151 | [result (make-f64vector size 0.0)]) 152 | (c:tensor-scale size 153 | (tensor-opt-data t) 154 | scalar 155 | result) 156 | (tensor-opt shape result))) 157 | 158 | ;; Scale tensor - in-place version (t *= scalar) 159 | (define (t-opt:scale! t scalar) 160 | (let* ([shape (tensor-opt-shape t)] 161 | [size (apply * shape)]) 162 | (c:tensor-scale size 163 | (tensor-opt-data t) 164 | scalar 165 | (tensor-opt-data t)) 166 | t)) 167 | 168 | ;; Matrix multiplication 169 | (define (t-opt:mul t1 t2) 170 | (let ([shape1 (tensor-opt-shape t1)] 171 | [shape2 (tensor-opt-shape t2)]) 172 | (cond 173 | ;; Matrix multiplication: (A: MxN) * (B: NxP) -> (C: MxP) 174 | [(and (= (length shape1) 2) (= (length shape2) 2) (= (cadr shape1) (car shape2))) 175 | (let* ([rows-a (car shape1)] 176 | [cols-a (cadr shape1)] 177 | [cols-b (cadr shape2)] 178 | [result (make-f64vector (* rows-a cols-b) 0.0)]) 179 | (c:matrix-multiply rows-a cols-a cols-b 180 | (tensor-opt-data t1) 181 | (tensor-opt-data t2) 182 | result) 183 | (tensor-opt (list rows-a cols-b) result))] 184 | 185 | ;; Elementwise multiplication if shapes match 186 | [(equal? shape1 shape2) 187 | (let* ([size (apply * shape1)] 188 | [result (make-f64vector size 0.0)]) 189 | (c:tensor-mul-elementwise size 190 | (tensor-opt-data t1) 191 | (tensor-opt-data t2) 192 | result) 193 | (tensor-opt shape1 result))] 194 | 195 | [else 196 | (error "t-opt:mul: Tensors must have compatible shapes")]))) 197 | 198 | ;; Reference element at (i, j) 199 | (define (t-opt:ref t i j) 200 | (f64vector-ref (tensor-opt-data t) (+ (* i (cadr (tensor-opt-shape t))) j))) 201 | 202 | ;; Transpose a matrix (2D only) 203 | (define (t-opt:transpose t) 204 | (let* ([shape (tensor-opt-shape t)] 205 | [rows (car shape)] 206 | [cols (cadr shape)] 207 | [data (tensor-opt-data t)] 208 | [new-data (make-f64vector (apply * (reverse shape)) 0.0)]) 209 | (for* ([i rows] 210 | [j cols]) 211 | (f64vector-set! 
new-data (+ (* j rows) i) (f64vector-ref data (+ (* i cols) j))))
212 |     (tensor-opt (reverse shape) new-data)))
--------------------------------------------------------------------------------
/test.rkt:
--------------------------------------------------------------------------------
1 | #lang racket
2 | 
3 | (printf "Starting RacoGrad test script...~n~n")
4 | 
5 | ;; Test MNIST
6 | (printf "Testing MNIST implementation (abbreviated run for quick testing)...~n")
7 | (printf "=============================================================~n")
8 | 
9 | ;; Set MNIST to run just a couple of epochs
10 | (parameterize ([current-command-line-arguments (vector "-m")])
11 |   (dynamic-require "mnist.rkt" #f))
12 | 
13 | ;; Wait a bit
14 | (sleep 2)
15 | 
16 | ;; Test CNN
17 | (printf "~n~nTesting CNN implementation (abbreviated run for quick testing)...~n")
18 | (printf "=================================================================~n")
19 | 
20 | ;; Run CNN with minimal epochs/examples for testing
21 | (parameterize ([current-command-line-arguments (vector "-m")])
22 |   (dynamic-require "CNN.rkt" #f))
23 | 
24 | (printf "~n~nTests completed!~n")
--------------------------------------------------------------------------------
/visualization.rkt:
--------------------------------------------------------------------------------
1 | #lang racket
2 | 
3 | (require plot
4 |          pict) ; vl-append and pict->bitmap come from pict
5 | 
6 | (provide plot-training-history)
7 | 
8 | ;; Function to plot training history
9 | ;; history: a list of entries in format (epoch loss accuracy val-accuracy)
10 | (define (plot-training-history history [filename "training_history.png"])
11 |   (let* ([epochs (map first history)]
12 |          [losses (map second history)]
13 |          [accuracies (map third history)]
14 |          [val-accuracies (map fourth history)])
15 | 
16 |     ;; Generate the loss plot (plot-pict returns a pict, which vl-append can combine)
17 |     (define loss-plot
18 |       (parameterize ([plot-x-label "Epoch"]
19 |                      [plot-y-label "Loss"]
20 |                      [plot-title "Training Loss"]
21 |                      [plot-font-size 12])
22 |         (plot-pict
23 |          (list
24 |           (points (map vector epochs losses)
25 |                   #:color 'blue
26 |                   #:sym 'circle
27 |                   #:size 6)
28 |           (lines (map vector epochs losses)
29 |                  #:color 'blue
30 |                  #:width 2
31 |                  #:style 'solid
32 |                  #:label "Training Loss"))
33 |          #:x-min 0
34 |          #:y-min 0
35 |          #:width 500
36 |          #:height 300)))
37 | 
38 |     ;; Generate the accuracy plot
39 |     (define accuracy-plot
40 |       (parameterize ([plot-x-label "Epoch"]
41 |                      [plot-y-label "Accuracy (%)"]
42 |                      [plot-title "Training and Validation Accuracy"]
43 |                      [plot-font-size 12])
44 |         (plot-pict
45 |          (list
46 |           (points (map vector epochs accuracies)
47 |                   #:color 'green
48 |                   #:sym 'circle
49 |                   #:size 6)
50 |           (lines (map vector epochs accuracies)
51 |                  #:color 'green
52 |                  #:width 2
53 |                  #:style 'solid
54 |                  #:label "Training Accuracy")
55 |           (points (map vector epochs val-accuracies)
56 |                   #:color 'red
57 |                   #:sym 'triangle
58 |                   #:size 6)
59 |           (lines (map vector epochs val-accuracies)
60 |                  #:color 'red
61 |                  #:width 2
62 |                  #:style 'long-dash
63 |                  #:label "Validation Accuracy"))
64 |          #:x-min 0
65 |          #:y-min 0
66 |          #:y-max 100
67 |          #:width 500
68 |          #:height 300)))
69 | 
70 |     ;; Create a combined plot
71 |     (define combined-plot
72 |       (vl-append 10 loss-plot accuracy-plot))
73 | 
74 |     ;; Save the combined pict as a PNG (plot itself provides no save-plot procedure)
75 |     (send (pict->bitmap combined-plot) save-file filename 'png)
76 | 
77 |     (printf "Training history plot saved to ~a~n" filename)))
78 | 
79 | ;; Example usage
80 | (module+ main
81 |   ;; Generate sample history data
82 |   (define sample-history
83 |     (for/list ([epoch (in-range 10)])
84 |       (let* ([loss (- 1.0 (* 0.09 epoch))]
85 |              [acc (* 10 epoch)]
86 |              [val-acc (max 0 (- (* 10 epoch) 5))]) ; clamp: accuracy cannot be negative
87 |         (list epoch loss acc val-acc))))
88 | 
89 |   ;; Plot the sample data
90 |   (plot-training-history sample-history))
--------------------------------------------------------------------------------
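;; ---------------------------------------------------------------------------
;; Added appendix (not part of the repository dump): a minimal, hedged sketch
;; of calling the SIMD kernels from simd_ops.c directly over Racket's FFI.
;; It assumes the shared library was built as simd_ops.so, e.g. with
;;   gcc -O3 -mavx -shared -fPIC simd_ops.c -o simd_ops.so
;; (compile_extensions.sh presumably does something similar), and that
;; tensor_add_simd takes the same (size, a, b, result) arguments as
;; tensor_mul_elementwise_simd above. The repository's real bindings live in
;; ffi_ops.rkt; this standalone script only illustrates the calling convention.
#lang racket
(require ffi/unsafe ffi/unsafe/define ffi/vector)

(define-ffi-definer define-simd (ffi-lib "./simd_ops.so"))

;; void tensor_add_simd(int size, double *a, double *b, double *result)
(define-simd tensor_add_simd
  (_fun _int _f64vector _f64vector _f64vector -> _void))

;; void relu_forward_simd(int size, double *input, double *output)
(define-simd relu_forward_simd
  (_fun _int _f64vector _f64vector -> _void))

(define a   (list->f64vector '(1.0 -2.0 3.0 -4.0)))
(define b   (list->f64vector '(0.5 0.5 0.5 0.5)))
(define out (make-f64vector 4 0.0))

(tensor_add_simd 4 a b out)       ; out = a + b  -> (1.5 -1.5 3.5 -3.5)
(relu_forward_simd 4 out out)     ; out = max(out, 0), in place
(displayln (f64vector->list out)) ; => (1.5 0.0 3.5 0.0)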