├── .DS_Store ├── .gitignore ├── .vscode └── settings.json ├── README.md ├── assets ├── architecture.excalidraw ├── architecture.png ├── mnist-mlp.excalidraw └── mnist-mlp.png ├── cuda ├── .DS_Store ├── naive-gpu │ └── 1layer.cu └── vroom │ ├── comparing │ ├── batch-compare-backward.cu │ ├── batch-compare-forward.cu │ └── batch-matmul-compare.cu │ └── v1.cu ├── downloader.py ├── naive-cpu └── v1.c ├── python ├── c-friendly.py ├── torch_reference.ipynb └── torch_reference.py └── requirements.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Infatoshi/mnist-cuda/d0673daac0eb555c7d88717bb75becb5b6bfdcbb/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # watch out for those pesky .DS_Store & binary files 2 | data 3 | python/data 4 | python/venv 5 | .vscode 6 | .gitignore 7 | mnist_data 8 | dev -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.associations": { 3 | "ostream": "cpp", 4 | "chrono": "cpp", 5 | "dataloader.cuh": "c", 6 | "stdio.h": "c", 7 | "random": "cpp", 8 | "queue": "cpp", 9 | "stack": "cpp", 10 | "iostream": "cpp", 11 | "cstddef": "cpp", 12 | "array": "cpp", 13 | "atomic": "cpp", 14 | "bit": "cpp", 15 | "*.tcc": "cpp", 16 | "bitset": "cpp", 17 | "cctype": "cpp", 18 | "cinttypes": "cpp", 19 | "clocale": "cpp", 20 | "cmath": "cpp", 21 | "compare": "cpp", 22 | "complex": "cpp", 23 | "concepts": "cpp", 24 | "condition_variable": "cpp", 25 | "csignal": "cpp", 26 | "cstdarg": "cpp", 27 | "cstdint": "cpp", 28 | "cstdio": "cpp", 29 | "cstdlib": "cpp", 30 | "cstring": "cpp", 31 | "ctime": "cpp", 32 | "cwchar": "cpp", 33 | "cwctype": "cpp", 34 | "deque": "cpp", 35 | "forward_list": "cpp", 36 | "list": "cpp", 37 | "map": "cpp", 38 | "set": "cpp", 39 | "string": "cpp", 40 | "unordered_map": "cpp", 41 | "unordered_set": "cpp", 42 | "vector": "cpp", 43 | "exception": "cpp", 44 | "algorithm": "cpp", 45 | "functional": "cpp", 46 | "iterator": "cpp", 47 | "memory": "cpp", 48 | "memory_resource": "cpp", 49 | "numeric": "cpp", 50 | "optional": "cpp", 51 | "ratio": "cpp", 52 | "regex": "cpp", 53 | "string_view": "cpp", 54 | "system_error": "cpp", 55 | "tuple": "cpp", 56 | "type_traits": "cpp", 57 | "utility": "cpp", 58 | "fstream": "cpp", 59 | "future": "cpp", 60 | "initializer_list": "cpp", 61 | "iomanip": "cpp", 62 | "iosfwd": "cpp", 63 | "istream": "cpp", 64 | "limits": "cpp", 65 | "mutex": "cpp", 66 | "new": "cpp", 67 | "numbers": "cpp", 68 | "semaphore": "cpp", 69 | "shared_mutex": "cpp", 70 | "sstream": "cpp", 71 | "stdexcept": "cpp", 72 | "stop_token": "cpp", 73 | "streambuf": "cpp", 74 | "thread": "cpp", 75 | "cfenv": "cpp", 76 | "typeindex": "cpp", 77 | "typeinfo": "cpp", 78 | "valarray": "cpp", 79 | "variant": "cpp", 80 | "filesystem": "cpp", 81 | "__locale": "cpp" 82 | } 83 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MNIST in CUDA 2 | 3 | ![](assets/mnist-mlp.png) 4 | > This is instruction manual for understanding + using the mnist training run in CUDA 5 | 6 | 7 | ## Setup 8 | > DISCLAIMER: ensure you have a GPU with compute capability 5.0 or greater (at least maxwell architecture). 
See compatibility guide: https://docs.nvidia.com/deeplearning/cudnn/latest/reference/support-matrix.html 9 | ```bash 10 | git clone https://github.com/Infatoshi/mnist-cuda 11 | python3 -m venv venv 12 | source venv/bin/activate 13 | pip install -r requirements.txt 14 | ``` 15 | ## Purpose 16 | 17 | We train an MLP on the MNIST dataset. 18 | We first implement the batched training run in PyTorch, then translate it over to CUDA C/C++ using iteratively optimized GPU kernels. I purposely left out batchnorm, residual blocks, lower precision, and other optimizations to keep the code simple and easy to understand. It would also take way longer to implement and explain. 19 | 20 | 21 | ## What we need to watch out for + pay attention to: 22 | 23 | - [row vs col major](https://stackoverflow.com/questions/56043539/cublassgemm-row-major-multiplication) 24 | - [tensor cores](https://docs.nvidia.com/cuda/cublas/#tensor-core-usage) 25 | 26 | ## Accelerate the data transfer via Prefetching 27 | 28 | - [Unified vs Explicit Memory in CUDA](https://github.com/lintenn/cudaAddVectors-explicit-vs-unified-memory) 29 | - [Maximizing Unified Memory Performance](https://developer.nvidia.com/blog/maximizing-unified-memory-performance-cuda/) 30 | - Prefetching is automatically taken care of by unified memory via **streams** (this is why it has lower latency in the GitHub link above) 31 | - [CUDA streams - Lei Mao](https://leimao.github.io/blog/CUDA-Stream/) 32 | - [NVIDIA Docs](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#asynchronous-concurrent-execution) 33 | - Streams allow for overlapping data transfer (prefetching) with computation. 34 | - While one stream is executing a kernel, another stream can be transferring data for the next computation. 35 | - This technique is often called "double buffering" or "multi-buffering" when extended to more buffers (see the sketch below).
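To make the overlap concrete, here is a minimal double-buffering sketch (not taken from this repo's code): two CUDA streams ping-pong between two device buffers, so the host-to-device copy for the next batch overlaps with the kernel still processing the current one. The `process_batch` kernel, batch count, and sizes are placeholders, and error checking is omitted for brevity.

```cpp
#include <cuda_runtime.h>
#include <stdio.h>

// Stand-in for the real per-batch work (forward/backward kernels in our case).
__global__ void process_batch(float *data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;
}

int main() {
    const int batch_elems = 1 << 20;
    const int num_batches = 8;

    // Pinned host memory is required for truly asynchronous copies.
    float *h_data;
    cudaMallocHost((void **)&h_data, (size_t)num_batches * batch_elems * sizeof(float));
    for (size_t i = 0; i < (size_t)num_batches * batch_elems; i++) h_data[i] = 1.0f;

    // Two device buffers + two streams = double buffering.
    float *d_buf[2];
    cudaStream_t stream[2];
    for (int s = 0; s < 2; s++) {
        cudaMalloc(&d_buf[s], batch_elems * sizeof(float));
        cudaStreamCreate(&stream[s]);
    }

    for (int b = 0; b < num_batches; b++) {
        int s = b % 2;  // ping-pong between buffers/streams
        // The copy for batch b can run while the other stream's kernel is still executing.
        // Work within a single stream is ordered, so buffer reuse is safe without extra events.
        cudaMemcpyAsync(d_buf[s], h_data + (size_t)b * batch_elems,
                        batch_elems * sizeof(float), cudaMemcpyHostToDevice, stream[s]);
        process_batch<<<(batch_elems + 255) / 256, 256, 0, stream[s]>>>(d_buf[s], batch_elems);
    }
    cudaDeviceSynchronize();
    printf("processed %d batches\n", num_batches);

    for (int s = 0; s < 2; s++) {
        cudaStreamDestroy(stream[s]);
        cudaFree(d_buf[s]);
    }
    cudaFreeHost(h_data);
    return 0;
}
```

Because operations queued on the same stream execute in order, batch `b+2` only reuses a buffer after the kernel for batch `b` (on that same stream) has finished, which is why this simple version needs no explicit synchronization events.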
36 | 37 | ## Kernel Conversion 38 | > we will change the following functions to kernels: 39 | matmul_a_bt and matmul_at_b 40 | relu_forward and relu_backward 41 | bias_forward and bias_backward 42 | softmax 43 | compute_grad_output 44 | compute_output_gradients 45 | compute_hidden_gradients 46 | update_gradients 47 | -------------------------------------------------------------------------------- /assets/architecture.excalidraw: -------------------------------------------------------------------------------- 1 | { 2 | "type": "excalidraw", 3 | "version": 2, 4 | "source": "https://excalidraw.com", 5 | "elements": [ 6 | { 7 | "type": "rectangle", 8 | "version": 121, 9 | "versionNonce": 1332964315, 10 | "index": "a9", 11 | "isDeleted": false, 12 | "id": "Tb5EZSqLfMsjXk12HZZH2", 13 | "fillStyle": "solid", 14 | "strokeWidth": 2, 15 | "strokeStyle": "solid", 16 | "roughness": 1, 17 | "opacity": 100, 18 | "angle": 0, 19 | "x": 460, 20 | "y": 179, 21 | "strokeColor": "#1e1e1e", 22 | "backgroundColor": "transparent", 23 | "width": 310, 24 | "height": 68, 25 | "seed": 195673173, 26 | "groupIds": [], 27 | "frameId": null, 28 | "roundness": { 29 | "type": 3 30 | }, 31 | "boundElements": [ 32 | { 33 | "type": "text", 34 | "id": "226hBQzcNEGHGlDXnn7fS" 35 | }, 36 | { 37 | "id": "WOHMQMCgLty5Tquv57o3x", 38 | "type": "arrow" 39 | }, 40 | { 41 | "id": "5-IqAH6FDB3OTGQo3zcbC", 42 | "type": "arrow" 43 | } 44 | ], 45 | "updated": 1720154708284, 46 | "link": null, 47 | "locked": false 48 | }, 49 | { 50 | "type": "text", 51 | "version": 60, 52 | "versionNonce": 647234719, 53 | "index": "a9V", 54 | "isDeleted": false, 55 | "id": "226hBQzcNEGHGlDXnn7fS", 56 | "fillStyle": "solid", 57 | "strokeWidth": 2, 58 | "strokeStyle": "solid", 59 | "roughness": 1, 60 | "opacity": 100, 61 | "angle": 0, 62 | "x": 520.3000869750977, 63 | "y": 200.5, 64 | "strokeColor": "#1e1e1e", 65 | "backgroundColor": "transparent", 66 | "width": 189.3998260498047, 67 | "height": 25, 68 | "seed": 604860667, 69 | "groupIds": [], 70 | "frameId": null, 71 | "roundness": null, 72 | "boundElements": [], 73 | "updated": 1720239838246, 74 | "link": null, 75 | "locked": false, 76 | "fontSize": 20, 77 | "fontFamily": 1, 78 | "text": "X -> (B, 1, 28, 28)", 79 | "textAlign": "center", 80 | "verticalAlign": "middle", 81 | "containerId": "Tb5EZSqLfMsjXk12HZZH2", 82 | "originalText": "X -> (B, 1, 28, 28)", 83 | "autoResize": true, 84 | "lineHeight": 1.25 85 | }, 86 | { 87 | "type": "arrow", 88 | "version": 218, 89 | "versionNonce": 811579281, 90 | "index": "aC", 91 | "isDeleted": false, 92 | "id": "WOHMQMCgLty5Tquv57o3x", 93 | "fillStyle": "solid", 94 | "strokeWidth": 2, 95 | "strokeStyle": "solid", 96 | "roughness": 1, 97 | "opacity": 100, 98 | "angle": 0, 99 | "x": 606.8094064949607, 100 | "y": 248, 101 | "strokeColor": "#1e1e1e", 102 | "backgroundColor": "transparent", 103 | "width": 0.19059350503925998, 104 | "height": 49, 105 | "seed": 2010511989, 106 | "groupIds": [], 107 | "frameId": null, 108 | "roundness": { 109 | "type": 2 110 | }, 111 | "boundElements": [], 112 | "updated": 1720239960994, 113 | "link": null, 114 | "locked": false, 115 | "startBinding": { 116 | "elementId": "Tb5EZSqLfMsjXk12HZZH2", 117 | "focus": 0.053675053385165365, 118 | "gap": 1 119 | }, 120 | "endBinding": null, 121 | "lastCommittedPoint": null, 122 | "startArrowhead": null, 123 | "endArrowhead": "arrow", 124 | "points": [ 125 | [ 126 | 0, 127 | 0 128 | ], 129 | [ 130 | 0.19059350503925998, 131 | 49 132 | ] 133 | ] 134 | }, 135 | { 136 | "type": "rectangle", 137 | "version": 
355, 138 | "versionNonce": 676668063, 139 | "index": "aa", 140 | "isDeleted": false, 141 | "id": "a4bTGDbNZih7uekkgQdnF", 142 | "fillStyle": "solid", 143 | "strokeWidth": 2, 144 | "strokeStyle": "solid", 145 | "roughness": 1, 146 | "opacity": 100, 147 | "angle": 0, 148 | "x": 829, 149 | "y": 175, 150 | "strokeColor": "#1e1e1e", 151 | "backgroundColor": "transparent", 152 | "width": 276, 153 | "height": 70, 154 | "seed": 413798971, 155 | "groupIds": [], 156 | "frameId": null, 157 | "roundness": { 158 | "type": 3 159 | }, 160 | "boundElements": [ 161 | { 162 | "type": "text", 163 | "id": "NcrKQWyJrYeqex0ldZ7aJ" 164 | }, 165 | { 166 | "id": "5-IqAH6FDB3OTGQo3zcbC", 167 | "type": "arrow" 168 | }, 169 | { 170 | "id": "Gl5Sw9zFzPIVic4meB60T", 171 | "type": "arrow" 172 | } 173 | ], 174 | "updated": 1720240138338, 175 | "link": null, 176 | "locked": false 177 | }, 178 | { 179 | "type": "text", 180 | "version": 327, 181 | "versionNonce": 680082719, 182 | "index": "ab", 183 | "isDeleted": false, 184 | "id": "NcrKQWyJrYeqex0ldZ7aJ", 185 | "fillStyle": "solid", 186 | "strokeWidth": 2, 187 | "strokeStyle": "solid", 188 | "roughness": 1, 189 | "opacity": 100, 190 | "angle": 0, 191 | "x": 839.3201522827148, 192 | "y": 197.5, 193 | "strokeColor": "#1e1e1e", 194 | "backgroundColor": "transparent", 195 | "width": 255.3596954345703, 196 | "height": 25, 197 | "seed": 1200618709, 198 | "groupIds": [], 199 | "frameId": null, 200 | "roundness": null, 201 | "boundElements": [], 202 | "updated": 1720239838246, 203 | "link": null, 204 | "locked": false, 205 | "fontSize": 20, 206 | "fontFamily": 1, 207 | "text": "Dataloader gets a batch", 208 | "textAlign": "center", 209 | "verticalAlign": "middle", 210 | "containerId": "a4bTGDbNZih7uekkgQdnF", 211 | "originalText": "Dataloader gets a batch", 212 | "autoResize": true, 213 | "lineHeight": 1.25 214 | }, 215 | { 216 | "type": "arrow", 217 | "version": 617, 218 | "versionNonce": 588130933, 219 | "index": "ac", 220 | "isDeleted": false, 221 | "id": "5-IqAH6FDB3OTGQo3zcbC", 222 | "fillStyle": "solid", 223 | "strokeWidth": 2, 224 | "strokeStyle": "solid", 225 | "roughness": 1, 226 | "opacity": 100, 227 | "angle": 0, 228 | "x": 827, 229 | "y": 208.77830746985512, 230 | "strokeColor": "#1e1e1e", 231 | "backgroundColor": "transparent", 232 | "width": 52, 233 | "height": 1.864830883005169, 234 | "seed": 1412391483, 235 | "groupIds": [], 236 | "frameId": null, 237 | "roundness": { 238 | "type": 2 239 | }, 240 | "boundElements": [], 241 | "updated": 1720154761064, 242 | "link": null, 243 | "locked": false, 244 | "startBinding": { 245 | "elementId": "a4bTGDbNZih7uekkgQdnF", 246 | "focus": 0.15297822093938598, 247 | "gap": 2 248 | }, 249 | "endBinding": { 250 | "elementId": "Tb5EZSqLfMsjXk12HZZH2", 251 | "focus": 0.08547008547008547, 252 | "gap": 5 253 | }, 254 | "lastCommittedPoint": null, 255 | "startArrowhead": null, 256 | "endArrowhead": "arrow", 257 | "points": [ 258 | [ 259 | 0, 260 | 0 261 | ], 262 | [ 263 | -52, 264 | 1.864830883005169 265 | ] 266 | ] 267 | }, 268 | { 269 | "type": "arrow", 270 | "version": 198, 271 | "versionNonce": 1232196113, 272 | "index": "ad", 273 | "isDeleted": false, 274 | "id": "02JUczFgbNAnZ_XK7SONP", 275 | "fillStyle": "solid", 276 | "strokeWidth": 2, 277 | "strokeStyle": "solid", 278 | "roughness": 1, 279 | "opacity": 100, 280 | "angle": 0, 281 | "x": 775, 282 | "y": 711.1284390243902, 283 | "strokeColor": "#1e1e1e", 284 | "backgroundColor": "transparent", 285 | "width": 66, 286 | "height": 38.87156097560978, 287 | "seed": 1399797883, 288 | 
"groupIds": [], 289 | "frameId": null, 290 | "roundness": { 291 | "type": 2 292 | }, 293 | "boundElements": [], 294 | "updated": 1720240155126, 295 | "link": null, 296 | "locked": false, 297 | "startBinding": { 298 | "elementId": "FDItF61llBXUrbKwoTXXI", 299 | "focus": -0.42416759838920154, 300 | "gap": 1 301 | }, 302 | "endBinding": { 303 | "elementId": "B_wNqm25NGAtRVmirgvYP", 304 | "focus": -0.5189441487483543, 305 | "gap": 1 306 | }, 307 | "lastCommittedPoint": null, 308 | "startArrowhead": null, 309 | "endArrowhead": "arrow", 310 | "points": [ 311 | [ 312 | 0, 313 | 0 314 | ], 315 | [ 316 | 66, 317 | 38.87156097560978 318 | ] 319 | ] 320 | }, 321 | { 322 | "type": "rectangle", 323 | "version": 166, 324 | "versionNonce": 1082925713, 325 | "index": "ae", 326 | "isDeleted": false, 327 | "id": "B_wNqm25NGAtRVmirgvYP", 328 | "fillStyle": "solid", 329 | "strokeWidth": 2, 330 | "strokeStyle": "solid", 331 | "roughness": 1, 332 | "opacity": 100, 333 | "angle": 0, 334 | "x": 840, 335 | "y": 736, 336 | "strokeColor": "#1e1e1e", 337 | "backgroundColor": "transparent", 338 | "width": 227, 339 | "height": 60, 340 | "seed": 1422467323, 341 | "groupIds": [], 342 | "frameId": null, 343 | "roundness": { 344 | "type": 3 345 | }, 346 | "boundElements": [ 347 | { 348 | "type": "text", 349 | "id": "cYREyB1eDLvVCxnSeVAHd" 350 | }, 351 | { 352 | "id": "02JUczFgbNAnZ_XK7SONP", 353 | "type": "arrow" 354 | } 355 | ], 356 | "updated": 1720240153683, 357 | "link": null, 358 | "locked": false 359 | }, 360 | { 361 | "type": "text", 362 | "version": 132, 363 | "versionNonce": 1197339793, 364 | "index": "af", 365 | "isDeleted": false, 366 | "id": "cYREyB1eDLvVCxnSeVAHd", 367 | "fillStyle": "solid", 368 | "strokeWidth": 2, 369 | "strokeStyle": "solid", 370 | "roughness": 1, 371 | "opacity": 100, 372 | "angle": 0, 373 | "x": 846.3601226806641, 374 | "y": 741, 375 | "strokeColor": "#1e1e1e", 376 | "backgroundColor": "transparent", 377 | "width": 214.27975463867188, 378 | "height": 50, 379 | "seed": 2095659675, 380 | "groupIds": [], 381 | "frameId": null, 382 | "roundness": null, 383 | "boundElements": [], 384 | "updated": 1720240148754, 385 | "link": null, 386 | "locked": false, 387 | "fontSize": 20, 388 | "fontFamily": 1, 389 | "text": "CrossEntropyLoss w/ \nSoftmax", 390 | "textAlign": "center", 391 | "verticalAlign": "middle", 392 | "containerId": "B_wNqm25NGAtRVmirgvYP", 393 | "originalText": "CrossEntropyLoss w/ \nSoftmax", 394 | "autoResize": true, 395 | "lineHeight": 1.25 396 | }, 397 | { 398 | "type": "arrow", 399 | "version": 442, 400 | "versionNonce": 580566353, 401 | "index": "ag", 402 | "isDeleted": false, 403 | "id": "Gl5Sw9zFzPIVic4meB60T", 404 | "fillStyle": "solid", 405 | "strokeWidth": 2, 406 | "strokeStyle": "solid", 407 | "roughness": 1, 408 | "opacity": 100, 409 | "angle": 0, 410 | "x": 882, 411 | "y": 262, 412 | "strokeColor": "#1e1e1e", 413 | "backgroundColor": "transparent", 414 | "width": 2, 415 | "height": 369, 416 | "seed": 1570799611, 417 | "groupIds": [], 418 | "frameId": null, 419 | "roundness": { 420 | "type": 2 421 | }, 422 | "boundElements": [], 423 | "updated": 1720240164336, 424 | "link": null, 425 | "locked": false, 426 | "startBinding": { 427 | "elementId": "a4bTGDbNZih7uekkgQdnF", 428 | "focus": 0.6130569501098212, 429 | "gap": 17 430 | }, 431 | "endBinding": { 432 | "elementId": "bMXxc2iYQD9OKukWZgVhi", 433 | "focus": 0.00921406396789014, 434 | "gap": 12.002668957710789 435 | }, 436 | "lastCommittedPoint": null, 437 | "startArrowhead": null, 438 | "endArrowhead": "arrow", 439 | 
"points": [ 440 | [ 441 | 0, 442 | 0 443 | ], 444 | [ 445 | -2, 446 | 369 447 | ] 448 | ] 449 | }, 450 | { 451 | "type": "arrow", 452 | "version": 265, 453 | "versionNonce": 633629279, 454 | "index": "ai", 455 | "isDeleted": false, 456 | "id": "dyST1-PXoEsDXgjTBDOom", 457 | "fillStyle": "solid", 458 | "strokeWidth": 2, 459 | "strokeStyle": "solid", 460 | "roughness": 1, 461 | "opacity": 100, 462 | "angle": 0, 463 | "x": 1005, 464 | "y": 717, 465 | "strokeColor": "#1e1e1e", 466 | "backgroundColor": "transparent", 467 | "width": 0, 468 | "height": 393, 469 | "seed": 1136485621, 470 | "groupIds": [], 471 | "frameId": null, 472 | "roundness": { 473 | "type": 2 474 | }, 475 | "boundElements": [], 476 | "updated": 1720240157303, 477 | "link": null, 478 | "locked": false, 479 | "startBinding": null, 480 | "endBinding": { 481 | "elementId": "W-FMv5BfpNwKuBZtFDJPw", 482 | "focus": -0.012987012987012988, 483 | "gap": 8.002431575230332 484 | }, 485 | "lastCommittedPoint": null, 486 | "startArrowhead": null, 487 | "endArrowhead": "arrow", 488 | "points": [ 489 | [ 490 | 0, 491 | 0 492 | ], 493 | [ 494 | 0, 495 | -393 496 | ] 497 | ] 498 | }, 499 | { 500 | "type": "ellipse", 501 | "version": 162, 502 | "versionNonce": 1352715185, 503 | "index": "aj", 504 | "isDeleted": false, 505 | "id": "bMXxc2iYQD9OKukWZgVhi", 506 | "fillStyle": "solid", 507 | "strokeWidth": 2, 508 | "strokeStyle": "solid", 509 | "roughness": 1, 510 | "opacity": 100, 511 | "angle": 0, 512 | "x": 799, 513 | "y": 643, 514 | "strokeColor": "#1e1e1e", 515 | "backgroundColor": "transparent", 516 | "width": 160, 517 | "height": 72.99999999999997, 518 | "seed": 1540359669, 519 | "groupIds": [], 520 | "frameId": null, 521 | "roundness": { 522 | "type": 2 523 | }, 524 | "boundElements": [ 525 | { 526 | "type": "text", 527 | "id": "WCUFJus6hMfoXEtlcd0Zs" 528 | }, 529 | { 530 | "id": "Gl5Sw9zFzPIVic4meB60T", 531 | "type": "arrow" 532 | } 533 | ], 534 | "updated": 1720240164045, 535 | "link": null, 536 | "locked": false 537 | }, 538 | { 539 | "type": "text", 540 | "version": 110, 541 | "versionNonce": 107802847, 542 | "index": "ak", 543 | "isDeleted": false, 544 | "id": "WCUFJus6hMfoXEtlcd0Zs", 545 | "fillStyle": "solid", 546 | "strokeWidth": 2, 547 | "strokeStyle": "solid", 548 | "roughness": 1, 549 | "opacity": 100, 550 | "angle": 0, 551 | "x": 857.9314727638653, 552 | "y": 667.190602486691, 553 | "strokeColor": "#1e1e1e", 554 | "backgroundColor": "transparent", 555 | "width": 41.999969482421875, 556 | "height": 25, 557 | "seed": 2053893467, 558 | "groupIds": [], 559 | "frameId": null, 560 | "roundness": null, 561 | "boundElements": [], 562 | "updated": 1720240161330, 563 | "link": null, 564 | "locked": false, 565 | "fontSize": 20, 566 | "fontFamily": 1, 567 | "text": "FWD", 568 | "textAlign": "center", 569 | "verticalAlign": "middle", 570 | "containerId": "bMXxc2iYQD9OKukWZgVhi", 571 | "originalText": "FWD", 572 | "autoResize": true, 573 | "lineHeight": 1.25 574 | }, 575 | { 576 | "type": "ellipse", 577 | "version": 55, 578 | "versionNonce": 1638133215, 579 | "index": "al", 580 | "isDeleted": false, 581 | "id": "W-FMv5BfpNwKuBZtFDJPw", 582 | "fillStyle": "solid", 583 | "strokeWidth": 2, 584 | "strokeStyle": "solid", 585 | "roughness": 1, 586 | "opacity": 100, 587 | "angle": 0, 588 | "x": 927, 589 | "y": 256, 590 | "strokeColor": "#1e1e1e", 591 | "backgroundColor": "transparent", 592 | "width": 154, 593 | "height": 60, 594 | "seed": 2111931605, 595 | "groupIds": [], 596 | "frameId": null, 597 | "roundness": { 598 | "type": 2 599 | }, 600 | 
"boundElements": [ 601 | { 602 | "type": "text", 603 | "id": "djk1Lt9IuiKWjg7fDTQ7T" 604 | }, 605 | { 606 | "id": "dyST1-PXoEsDXgjTBDOom", 607 | "type": "arrow" 608 | } 609 | ], 610 | "updated": 1720240145253, 611 | "link": null, 612 | "locked": false 613 | }, 614 | { 615 | "type": "text", 616 | "version": 15, 617 | "versionNonce": 359446065, 618 | "index": "am", 619 | "isDeleted": false, 620 | "id": "djk1Lt9IuiKWjg7fDTQ7T", 621 | "fillStyle": "solid", 622 | "strokeWidth": 2, 623 | "strokeStyle": "solid", 624 | "roughness": 1, 625 | "opacity": 100, 626 | "angle": 0, 627 | "x": 975.3927970747101, 628 | "y": 273.2867965644036, 629 | "strokeColor": "#1e1e1e", 630 | "backgroundColor": "transparent", 631 | "width": 57.31996154785156, 632 | "height": 25, 633 | "seed": 1491854523, 634 | "groupIds": [], 635 | "frameId": null, 636 | "roundness": null, 637 | "boundElements": [], 638 | "updated": 1720239838246, 639 | "link": null, 640 | "locked": false, 641 | "fontSize": 20, 642 | "fontFamily": 1, 643 | "text": "BKWD", 644 | "textAlign": "center", 645 | "verticalAlign": "middle", 646 | "containerId": "W-FMv5BfpNwKuBZtFDJPw", 647 | "originalText": "BKWD", 648 | "autoResize": true, 649 | "lineHeight": 1.25 650 | }, 651 | { 652 | "id": "252rC5Ah4gyn8FABm_5NN", 653 | "type": "rectangle", 654 | "x": 462, 655 | "y": 296, 656 | "width": 311, 657 | "height": 71, 658 | "angle": 0, 659 | "strokeColor": "#1e1e1e", 660 | "backgroundColor": "transparent", 661 | "fillStyle": "solid", 662 | "strokeWidth": 2, 663 | "strokeStyle": "solid", 664 | "roughness": 1, 665 | "opacity": 100, 666 | "groupIds": [], 667 | "frameId": null, 668 | "index": "an", 669 | "roundness": { 670 | "type": 3 671 | }, 672 | "seed": 1657504177, 673 | "version": 132, 674 | "versionNonce": 1541264031, 675 | "isDeleted": false, 676 | "boundElements": [ 677 | { 678 | "type": "text", 679 | "id": "giSDh_vvh8OoE56tLSEab" 680 | }, 681 | { 682 | "id": "R0fTzItfD5zQl_LTuH2iZ", 683 | "type": "arrow" 684 | } 685 | ], 686 | "updated": 1720240007882, 687 | "link": null, 688 | "locked": false 689 | }, 690 | { 691 | "id": "giSDh_vvh8OoE56tLSEab", 692 | "type": "text", 693 | "x": 520.2600936889648, 694 | "y": 319, 695 | "width": 194.4798126220703, 696 | "height": 25, 697 | "angle": 0, 698 | "strokeColor": "#1e1e1e", 699 | "backgroundColor": "transparent", 700 | "fillStyle": "solid", 701 | "strokeWidth": 2, 702 | "strokeStyle": "solid", 703 | "roughness": 1, 704 | "opacity": 100, 705 | "groupIds": [], 706 | "frameId": null, 707 | "index": "anV", 708 | "roundness": null, 709 | "seed": 1033794257, 710 | "version": 21, 711 | "versionNonce": 1913163313, 712 | "isDeleted": false, 713 | "boundElements": null, 714 | "updated": 1720240001071, 715 | "link": null, 716 | "locked": false, 717 | "text": "Flatten -> (B, 784)", 718 | "fontSize": 20, 719 | "fontFamily": 1, 720 | "textAlign": "center", 721 | "verticalAlign": "middle", 722 | "containerId": "252rC5Ah4gyn8FABm_5NN", 723 | "originalText": "Flatten -> (B, 784)", 724 | "autoResize": true, 725 | "lineHeight": 1.25 726 | }, 727 | { 728 | "id": "R0fTzItfD5zQl_LTuH2iZ", 729 | "type": "arrow", 730 | "x": 615, 731 | "y": 367, 732 | "width": 1, 733 | "height": 44, 734 | "angle": 0, 735 | "strokeColor": "#1e1e1e", 736 | "backgroundColor": "transparent", 737 | "fillStyle": "solid", 738 | "strokeWidth": 2, 739 | "strokeStyle": "solid", 740 | "roughness": 1, 741 | "opacity": 100, 742 | "groupIds": [], 743 | "frameId": null, 744 | "index": "ap", 745 | "roundness": { 746 | "type": 2 747 | }, 748 | "seed": 2094851455, 749 | 
"version": 42, 750 | "versionNonce": 1377008255, 751 | "isDeleted": false, 752 | "boundElements": null, 753 | "updated": 1720240007882, 754 | "link": null, 755 | "locked": false, 756 | "points": [ 757 | [ 758 | 0, 759 | 0 760 | ], 761 | [ 762 | -1, 763 | 44 764 | ] 765 | ], 766 | "lastCommittedPoint": null, 767 | "startBinding": { 768 | "elementId": "252rC5Ah4gyn8FABm_5NN", 769 | "focus": 0.010832424572882589, 770 | "gap": 1 771 | }, 772 | "endBinding": null, 773 | "startArrowhead": null, 774 | "endArrowhead": "arrow" 775 | }, 776 | { 777 | "id": "8TuQ_WHfiSFXafyH497Yl", 778 | "type": "rectangle", 779 | "x": 467, 780 | "y": 416, 781 | "width": 304, 782 | "height": 74, 783 | "angle": 0, 784 | "strokeColor": "#1e1e1e", 785 | "backgroundColor": "transparent", 786 | "fillStyle": "solid", 787 | "strokeWidth": 2, 788 | "strokeStyle": "solid", 789 | "roughness": 1, 790 | "opacity": 100, 791 | "groupIds": [], 792 | "frameId": null, 793 | "index": "aq", 794 | "roundness": { 795 | "type": 3 796 | }, 797 | "seed": 1189619391, 798 | "version": 99, 799 | "versionNonce": 2031612881, 800 | "isDeleted": false, 801 | "boundElements": [ 802 | { 803 | "type": "text", 804 | "id": "vfAXY-vLTQ7k-h5kCS-hO" 805 | }, 806 | { 807 | "id": "N5t1cG6wymc0G3VV4WAdy", 808 | "type": "arrow" 809 | } 810 | ], 811 | "updated": 1720240080586, 812 | "link": null, 813 | "locked": false 814 | }, 815 | { 816 | "id": "vfAXY-vLTQ7k-h5kCS-hO", 817 | "type": "text", 818 | "x": 479.7501220703125, 819 | "y": 428, 820 | "width": 278.499755859375, 821 | "height": 50, 822 | "angle": 0, 823 | "strokeColor": "#1e1e1e", 824 | "backgroundColor": "transparent", 825 | "fillStyle": "solid", 826 | "strokeWidth": 2, 827 | "strokeStyle": "solid", 828 | "roughness": 1, 829 | "opacity": 100, 830 | "groupIds": [], 831 | "frameId": null, 832 | "index": "ar", 833 | "roundness": null, 834 | "seed": 352394897, 835 | "version": 49, 836 | "versionNonce": 1612744721, 837 | "isDeleted": false, 838 | "boundElements": null, 839 | "updated": 1720240072802, 840 | "link": null, 841 | "locked": false, 842 | "text": "Linear -> (B, 784) @ (784, \n256) = (B, 256)", 843 | "fontSize": 20, 844 | "fontFamily": 1, 845 | "textAlign": "center", 846 | "verticalAlign": "middle", 847 | "containerId": "8TuQ_WHfiSFXafyH497Yl", 848 | "originalText": "Linear -> (B, 784) @ (784, 256) = (B, 256)", 849 | "autoResize": true, 850 | "lineHeight": 1.25 851 | }, 852 | { 853 | "id": "N5t1cG6wymc0G3VV4WAdy", 854 | "type": "arrow", 855 | "x": 614, 856 | "y": 490, 857 | "width": 2, 858 | "height": 39, 859 | "angle": 0, 860 | "strokeColor": "#1e1e1e", 861 | "backgroundColor": "transparent", 862 | "fillStyle": "solid", 863 | "strokeWidth": 2, 864 | "strokeStyle": "solid", 865 | "roughness": 1, 866 | "opacity": 100, 867 | "groupIds": [], 868 | "frameId": null, 869 | "index": "as", 870 | "roundness": { 871 | "type": 2 872 | }, 873 | "seed": 972584913, 874 | "version": 33, 875 | "versionNonce": 3561969, 876 | "isDeleted": false, 877 | "boundElements": null, 878 | "updated": 1720240080586, 879 | "link": null, 880 | "locked": false, 881 | "points": [ 882 | [ 883 | 0, 884 | 0 885 | ], 886 | [ 887 | 2, 888 | 39 889 | ] 890 | ], 891 | "lastCommittedPoint": null, 892 | "startBinding": { 893 | "elementId": "8TuQ_WHfiSFXafyH497Yl", 894 | "focus": 0.04481839386871043, 895 | "gap": 1 896 | }, 897 | "endBinding": null, 898 | "startArrowhead": null, 899 | "endArrowhead": "arrow" 900 | }, 901 | { 902 | "id": "DyktbCEmygt1qtlFFoMfX", 903 | "type": "rectangle", 904 | "x": 464, 905 | "y": 527, 906 | "width": 315, 
907 | "height": 70, 908 | "angle": 0, 909 | "strokeColor": "#1e1e1e", 910 | "backgroundColor": "transparent", 911 | "fillStyle": "solid", 912 | "strokeWidth": 2, 913 | "strokeStyle": "solid", 914 | "roughness": 1, 915 | "opacity": 100, 916 | "groupIds": [], 917 | "frameId": null, 918 | "index": "at", 919 | "roundness": { 920 | "type": 3 921 | }, 922 | "seed": 2053853617, 923 | "version": 129, 924 | "versionNonce": 1848488031, 925 | "isDeleted": false, 926 | "boundElements": [ 927 | { 928 | "type": "text", 929 | "id": "6M7P3dpAANA56DOMAm4tk" 930 | }, 931 | { 932 | "id": "N9bJrjrXf9FaYkZTgVyo4", 933 | "type": "arrow" 934 | } 935 | ], 936 | "updated": 1720240105382, 937 | "link": null, 938 | "locked": false 939 | }, 940 | { 941 | "id": "6M7P3dpAANA56DOMAm4tk", 942 | "type": "text", 943 | "x": 535.090087890625, 944 | "y": 549.5, 945 | "width": 172.81982421875, 946 | "height": 25, 947 | "angle": 0, 948 | "strokeColor": "#1e1e1e", 949 | "backgroundColor": "transparent", 950 | "fillStyle": "solid", 951 | "strokeWidth": 2, 952 | "strokeStyle": "solid", 953 | "roughness": 1, 954 | "opacity": 100, 955 | "groupIds": [], 956 | "frameId": null, 957 | "index": "atV", 958 | "roundness": null, 959 | "seed": 1939366193, 960 | "version": 20, 961 | "versionNonce": 1378467505, 962 | "isDeleted": false, 963 | "boundElements": null, 964 | "updated": 1720240096881, 965 | "link": null, 966 | "locked": false, 967 | "text": "ReLU -> (B, 256)", 968 | "fontSize": 20, 969 | "fontFamily": 1, 970 | "textAlign": "center", 971 | "verticalAlign": "middle", 972 | "containerId": "DyktbCEmygt1qtlFFoMfX", 973 | "originalText": "ReLU -> (B, 256)", 974 | "autoResize": true, 975 | "lineHeight": 1.25 976 | }, 977 | { 978 | "id": "N9bJrjrXf9FaYkZTgVyo4", 979 | "type": "arrow", 980 | "x": 615, 981 | "y": 596, 982 | "width": 0, 983 | "height": 39, 984 | "angle": 0, 985 | "strokeColor": "#1e1e1e", 986 | "backgroundColor": "transparent", 987 | "fillStyle": "solid", 988 | "strokeWidth": 2, 989 | "strokeStyle": "solid", 990 | "roughness": 1, 991 | "opacity": 100, 992 | "groupIds": [], 993 | "frameId": null, 994 | "index": "av", 995 | "roundness": { 996 | "type": 2 997 | }, 998 | "seed": 1360399615, 999 | "version": 60, 1000 | "versionNonce": 1249644607, 1001 | "isDeleted": false, 1002 | "boundElements": null, 1003 | "updated": 1720240105382, 1004 | "link": null, 1005 | "locked": false, 1006 | "points": [ 1007 | [ 1008 | 0, 1009 | 0 1010 | ], 1011 | [ 1012 | 0, 1013 | 39 1014 | ] 1015 | ], 1016 | "lastCommittedPoint": null, 1017 | "startBinding": { 1018 | "elementId": "DyktbCEmygt1qtlFFoMfX", 1019 | "focus": 0.04126984126984127, 1020 | "gap": 1 1021 | }, 1022 | "endBinding": null, 1023 | "startArrowhead": null, 1024 | "endArrowhead": "arrow" 1025 | }, 1026 | { 1027 | "id": "FDItF61llBXUrbKwoTXXI", 1028 | "type": "rectangle", 1029 | "x": 462, 1030 | "y": 639, 1031 | "width": 317, 1032 | "height": 72, 1033 | "angle": 0, 1034 | "strokeColor": "#1e1e1e", 1035 | "backgroundColor": "transparent", 1036 | "fillStyle": "solid", 1037 | "strokeWidth": 2, 1038 | "strokeStyle": "solid", 1039 | "roughness": 1, 1040 | "opacity": 100, 1041 | "groupIds": [], 1042 | "frameId": null, 1043 | "index": "aw", 1044 | "roundness": { 1045 | "type": 3 1046 | }, 1047 | "seed": 1190546559, 1048 | "version": 91, 1049 | "versionNonce": 186610257, 1050 | "isDeleted": false, 1051 | "boundElements": [ 1052 | { 1053 | "type": "text", 1054 | "id": "_7Mk3eUecqZw4aJygXBMz" 1055 | }, 1056 | { 1057 | "id": "02JUczFgbNAnZ_XK7SONP", 1058 | "type": "arrow" 1059 | } 1060 | ], 1061 
| "updated": 1720240153683, 1062 | "link": null, 1063 | "locked": false 1064 | }, 1065 | { 1066 | "id": "_7Mk3eUecqZw4aJygXBMz", 1067 | "type": "text", 1068 | "x": 467.7401428222656, 1069 | "y": 650, 1070 | "width": 305.51971435546875, 1071 | "height": 50, 1072 | "angle": 0, 1073 | "strokeColor": "#1e1e1e", 1074 | "backgroundColor": "transparent", 1075 | "fillStyle": "solid", 1076 | "strokeWidth": 2, 1077 | "strokeStyle": "solid", 1078 | "roughness": 1, 1079 | "opacity": 100, 1080 | "groupIds": [], 1081 | "frameId": null, 1082 | "index": "ax", 1083 | "roundness": null, 1084 | "seed": 446522335, 1085 | "version": 43, 1086 | "versionNonce": 1837740351, 1087 | "isDeleted": false, 1088 | "boundElements": null, 1089 | "updated": 1720240129464, 1090 | "link": null, 1091 | "locked": false, 1092 | "text": "Linear -> (B, 256) @ (256, 10)\n= (B, 10)", 1093 | "fontSize": 20, 1094 | "fontFamily": 1, 1095 | "textAlign": "center", 1096 | "verticalAlign": "middle", 1097 | "containerId": "FDItF61llBXUrbKwoTXXI", 1098 | "originalText": "Linear -> (B, 256) @ (256, 10)\n= (B, 10)", 1099 | "autoResize": true, 1100 | "lineHeight": 1.25 1101 | }, 1102 | { 1103 | "id": "Wn2D_MDQlGufT2Nv2ZPaD", 1104 | "type": "text", 1105 | "x": 902, 1106 | "y": 448, 1107 | "width": 85.05990600585938, 1108 | "height": 50, 1109 | "angle": 0, 1110 | "strokeColor": "#1e1e1e", 1111 | "backgroundColor": "transparent", 1112 | "fillStyle": "solid", 1113 | "strokeWidth": 2, 1114 | "strokeStyle": "solid", 1115 | "roughness": 1, 1116 | "opacity": 100, 1117 | "groupIds": [], 1118 | "frameId": null, 1119 | "index": "ay", 1120 | "roundness": null, 1121 | "seed": 719543391, 1122 | "version": 123, 1123 | "versionNonce": 1609278399, 1124 | "isDeleted": false, 1125 | "boundElements": null, 1126 | "updated": 1720240192707, 1127 | "link": null, 1128 | "locked": false, 1129 | "text": "Gradient\nDescent", 1130 | "fontSize": 20, 1131 | "fontFamily": 1, 1132 | "textAlign": "left", 1133 | "verticalAlign": "top", 1134 | "containerId": null, 1135 | "originalText": "Gradient\nDescent", 1136 | "autoResize": true, 1137 | "lineHeight": 1.25 1138 | } 1139 | ], 1140 | "appState": { 1141 | "gridSize": null, 1142 | "viewBackgroundColor": "#ffffff" 1143 | }, 1144 | "files": {} 1145 | } -------------------------------------------------------------------------------- /assets/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Infatoshi/mnist-cuda/d0673daac0eb555c7d88717bb75becb5b6bfdcbb/assets/architecture.png -------------------------------------------------------------------------------- /assets/mnist-mlp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Infatoshi/mnist-cuda/d0673daac0eb555c7d88717bb75becb5b6bfdcbb/assets/mnist-mlp.png -------------------------------------------------------------------------------- /cuda/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Infatoshi/mnist-cuda/d0673daac0eb555c7d88717bb75becb5b6bfdcbb/cuda/.DS_Store -------------------------------------------------------------------------------- /cuda/naive-gpu/1layer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define INPUT_SIZE 784 9 | #define HIDDEN_SIZE 4096 10 | #define OUTPUT_SIZE 10 11 | #define TRAIN_SIZE 10000 12 | #define 
TEST_SIZE 1000 13 | #define BATCH_SIZE 32 14 | #define EPOCHS 20 15 | #define LEARNING_RATE 0.05 16 | 17 | typedef struct { 18 | float *weights1; 19 | float *weights2; 20 | float *bias1; 21 | float *bias2; 22 | float *grad_weights1; 23 | float *grad_weights2; 24 | float *grad_bias1; 25 | float *grad_bias2; 26 | } NeuralNetwork; 27 | 28 | // Modify the CUDA_CHECK macro to print more information 29 | #define CUDA_CHECK(call) \ 30 | do { \ 31 | cudaError_t error = call; \ 32 | if (error != cudaSuccess) { \ 33 | fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \ 34 | cudaGetErrorString(error)); \ 35 | cudaDeviceReset(); \ 36 | exit(EXIT_FAILURE); \ 37 | } \ 38 | } while(0) 39 | 40 | // load batched img data 41 | void load_data(const char *filename, float *data, int size) { 42 | FILE *file = fopen(filename, "rb"); 43 | if (file == NULL) { 44 | fprintf(stderr, "Error opening file: %s\n", filename); 45 | exit(1); 46 | } 47 | size_t read_size = fread(data, sizeof(float), size, file); 48 | if (read_size != size) { 49 | fprintf(stderr, "Error reading data: expected %d elements, got %zu\n", size, read_size); 50 | exit(1); 51 | } 52 | fclose(file); 53 | } 54 | 55 | // load batch labels 56 | void load_labels(const char *filename, int *labels, int size) { 57 | FILE *file = fopen(filename, "rb"); 58 | if (file == NULL) { 59 | fprintf(stderr, "Error opening file: %s\n", filename); 60 | exit(1); 61 | } 62 | size_t read_size = fread(labels, sizeof(int), size, file); 63 | if (read_size != size) { 64 | fprintf(stderr, "Error reading labels: expected %d elements, got %zu\n", size, read_size); 65 | exit(1); 66 | } 67 | fclose(file); 68 | } 69 | 70 | // kaiming init func for weights 71 | void initialize_weights(float *weights, int size) { 72 | float scale = sqrtf(2.0f / size); 73 | for (int i = 0; i < size; i++) { 74 | weights[i] = ((float)rand() / RAND_MAX) * scale - (scale / 2.0f); 75 | } 76 | } 77 | 78 | // basic init for biases 79 | void initialize_bias(float *bias, int size) { 80 | for (int i = 0; i < size; i++) { 81 | bias[i] = 0.0f; 82 | } 83 | } 84 | 85 | // CUDA kernel for matrix multiplication (A @ B) 86 | __global__ void matmul_a_b_kernel(float *A, float *B, float *C, int m, int n, int k) { 87 | int row = blockIdx.y * blockDim.y + threadIdx.y; 88 | int col = blockIdx.x * blockDim.x + threadIdx.x; 89 | 90 | if (row < m && col < k) { 91 | float sum = 0.0f; 92 | for (int i = 0; i < n; ++i) { 93 | sum += A[row * n + i] * B[i * k + col]; 94 | } 95 | C[row * k + col] = sum; 96 | } 97 | } 98 | 99 | // CUDA kernel for matrix multiplication (A @ B.T) 100 | __global__ void matmul_a_bt_kernel(float *A, float *B, float *C, int m, int n, int k) { 101 | int row = blockIdx.y * blockDim.y + threadIdx.y; 102 | int col = blockIdx.x * blockDim.x + threadIdx.x; 103 | 104 | if (row < m && col < k) { 105 | float sum = 0.0f; 106 | for (int i = 0; i < n; ++i) { 107 | sum += A[row * n + i] * B[col * n + i]; 108 | } 109 | C[row * k + col] = sum; 110 | } 111 | } 112 | 113 | // CUDA kernel for matrix multiplication (A.T @ B) 114 | __global__ void matmul_at_b_kernel(float *A, float *B, float *C, int m, int n, int k) { 115 | int row = blockIdx.y * blockDim.y + threadIdx.y; 116 | int col = blockIdx.x * blockDim.x + threadIdx.x; 117 | 118 | if (row < n && col < k) { 119 | float sum = 0.0f; 120 | for (int i = 0; i < m; ++i) { 121 | sum += A[i * n + row] * B[i * k + col]; 122 | } 123 | C[row * k + col] = sum; 124 | } 125 | } 126 | 127 | // CUDA kernel for ReLU activation 128 | __global__ void relu_kernel(float *x, int 
size) { 129 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 130 | if (idx < size) { 131 | x[idx] = fmaxf(0.0f, x[idx]); 132 | } 133 | } 134 | 135 | // CUDA kernel for bias addition 136 | __global__ void bias_add_kernel(float *x, float *bias, int batch_size, int size) { 137 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 138 | int b = idx / size; 139 | int i = idx % size; 140 | 141 | if (b < batch_size && i < size) { 142 | x[idx] += bias[i]; 143 | } 144 | } 145 | 146 | // CUDA kernel for softmax 147 | __global__ void softmax_kernel(float *x, int batch_size, int size) { 148 | int b = blockIdx.x; 149 | if (b < batch_size) { 150 | float max_val = x[b * size]; 151 | for (int i = 1; i < size; ++i) { 152 | max_val = fmaxf(max_val, x[b * size + i]); 153 | } 154 | 155 | float sum = 0.0f; 156 | for (int i = 0; i < size; ++i) { 157 | x[b * size + i] = expf(x[b * size + i] - max_val); 158 | sum += x[b * size + i]; 159 | } 160 | 161 | for (int i = 0; i < size; ++i) { 162 | x[b * size + i] = fmaxf(x[b * size + i] / sum, 1e-7f); 163 | } 164 | } 165 | } 166 | 167 | __global__ void clip_gradients_kernel(float *gradients, int size, float max_norm) { 168 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 169 | if (idx < size) { 170 | float grad = gradients[idx]; 171 | if (grad > max_norm) { 172 | gradients[idx] = max_norm; 173 | } else if (grad < -max_norm) { 174 | gradients[idx] = -max_norm; 175 | } 176 | } 177 | } 178 | 179 | 180 | // Modified forward function using CUDA kernels 181 | void forward(NeuralNetwork *nn, float *d_input, float *d_hidden, float *d_output, int batch_size) { 182 | // 1024 threads per block 183 | dim3 block_size(32, 32); 184 | // just enough blocks + threads for our naive matmul kernel 185 | dim3 grid_size((HIDDEN_SIZE + block_size.x - 1) / block_size.x, (batch_size + block_size.y - 1) / block_size.y); 186 | 187 | // Input to Hidden (X @ W1) 188 | matmul_a_b_kernel<<<grid_size, block_size>>>(d_input, nn->weights1, d_hidden, batch_size, INPUT_SIZE, HIDDEN_SIZE); 189 | CUDA_CHECK(cudaGetLastError()); 190 | 191 | // Add bias1 (one bias term for each neuron (multiple weights)) 192 | bias_add_kernel<<<(batch_size * HIDDEN_SIZE + 255) / 256, 256>>>(d_hidden, nn->bias1, batch_size, HIDDEN_SIZE); 193 | CUDA_CHECK(cudaGetLastError()); 194 | 195 | // Apply ReLU 196 | relu_kernel<<<(batch_size * HIDDEN_SIZE + 255) / 256, 256>>>(d_hidden, batch_size * HIDDEN_SIZE); 197 | CUDA_CHECK(cudaGetLastError()); 198 | 199 | // Hidden to Output (Hidden @ W2) 200 | grid_size.x = (OUTPUT_SIZE + block_size.x - 1) / block_size.x; 201 | grid_size.y = (batch_size + block_size.y - 1) / block_size.y; 202 | matmul_a_b_kernel<<<grid_size, block_size>>>(d_hidden, nn->weights2, d_output, batch_size, HIDDEN_SIZE, OUTPUT_SIZE); 203 | CUDA_CHECK(cudaGetLastError()); 204 | 205 | // Add bias2 (also one bias term per neuron) 206 | bias_add_kernel<<<(batch_size * OUTPUT_SIZE + 255) / 256, 256>>>(d_output, nn->bias2, batch_size, OUTPUT_SIZE); 207 | CUDA_CHECK(cudaGetLastError()); 208 | 209 | // Apply softmax 210 | softmax_kernel<<<batch_size, 1>>>(d_output, batch_size, OUTPUT_SIZE); 211 | CUDA_CHECK(cudaGetLastError()); 212 | 213 | CUDA_CHECK(cudaDeviceSynchronize()); 214 | } 215 | 216 | // Modify cross_entropy_loss to work with batches (w/out softmax because we already do this in the forward pass) 217 | float cross_entropy_loss(float *output, int *labels, int batch_size) { 218 | float total_loss = 0.0f; 219 | for (int b = 0; b < batch_size; b++) { 220 | total_loss -= logf(fmaxf(output[b * OUTPUT_SIZE + labels[b]], 1e-7f)); 221 | } 222 | return total_loss / batch_size; 223 | } 224 
| 225 | // Add this CUDA kernel to zero out gradients 226 | __global__ void zero_grad_kernel(float *grad, int size) { 227 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 228 | if (idx < size) { 229 | grad[idx] = 0.0f; 230 | } 231 | } 232 | 233 | // CUDA kernel for computing output gradients 234 | __global__ void compute_output_gradients_kernel(float *grad_output, float *output, int *labels, int batch_size) { 235 | int b = blockIdx.x * blockDim.x + threadIdx.x; 236 | if (b < batch_size) { 237 | for (int i = 0; i < OUTPUT_SIZE; ++i) { 238 | grad_output[b * OUTPUT_SIZE + i] = output[b * OUTPUT_SIZE + i]; 239 | } 240 | grad_output[b * OUTPUT_SIZE + labels[b]] -= 1.0f; 241 | } 242 | } 243 | 244 | // CUDA kernel for updating gradients 245 | __global__ void update_gradients_kernel(float *grad_weights, float *grad_bias, float *grad_layer, float *prev_layer, int batch_size, int prev_size, int curr_size) { 246 | int i = blockIdx.y; 247 | int j = blockIdx.x * blockDim.x + threadIdx.x; 248 | 249 | if (i < curr_size && j < prev_size) { 250 | float grad_w_sum = 0.0f; 251 | for (int b = 0; b < batch_size; ++b) { 252 | grad_w_sum += grad_layer[b * curr_size + i] * prev_layer[b * prev_size + j]; 253 | } 254 | atomicAdd(&grad_weights[i * prev_size + j], grad_w_sum); 255 | 256 | if (j == 0) { 257 | float grad_b_sum = 0.0f; 258 | for (int b = 0; b < batch_size; ++b) { 259 | grad_b_sum += grad_layer[b * curr_size + i]; 260 | } 261 | atomicAdd(&grad_bias[i], grad_b_sum); 262 | } 263 | } 264 | } 265 | 266 | __global__ void drelu_kernel(float *x, float *d_ReLU_out, int size) { 267 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 268 | if (idx < size) { 269 | d_ReLU_out[idx] = x[idx] > 0.0f ? 1.0f : 0.0f; 270 | } 271 | } 272 | 273 | // Element-wise multiplication of d_dX2 and d_grad_hidden 274 | __global__ void multiply_gradients_kernel(float *grad1, float *grad2, int size) { 275 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 276 | if (idx < size) { 277 | grad1[idx] *= grad2[idx]; 278 | } 279 | } 280 | 281 | // Modified backward function using CUDA kernels 282 | // shape rotating is on par with the visual example (excalidraw diagram) in the mnist-cuda git repo (also found in "assets") 283 | void backward(NeuralNetwork *nn, float *d_input, float *d_hidden, float *d_output, int *d_labels, int batch_size) { 284 | // Initialize gradients to zero using CUDA kernel 285 | 286 | zero_grad_kernel<<<(HIDDEN_SIZE * INPUT_SIZE + 256 - 1) / 256, 256>>>(nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE); 287 | CUDA_CHECK(cudaGetLastError()); 288 | 289 | zero_grad_kernel<<<(OUTPUT_SIZE * HIDDEN_SIZE + 256 - 1) / 256, 256>>>(nn->grad_weights2, OUTPUT_SIZE * HIDDEN_SIZE); 290 | CUDA_CHECK(cudaGetLastError()); 291 | 292 | zero_grad_kernel<<<(HIDDEN_SIZE + 256 - 1) / 256, 256>>>(nn->grad_bias1, HIDDEN_SIZE); 293 | CUDA_CHECK(cudaGetLastError()); 294 | 295 | zero_grad_kernel<<<(OUTPUT_SIZE + 256 - 1) / 256, 256>>>(nn->grad_bias2, OUTPUT_SIZE); 296 | CUDA_CHECK(cudaGetLastError()); 297 | 298 | // Compute gradients for output layer 299 | float *d_grad_output; 300 | CUDA_CHECK(cudaMalloc(&d_grad_output, batch_size * OUTPUT_SIZE * sizeof(float))); 301 | compute_output_gradients_kernel<<<(batch_size + 255) / 256, 256>>>(d_grad_output, d_output, d_labels, batch_size); 302 | CUDA_CHECK(cudaGetLastError()); 303 | 304 | // Update gradients for weights2 (W2.grad = grad_output.T @ hidden) 305 | dim3 block_size(32, 32); 306 | dim3 grid_size((HIDDEN_SIZE + block_size.x - 1) / block_size.x, (OUTPUT_SIZE + block_size.y - 1) / block_size.y); 307 
| matmul_at_b_kernel<<<grid_size, block_size>>>(d_hidden, d_grad_output, nn->grad_weights2, batch_size, HIDDEN_SIZE, OUTPUT_SIZE); 308 | CUDA_CHECK(cudaGetLastError()); 309 | 310 | // Update gradients for bias2 311 | update_gradients_kernel<<<grid_size, block_size>>>(nn->grad_weights2, nn->grad_bias2, d_grad_output, d_hidden, batch_size, HIDDEN_SIZE, OUTPUT_SIZE); 312 | CUDA_CHECK(cudaGetLastError()); 313 | 314 | // Compute dX2 (gradient of loss w.r.t. input of second layer) 315 | float *d_dX2; 316 | CUDA_CHECK(cudaMalloc(&d_dX2, batch_size * HIDDEN_SIZE * sizeof(float))); 317 | grid_size.x = (HIDDEN_SIZE + block_size.x - 1) / block_size.x; 318 | grid_size.y = (batch_size + block_size.y - 1) / block_size.y; 319 | matmul_a_bt_kernel<<<grid_size, block_size>>>(d_grad_output, nn->weights2, d_dX2, batch_size, OUTPUT_SIZE, HIDDEN_SIZE); 320 | CUDA_CHECK(cudaGetLastError()); 321 | 322 | // Compute d_ReLU_out (element-wise multiplication with ReLU derivative) 323 | float *d_grad_hidden; 324 | CUDA_CHECK(cudaMalloc(&d_grad_hidden, batch_size * HIDDEN_SIZE * sizeof(float))); 325 | drelu_kernel<<<(batch_size * HIDDEN_SIZE + 255) / 256, 256>>>(d_hidden, d_grad_hidden, batch_size * HIDDEN_SIZE); 326 | CUDA_CHECK(cudaGetLastError()); 327 | 328 | 329 | multiply_gradients_kernel<<<(batch_size * HIDDEN_SIZE + 255) / 256, 256>>>(d_dX2, d_grad_hidden, batch_size * HIDDEN_SIZE); 330 | CUDA_CHECK(cudaGetLastError()); 331 | 332 | // Update gradients for weights1 (W1.grad = d_ReLU_out.T @ input) 333 | grid_size.x = (INPUT_SIZE + block_size.x - 1) / block_size.x; 334 | grid_size.y = (HIDDEN_SIZE + block_size.y - 1) / block_size.y; 335 | matmul_at_b_kernel<<<grid_size, block_size>>>(d_input, d_dX2, nn->grad_weights1, batch_size, INPUT_SIZE, HIDDEN_SIZE); 336 | CUDA_CHECK(cudaGetLastError()); 337 | 338 | // Update gradients for bias1 339 | update_gradients_kernel<<<grid_size, block_size>>>(nn->grad_weights1, nn->grad_bias1, d_dX2, d_input, batch_size, INPUT_SIZE, HIDDEN_SIZE); 340 | CUDA_CHECK(cudaGetLastError()); 341 | 342 | // Free allocated memory 343 | CUDA_CHECK(cudaFree(d_grad_output)); 344 | CUDA_CHECK(cudaFree(d_dX2)); 345 | CUDA_CHECK(cudaFree(d_grad_hidden)); 346 | 347 | CUDA_CHECK(cudaDeviceSynchronize()); 348 | } 349 | 350 | // gradient descent step 351 | __global__ void update_weights_kernel(float *weights, float *grad_weights, int size) { 352 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 353 | if (idx < size) { 354 | weights[idx] -= LEARNING_RATE * grad_weights[idx]; 355 | } 356 | } 357 | 358 | void update_weights(NeuralNetwork *nn) { 359 | int block_size = 256; 360 | int grid_size; 361 | 362 | // Update weights1 363 | grid_size = (HIDDEN_SIZE * INPUT_SIZE + block_size - 1) / block_size; 364 | update_weights_kernel<<<grid_size, block_size>>>(nn->weights1, nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE); 365 | CUDA_CHECK(cudaGetLastError()); 366 | 367 | // Update weights2 368 | grid_size = (OUTPUT_SIZE * HIDDEN_SIZE + block_size - 1) / block_size; 369 | update_weights_kernel<<<grid_size, block_size>>>(nn->weights2, nn->grad_weights2, OUTPUT_SIZE * HIDDEN_SIZE); 370 | CUDA_CHECK(cudaGetLastError()); 371 | 372 | // Update bias1 373 | grid_size = (HIDDEN_SIZE + block_size - 1) / block_size; 374 | update_weights_kernel<<<grid_size, block_size>>>(nn->bias1, nn->grad_bias1, HIDDEN_SIZE); 375 | CUDA_CHECK(cudaGetLastError()); 376 | 377 | // Update bias2 378 | grid_size = (OUTPUT_SIZE + block_size - 1) / block_size; 379 | update_weights_kernel<<<grid_size, block_size>>>(nn->bias2, nn->grad_bias2, OUTPUT_SIZE); 380 | CUDA_CHECK(cudaGetLastError()); 381 | 382 | CUDA_CHECK(cudaDeviceSynchronize()); 383 | } 384 | 385 | 386 | // Modify evaluate_accuracy to handle larger datasets by processing in batches 387 | float
evaluate_accuracy(NeuralNetwork *nn, float *d_X_test, int *d_y_test, float *d_hidden, float *d_output, int total_size) { 388 | int num_batches = (total_size + BATCH_SIZE - 1) / BATCH_SIZE; 389 | int total_correct = 0; 390 | int total_processed = 0; 391 | 392 | for (int batch = 0; batch < num_batches; batch++) { 393 | int current_batch_size = (batch == num_batches - 1) ? 394 | (total_size - batch * BATCH_SIZE) : BATCH_SIZE; 395 | 396 | if (current_batch_size <= 0) break; 397 | 398 | forward(nn, &d_X_test[batch * BATCH_SIZE * INPUT_SIZE], 399 | d_hidden, d_output, current_batch_size); 400 | 401 | float *h_output = (float *)malloc(current_batch_size * OUTPUT_SIZE * sizeof(float)); 402 | int *h_y_test = (int *)malloc(current_batch_size * sizeof(int)); 403 | 404 | CUDA_CHECK(cudaMemcpy(h_output, d_output, 405 | current_batch_size * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 406 | CUDA_CHECK(cudaMemcpy(h_y_test, &d_y_test[batch * BATCH_SIZE], 407 | current_batch_size * sizeof(int), cudaMemcpyDeviceToHost)); 408 | 409 | for (int i = 0; i < current_batch_size; i++) { 410 | int predicted = 0; 411 | for (int j = 1; j < OUTPUT_SIZE; j++) { 412 | if (h_output[i * OUTPUT_SIZE + j] > h_output[i * OUTPUT_SIZE + predicted]) { 413 | predicted = j; 414 | } 415 | } 416 | if (predicted == h_y_test[i]) { 417 | total_correct++; 418 | } 419 | } 420 | 421 | total_processed += current_batch_size; 422 | free(h_output); 423 | free(h_y_test); 424 | } 425 | 426 | return 100.0f * total_correct / total_processed; 427 | } 428 | 429 | // Modify train function 430 | void train(NeuralNetwork *nn, float *X_train, int *y_train, float *X_test, int *y_test) { 431 | float *d_X_train, *d_X_test, *d_hidden, *d_output; 432 | int *d_y_train, *d_y_test; 433 | 434 | // Allocate memory for training and test data 435 | CUDA_CHECK(cudaMalloc(&d_X_train, TRAIN_SIZE * INPUT_SIZE * sizeof(float))); 436 | CUDA_CHECK(cudaMalloc(&d_X_test, TEST_SIZE * INPUT_SIZE * sizeof(float))); 437 | CUDA_CHECK(cudaMalloc(&d_hidden, BATCH_SIZE * HIDDEN_SIZE * sizeof(float))); 438 | CUDA_CHECK(cudaMalloc(&d_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); 439 | CUDA_CHECK(cudaMalloc(&d_y_train, TRAIN_SIZE * sizeof(int))); 440 | CUDA_CHECK(cudaMalloc(&d_y_test, TEST_SIZE * sizeof(int))); 441 | 442 | // Copy data to GPU 443 | CUDA_CHECK(cudaMemcpy(d_X_train, X_train, TRAIN_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice)); 444 | CUDA_CHECK(cudaMemcpy(d_X_test, X_test, TEST_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice)); 445 | CUDA_CHECK(cudaMemcpy(d_y_train, y_train, TRAIN_SIZE * sizeof(int), cudaMemcpyHostToDevice)); 446 | CUDA_CHECK(cudaMemcpy(d_y_test, y_test, TEST_SIZE * sizeof(int), cudaMemcpyHostToDevice)); 447 | 448 | int num_batches = TRAIN_SIZE / BATCH_SIZE; 449 | 450 | for (int epoch = 0; epoch < EPOCHS; epoch++) { 451 | float total_loss = 0.0f; 452 | 453 | // Zero out gradients at the beginning of each epoch 454 | zero_grad_kernel<<<(HIDDEN_SIZE * INPUT_SIZE + 256 - 1) / 256, 256>>>(nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE); 455 | zero_grad_kernel<<<(OUTPUT_SIZE * HIDDEN_SIZE + 256 - 1) / 256, 256>>>(nn->grad_weights2, OUTPUT_SIZE * HIDDEN_SIZE); 456 | zero_grad_kernel<<<(HIDDEN_SIZE + 256 - 1) / 256, 256>>>(nn->grad_bias1, HIDDEN_SIZE); 457 | zero_grad_kernel<<<(OUTPUT_SIZE + 256 - 1) / 256, 256>>>(nn->grad_bias2, OUTPUT_SIZE); 458 | CUDA_CHECK(cudaDeviceSynchronize()); 459 | 460 | for (int batch = 0; batch < num_batches; batch++) { 461 | int start_idx = batch * BATCH_SIZE; 462 | 463 | forward(nn, 
&d_X_train[start_idx * INPUT_SIZE], d_hidden, d_output, BATCH_SIZE); 464 | 465 | float *h_output = (float *)malloc(BATCH_SIZE * OUTPUT_SIZE * sizeof(float)); 466 | CUDA_CHECK(cudaMemcpy(h_output, d_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 467 | 468 | float loss = cross_entropy_loss(h_output, &y_train[start_idx], BATCH_SIZE); 469 | total_loss += loss; 470 | 471 | free(h_output); 472 | 473 | backward(nn, &d_X_train[start_idx * INPUT_SIZE], d_hidden, d_output, &d_y_train[start_idx], BATCH_SIZE); 474 | update_weights(nn); 475 | 476 | if ((batch + 1) % 100 == 0 || (epoch == 0 && batch == 0)) { 477 | // Use random batch from test set for accuracy reporting 478 | int test_start_idx = rand() % (TEST_SIZE - BATCH_SIZE); 479 | float test_accuracy = evaluate_accuracy(nn, 480 | &d_X_test[test_start_idx * INPUT_SIZE], 481 | &d_y_test[test_start_idx], 482 | d_hidden, d_output, BATCH_SIZE); 483 | 484 | printf("Epoch %d/%d, Iter %d/%d, Loss: %.4f, Test Accuracy: %.2f%%\n", 485 | epoch + 1, EPOCHS, batch + 1, num_batches, 486 | total_loss / (batch + 1), test_accuracy); 487 | } 488 | } 489 | 490 | // Evaluate on entire test set at end of epoch 491 | float test_accuracy = evaluate_accuracy(nn, d_X_test, d_y_test, d_hidden, d_output, TEST_SIZE); 492 | printf("Epoch %d/%d completed, Loss: %.4f, Test Accuracy: %.2f%%\n", 493 | epoch + 1, EPOCHS, total_loss / num_batches, test_accuracy); 494 | } 495 | 496 | // Free GPU memory 497 | CUDA_CHECK(cudaFree(d_X_train)); 498 | CUDA_CHECK(cudaFree(d_X_test)); 499 | CUDA_CHECK(cudaFree(d_hidden)); 500 | CUDA_CHECK(cudaFree(d_output)); 501 | CUDA_CHECK(cudaFree(d_y_train)); 502 | CUDA_CHECK(cudaFree(d_y_test)); 503 | } 504 | 505 | // Modified initialize function to allocate memory for gradients 506 | void initialize_neural_network(NeuralNetwork *nn) { 507 | CUDA_CHECK(cudaMalloc(&nn->weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float))); 508 | CUDA_CHECK(cudaMalloc(&nn->weights2, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float))); 509 | CUDA_CHECK(cudaMalloc(&nn->bias1, HIDDEN_SIZE * sizeof(float))); 510 | CUDA_CHECK(cudaMalloc(&nn->bias2, OUTPUT_SIZE * sizeof(float))); 511 | CUDA_CHECK(cudaMalloc(&nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float))); 512 | CUDA_CHECK(cudaMalloc(&nn->grad_weights2, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float))); 513 | CUDA_CHECK(cudaMalloc(&nn->grad_bias1, HIDDEN_SIZE * sizeof(float))); 514 | CUDA_CHECK(cudaMalloc(&nn->grad_bias2, OUTPUT_SIZE * sizeof(float))); 515 | 516 | // Allocate temporary host memory 517 | float *h_weights1 = (float *)malloc(HIDDEN_SIZE * INPUT_SIZE * sizeof(float)); 518 | float *h_weights2 = (float *)malloc(OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)); 519 | float *h_bias1 = (float *)malloc(HIDDEN_SIZE * sizeof(float)); 520 | float *h_bias2 = (float *)malloc(OUTPUT_SIZE * sizeof(float)); 521 | 522 | // Initialize weights and biases on the host 523 | initialize_weights(h_weights1, HIDDEN_SIZE * INPUT_SIZE); 524 | initialize_weights(h_weights2, OUTPUT_SIZE * HIDDEN_SIZE); 525 | initialize_bias(h_bias1, HIDDEN_SIZE); 526 | initialize_bias(h_bias2, OUTPUT_SIZE); 527 | 528 | // Copy initialized values to device 529 | CUDA_CHECK(cudaMemcpy(nn->weights1, h_weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice)); 530 | CUDA_CHECK(cudaMemcpy(nn->weights2, h_weights2, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float), cudaMemcpyHostToDevice)); 531 | CUDA_CHECK(cudaMemcpy(nn->bias1, h_bias1, HIDDEN_SIZE * sizeof(float), cudaMemcpyHostToDevice)); 532 | 
CUDA_CHECK(cudaMemcpy(nn->bias2, h_bias2, OUTPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice)); 533 | 534 | // Free temporary host memory 535 | free(h_weights1); 536 | free(h_weights2); 537 | free(h_bias1); 538 | free(h_bias2); 539 | } 540 | 541 | int main() { 542 | srand(time(NULL)); 543 | 544 | NeuralNetwork nn; 545 | initialize_neural_network(&nn); 546 | 547 | float *X_train = (float *)malloc(TRAIN_SIZE * INPUT_SIZE * sizeof(float)); 548 | int *y_train = (int *)malloc(TRAIN_SIZE * sizeof(int)); 549 | float *X_test = (float *)malloc(TEST_SIZE * INPUT_SIZE * sizeof(float)); 550 | int *y_test = (int *)malloc(TEST_SIZE * sizeof(int)); 551 | 552 | load_data("../../mnist_data/X_train.bin", X_train, TRAIN_SIZE * INPUT_SIZE); 553 | load_labels("../../mnist_data/y_train.bin", y_train, TRAIN_SIZE); 554 | load_data("../../mnist_data/X_test.bin", X_test, TEST_SIZE * INPUT_SIZE); 555 | load_labels("../../mnist_data/y_test.bin", y_test, TEST_SIZE); 556 | 557 | // print first image in the terminal 558 | for (int i = 0; i < 28; i++) { 559 | for (int j = 0; j < 28; j++) { 560 | if (X_train[0 * INPUT_SIZE + i * 28 + j] > 0.0f) { 561 | printf("X"); 562 | } else { 563 | printf(" "); 564 | } 565 | } 566 | printf("\n"); 567 | } 568 | 569 | printf("First 10 training labels: "); 570 | for (int i = 0; i < 10; i++) { 571 | printf("%d ", y_train[i]); 572 | } 573 | printf("\n"); 574 | 575 | // Start timing 576 | struct timespec start, end; 577 | clock_gettime(CLOCK_MONOTONIC, &start); 578 | 579 | train(&nn, X_train, y_train, X_test, y_test); 580 | 581 | // End timing 582 | clock_gettime(CLOCK_MONOTONIC, &end); 583 | 584 | // Calculate duration in seconds with milliseconds 585 | double training_time = (end.tv_sec - start.tv_sec) + 586 | (end.tv_nsec - start.tv_nsec) / 1e9; 587 | 588 | printf("\nTotal training time: %.2f sec\n", training_time); 589 | 590 | CUDA_CHECK(cudaFree(nn.weights1)); 591 | CUDA_CHECK(cudaFree(nn.weights2)); 592 | CUDA_CHECK(cudaFree(nn.bias1)); 593 | CUDA_CHECK(cudaFree(nn.bias2)); 594 | CUDA_CHECK(cudaFree(nn.grad_weights1)); 595 | CUDA_CHECK(cudaFree(nn.grad_weights2)); 596 | CUDA_CHECK(cudaFree(nn.grad_bias1)); 597 | CUDA_CHECK(cudaFree(nn.grad_bias2)); 598 | free(X_train); 599 | free(y_train); 600 | free(X_test); 601 | free(y_test); 602 | 603 | cudaError_t err = cudaGetLastError(); 604 | if (err != cudaSuccess) { 605 | fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err)); 606 | return 1; 607 | } 608 | 609 | return 0; 610 | } 611 | -------------------------------------------------------------------------------- /cuda/vroom/comparing/batch-compare-backward.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define CHECK_CUDA(call) { \ 9 | cudaError_t err = call; \ 10 | if (err != cudaSuccess) { \ 11 | fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ 12 | exit(EXIT_FAILURE); \ 13 | } \ 14 | } 15 | 16 | #define CHECK_CUBLAS(call) { \ 17 | cublasStatus_t status = call; \ 18 | if (status != CUBLAS_STATUS_SUCCESS) { \ 19 | fprintf(stderr, "cuBLAS error in %s:%d\n", __FILE__, __LINE__); \ 20 | exit(EXIT_FAILURE); \ 21 | } \ 22 | } 23 | 24 | __device__ float relu_derivative(float x) { 25 | return x > 0 ? 
1.0f : 0.0f; 26 | } 27 | 28 | __global__ void compute_output_gradient(float *output, int *labels, float *grad_output, int output_size, int batch_size) { 29 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 30 | int batch_idx = blockIdx.y; 31 | 32 | if (idx < output_size && batch_idx < batch_size) { 33 | int index = batch_idx * output_size + idx; 34 | grad_output[index] = output[index]; 35 | if (idx == labels[batch_idx]) { 36 | grad_output[index] -= 1.0f; 37 | } 38 | } 39 | } 40 | 41 | __global__ void compute_hidden_gradient(float *hidden, float *weights2, float *grad_output, float *grad_hidden, 42 | int hidden_size, int output_size, int batch_size) { 43 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 44 | int batch_idx = blockIdx.y; 45 | 46 | if (idx < hidden_size && batch_idx < batch_size) { 47 | float sum = 0.0f; 48 | for (int i = 0; i < output_size; i++) { 49 | sum += grad_output[batch_idx * output_size + i] * weights2[i * hidden_size + idx]; 50 | } 51 | int hidden_index = batch_idx * hidden_size + idx; 52 | grad_hidden[hidden_index] = sum * relu_derivative(hidden[hidden_index]); 53 | } 54 | } 55 | 56 | __global__ void compute_weight_gradients(float *input, float *grad_hidden, float *grad_weights1, 57 | int input_size, int hidden_size, int batch_size) { 58 | int hidden_idx = blockIdx.x * blockDim.x + threadIdx.x; 59 | int input_idx = blockIdx.y * blockDim.y + threadIdx.y; 60 | 61 | if (hidden_idx < hidden_size && input_idx < input_size) { 62 | float sum = 0.0f; 63 | for (int b = 0; b < batch_size; b++) { 64 | sum += grad_hidden[b * hidden_size + hidden_idx] * input[b * input_size + input_idx]; 65 | } 66 | grad_weights1[hidden_idx * input_size + input_idx] = sum; 67 | } 68 | } 69 | 70 | __global__ void compute_bias_gradients(float *grad_hidden, float *grad_bias1, int hidden_size, int batch_size) { 71 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 72 | 73 | if (idx < hidden_size) { 74 | float sum = 0.0f; 75 | for (int b = 0; b < batch_size; b++) { 76 | sum += grad_hidden[b * hidden_size + idx]; 77 | } 78 | grad_bias1[idx] = sum; 79 | } 80 | } 81 | 82 | __device__ float relu(float x) { 83 | return fmaxf(x, 0.0f); 84 | } 85 | 86 | __global__ void add_bias_and_relu(float *data, float *bias, int size, int batch_size) { 87 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 88 | int batch_idx = blockIdx.y; 89 | 90 | if (idx < size && batch_idx < batch_size) { 91 | int index = batch_idx * size + idx; 92 | data[index] = relu(data[index] + bias[idx]); 93 | } 94 | } 95 | 96 | __global__ void add_bias(float *data, float *bias, int size, int batch_size) { 97 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 98 | int batch_idx = blockIdx.y; 99 | 100 | if (idx < size && batch_idx < batch_size) { 101 | int index = batch_idx * size + idx; 102 | data[index] += bias[idx]; 103 | } 104 | } 105 | 106 | 107 | __global__ void matmul_forward_naive(float *A, float *B, float *C, int M, int N, int K) { 108 | int row = blockIdx.y * blockDim.y + threadIdx.y; 109 | int col = blockIdx.x * blockDim.x + threadIdx.x; 110 | 111 | if (row < M && col < N) { 112 | float sum = 0.0f; 113 | for (int i = 0; i < K; i++) { 114 | sum += A[row * K + i] * B[i * N + col]; 115 | } 116 | C[row * N + col] = sum; 117 | } 118 | } 119 | 120 | __global__ void add_bias_naive(float *input, float *bias, int rows, int cols) { 121 | int row = blockIdx.y * blockDim.y + threadIdx.y; 122 | int col = blockIdx.x * blockDim.x + threadIdx.x; 123 | 124 | if (row < rows && col < cols) { 125 | input[row * cols + col] += bias[col]; 126 | } 127 | } 
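// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the original file): the
// kernels above implement the usual softmax-cross-entropy backward pass. The
// formula grad_output = probs - one_hot(label) assumes `output` holds softmax
// probabilities; in this comparison harness the same subtraction is applied to
// the raw logits, which is fine for checking that the naive and cuBLAS paths
// agree. The remaining steps are
//   grad_hidden[b][h] = (sum_o grad_output[b][o] * W2[o][h]) * relu'(hidden[b][h])
//   grad_W1[h][i]     = sum_b grad_hidden[b][h] * input[b][i]
//   grad_b1[h]        = sum_b grad_hidden[b][h]
// The hypothetical helper below (a name introduced here, not in the original
// code) mirrors compute_weight_gradients on the CPU and can be used to
// spot-check the kernel's result for small batches.
static void cpu_grad_weights1_reference(const float *input, const float *grad_hidden,
                                        float *grad_weights1,
                                        int input_size, int hidden_size, int batch_size) {
    for (int h = 0; h < hidden_size; h++) {
        for (int i = 0; i < input_size; i++) {
            float sum = 0.0f;                          // reduce over the batch
            for (int b = 0; b < batch_size; b++) {
                sum += grad_hidden[b * hidden_size + h] * input[b * input_size + i];
            }
            grad_weights1[h * input_size + i] = sum;   // row-major [hidden x input]
        }
    }
}
// ---------------------------------------------------------------------------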
128 | 129 | __global__ void apply_relu_naive(float *input, int size) { 130 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 131 | 132 | if (idx < size) { 133 | input[idx] = relu(input[idx]); 134 | } 135 | } 136 | 137 | 138 | void forward_pass_naive(float *input, float *weights1, float *bias1, float *hidden, 139 | float *weights2, float *bias2, float *output, 140 | int input_size, int hidden_size, int output_size, int batch_size) { 141 | // Define grid and block dimensions 142 | dim3 block_dim(32, 32); 143 | dim3 grid_dim_1((hidden_size + block_dim.x - 1) / block_dim.x, (batch_size + block_dim.y - 1) / block_dim.y); 144 | dim3 grid_dim_2((output_size + block_dim.x - 1) / block_dim.x, (batch_size + block_dim.y - 1) / block_dim.y); 145 | 146 | 147 | // // print inputs 148 | // float *h_input = (float*)malloc(batch_size * input_size * sizeof(float)); 149 | // CHECK_CUDA(cudaMemcpy(h_input, input, batch_size * input_size * sizeof(float), cudaMemcpyDeviceToHost)); 150 | // std::cout << "input to naive: " << std::endl; 151 | // for (int i = 0; i < batch_size * input_size; i++) { 152 | // printf("%f ", h_input[i]); 153 | // if ((i+1) % input_size == 0) { 154 | // printf("\n"); 155 | // } 156 | // } 157 | // // copy back to device 158 | // CHECK_CUDA(cudaMemcpy(input, h_input, batch_size * input_size * sizeof(float), cudaMemcpyHostToDevice)); 159 | 160 | // First layer: input to hidden 161 | matmul_forward_naive<<<grid_dim_1, block_dim>>>(input, weights1, hidden, batch_size, hidden_size, input_size); 162 | 163 | // print "hidden" values 164 | // copy hidden to host 165 | // float *h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 166 | // CHECK_CUDA(cudaMemcpy(h_hidden, hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyDeviceToHost)); 167 | // std::cout << "naive hidden values (no bias): " << std::endl; 168 | // for (int i = 0; i < batch_size * hidden_size; i++) { 169 | // printf("%f ", h_hidden[i]); 170 | // if ((i+1) % hidden_size == 0) { 171 | // printf("\n"); 172 | // } 173 | // } 174 | // // copy back to device 175 | // CHECK_CUDA(cudaMemcpy(hidden, h_hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyHostToDevice)); 176 | 177 | 178 | add_bias_naive<<<grid_dim_1, block_dim>>>(hidden, bias1, batch_size, hidden_size); 179 | apply_relu_naive<<<(batch_size * hidden_size + 255) / 256, 256>>>(hidden, batch_size * hidden_size); 180 | 181 | // Second layer: hidden to output 182 | matmul_forward_naive<<<grid_dim_2, block_dim>>>(hidden, weights2, output, batch_size, output_size, hidden_size); 183 | add_bias_naive<<<grid_dim_2, block_dim>>>(output, bias2, batch_size, output_size); 184 | } 185 | 186 | void backward_pass_naive(float *input, float *hidden, float *output, int *labels, 187 | float *weights1, float *weights2, 188 | float *grad_weights1, float *grad_weights2, 189 | float *grad_bias1, float *grad_bias2, 190 | int input_size, int hidden_size, int output_size, int batch_size) { 191 | 192 | float *d_grad_output, *d_grad_hidden; 193 | CHECK_CUDA(cudaMalloc(&d_grad_output, batch_size * output_size * sizeof(float))); 194 | CHECK_CUDA(cudaMalloc(&d_grad_hidden, batch_size * hidden_size * sizeof(float))); 195 | 196 | dim3 block(256); 197 | dim3 grid_output((output_size + block.x - 1) / block.x, batch_size); 198 | dim3 grid_hidden((hidden_size + block.x - 1) / block.x, batch_size); 199 | 200 | compute_output_gradient<<<grid_output, block>>>(output, labels, d_grad_output, output_size, batch_size); 201 | compute_hidden_gradient<<<grid_hidden, block>>>(hidden, weights2, d_grad_output, d_grad_hidden, hidden_size, output_size, batch_size); 202 | 203 | dim3 block_weights(16, 16);
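// Editor's note (added comment): compute_weight_gradients is launched below over a 2D grid that covers the full (hidden_size x input_size) weight matrix; each thread owns one weight, loops over the batch, and writes its summed gradient directly, so no atomics are needed on this path.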
204 | dim3 grid_weights((hidden_size + block_weights.x - 1) / block_weights.x, 205 | (input_size + block_weights.y - 1) / block_weights.y); 206 | compute_weight_gradients<<<grid_weights, block_weights>>>(input, d_grad_hidden, grad_weights1, input_size, hidden_size, batch_size); 207 | 208 | compute_bias_gradients<<<(hidden_size + 255) / 256, 256>>>(d_grad_hidden, grad_bias1, hidden_size, batch_size); 209 | 210 | // For grad_weights2 and grad_bias2, we can reuse the existing kernels with different dimensions 211 | dim3 grid_weights2((output_size + block_weights.x - 1) / block_weights.x, 212 | (hidden_size + block_weights.y - 1) / block_weights.y); 213 | compute_weight_gradients<<<grid_weights2, block_weights>>>(hidden, d_grad_output, grad_weights2, hidden_size, output_size, batch_size); 214 | 215 | compute_bias_gradients<<<(output_size + 255) / 256, 256>>>(d_grad_output, grad_bias2, output_size, batch_size); 216 | 217 | CHECK_CUDA(cudaFree(d_grad_output)); 218 | CHECK_CUDA(cudaFree(d_grad_hidden)); 219 | } 220 | 221 | void backward_pass_cublas(cublasHandle_t handle, 222 | float *input, float *hidden, float *output, int *labels, 223 | float *weights1, float *weights2, 224 | float *grad_weights1, float *grad_weights2, 225 | float *grad_bias1, float *grad_bias2, 226 | int input_size, int hidden_size, int output_size, int batch_size) { 227 | float *d_grad_output, *d_grad_hidden; 228 | CHECK_CUDA(cudaMalloc(&d_grad_output, batch_size * output_size * sizeof(float))); 229 | CHECK_CUDA(cudaMalloc(&d_grad_hidden, batch_size * hidden_size * sizeof(float))); 230 | 231 | dim3 block(256); 232 | dim3 grid_output((output_size + block.x - 1) / block.x, batch_size); 233 | dim3 grid_hidden((hidden_size + block.x - 1) / block.x, batch_size); 234 | 235 | compute_output_gradient<<<grid_output, block>>>(output, labels, d_grad_output, output_size, batch_size); 236 | 237 | // Compute grad_hidden using cuBLAS 238 | float alpha = 1.0f, beta = 0.0f; 239 | CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, 240 | hidden_size, batch_size, output_size, 241 | &alpha, weights2, output_size, 242 | d_grad_output, output_size, 243 | &beta, d_grad_hidden, hidden_size)); 244 | 245 | // Apply ReLU derivative 246 | dim3 block_relu(256); 247 | dim3 grid_relu((batch_size * hidden_size + block_relu.x - 1) / block_relu.x); 248 | compute_hidden_gradient<<<grid_hidden, block>>>(hidden, weights2, d_grad_output, d_grad_hidden, hidden_size, output_size, batch_size); 249 | 250 | // Compute grad_weights1 using cuBLAS 251 | CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 252 | input_size, hidden_size, batch_size, 253 | &alpha, input, input_size, 254 | d_grad_hidden, hidden_size, 255 | &beta, grad_weights1, input_size)); 256 | 257 | // Compute grad_bias1 258 | compute_bias_gradients<<<(hidden_size + 255) / 256, 256>>>(d_grad_hidden, grad_bias1, hidden_size, batch_size); 259 | 260 | // Compute grad_weights2 using cuBLAS 261 | CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 262 | hidden_size, output_size, batch_size, 263 | &alpha, hidden, hidden_size, 264 | d_grad_output, output_size, 265 | &beta, grad_weights2, hidden_size)); 266 | 267 | // Compute grad_bias2 268 | compute_bias_gradients<<<(output_size + 255) / 256, 256>>>(d_grad_output, grad_bias2, output_size, batch_size); 269 | 270 | CHECK_CUDA(cudaFree(d_grad_output)); 271 | CHECK_CUDA(cudaFree(d_grad_hidden)); 272 | } 273 | 274 | void compare_results(float *output1, float *output2, int size) { 275 | float eps = 1e-5f; 276 | int max_diff_idx = 0; 277 | for (int i = 0; i < size; i++) { 278 | float diff = fabsf(output1[i] - output2[i]); 279 | if (diff > eps) { 280 |
printf("Results differ at index %d: %f vs %f\n", i, output1[i], output2[i]); 281 | max_diff_idx = i; 282 | break; 283 | } 284 | } 285 | } 286 | 287 | 288 | int main() { 289 | const int batch_size = 2; 290 | const int input_size = 4; 291 | const int hidden_size = 4; 292 | const int output_size = 1; 293 | 294 | size_t input_bytes = batch_size * input_size * sizeof(float); 295 | size_t hidden_bytes = batch_size * hidden_size * sizeof(float); 296 | size_t output_bytes = batch_size * output_size * sizeof(float); 297 | size_t weights1_bytes = input_size * hidden_size * sizeof(float); 298 | size_t weights2_bytes = hidden_size * output_size * sizeof(float); 299 | size_t bias1_bytes = hidden_size * sizeof(float); 300 | size_t bias2_bytes = output_size * sizeof(float); 301 | 302 | float *d_input, *d_hidden, *d_output, *d_output_cublas; 303 | float *d_weights1, *d_weights2, *d_bias1, *d_bias2; 304 | float *d_grad_weights1, *d_grad_weights2, *d_grad_bias1, *d_grad_bias2; 305 | float *d_grad_weights1_cublas, *d_grad_weights2_cublas, *d_grad_bias1_cublas, *d_grad_bias2_cublas; 306 | int *d_labels; 307 | 308 | // Allocate memory 309 | CHECK_CUDA(cudaMalloc(&d_input, input_bytes)); 310 | CHECK_CUDA(cudaMalloc(&d_hidden, hidden_bytes)); 311 | CHECK_CUDA(cudaMalloc(&d_output, output_bytes)); 312 | CHECK_CUDA(cudaMalloc(&d_output_cublas, output_bytes)); 313 | CHECK_CUDA(cudaMalloc(&d_weights1, weights1_bytes)); 314 | CHECK_CUDA(cudaMalloc(&d_weights2, weights2_bytes)); 315 | CHECK_CUDA(cudaMalloc(&d_bias1, bias1_bytes)); 316 | CHECK_CUDA(cudaMalloc(&d_bias2, bias2_bytes)); 317 | CHECK_CUDA(cudaMalloc(&d_grad_weights1, weights1_bytes)); 318 | CHECK_CUDA(cudaMalloc(&d_grad_weights2, weights2_bytes)); 319 | CHECK_CUDA(cudaMalloc(&d_grad_bias1, bias1_bytes)); 320 | CHECK_CUDA(cudaMalloc(&d_grad_bias2, bias2_bytes)); 321 | CHECK_CUDA(cudaMalloc(&d_grad_weights1_cublas, weights1_bytes)); 322 | CHECK_CUDA(cudaMalloc(&d_grad_weights2_cublas, weights2_bytes)); 323 | CHECK_CUDA(cudaMalloc(&d_grad_bias1_cublas, bias1_bytes)); 324 | CHECK_CUDA(cudaMalloc(&d_grad_bias2_cublas, bias2_bytes)); 325 | CHECK_CUDA(cudaMalloc(&d_labels, batch_size * sizeof(int))); 326 | 327 | // Initialize data 328 | float h_input[batch_size * input_size] = {1.0f, 2.0f, 3.0f, 4.0f, 329 | 2.0f, 4.0f, 6.0f, 8.0f}; 330 | float h_weights1[input_size * hidden_size] = {1.0f, 2.0f, 3.0f, 4.0f, 331 | 2.0f, 4.0f, 6.0f, 8.0f, 332 | 3.0f, 6.0f, 9.0f, 12.0f, 333 | 4.0f, 8.0f, 12.0f, 16.0f}; 334 | float h_bias1[hidden_size] = {1.0f, 2.0f, 3.0f, 4.0f}; 335 | float h_weights2[hidden_size * output_size] = {1.0f, 2.0f, 3.0f, 4.0f}; 336 | float h_bias2[output_size] = {1.0f}; 337 | int h_labels[batch_size] = {0, 0}; // Assuming binary classification 338 | 339 | // Copy data to device 340 | CHECK_CUDA(cudaMemcpy(d_input, h_input, input_bytes, cudaMemcpyHostToDevice)); 341 | CHECK_CUDA(cudaMemcpy(d_weights1, h_weights1, weights1_bytes, cudaMemcpyHostToDevice)); 342 | CHECK_CUDA(cudaMemcpy(d_bias1, h_bias1, bias1_bytes, cudaMemcpyHostToDevice)); 343 | CHECK_CUDA(cudaMemcpy(d_weights2, h_weights2, weights2_bytes, cudaMemcpyHostToDevice)); 344 | CHECK_CUDA(cudaMemcpy(d_bias2, h_bias2, bias2_bytes, cudaMemcpyHostToDevice)); 345 | CHECK_CUDA(cudaMemcpy(d_labels, h_labels, batch_size * sizeof(int), cudaMemcpyHostToDevice)); 346 | 347 | // Forward pass (naive) 348 | forward_pass_naive(d_input, d_weights1, d_bias1, d_hidden, 349 | d_weights2, d_bias2, d_output, 350 | input_size, hidden_size, output_size, batch_size); 351 | 352 | // Forward pass (cuBLAS) 353 | 
cublasHandle_t handle; 354 | CHECK_CUBLAS(cublasCreate(&handle)); 355 | // forward_pass_cublas(handle, d_input, d_weights1, d_bias1, d_hidden, 356 | // d_weights2, d_bias2, d_output_cublas, 357 | // input_size, hidden_size, output_size, batch_size); 358 | 359 | // Backward pass (naive) 360 | backward_pass_naive(d_input, d_hidden, d_output, d_labels, 361 | d_weights1, d_weights2, 362 | d_grad_weights1, d_grad_weights2, 363 | d_grad_bias1, d_grad_bias2, 364 | input_size, hidden_size, output_size, batch_size); 365 | 366 | // Backward pass (cuBLAS) 367 | backward_pass_cublas(handle, d_input, d_hidden, d_output_cublas, d_labels, 368 | d_weights1, d_weights2, 369 | d_grad_weights1_cublas, d_grad_weights2_cublas, 370 | d_grad_bias1_cublas, d_grad_bias2_cublas, 371 | input_size, hidden_size, output_size, batch_size); 372 | 373 | CHECK_CUBLAS(cublasDestroy(handle)); 374 | 375 | // Compare results 376 | float *h_output = (float*)malloc(output_bytes); 377 | float *h_output_cublas = (float*)malloc(output_bytes); 378 | float *h_grad_weights1 = (float*)malloc(weights1_bytes); 379 | float *h_grad_weights1_cublas = (float*)malloc(weights1_bytes); 380 | 381 | CHECK_CUDA(cudaMemcpy(h_output, d_output, output_bytes, cudaMemcpyDeviceToHost)); 382 | CHECK_CUDA(cudaMemcpy(h_output_cublas, d_output_cublas, output_bytes, cudaMemcpyDeviceToHost)); 383 | CHECK_CUDA(cudaMemcpy(h_grad_weights1, d_grad_weights1, weights1_bytes, cudaMemcpyDeviceToHost)); 384 | CHECK_CUDA(cudaMemcpy(h_grad_weights1_cublas, d_grad_weights1_cublas, weights1_bytes, cudaMemcpyDeviceToHost)); 385 | 386 | printf("Comparing forward pass results:\n"); 387 | compare_results(h_output, h_output_cublas, batch_size * output_size); 388 | 389 | printf("Comparing backward pass results (grad_weights1):\n"); 390 | compare_results(h_grad_weights1, h_grad_weights1_cublas, input_size * hidden_size); 391 | 392 | // Free memory 393 | free(h_output); 394 | free(h_output_cublas); 395 | free(h_grad_weights1); 396 | free(h_grad_weights1_cublas); 397 | 398 | CHECK_CUDA(cudaFree(d_input)); 399 | CHECK_CUDA(cudaFree(d_hidden)); 400 | CHECK_CUDA(cudaFree(d_output)); 401 | CHECK_CUDA(cudaFree(d_output_cublas)); 402 | CHECK_CUDA(cudaFree(d_weights1)); 403 | CHECK_CUDA(cudaFree(d_weights2)); 404 | CHECK_CUDA(cudaFree(d_bias1)); 405 | CHECK_CUDA(cudaFree(d_bias2)); 406 | CHECK_CUDA(cudaFree(d_grad_weights1)); 407 | CHECK_CUDA(cudaFree(d_grad_weights2)); 408 | CHECK_CUDA(cudaFree(d_grad_bias1)); 409 | CHECK_CUDA(cudaFree(d_grad_bias2)); 410 | CHECK_CUDA(cudaFree(d_grad_weights1_cublas)); 411 | CHECK_CUDA(cudaFree(d_grad_weights2_cublas)); 412 | CHECK_CUDA(cudaFree(d_grad_bias1_cublas)); 413 | CHECK_CUDA(cudaFree(d_grad_bias2_cublas)); 414 | CHECK_CUDA(cudaFree(d_labels)); 415 | 416 | return 0; 417 | } -------------------------------------------------------------------------------- /cuda/vroom/comparing/batch-compare-forward.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | 11 | #define CHECK_CURAND(call) { \ 12 | curandStatus_t status = call; \ 13 | if (status != CURAND_STATUS_SUCCESS) { \ 14 | fprintf(stderr, "cuRAND error in %s:%d\n", __FILE__, __LINE__); \ 15 | exit(EXIT_FAILURE); \ 16 | } \ 17 | } 18 | 19 | #define CHECK_CUDA(call) { \ 20 | cudaError_t err = call; \ 21 | if (err != cudaSuccess) { \ 22 | fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ 23 | 
exit(EXIT_FAILURE); \ 24 | } \ 25 | } 26 | 27 | #define CHECK_CUBLAS(call) { \ 28 | cublasStatus_t status = call; \ 29 | if (status != CUBLAS_STATUS_SUCCESS) { \ 30 | fprintf(stderr, "cuBLAS error in %s:%d\n", __FILE__, __LINE__); \ 31 | exit(EXIT_FAILURE); \ 32 | } \ 33 | } 34 | 35 | // __device__ float relu(float x) { 36 | // return fmaxf(x, 0.0f); 37 | // } 38 | 39 | // __global__ void matmul_forward_naive(float *A, float *B, float *C, int M, int N, int K) { 40 | // int row = blockIdx.y * blockDim.y + threadIdx.y; 41 | // int col = blockIdx.x * blockDim.x + threadIdx.x; 42 | 43 | // if (row < M && col < N) { 44 | // float sum = 0.0f; 45 | // for (int i = 0; i < K; i++) { 46 | // sum += A[row * K + i] * B[i * N + col]; 47 | // } 48 | // C[row * N + col] = sum; 49 | // } 50 | // } 51 | 52 | // __global__ void forward_pass(float *input, float *weights1, float *bias1, float *hidden, 53 | // float *weights2, float *bias2, float *output, 54 | // int input_size, int hidden_size, int output_size, int batch_size) { 55 | // int idx = blockIdx.x * blockDim.x + threadIdx.x; 56 | // int batch_idx = blockIdx.y; 57 | 58 | // if (idx < hidden_size && batch_idx < batch_size) { 59 | // float sum = 0.0f; 60 | // for (int i = 0; i < input_size; i++) { 61 | // sum += weights1[idx * input_size + i] * input[batch_idx * input_size + i]; 62 | // } 63 | // float hidden_val = relu(sum + bias1[idx]); 64 | // hidden[batch_idx * hidden_size + idx] = hidden_val; 65 | // } 66 | 67 | // __syncthreads(); 68 | 69 | // if (idx < output_size && batch_idx < batch_size) { 70 | // float sum = 0.0f; 71 | // for (int i = 0; i < hidden_size; i++) { 72 | // sum += weights2[idx * hidden_size + i] * hidden[batch_idx * hidden_size + i]; 73 | // } 74 | // float output_val = sum + bias2[idx]; 75 | // output[batch_idx * output_size + idx] = output_val; 76 | // } 77 | // } 78 | 79 | 80 | __device__ float relu(float x) { 81 | return fmaxf(x, 0.0f); 82 | } 83 | 84 | __global__ void add_bias_and_relu(float *data, float *bias, int size, int batch_size) { 85 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 86 | int batch_idx = blockIdx.y; 87 | 88 | if (idx < size && batch_idx < batch_size) { 89 | int index = batch_idx * size + idx; 90 | data[index] = relu(data[index] + bias[idx]); 91 | } 92 | } 93 | 94 | __global__ void add_bias(float *data, float *bias, int size, int batch_size) { 95 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 96 | int batch_idx = blockIdx.y; 97 | 98 | if (idx < size && batch_idx < batch_size) { 99 | int index = batch_idx * size + idx; 100 | data[index] += bias[idx]; 101 | } 102 | } 103 | 104 | 105 | __global__ void matmul_forward_naive(float *A, float *B, float *C, int M, int N, int K) { 106 | int row = blockIdx.y * blockDim.y + threadIdx.y; 107 | int col = blockIdx.x * blockDim.x + threadIdx.x; 108 | 109 | if (row < M && col < N) { 110 | float sum = 0.0f; 111 | for (int i = 0; i < K; i++) { 112 | sum += A[row * K + i] * B[i * N + col]; 113 | } 114 | C[row * N + col] = sum; 115 | } 116 | } 117 | 118 | __global__ void add_bias_naive(float *input, float *bias, int rows, int cols) { 119 | int row = blockIdx.y * blockDim.y + threadIdx.y; 120 | int col = blockIdx.x * blockDim.x + threadIdx.x; 121 | 122 | if (row < rows && col < cols) { 123 | input[row * cols + col] += bias[col]; 124 | } 125 | } 126 | 127 | __global__ void apply_relu_naive(float *input, int size) { 128 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 129 | 130 | if (idx < size) { 131 | input[idx] = relu(input[idx]); 132 | } 133 | } 134 | 135 | void 
cublasMatmul(cublasHandle_t handle, float *d_A, float *d_B, float *d_C, int M, int K, int N) { 136 | float alpha = 1.0f, beta = 0.0f; 137 | 138 | CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, 139 | &alpha, d_B, N, d_A, K, &beta, d_C, N)); 140 | 141 | } 142 | 143 | __global__ void forward_pass(float *input, float *weights1, float *bias1, float *hidden, 144 | float *weights2, float *bias2, float *output, 145 | int input_size, int hidden_size, int output_size, int batch_size) { 146 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 147 | int batch_idx = blockIdx.y; 148 | 149 | if (idx < hidden_size && batch_idx < batch_size) { 150 | float sum = 0.0f; 151 | for (int i = 0; i < input_size; i++) { 152 | sum += weights1[idx * input_size + i] * input[batch_idx * input_size + i]; 153 | } 154 | hidden[batch_idx * hidden_size + idx] = relu(sum + bias1[idx]); 155 | } 156 | 157 | __syncthreads(); 158 | 159 | if (idx < output_size && batch_idx < batch_size) { 160 | float sum = 0.0f; 161 | for (int i = 0; i < hidden_size; i++) { 162 | sum += weights2[idx * hidden_size + i] * hidden[batch_idx * hidden_size + i]; 163 | } 164 | output[batch_idx * output_size + idx] = sum + bias2[idx]; 165 | } 166 | } 167 | 168 | void forward_pass_wrapper(float *d_input, float *d_weights1, float *d_bias1, float *d_hidden, 169 | float *d_weights2, float *d_bias2, float *d_output, 170 | int input_size, int hidden_size, int output_size, int batch_size) { 171 | dim3 block_dim(256); 172 | dim3 grid_dim((max(hidden_size, output_size) + block_dim.x - 1) / block_dim.x, batch_size); 173 | 174 | forward_pass<<>>(d_input, d_weights1, d_bias1, d_hidden, 175 | d_weights2, d_bias2, d_output, 176 | input_size, hidden_size, output_size, batch_size); 177 | 178 | // Print hidden layer values 179 | float *h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 180 | CHECK_CUDA(cudaMemcpy(h_hidden, d_hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyDeviceToHost)); 181 | std::cout << "Forward pass hidden layer values:" << std::endl; 182 | for (int i = 0; i < batch_size * hidden_size; i++) { 183 | printf("%f ", h_hidden[i]); 184 | if ((i+1) % hidden_size == 0) printf("\n"); 185 | } 186 | free(h_hidden); 187 | 188 | // Print output values 189 | float *h_output = (float*)malloc(batch_size * output_size * sizeof(float)); 190 | CHECK_CUDA(cudaMemcpy(h_output, d_output, batch_size * output_size * sizeof(float), cudaMemcpyDeviceToHost)); 191 | std::cout << "Forward pass output values:" << std::endl; 192 | for (int i = 0; i < batch_size * output_size; i++) { 193 | printf("%f ", h_output[i]); 194 | if ((i+1) % output_size == 0) printf("\n"); 195 | } 196 | free(h_output); 197 | } 198 | 199 | void forward_pass_naive(float *input, float *weights1, float *bias1, float *hidden, 200 | float *weights2, float *bias2, float *output, 201 | int input_size, int hidden_size, int output_size, int batch_size) { 202 | dim3 block_dim(32, 32); 203 | dim3 grid_dim_1((hidden_size + block_dim.x - 1) / block_dim.x, (batch_size + block_dim.y - 1) / block_dim.y); 204 | dim3 grid_dim_2((output_size + block_dim.x - 1) / block_dim.x, (batch_size + block_dim.y - 1) / block_dim.y); 205 | 206 | // First layer: input to hidden 207 | matmul_forward_naive<<>>(input, weights1, hidden, batch_size, hidden_size, input_size); 208 | 209 | // Print hidden values after matmul 210 | float *h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 211 | CHECK_CUDA(cudaMemcpy(h_hidden, hidden, batch_size * hidden_size * sizeof(float), 
cudaMemcpyDeviceToHost)); 212 | std::cout << "Naive hidden values after matmul:" << std::endl; 213 | for (int i = 0; i < batch_size * hidden_size; i++) { 214 | printf("%f ", h_hidden[i]); 215 | if ((i+1) % hidden_size == 0) printf("\n"); 216 | } 217 | free(h_hidden); 218 | 219 | add_bias_naive<<<grid_dim_1, block_dim>>>(hidden, bias1, batch_size, hidden_size); 220 | 221 | // Print hidden values after adding bias 222 | h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 223 | CHECK_CUDA(cudaMemcpy(h_hidden, hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyDeviceToHost)); 224 | std::cout << "Naive hidden values after adding bias:" << std::endl; 225 | for (int i = 0; i < batch_size * hidden_size; i++) { 226 | printf("%f ", h_hidden[i]); 227 | if ((i+1) % hidden_size == 0) printf("\n"); 228 | } 229 | free(h_hidden); 230 | 231 | apply_relu_naive<<<(batch_size * hidden_size + 255) / 256, 256>>>(hidden, batch_size * hidden_size); 232 | 233 | // Print hidden values after ReLU 234 | h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 235 | CHECK_CUDA(cudaMemcpy(h_hidden, hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyDeviceToHost)); 236 | std::cout << "Naive hidden values after ReLU:" << std::endl; 237 | for (int i = 0; i < batch_size * hidden_size; i++) { 238 | printf("%f ", h_hidden[i]); 239 | if ((i+1) % hidden_size == 0) printf("\n"); 240 | } 241 | free(h_hidden); 242 | 243 | // Second layer: hidden to output 244 | matmul_forward_naive<<<grid_dim_2, block_dim>>>(hidden, weights2, output, batch_size, output_size, hidden_size); 245 | add_bias_naive<<<grid_dim_2, block_dim>>>(output, bias2, batch_size, output_size); 246 | 247 | // Print final output 248 | float *h_output = (float*)malloc(batch_size * output_size * sizeof(float)); 249 | CHECK_CUDA(cudaMemcpy(h_output, output, batch_size * output_size * sizeof(float), cudaMemcpyDeviceToHost)); 250 | std::cout << "Naive final output:" << std::endl; 251 | for (int i = 0; i < batch_size * output_size; i++) { 252 | printf("%f ", h_output[i]); 253 | if ((i+1) % output_size == 0) printf("\n"); 254 | } 255 | std::cout << std::endl << std::endl; 256 | free(h_output); 257 | } 258 | 259 | void forward_pass_cublas(cublasHandle_t handle, float *input, float *weights1, float *bias1, float *hidden, 260 | float *weights2, float *bias2, float *output, 261 | int input_size, int hidden_size, int output_size, int batch_size) { 262 | float alpha = 1.0f, beta = 0.0f; 263 | 264 | // First layer: input to hidden 265 | cublasMatmul(handle, input, weights1, hidden, batch_size, input_size, hidden_size); 266 | 267 | // Print hidden values after matmul 268 | float *h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 269 | CHECK_CUDA(cudaMemcpy(h_hidden, hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyDeviceToHost)); 270 | std::cout << "cuBLAS hidden values after matmul:" << std::endl; 271 | for (int i = 0; i < batch_size * hidden_size; i++) { 272 | printf("%f ", h_hidden[i]); 273 | if ((i+1) % hidden_size == 0) printf("\n"); 274 | } 275 | free(h_hidden); 276 | 277 | dim3 block(256); 278 | dim3 grid((hidden_size + block.x - 1) / block.x, batch_size); 279 | add_bias_and_relu<<<grid, block>>>(hidden, bias1, hidden_size, batch_size); 280 | 281 | // Print hidden values after bias and ReLU 282 | h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 283 | CHECK_CUDA(cudaMemcpy(h_hidden, hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyDeviceToHost)); 284 | std::cout << "cuBLAS hidden values after bias and ReLU:" << std::endl; 285 | for (int i = 0; i <
batch_size * hidden_size; i++) { 286 | printf("%f ", h_hidden[i]); 287 | if ((i+1) % hidden_size == 0) printf("\n"); 288 | } 289 | free(h_hidden); 290 | 291 | // Second layer: hidden to output 292 | cublasMatmul(handle, hidden, weights2, output, batch_size, hidden_size, output_size); 293 | 294 | grid = dim3((output_size + block.x - 1) / block.x, batch_size); 295 | add_bias<<>>(output, bias2, output_size, batch_size); 296 | 297 | // Print final output 298 | float *h_output = (float*)malloc(batch_size * output_size * sizeof(float)); 299 | CHECK_CUDA(cudaMemcpy(h_output, output, batch_size * output_size * sizeof(float), cudaMemcpyDeviceToHost)); 300 | std::cout << "cuBLAS final output:" << std::endl; 301 | for (int i = 0; i < batch_size * output_size; i++) { 302 | printf("%f ", h_output[i]); 303 | if ((i+1) % output_size == 0) printf("\n"); 304 | } 305 | free(h_output); 306 | } 307 | 308 | void compare_results(float *output1, float *output2, int size) { 309 | float max_diff = 0.0f; 310 | int max_diff_idx = 0; 311 | for (int i = 0; i < size; i++) { 312 | float diff = fabsf(output1[i] - output2[i]); 313 | if (diff > max_diff) { 314 | max_diff = diff; 315 | max_diff_idx = i; 316 | std::cout << "max_diff_idx: " << max_diff_idx << std::endl; 317 | } 318 | } 319 | } 320 | 321 | __global__ void scale_array(float *arr, int size, float scale) { 322 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 323 | if (idx < size) { 324 | arr[idx] = (arr[idx] - 0.5f) * scale; 325 | } 326 | } 327 | 328 | int main() { 329 | const int batch_size = 2; // M -> batch_size 330 | const int input_size = 4; // K -> 784 331 | const int hidden_size = 4; // N -> 256 332 | const int output_size = 1; // O 333 | 334 | size_t input_bytes = batch_size * input_size * sizeof(float); // M * K 335 | size_t hidden_bytes = batch_size * hidden_size * sizeof(float); // M * N 336 | size_t output_bytes = batch_size * output_size * sizeof(float); // M * O 337 | size_t weights1_bytes = input_size * hidden_size * sizeof(float); // K * N 338 | size_t weights2_bytes = hidden_size * output_size * sizeof(float); // N * O 339 | size_t bias1_bytes = hidden_size * sizeof(float); 340 | size_t bias2_bytes = output_size * sizeof(float); 341 | 342 | float *d_input, *d_weights1, *d_bias1, *d_hidden, *d_weights2, *d_bias2, *d_output, *d_output_cublas; 343 | 344 | CHECK_CUDA(cudaMalloc(&d_input, input_bytes)); 345 | CHECK_CUDA(cudaMalloc(&d_weights1, weights1_bytes)); 346 | CHECK_CUDA(cudaMalloc(&d_bias1, bias1_bytes)); 347 | CHECK_CUDA(cudaMalloc(&d_hidden, hidden_bytes)); 348 | CHECK_CUDA(cudaMalloc(&d_weights2, weights2_bytes)); 349 | CHECK_CUDA(cudaMalloc(&d_bias2, bias2_bytes)); 350 | CHECK_CUDA(cudaMalloc(&d_output, output_bytes)); 351 | CHECK_CUDA(cudaMalloc(&d_output_cublas, output_bytes)); 352 | 353 | float h_input[batch_size * input_size] = {1.0f, -2.0f, 3.0f, -4.0f, 354 | 2.0f, -4.0f, 6.0f, -8.0f}; 355 | 356 | float h_weights1[input_size * hidden_size] = {-1.0f, 2.0f, 3.0f, 4.0f, 357 | 2.0f, -4.0f, 6.0f, 8.0f, 358 | 3.0f, 6.0f, -9.0f, 12.0f, 359 | 4.0f, 8.0f, 12.0f, -16.0f}; 360 | 361 | float h_bias1[hidden_size] = {-1.0f, -2.0f, -3.0f, -4.0f}; 362 | 363 | float h_weights2[hidden_size * output_size] = {1.0f, 2.0f, 3.0f, 4.0f}; 364 | 365 | float h_bias2[output_size] = {-1.0f}; 366 | 367 | CHECK_CUDA(cudaMemcpy(d_input, h_input, input_bytes, cudaMemcpyHostToDevice)); 368 | 369 | CHECK_CUDA(cudaMemcpy(d_weights1, h_weights1, weights1_bytes, cudaMemcpyHostToDevice)); 370 | 371 | CHECK_CUDA(cudaMemcpy(d_bias1, h_bias1, bias1_bytes, 
cudaMemcpyHostToDevice)); 372 | 373 | CHECK_CUDA(cudaMemcpy(d_weights2, h_weights2, weights2_bytes, cudaMemcpyHostToDevice)); 374 | 375 | CHECK_CUDA(cudaMemcpy(d_bias2, h_bias2, bias2_bytes, cudaMemcpyHostToDevice)); 376 | 377 | 378 | dim3 block(256); 379 | dim3 grid((max(hidden_size, output_size) + block.x - 1) / block.x, batch_size); 380 | forward_pass_naive(d_input, d_weights1, d_bias1, d_hidden, 381 | d_weights2, d_bias2, d_output, 382 | input_size, hidden_size, output_size, batch_size); 383 | 384 | CHECK_CUDA(cudaGetLastError()); 385 | CHECK_CUDA(cudaDeviceSynchronize()); 386 | 387 | cublasHandle_t handle; 388 | CHECK_CUBLAS(cublasCreate(&handle)); 389 | forward_pass_cublas(handle, d_input, d_weights1, d_bias1, d_hidden, 390 | d_weights2, d_bias2, d_output_cublas, 391 | input_size, hidden_size, output_size, batch_size); 392 | CHECK_CUBLAS(cublasDestroy(handle)); 393 | 394 | float *h_output = (float*)malloc(output_bytes); 395 | float *h_output_cublas = (float*)malloc(output_bytes); 396 | CHECK_CUDA(cudaMemcpy(h_output, d_output, output_bytes, cudaMemcpyDeviceToHost)); 397 | CHECK_CUDA(cudaMemcpy(h_output_cublas, d_output_cublas, output_bytes, cudaMemcpyDeviceToHost)); 398 | // In main() 399 | forward_pass_wrapper(d_input, d_weights1, d_bias1, d_hidden, 400 | d_weights2, d_bias2, d_output, 401 | input_size, hidden_size, output_size, batch_size); 402 | 403 | CHECK_CUDA(cudaGetLastError()); 404 | CHECK_CUDA(cudaDeviceSynchronize()); 405 | 406 | compare_results(h_output, h_output_cublas, batch_size * output_size); 407 | 408 | free(h_output); 409 | free(h_output_cublas); 410 | CHECK_CUDA(cudaFree(d_input)); 411 | CHECK_CUDA(cudaFree(d_weights1)); 412 | CHECK_CUDA(cudaFree(d_bias1)); 413 | CHECK_CUDA(cudaFree(d_hidden)); 414 | CHECK_CUDA(cudaFree(d_weights2)); 415 | CHECK_CUDA(cudaFree(d_bias2)); 416 | CHECK_CUDA(cudaFree(d_output)); 417 | CHECK_CUDA(cudaFree(d_output_cublas)); 418 | 419 | return 0; 420 | } -------------------------------------------------------------------------------- /cuda/vroom/comparing/batch-matmul-compare.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define CHECK_CUDA(call) { \ 8 | cudaError_t err = call; \ 9 | if (err != cudaSuccess) { \ 10 | fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ 11 | exit(EXIT_FAILURE); \ 12 | } \ 13 | } 14 | 15 | #define CHECK_CUBLAS(call) { \ 16 | cublasStatus_t status = call; \ 17 | if (status != CUBLAS_STATUS_SUCCESS) { \ 18 | fprintf(stderr, "cuBLAS error in %s:%d\n", __FILE__, __LINE__); \ 19 | exit(EXIT_FAILURE); \ 20 | } \ 21 | } 22 | 23 | // Naive matrix multiplication kernel 24 | __global__ void naiveMatmulKernel(float* A, float* B, float* C, int M, int N, int K) { 25 | int row = blockIdx.y * blockDim.y + threadIdx.y; 26 | int col = blockIdx.x * blockDim.x + threadIdx.x; 27 | 28 | if (row < M && col < N) { 29 | float sum = 0.0f; 30 | for (int i = 0; i < K; ++i) { 31 | sum += A[row * K + i] * B[i * N + col]; 32 | } 33 | C[row * N + col] = sum; 34 | } 35 | } 36 | 37 | // Function to initialize a matrix with random values 38 | void initMatrix(float* mat, int rows, int cols) { 39 | for (int i = 0; i < rows * cols; ++i) { 40 | // mat[i] = static_cast(rand()) / RAND_MAX; 41 | // set to i and static cast to float 42 | mat[i] = static_cast(i) * 0.05; 43 | } 44 | } 45 | 46 | // Function to compare two matrices 47 | bool compareMatrices(float* A, float* B, int size, float tolerance = 
1e-5) { 48 | for (int i = 0; i < size; ++i) { 49 | std::cout << "A[" << i << "] = " << A[i] << " B[" << i << "] = " << B[i] << std::endl; 50 | if (fabs(A[i] - B[i]) > tolerance) { 51 | return false; 52 | } 53 | } 54 | return true; 55 | } 56 | 57 | int main() { 58 | const int M = 4; // batchsize 59 | const int K = 4; // input size 60 | const int N = 6; // hidden size 61 | 62 | // (batch_size, input_size) x (input_size, hidden_size) = (batch_size, hidden_size) = (4, 6) 63 | 64 | 65 | size_t bytes_A = M * K * sizeof(float); 66 | size_t bytes_B = K * N * sizeof(float); 67 | size_t bytes_C = M * N * sizeof(float); 68 | 69 | float *h_A, *h_B, *h_C_naive, *h_C_cublas; 70 | float *d_A, *d_B, *d_C_naive, *d_C_cublas; 71 | 72 | // Allocate host memory 73 | h_A = (float*)malloc(bytes_A); 74 | h_B = (float*)malloc(bytes_B); 75 | h_C_naive = (float*)malloc(bytes_C); 76 | h_C_cublas = (float*)malloc(bytes_C); 77 | 78 | // Initialize matrices 79 | initMatrix(h_A, M, K); 80 | initMatrix(h_B, K, N); 81 | 82 | // Allocate device memory 83 | CHECK_CUDA(cudaMalloc(&d_A, bytes_A)); 84 | CHECK_CUDA(cudaMalloc(&d_B, bytes_B)); 85 | CHECK_CUDA(cudaMalloc(&d_C_naive, bytes_C)); 86 | CHECK_CUDA(cudaMalloc(&d_C_cublas, bytes_C)); 87 | 88 | // Copy data to device 89 | CHECK_CUDA(cudaMemcpy(d_A, h_A, bytes_A, cudaMemcpyHostToDevice)); 90 | CHECK_CUDA(cudaMemcpy(d_B, h_B, bytes_B, cudaMemcpyHostToDevice)); 91 | 92 | // Naive kernel 93 | dim3 blockDim(32, 32); 94 | dim3 gridDim((N + blockDim.x - 1) / blockDim.x, (M + blockDim.y - 1) / blockDim.y); 95 | naiveMatmulKernel<<>>(d_A, d_B, d_C_naive, M, N, K); 96 | CHECK_CUDA(cudaGetLastError()); 97 | CHECK_CUDA(cudaDeviceSynchronize()); 98 | 99 | // cuBLAS 100 | cublasHandle_t handle; 101 | CHECK_CUBLAS(cublasCreate(&handle)); 102 | float alpha = 1.0f; 103 | float beta = 0.0f; 104 | 105 | 106 | // w @ x -> 107 | CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, 108 | &alpha, d_B, N, d_A, K, &beta, d_C_cublas, N)); 109 | 110 | CHECK_CUBLAS(cublasDestroy(handle)); 111 | 112 | // Copy results back to host 113 | CHECK_CUDA(cudaMemcpy(h_C_naive, d_C_naive, bytes_C, cudaMemcpyDeviceToHost)); 114 | CHECK_CUDA(cudaMemcpy(h_C_cublas, d_C_cublas, bytes_C, cudaMemcpyDeviceToHost)); 115 | 116 | // Compare results 117 | bool results_match = compareMatrices(h_C_naive, h_C_cublas, M * N); 118 | if (results_match) { 119 | printf("Naive and cuBLAS results match!\n"); 120 | } else { 121 | printf("Naive and cuBLAS results do not match!\n"); 122 | } 123 | 124 | // print all results 125 | std::cout << "naive\n"; 126 | for (int i = 0; i < M * K; i++) { 127 | // std::cout << "naive idx " << i << " = " << h_C_naive[i] << std::endl; 128 | // std::cout << "cublas idx " << i << " = " << h_C_cublas[i] << std::endl; 129 | std::cout << h_C_naive[i]; 130 | if (i % M == 0) { 131 | std::cout << "\n"; 132 | } 133 | 134 | } 135 | 136 | std::cout << "\n\n"; 137 | std::cout << "cublas\n"; 138 | for (int i = 0; i < M * K; i++) { 139 | std::cout << h_C_cublas[i]; 140 | if (i % M == 0) { 141 | std::cout << "\n"; 142 | } 143 | } 144 | 145 | // Free memory 146 | free(h_A); 147 | free(h_B); 148 | free(h_C_naive); 149 | free(h_C_cublas); 150 | CHECK_CUDA(cudaFree(d_A)); 151 | CHECK_CUDA(cudaFree(d_B)); 152 | CHECK_CUDA(cudaFree(d_C_naive)); 153 | CHECK_CUDA(cudaFree(d_C_cublas)); 154 | 155 | return 0; 156 | } -------------------------------------------------------------------------------- /cuda/vroom/v1.cu: -------------------------------------------------------------------------------- 1 | #include 
2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define INPUT_SIZE 784 8 | #define HIDDEN_SIZE 256 9 | #define OUTPUT_SIZE 10 10 | #define BATCH_SIZE 4 11 | 12 | #define CUDA_CHECK(call) \ 13 | do { \ 14 | cudaError_t error = call; \ 15 | if (error != cudaSuccess) { \ 16 | fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \ 17 | cudaGetErrorString(error)); \ 18 | exit(EXIT_FAILURE); \ 19 | } \ 20 | } while(0) 21 | 22 | __global__ void init_random(float *data, int size, unsigned long long seed) { 23 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 24 | if (idx < size) { 25 | curandState state; 26 | curand_init(seed, idx, 0, &state); 27 | data[idx] = curand_uniform(&state) * 2.0f - 1.0f; 28 | } 29 | } 30 | 31 | __global__ void relu_derivative(float *grad, float *x, int size) { 32 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 33 | if (idx < size) { 34 | grad[idx] *= (x[idx] > 0) ? 1.0f : 0.0f; 35 | } 36 | } 37 | 38 | 39 | __global__ void backward_pass_naive(float *input, float *hidden, float *output, int *labels, 40 | float *weights1, float *weights2, 41 | float *grad_weights1, float *grad_weights2, 42 | float *grad_bias1, float *grad_bias2, 43 | int input_size, int hidden_size, int output_size, int batch_size) { 44 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 45 | int batch_idx = blockIdx.y; 46 | 47 | __shared__ float grad_output[OUTPUT_SIZE]; 48 | 49 | if (idx < output_size && batch_idx < batch_size) { 50 | grad_output[idx] = output[batch_idx * output_size + idx]; 51 | if (idx == labels[batch_idx]) { 52 | grad_output[idx] -= 1.0f; 53 | } 54 | } 55 | 56 | __syncthreads(); 57 | 58 | if (idx < hidden_size && batch_idx < batch_size) { 59 | float grad_hidden = 0.0f; 60 | for (int i = 0; i < output_size; i++) { 61 | grad_hidden += grad_output[i] * weights2[i * hidden_size + idx]; 62 | } 63 | grad_hidden *= (hidden[batch_idx * hidden_size + idx] > 0) ? 
1.0f : 0.0f; // ReLU derivative 64 | 65 | for (int i = 0; i < input_size; i++) { 66 | atomicAdd(&grad_weights1[idx * input_size + i], grad_hidden * input[batch_idx * input_size + i]); 67 | } 68 | atomicAdd(&grad_bias1[idx], grad_hidden); 69 | } 70 | 71 | if (idx < output_size * hidden_size && batch_idx < batch_size) { 72 | int i = idx / hidden_size; 73 | int j = idx % hidden_size; 74 | atomicAdd(&grad_weights2[idx], grad_output[i] * hidden[batch_idx * hidden_size + j]); 75 | } 76 | 77 | if (idx < output_size && batch_idx < batch_size) { 78 | atomicAdd(&grad_bias2[idx], grad_output[idx]); 79 | } 80 | } 81 | 82 | __global__ void compute_output_gradient(float *output, int *labels, float *grad_output, int output_size, int batch_size) { 83 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 84 | int batch_idx = blockIdx.y; 85 | 86 | if (idx < output_size && batch_idx < batch_size) { 87 | int index = batch_idx * output_size + idx; 88 | grad_output[index] = output[index]; 89 | if (idx == labels[batch_idx]) { 90 | grad_output[index] -= 1.0f; 91 | } 92 | } 93 | } 94 | 95 | __global__ void compute_hidden_gradient(float *grad_hidden, float *grad_output, float *weights2, float *hidden, 96 | int hidden_size, int output_size, int batch_size) { 97 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 98 | int batch_idx = blockIdx.y; 99 | 100 | if (idx < hidden_size && batch_idx < batch_size) { 101 | float grad = 0.0f; 102 | for (int i = 0; i < output_size; i++) { 103 | grad += grad_output[batch_idx * output_size + i] * weights2[i * hidden_size + idx]; 104 | } 105 | grad_hidden[batch_idx * hidden_size + idx] = grad * ((hidden[batch_idx * hidden_size + idx] > 0) ? 1.0f : 0.0f); 106 | } 107 | } 108 | 109 | __global__ void compute_bias_gradient(float *grad_bias, float *grad, int size, int batch_size) { 110 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 111 | 112 | if (idx < size) { 113 | float sum = 0.0f; 114 | for (int i = 0; i < batch_size; i++) { 115 | sum += grad[i * size + idx]; 116 | } 117 | grad_bias[idx] = sum; 118 | } 119 | } 120 | 121 | // 3/4 working 122 | void backward_pass_cublas(cublasHandle_t handle, float *d_input, float *d_hidden, float *d_output, int *d_labels, 123 | float *d_weights1, float *d_weights2, 124 | float *d_grad_weights1, float *d_grad_weights2, 125 | float *d_grad_bias1, float *d_grad_bias2, 126 | float *d_grad_output, float *d_grad_hidden, float *d_ones, 127 | int input_size, int hidden_size, int output_size, int batch_size) { 128 | float alpha = 1.0f, beta = 0.0f; 129 | 130 | // Compute output gradient 131 | dim3 block_size(256); 132 | dim3 grid_size((output_size + block_size.x - 1) / block_size.x, batch_size); 133 | compute_output_gradient<<>>(d_output, d_labels, d_grad_output, output_size, batch_size); 134 | 135 | // Compute dW2 = dLoss @ x2.T = (10, B) @ (B, 256) = (10, 256) 136 | cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 137 | hidden_size, output_size, batch_size, // (M K N) 138 | &alpha, 139 | d_hidden, hidden_size, 140 | d_grad_output, output_size, 141 | &beta, 142 | d_grad_weights2, hidden_size); 143 | 144 | // Compute hidden gradient 145 | grid_size.x = (hidden_size + block_size.x - 1) / block_size.x; 146 | compute_hidden_gradient<<>>(d_grad_hidden, d_grad_output, d_weights2, d_hidden, 147 | hidden_size, output_size, batch_size); 148 | 149 | // Compute dW1 = dRelu @ x1.T = (256, B) @ (B, 784) = (256, 784) 150 | cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 151 | input_size, hidden_size, batch_size, 152 | &alpha, 153 | d_input, input_size, 154 | d_grad_hidden, 
hidden_size, 155 | &beta, 156 | d_grad_weights1, input_size); 157 | 158 | // Compute bias gradients 159 | compute_bias_gradient<<<(output_size + 255) / 256, 256>>>(d_grad_bias2, d_grad_output, output_size, batch_size); 160 | compute_bias_gradient<<<(hidden_size + 255) / 256, 256>>>(d_grad_bias1, d_grad_hidden, hidden_size, batch_size); 161 | } 162 | 163 | void print_comparison(const char* name, float* arr1, float* arr2, int size) { 164 | float max_diff = 0.0f; 165 | printf("%s:\n", name); 166 | printf("First 10 values:\n"); 167 | for (int i = 0; i < 10 && i < size; i++) { 168 | printf("%.6f vs %.6f\n", arr1[i], arr2[i]); 169 | max_diff = fmaxf(max_diff, fabsf(arr1[i] - arr2[i])); 170 | } 171 | for (int i = 10; i < size; i++) { 172 | max_diff = fmaxf(max_diff, fabsf(arr1[i] - arr2[i])); 173 | } 174 | printf("Max difference: %.6f\n\n", max_diff); 175 | } 176 | 177 | int main() { 178 | // Allocate host memory 179 | float *h_input, *h_hidden, *h_output, *h_weights1, *h_weights2; 180 | int *h_labels; 181 | float *h_grad_weights1_naive, *h_grad_weights2_naive, *h_grad_bias1_naive, *h_grad_bias2_naive; 182 | float *h_grad_weights1_cublas, *h_grad_weights2_cublas, *h_grad_bias1_cublas, *h_grad_bias2_cublas; 183 | 184 | cudaMallocHost(&h_input, BATCH_SIZE * INPUT_SIZE * sizeof(float)); 185 | cudaMallocHost(&h_hidden, BATCH_SIZE * HIDDEN_SIZE * sizeof(float)); 186 | cudaMallocHost(&h_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float)); 187 | cudaMallocHost(&h_weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float)); 188 | cudaMallocHost(&h_weights2, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)); 189 | cudaMallocHost(&h_labels, BATCH_SIZE * sizeof(int)); 190 | cudaMallocHost(&h_grad_weights1_naive, HIDDEN_SIZE * INPUT_SIZE * sizeof(float)); 191 | cudaMallocHost(&h_grad_weights2_naive, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)); 192 | cudaMallocHost(&h_grad_bias1_naive, HIDDEN_SIZE * sizeof(float)); 193 | cudaMallocHost(&h_grad_bias2_naive, OUTPUT_SIZE * sizeof(float)); 194 | cudaMallocHost(&h_grad_weights1_cublas, HIDDEN_SIZE * INPUT_SIZE * sizeof(float)); 195 | cudaMallocHost(&h_grad_weights2_cublas, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)); 196 | cudaMallocHost(&h_grad_bias1_cublas, HIDDEN_SIZE * sizeof(float)); 197 | cudaMallocHost(&h_grad_bias2_cublas, OUTPUT_SIZE * sizeof(float)); 198 | 199 | // Allocate device memory 200 | float *d_input, *d_hidden, *d_output, *d_weights1, *d_weights2; 201 | int *d_labels; 202 | float *d_grad_weights1_naive, *d_grad_weights2_naive, *d_grad_bias1_naive, *d_grad_bias2_naive; 203 | float *d_grad_weights1_cublas, *d_grad_weights2_cublas, *d_grad_bias1_cublas, *d_grad_bias2_cublas; 204 | float *d_grad_output, *d_grad_hidden, *d_ones; 205 | 206 | CUDA_CHECK(cudaMalloc(&d_input, BATCH_SIZE * INPUT_SIZE * sizeof(float))); 207 | CUDA_CHECK(cudaMalloc(&d_hidden, BATCH_SIZE * HIDDEN_SIZE * sizeof(float))); 208 | CUDA_CHECK(cudaMalloc(&d_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); 209 | CUDA_CHECK(cudaMalloc(&d_weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float))); 210 | CUDA_CHECK(cudaMalloc(&d_weights2, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float))); 211 | CUDA_CHECK(cudaMalloc(&d_labels, BATCH_SIZE * sizeof(int))); 212 | CUDA_CHECK(cudaMalloc(&d_grad_weights1_naive, HIDDEN_SIZE * INPUT_SIZE * sizeof(float))); 213 | CUDA_CHECK(cudaMalloc(&d_grad_weights2_naive, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float))); 214 | CUDA_CHECK(cudaMalloc(&d_grad_bias1_naive, HIDDEN_SIZE * sizeof(float))); 215 | CUDA_CHECK(cudaMalloc(&d_grad_bias2_naive, OUTPUT_SIZE * sizeof(float))); 216 | 
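// Editor's note (added comment): separate *_naive and *_cublas gradient buffers are allocated so both backward implementations can run on identical inputs and be compared element-wise at the end of main(); d_ones is passed to backward_pass_cublas but is not actually used by the code shown here.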
CUDA_CHECK(cudaMalloc(&d_grad_weights1_cublas, HIDDEN_SIZE * INPUT_SIZE * sizeof(float))); 217 | CUDA_CHECK(cudaMalloc(&d_grad_weights2_cublas, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float))); 218 | CUDA_CHECK(cudaMalloc(&d_grad_bias1_cublas, HIDDEN_SIZE * sizeof(float))); 219 | CUDA_CHECK(cudaMalloc(&d_grad_bias2_cublas, OUTPUT_SIZE * sizeof(float))); 220 | CUDA_CHECK(cudaMalloc(&d_grad_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); 221 | CUDA_CHECK(cudaMalloc(&d_grad_hidden, BATCH_SIZE * HIDDEN_SIZE * sizeof(float))); 222 | CUDA_CHECK(cudaMalloc(&d_ones, BATCH_SIZE * sizeof(float))); 223 | 224 | // Initialize random data 225 | int threads = 256; 226 | int blocks; 227 | unsigned long long seed = time(NULL); 228 | 229 | blocks = (BATCH_SIZE * INPUT_SIZE + threads - 1) / threads; 230 | init_random<<<blocks, threads>>>(d_input, BATCH_SIZE * INPUT_SIZE, seed); 231 | 232 | blocks = (BATCH_SIZE * HIDDEN_SIZE + threads - 1) / threads; 233 | init_random<<<blocks, threads>>>(d_hidden, BATCH_SIZE * HIDDEN_SIZE, seed); 234 | 235 | blocks = (BATCH_SIZE * OUTPUT_SIZE + threads - 1) / threads; 236 | init_random<<<blocks, threads>>>(d_output, BATCH_SIZE * OUTPUT_SIZE, seed); 237 | 238 | blocks = (HIDDEN_SIZE * INPUT_SIZE + threads - 1) / threads; 239 | init_random<<<blocks, threads>>>(d_weights1, HIDDEN_SIZE * INPUT_SIZE, seed); 240 | 241 | blocks = (OUTPUT_SIZE * HIDDEN_SIZE + threads - 1) / threads; 242 | init_random<<<blocks, threads>>>(d_weights2, OUTPUT_SIZE * HIDDEN_SIZE, seed); 243 | 244 | // Initialize labels with random values between 0 and OUTPUT_SIZE - 1 245 | for (int i = 0; i < BATCH_SIZE; i++) { 246 | h_labels[i] = rand() % OUTPUT_SIZE; 247 | } 248 | CUDA_CHECK(cudaMemcpy(d_labels, h_labels, BATCH_SIZE * sizeof(int), cudaMemcpyHostToDevice)); 249 | 250 | // Initialize d_ones with all 1's 251 | CUDA_CHECK(cudaMemset(d_ones, 1, BATCH_SIZE * sizeof(float))); 252 | 253 | // Allocate host memory for grad_output 254 | float *h_grad_output_naive, *h_grad_output_cublas; 255 | cudaMallocHost(&h_grad_output_naive, BATCH_SIZE * OUTPUT_SIZE * sizeof(float)); 256 | cudaMallocHost(&h_grad_output_cublas, BATCH_SIZE * OUTPUT_SIZE * sizeof(float)); 257 | 258 | // Allocate device memory for grad_output_naive 259 | float *d_grad_output_naive; 260 | CUDA_CHECK(cudaMalloc(&d_grad_output_naive, BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); 261 | 262 | // Perform naive backward pass 263 | dim3 block_size(256); 264 | dim3 grid_size((max(HIDDEN_SIZE, OUTPUT_SIZE) + block_size.x - 1) / block_size.x, BATCH_SIZE); 265 | backward_pass_naive<<<grid_size, block_size>>>(d_input, d_hidden, d_output, d_labels, 266 | d_weights1, d_weights2, 267 | d_grad_weights1_naive, d_grad_weights2_naive, 268 | d_grad_bias1_naive, d_grad_bias2_naive, 269 | INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, BATCH_SIZE); 270 | 271 | // Compute grad_output for naive approach 272 | compute_output_gradient<<<grid_size, block_size>>>(d_output, d_labels, d_grad_output_naive, OUTPUT_SIZE, BATCH_SIZE); 273 | 274 | // Perform cuBLAS backward pass 275 | cublasHandle_t handle; 276 | cublasCreate(&handle); 277 | backward_pass_cublas(handle, d_input, d_hidden, d_output, d_labels, 278 | d_weights1, d_weights2, 279 | d_grad_weights1_cublas, d_grad_weights2_cublas, 280 | d_grad_bias1_cublas, d_grad_bias2_cublas, 281 | d_grad_output, d_grad_hidden, d_ones, 282 | INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, BATCH_SIZE); 283 | cublasDestroy(handle); 284 | 285 | // Copy results back to host 286 | CUDA_CHECK(cudaMemcpy(h_grad_weights1_naive, d_grad_weights1_naive, HIDDEN_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 287 | CUDA_CHECK(cudaMemcpy(h_grad_weights2_naive, d_grad_weights2_naive,
OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 288 | CUDA_CHECK(cudaMemcpy(h_grad_bias1_naive, d_grad_bias1_naive, HIDDEN_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 289 | CUDA_CHECK(cudaMemcpy(h_grad_bias2_naive, d_grad_bias2_naive, OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 290 | 291 | CUDA_CHECK(cudaMemcpy(h_grad_weights1_cublas, d_grad_weights1_cublas, HIDDEN_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 292 | CUDA_CHECK(cudaMemcpy(h_grad_weights2_cublas, d_grad_weights2_cublas, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 293 | CUDA_CHECK(cudaMemcpy(h_grad_bias1_cublas, d_grad_bias1_cublas, HIDDEN_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 294 | CUDA_CHECK(cudaMemcpy(h_grad_bias2_cublas, d_grad_bias2_cublas, OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 295 | 296 | CUDA_CHECK(cudaMemcpy(h_grad_output_naive, d_grad_output_naive, BATCH_SIZE * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 297 | CUDA_CHECK(cudaMemcpy(h_grad_output_cublas, d_grad_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 298 | 299 | // Compare and print results 300 | print_comparison("grad_output", h_grad_output_naive, h_grad_output_cublas, BATCH_SIZE * OUTPUT_SIZE); 301 | print_comparison("grad_weights2", h_grad_weights2_naive, h_grad_weights2_cublas, OUTPUT_SIZE * HIDDEN_SIZE); 302 | 303 | // print indices of x > 1e-2 here (h_grad_weights2_naive, h_grad_weights2_cublas): 304 | for (int i = 0; i < OUTPUT_SIZE * HIDDEN_SIZE; i += 16) { 305 | if (fabsf(h_grad_weights2_naive[i] - h_grad_weights2_cublas[i]) > 1e-3) { 306 | printf("Index %d: %.6f vs %.6f\n", i, h_grad_weights2_naive[i], h_grad_weights2_cublas[i]); 307 | } 308 | } 309 | 310 | print_comparison("grad_bias2", h_grad_bias2_naive, h_grad_bias2_cublas, OUTPUT_SIZE); 311 | print_comparison("grad_bias2", h_grad_bias2_naive, h_grad_bias2_cublas, OUTPUT_SIZE); 312 | print_comparison("grad_weights1", h_grad_weights1_naive, h_grad_weights1_cublas, HIDDEN_SIZE * INPUT_SIZE); 313 | print_comparison("grad_bias1", h_grad_bias1_naive, h_grad_bias1_cublas, HIDDEN_SIZE); 314 | 315 | // Free memory 316 | cudaFreeHost(h_input); 317 | cudaFreeHost(h_hidden); 318 | cudaFreeHost(h_output); 319 | cudaFreeHost(h_weights1); 320 | cudaFreeHost(h_weights2); 321 | cudaFreeHost(h_labels); 322 | cudaFreeHost(h_grad_weights1_naive); 323 | cudaFreeHost(h_grad_weights2_naive); 324 | cudaFreeHost(h_grad_bias1_naive); 325 | cudaFreeHost(h_grad_bias2_naive); 326 | cudaFreeHost(h_grad_weights1_cublas); 327 | cudaFreeHost(h_grad_weights2_cublas); 328 | cudaFreeHost(h_grad_bias1_cublas); 329 | cudaFreeHost(h_grad_bias2_cublas); 330 | cudaFreeHost(h_grad_output_naive); 331 | cudaFreeHost(h_grad_output_cublas); 332 | 333 | cudaFree(d_input); 334 | cudaFree(d_hidden); 335 | cudaFree(d_output); 336 | cudaFree(d_weights1); 337 | cudaFree(d_weights2); 338 | cudaFree(d_labels); 339 | cudaFree(d_grad_weights1_naive); 340 | cudaFree(d_grad_weights2_naive); 341 | cudaFree(d_grad_bias1_naive); 342 | cudaFree(d_grad_bias2_naive); 343 | cudaFree(d_grad_weights1_cublas); 344 | cudaFree(d_grad_weights2_cublas); 345 | cudaFree(d_grad_bias1_cublas); 346 | cudaFree(d_grad_bias2_cublas); 347 | cudaFree(d_grad_output); 348 | cudaFree(d_grad_hidden); 349 | cudaFree(d_ones); 350 | cudaFree(d_grad_output_naive); 351 | 352 | return 0; 353 | } 354 | -------------------------------------------------------------------------------- /downloader.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from torchvision import datasets, transforms 5 | 6 | # Set the directory where you want to save the files 7 | save_dir = "mnist_data" 8 | os.makedirs(save_dir, exist_ok=True) 9 | 10 | # Download and load the MNIST dataset 11 | transform = transforms.Compose([transforms.ToTensor()]) 12 | mnist_train = datasets.MNIST( 13 | root="./data", train=True, download=True, transform=transform 14 | ) 15 | mnist_test = datasets.MNIST( 16 | root="./data", train=False, download=True, transform=transform 17 | ) 18 | 19 | # Convert to numpy arrays and normalize 20 | X_train = mnist_train.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0 21 | y_train = mnist_train.targets.numpy().astype(np.int32) 22 | X_test = mnist_test.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0 23 | y_test = mnist_test.targets.numpy().astype(np.int32) 24 | 25 | # Save the data as raw binary files 26 | X_train.tofile(os.path.join(save_dir, "X_train.bin")) 27 | y_train.tofile(os.path.join(save_dir, "y_train.bin")) 28 | X_test.tofile(os.path.join(save_dir, "X_test.bin")) 29 | y_test.tofile(os.path.join(save_dir, "y_test.bin")) 30 | 31 | # Save metadata 32 | with open(os.path.join(save_dir, "metadata.txt"), "w") as f: 33 | f.write(f"Training samples: {X_train.shape[0]}\n") 34 | f.write(f"Test samples: {X_test.shape[0]}\n") 35 | f.write(f"Input dimensions: {X_train.shape[1]}\n") 36 | f.write(f"Number of classes: {len(np.unique(y_train))}\n") 37 | 38 | print("MNIST dataset has been downloaded and saved in binary format.") 39 | -------------------------------------------------------------------------------- /naive-cpu/v1.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define INPUT_SIZE 784 8 | #define HIDDEN_SIZE 1 9 | #define OUTPUT_SIZE 10 10 | #define TRAIN_SIZE 10000 11 | #define TEST_SIZE 1000 12 | #define BATCH_SIZE 4 13 | #define EPOCHS 10 14 | #define LEARNING_RATE 0.001 15 | 16 | typedef struct { 17 | float *weights1; 18 | float *weights2; 19 | float *bias1; 20 | float *bias2; 21 | float *grad_weights1; 22 | float *grad_weights2; 23 | float *grad_bias1; 24 | float *grad_bias2; 25 | } NeuralNetwork; 26 | 27 | 28 | // load batched img data 29 | void load_data(const char *filename, float *data, int size) { 30 | FILE *file = fopen(filename, "rb"); 31 | if (file == NULL) { 32 | fprintf(stderr, "Error opening file: %s\n", filename); 33 | exit(1); 34 | } 35 | size_t read_size = fread(data, sizeof(float), size, file); 36 | if (read_size != size) { 37 | fprintf(stderr, "Error reading data: expected %d elements, got %zu\n", size, read_size); 38 | exit(1); 39 | } 40 | fclose(file); 41 | } 42 | 43 | // load batch labels 44 | void load_labels(const char *filename, int *labels, int size) { 45 | FILE *file = fopen(filename, "rb"); 46 | if (file == NULL) { 47 | fprintf(stderr, "Error opening file: %s\n", filename); 48 | exit(1); 49 | } 50 | size_t read_size = fread(labels, sizeof(int), size, file); 51 | if (read_size != size) { 52 | fprintf(stderr, "Error reading labels: expected %d elements, got %zu\n", size, read_size); 53 | exit(1); 54 | } 55 | fclose(file); 56 | } 57 | 58 | // kaiming init func for weights 59 | void initialize_weights(float *weights, int size) { 60 | float scale = sqrtf(2.0f / size); 61 | for (int i = 0; i < size; i++) { 62 | weights[i] = ((float)rand() / RAND_MAX) * 
scale - (scale / 2.0f); 63 | } 64 | } 65 | 66 | // basic init for biases 67 | void initialize_bias(float *bias, int size) { 68 | for (int i = 0; i < size; i++) { 69 | bias[i] = 0.0f; 70 | } 71 | } 72 | 73 | // Modify softmax to work with batches 74 | void softmax(float *x, int batch_size, int size) { 75 | for (int b = 0; b < batch_size; b++) { 76 | float max = x[b * size]; 77 | for (int i = 1; i < size; i++) { 78 | if (x[b * size + i] > max) max = x[b * size + i]; 79 | } 80 | float sum = 0.0f; 81 | for (int i = 0; i < size; i++) { 82 | x[b * size + i] = expf(x[b * size + i] - max); 83 | sum += x[b * size + i]; 84 | } 85 | for (int i = 0; i < size; i++) { 86 | x[b * size + i] = fmaxf(x[b * size + i] / sum, 1e-7f); 87 | } 88 | } 89 | } 90 | 91 | void matmul_a_b(float *A, float *B, float *C, int m, int n, int k) { 92 | for (int i = 0; i < m; i++) { 93 | for (int j = 0; j < k; j++) { 94 | C[i * k + j] = 0.0f; 95 | for (int l = 0; l < n; l++) { 96 | C[i * k + j] += A[i * n + l] * B[l * k + j]; 97 | } 98 | } 99 | } 100 | } 101 | 102 | // Matrix multiplication A @ B.T 103 | void matmul_a_bt(float *A, float *B, float *C, int m, int n, int k) { 104 | for (int i = 0; i < m; i++) { 105 | for (int j = 0; j < k; j++) { 106 | C[i * k + j] = 0.0f; 107 | for (int l = 0; l < n; l++) { 108 | C[i * k + j] += A[i * n + l] * B[j * n + l]; 109 | } 110 | } 111 | } 112 | } 113 | 114 | // Matrix multiplication A.T @ B 115 | void matmul_at_b(float *A, float *B, float *C, int m, int n, int k) { 116 | for (int i = 0; i < n; i++) { 117 | for (int j = 0; j < k; j++) { 118 | C[i * k + j] = 0.0f; 119 | for (int l = 0; l < m; l++) { 120 | C[i * k + j] += A[l * n + i] * B[l * k + j]; 121 | } 122 | } 123 | } 124 | } 125 | 126 | // ReLU forward 127 | void relu_forward(float *x, int size) { 128 | for (int i = 0; i < size; i++) { 129 | x[i] = fmaxf(0.0f, x[i]); 130 | } 131 | } 132 | 133 | // Add bias 134 | void bias_forward(float *x, float *bias, int batch_size, int size) { 135 | for (int b = 0; b < batch_size; b++) { 136 | for (int i = 0; i < size; i++) { 137 | x[b * size + i] += bias[i]; 138 | } 139 | } 140 | } 141 | 142 | // Modified forward function 143 | void forward(NeuralNetwork *nn, float *input, float *hidden, float *output, int batch_size) { 144 | // Input to Hidden (X @ W1) 145 | matmul_a_b(input, nn->weights1, hidden, batch_size, INPUT_SIZE, HIDDEN_SIZE); 146 | 147 | // Add bias1 148 | bias_forward(hidden, nn->bias1, batch_size, HIDDEN_SIZE); 149 | 150 | // Apply ReLU 151 | relu_forward(hidden, batch_size * HIDDEN_SIZE); 152 | 153 | // Hidden to Output (Hidden @ W2) 154 | matmul_a_b(hidden, nn->weights2, output, batch_size, HIDDEN_SIZE, OUTPUT_SIZE); 155 | 156 | // Add bias2 157 | bias_forward(output, nn->bias2, batch_size, OUTPUT_SIZE); 158 | 159 | // Apply softmax 160 | softmax(output, batch_size, OUTPUT_SIZE); 161 | } 162 | 163 | // Modify cross_entropy_loss to work with batches 164 | float cross_entropy_loss(float *output, int *labels, int batch_size) { 165 | float total_loss = 0.0f; 166 | for (int b = 0; b < batch_size; b++) { 167 | total_loss -= logf(fmaxf(output[b * OUTPUT_SIZE + labels[b]], 1e-7f)); 168 | } 169 | return total_loss / batch_size; 170 | } 171 | 172 | 173 | // Zero out gradients 174 | void zero_grad(float *grad, int size) { 175 | memset(grad, 0, size * sizeof(float)); 176 | } 177 | 178 | // ReLU backward 179 | void relu_backward(float *grad, float *x, int size) { 180 | for (int i = 0; i < size; i++) { 181 | grad[i] *= (x[i] > 0); 182 | } 183 | } 184 | 185 | // Bias backward 186 | void 
bias_backward(float *grad_bias, float *grad, int batch_size, int size) { 187 | for (int i = 0; i < size; i++) { 188 | grad_bias[i] = 0.0f; 189 | for (int b = 0; b < batch_size; b++) { 190 | grad_bias[i] += grad[b * size + i]; 191 | } 192 | } 193 | } 194 | 195 | // Compute gradients for output layer 196 | void compute_output_gradients(float *grad_output, float *output, int *labels, int batch_size) { 197 | for (int b = 0; b < batch_size; b++) { 198 | for (int i = 0; i < OUTPUT_SIZE; i++) { 199 | grad_output[b * OUTPUT_SIZE + i] = output[b * OUTPUT_SIZE + i]; 200 | } 201 | grad_output[b * OUTPUT_SIZE + labels[b]] -= 1.0f; 202 | } 203 | } 204 | 205 | // Update gradients for weights and biases 206 | void update_gradients(float *grad_weights, float *grad_bias, float *grad_layer, float *prev_layer, int batch_size, int prev_size, int curr_size) { 207 | for (int i = 0; i < curr_size; i++) { 208 | for (int j = 0; j < prev_size; j++) { 209 | for (int b = 0; b < batch_size; b++) { 210 | grad_weights[i * prev_size + j] += grad_layer[b * curr_size + i] * prev_layer[b * prev_size + j]; 211 | } 212 | } 213 | for (int b = 0; b < batch_size; b++) { 214 | grad_bias[i] += grad_layer[b * curr_size + i]; 215 | } 216 | } 217 | } 218 | 219 | // Backward pass function 220 | void backward(NeuralNetwork *nn, float *input, float *hidden, float *output, int *labels, int batch_size) { 221 | 222 | // Initialize gradients to zero 223 | zero_grad(nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE); 224 | zero_grad(nn->grad_weights2, OUTPUT_SIZE * HIDDEN_SIZE); 225 | zero_grad(nn->grad_bias1, HIDDEN_SIZE); 226 | zero_grad(nn->grad_bias2, OUTPUT_SIZE); 227 | 228 | // Compute gradients for output layer 229 | float *grad_output = malloc(batch_size * OUTPUT_SIZE * sizeof(float)); 230 | compute_output_gradients(grad_output, output, labels, batch_size); 231 | 232 | // Update gradients for weights2 (W2.grad = grad_output.T @ hidden) 233 | matmul_at_b(hidden, grad_output, nn->grad_weights2, batch_size, HIDDEN_SIZE, OUTPUT_SIZE); 234 | 235 | // Update gradients for bias2 236 | bias_backward(nn->grad_bias2, grad_output, batch_size, OUTPUT_SIZE); 237 | 238 | // Compute dX2 (gradient of loss w.r.t. 
input of second layer) 239 | float *dX2 = malloc(batch_size * HIDDEN_SIZE * sizeof(float)); 240 | 241 | // grad_output @ W2.T = dX2 -> (B, 10) @ (10, 256) = (B, 256) 242 | matmul_a_bt(grad_output, nn->weights2, dX2, batch_size, OUTPUT_SIZE, HIDDEN_SIZE); 243 | 244 | // Compute d_ReLU_out (element-wise multiplication with ReLU derivative) 245 | float *d_ReLU_out = malloc(batch_size * HIDDEN_SIZE * sizeof(float)); 246 | for (int i = 0; i < batch_size * HIDDEN_SIZE; i++) { 247 | d_ReLU_out[i] = dX2[i] * (hidden[i] > 0); 248 | } 249 | // retains its shape since its just a point-wise operation 250 | // Update gradients for weights1 (W1.grad = d_ReLU_out.T @ input) 251 | matmul_at_b(input, d_ReLU_out, nn->grad_weights1, batch_size, INPUT_SIZE, HIDDEN_SIZE); 252 | 253 | // Update gradients for bias1 254 | bias_backward(nn->grad_bias1, d_ReLU_out, batch_size, HIDDEN_SIZE); 255 | 256 | // Free allocated memory 257 | free(grad_output); 258 | free(dX2); 259 | free(d_ReLU_out); 260 | } 261 | 262 | // gradient descent step 263 | void update_weights(NeuralNetwork *nn) { 264 | for (int i = 0; i < HIDDEN_SIZE * INPUT_SIZE; i++) { 265 | nn->weights1[i] -= LEARNING_RATE * nn->grad_weights1[i]; 266 | } 267 | for (int i = 0; i < OUTPUT_SIZE * HIDDEN_SIZE; i++) { 268 | nn->weights2[i] -= LEARNING_RATE * nn->grad_weights2[i]; 269 | } 270 | for (int i = 0; i < HIDDEN_SIZE; i++) { 271 | nn->bias1[i] -= LEARNING_RATE * nn->grad_bias1[i]; 272 | } 273 | for (int i = 0; i < OUTPUT_SIZE; i++) { 274 | nn->bias2[i] -= LEARNING_RATE * nn->grad_bias2[i]; 275 | } 276 | } 277 | 278 | // Modify train function to work with batches 279 | void train(NeuralNetwork *nn, float *X_train, int *y_train) { 280 | float *hidden = malloc(BATCH_SIZE * HIDDEN_SIZE * sizeof(float)); 281 | float *output = malloc(BATCH_SIZE * OUTPUT_SIZE * sizeof(float)); 282 | 283 | int num_batches = TRAIN_SIZE / BATCH_SIZE; 284 | 285 | for (int epoch = 0; epoch < EPOCHS; epoch++) { 286 | float total_loss = 0.0f; 287 | int correct = 0; 288 | 289 | for (int batch = 0; batch < num_batches; batch++) { 290 | int start_idx = batch * BATCH_SIZE; 291 | 292 | forward(nn, &X_train[start_idx * INPUT_SIZE], hidden, output, BATCH_SIZE); 293 | 294 | float loss = cross_entropy_loss(output, &y_train[start_idx], BATCH_SIZE); 295 | total_loss += loss; 296 | 297 | for (int i = 0; i < BATCH_SIZE; i++) { 298 | int predicted = 0; 299 | for (int j = 1; j < OUTPUT_SIZE; j++) { 300 | if (output[i * OUTPUT_SIZE + j] > output[i * OUTPUT_SIZE + predicted]) { 301 | predicted = j; 302 | } 303 | } 304 | if (predicted == y_train[start_idx + i]) { 305 | correct++; 306 | } 307 | } 308 | 309 | backward(nn, &X_train[start_idx * INPUT_SIZE], hidden, output, &y_train[start_idx], BATCH_SIZE); 310 | update_weights(nn); 311 | 312 | if ((batch + 1) % 100 == 0 || (epoch == 0 && batch == 0)) { 313 | printf("Epoch %d/%d, Iter %d/%d, Loss: %.4f, Accuracy: %.2f%%\n", 314 | epoch + 1, EPOCHS, batch + 1, num_batches, total_loss / (batch + 1), 315 | 100.0f * correct / ((batch + 1) * BATCH_SIZE)); 316 | } 317 | } 318 | 319 | printf("Epoch %d/%d completed, Loss: %.4f, Accuracy: %.2f%%\n", 320 | epoch + 1, EPOCHS, total_loss / num_batches, 100.0f * correct / TRAIN_SIZE); 321 | } 322 | 323 | free(hidden); 324 | free(output); 325 | } 326 | 327 | // Modify the initialize function to allocate memory for gradients 328 | void initialize_neural_network(NeuralNetwork *nn) { 329 | nn->weights1 = malloc(HIDDEN_SIZE * INPUT_SIZE * sizeof(float)); 330 | nn->weights2 = malloc(OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)); 
331 | nn->bias1 = malloc(HIDDEN_SIZE * sizeof(float)); 332 | nn->bias2 = malloc(OUTPUT_SIZE * sizeof(float)); 333 | nn->grad_weights1 = malloc(HIDDEN_SIZE * INPUT_SIZE * sizeof(float)); 334 | nn->grad_weights2 = malloc(OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)); 335 | nn->grad_bias1 = malloc(HIDDEN_SIZE * sizeof(float)); 336 | nn->grad_bias2 = malloc(OUTPUT_SIZE * sizeof(float)); 337 | 338 | initialize_weights(nn->weights1, HIDDEN_SIZE * INPUT_SIZE); 339 | initialize_weights(nn->weights2, OUTPUT_SIZE * HIDDEN_SIZE); 340 | initialize_bias(nn->bias1, HIDDEN_SIZE); 341 | initialize_bias(nn->bias2, OUTPUT_SIZE); 342 | } 343 | 344 | int main() { 345 | srand(time(NULL)); 346 | 347 | NeuralNetwork nn; 348 | initialize_neural_network(&nn); 349 | 350 | float *X_train = malloc(TRAIN_SIZE * INPUT_SIZE * sizeof(float)); 351 | int *y_train = malloc(TRAIN_SIZE * sizeof(int)); 352 | float *X_test = malloc(TEST_SIZE * INPUT_SIZE * sizeof(float)); 353 | int *y_test = malloc(TEST_SIZE * sizeof(int)); 354 | 355 | load_data("../mnist_data/X_train.bin", X_train, TRAIN_SIZE * INPUT_SIZE); 356 | load_labels("../mnist_data/y_train.bin", y_train, TRAIN_SIZE); 357 | load_data("../mnist_data/X_test.bin", X_test, TEST_SIZE * INPUT_SIZE); 358 | load_labels("../mnist_data/y_test.bin", y_test, TEST_SIZE); 359 | 360 | 361 | // print first image in the terminal 362 | for (int i = 0; i < 28; i++) { 363 | for (int j = 0; j < 28; j++) { 364 | if (X_train[0 * INPUT_SIZE + i * 28 + j] > 0.0f) { 365 | printf("X"); 366 | } else { 367 | printf(" "); 368 | } 369 | } 370 | printf("\n"); 371 | } 372 | 373 | printf("First 10 training labels: "); 374 | for (int i = 0; i < 10; i++) { 375 | printf("%d ", y_train[i]); 376 | } 377 | printf("\n"); 378 | 379 | train(&nn, X_train, y_train); 380 | 381 | free(nn.weights1); 382 | free(nn.weights2); 383 | free(nn.bias1); 384 | free(nn.bias2); 385 | free(nn.grad_weights1); 386 | free(nn.grad_weights2); 387 | free(nn.grad_bias1); 388 | free(nn.grad_bias2); 389 | free(X_train); 390 | free(y_train); 391 | free(X_test); 392 | free(y_test); 393 | 394 | return 0; 395 | } 396 | -------------------------------------------------------------------------------- /python/c-friendly.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torchvision import datasets, transforms 3 | 4 | # Load and preprocess the data 5 | transform = transforms.Compose([transforms.ToTensor()]) 6 | mnist_train = datasets.MNIST(root='mnist_data', train=True, download=True, transform=transform) 7 | mnist_test = datasets.MNIST(root='mnist_data', train=False, download=True, transform=transform) 8 | 9 | X_train = mnist_train.data.numpy().reshape(-1, 1, 28, 28)[:10000] / 255.0 10 | y_train = mnist_train.targets.numpy()[:10000] 11 | X_test = mnist_test.data.numpy().reshape(-1, 1, 28, 28) / 255.0 12 | y_test = mnist_test.targets.numpy() 13 | 14 | # print the shapes of the data 15 | print(X_train.shape, y_train.shape) 16 | print(X_test.shape, y_test.shape) 17 | # Activation functions 18 | def relu(x): 19 | return np.maximum(0, x) 20 | 21 | def relu_derivative(x): 22 | return (x > 0).astype(float) 23 | 24 | # Linear layer 25 | def initialize_weights(input_size, output_size): 26 | return np.random.randn(input_size, output_size) * np.sqrt(2.0 / input_size) 27 | 28 | def initialize_bias(output_size): 29 | return np.zeros((1, output_size)) 30 | 31 | def linear_forward(x, weights, bias): 32 | return x @ weights + bias 33 | 34 | def linear_backward(grad_output, x, weights): 35 | 
grad_weights = x.T @ grad_output 36 | grad_bias = np.sum(grad_output, axis=0, keepdims=True) 37 | grad_input = grad_output @ weights.T 38 | return grad_input, grad_weights, grad_bias 39 | 40 | # Softmax and Cross-Entropy Loss 41 | def softmax(x): 42 | exp_x = np.exp(x - np.max(x, axis=1, keepdims=True)) 43 | return exp_x / np.sum(exp_x, axis=1, keepdims=True) 44 | 45 | def cross_entropy_loss(y_pred, y_true): 46 | batch_size = y_pred.shape[0] 47 | probabilities = softmax(y_pred) 48 | correct_log_probs = np.log(probabilities[np.arange(batch_size), y_true]) 49 | loss = -np.sum(correct_log_probs) / batch_size 50 | return loss 51 | 52 | class NeuralNetwork: 53 | def __init__(self, input_size, hidden_size, output_size): 54 | self.weights1 = initialize_weights(input_size, hidden_size) 55 | self.bias1 = initialize_bias(hidden_size) 56 | self.weights2 = initialize_weights(hidden_size, output_size) 57 | self.bias2 = initialize_bias(output_size) 58 | 59 | def forward(self, x): 60 | batch_size = x.shape[0] 61 | fc1_input = x.reshape(batch_size, -1) 62 | fc1_output = linear_forward(fc1_input, self.weights1, self.bias1) 63 | relu_output = relu(fc1_output) 64 | fc2_output = linear_forward(relu_output, self.weights2, self.bias2) 65 | return fc2_output, (fc1_input, fc1_output, relu_output) 66 | 67 | def backward(self, grad_output, cache): 68 | x, fc1_output, relu_output = cache 69 | 70 | grad_fc2, grad_weights2, grad_bias2 = linear_backward(grad_output, relu_output, self.weights2) 71 | grad_relu = grad_fc2 * relu_derivative(fc1_output) 72 | grad_fc1, grad_weights1, grad_bias1 = linear_backward(grad_relu, x, self.weights1) 73 | return grad_weights1, grad_bias1, grad_weights2, grad_bias2 74 | 75 | def update_weights(self, grad_weights1, grad_bias1, grad_weights2, grad_bias2, learning_rate): 76 | self.weights1 -= learning_rate * grad_weights1 77 | self.bias1 -= learning_rate * grad_bias1 78 | self.weights2 -= learning_rate * grad_weights2 79 | self.bias2 -= learning_rate * grad_bias2 80 | 81 | def train(model, X_train, y_train, X_test, y_test, batch_size, epochs, learning_rate): 82 | for epoch in range(epochs): 83 | print(f"Epoch {epoch+1}/{epochs}") 84 | for i in range(0, len(X_train), batch_size): 85 | batch_X = X_train[i:i+batch_size] 86 | batch_y = y_train[i:i+batch_size] 87 | y_pred, cache = model.forward(batch_X) 88 | loss = cross_entropy_loss(y_pred, batch_y) 89 | 90 | softmax_probs = softmax(y_pred) 91 | y_true_one_hot = np.zeros_like(y_pred) 92 | y_true_one_hot[np.arange(len(batch_y)), batch_y] = 1 93 | grad_output = softmax_probs - y_true_one_hot 94 | 95 | grad_weights1, grad_bias1, grad_weights2, grad_bias2 = model.backward(grad_output, cache) 96 | model.update_weights(grad_weights1, grad_bias1, grad_weights2, grad_bias2, learning_rate) 97 | 98 | if (i//batch_size) % 100 == 0: 99 | print(f"Iteration: {i//batch_size} Loss: {loss:.4f}") 100 | 101 | y_pred, _ = model.forward(X_test) 102 | test_loss = cross_entropy_loss(y_pred, y_test) 103 | accuracy = np.mean(np.argmax(y_pred, axis=1) == y_test) 104 | print(f"Epoch {epoch+1} - Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}") 105 | 106 | print("Training completed!") 107 | 108 | if __name__ == "__main__": 109 | input_size = 784 # 28x28 pixels 110 | hidden_size = 256 111 | output_size = 10 # 10 digits 112 | 113 | model = NeuralNetwork(input_size, hidden_size, output_size) 114 | 115 | batch_size = 4 116 | epochs = 3 117 | learning_rate = 0.001 118 | 119 | train(model, X_train, y_train, X_test, y_test, batch_size, epochs, learning_rate) 
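
The NumPy backward pass above is the same math the C version implements by hand: `grad_weights2 = relu_output.T @ grad_output`, `grad_weights1 = x.T @ grad_relu`, and each bias gradient is the column-sum of its layer's gradient. A quick way to build confidence in these hand-derived gradients is to compare them against PyTorch autograd on a tiny batch. The sketch below is illustrative only: it assumes the `NeuralNetwork` class and `softmax` helper defined above are in scope, and the batch values, hidden size, seed, and tolerance are arbitrary choices, not values from this repo.

```python
# Hedged sketch: compare the hand-written NumPy gradients above against
# PyTorch autograd on a tiny random batch.
import numpy as np
import torch

np.random.seed(0)

# Tiny model and batch; sizes are arbitrary, chosen only to keep the check fast.
net = NeuralNetwork(input_size=784, hidden_size=16, output_size=10)
x = np.random.randn(4, 784)
y = np.array([3, 1, 4, 1])

# Hand-written forward/backward from this file.
logits, cache = net.forward(x)
probs = softmax(logits)
one_hot = np.zeros_like(logits)
one_hot[np.arange(len(y)), y] = 1
gw1, gb1, gw2, gb2 = net.backward(probs - one_hot, cache)

# Same computation via autograd. train() uses the summed (not mean) loss
# gradient (probs - one_hot per row, no 1/B factor), so reduction="sum"
# makes the two sides comparable.
w1 = torch.tensor(net.weights1, requires_grad=True)
b1 = torch.tensor(net.bias1, requires_grad=True)
w2 = torch.tensor(net.weights2, requires_grad=True)
b2 = torch.tensor(net.bias2, requires_grad=True)
logits_t = torch.relu(torch.tensor(x) @ w1 + b1) @ w2 + b2
loss = torch.nn.functional.cross_entropy(logits_t, torch.tensor(y, dtype=torch.long), reduction="sum")
loss.backward()

for name, manual, auto in [("weights1", gw1, w1.grad), ("bias1", gb1, b1.grad),
                           ("weights2", gw2, w2.grad), ("bias2", gb2, b2.grad)]:
    print(name, "match:", np.allclose(manual, auto.numpy(), atol=1e-6))
```

The same check carries over to `naive-cpu/v1.c` and the CUDA kernels, since they compute identical quantities; only the tolerance needs loosening there because those paths run in float32.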
-------------------------------------------------------------------------------- /python/torch_reference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "import torch.nn as nn\n", 11 | "import torch.nn.functional as F\n", 12 | "import torch.optim as optim\n", 13 | "from torch.utils.data import DataLoader\n", 14 | "from torchvision import datasets, transforms\n", 15 | "import numpy as np\n", 16 | "import time" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "Train Data Shape: torch.Size([60000, 1, 28, 28])\n", 29 | "Train Data Type: torch.float32\n", 30 | "Test Data Shape: torch.Size([10000, 1, 28, 28])\n", 31 | "Test Data Type: torch.float32\n", 32 | "Iters per epoch: 937\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "batch_size = 64\n", 38 | "# learning_rate = 0.01\n", 39 | "num_epochs = 5\n", 40 | "data_dir = '../../../data'\n", 41 | "\n", 42 | "torch.set_float32_matmul_precision('high')\n", 43 | "\n", 44 | "# MNIST Dataset\n", 45 | "transform = transforms.Compose([\n", 46 | " transforms.ToTensor(),\n", 47 | " transforms.Normalize((0.1307,), (0.3081,)) # Mean and std of MNIST\n", 48 | "])\n", 49 | "\n", 50 | "\n", 51 | "train_dataset = datasets.MNIST(root=data_dir, train=True, transform=transform, download=True)\n", 52 | "test_dataset = datasets.MNIST(root=data_dir, train=False, transform=transform, download=True)\n", 53 | "\n", 54 | "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)\n", 55 | "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)\n", 56 | "\n", 57 | "# Pre-allocate tensors of the appropriate size\n", 58 | "train_data = torch.zeros(len(train_dataset), 1, 28, 28)\n", 59 | "train_labels = torch.zeros(len(train_dataset), dtype=torch.long)\n", 60 | "test_data = torch.zeros(len(test_dataset), 1, 28, 28)\n", 61 | "test_labels = torch.zeros(len(test_dataset), dtype=torch.long)\n", 62 | "\n", 63 | "# Load all training data into RAM\n", 64 | "for idx, (data, label) in enumerate(train_loader):\n", 65 | " start_idx = idx * batch_size\n", 66 | " end_idx = start_idx + data.size(0)\n", 67 | " train_data[start_idx:end_idx] = data\n", 68 | " train_labels[start_idx:end_idx] = label\n", 69 | "\n", 70 | "print('Train Data Shape:', train_data.shape)\n", 71 | "print('Train Data Type:', train_data.dtype)\n", 72 | "\n", 73 | "# Load all test data into RAM\n", 74 | "for idx, (data, label) in enumerate(test_loader):\n", 75 | " start_idx = idx * batch_size\n", 76 | " end_idx = start_idx + data.size(0)\n", 77 | " test_data[start_idx:end_idx] = data\n", 78 | " test_labels[start_idx:end_idx] = label\n", 79 | "\n", 80 | "print('Test Data Shape:', test_data.shape)\n", 81 | "print('Test Data Type:', test_data.dtype)\n", 82 | "\n", 83 | "iters_per_epoch = len(train_dataset) // batch_size\n", 84 | "print('Iters per epoch:', iters_per_epoch)\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 3, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "learning_rate = 1e-3\n", 94 | "batch_size = 16\n", 95 | "\n", 96 | "class MLP(nn.Module):\n", 97 | " def __init__(self, in_features, hidden_features, num_classes):\n", 98 | " super(MLP, self).__init__()\n", 99 | " self.fc1 
= nn.Linear(in_features, hidden_features)\n", 100 | " self.relu = nn.ReLU()\n", 101 | " self.fc2 = nn.Linear(hidden_features, num_classes)\n", 102 | "\n", 103 | " def forward(self, x):\n", 104 | " x = x.reshape(batch_size, 28*28)\n", 105 | " x = self.fc1(x)\n", 106 | " x = self.relu(x)\n", 107 | " x = self.fc2(x)\n", 108 | " return x\n", 109 | " \n", 110 | "model = MLP(in_features=784, hidden_features=256, num_classes=10).to('cuda')\n", 111 | "# model = torch.compile(model)\n", 112 | "criterion = nn.CrossEntropyLoss()\n", 113 | "optimizer = optim.SGD(model.parameters(), lr=learning_rate)\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 4, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Epoch: 1, Iter: 1, Loss: 2.3299460411071777\n", 126 | "Iteration Time: 85.8350 ms\n", 127 | "Epoch: 1, Iter: 100, Loss: 2.140476703643799\n", 128 | "Iteration Time: 0.4425 ms\n", 129 | "Epoch: 1, Iter: 200, Loss: 2.0235793590545654\n", 130 | "Iteration Time: 0.4423 ms\n", 131 | "Epoch: 1, Iter: 300, Loss: 1.7592310905456543\n", 132 | "Iteration Time: 0.4220 ms\n", 133 | "Epoch: 1, Iter: 400, Loss: 1.6951887607574463\n", 134 | "Iteration Time: 0.4134 ms\n", 135 | "Epoch: 1, Iter: 500, Loss: 1.3808064460754395\n", 136 | "Iteration Time: 0.4227 ms\n", 137 | "Epoch: 1, Iter: 600, Loss: 1.2386987209320068\n", 138 | "Iteration Time: 0.4241 ms\n", 139 | "Epoch: 1, Iter: 700, Loss: 1.2353482246398926\n", 140 | "Iteration Time: 0.4146 ms\n", 141 | "Epoch: 1, Iter: 800, Loss: 1.1316126585006714\n", 142 | "Iteration Time: 0.4673 ms\n", 143 | "Epoch: 1, Iter: 900, Loss: 0.9632489681243896\n", 144 | "Iteration Time: 0.4680 ms\n", 145 | "Average Batch Accuracy: 81.23%\n", 146 | "Finished Training\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "\n", 152 | "# epochs = 2\n", 153 | "# Training the model\n", 154 | "def train(model, criterion, optimizer, epoch):\n", 155 | " model.train()\n", 156 | " running_loss = 0.0\n", 157 | "\n", 158 | " for i in range(iters_per_epoch):\n", 159 | " \n", 160 | " optimizer.zero_grad()\n", 161 | " data = train_data[i*batch_size:(i+1)*batch_size].to('cuda')\n", 162 | " target = train_labels[i*batch_size:(i+1)*batch_size].to('cuda')\n", 163 | " start = time.time()\n", 164 | " outputs = model(data)\n", 165 | " loss = criterion(outputs, target)\n", 166 | " loss.backward()\n", 167 | " optimizer.step()\n", 168 | " optimizer.zero_grad()\n", 169 | " end = time.time()\n", 170 | " running_loss += loss.item()\n", 171 | " if i % 100 == 99 or i == 0:\n", 172 | " print(f'Epoch: {epoch+1}, Iter: {i+1}, Loss: {loss}')\n", 173 | " print(f'Iteration Time: {(end - start) * 1e3:.4f} ms')\n", 174 | " running_loss = 0.0\n", 175 | "\n", 176 | "# Evaluation function to report average batch accuracy using the loaded test data\n", 177 | "def evaluate(model, test_data, test_labels):\n", 178 | " device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", 179 | " model.to(device)\n", 180 | " model.eval()\n", 181 | " \n", 182 | " total_batch_accuracy = torch.tensor(0.0, device=device)\n", 183 | " num_batches = 0\n", 184 | " \n", 185 | " with torch.no_grad():\n", 186 | " for i in range(len(test_data) // batch_size):\n", 187 | " data = test_data[i * batch_size: (i + 1) * batch_size].to(device)\n", 188 | " target = test_labels[i * batch_size: (i + 1) * batch_size].to(device)\n", 189 | " outputs = model(data)\n", 190 | " _, predicted = torch.max(outputs.data, 1)\n", 191 | " correct_batch = (predicted 
== target).sum().item()\n", 192 | " total_batch = target.size(0)\n", 193 | " if total_batch != 0: # Check to avoid division by zero\n", 194 | " batch_accuracy = correct_batch / total_batch\n", 195 | " total_batch_accuracy += batch_accuracy\n", 196 | " num_batches += 1\n", 197 | " \n", 198 | " avg_batch_accuracy = total_batch_accuracy / num_batches\n", 199 | " print(f'Average Batch Accuracy: {avg_batch_accuracy * 100:.2f}%')\n", 200 | "\n", 201 | "# Main\n", 202 | "for epoch in range(1):\n", 203 | " train(model, criterion, optimizer, epoch)\n", 204 | " evaluate(model, test_data, test_labels)\n", 205 | " \n", 206 | "print('Finished Training')" 207 | ] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.11.7" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 2 231 | } 232 | -------------------------------------------------------------------------------- /python/torch_reference.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | from torch.utils.data import DataLoader 9 | from torchvision import datasets, transforms 10 | 11 | TRAIN_SIZE = 10000 12 | epochs = 3 13 | learning_rate = 1e-3 14 | batch_size = 4 15 | num_epochs = 3 16 | data_dir = "../../../data" 17 | 18 | torch.set_float32_matmul_precision("high") 19 | 20 | # MNIST Dataset 21 | transform = transforms.Compose( 22 | [ 23 | transforms.ToTensor(), 24 | transforms.Normalize((0.1307,), (0.3081,)), # Mean and std of MNIST 25 | ] 26 | ) 27 | 28 | 29 | train_dataset = datasets.MNIST( 30 | root=data_dir, train=True, transform=transform, download=True 31 | ) 32 | test_dataset = datasets.MNIST( 33 | root=data_dir, train=False, transform=transform, download=True 34 | ) 35 | 36 | train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True) 37 | test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False) 38 | 39 | # Pre-allocate tensors of the appropriate size 40 | train_data = torch.zeros(len(train_dataset), 1, 28, 28) 41 | train_labels = torch.zeros(len(train_dataset), dtype=torch.long) 42 | test_data = torch.zeros(len(test_dataset), 1, 28, 28) 43 | test_labels = torch.zeros(len(test_dataset), dtype=torch.long) 44 | 45 | # Load all training data into RAM 46 | for idx, (data, label) in enumerate(train_loader): 47 | start_idx = idx * batch_size 48 | end_idx = start_idx + data.size(0) 49 | train_data[start_idx:end_idx] = data 50 | train_labels[start_idx:end_idx] = label 51 | 52 | print("Train Data Shape:", train_data.shape) 53 | print("Train Data Type:", train_data.dtype) 54 | 55 | # Load all test data into RAM 56 | for idx, (data, label) in enumerate(test_loader): 57 | start_idx = idx * batch_size 58 | end_idx = start_idx + data.size(0) 59 | test_data[start_idx:end_idx] = data 60 | test_labels[start_idx:end_idx] = label 61 | 62 | print("Test Data Shape:", test_data.shape) 63 | print("Test Data Type:", test_data.dtype) 64 | 65 | iters_per_epoch = TRAIN_SIZE // batch_size 66 | print("Iters per epoch:", 
iters_per_epoch)
67 | 
68 | 
69 | class MLP(nn.Module):
70 |     def __init__(self, in_features, hidden_features, num_classes):
71 |         super(MLP, self).__init__()
72 |         self.fc1 = nn.Linear(in_features, hidden_features)
73 |         self.relu = nn.ReLU()
74 |         self.fc2 = nn.Linear(hidden_features, num_classes)
75 | 
76 |     def forward(self, x):
77 |         x = x.reshape(batch_size, 28 * 28)
78 |         x = self.fc1(x)
79 |         x = self.relu(x)
80 |         x = self.fc2(x)
81 |         return x
82 | 
83 | 
84 | model = MLP(in_features=784, hidden_features=256, num_classes=10).to("cuda")
85 | # model = torch.compile(model)
86 | criterion = nn.CrossEntropyLoss()
87 | optimizer = optim.SGD(model.parameters(), lr=learning_rate)
88 | 
89 | 
90 | # Training the model
91 | def train(model, criterion, optimizer, epoch):
92 |     model.train()
93 |     running_loss = 0.0
94 | 
95 |     for i in range(iters_per_epoch):
96 | 
97 |         optimizer.zero_grad()
98 |         data = train_data[i * batch_size : (i + 1) * batch_size].to("cuda")
99 |         target = train_labels[i * batch_size : (i + 1) * batch_size].to("cuda")
100 | 
101 |         start = time.time()
102 |         outputs = model(data)
103 |         loss = criterion(outputs, target)
104 |         loss.backward()
105 |         optimizer.step()
106 |         optimizer.zero_grad()
107 |         end = time.time()
108 |         running_loss += loss.item()
109 |         if i % 100 == 99 or i == 0:
110 |             print(f"Epoch: {epoch+1}, Iter: {i+1}, Loss: {loss}")
111 |             print(f"Iteration Time: {(end - start) * 1e3:.4f} ms")
112 |             running_loss = 0.0
113 | 
114 | 
115 | # Evaluation function to report average batch accuracy using the loaded test data
116 | def evaluate(model, test_data, test_labels):
117 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
118 |     model.to(device)
119 |     model.eval()
120 | 
121 |     total_batch_accuracy = torch.tensor(0.0, device=device)
122 |     num_batches = 0
123 | 
124 |     with torch.no_grad():
125 |         for i in range(len(test_data) // batch_size):
126 |             data = test_data[i * batch_size : (i + 1) * batch_size].to(device)
127 |             target = test_labels[i * batch_size : (i + 1) * batch_size].to(device)
128 |             outputs = model(data)
129 |             _, predicted = torch.max(outputs.data, 1)
130 |             correct_batch = (predicted == target).sum().item()
131 |             total_batch = target.size(0)
132 |             if total_batch != 0:  # Check to avoid division by zero
133 |                 batch_accuracy = correct_batch / total_batch
134 |                 total_batch_accuracy += batch_accuracy
135 |                 num_batches += 1
136 | 
137 |     avg_batch_accuracy = total_batch_accuracy / num_batches
138 |     print(f"Average Batch Accuracy: {avg_batch_accuracy * 100:.2f}%")
139 | 
140 | 
141 | # Main
142 | if __name__ == "__main__":
143 |     for epoch in range(epochs):
144 |         train(model, criterion, optimizer, epoch)
145 |         evaluate(model, test_data, test_labels)
146 | 
147 |     print("Finished Training")
148 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | torchvision
3 | numpy
4 | matplotlib
5 | 
--------------------------------------------------------------------------------
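
As an end-to-end sanity check of the data pipeline, the sketch below reloads the raw `.bin` files written by `downloader.py` and confirms the layout that `load_data` / `load_labels` in `naive-cpu/v1.c` (and the CUDA loaders) assume: float32 pixels flattened to 784 values per image in [0, 1], and int32 labels. The `mnist_data` directory and file names come from `downloader.py`; everything else here is an illustrative assumption, not part of the repo.

```python
# Hedged sketch: verify the binaries written by downloader.py have the layout
# the C/CUDA loaders expect. Run after `python downloader.py`.
import os

import numpy as np

save_dir = "mnist_data"

X_train = np.fromfile(os.path.join(save_dir, "X_train.bin"), dtype=np.float32)
y_train = np.fromfile(os.path.join(save_dir, "y_train.bin"), dtype=np.int32)

assert X_train.size % 784 == 0, "X_train.bin is not a whole number of 784-float images"
X_train = X_train.reshape(-1, 784)

assert X_train.shape[0] == y_train.shape[0], "image / label counts differ"
assert 0.0 <= X_train.min() and X_train.max() <= 1.0, "pixels should be normalized to [0, 1]"

print("images:", X_train.shape, X_train.dtype)
print("labels:", y_train.shape, y_train.dtype, "classes:", np.unique(y_train))
```

The same check applies to `X_test.bin` / `y_test.bin`.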