├── .DS_Store ├── .gitignore ├── .vscode └── settings.json ├── README.md ├── assets ├── architecture.excalidraw ├── architecture.png ├── mnist-mlp.excalidraw └── mnist-mlp.png ├── cuda ├── .DS_Store ├── naive-gpu │ └── 1layer.cu └── vroom │ ├── comparing │ ├── batch-compare-backward.cu │ ├── batch-compare-forward.cu │ └── batch-matmul-compare.cu │ └── v1.cu ├── downloader.py ├── naive-cpu └── v1.c ├── python ├── c-friendly.py ├── torch_reference.ipynb └── torch_reference.py └── requirements.txt /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Infatoshi/mnist-cuda/d0673daac0eb555c7d88717bb75becb5b6bfdcbb/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # watch out for those pesky .DS_Store & binary files 2 | data 3 | python/data 4 | python/venv 5 | .vscode 6 | .gitignore 7 | mnist_data 8 | dev -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.associations": { 3 | "ostream": "cpp", 4 | "chrono": "cpp", 5 | "dataloader.cuh": "c", 6 | "stdio.h": "c", 7 | "random": "cpp", 8 | "queue": "cpp", 9 | "stack": "cpp", 10 | "iostream": "cpp", 11 | "cstddef": "cpp", 12 | "array": "cpp", 13 | "atomic": "cpp", 14 | "bit": "cpp", 15 | "*.tcc": "cpp", 16 | "bitset": "cpp", 17 | "cctype": "cpp", 18 | "cinttypes": "cpp", 19 | "clocale": "cpp", 20 | "cmath": "cpp", 21 | "compare": "cpp", 22 | "complex": "cpp", 23 | "concepts": "cpp", 24 | "condition_variable": "cpp", 25 | "csignal": "cpp", 26 | "cstdarg": "cpp", 27 | "cstdint": "cpp", 28 | "cstdio": "cpp", 29 | "cstdlib": "cpp", 30 | "cstring": "cpp", 31 | "ctime": "cpp", 32 | "cwchar": "cpp", 33 | "cwctype": "cpp", 34 | "deque": "cpp", 35 | "forward_list": "cpp", 36 | "list": "cpp", 37 | "map": "cpp", 38 | "set": "cpp", 39 | "string": "cpp", 40 | "unordered_map": "cpp", 41 | "unordered_set": "cpp", 42 | "vector": "cpp", 43 | "exception": "cpp", 44 | "algorithm": "cpp", 45 | "functional": "cpp", 46 | "iterator": "cpp", 47 | "memory": "cpp", 48 | "memory_resource": "cpp", 49 | "numeric": "cpp", 50 | "optional": "cpp", 51 | "ratio": "cpp", 52 | "regex": "cpp", 53 | "string_view": "cpp", 54 | "system_error": "cpp", 55 | "tuple": "cpp", 56 | "type_traits": "cpp", 57 | "utility": "cpp", 58 | "fstream": "cpp", 59 | "future": "cpp", 60 | "initializer_list": "cpp", 61 | "iomanip": "cpp", 62 | "iosfwd": "cpp", 63 | "istream": "cpp", 64 | "limits": "cpp", 65 | "mutex": "cpp", 66 | "new": "cpp", 67 | "numbers": "cpp", 68 | "semaphore": "cpp", 69 | "shared_mutex": "cpp", 70 | "sstream": "cpp", 71 | "stdexcept": "cpp", 72 | "stop_token": "cpp", 73 | "streambuf": "cpp", 74 | "thread": "cpp", 75 | "cfenv": "cpp", 76 | "typeindex": "cpp", 77 | "typeinfo": "cpp", 78 | "valarray": "cpp", 79 | "variant": "cpp", 80 | "filesystem": "cpp", 81 | "__locale": "cpp" 82 | } 83 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MNIST in CUDA 2 | 3 | ![](assets/mnist-mlp.png) 4 | > This is instruction manual for understanding + using the mnist training run in CUDA 5 | 6 | 7 | ## Setup 8 | > DISCLAIMER: ensure you have a GPU with compute capability 5.0 or greater (at least maxwell architecture). 
See compatibility guide: https://docs.nvidia.com/deeplearning/cudnn/latest/reference/support-matrix.html 9 | ```bash 10 | git clone https://github.com/Infatoshi/mnist-cuda 11 | python3 -m venv venv 12 | source venv/bin/activate 13 | pip install -r requirements.txt 14 | ``` 15 | ## Purpose 16 | 17 | We train an MLP on the MNIST dataset. 18 | We first implement the batched training run in PyTorch, then translate it over to CUDA C/C++ using iteratively optimized GPU kernels. I purposely left out batchnorm, residual blocks, lower precision, and other optimizations to keep the code simple and easy to understand. It would also take way longer to implement and explain. 19 | 20 | 21 | ## What we need to watch out for + pay attention to: 22 | 23 | - [row vs col major](https://stackoverflow.com/questions/56043539/cublassgemm-row-major-multiplication) 24 | - [tensor cores](https://docs.nvidia.com/cuda/cublas/#tensor-core-usage) 25 | 26 | ## Accelerate the data transfer via Prefetching 27 | 28 | - [Unified vs Explicit Memory in CUDA](https://github.com/lintenn/cudaAddVectors-explicit-vs-unified-memory) 29 | - [Maximizing Unified Memory Performance](https://developer.nvidia.com/blog/maximizing-unified-memory-performance-cuda/) 30 | - Prefetching is automatically taken care of by unified memory via **streams** (this is why it has lower latency in the GitHub link above) 31 | - [CUDA streams - Lei Mao](https://leimao.github.io/blog/CUDA-Stream/) 32 | - [NVIDIA Docs](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#asynchronous-concurrent-execution) 33 | - Streams allow for overlapping data transfer (prefetching) with computation. 34 | - While one stream is executing a kernel, another stream can be transferring data for the next computation. 35 | - This technique is often called "double buffering" or "multi-buffering" when extended to more buffers (see the sketch below).
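To make the overlap concrete, here is a minimal double-buffering sketch (not taken from this repo's code): two CUDA streams ping-pong between two device buffers, so the host-to-device copy for the next batch overlaps with the kernel still processing the current one. The `process_batch` kernel, batch count, and sizes are placeholders, and error checking is omitted for brevity.

```cpp
#include <cuda_runtime.h>
#include <stdio.h>

// Stand-in for the real per-batch work (forward/backward kernels in our case).
__global__ void process_batch(float *data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;
}

int main() {
    const int batch_elems = 1 << 20;
    const int num_batches = 8;

    // Pinned host memory is required for truly asynchronous copies.
    float *h_data;
    cudaMallocHost((void **)&h_data, (size_t)num_batches * batch_elems * sizeof(float));
    for (size_t i = 0; i < (size_t)num_batches * batch_elems; i++) h_data[i] = 1.0f;

    // Two device buffers + two streams = double buffering.
    float *d_buf[2];
    cudaStream_t stream[2];
    for (int s = 0; s < 2; s++) {
        cudaMalloc(&d_buf[s], batch_elems * sizeof(float));
        cudaStreamCreate(&stream[s]);
    }

    for (int b = 0; b < num_batches; b++) {
        int s = b % 2;  // ping-pong between buffers/streams
        // The copy for batch b can run while the other stream's kernel is still executing.
        // Work within a single stream is ordered, so buffer reuse is safe without extra events.
        cudaMemcpyAsync(d_buf[s], h_data + (size_t)b * batch_elems,
                        batch_elems * sizeof(float), cudaMemcpyHostToDevice, stream[s]);
        process_batch<<<(batch_elems + 255) / 256, 256, 0, stream[s]>>>(d_buf[s], batch_elems);
    }
    cudaDeviceSynchronize();
    printf("processed %d batches\n", num_batches);

    for (int s = 0; s < 2; s++) {
        cudaStreamDestroy(stream[s]);
        cudaFree(d_buf[s]);
    }
    cudaFreeHost(h_data);
    return 0;
}
```

Because operations queued on the same stream execute in order, batch `b+2` only reuses a buffer after the kernel for batch `b` (on that same stream) has finished, which is why this simple version needs no explicit synchronization events.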
36 | 37 | ## Kernel Conversion 38 | > we will change the following functions to kernels: 39 | matmul_a_bt and matmul_at_b 40 | relu_forward and relu_backward 41 | bias_forward and bias_backward 42 | softmax 43 | compute_grad_output 44 | compute_output_gradients 45 | compute_hidden_gradients 46 | update_gradients 47 | -------------------------------------------------------------------------------- /assets/architecture.excalidraw: -------------------------------------------------------------------------------- 1 | { 2 | "type": "excalidraw", 3 | "version": 2, 4 | "source": "https://excalidraw.com", 5 | "elements": [ 6 | { 7 | "type": "rectangle", 8 | "version": 121, 9 | "versionNonce": 1332964315, 10 | "index": "a9", 11 | "isDeleted": false, 12 | "id": "Tb5EZSqLfMsjXk12HZZH2", 13 | "fillStyle": "solid", 14 | "strokeWidth": 2, 15 | "strokeStyle": "solid", 16 | "roughness": 1, 17 | "opacity": 100, 18 | "angle": 0, 19 | "x": 460, 20 | "y": 179, 21 | "strokeColor": "#1e1e1e", 22 | "backgroundColor": "transparent", 23 | "width": 310, 24 | "height": 68, 25 | "seed": 195673173, 26 | "groupIds": [], 27 | "frameId": null, 28 | "roundness": { 29 | "type": 3 30 | }, 31 | "boundElements": [ 32 | { 33 | "type": "text", 34 | "id": "226hBQzcNEGHGlDXnn7fS" 35 | }, 36 | { 37 | "id": "WOHMQMCgLty5Tquv57o3x", 38 | "type": "arrow" 39 | }, 40 | { 41 | "id": "5-IqAH6FDB3OTGQo3zcbC", 42 | "type": "arrow" 43 | } 44 | ], 45 | "updated": 1720154708284, 46 | "link": null, 47 | "locked": false 48 | }, 49 | { 50 | "type": "text", 51 | "version": 60, 52 | "versionNonce": 647234719, 53 | "index": "a9V", 54 | "isDeleted": false, 55 | "id": "226hBQzcNEGHGlDXnn7fS", 56 | "fillStyle": "solid", 57 | "strokeWidth": 2, 58 | "strokeStyle": "solid", 59 | "roughness": 1, 60 | "opacity": 100, 61 | "angle": 0, 62 | "x": 520.3000869750977, 63 | "y": 200.5, 64 | "strokeColor": "#1e1e1e", 65 | "backgroundColor": "transparent", 66 | "width": 189.3998260498047, 67 | "height": 25, 68 | "seed": 604860667, 69 | "groupIds": [], 70 | "frameId": null, 71 | "roundness": null, 72 | "boundElements": [], 73 | "updated": 1720239838246, 74 | "link": null, 75 | "locked": false, 76 | "fontSize": 20, 77 | "fontFamily": 1, 78 | "text": "X -> (B, 1, 28, 28)", 79 | "textAlign": "center", 80 | "verticalAlign": "middle", 81 | "containerId": "Tb5EZSqLfMsjXk12HZZH2", 82 | "originalText": "X -> (B, 1, 28, 28)", 83 | "autoResize": true, 84 | "lineHeight": 1.25 85 | }, 86 | { 87 | "type": "arrow", 88 | "version": 218, 89 | "versionNonce": 811579281, 90 | "index": "aC", 91 | "isDeleted": false, 92 | "id": "WOHMQMCgLty5Tquv57o3x", 93 | "fillStyle": "solid", 94 | "strokeWidth": 2, 95 | "strokeStyle": "solid", 96 | "roughness": 1, 97 | "opacity": 100, 98 | "angle": 0, 99 | "x": 606.8094064949607, 100 | "y": 248, 101 | "strokeColor": "#1e1e1e", 102 | "backgroundColor": "transparent", 103 | "width": 0.19059350503925998, 104 | "height": 49, 105 | "seed": 2010511989, 106 | "groupIds": [], 107 | "frameId": null, 108 | "roundness": { 109 | "type": 2 110 | }, 111 | "boundElements": [], 112 | "updated": 1720239960994, 113 | "link": null, 114 | "locked": false, 115 | "startBinding": { 116 | "elementId": "Tb5EZSqLfMsjXk12HZZH2", 117 | "focus": 0.053675053385165365, 118 | "gap": 1 119 | }, 120 | "endBinding": null, 121 | "lastCommittedPoint": null, 122 | "startArrowhead": null, 123 | "endArrowhead": "arrow", 124 | "points": [ 125 | [ 126 | 0, 127 | 0 128 | ], 129 | [ 130 | 0.19059350503925998, 131 | 49 132 | ] 133 | ] 134 | }, 135 | { 136 | "type": "rectangle", 137 | "version": 
355, 138 | "versionNonce": 676668063, 139 | "index": "aa", 140 | "isDeleted": false, 141 | "id": "a4bTGDbNZih7uekkgQdnF", 142 | "fillStyle": "solid", 143 | "strokeWidth": 2, 144 | "strokeStyle": "solid", 145 | "roughness": 1, 146 | "opacity": 100, 147 | "angle": 0, 148 | "x": 829, 149 | "y": 175, 150 | "strokeColor": "#1e1e1e", 151 | "backgroundColor": "transparent", 152 | "width": 276, 153 | "height": 70, 154 | "seed": 413798971, 155 | "groupIds": [], 156 | "frameId": null, 157 | "roundness": { 158 | "type": 3 159 | }, 160 | "boundElements": [ 161 | { 162 | "type": "text", 163 | "id": "NcrKQWyJrYeqex0ldZ7aJ" 164 | }, 165 | { 166 | "id": "5-IqAH6FDB3OTGQo3zcbC", 167 | "type": "arrow" 168 | }, 169 | { 170 | "id": "Gl5Sw9zFzPIVic4meB60T", 171 | "type": "arrow" 172 | } 173 | ], 174 | "updated": 1720240138338, 175 | "link": null, 176 | "locked": false 177 | }, 178 | { 179 | "type": "text", 180 | "version": 327, 181 | "versionNonce": 680082719, 182 | "index": "ab", 183 | "isDeleted": false, 184 | "id": "NcrKQWyJrYeqex0ldZ7aJ", 185 | "fillStyle": "solid", 186 | "strokeWidth": 2, 187 | "strokeStyle": "solid", 188 | "roughness": 1, 189 | "opacity": 100, 190 | "angle": 0, 191 | "x": 839.3201522827148, 192 | "y": 197.5, 193 | "strokeColor": "#1e1e1e", 194 | "backgroundColor": "transparent", 195 | "width": 255.3596954345703, 196 | "height": 25, 197 | "seed": 1200618709, 198 | "groupIds": [], 199 | "frameId": null, 200 | "roundness": null, 201 | "boundElements": [], 202 | "updated": 1720239838246, 203 | "link": null, 204 | "locked": false, 205 | "fontSize": 20, 206 | "fontFamily": 1, 207 | "text": "Dataloader gets a batch", 208 | "textAlign": "center", 209 | "verticalAlign": "middle", 210 | "containerId": "a4bTGDbNZih7uekkgQdnF", 211 | "originalText": "Dataloader gets a batch", 212 | "autoResize": true, 213 | "lineHeight": 1.25 214 | }, 215 | { 216 | "type": "arrow", 217 | "version": 617, 218 | "versionNonce": 588130933, 219 | "index": "ac", 220 | "isDeleted": false, 221 | "id": "5-IqAH6FDB3OTGQo3zcbC", 222 | "fillStyle": "solid", 223 | "strokeWidth": 2, 224 | "strokeStyle": "solid", 225 | "roughness": 1, 226 | "opacity": 100, 227 | "angle": 0, 228 | "x": 827, 229 | "y": 208.77830746985512, 230 | "strokeColor": "#1e1e1e", 231 | "backgroundColor": "transparent", 232 | "width": 52, 233 | "height": 1.864830883005169, 234 | "seed": 1412391483, 235 | "groupIds": [], 236 | "frameId": null, 237 | "roundness": { 238 | "type": 2 239 | }, 240 | "boundElements": [], 241 | "updated": 1720154761064, 242 | "link": null, 243 | "locked": false, 244 | "startBinding": { 245 | "elementId": "a4bTGDbNZih7uekkgQdnF", 246 | "focus": 0.15297822093938598, 247 | "gap": 2 248 | }, 249 | "endBinding": { 250 | "elementId": "Tb5EZSqLfMsjXk12HZZH2", 251 | "focus": 0.08547008547008547, 252 | "gap": 5 253 | }, 254 | "lastCommittedPoint": null, 255 | "startArrowhead": null, 256 | "endArrowhead": "arrow", 257 | "points": [ 258 | [ 259 | 0, 260 | 0 261 | ], 262 | [ 263 | -52, 264 | 1.864830883005169 265 | ] 266 | ] 267 | }, 268 | { 269 | "type": "arrow", 270 | "version": 198, 271 | "versionNonce": 1232196113, 272 | "index": "ad", 273 | "isDeleted": false, 274 | "id": "02JUczFgbNAnZ_XK7SONP", 275 | "fillStyle": "solid", 276 | "strokeWidth": 2, 277 | "strokeStyle": "solid", 278 | "roughness": 1, 279 | "opacity": 100, 280 | "angle": 0, 281 | "x": 775, 282 | "y": 711.1284390243902, 283 | "strokeColor": "#1e1e1e", 284 | "backgroundColor": "transparent", 285 | "width": 66, 286 | "height": 38.87156097560978, 287 | "seed": 1399797883, 288 | 
"groupIds": [], 289 | "frameId": null, 290 | "roundness": { 291 | "type": 2 292 | }, 293 | "boundElements": [], 294 | "updated": 1720240155126, 295 | "link": null, 296 | "locked": false, 297 | "startBinding": { 298 | "elementId": "FDItF61llBXUrbKwoTXXI", 299 | "focus": -0.42416759838920154, 300 | "gap": 1 301 | }, 302 | "endBinding": { 303 | "elementId": "B_wNqm25NGAtRVmirgvYP", 304 | "focus": -0.5189441487483543, 305 | "gap": 1 306 | }, 307 | "lastCommittedPoint": null, 308 | "startArrowhead": null, 309 | "endArrowhead": "arrow", 310 | "points": [ 311 | [ 312 | 0, 313 | 0 314 | ], 315 | [ 316 | 66, 317 | 38.87156097560978 318 | ] 319 | ] 320 | }, 321 | { 322 | "type": "rectangle", 323 | "version": 166, 324 | "versionNonce": 1082925713, 325 | "index": "ae", 326 | "isDeleted": false, 327 | "id": "B_wNqm25NGAtRVmirgvYP", 328 | "fillStyle": "solid", 329 | "strokeWidth": 2, 330 | "strokeStyle": "solid", 331 | "roughness": 1, 332 | "opacity": 100, 333 | "angle": 0, 334 | "x": 840, 335 | "y": 736, 336 | "strokeColor": "#1e1e1e", 337 | "backgroundColor": "transparent", 338 | "width": 227, 339 | "height": 60, 340 | "seed": 1422467323, 341 | "groupIds": [], 342 | "frameId": null, 343 | "roundness": { 344 | "type": 3 345 | }, 346 | "boundElements": [ 347 | { 348 | "type": "text", 349 | "id": "cYREyB1eDLvVCxnSeVAHd" 350 | }, 351 | { 352 | "id": "02JUczFgbNAnZ_XK7SONP", 353 | "type": "arrow" 354 | } 355 | ], 356 | "updated": 1720240153683, 357 | "link": null, 358 | "locked": false 359 | }, 360 | { 361 | "type": "text", 362 | "version": 132, 363 | "versionNonce": 1197339793, 364 | "index": "af", 365 | "isDeleted": false, 366 | "id": "cYREyB1eDLvVCxnSeVAHd", 367 | "fillStyle": "solid", 368 | "strokeWidth": 2, 369 | "strokeStyle": "solid", 370 | "roughness": 1, 371 | "opacity": 100, 372 | "angle": 0, 373 | "x": 846.3601226806641, 374 | "y": 741, 375 | "strokeColor": "#1e1e1e", 376 | "backgroundColor": "transparent", 377 | "width": 214.27975463867188, 378 | "height": 50, 379 | "seed": 2095659675, 380 | "groupIds": [], 381 | "frameId": null, 382 | "roundness": null, 383 | "boundElements": [], 384 | "updated": 1720240148754, 385 | "link": null, 386 | "locked": false, 387 | "fontSize": 20, 388 | "fontFamily": 1, 389 | "text": "CrossEntropyLoss w/ \nSoftmax", 390 | "textAlign": "center", 391 | "verticalAlign": "middle", 392 | "containerId": "B_wNqm25NGAtRVmirgvYP", 393 | "originalText": "CrossEntropyLoss w/ \nSoftmax", 394 | "autoResize": true, 395 | "lineHeight": 1.25 396 | }, 397 | { 398 | "type": "arrow", 399 | "version": 442, 400 | "versionNonce": 580566353, 401 | "index": "ag", 402 | "isDeleted": false, 403 | "id": "Gl5Sw9zFzPIVic4meB60T", 404 | "fillStyle": "solid", 405 | "strokeWidth": 2, 406 | "strokeStyle": "solid", 407 | "roughness": 1, 408 | "opacity": 100, 409 | "angle": 0, 410 | "x": 882, 411 | "y": 262, 412 | "strokeColor": "#1e1e1e", 413 | "backgroundColor": "transparent", 414 | "width": 2, 415 | "height": 369, 416 | "seed": 1570799611, 417 | "groupIds": [], 418 | "frameId": null, 419 | "roundness": { 420 | "type": 2 421 | }, 422 | "boundElements": [], 423 | "updated": 1720240164336, 424 | "link": null, 425 | "locked": false, 426 | "startBinding": { 427 | "elementId": "a4bTGDbNZih7uekkgQdnF", 428 | "focus": 0.6130569501098212, 429 | "gap": 17 430 | }, 431 | "endBinding": { 432 | "elementId": "bMXxc2iYQD9OKukWZgVhi", 433 | "focus": 0.00921406396789014, 434 | "gap": 12.002668957710789 435 | }, 436 | "lastCommittedPoint": null, 437 | "startArrowhead": null, 438 | "endArrowhead": "arrow", 439 | 
"points": [ 440 | [ 441 | 0, 442 | 0 443 | ], 444 | [ 445 | -2, 446 | 369 447 | ] 448 | ] 449 | }, 450 | { 451 | "type": "arrow", 452 | "version": 265, 453 | "versionNonce": 633629279, 454 | "index": "ai", 455 | "isDeleted": false, 456 | "id": "dyST1-PXoEsDXgjTBDOom", 457 | "fillStyle": "solid", 458 | "strokeWidth": 2, 459 | "strokeStyle": "solid", 460 | "roughness": 1, 461 | "opacity": 100, 462 | "angle": 0, 463 | "x": 1005, 464 | "y": 717, 465 | "strokeColor": "#1e1e1e", 466 | "backgroundColor": "transparent", 467 | "width": 0, 468 | "height": 393, 469 | "seed": 1136485621, 470 | "groupIds": [], 471 | "frameId": null, 472 | "roundness": { 473 | "type": 2 474 | }, 475 | "boundElements": [], 476 | "updated": 1720240157303, 477 | "link": null, 478 | "locked": false, 479 | "startBinding": null, 480 | "endBinding": { 481 | "elementId": "W-FMv5BfpNwKuBZtFDJPw", 482 | "focus": -0.012987012987012988, 483 | "gap": 8.002431575230332 484 | }, 485 | "lastCommittedPoint": null, 486 | "startArrowhead": null, 487 | "endArrowhead": "arrow", 488 | "points": [ 489 | [ 490 | 0, 491 | 0 492 | ], 493 | [ 494 | 0, 495 | -393 496 | ] 497 | ] 498 | }, 499 | { 500 | "type": "ellipse", 501 | "version": 162, 502 | "versionNonce": 1352715185, 503 | "index": "aj", 504 | "isDeleted": false, 505 | "id": "bMXxc2iYQD9OKukWZgVhi", 506 | "fillStyle": "solid", 507 | "strokeWidth": 2, 508 | "strokeStyle": "solid", 509 | "roughness": 1, 510 | "opacity": 100, 511 | "angle": 0, 512 | "x": 799, 513 | "y": 643, 514 | "strokeColor": "#1e1e1e", 515 | "backgroundColor": "transparent", 516 | "width": 160, 517 | "height": 72.99999999999997, 518 | "seed": 1540359669, 519 | "groupIds": [], 520 | "frameId": null, 521 | "roundness": { 522 | "type": 2 523 | }, 524 | "boundElements": [ 525 | { 526 | "type": "text", 527 | "id": "WCUFJus6hMfoXEtlcd0Zs" 528 | }, 529 | { 530 | "id": "Gl5Sw9zFzPIVic4meB60T", 531 | "type": "arrow" 532 | } 533 | ], 534 | "updated": 1720240164045, 535 | "link": null, 536 | "locked": false 537 | }, 538 | { 539 | "type": "text", 540 | "version": 110, 541 | "versionNonce": 107802847, 542 | "index": "ak", 543 | "isDeleted": false, 544 | "id": "WCUFJus6hMfoXEtlcd0Zs", 545 | "fillStyle": "solid", 546 | "strokeWidth": 2, 547 | "strokeStyle": "solid", 548 | "roughness": 1, 549 | "opacity": 100, 550 | "angle": 0, 551 | "x": 857.9314727638653, 552 | "y": 667.190602486691, 553 | "strokeColor": "#1e1e1e", 554 | "backgroundColor": "transparent", 555 | "width": 41.999969482421875, 556 | "height": 25, 557 | "seed": 2053893467, 558 | "groupIds": [], 559 | "frameId": null, 560 | "roundness": null, 561 | "boundElements": [], 562 | "updated": 1720240161330, 563 | "link": null, 564 | "locked": false, 565 | "fontSize": 20, 566 | "fontFamily": 1, 567 | "text": "FWD", 568 | "textAlign": "center", 569 | "verticalAlign": "middle", 570 | "containerId": "bMXxc2iYQD9OKukWZgVhi", 571 | "originalText": "FWD", 572 | "autoResize": true, 573 | "lineHeight": 1.25 574 | }, 575 | { 576 | "type": "ellipse", 577 | "version": 55, 578 | "versionNonce": 1638133215, 579 | "index": "al", 580 | "isDeleted": false, 581 | "id": "W-FMv5BfpNwKuBZtFDJPw", 582 | "fillStyle": "solid", 583 | "strokeWidth": 2, 584 | "strokeStyle": "solid", 585 | "roughness": 1, 586 | "opacity": 100, 587 | "angle": 0, 588 | "x": 927, 589 | "y": 256, 590 | "strokeColor": "#1e1e1e", 591 | "backgroundColor": "transparent", 592 | "width": 154, 593 | "height": 60, 594 | "seed": 2111931605, 595 | "groupIds": [], 596 | "frameId": null, 597 | "roundness": { 598 | "type": 2 599 | }, 600 | 
"boundElements": [ 601 | { 602 | "type": "text", 603 | "id": "djk1Lt9IuiKWjg7fDTQ7T" 604 | }, 605 | { 606 | "id": "dyST1-PXoEsDXgjTBDOom", 607 | "type": "arrow" 608 | } 609 | ], 610 | "updated": 1720240145253, 611 | "link": null, 612 | "locked": false 613 | }, 614 | { 615 | "type": "text", 616 | "version": 15, 617 | "versionNonce": 359446065, 618 | "index": "am", 619 | "isDeleted": false, 620 | "id": "djk1Lt9IuiKWjg7fDTQ7T", 621 | "fillStyle": "solid", 622 | "strokeWidth": 2, 623 | "strokeStyle": "solid", 624 | "roughness": 1, 625 | "opacity": 100, 626 | "angle": 0, 627 | "x": 975.3927970747101, 628 | "y": 273.2867965644036, 629 | "strokeColor": "#1e1e1e", 630 | "backgroundColor": "transparent", 631 | "width": 57.31996154785156, 632 | "height": 25, 633 | "seed": 1491854523, 634 | "groupIds": [], 635 | "frameId": null, 636 | "roundness": null, 637 | "boundElements": [], 638 | "updated": 1720239838246, 639 | "link": null, 640 | "locked": false, 641 | "fontSize": 20, 642 | "fontFamily": 1, 643 | "text": "BKWD", 644 | "textAlign": "center", 645 | "verticalAlign": "middle", 646 | "containerId": "W-FMv5BfpNwKuBZtFDJPw", 647 | "originalText": "BKWD", 648 | "autoResize": true, 649 | "lineHeight": 1.25 650 | }, 651 | { 652 | "id": "252rC5Ah4gyn8FABm_5NN", 653 | "type": "rectangle", 654 | "x": 462, 655 | "y": 296, 656 | "width": 311, 657 | "height": 71, 658 | "angle": 0, 659 | "strokeColor": "#1e1e1e", 660 | "backgroundColor": "transparent", 661 | "fillStyle": "solid", 662 | "strokeWidth": 2, 663 | "strokeStyle": "solid", 664 | "roughness": 1, 665 | "opacity": 100, 666 | "groupIds": [], 667 | "frameId": null, 668 | "index": "an", 669 | "roundness": { 670 | "type": 3 671 | }, 672 | "seed": 1657504177, 673 | "version": 132, 674 | "versionNonce": 1541264031, 675 | "isDeleted": false, 676 | "boundElements": [ 677 | { 678 | "type": "text", 679 | "id": "giSDh_vvh8OoE56tLSEab" 680 | }, 681 | { 682 | "id": "R0fTzItfD5zQl_LTuH2iZ", 683 | "type": "arrow" 684 | } 685 | ], 686 | "updated": 1720240007882, 687 | "link": null, 688 | "locked": false 689 | }, 690 | { 691 | "id": "giSDh_vvh8OoE56tLSEab", 692 | "type": "text", 693 | "x": 520.2600936889648, 694 | "y": 319, 695 | "width": 194.4798126220703, 696 | "height": 25, 697 | "angle": 0, 698 | "strokeColor": "#1e1e1e", 699 | "backgroundColor": "transparent", 700 | "fillStyle": "solid", 701 | "strokeWidth": 2, 702 | "strokeStyle": "solid", 703 | "roughness": 1, 704 | "opacity": 100, 705 | "groupIds": [], 706 | "frameId": null, 707 | "index": "anV", 708 | "roundness": null, 709 | "seed": 1033794257, 710 | "version": 21, 711 | "versionNonce": 1913163313, 712 | "isDeleted": false, 713 | "boundElements": null, 714 | "updated": 1720240001071, 715 | "link": null, 716 | "locked": false, 717 | "text": "Flatten -> (B, 784)", 718 | "fontSize": 20, 719 | "fontFamily": 1, 720 | "textAlign": "center", 721 | "verticalAlign": "middle", 722 | "containerId": "252rC5Ah4gyn8FABm_5NN", 723 | "originalText": "Flatten -> (B, 784)", 724 | "autoResize": true, 725 | "lineHeight": 1.25 726 | }, 727 | { 728 | "id": "R0fTzItfD5zQl_LTuH2iZ", 729 | "type": "arrow", 730 | "x": 615, 731 | "y": 367, 732 | "width": 1, 733 | "height": 44, 734 | "angle": 0, 735 | "strokeColor": "#1e1e1e", 736 | "backgroundColor": "transparent", 737 | "fillStyle": "solid", 738 | "strokeWidth": 2, 739 | "strokeStyle": "solid", 740 | "roughness": 1, 741 | "opacity": 100, 742 | "groupIds": [], 743 | "frameId": null, 744 | "index": "ap", 745 | "roundness": { 746 | "type": 2 747 | }, 748 | "seed": 2094851455, 749 | 
"version": 42, 750 | "versionNonce": 1377008255, 751 | "isDeleted": false, 752 | "boundElements": null, 753 | "updated": 1720240007882, 754 | "link": null, 755 | "locked": false, 756 | "points": [ 757 | [ 758 | 0, 759 | 0 760 | ], 761 | [ 762 | -1, 763 | 44 764 | ] 765 | ], 766 | "lastCommittedPoint": null, 767 | "startBinding": { 768 | "elementId": "252rC5Ah4gyn8FABm_5NN", 769 | "focus": 0.010832424572882589, 770 | "gap": 1 771 | }, 772 | "endBinding": null, 773 | "startArrowhead": null, 774 | "endArrowhead": "arrow" 775 | }, 776 | { 777 | "id": "8TuQ_WHfiSFXafyH497Yl", 778 | "type": "rectangle", 779 | "x": 467, 780 | "y": 416, 781 | "width": 304, 782 | "height": 74, 783 | "angle": 0, 784 | "strokeColor": "#1e1e1e", 785 | "backgroundColor": "transparent", 786 | "fillStyle": "solid", 787 | "strokeWidth": 2, 788 | "strokeStyle": "solid", 789 | "roughness": 1, 790 | "opacity": 100, 791 | "groupIds": [], 792 | "frameId": null, 793 | "index": "aq", 794 | "roundness": { 795 | "type": 3 796 | }, 797 | "seed": 1189619391, 798 | "version": 99, 799 | "versionNonce": 2031612881, 800 | "isDeleted": false, 801 | "boundElements": [ 802 | { 803 | "type": "text", 804 | "id": "vfAXY-vLTQ7k-h5kCS-hO" 805 | }, 806 | { 807 | "id": "N5t1cG6wymc0G3VV4WAdy", 808 | "type": "arrow" 809 | } 810 | ], 811 | "updated": 1720240080586, 812 | "link": null, 813 | "locked": false 814 | }, 815 | { 816 | "id": "vfAXY-vLTQ7k-h5kCS-hO", 817 | "type": "text", 818 | "x": 479.7501220703125, 819 | "y": 428, 820 | "width": 278.499755859375, 821 | "height": 50, 822 | "angle": 0, 823 | "strokeColor": "#1e1e1e", 824 | "backgroundColor": "transparent", 825 | "fillStyle": "solid", 826 | "strokeWidth": 2, 827 | "strokeStyle": "solid", 828 | "roughness": 1, 829 | "opacity": 100, 830 | "groupIds": [], 831 | "frameId": null, 832 | "index": "ar", 833 | "roundness": null, 834 | "seed": 352394897, 835 | "version": 49, 836 | "versionNonce": 1612744721, 837 | "isDeleted": false, 838 | "boundElements": null, 839 | "updated": 1720240072802, 840 | "link": null, 841 | "locked": false, 842 | "text": "Linear -> (B, 784) @ (784, \n256) = (B, 256)", 843 | "fontSize": 20, 844 | "fontFamily": 1, 845 | "textAlign": "center", 846 | "verticalAlign": "middle", 847 | "containerId": "8TuQ_WHfiSFXafyH497Yl", 848 | "originalText": "Linear -> (B, 784) @ (784, 256) = (B, 256)", 849 | "autoResize": true, 850 | "lineHeight": 1.25 851 | }, 852 | { 853 | "id": "N5t1cG6wymc0G3VV4WAdy", 854 | "type": "arrow", 855 | "x": 614, 856 | "y": 490, 857 | "width": 2, 858 | "height": 39, 859 | "angle": 0, 860 | "strokeColor": "#1e1e1e", 861 | "backgroundColor": "transparent", 862 | "fillStyle": "solid", 863 | "strokeWidth": 2, 864 | "strokeStyle": "solid", 865 | "roughness": 1, 866 | "opacity": 100, 867 | "groupIds": [], 868 | "frameId": null, 869 | "index": "as", 870 | "roundness": { 871 | "type": 2 872 | }, 873 | "seed": 972584913, 874 | "version": 33, 875 | "versionNonce": 3561969, 876 | "isDeleted": false, 877 | "boundElements": null, 878 | "updated": 1720240080586, 879 | "link": null, 880 | "locked": false, 881 | "points": [ 882 | [ 883 | 0, 884 | 0 885 | ], 886 | [ 887 | 2, 888 | 39 889 | ] 890 | ], 891 | "lastCommittedPoint": null, 892 | "startBinding": { 893 | "elementId": "8TuQ_WHfiSFXafyH497Yl", 894 | "focus": 0.04481839386871043, 895 | "gap": 1 896 | }, 897 | "endBinding": null, 898 | "startArrowhead": null, 899 | "endArrowhead": "arrow" 900 | }, 901 | { 902 | "id": "DyktbCEmygt1qtlFFoMfX", 903 | "type": "rectangle", 904 | "x": 464, 905 | "y": 527, 906 | "width": 315, 
907 | "height": 70, 908 | "angle": 0, 909 | "strokeColor": "#1e1e1e", 910 | "backgroundColor": "transparent", 911 | "fillStyle": "solid", 912 | "strokeWidth": 2, 913 | "strokeStyle": "solid", 914 | "roughness": 1, 915 | "opacity": 100, 916 | "groupIds": [], 917 | "frameId": null, 918 | "index": "at", 919 | "roundness": { 920 | "type": 3 921 | }, 922 | "seed": 2053853617, 923 | "version": 129, 924 | "versionNonce": 1848488031, 925 | "isDeleted": false, 926 | "boundElements": [ 927 | { 928 | "type": "text", 929 | "id": "6M7P3dpAANA56DOMAm4tk" 930 | }, 931 | { 932 | "id": "N9bJrjrXf9FaYkZTgVyo4", 933 | "type": "arrow" 934 | } 935 | ], 936 | "updated": 1720240105382, 937 | "link": null, 938 | "locked": false 939 | }, 940 | { 941 | "id": "6M7P3dpAANA56DOMAm4tk", 942 | "type": "text", 943 | "x": 535.090087890625, 944 | "y": 549.5, 945 | "width": 172.81982421875, 946 | "height": 25, 947 | "angle": 0, 948 | "strokeColor": "#1e1e1e", 949 | "backgroundColor": "transparent", 950 | "fillStyle": "solid", 951 | "strokeWidth": 2, 952 | "strokeStyle": "solid", 953 | "roughness": 1, 954 | "opacity": 100, 955 | "groupIds": [], 956 | "frameId": null, 957 | "index": "atV", 958 | "roundness": null, 959 | "seed": 1939366193, 960 | "version": 20, 961 | "versionNonce": 1378467505, 962 | "isDeleted": false, 963 | "boundElements": null, 964 | "updated": 1720240096881, 965 | "link": null, 966 | "locked": false, 967 | "text": "ReLU -> (B, 256)", 968 | "fontSize": 20, 969 | "fontFamily": 1, 970 | "textAlign": "center", 971 | "verticalAlign": "middle", 972 | "containerId": "DyktbCEmygt1qtlFFoMfX", 973 | "originalText": "ReLU -> (B, 256)", 974 | "autoResize": true, 975 | "lineHeight": 1.25 976 | }, 977 | { 978 | "id": "N9bJrjrXf9FaYkZTgVyo4", 979 | "type": "arrow", 980 | "x": 615, 981 | "y": 596, 982 | "width": 0, 983 | "height": 39, 984 | "angle": 0, 985 | "strokeColor": "#1e1e1e", 986 | "backgroundColor": "transparent", 987 | "fillStyle": "solid", 988 | "strokeWidth": 2, 989 | "strokeStyle": "solid", 990 | "roughness": 1, 991 | "opacity": 100, 992 | "groupIds": [], 993 | "frameId": null, 994 | "index": "av", 995 | "roundness": { 996 | "type": 2 997 | }, 998 | "seed": 1360399615, 999 | "version": 60, 1000 | "versionNonce": 1249644607, 1001 | "isDeleted": false, 1002 | "boundElements": null, 1003 | "updated": 1720240105382, 1004 | "link": null, 1005 | "locked": false, 1006 | "points": [ 1007 | [ 1008 | 0, 1009 | 0 1010 | ], 1011 | [ 1012 | 0, 1013 | 39 1014 | ] 1015 | ], 1016 | "lastCommittedPoint": null, 1017 | "startBinding": { 1018 | "elementId": "DyktbCEmygt1qtlFFoMfX", 1019 | "focus": 0.04126984126984127, 1020 | "gap": 1 1021 | }, 1022 | "endBinding": null, 1023 | "startArrowhead": null, 1024 | "endArrowhead": "arrow" 1025 | }, 1026 | { 1027 | "id": "FDItF61llBXUrbKwoTXXI", 1028 | "type": "rectangle", 1029 | "x": 462, 1030 | "y": 639, 1031 | "width": 317, 1032 | "height": 72, 1033 | "angle": 0, 1034 | "strokeColor": "#1e1e1e", 1035 | "backgroundColor": "transparent", 1036 | "fillStyle": "solid", 1037 | "strokeWidth": 2, 1038 | "strokeStyle": "solid", 1039 | "roughness": 1, 1040 | "opacity": 100, 1041 | "groupIds": [], 1042 | "frameId": null, 1043 | "index": "aw", 1044 | "roundness": { 1045 | "type": 3 1046 | }, 1047 | "seed": 1190546559, 1048 | "version": 91, 1049 | "versionNonce": 186610257, 1050 | "isDeleted": false, 1051 | "boundElements": [ 1052 | { 1053 | "type": "text", 1054 | "id": "_7Mk3eUecqZw4aJygXBMz" 1055 | }, 1056 | { 1057 | "id": "02JUczFgbNAnZ_XK7SONP", 1058 | "type": "arrow" 1059 | } 1060 | ], 1061 
| "updated": 1720240153683, 1062 | "link": null, 1063 | "locked": false 1064 | }, 1065 | { 1066 | "id": "_7Mk3eUecqZw4aJygXBMz", 1067 | "type": "text", 1068 | "x": 467.7401428222656, 1069 | "y": 650, 1070 | "width": 305.51971435546875, 1071 | "height": 50, 1072 | "angle": 0, 1073 | "strokeColor": "#1e1e1e", 1074 | "backgroundColor": "transparent", 1075 | "fillStyle": "solid", 1076 | "strokeWidth": 2, 1077 | "strokeStyle": "solid", 1078 | "roughness": 1, 1079 | "opacity": 100, 1080 | "groupIds": [], 1081 | "frameId": null, 1082 | "index": "ax", 1083 | "roundness": null, 1084 | "seed": 446522335, 1085 | "version": 43, 1086 | "versionNonce": 1837740351, 1087 | "isDeleted": false, 1088 | "boundElements": null, 1089 | "updated": 1720240129464, 1090 | "link": null, 1091 | "locked": false, 1092 | "text": "Linear -> (B, 256) @ (256, 10)\n= (B, 10)", 1093 | "fontSize": 20, 1094 | "fontFamily": 1, 1095 | "textAlign": "center", 1096 | "verticalAlign": "middle", 1097 | "containerId": "FDItF61llBXUrbKwoTXXI", 1098 | "originalText": "Linear -> (B, 256) @ (256, 10)\n= (B, 10)", 1099 | "autoResize": true, 1100 | "lineHeight": 1.25 1101 | }, 1102 | { 1103 | "id": "Wn2D_MDQlGufT2Nv2ZPaD", 1104 | "type": "text", 1105 | "x": 902, 1106 | "y": 448, 1107 | "width": 85.05990600585938, 1108 | "height": 50, 1109 | "angle": 0, 1110 | "strokeColor": "#1e1e1e", 1111 | "backgroundColor": "transparent", 1112 | "fillStyle": "solid", 1113 | "strokeWidth": 2, 1114 | "strokeStyle": "solid", 1115 | "roughness": 1, 1116 | "opacity": 100, 1117 | "groupIds": [], 1118 | "frameId": null, 1119 | "index": "ay", 1120 | "roundness": null, 1121 | "seed": 719543391, 1122 | "version": 123, 1123 | "versionNonce": 1609278399, 1124 | "isDeleted": false, 1125 | "boundElements": null, 1126 | "updated": 1720240192707, 1127 | "link": null, 1128 | "locked": false, 1129 | "text": "Gradient\nDescent", 1130 | "fontSize": 20, 1131 | "fontFamily": 1, 1132 | "textAlign": "left", 1133 | "verticalAlign": "top", 1134 | "containerId": null, 1135 | "originalText": "Gradient\nDescent", 1136 | "autoResize": true, 1137 | "lineHeight": 1.25 1138 | } 1139 | ], 1140 | "appState": { 1141 | "gridSize": null, 1142 | "viewBackgroundColor": "#ffffff" 1143 | }, 1144 | "files": {} 1145 | } -------------------------------------------------------------------------------- /assets/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Infatoshi/mnist-cuda/d0673daac0eb555c7d88717bb75becb5b6bfdcbb/assets/architecture.png -------------------------------------------------------------------------------- /assets/mnist-mlp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Infatoshi/mnist-cuda/d0673daac0eb555c7d88717bb75becb5b6bfdcbb/assets/mnist-mlp.png -------------------------------------------------------------------------------- /cuda/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Infatoshi/mnist-cuda/d0673daac0eb555c7d88717bb75becb5b6bfdcbb/cuda/.DS_Store -------------------------------------------------------------------------------- /cuda/naive-gpu/1layer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define INPUT_SIZE 784 9 | #define HIDDEN_SIZE 4096 10 | #define OUTPUT_SIZE 10 11 | #define TRAIN_SIZE 10000 12 | #define 
TEST_SIZE 1000 13 | #define BATCH_SIZE 32 14 | #define EPOCHS 20 15 | #define LEARNING_RATE 0.05 16 | 17 | typedef struct { 18 | float *weights1; 19 | float *weights2; 20 | float *bias1; 21 | float *bias2; 22 | float *grad_weights1; 23 | float *grad_weights2; 24 | float *grad_bias1; 25 | float *grad_bias2; 26 | } NeuralNetwork; 27 | 28 | // Modify the CUDA_CHECK macro to print more information 29 | #define CUDA_CHECK(call) \ 30 | do { \ 31 | cudaError_t error = call; \ 32 | if (error != cudaSuccess) { \ 33 | fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \ 34 | cudaGetErrorString(error)); \ 35 | cudaDeviceReset(); \ 36 | exit(EXIT_FAILURE); \ 37 | } \ 38 | } while(0) 39 | 40 | // load batched img data 41 | void load_data(const char *filename, float *data, int size) { 42 | FILE *file = fopen(filename, "rb"); 43 | if (file == NULL) { 44 | fprintf(stderr, "Error opening file: %s\n", filename); 45 | exit(1); 46 | } 47 | size_t read_size = fread(data, sizeof(float), size, file); 48 | if (read_size != size) { 49 | fprintf(stderr, "Error reading data: expected %d elements, got %zu\n", size, read_size); 50 | exit(1); 51 | } 52 | fclose(file); 53 | } 54 | 55 | // load batch labels 56 | void load_labels(const char *filename, int *labels, int size) { 57 | FILE *file = fopen(filename, "rb"); 58 | if (file == NULL) { 59 | fprintf(stderr, "Error opening file: %s\n", filename); 60 | exit(1); 61 | } 62 | size_t read_size = fread(labels, sizeof(int), size, file); 63 | if (read_size != size) { 64 | fprintf(stderr, "Error reading labels: expected %d elements, got %zu\n", size, read_size); 65 | exit(1); 66 | } 67 | fclose(file); 68 | } 69 | 70 | // kaiming init func for weights 71 | void initialize_weights(float *weights, int size) { 72 | float scale = sqrtf(2.0f / size); 73 | for (int i = 0; i < size; i++) { 74 | weights[i] = ((float)rand() / RAND_MAX) * scale - (scale / 2.0f); 75 | } 76 | } 77 | 78 | // basic init for biases 79 | void initialize_bias(float *bias, int size) { 80 | for (int i = 0; i < size; i++) { 81 | bias[i] = 0.0f; 82 | } 83 | } 84 | 85 | // CUDA kernel for matrix multiplication (A @ B) 86 | __global__ void matmul_a_b_kernel(float *A, float *B, float *C, int m, int n, int k) { 87 | int row = blockIdx.y * blockDim.y + threadIdx.y; 88 | int col = blockIdx.x * blockDim.x + threadIdx.x; 89 | 90 | if (row < m && col < k) { 91 | float sum = 0.0f; 92 | for (int i = 0; i < n; ++i) { 93 | sum += A[row * n + i] * B[i * k + col]; 94 | } 95 | C[row * k + col] = sum; 96 | } 97 | } 98 | 99 | // CUDA kernel for matrix multiplication (A @ B.T) 100 | __global__ void matmul_a_bt_kernel(float *A, float *B, float *C, int m, int n, int k) { 101 | int row = blockIdx.y * blockDim.y + threadIdx.y; 102 | int col = blockIdx.x * blockDim.x + threadIdx.x; 103 | 104 | if (row < m && col < k) { 105 | float sum = 0.0f; 106 | for (int i = 0; i < n; ++i) { 107 | sum += A[row * n + i] * B[col * n + i]; 108 | } 109 | C[row * k + col] = sum; 110 | } 111 | } 112 | 113 | // CUDA kernel for matrix multiplication (A.T @ B) 114 | __global__ void matmul_at_b_kernel(float *A, float *B, float *C, int m, int n, int k) { 115 | int row = blockIdx.y * blockDim.y + threadIdx.y; 116 | int col = blockIdx.x * blockDim.x + threadIdx.x; 117 | 118 | if (row < n && col < k) { 119 | float sum = 0.0f; 120 | for (int i = 0; i < m; ++i) { 121 | sum += A[i * n + row] * B[i * k + col]; 122 | } 123 | C[row * k + col] = sum; 124 | } 125 | } 126 | 127 | // CUDA kernel for ReLU activation 128 | __global__ void relu_kernel(float *x, int 
size) { 129 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 130 | if (idx < size) { 131 | x[idx] = fmaxf(0.0f, x[idx]); 132 | } 133 | } 134 | 135 | // CUDA kernel for bias addition 136 | __global__ void bias_add_kernel(float *x, float *bias, int batch_size, int size) { 137 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 138 | int b = idx / size; 139 | int i = idx % size; 140 | 141 | if (b < batch_size && i < size) { 142 | x[idx] += bias[i]; 143 | } 144 | } 145 | 146 | // CUDA kernel for softmax 147 | __global__ void softmax_kernel(float *x, int batch_size, int size) { 148 | int b = blockIdx.x; 149 | if (b < batch_size) { 150 | float max_val = x[b * size]; 151 | for (int i = 1; i < size; ++i) { 152 | max_val = fmaxf(max_val, x[b * size + i]); 153 | } 154 | 155 | float sum = 0.0f; 156 | for (int i = 0; i < size; ++i) { 157 | x[b * size + i] = expf(x[b * size + i] - max_val); 158 | sum += x[b * size + i]; 159 | } 160 | 161 | for (int i = 0; i < size; ++i) { 162 | x[b * size + i] = fmaxf(x[b * size + i] / sum, 1e-7f); 163 | } 164 | } 165 | } 166 | 167 | __global__ void clip_gradients_kernel(float *gradients, int size, float max_norm) { 168 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 169 | if (idx < size) { 170 | float grad = gradients[idx]; 171 | if (grad > max_norm) { 172 | gradients[idx] = max_norm; 173 | } else if (grad < -max_norm) { 174 | gradients[idx] = -max_norm; 175 | } 176 | } 177 | } 178 | 179 | 180 | // Modified forward function using CUDA kernels 181 | void forward(NeuralNetwork *nn, float *d_input, float *d_hidden, float *d_output, int batch_size) { 182 | // 1024 threads per block 183 | dim3 block_size(32, 32); 184 | // just enough blocks + threads for our naive matmul kernel 185 | dim3 grid_size((HIDDEN_SIZE + block_size.x - 1) / block_size.x, (batch_size + block_size.y - 1) / block_size.y); 186 | 187 | // Input to Hidden (X @ W1) 188 | matmul_a_b_kernel<<<grid_size, block_size>>>(d_input, nn->weights1, d_hidden, batch_size, INPUT_SIZE, HIDDEN_SIZE); 189 | CUDA_CHECK(cudaGetLastError()); 190 | 191 | // Add bias1 (one bias term for each neuron (multiple weights)) 192 | bias_add_kernel<<<(batch_size * HIDDEN_SIZE + 255) / 256, 256>>>(d_hidden, nn->bias1, batch_size, HIDDEN_SIZE); 193 | CUDA_CHECK(cudaGetLastError()); 194 | 195 | // Apply ReLU 196 | relu_kernel<<<(batch_size * HIDDEN_SIZE + 255) / 256, 256>>>(d_hidden, batch_size * HIDDEN_SIZE); 197 | CUDA_CHECK(cudaGetLastError()); 198 | 199 | // Hidden to Output (Hidden @ W2) 200 | grid_size.x = (OUTPUT_SIZE + block_size.x - 1) / block_size.x; 201 | grid_size.y = (batch_size + block_size.y - 1) / block_size.y; 202 | matmul_a_b_kernel<<<grid_size, block_size>>>(d_hidden, nn->weights2, d_output, batch_size, HIDDEN_SIZE, OUTPUT_SIZE); 203 | CUDA_CHECK(cudaGetLastError()); 204 | 205 | // Add bias2 (also one bias term per neuron) 206 | bias_add_kernel<<<(batch_size * OUTPUT_SIZE + 255) / 256, 256>>>(d_output, nn->bias2, batch_size, OUTPUT_SIZE); 207 | CUDA_CHECK(cudaGetLastError()); 208 | 209 | // Apply softmax 210 | softmax_kernel<<<batch_size, 1>>>(d_output, batch_size, OUTPUT_SIZE); 211 | CUDA_CHECK(cudaGetLastError()); 212 | 213 | CUDA_CHECK(cudaDeviceSynchronize()); 214 | } 215 | 216 | // Modify cross_entropy_loss to work with batches (w/out softmax because we already do this in the forward pass) 217 | float cross_entropy_loss(float *output, int *labels, int batch_size) { 218 | float total_loss = 0.0f; 219 | for (int b = 0; b < batch_size; b++) { 220 | total_loss -= logf(fmaxf(output[b * OUTPUT_SIZE + labels[b]], 1e-7f)); 221 | } 222 | return total_loss / batch_size; 223 | } 224 
| 225 | // Add this CUDA kernel to zero out gradients 226 | __global__ void zero_grad_kernel(float *grad, int size) { 227 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 228 | if (idx < size) { 229 | grad[idx] = 0.0f; 230 | } 231 | } 232 | 233 | // CUDA kernel for computing output gradients 234 | __global__ void compute_output_gradients_kernel(float *grad_output, float *output, int *labels, int batch_size) { 235 | int b = blockIdx.x * blockDim.x + threadIdx.x; 236 | if (b < batch_size) { 237 | for (int i = 0; i < OUTPUT_SIZE; ++i) { 238 | grad_output[b * OUTPUT_SIZE + i] = output[b * OUTPUT_SIZE + i]; 239 | } 240 | grad_output[b * OUTPUT_SIZE + labels[b]] -= 1.0f; 241 | } 242 | } 243 | 244 | // CUDA kernel for updating gradients 245 | __global__ void update_gradients_kernel(float *grad_weights, float *grad_bias, float *grad_layer, float *prev_layer, int batch_size, int prev_size, int curr_size) { 246 | int i = blockIdx.y; 247 | int j = blockIdx.x * blockDim.x + threadIdx.x; 248 | 249 | if (i < curr_size && j < prev_size) { 250 | float grad_w_sum = 0.0f; 251 | for (int b = 0; b < batch_size; ++b) { 252 | grad_w_sum += grad_layer[b * curr_size + i] * prev_layer[b * prev_size + j]; 253 | } 254 | atomicAdd(&grad_weights[i * prev_size + j], grad_w_sum); 255 | 256 | if (j == 0) { 257 | float grad_b_sum = 0.0f; 258 | for (int b = 0; b < batch_size; ++b) { 259 | grad_b_sum += grad_layer[b * curr_size + i]; 260 | } 261 | atomicAdd(&grad_bias[i], grad_b_sum); 262 | } 263 | } 264 | } 265 | 266 | __global__ void drelu_kernel(float *x, float *d_ReLU_out, int size) { 267 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 268 | if (idx < size) { 269 | d_ReLU_out[idx] = x[idx] > 0.0f ? 1.0f : 0.0f; 270 | } 271 | } 272 | 273 | // Element-wise multiplication of d_dX2 and d_grad_hidden 274 | __global__ void multiply_gradients_kernel(float *grad1, float *grad2, int size) { 275 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 276 | if (idx < size) { 277 | grad1[idx] *= grad2[idx]; 278 | } 279 | } 280 | 281 | // Modified backward function using CUDA kernels 282 | // shape rotating is on par with the visual example (excalidraw diagram) in the mnist-cuda git repo (also found in "assets") 283 | void backward(NeuralNetwork *nn, float *d_input, float *d_hidden, float *d_output, int *d_labels, int batch_size) { 284 | // Initialize gradients to zero using CUDA kernel 285 | 286 | zero_grad_kernel<<<(HIDDEN_SIZE * INPUT_SIZE + 256 - 1) / 256, 256>>>(nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE); 287 | CUDA_CHECK(cudaGetLastError()); 288 | 289 | zero_grad_kernel<<<(OUTPUT_SIZE * HIDDEN_SIZE + 256 - 1) / 256, 256>>>(nn->grad_weights2, OUTPUT_SIZE * HIDDEN_SIZE); 290 | CUDA_CHECK(cudaGetLastError()); 291 | 292 | zero_grad_kernel<<<(HIDDEN_SIZE + 256 - 1) / 256, 256>>>(nn->grad_bias1, HIDDEN_SIZE); 293 | CUDA_CHECK(cudaGetLastError()); 294 | 295 | zero_grad_kernel<<<(OUTPUT_SIZE + 256 - 1) / 256, 256>>>(nn->grad_bias2, OUTPUT_SIZE); 296 | CUDA_CHECK(cudaGetLastError()); 297 | 298 | // Compute gradients for output layer 299 | float *d_grad_output; 300 | CUDA_CHECK(cudaMalloc(&d_grad_output, batch_size * OUTPUT_SIZE * sizeof(float))); 301 | compute_output_gradients_kernel<<<(batch_size + 255) / 256, 256>>>(d_grad_output, d_output, d_labels, batch_size); 302 | CUDA_CHECK(cudaGetLastError()); 303 | 304 | // Update gradients for weights2 (W2.grad = grad_output.T @ hidden) 305 | dim3 block_size(32, 32); 306 | dim3 grid_size((HIDDEN_SIZE + block_size.x - 1) / block_size.x, (OUTPUT_SIZE + block_size.y - 1) / block_size.y); 307 
| matmul_at_b_kernel<<<grid_size, block_size>>>(d_hidden, d_grad_output, nn->grad_weights2, batch_size, HIDDEN_SIZE, OUTPUT_SIZE); 308 | CUDA_CHECK(cudaGetLastError()); 309 | 310 | // Update gradients for bias2 311 | update_gradients_kernel<<<grid_size, block_size>>>(nn->grad_weights2, nn->grad_bias2, d_grad_output, d_hidden, batch_size, HIDDEN_SIZE, OUTPUT_SIZE); 312 | CUDA_CHECK(cudaGetLastError()); 313 | 314 | // Compute dX2 (gradient of loss w.r.t. input of second layer) 315 | float *d_dX2; 316 | CUDA_CHECK(cudaMalloc(&d_dX2, batch_size * HIDDEN_SIZE * sizeof(float))); 317 | grid_size.x = (HIDDEN_SIZE + block_size.x - 1) / block_size.x; 318 | grid_size.y = (batch_size + block_size.y - 1) / block_size.y; 319 | matmul_a_bt_kernel<<<grid_size, block_size>>>(d_grad_output, nn->weights2, d_dX2, batch_size, OUTPUT_SIZE, HIDDEN_SIZE); 320 | CUDA_CHECK(cudaGetLastError()); 321 | 322 | // Compute d_ReLU_out (element-wise multiplication with ReLU derivative) 323 | float *d_grad_hidden; 324 | CUDA_CHECK(cudaMalloc(&d_grad_hidden, batch_size * HIDDEN_SIZE * sizeof(float))); 325 | drelu_kernel<<<(batch_size * HIDDEN_SIZE + 255) / 256, 256>>>(d_hidden, d_grad_hidden, batch_size * HIDDEN_SIZE); 326 | CUDA_CHECK(cudaGetLastError()); 327 | 328 | 329 | multiply_gradients_kernel<<<(batch_size * HIDDEN_SIZE + 255) / 256, 256>>>(d_dX2, d_grad_hidden, batch_size * HIDDEN_SIZE); 330 | CUDA_CHECK(cudaGetLastError()); 331 | 332 | // Update gradients for weights1 (W1.grad = d_ReLU_out.T @ input) 333 | grid_size.x = (INPUT_SIZE + block_size.x - 1) / block_size.x; 334 | grid_size.y = (HIDDEN_SIZE + block_size.y - 1) / block_size.y; 335 | matmul_at_b_kernel<<<grid_size, block_size>>>(d_input, d_dX2, nn->grad_weights1, batch_size, INPUT_SIZE, HIDDEN_SIZE); 336 | CUDA_CHECK(cudaGetLastError()); 337 | 338 | // Update gradients for bias1 339 | update_gradients_kernel<<<grid_size, block_size>>>(nn->grad_weights1, nn->grad_bias1, d_dX2, d_input, batch_size, INPUT_SIZE, HIDDEN_SIZE); 340 | CUDA_CHECK(cudaGetLastError()); 341 | 342 | // Free allocated memory 343 | CUDA_CHECK(cudaFree(d_grad_output)); 344 | CUDA_CHECK(cudaFree(d_dX2)); 345 | CUDA_CHECK(cudaFree(d_grad_hidden)); 346 | 347 | CUDA_CHECK(cudaDeviceSynchronize()); 348 | } 349 | 350 | // gradient descent step 351 | __global__ void update_weights_kernel(float *weights, float *grad_weights, int size) { 352 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 353 | if (idx < size) { 354 | weights[idx] -= LEARNING_RATE * grad_weights[idx]; 355 | } 356 | } 357 | 358 | void update_weights(NeuralNetwork *nn) { 359 | int block_size = 256; 360 | int grid_size; 361 | 362 | // Update weights1 363 | grid_size = (HIDDEN_SIZE * INPUT_SIZE + block_size - 1) / block_size; 364 | update_weights_kernel<<<grid_size, block_size>>>(nn->weights1, nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE); 365 | CUDA_CHECK(cudaGetLastError()); 366 | 367 | // Update weights2 368 | grid_size = (OUTPUT_SIZE * HIDDEN_SIZE + block_size - 1) / block_size; 369 | update_weights_kernel<<<grid_size, block_size>>>(nn->weights2, nn->grad_weights2, OUTPUT_SIZE * HIDDEN_SIZE); 370 | CUDA_CHECK(cudaGetLastError()); 371 | 372 | // Update bias1 373 | grid_size = (HIDDEN_SIZE + block_size - 1) / block_size; 374 | update_weights_kernel<<<grid_size, block_size>>>(nn->bias1, nn->grad_bias1, HIDDEN_SIZE); 375 | CUDA_CHECK(cudaGetLastError()); 376 | 377 | // Update bias2 378 | grid_size = (OUTPUT_SIZE + block_size - 1) / block_size; 379 | update_weights_kernel<<<grid_size, block_size>>>(nn->bias2, nn->grad_bias2, OUTPUT_SIZE); 380 | CUDA_CHECK(cudaGetLastError()); 381 | 382 | CUDA_CHECK(cudaDeviceSynchronize()); 383 | } 384 | 385 | 386 | // Modify evaluate_accuracy to handle larger datasets by processing in batches 387 | float
evaluate_accuracy(NeuralNetwork *nn, float *d_X_test, int *d_y_test, float *d_hidden, float *d_output, int total_size) { 388 | int num_batches = (total_size + BATCH_SIZE - 1) / BATCH_SIZE; 389 | int total_correct = 0; 390 | int total_processed = 0; 391 | 392 | for (int batch = 0; batch < num_batches; batch++) { 393 | int current_batch_size = (batch == num_batches - 1) ? 394 | (total_size - batch * BATCH_SIZE) : BATCH_SIZE; 395 | 396 | if (current_batch_size <= 0) break; 397 | 398 | forward(nn, &d_X_test[batch * BATCH_SIZE * INPUT_SIZE], 399 | d_hidden, d_output, current_batch_size); 400 | 401 | float *h_output = (float *)malloc(current_batch_size * OUTPUT_SIZE * sizeof(float)); 402 | int *h_y_test = (int *)malloc(current_batch_size * sizeof(int)); 403 | 404 | CUDA_CHECK(cudaMemcpy(h_output, d_output, 405 | current_batch_size * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 406 | CUDA_CHECK(cudaMemcpy(h_y_test, &d_y_test[batch * BATCH_SIZE], 407 | current_batch_size * sizeof(int), cudaMemcpyDeviceToHost)); 408 | 409 | for (int i = 0; i < current_batch_size; i++) { 410 | int predicted = 0; 411 | for (int j = 1; j < OUTPUT_SIZE; j++) { 412 | if (h_output[i * OUTPUT_SIZE + j] > h_output[i * OUTPUT_SIZE + predicted]) { 413 | predicted = j; 414 | } 415 | } 416 | if (predicted == h_y_test[i]) { 417 | total_correct++; 418 | } 419 | } 420 | 421 | total_processed += current_batch_size; 422 | free(h_output); 423 | free(h_y_test); 424 | } 425 | 426 | return 100.0f * total_correct / total_processed; 427 | } 428 | 429 | // Modify train function 430 | void train(NeuralNetwork *nn, float *X_train, int *y_train, float *X_test, int *y_test) { 431 | float *d_X_train, *d_X_test, *d_hidden, *d_output; 432 | int *d_y_train, *d_y_test; 433 | 434 | // Allocate memory for training and test data 435 | CUDA_CHECK(cudaMalloc(&d_X_train, TRAIN_SIZE * INPUT_SIZE * sizeof(float))); 436 | CUDA_CHECK(cudaMalloc(&d_X_test, TEST_SIZE * INPUT_SIZE * sizeof(float))); 437 | CUDA_CHECK(cudaMalloc(&d_hidden, BATCH_SIZE * HIDDEN_SIZE * sizeof(float))); 438 | CUDA_CHECK(cudaMalloc(&d_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); 439 | CUDA_CHECK(cudaMalloc(&d_y_train, TRAIN_SIZE * sizeof(int))); 440 | CUDA_CHECK(cudaMalloc(&d_y_test, TEST_SIZE * sizeof(int))); 441 | 442 | // Copy data to GPU 443 | CUDA_CHECK(cudaMemcpy(d_X_train, X_train, TRAIN_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice)); 444 | CUDA_CHECK(cudaMemcpy(d_X_test, X_test, TEST_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice)); 445 | CUDA_CHECK(cudaMemcpy(d_y_train, y_train, TRAIN_SIZE * sizeof(int), cudaMemcpyHostToDevice)); 446 | CUDA_CHECK(cudaMemcpy(d_y_test, y_test, TEST_SIZE * sizeof(int), cudaMemcpyHostToDevice)); 447 | 448 | int num_batches = TRAIN_SIZE / BATCH_SIZE; 449 | 450 | for (int epoch = 0; epoch < EPOCHS; epoch++) { 451 | float total_loss = 0.0f; 452 | 453 | // Zero out gradients at the beginning of each epoch 454 | zero_grad_kernel<<<(HIDDEN_SIZE * INPUT_SIZE + 256 - 1) / 256, 256>>>(nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE); 455 | zero_grad_kernel<<<(OUTPUT_SIZE * HIDDEN_SIZE + 256 - 1) / 256, 256>>>(nn->grad_weights2, OUTPUT_SIZE * HIDDEN_SIZE); 456 | zero_grad_kernel<<<(HIDDEN_SIZE + 256 - 1) / 256, 256>>>(nn->grad_bias1, HIDDEN_SIZE); 457 | zero_grad_kernel<<<(OUTPUT_SIZE + 256 - 1) / 256, 256>>>(nn->grad_bias2, OUTPUT_SIZE); 458 | CUDA_CHECK(cudaDeviceSynchronize()); 459 | 460 | for (int batch = 0; batch < num_batches; batch++) { 461 | int start_idx = batch * BATCH_SIZE; 462 | 463 | forward(nn, 
&d_X_train[start_idx * INPUT_SIZE], d_hidden, d_output, BATCH_SIZE); 464 | 465 | float *h_output = (float *)malloc(BATCH_SIZE * OUTPUT_SIZE * sizeof(float)); 466 | CUDA_CHECK(cudaMemcpy(h_output, d_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 467 | 468 | float loss = cross_entropy_loss(h_output, &y_train[start_idx], BATCH_SIZE); 469 | total_loss += loss; 470 | 471 | free(h_output); 472 | 473 | backward(nn, &d_X_train[start_idx * INPUT_SIZE], d_hidden, d_output, &d_y_train[start_idx], BATCH_SIZE); 474 | update_weights(nn); 475 | 476 | if ((batch + 1) % 100 == 0 || (epoch == 0 && batch == 0)) { 477 | // Use random batch from test set for accuracy reporting 478 | int test_start_idx = rand() % (TEST_SIZE - BATCH_SIZE); 479 | float test_accuracy = evaluate_accuracy(nn, 480 | &d_X_test[test_start_idx * INPUT_SIZE], 481 | &d_y_test[test_start_idx], 482 | d_hidden, d_output, BATCH_SIZE); 483 | 484 | printf("Epoch %d/%d, Iter %d/%d, Loss: %.4f, Test Accuracy: %.2f%%\n", 485 | epoch + 1, EPOCHS, batch + 1, num_batches, 486 | total_loss / (batch + 1), test_accuracy); 487 | } 488 | } 489 | 490 | // Evaluate on entire test set at end of epoch 491 | float test_accuracy = evaluate_accuracy(nn, d_X_test, d_y_test, d_hidden, d_output, TEST_SIZE); 492 | printf("Epoch %d/%d completed, Loss: %.4f, Test Accuracy: %.2f%%\n", 493 | epoch + 1, EPOCHS, total_loss / num_batches, test_accuracy); 494 | } 495 | 496 | // Free GPU memory 497 | CUDA_CHECK(cudaFree(d_X_train)); 498 | CUDA_CHECK(cudaFree(d_X_test)); 499 | CUDA_CHECK(cudaFree(d_hidden)); 500 | CUDA_CHECK(cudaFree(d_output)); 501 | CUDA_CHECK(cudaFree(d_y_train)); 502 | CUDA_CHECK(cudaFree(d_y_test)); 503 | } 504 | 505 | // Modified initialize function to allocate memory for gradients 506 | void initialize_neural_network(NeuralNetwork *nn) { 507 | CUDA_CHECK(cudaMalloc(&nn->weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float))); 508 | CUDA_CHECK(cudaMalloc(&nn->weights2, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float))); 509 | CUDA_CHECK(cudaMalloc(&nn->bias1, HIDDEN_SIZE * sizeof(float))); 510 | CUDA_CHECK(cudaMalloc(&nn->bias2, OUTPUT_SIZE * sizeof(float))); 511 | CUDA_CHECK(cudaMalloc(&nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float))); 512 | CUDA_CHECK(cudaMalloc(&nn->grad_weights2, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float))); 513 | CUDA_CHECK(cudaMalloc(&nn->grad_bias1, HIDDEN_SIZE * sizeof(float))); 514 | CUDA_CHECK(cudaMalloc(&nn->grad_bias2, OUTPUT_SIZE * sizeof(float))); 515 | 516 | // Allocate temporary host memory 517 | float *h_weights1 = (float *)malloc(HIDDEN_SIZE * INPUT_SIZE * sizeof(float)); 518 | float *h_weights2 = (float *)malloc(OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)); 519 | float *h_bias1 = (float *)malloc(HIDDEN_SIZE * sizeof(float)); 520 | float *h_bias2 = (float *)malloc(OUTPUT_SIZE * sizeof(float)); 521 | 522 | // Initialize weights and biases on the host 523 | initialize_weights(h_weights1, HIDDEN_SIZE * INPUT_SIZE); 524 | initialize_weights(h_weights2, OUTPUT_SIZE * HIDDEN_SIZE); 525 | initialize_bias(h_bias1, HIDDEN_SIZE); 526 | initialize_bias(h_bias2, OUTPUT_SIZE); 527 | 528 | // Copy initialized values to device 529 | CUDA_CHECK(cudaMemcpy(nn->weights1, h_weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice)); 530 | CUDA_CHECK(cudaMemcpy(nn->weights2, h_weights2, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float), cudaMemcpyHostToDevice)); 531 | CUDA_CHECK(cudaMemcpy(nn->bias1, h_bias1, HIDDEN_SIZE * sizeof(float), cudaMemcpyHostToDevice)); 532 | 
CUDA_CHECK(cudaMemcpy(nn->bias2, h_bias2, OUTPUT_SIZE * sizeof(float), cudaMemcpyHostToDevice)); 533 | 534 | // Free temporary host memory 535 | free(h_weights1); 536 | free(h_weights2); 537 | free(h_bias1); 538 | free(h_bias2); 539 | } 540 | 541 | int main() { 542 | srand(time(NULL)); 543 | 544 | NeuralNetwork nn; 545 | initialize_neural_network(&nn); 546 | 547 | float *X_train = (float *)malloc(TRAIN_SIZE * INPUT_SIZE * sizeof(float)); 548 | int *y_train = (int *)malloc(TRAIN_SIZE * sizeof(int)); 549 | float *X_test = (float *)malloc(TEST_SIZE * INPUT_SIZE * sizeof(float)); 550 | int *y_test = (int *)malloc(TEST_SIZE * sizeof(int)); 551 | 552 | load_data("../../mnist_data/X_train.bin", X_train, TRAIN_SIZE * INPUT_SIZE); 553 | load_labels("../../mnist_data/y_train.bin", y_train, TRAIN_SIZE); 554 | load_data("../../mnist_data/X_test.bin", X_test, TEST_SIZE * INPUT_SIZE); 555 | load_labels("../../mnist_data/y_test.bin", y_test, TEST_SIZE); 556 | 557 | // print first image in the terminal 558 | for (int i = 0; i < 28; i++) { 559 | for (int j = 0; j < 28; j++) { 560 | if (X_train[0 * INPUT_SIZE + i * 28 + j] > 0.0f) { 561 | printf("X"); 562 | } else { 563 | printf(" "); 564 | } 565 | } 566 | printf("\n"); 567 | } 568 | 569 | printf("First 10 training labels: "); 570 | for (int i = 0; i < 10; i++) { 571 | printf("%d ", y_train[i]); 572 | } 573 | printf("\n"); 574 | 575 | // Start timing 576 | struct timespec start, end; 577 | clock_gettime(CLOCK_MONOTONIC, &start); 578 | 579 | train(&nn, X_train, y_train, X_test, y_test); 580 | 581 | // End timing 582 | clock_gettime(CLOCK_MONOTONIC, &end); 583 | 584 | // Calculate duration in seconds with milliseconds 585 | double training_time = (end.tv_sec - start.tv_sec) + 586 | (end.tv_nsec - start.tv_nsec) / 1e9; 587 | 588 | printf("\nTotal training time: %.2f sec\n", training_time); 589 | 590 | CUDA_CHECK(cudaFree(nn.weights1)); 591 | CUDA_CHECK(cudaFree(nn.weights2)); 592 | CUDA_CHECK(cudaFree(nn.bias1)); 593 | CUDA_CHECK(cudaFree(nn.bias2)); 594 | CUDA_CHECK(cudaFree(nn.grad_weights1)); 595 | CUDA_CHECK(cudaFree(nn.grad_weights2)); 596 | CUDA_CHECK(cudaFree(nn.grad_bias1)); 597 | CUDA_CHECK(cudaFree(nn.grad_bias2)); 598 | free(X_train); 599 | free(y_train); 600 | free(X_test); 601 | free(y_test); 602 | 603 | cudaError_t err = cudaGetLastError(); 604 | if (err != cudaSuccess) { 605 | fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err)); 606 | return 1; 607 | } 608 | 609 | return 0; 610 | } 611 | -------------------------------------------------------------------------------- /cuda/vroom/comparing/batch-compare-backward.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define CHECK_CUDA(call) { \ 9 | cudaError_t err = call; \ 10 | if (err != cudaSuccess) { \ 11 | fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ 12 | exit(EXIT_FAILURE); \ 13 | } \ 14 | } 15 | 16 | #define CHECK_CUBLAS(call) { \ 17 | cublasStatus_t status = call; \ 18 | if (status != CUBLAS_STATUS_SUCCESS) { \ 19 | fprintf(stderr, "cuBLAS error in %s:%d\n", __FILE__, __LINE__); \ 20 | exit(EXIT_FAILURE); \ 21 | } \ 22 | } 23 | 24 | __device__ float relu_derivative(float x) { 25 | return x > 0 ? 
1.0f : 0.0f; 26 | } 27 | 28 | __global__ void compute_output_gradient(float *output, int *labels, float *grad_output, int output_size, int batch_size) { 29 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 30 | int batch_idx = blockIdx.y; 31 | 32 | if (idx < output_size && batch_idx < batch_size) { 33 | int index = batch_idx * output_size + idx; 34 | grad_output[index] = output[index]; 35 | if (idx == labels[batch_idx]) { 36 | grad_output[index] -= 1.0f; 37 | } 38 | } 39 | } 40 | 41 | __global__ void compute_hidden_gradient(float *hidden, float *weights2, float *grad_output, float *grad_hidden, 42 | int hidden_size, int output_size, int batch_size) { 43 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 44 | int batch_idx = blockIdx.y; 45 | 46 | if (idx < hidden_size && batch_idx < batch_size) { 47 | float sum = 0.0f; 48 | for (int i = 0; i < output_size; i++) { 49 | sum += grad_output[batch_idx * output_size + i] * weights2[i * hidden_size + idx]; 50 | } 51 | int hidden_index = batch_idx * hidden_size + idx; 52 | grad_hidden[hidden_index] = sum * relu_derivative(hidden[hidden_index]); 53 | } 54 | } 55 | 56 | __global__ void compute_weight_gradients(float *input, float *grad_hidden, float *grad_weights1, 57 | int input_size, int hidden_size, int batch_size) { 58 | int hidden_idx = blockIdx.x * blockDim.x + threadIdx.x; 59 | int input_idx = blockIdx.y * blockDim.y + threadIdx.y; 60 | 61 | if (hidden_idx < hidden_size && input_idx < input_size) { 62 | float sum = 0.0f; 63 | for (int b = 0; b < batch_size; b++) { 64 | sum += grad_hidden[b * hidden_size + hidden_idx] * input[b * input_size + input_idx]; 65 | } 66 | grad_weights1[hidden_idx * input_size + input_idx] = sum; 67 | } 68 | } 69 | 70 | __global__ void compute_bias_gradients(float *grad_hidden, float *grad_bias1, int hidden_size, int batch_size) { 71 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 72 | 73 | if (idx < hidden_size) { 74 | float sum = 0.0f; 75 | for (int b = 0; b < batch_size; b++) { 76 | sum += grad_hidden[b * hidden_size + idx]; 77 | } 78 | grad_bias1[idx] = sum; 79 | } 80 | } 81 | 82 | __device__ float relu(float x) { 83 | return fmaxf(x, 0.0f); 84 | } 85 | 86 | __global__ void add_bias_and_relu(float *data, float *bias, int size, int batch_size) { 87 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 88 | int batch_idx = blockIdx.y; 89 | 90 | if (idx < size && batch_idx < batch_size) { 91 | int index = batch_idx * size + idx; 92 | data[index] = relu(data[index] + bias[idx]); 93 | } 94 | } 95 | 96 | __global__ void add_bias(float *data, float *bias, int size, int batch_size) { 97 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 98 | int batch_idx = blockIdx.y; 99 | 100 | if (idx < size && batch_idx < batch_size) { 101 | int index = batch_idx * size + idx; 102 | data[index] += bias[idx]; 103 | } 104 | } 105 | 106 | 107 | __global__ void matmul_forward_naive(float *A, float *B, float *C, int M, int N, int K) { 108 | int row = blockIdx.y * blockDim.y + threadIdx.y; 109 | int col = blockIdx.x * blockDim.x + threadIdx.x; 110 | 111 | if (row < M && col < N) { 112 | float sum = 0.0f; 113 | for (int i = 0; i < K; i++) { 114 | sum += A[row * K + i] * B[i * N + col]; 115 | } 116 | C[row * N + col] = sum; 117 | } 118 | } 119 | 120 | __global__ void add_bias_naive(float *input, float *bias, int rows, int cols) { 121 | int row = blockIdx.y * blockDim.y + threadIdx.y; 122 | int col = blockIdx.x * blockDim.x + threadIdx.x; 123 | 124 | if (row < rows && col < cols) { 125 | input[row * cols + col] += bias[col]; 126 | } 127 | } 
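// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of the original file): the
// kernels above implement the usual softmax-cross-entropy backward pass. The
// formula grad_output = probs - one_hot(label) assumes `output` holds softmax
// probabilities; in this comparison harness the same subtraction is applied to
// the raw logits, which is fine for checking that the naive and cuBLAS paths
// agree. The remaining steps are
//   grad_hidden[b][h] = (sum_o grad_output[b][o] * W2[o][h]) * relu'(hidden[b][h])
//   grad_W1[h][i]     = sum_b grad_hidden[b][h] * input[b][i]
//   grad_b1[h]        = sum_b grad_hidden[b][h]
// The hypothetical helper below (a name introduced here, not in the original
// code) mirrors compute_weight_gradients on the CPU and can be used to
// spot-check the kernel's result for small batches.
static void cpu_grad_weights1_reference(const float *input, const float *grad_hidden,
                                        float *grad_weights1,
                                        int input_size, int hidden_size, int batch_size) {
    for (int h = 0; h < hidden_size; h++) {
        for (int i = 0; i < input_size; i++) {
            float sum = 0.0f;                          // reduce over the batch
            for (int b = 0; b < batch_size; b++) {
                sum += grad_hidden[b * hidden_size + h] * input[b * input_size + i];
            }
            grad_weights1[h * input_size + i] = sum;   // row-major [hidden x input]
        }
    }
}
// ---------------------------------------------------------------------------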
128 | 129 | __global__ void apply_relu_naive(float *input, int size) { 130 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 131 | 132 | if (idx < size) { 133 | input[idx] = relu(input[idx]); 134 | } 135 | } 136 | 137 | 138 | void forward_pass_naive(float *input, float *weights1, float *bias1, float *hidden, 139 | float *weights2, float *bias2, float *output, 140 | int input_size, int hidden_size, int output_size, int batch_size) { 141 | // Define grid and block dimensions 142 | dim3 block_dim(32, 32); 143 | dim3 grid_dim_1((hidden_size + block_dim.x - 1) / block_dim.x, (batch_size + block_dim.y - 1) / block_dim.y); 144 | dim3 grid_dim_2((output_size + block_dim.x - 1) / block_dim.x, (batch_size + block_dim.y - 1) / block_dim.y); 145 | 146 | 147 | // // print inputs 148 | // float *h_input = (float*)malloc(batch_size * input_size * sizeof(float)); 149 | // CHECK_CUDA(cudaMemcpy(h_input, input, batch_size * input_size * sizeof(float), cudaMemcpyDeviceToHost)); 150 | // std::cout << "input to naive: " << std::endl; 151 | // for (int i = 0; i < batch_size * input_size; i++) { 152 | // printf("%f ", h_input[i]); 153 | // if ((i+1) % input_size == 0) { 154 | // printf("\n"); 155 | // } 156 | // } 157 | // // copy back to device 158 | // CHECK_CUDA(cudaMemcpy(input, h_input, batch_size * input_size * sizeof(float), cudaMemcpyHostToDevice)); 159 | 160 | // First layer: input to hidden 161 | matmul_forward_naive<<<grid_dim_1, block_dim>>>(input, weights1, hidden, batch_size, hidden_size, input_size); 162 | 163 | // print "hidden" values 164 | // copy hidden to host 165 | // float *h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 166 | // CHECK_CUDA(cudaMemcpy(h_hidden, hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyDeviceToHost)); 167 | // std::cout << "naive hidden values (no bias): " << std::endl; 168 | // for (int i = 0; i < batch_size * hidden_size; i++) { 169 | // printf("%f ", h_hidden[i]); 170 | // if ((i+1) % hidden_size == 0) { 171 | // printf("\n"); 172 | // } 173 | // } 174 | // // copy back to device 175 | // CHECK_CUDA(cudaMemcpy(hidden, h_hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyHostToDevice)); 176 | 177 | 178 | add_bias_naive<<<grid_dim_1, block_dim>>>(hidden, bias1, batch_size, hidden_size); 179 | apply_relu_naive<<<(batch_size * hidden_size + 255) / 256, 256>>>(hidden, batch_size * hidden_size); 180 | 181 | // Second layer: hidden to output 182 | matmul_forward_naive<<<grid_dim_2, block_dim>>>(hidden, weights2, output, batch_size, output_size, hidden_size); 183 | add_bias_naive<<<grid_dim_2, block_dim>>>(output, bias2, batch_size, output_size); 184 | } 185 | 186 | void backward_pass_naive(float *input, float *hidden, float *output, int *labels, 187 | float *weights1, float *weights2, 188 | float *grad_weights1, float *grad_weights2, 189 | float *grad_bias1, float *grad_bias2, 190 | int input_size, int hidden_size, int output_size, int batch_size) { 191 | 192 | float *d_grad_output, *d_grad_hidden; 193 | CHECK_CUDA(cudaMalloc(&d_grad_output, batch_size * output_size * sizeof(float))); 194 | CHECK_CUDA(cudaMalloc(&d_grad_hidden, batch_size * hidden_size * sizeof(float))); 195 | 196 | dim3 block(256); 197 | dim3 grid_output((output_size + block.x - 1) / block.x, batch_size); 198 | dim3 grid_hidden((hidden_size + block.x - 1) / block.x, batch_size); 199 | 200 | compute_output_gradient<<<grid_output, block>>>(output, labels, d_grad_output, output_size, batch_size); 201 | compute_hidden_gradient<<<grid_hidden, block>>>(hidden, weights2, d_grad_output, d_grad_hidden, hidden_size, output_size, batch_size); 202 | 203 | dim3 block_weights(16, 16);
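// Editor's note (added comment): compute_weight_gradients is launched below over a 2D grid that covers the full (hidden_size x input_size) weight matrix; each thread owns one weight, loops over the batch, and writes its summed gradient directly, so no atomics are needed on this path.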
204 | dim3 grid_weights((hidden_size + block_weights.x - 1) / block_weights.x, 205 | (input_size + block_weights.y - 1) / block_weights.y); 206 | compute_weight_gradients<<<grid_weights, block_weights>>>(input, d_grad_hidden, grad_weights1, input_size, hidden_size, batch_size); 207 | 208 | compute_bias_gradients<<<(hidden_size + 255) / 256, 256>>>(d_grad_hidden, grad_bias1, hidden_size, batch_size); 209 | 210 | // For grad_weights2 and grad_bias2, we can reuse the existing kernels with different dimensions 211 | dim3 grid_weights2((output_size + block_weights.x - 1) / block_weights.x, 212 | (hidden_size + block_weights.y - 1) / block_weights.y); 213 | compute_weight_gradients<<<grid_weights2, block_weights>>>(hidden, d_grad_output, grad_weights2, hidden_size, output_size, batch_size); 214 | 215 | compute_bias_gradients<<<(output_size + 255) / 256, 256>>>(d_grad_output, grad_bias2, output_size, batch_size); 216 | 217 | CHECK_CUDA(cudaFree(d_grad_output)); 218 | CHECK_CUDA(cudaFree(d_grad_hidden)); 219 | } 220 | 221 | void backward_pass_cublas(cublasHandle_t handle, 222 | float *input, float *hidden, float *output, int *labels, 223 | float *weights1, float *weights2, 224 | float *grad_weights1, float *grad_weights2, 225 | float *grad_bias1, float *grad_bias2, 226 | int input_size, int hidden_size, int output_size, int batch_size) { 227 | float *d_grad_output, *d_grad_hidden; 228 | CHECK_CUDA(cudaMalloc(&d_grad_output, batch_size * output_size * sizeof(float))); 229 | CHECK_CUDA(cudaMalloc(&d_grad_hidden, batch_size * hidden_size * sizeof(float))); 230 | 231 | dim3 block(256); 232 | dim3 grid_output((output_size + block.x - 1) / block.x, batch_size); 233 | dim3 grid_hidden((hidden_size + block.x - 1) / block.x, batch_size); 234 | 235 | compute_output_gradient<<<grid_output, block>>>(output, labels, d_grad_output, output_size, batch_size); 236 | 237 | // Compute grad_hidden using cuBLAS 238 | float alpha = 1.0f, beta = 0.0f; 239 | CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, 240 | hidden_size, batch_size, output_size, 241 | &alpha, weights2, output_size, 242 | d_grad_output, output_size, 243 | &beta, d_grad_hidden, hidden_size)); 244 | 245 | // Apply ReLU derivative 246 | dim3 block_relu(256); 247 | dim3 grid_relu((batch_size * hidden_size + block_relu.x - 1) / block_relu.x); 248 | compute_hidden_gradient<<<grid_hidden, block>>>(hidden, weights2, d_grad_output, d_grad_hidden, hidden_size, output_size, batch_size); 249 | 250 | // Compute grad_weights1 using cuBLAS 251 | CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 252 | input_size, hidden_size, batch_size, 253 | &alpha, input, input_size, 254 | d_grad_hidden, hidden_size, 255 | &beta, grad_weights1, input_size)); 256 | 257 | // Compute grad_bias1 258 | compute_bias_gradients<<<(hidden_size + 255) / 256, 256>>>(d_grad_hidden, grad_bias1, hidden_size, batch_size); 259 | 260 | // Compute grad_weights2 using cuBLAS 261 | CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 262 | hidden_size, output_size, batch_size, 263 | &alpha, hidden, hidden_size, 264 | d_grad_output, output_size, 265 | &beta, grad_weights2, hidden_size)); 266 | 267 | // Compute grad_bias2 268 | compute_bias_gradients<<<(output_size + 255) / 256, 256>>>(d_grad_output, grad_bias2, output_size, batch_size); 269 | 270 | CHECK_CUDA(cudaFree(d_grad_output)); 271 | CHECK_CUDA(cudaFree(d_grad_hidden)); 272 | } 273 | 274 | void compare_results(float *output1, float *output2, int size) { 275 | float eps = 1e-5f; 276 | int max_diff_idx = 0; 277 | for (int i = 0; i < size; i++) { 278 | float diff = fabsf(output1[i] - output2[i]); 279 | if (diff > eps) { 280 |
printf("Results differ at index %d: %f vs %f\n", i, output1[i], output2[i]); 281 | max_diff_idx = i; 282 | break; 283 | } 284 | } 285 | } 286 | 287 | 288 | int main() { 289 | const int batch_size = 2; 290 | const int input_size = 4; 291 | const int hidden_size = 4; 292 | const int output_size = 1; 293 | 294 | size_t input_bytes = batch_size * input_size * sizeof(float); 295 | size_t hidden_bytes = batch_size * hidden_size * sizeof(float); 296 | size_t output_bytes = batch_size * output_size * sizeof(float); 297 | size_t weights1_bytes = input_size * hidden_size * sizeof(float); 298 | size_t weights2_bytes = hidden_size * output_size * sizeof(float); 299 | size_t bias1_bytes = hidden_size * sizeof(float); 300 | size_t bias2_bytes = output_size * sizeof(float); 301 | 302 | float *d_input, *d_hidden, *d_output, *d_output_cublas; 303 | float *d_weights1, *d_weights2, *d_bias1, *d_bias2; 304 | float *d_grad_weights1, *d_grad_weights2, *d_grad_bias1, *d_grad_bias2; 305 | float *d_grad_weights1_cublas, *d_grad_weights2_cublas, *d_grad_bias1_cublas, *d_grad_bias2_cublas; 306 | int *d_labels; 307 | 308 | // Allocate memory 309 | CHECK_CUDA(cudaMalloc(&d_input, input_bytes)); 310 | CHECK_CUDA(cudaMalloc(&d_hidden, hidden_bytes)); 311 | CHECK_CUDA(cudaMalloc(&d_output, output_bytes)); 312 | CHECK_CUDA(cudaMalloc(&d_output_cublas, output_bytes)); 313 | CHECK_CUDA(cudaMalloc(&d_weights1, weights1_bytes)); 314 | CHECK_CUDA(cudaMalloc(&d_weights2, weights2_bytes)); 315 | CHECK_CUDA(cudaMalloc(&d_bias1, bias1_bytes)); 316 | CHECK_CUDA(cudaMalloc(&d_bias2, bias2_bytes)); 317 | CHECK_CUDA(cudaMalloc(&d_grad_weights1, weights1_bytes)); 318 | CHECK_CUDA(cudaMalloc(&d_grad_weights2, weights2_bytes)); 319 | CHECK_CUDA(cudaMalloc(&d_grad_bias1, bias1_bytes)); 320 | CHECK_CUDA(cudaMalloc(&d_grad_bias2, bias2_bytes)); 321 | CHECK_CUDA(cudaMalloc(&d_grad_weights1_cublas, weights1_bytes)); 322 | CHECK_CUDA(cudaMalloc(&d_grad_weights2_cublas, weights2_bytes)); 323 | CHECK_CUDA(cudaMalloc(&d_grad_bias1_cublas, bias1_bytes)); 324 | CHECK_CUDA(cudaMalloc(&d_grad_bias2_cublas, bias2_bytes)); 325 | CHECK_CUDA(cudaMalloc(&d_labels, batch_size * sizeof(int))); 326 | 327 | // Initialize data 328 | float h_input[batch_size * input_size] = {1.0f, 2.0f, 3.0f, 4.0f, 329 | 2.0f, 4.0f, 6.0f, 8.0f}; 330 | float h_weights1[input_size * hidden_size] = {1.0f, 2.0f, 3.0f, 4.0f, 331 | 2.0f, 4.0f, 6.0f, 8.0f, 332 | 3.0f, 6.0f, 9.0f, 12.0f, 333 | 4.0f, 8.0f, 12.0f, 16.0f}; 334 | float h_bias1[hidden_size] = {1.0f, 2.0f, 3.0f, 4.0f}; 335 | float h_weights2[hidden_size * output_size] = {1.0f, 2.0f, 3.0f, 4.0f}; 336 | float h_bias2[output_size] = {1.0f}; 337 | int h_labels[batch_size] = {0, 0}; // Assuming binary classification 338 | 339 | // Copy data to device 340 | CHECK_CUDA(cudaMemcpy(d_input, h_input, input_bytes, cudaMemcpyHostToDevice)); 341 | CHECK_CUDA(cudaMemcpy(d_weights1, h_weights1, weights1_bytes, cudaMemcpyHostToDevice)); 342 | CHECK_CUDA(cudaMemcpy(d_bias1, h_bias1, bias1_bytes, cudaMemcpyHostToDevice)); 343 | CHECK_CUDA(cudaMemcpy(d_weights2, h_weights2, weights2_bytes, cudaMemcpyHostToDevice)); 344 | CHECK_CUDA(cudaMemcpy(d_bias2, h_bias2, bias2_bytes, cudaMemcpyHostToDevice)); 345 | CHECK_CUDA(cudaMemcpy(d_labels, h_labels, batch_size * sizeof(int), cudaMemcpyHostToDevice)); 346 | 347 | // Forward pass (naive) 348 | forward_pass_naive(d_input, d_weights1, d_bias1, d_hidden, 349 | d_weights2, d_bias2, d_output, 350 | input_size, hidden_size, output_size, batch_size); 351 | 352 | // Forward pass (cuBLAS) 353 | 
cublasHandle_t handle; 354 | CHECK_CUBLAS(cublasCreate(&handle)); 355 | // forward_pass_cublas(handle, d_input, d_weights1, d_bias1, d_hidden, 356 | // d_weights2, d_bias2, d_output_cublas, 357 | // input_size, hidden_size, output_size, batch_size); 358 | 359 | // Backward pass (naive) 360 | backward_pass_naive(d_input, d_hidden, d_output, d_labels, 361 | d_weights1, d_weights2, 362 | d_grad_weights1, d_grad_weights2, 363 | d_grad_bias1, d_grad_bias2, 364 | input_size, hidden_size, output_size, batch_size); 365 | 366 | // Backward pass (cuBLAS) 367 | backward_pass_cublas(handle, d_input, d_hidden, d_output_cublas, d_labels, 368 | d_weights1, d_weights2, 369 | d_grad_weights1_cublas, d_grad_weights2_cublas, 370 | d_grad_bias1_cublas, d_grad_bias2_cublas, 371 | input_size, hidden_size, output_size, batch_size); 372 | 373 | CHECK_CUBLAS(cublasDestroy(handle)); 374 | 375 | // Compare results 376 | float *h_output = (float*)malloc(output_bytes); 377 | float *h_output_cublas = (float*)malloc(output_bytes); 378 | float *h_grad_weights1 = (float*)malloc(weights1_bytes); 379 | float *h_grad_weights1_cublas = (float*)malloc(weights1_bytes); 380 | 381 | CHECK_CUDA(cudaMemcpy(h_output, d_output, output_bytes, cudaMemcpyDeviceToHost)); 382 | CHECK_CUDA(cudaMemcpy(h_output_cublas, d_output_cublas, output_bytes, cudaMemcpyDeviceToHost)); 383 | CHECK_CUDA(cudaMemcpy(h_grad_weights1, d_grad_weights1, weights1_bytes, cudaMemcpyDeviceToHost)); 384 | CHECK_CUDA(cudaMemcpy(h_grad_weights1_cublas, d_grad_weights1_cublas, weights1_bytes, cudaMemcpyDeviceToHost)); 385 | 386 | printf("Comparing forward pass results:\n"); 387 | compare_results(h_output, h_output_cublas, batch_size * output_size); 388 | 389 | printf("Comparing backward pass results (grad_weights1):\n"); 390 | compare_results(h_grad_weights1, h_grad_weights1_cublas, input_size * hidden_size); 391 | 392 | // Free memory 393 | free(h_output); 394 | free(h_output_cublas); 395 | free(h_grad_weights1); 396 | free(h_grad_weights1_cublas); 397 | 398 | CHECK_CUDA(cudaFree(d_input)); 399 | CHECK_CUDA(cudaFree(d_hidden)); 400 | CHECK_CUDA(cudaFree(d_output)); 401 | CHECK_CUDA(cudaFree(d_output_cublas)); 402 | CHECK_CUDA(cudaFree(d_weights1)); 403 | CHECK_CUDA(cudaFree(d_weights2)); 404 | CHECK_CUDA(cudaFree(d_bias1)); 405 | CHECK_CUDA(cudaFree(d_bias2)); 406 | CHECK_CUDA(cudaFree(d_grad_weights1)); 407 | CHECK_CUDA(cudaFree(d_grad_weights2)); 408 | CHECK_CUDA(cudaFree(d_grad_bias1)); 409 | CHECK_CUDA(cudaFree(d_grad_bias2)); 410 | CHECK_CUDA(cudaFree(d_grad_weights1_cublas)); 411 | CHECK_CUDA(cudaFree(d_grad_weights2_cublas)); 412 | CHECK_CUDA(cudaFree(d_grad_bias1_cublas)); 413 | CHECK_CUDA(cudaFree(d_grad_bias2_cublas)); 414 | CHECK_CUDA(cudaFree(d_labels)); 415 | 416 | return 0; 417 | } -------------------------------------------------------------------------------- /cuda/vroom/comparing/batch-compare-forward.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace std; 10 | 11 | #define CHECK_CURAND(call) { \ 12 | curandStatus_t status = call; \ 13 | if (status != CURAND_STATUS_SUCCESS) { \ 14 | fprintf(stderr, "cuRAND error in %s:%d\n", __FILE__, __LINE__); \ 15 | exit(EXIT_FAILURE); \ 16 | } \ 17 | } 18 | 19 | #define CHECK_CUDA(call) { \ 20 | cudaError_t err = call; \ 21 | if (err != cudaSuccess) { \ 22 | fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ 23 | 
exit(EXIT_FAILURE); \ 24 | } \ 25 | } 26 | 27 | #define CHECK_CUBLAS(call) { \ 28 | cublasStatus_t status = call; \ 29 | if (status != CUBLAS_STATUS_SUCCESS) { \ 30 | fprintf(stderr, "cuBLAS error in %s:%d\n", __FILE__, __LINE__); \ 31 | exit(EXIT_FAILURE); \ 32 | } \ 33 | } 34 | 35 | // __device__ float relu(float x) { 36 | // return fmaxf(x, 0.0f); 37 | // } 38 | 39 | // __global__ void matmul_forward_naive(float *A, float *B, float *C, int M, int N, int K) { 40 | // int row = blockIdx.y * blockDim.y + threadIdx.y; 41 | // int col = blockIdx.x * blockDim.x + threadIdx.x; 42 | 43 | // if (row < M && col < N) { 44 | // float sum = 0.0f; 45 | // for (int i = 0; i < K; i++) { 46 | // sum += A[row * K + i] * B[i * N + col]; 47 | // } 48 | // C[row * N + col] = sum; 49 | // } 50 | // } 51 | 52 | // __global__ void forward_pass(float *input, float *weights1, float *bias1, float *hidden, 53 | // float *weights2, float *bias2, float *output, 54 | // int input_size, int hidden_size, int output_size, int batch_size) { 55 | // int idx = blockIdx.x * blockDim.x + threadIdx.x; 56 | // int batch_idx = blockIdx.y; 57 | 58 | // if (idx < hidden_size && batch_idx < batch_size) { 59 | // float sum = 0.0f; 60 | // for (int i = 0; i < input_size; i++) { 61 | // sum += weights1[idx * input_size + i] * input[batch_idx * input_size + i]; 62 | // } 63 | // float hidden_val = relu(sum + bias1[idx]); 64 | // hidden[batch_idx * hidden_size + idx] = hidden_val; 65 | // } 66 | 67 | // __syncthreads(); 68 | 69 | // if (idx < output_size && batch_idx < batch_size) { 70 | // float sum = 0.0f; 71 | // for (int i = 0; i < hidden_size; i++) { 72 | // sum += weights2[idx * hidden_size + i] * hidden[batch_idx * hidden_size + i]; 73 | // } 74 | // float output_val = sum + bias2[idx]; 75 | // output[batch_idx * output_size + idx] = output_val; 76 | // } 77 | // } 78 | 79 | 80 | __device__ float relu(float x) { 81 | return fmaxf(x, 0.0f); 82 | } 83 | 84 | __global__ void add_bias_and_relu(float *data, float *bias, int size, int batch_size) { 85 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 86 | int batch_idx = blockIdx.y; 87 | 88 | if (idx < size && batch_idx < batch_size) { 89 | int index = batch_idx * size + idx; 90 | data[index] = relu(data[index] + bias[idx]); 91 | } 92 | } 93 | 94 | __global__ void add_bias(float *data, float *bias, int size, int batch_size) { 95 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 96 | int batch_idx = blockIdx.y; 97 | 98 | if (idx < size && batch_idx < batch_size) { 99 | int index = batch_idx * size + idx; 100 | data[index] += bias[idx]; 101 | } 102 | } 103 | 104 | 105 | __global__ void matmul_forward_naive(float *A, float *B, float *C, int M, int N, int K) { 106 | int row = blockIdx.y * blockDim.y + threadIdx.y; 107 | int col = blockIdx.x * blockDim.x + threadIdx.x; 108 | 109 | if (row < M && col < N) { 110 | float sum = 0.0f; 111 | for (int i = 0; i < K; i++) { 112 | sum += A[row * K + i] * B[i * N + col]; 113 | } 114 | C[row * N + col] = sum; 115 | } 116 | } 117 | 118 | __global__ void add_bias_naive(float *input, float *bias, int rows, int cols) { 119 | int row = blockIdx.y * blockDim.y + threadIdx.y; 120 | int col = blockIdx.x * blockDim.x + threadIdx.x; 121 | 122 | if (row < rows && col < cols) { 123 | input[row * cols + col] += bias[col]; 124 | } 125 | } 126 | 127 | __global__ void apply_relu_naive(float *input, int size) { 128 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 129 | 130 | if (idx < size) { 131 | input[idx] = relu(input[idx]); 132 | } 133 | } 134 | 135 | void 
cublasMatmul(cublasHandle_t handle, float *d_A, float *d_B, float *d_C, int M, int K, int N) { 136 | float alpha = 1.0f, beta = 0.0f; 137 | 138 | CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, 139 | &alpha, d_B, N, d_A, K, &beta, d_C, N)); 140 | 141 | } 142 | 143 | __global__ void forward_pass(float *input, float *weights1, float *bias1, float *hidden, 144 | float *weights2, float *bias2, float *output, 145 | int input_size, int hidden_size, int output_size, int batch_size) { 146 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 147 | int batch_idx = blockIdx.y; 148 | 149 | if (idx < hidden_size && batch_idx < batch_size) { 150 | float sum = 0.0f; 151 | for (int i = 0; i < input_size; i++) { 152 | sum += weights1[idx * input_size + i] * input[batch_idx * input_size + i]; 153 | } 154 | hidden[batch_idx * hidden_size + idx] = relu(sum + bias1[idx]); 155 | } 156 | 157 | __syncthreads(); 158 | 159 | if (idx < output_size && batch_idx < batch_size) { 160 | float sum = 0.0f; 161 | for (int i = 0; i < hidden_size; i++) { 162 | sum += weights2[idx * hidden_size + i] * hidden[batch_idx * hidden_size + i]; 163 | } 164 | output[batch_idx * output_size + idx] = sum + bias2[idx]; 165 | } 166 | } 167 | 168 | void forward_pass_wrapper(float *d_input, float *d_weights1, float *d_bias1, float *d_hidden, 169 | float *d_weights2, float *d_bias2, float *d_output, 170 | int input_size, int hidden_size, int output_size, int batch_size) { 171 | dim3 block_dim(256); 172 | dim3 grid_dim((max(hidden_size, output_size) + block_dim.x - 1) / block_dim.x, batch_size); 173 | 174 | forward_pass<<>>(d_input, d_weights1, d_bias1, d_hidden, 175 | d_weights2, d_bias2, d_output, 176 | input_size, hidden_size, output_size, batch_size); 177 | 178 | // Print hidden layer values 179 | float *h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 180 | CHECK_CUDA(cudaMemcpy(h_hidden, d_hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyDeviceToHost)); 181 | std::cout << "Forward pass hidden layer values:" << std::endl; 182 | for (int i = 0; i < batch_size * hidden_size; i++) { 183 | printf("%f ", h_hidden[i]); 184 | if ((i+1) % hidden_size == 0) printf("\n"); 185 | } 186 | free(h_hidden); 187 | 188 | // Print output values 189 | float *h_output = (float*)malloc(batch_size * output_size * sizeof(float)); 190 | CHECK_CUDA(cudaMemcpy(h_output, d_output, batch_size * output_size * sizeof(float), cudaMemcpyDeviceToHost)); 191 | std::cout << "Forward pass output values:" << std::endl; 192 | for (int i = 0; i < batch_size * output_size; i++) { 193 | printf("%f ", h_output[i]); 194 | if ((i+1) % output_size == 0) printf("\n"); 195 | } 196 | free(h_output); 197 | } 198 | 199 | void forward_pass_naive(float *input, float *weights1, float *bias1, float *hidden, 200 | float *weights2, float *bias2, float *output, 201 | int input_size, int hidden_size, int output_size, int batch_size) { 202 | dim3 block_dim(32, 32); 203 | dim3 grid_dim_1((hidden_size + block_dim.x - 1) / block_dim.x, (batch_size + block_dim.y - 1) / block_dim.y); 204 | dim3 grid_dim_2((output_size + block_dim.x - 1) / block_dim.x, (batch_size + block_dim.y - 1) / block_dim.y); 205 | 206 | // First layer: input to hidden 207 | matmul_forward_naive<<>>(input, weights1, hidden, batch_size, hidden_size, input_size); 208 | 209 | // Print hidden values after matmul 210 | float *h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 211 | CHECK_CUDA(cudaMemcpy(h_hidden, hidden, batch_size * hidden_size * sizeof(float), 
cudaMemcpyDeviceToHost)); 212 | std::cout << "Naive hidden values after matmul:" << std::endl; 213 | for (int i = 0; i < batch_size * hidden_size; i++) { 214 | printf("%f ", h_hidden[i]); 215 | if ((i+1) % hidden_size == 0) printf("\n"); 216 | } 217 | free(h_hidden); 218 | 219 | add_bias_naive<<<grid_dim_1, block_dim>>>(hidden, bias1, batch_size, hidden_size); 220 | 221 | // Print hidden values after adding bias 222 | h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 223 | CHECK_CUDA(cudaMemcpy(h_hidden, hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyDeviceToHost)); 224 | std::cout << "Naive hidden values after adding bias:" << std::endl; 225 | for (int i = 0; i < batch_size * hidden_size; i++) { 226 | printf("%f ", h_hidden[i]); 227 | if ((i+1) % hidden_size == 0) printf("\n"); 228 | } 229 | free(h_hidden); 230 | 231 | apply_relu_naive<<<(batch_size * hidden_size + 255) / 256, 256>>>(hidden, batch_size * hidden_size); 232 | 233 | // Print hidden values after ReLU 234 | h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 235 | CHECK_CUDA(cudaMemcpy(h_hidden, hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyDeviceToHost)); 236 | std::cout << "Naive hidden values after ReLU:" << std::endl; 237 | for (int i = 0; i < batch_size * hidden_size; i++) { 238 | printf("%f ", h_hidden[i]); 239 | if ((i+1) % hidden_size == 0) printf("\n"); 240 | } 241 | free(h_hidden); 242 | 243 | // Second layer: hidden to output 244 | matmul_forward_naive<<<grid_dim_2, block_dim>>>(hidden, weights2, output, batch_size, output_size, hidden_size); 245 | add_bias_naive<<<grid_dim_2, block_dim>>>(output, bias2, batch_size, output_size); 246 | 247 | // Print final output 248 | float *h_output = (float*)malloc(batch_size * output_size * sizeof(float)); 249 | CHECK_CUDA(cudaMemcpy(h_output, output, batch_size * output_size * sizeof(float), cudaMemcpyDeviceToHost)); 250 | std::cout << "Naive final output:" << std::endl; 251 | for (int i = 0; i < batch_size * output_size; i++) { 252 | printf("%f ", h_output[i]); 253 | if ((i+1) % output_size == 0) printf("\n"); 254 | } 255 | std::cout << std::endl << std::endl; 256 | free(h_output); 257 | } 258 | 259 | void forward_pass_cublas(cublasHandle_t handle, float *input, float *weights1, float *bias1, float *hidden, 260 | float *weights2, float *bias2, float *output, 261 | int input_size, int hidden_size, int output_size, int batch_size) { 262 | float alpha = 1.0f, beta = 0.0f; 263 | 264 | // First layer: input to hidden 265 | cublasMatmul(handle, input, weights1, hidden, batch_size, input_size, hidden_size); 266 | 267 | // Print hidden values after matmul 268 | float *h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 269 | CHECK_CUDA(cudaMemcpy(h_hidden, hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyDeviceToHost)); 270 | std::cout << "cuBLAS hidden values after matmul:" << std::endl; 271 | for (int i = 0; i < batch_size * hidden_size; i++) { 272 | printf("%f ", h_hidden[i]); 273 | if ((i+1) % hidden_size == 0) printf("\n"); 274 | } 275 | free(h_hidden); 276 | 277 | dim3 block(256); 278 | dim3 grid((hidden_size + block.x - 1) / block.x, batch_size); 279 | add_bias_and_relu<<<grid, block>>>(hidden, bias1, hidden_size, batch_size); 280 | 281 | // Print hidden values after bias and ReLU 282 | h_hidden = (float*)malloc(batch_size * hidden_size * sizeof(float)); 283 | CHECK_CUDA(cudaMemcpy(h_hidden, hidden, batch_size * hidden_size * sizeof(float), cudaMemcpyDeviceToHost)); 284 | std::cout << "cuBLAS hidden values after bias and ReLU:" << std::endl; 285 | for (int i = 0; i <
batch_size * hidden_size; i++) { 286 | printf("%f ", h_hidden[i]); 287 | if ((i+1) % hidden_size == 0) printf("\n"); 288 | } 289 | free(h_hidden); 290 | 291 | // Second layer: hidden to output 292 | cublasMatmul(handle, hidden, weights2, output, batch_size, hidden_size, output_size); 293 | 294 | grid = dim3((output_size + block.x - 1) / block.x, batch_size); 295 | add_bias<<>>(output, bias2, output_size, batch_size); 296 | 297 | // Print final output 298 | float *h_output = (float*)malloc(batch_size * output_size * sizeof(float)); 299 | CHECK_CUDA(cudaMemcpy(h_output, output, batch_size * output_size * sizeof(float), cudaMemcpyDeviceToHost)); 300 | std::cout << "cuBLAS final output:" << std::endl; 301 | for (int i = 0; i < batch_size * output_size; i++) { 302 | printf("%f ", h_output[i]); 303 | if ((i+1) % output_size == 0) printf("\n"); 304 | } 305 | free(h_output); 306 | } 307 | 308 | void compare_results(float *output1, float *output2, int size) { 309 | float max_diff = 0.0f; 310 | int max_diff_idx = 0; 311 | for (int i = 0; i < size; i++) { 312 | float diff = fabsf(output1[i] - output2[i]); 313 | if (diff > max_diff) { 314 | max_diff = diff; 315 | max_diff_idx = i; 316 | std::cout << "max_diff_idx: " << max_diff_idx << std::endl; 317 | } 318 | } 319 | } 320 | 321 | __global__ void scale_array(float *arr, int size, float scale) { 322 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 323 | if (idx < size) { 324 | arr[idx] = (arr[idx] - 0.5f) * scale; 325 | } 326 | } 327 | 328 | int main() { 329 | const int batch_size = 2; // M -> batch_size 330 | const int input_size = 4; // K -> 784 331 | const int hidden_size = 4; // N -> 256 332 | const int output_size = 1; // O 333 | 334 | size_t input_bytes = batch_size * input_size * sizeof(float); // M * K 335 | size_t hidden_bytes = batch_size * hidden_size * sizeof(float); // M * N 336 | size_t output_bytes = batch_size * output_size * sizeof(float); // M * O 337 | size_t weights1_bytes = input_size * hidden_size * sizeof(float); // K * N 338 | size_t weights2_bytes = hidden_size * output_size * sizeof(float); // N * O 339 | size_t bias1_bytes = hidden_size * sizeof(float); 340 | size_t bias2_bytes = output_size * sizeof(float); 341 | 342 | float *d_input, *d_weights1, *d_bias1, *d_hidden, *d_weights2, *d_bias2, *d_output, *d_output_cublas; 343 | 344 | CHECK_CUDA(cudaMalloc(&d_input, input_bytes)); 345 | CHECK_CUDA(cudaMalloc(&d_weights1, weights1_bytes)); 346 | CHECK_CUDA(cudaMalloc(&d_bias1, bias1_bytes)); 347 | CHECK_CUDA(cudaMalloc(&d_hidden, hidden_bytes)); 348 | CHECK_CUDA(cudaMalloc(&d_weights2, weights2_bytes)); 349 | CHECK_CUDA(cudaMalloc(&d_bias2, bias2_bytes)); 350 | CHECK_CUDA(cudaMalloc(&d_output, output_bytes)); 351 | CHECK_CUDA(cudaMalloc(&d_output_cublas, output_bytes)); 352 | 353 | float h_input[batch_size * input_size] = {1.0f, -2.0f, 3.0f, -4.0f, 354 | 2.0f, -4.0f, 6.0f, -8.0f}; 355 | 356 | float h_weights1[input_size * hidden_size] = {-1.0f, 2.0f, 3.0f, 4.0f, 357 | 2.0f, -4.0f, 6.0f, 8.0f, 358 | 3.0f, 6.0f, -9.0f, 12.0f, 359 | 4.0f, 8.0f, 12.0f, -16.0f}; 360 | 361 | float h_bias1[hidden_size] = {-1.0f, -2.0f, -3.0f, -4.0f}; 362 | 363 | float h_weights2[hidden_size * output_size] = {1.0f, 2.0f, 3.0f, 4.0f}; 364 | 365 | float h_bias2[output_size] = {-1.0f}; 366 | 367 | CHECK_CUDA(cudaMemcpy(d_input, h_input, input_bytes, cudaMemcpyHostToDevice)); 368 | 369 | CHECK_CUDA(cudaMemcpy(d_weights1, h_weights1, weights1_bytes, cudaMemcpyHostToDevice)); 370 | 371 | CHECK_CUDA(cudaMemcpy(d_bias1, h_bias1, bias1_bytes, 
cudaMemcpyHostToDevice)); 372 | 373 | CHECK_CUDA(cudaMemcpy(d_weights2, h_weights2, weights2_bytes, cudaMemcpyHostToDevice)); 374 | 375 | CHECK_CUDA(cudaMemcpy(d_bias2, h_bias2, bias2_bytes, cudaMemcpyHostToDevice)); 376 | 377 | 378 | dim3 block(256); 379 | dim3 grid((max(hidden_size, output_size) + block.x - 1) / block.x, batch_size); 380 | forward_pass_naive(d_input, d_weights1, d_bias1, d_hidden, 381 | d_weights2, d_bias2, d_output, 382 | input_size, hidden_size, output_size, batch_size); 383 | 384 | CHECK_CUDA(cudaGetLastError()); 385 | CHECK_CUDA(cudaDeviceSynchronize()); 386 | 387 | cublasHandle_t handle; 388 | CHECK_CUBLAS(cublasCreate(&handle)); 389 | forward_pass_cublas(handle, d_input, d_weights1, d_bias1, d_hidden, 390 | d_weights2, d_bias2, d_output_cublas, 391 | input_size, hidden_size, output_size, batch_size); 392 | CHECK_CUBLAS(cublasDestroy(handle)); 393 | 394 | float *h_output = (float*)malloc(output_bytes); 395 | float *h_output_cublas = (float*)malloc(output_bytes); 396 | CHECK_CUDA(cudaMemcpy(h_output, d_output, output_bytes, cudaMemcpyDeviceToHost)); 397 | CHECK_CUDA(cudaMemcpy(h_output_cublas, d_output_cublas, output_bytes, cudaMemcpyDeviceToHost)); 398 | // In main() 399 | forward_pass_wrapper(d_input, d_weights1, d_bias1, d_hidden, 400 | d_weights2, d_bias2, d_output, 401 | input_size, hidden_size, output_size, batch_size); 402 | 403 | CHECK_CUDA(cudaGetLastError()); 404 | CHECK_CUDA(cudaDeviceSynchronize()); 405 | 406 | compare_results(h_output, h_output_cublas, batch_size * output_size); 407 | 408 | free(h_output); 409 | free(h_output_cublas); 410 | CHECK_CUDA(cudaFree(d_input)); 411 | CHECK_CUDA(cudaFree(d_weights1)); 412 | CHECK_CUDA(cudaFree(d_bias1)); 413 | CHECK_CUDA(cudaFree(d_hidden)); 414 | CHECK_CUDA(cudaFree(d_weights2)); 415 | CHECK_CUDA(cudaFree(d_bias2)); 416 | CHECK_CUDA(cudaFree(d_output)); 417 | CHECK_CUDA(cudaFree(d_output_cublas)); 418 | 419 | return 0; 420 | } -------------------------------------------------------------------------------- /cuda/vroom/comparing/batch-matmul-compare.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define CHECK_CUDA(call) { \ 8 | cudaError_t err = call; \ 9 | if (err != cudaSuccess) { \ 10 | fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \ 11 | exit(EXIT_FAILURE); \ 12 | } \ 13 | } 14 | 15 | #define CHECK_CUBLAS(call) { \ 16 | cublasStatus_t status = call; \ 17 | if (status != CUBLAS_STATUS_SUCCESS) { \ 18 | fprintf(stderr, "cuBLAS error in %s:%d\n", __FILE__, __LINE__); \ 19 | exit(EXIT_FAILURE); \ 20 | } \ 21 | } 22 | 23 | // Naive matrix multiplication kernel 24 | __global__ void naiveMatmulKernel(float* A, float* B, float* C, int M, int N, int K) { 25 | int row = blockIdx.y * blockDim.y + threadIdx.y; 26 | int col = blockIdx.x * blockDim.x + threadIdx.x; 27 | 28 | if (row < M && col < N) { 29 | float sum = 0.0f; 30 | for (int i = 0; i < K; ++i) { 31 | sum += A[row * K + i] * B[i * N + col]; 32 | } 33 | C[row * N + col] = sum; 34 | } 35 | } 36 | 37 | // Function to initialize a matrix with random values 38 | void initMatrix(float* mat, int rows, int cols) { 39 | for (int i = 0; i < rows * cols; ++i) { 40 | // mat[i] = static_cast(rand()) / RAND_MAX; 41 | // set to i and static cast to float 42 | mat[i] = static_cast(i) * 0.05; 43 | } 44 | } 45 | 46 | // Function to compare two matrices 47 | bool compareMatrices(float* A, float* B, int size, float tolerance = 
1e-5) { 48 | for (int i = 0; i < size; ++i) { 49 | std::cout << "A[" << i << "] = " << A[i] << " B[" << i << "] = " << B[i] << std::endl; 50 | if (fabs(A[i] - B[i]) > tolerance) { 51 | return false; 52 | } 53 | } 54 | return true; 55 | } 56 | 57 | int main() { 58 | const int M = 4; // batchsize 59 | const int K = 4; // input size 60 | const int N = 6; // hidden size 61 | 62 | // (batch_size, input_size) x (input_size, hidden_size) = (batch_size, hidden_size) = (4, 6) 63 | 64 | 65 | size_t bytes_A = M * K * sizeof(float); 66 | size_t bytes_B = K * N * sizeof(float); 67 | size_t bytes_C = M * N * sizeof(float); 68 | 69 | float *h_A, *h_B, *h_C_naive, *h_C_cublas; 70 | float *d_A, *d_B, *d_C_naive, *d_C_cublas; 71 | 72 | // Allocate host memory 73 | h_A = (float*)malloc(bytes_A); 74 | h_B = (float*)malloc(bytes_B); 75 | h_C_naive = (float*)malloc(bytes_C); 76 | h_C_cublas = (float*)malloc(bytes_C); 77 | 78 | // Initialize matrices 79 | initMatrix(h_A, M, K); 80 | initMatrix(h_B, K, N); 81 | 82 | // Allocate device memory 83 | CHECK_CUDA(cudaMalloc(&d_A, bytes_A)); 84 | CHECK_CUDA(cudaMalloc(&d_B, bytes_B)); 85 | CHECK_CUDA(cudaMalloc(&d_C_naive, bytes_C)); 86 | CHECK_CUDA(cudaMalloc(&d_C_cublas, bytes_C)); 87 | 88 | // Copy data to device 89 | CHECK_CUDA(cudaMemcpy(d_A, h_A, bytes_A, cudaMemcpyHostToDevice)); 90 | CHECK_CUDA(cudaMemcpy(d_B, h_B, bytes_B, cudaMemcpyHostToDevice)); 91 | 92 | // Naive kernel 93 | dim3 blockDim(32, 32); 94 | dim3 gridDim((N + blockDim.x - 1) / blockDim.x, (M + blockDim.y - 1) / blockDim.y); 95 | naiveMatmulKernel<<>>(d_A, d_B, d_C_naive, M, N, K); 96 | CHECK_CUDA(cudaGetLastError()); 97 | CHECK_CUDA(cudaDeviceSynchronize()); 98 | 99 | // cuBLAS 100 | cublasHandle_t handle; 101 | CHECK_CUBLAS(cublasCreate(&handle)); 102 | float alpha = 1.0f; 103 | float beta = 0.0f; 104 | 105 | 106 | // w @ x -> 107 | CHECK_CUBLAS(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, 108 | &alpha, d_B, N, d_A, K, &beta, d_C_cublas, N)); 109 | 110 | CHECK_CUBLAS(cublasDestroy(handle)); 111 | 112 | // Copy results back to host 113 | CHECK_CUDA(cudaMemcpy(h_C_naive, d_C_naive, bytes_C, cudaMemcpyDeviceToHost)); 114 | CHECK_CUDA(cudaMemcpy(h_C_cublas, d_C_cublas, bytes_C, cudaMemcpyDeviceToHost)); 115 | 116 | // Compare results 117 | bool results_match = compareMatrices(h_C_naive, h_C_cublas, M * N); 118 | if (results_match) { 119 | printf("Naive and cuBLAS results match!\n"); 120 | } else { 121 | printf("Naive and cuBLAS results do not match!\n"); 122 | } 123 | 124 | // print all results 125 | std::cout << "naive\n"; 126 | for (int i = 0; i < M * K; i++) { 127 | // std::cout << "naive idx " << i << " = " << h_C_naive[i] << std::endl; 128 | // std::cout << "cublas idx " << i << " = " << h_C_cublas[i] << std::endl; 129 | std::cout << h_C_naive[i]; 130 | if (i % M == 0) { 131 | std::cout << "\n"; 132 | } 133 | 134 | } 135 | 136 | std::cout << "\n\n"; 137 | std::cout << "cublas\n"; 138 | for (int i = 0; i < M * K; i++) { 139 | std::cout << h_C_cublas[i]; 140 | if (i % M == 0) { 141 | std::cout << "\n"; 142 | } 143 | } 144 | 145 | // Free memory 146 | free(h_A); 147 | free(h_B); 148 | free(h_C_naive); 149 | free(h_C_cublas); 150 | CHECK_CUDA(cudaFree(d_A)); 151 | CHECK_CUDA(cudaFree(d_B)); 152 | CHECK_CUDA(cudaFree(d_C_naive)); 153 | CHECK_CUDA(cudaFree(d_C_cublas)); 154 | 155 | return 0; 156 | } -------------------------------------------------------------------------------- /cuda/vroom/v1.cu: -------------------------------------------------------------------------------- 1 | #include 
2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define INPUT_SIZE 784 8 | #define HIDDEN_SIZE 256 9 | #define OUTPUT_SIZE 10 10 | #define BATCH_SIZE 4 11 | 12 | #define CUDA_CHECK(call) \ 13 | do { \ 14 | cudaError_t error = call; \ 15 | if (error != cudaSuccess) { \ 16 | fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \ 17 | cudaGetErrorString(error)); \ 18 | exit(EXIT_FAILURE); \ 19 | } \ 20 | } while(0) 21 | 22 | __global__ void init_random(float *data, int size, unsigned long long seed) { 23 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 24 | if (idx < size) { 25 | curandState state; 26 | curand_init(seed, idx, 0, &state); 27 | data[idx] = curand_uniform(&state) * 2.0f - 1.0f; 28 | } 29 | } 30 | 31 | __global__ void relu_derivative(float *grad, float *x, int size) { 32 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 33 | if (idx < size) { 34 | grad[idx] *= (x[idx] > 0) ? 1.0f : 0.0f; 35 | } 36 | } 37 | 38 | 39 | __global__ void backward_pass_naive(float *input, float *hidden, float *output, int *labels, 40 | float *weights1, float *weights2, 41 | float *grad_weights1, float *grad_weights2, 42 | float *grad_bias1, float *grad_bias2, 43 | int input_size, int hidden_size, int output_size, int batch_size) { 44 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 45 | int batch_idx = blockIdx.y; 46 | 47 | __shared__ float grad_output[OUTPUT_SIZE]; 48 | 49 | if (idx < output_size && batch_idx < batch_size) { 50 | grad_output[idx] = output[batch_idx * output_size + idx]; 51 | if (idx == labels[batch_idx]) { 52 | grad_output[idx] -= 1.0f; 53 | } 54 | } 55 | 56 | __syncthreads(); 57 | 58 | if (idx < hidden_size && batch_idx < batch_size) { 59 | float grad_hidden = 0.0f; 60 | for (int i = 0; i < output_size; i++) { 61 | grad_hidden += grad_output[i] * weights2[i * hidden_size + idx]; 62 | } 63 | grad_hidden *= (hidden[batch_idx * hidden_size + idx] > 0) ? 
1.0f : 0.0f; // ReLU derivative 64 | 65 | for (int i = 0; i < input_size; i++) { 66 | atomicAdd(&grad_weights1[idx * input_size + i], grad_hidden * input[batch_idx * input_size + i]); 67 | } 68 | atomicAdd(&grad_bias1[idx], grad_hidden); 69 | } 70 | 71 | if (idx < output_size * hidden_size && batch_idx < batch_size) { 72 | int i = idx / hidden_size; 73 | int j = idx % hidden_size; 74 | atomicAdd(&grad_weights2[idx], grad_output[i] * hidden[batch_idx * hidden_size + j]); 75 | } 76 | 77 | if (idx < output_size && batch_idx < batch_size) { 78 | atomicAdd(&grad_bias2[idx], grad_output[idx]); 79 | } 80 | } 81 | 82 | __global__ void compute_output_gradient(float *output, int *labels, float *grad_output, int output_size, int batch_size) { 83 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 84 | int batch_idx = blockIdx.y; 85 | 86 | if (idx < output_size && batch_idx < batch_size) { 87 | int index = batch_idx * output_size + idx; 88 | grad_output[index] = output[index]; 89 | if (idx == labels[batch_idx]) { 90 | grad_output[index] -= 1.0f; 91 | } 92 | } 93 | } 94 | 95 | __global__ void compute_hidden_gradient(float *grad_hidden, float *grad_output, float *weights2, float *hidden, 96 | int hidden_size, int output_size, int batch_size) { 97 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 98 | int batch_idx = blockIdx.y; 99 | 100 | if (idx < hidden_size && batch_idx < batch_size) { 101 | float grad = 0.0f; 102 | for (int i = 0; i < output_size; i++) { 103 | grad += grad_output[batch_idx * output_size + i] * weights2[i * hidden_size + idx]; 104 | } 105 | grad_hidden[batch_idx * hidden_size + idx] = grad * ((hidden[batch_idx * hidden_size + idx] > 0) ? 1.0f : 0.0f); 106 | } 107 | } 108 | 109 | __global__ void compute_bias_gradient(float *grad_bias, float *grad, int size, int batch_size) { 110 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 111 | 112 | if (idx < size) { 113 | float sum = 0.0f; 114 | for (int i = 0; i < batch_size; i++) { 115 | sum += grad[i * size + idx]; 116 | } 117 | grad_bias[idx] = sum; 118 | } 119 | } 120 | 121 | // 3/4 working 122 | void backward_pass_cublas(cublasHandle_t handle, float *d_input, float *d_hidden, float *d_output, int *d_labels, 123 | float *d_weights1, float *d_weights2, 124 | float *d_grad_weights1, float *d_grad_weights2, 125 | float *d_grad_bias1, float *d_grad_bias2, 126 | float *d_grad_output, float *d_grad_hidden, float *d_ones, 127 | int input_size, int hidden_size, int output_size, int batch_size) { 128 | float alpha = 1.0f, beta = 0.0f; 129 | 130 | // Compute output gradient 131 | dim3 block_size(256); 132 | dim3 grid_size((output_size + block_size.x - 1) / block_size.x, batch_size); 133 | compute_output_gradient<<>>(d_output, d_labels, d_grad_output, output_size, batch_size); 134 | 135 | // Compute dW2 = dLoss @ x2.T = (10, B) @ (B, 256) = (10, 256) 136 | cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 137 | hidden_size, output_size, batch_size, // (M K N) 138 | &alpha, 139 | d_hidden, hidden_size, 140 | d_grad_output, output_size, 141 | &beta, 142 | d_grad_weights2, hidden_size); 143 | 144 | // Compute hidden gradient 145 | grid_size.x = (hidden_size + block_size.x - 1) / block_size.x; 146 | compute_hidden_gradient<<>>(d_grad_hidden, d_grad_output, d_weights2, d_hidden, 147 | hidden_size, output_size, batch_size); 148 | 149 | // Compute dW1 = dRelu @ x1.T = (256, B) @ (B, 784) = (256, 784) 150 | cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_T, 151 | input_size, hidden_size, batch_size, 152 | &alpha, 153 | d_input, input_size, 154 | d_grad_hidden, 
hidden_size, 155 | &beta, 156 | d_grad_weights1, input_size); 157 | 158 | // Compute bias gradients 159 | compute_bias_gradient<<<(output_size + 255) / 256, 256>>>(d_grad_bias2, d_grad_output, output_size, batch_size); 160 | compute_bias_gradient<<<(hidden_size + 255) / 256, 256>>>(d_grad_bias1, d_grad_hidden, hidden_size, batch_size); 161 | } 162 | 163 | void print_comparison(const char* name, float* arr1, float* arr2, int size) { 164 | float max_diff = 0.0f; 165 | printf("%s:\n", name); 166 | printf("First 10 values:\n"); 167 | for (int i = 0; i < 10 && i < size; i++) { 168 | printf("%.6f vs %.6f\n", arr1[i], arr2[i]); 169 | max_diff = fmaxf(max_diff, fabsf(arr1[i] - arr2[i])); 170 | } 171 | for (int i = 10; i < size; i++) { 172 | max_diff = fmaxf(max_diff, fabsf(arr1[i] - arr2[i])); 173 | } 174 | printf("Max difference: %.6f\n\n", max_diff); 175 | } 176 | 177 | int main() { 178 | // Allocate host memory 179 | float *h_input, *h_hidden, *h_output, *h_weights1, *h_weights2; 180 | int *h_labels; 181 | float *h_grad_weights1_naive, *h_grad_weights2_naive, *h_grad_bias1_naive, *h_grad_bias2_naive; 182 | float *h_grad_weights1_cublas, *h_grad_weights2_cublas, *h_grad_bias1_cublas, *h_grad_bias2_cublas; 183 | 184 | cudaMallocHost(&h_input, BATCH_SIZE * INPUT_SIZE * sizeof(float)); 185 | cudaMallocHost(&h_hidden, BATCH_SIZE * HIDDEN_SIZE * sizeof(float)); 186 | cudaMallocHost(&h_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float)); 187 | cudaMallocHost(&h_weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float)); 188 | cudaMallocHost(&h_weights2, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)); 189 | cudaMallocHost(&h_labels, BATCH_SIZE * sizeof(int)); 190 | cudaMallocHost(&h_grad_weights1_naive, HIDDEN_SIZE * INPUT_SIZE * sizeof(float)); 191 | cudaMallocHost(&h_grad_weights2_naive, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)); 192 | cudaMallocHost(&h_grad_bias1_naive, HIDDEN_SIZE * sizeof(float)); 193 | cudaMallocHost(&h_grad_bias2_naive, OUTPUT_SIZE * sizeof(float)); 194 | cudaMallocHost(&h_grad_weights1_cublas, HIDDEN_SIZE * INPUT_SIZE * sizeof(float)); 195 | cudaMallocHost(&h_grad_weights2_cublas, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)); 196 | cudaMallocHost(&h_grad_bias1_cublas, HIDDEN_SIZE * sizeof(float)); 197 | cudaMallocHost(&h_grad_bias2_cublas, OUTPUT_SIZE * sizeof(float)); 198 | 199 | // Allocate device memory 200 | float *d_input, *d_hidden, *d_output, *d_weights1, *d_weights2; 201 | int *d_labels; 202 | float *d_grad_weights1_naive, *d_grad_weights2_naive, *d_grad_bias1_naive, *d_grad_bias2_naive; 203 | float *d_grad_weights1_cublas, *d_grad_weights2_cublas, *d_grad_bias1_cublas, *d_grad_bias2_cublas; 204 | float *d_grad_output, *d_grad_hidden, *d_ones; 205 | 206 | CUDA_CHECK(cudaMalloc(&d_input, BATCH_SIZE * INPUT_SIZE * sizeof(float))); 207 | CUDA_CHECK(cudaMalloc(&d_hidden, BATCH_SIZE * HIDDEN_SIZE * sizeof(float))); 208 | CUDA_CHECK(cudaMalloc(&d_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); 209 | CUDA_CHECK(cudaMalloc(&d_weights1, HIDDEN_SIZE * INPUT_SIZE * sizeof(float))); 210 | CUDA_CHECK(cudaMalloc(&d_weights2, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float))); 211 | CUDA_CHECK(cudaMalloc(&d_labels, BATCH_SIZE * sizeof(int))); 212 | CUDA_CHECK(cudaMalloc(&d_grad_weights1_naive, HIDDEN_SIZE * INPUT_SIZE * sizeof(float))); 213 | CUDA_CHECK(cudaMalloc(&d_grad_weights2_naive, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float))); 214 | CUDA_CHECK(cudaMalloc(&d_grad_bias1_naive, HIDDEN_SIZE * sizeof(float))); 215 | CUDA_CHECK(cudaMalloc(&d_grad_bias2_naive, OUTPUT_SIZE * sizeof(float))); 216 | 
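// Editor's note (added comment): separate *_naive and *_cublas gradient buffers are allocated so both backward implementations can run on identical inputs and be compared element-wise at the end of main(); d_ones is passed to backward_pass_cublas but is not actually used by the code shown here.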
CUDA_CHECK(cudaMalloc(&d_grad_weights1_cublas, HIDDEN_SIZE * INPUT_SIZE * sizeof(float))); 217 | CUDA_CHECK(cudaMalloc(&d_grad_weights2_cublas, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float))); 218 | CUDA_CHECK(cudaMalloc(&d_grad_bias1_cublas, HIDDEN_SIZE * sizeof(float))); 219 | CUDA_CHECK(cudaMalloc(&d_grad_bias2_cublas, OUTPUT_SIZE * sizeof(float))); 220 | CUDA_CHECK(cudaMalloc(&d_grad_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); 221 | CUDA_CHECK(cudaMalloc(&d_grad_hidden, BATCH_SIZE * HIDDEN_SIZE * sizeof(float))); 222 | CUDA_CHECK(cudaMalloc(&d_ones, BATCH_SIZE * sizeof(float))); 223 | 224 | // Initialize random data 225 | int threads = 256; 226 | int blocks; 227 | unsigned long long seed = time(NULL); 228 | 229 | blocks = (BATCH_SIZE * INPUT_SIZE + threads - 1) / threads; 230 | init_random<<<blocks, threads>>>(d_input, BATCH_SIZE * INPUT_SIZE, seed); 231 | 232 | blocks = (BATCH_SIZE * HIDDEN_SIZE + threads - 1) / threads; 233 | init_random<<<blocks, threads>>>(d_hidden, BATCH_SIZE * HIDDEN_SIZE, seed); 234 | 235 | blocks = (BATCH_SIZE * OUTPUT_SIZE + threads - 1) / threads; 236 | init_random<<<blocks, threads>>>(d_output, BATCH_SIZE * OUTPUT_SIZE, seed); 237 | 238 | blocks = (HIDDEN_SIZE * INPUT_SIZE + threads - 1) / threads; 239 | init_random<<<blocks, threads>>>(d_weights1, HIDDEN_SIZE * INPUT_SIZE, seed); 240 | 241 | blocks = (OUTPUT_SIZE * HIDDEN_SIZE + threads - 1) / threads; 242 | init_random<<<blocks, threads>>>(d_weights2, OUTPUT_SIZE * HIDDEN_SIZE, seed); 243 | 244 | // Initialize labels with random values between 0 and OUTPUT_SIZE - 1 245 | for (int i = 0; i < BATCH_SIZE; i++) { 246 | h_labels[i] = rand() % OUTPUT_SIZE; 247 | } 248 | CUDA_CHECK(cudaMemcpy(d_labels, h_labels, BATCH_SIZE * sizeof(int), cudaMemcpyHostToDevice)); 249 | 250 | // Initialize d_ones with all 1's 251 | CUDA_CHECK(cudaMemset(d_ones, 1, BATCH_SIZE * sizeof(float))); 252 | 253 | // Allocate host memory for grad_output 254 | float *h_grad_output_naive, *h_grad_output_cublas; 255 | cudaMallocHost(&h_grad_output_naive, BATCH_SIZE * OUTPUT_SIZE * sizeof(float)); 256 | cudaMallocHost(&h_grad_output_cublas, BATCH_SIZE * OUTPUT_SIZE * sizeof(float)); 257 | 258 | // Allocate device memory for grad_output_naive 259 | float *d_grad_output_naive; 260 | CUDA_CHECK(cudaMalloc(&d_grad_output_naive, BATCH_SIZE * OUTPUT_SIZE * sizeof(float))); 261 | 262 | // Perform naive backward pass 263 | dim3 block_size(256); 264 | dim3 grid_size((max(HIDDEN_SIZE, OUTPUT_SIZE) + block_size.x - 1) / block_size.x, BATCH_SIZE); 265 | backward_pass_naive<<<grid_size, block_size>>>(d_input, d_hidden, d_output, d_labels, 266 | d_weights1, d_weights2, 267 | d_grad_weights1_naive, d_grad_weights2_naive, 268 | d_grad_bias1_naive, d_grad_bias2_naive, 269 | INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, BATCH_SIZE); 270 | 271 | // Compute grad_output for naive approach 272 | compute_output_gradient<<<grid_size, block_size>>>(d_output, d_labels, d_grad_output_naive, OUTPUT_SIZE, BATCH_SIZE); 273 | 274 | // Perform cuBLAS backward pass 275 | cublasHandle_t handle; 276 | cublasCreate(&handle); 277 | backward_pass_cublas(handle, d_input, d_hidden, d_output, d_labels, 278 | d_weights1, d_weights2, 279 | d_grad_weights1_cublas, d_grad_weights2_cublas, 280 | d_grad_bias1_cublas, d_grad_bias2_cublas, 281 | d_grad_output, d_grad_hidden, d_ones, 282 | INPUT_SIZE, HIDDEN_SIZE, OUTPUT_SIZE, BATCH_SIZE); 283 | cublasDestroy(handle); 284 | 285 | // Copy results back to host 286 | CUDA_CHECK(cudaMemcpy(h_grad_weights1_naive, d_grad_weights1_naive, HIDDEN_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 287 | CUDA_CHECK(cudaMemcpy(h_grad_weights2_naive, d_grad_weights2_naive,
OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 288 | CUDA_CHECK(cudaMemcpy(h_grad_bias1_naive, d_grad_bias1_naive, HIDDEN_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 289 | CUDA_CHECK(cudaMemcpy(h_grad_bias2_naive, d_grad_bias2_naive, OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 290 | 291 | CUDA_CHECK(cudaMemcpy(h_grad_weights1_cublas, d_grad_weights1_cublas, HIDDEN_SIZE * INPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 292 | CUDA_CHECK(cudaMemcpy(h_grad_weights2_cublas, d_grad_weights2_cublas, OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 293 | CUDA_CHECK(cudaMemcpy(h_grad_bias1_cublas, d_grad_bias1_cublas, HIDDEN_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 294 | CUDA_CHECK(cudaMemcpy(h_grad_bias2_cublas, d_grad_bias2_cublas, OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 295 | 296 | CUDA_CHECK(cudaMemcpy(h_grad_output_naive, d_grad_output_naive, BATCH_SIZE * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 297 | CUDA_CHECK(cudaMemcpy(h_grad_output_cublas, d_grad_output, BATCH_SIZE * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost)); 298 | 299 | // Compare and print results 300 | print_comparison("grad_output", h_grad_output_naive, h_grad_output_cublas, BATCH_SIZE * OUTPUT_SIZE); 301 | print_comparison("grad_weights2", h_grad_weights2_naive, h_grad_weights2_cublas, OUTPUT_SIZE * HIDDEN_SIZE); 302 | 303 | // print indices of x > 1e-2 here (h_grad_weights2_naive, h_grad_weights2_cublas): 304 | for (int i = 0; i < OUTPUT_SIZE * HIDDEN_SIZE; i += 16) { 305 | if (fabsf(h_grad_weights2_naive[i] - h_grad_weights2_cublas[i]) > 1e-3) { 306 | printf("Index %d: %.6f vs %.6f\n", i, h_grad_weights2_naive[i], h_grad_weights2_cublas[i]); 307 | } 308 | } 309 | 310 | print_comparison("grad_bias2", h_grad_bias2_naive, h_grad_bias2_cublas, OUTPUT_SIZE); 311 | print_comparison("grad_bias2", h_grad_bias2_naive, h_grad_bias2_cublas, OUTPUT_SIZE); 312 | print_comparison("grad_weights1", h_grad_weights1_naive, h_grad_weights1_cublas, HIDDEN_SIZE * INPUT_SIZE); 313 | print_comparison("grad_bias1", h_grad_bias1_naive, h_grad_bias1_cublas, HIDDEN_SIZE); 314 | 315 | // Free memory 316 | cudaFreeHost(h_input); 317 | cudaFreeHost(h_hidden); 318 | cudaFreeHost(h_output); 319 | cudaFreeHost(h_weights1); 320 | cudaFreeHost(h_weights2); 321 | cudaFreeHost(h_labels); 322 | cudaFreeHost(h_grad_weights1_naive); 323 | cudaFreeHost(h_grad_weights2_naive); 324 | cudaFreeHost(h_grad_bias1_naive); 325 | cudaFreeHost(h_grad_bias2_naive); 326 | cudaFreeHost(h_grad_weights1_cublas); 327 | cudaFreeHost(h_grad_weights2_cublas); 328 | cudaFreeHost(h_grad_bias1_cublas); 329 | cudaFreeHost(h_grad_bias2_cublas); 330 | cudaFreeHost(h_grad_output_naive); 331 | cudaFreeHost(h_grad_output_cublas); 332 | 333 | cudaFree(d_input); 334 | cudaFree(d_hidden); 335 | cudaFree(d_output); 336 | cudaFree(d_weights1); 337 | cudaFree(d_weights2); 338 | cudaFree(d_labels); 339 | cudaFree(d_grad_weights1_naive); 340 | cudaFree(d_grad_weights2_naive); 341 | cudaFree(d_grad_bias1_naive); 342 | cudaFree(d_grad_bias2_naive); 343 | cudaFree(d_grad_weights1_cublas); 344 | cudaFree(d_grad_weights2_cublas); 345 | cudaFree(d_grad_bias1_cublas); 346 | cudaFree(d_grad_bias2_cublas); 347 | cudaFree(d_grad_output); 348 | cudaFree(d_grad_hidden); 349 | cudaFree(d_ones); 350 | cudaFree(d_grad_output_naive); 351 | 352 | return 0; 353 | } 354 | -------------------------------------------------------------------------------- /downloader.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from torchvision import datasets, transforms 5 | 6 | # Set the directory where you want to save the files 7 | save_dir = "mnist_data" 8 | os.makedirs(save_dir, exist_ok=True) 9 | 10 | # Download and load the MNIST dataset 11 | transform = transforms.Compose([transforms.ToTensor()]) 12 | mnist_train = datasets.MNIST( 13 | root="./data", train=True, download=True, transform=transform 14 | ) 15 | mnist_test = datasets.MNIST( 16 | root="./data", train=False, download=True, transform=transform 17 | ) 18 | 19 | # Convert to numpy arrays and normalize 20 | X_train = mnist_train.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0 21 | y_train = mnist_train.targets.numpy().astype(np.int32) 22 | X_test = mnist_test.data.numpy().reshape(-1, 28 * 28).astype(np.float32) / 255.0 23 | y_test = mnist_test.targets.numpy().astype(np.int32) 24 | 25 | # Save the data as raw binary files 26 | X_train.tofile(os.path.join(save_dir, "X_train.bin")) 27 | y_train.tofile(os.path.join(save_dir, "y_train.bin")) 28 | X_test.tofile(os.path.join(save_dir, "X_test.bin")) 29 | y_test.tofile(os.path.join(save_dir, "y_test.bin")) 30 | 31 | # Save metadata 32 | with open(os.path.join(save_dir, "metadata.txt"), "w") as f: 33 | f.write(f"Training samples: {X_train.shape[0]}\n") 34 | f.write(f"Test samples: {X_test.shape[0]}\n") 35 | f.write(f"Input dimensions: {X_train.shape[1]}\n") 36 | f.write(f"Number of classes: {len(np.unique(y_train))}\n") 37 | 38 | print("MNIST dataset has been downloaded and saved in binary format.") 39 | -------------------------------------------------------------------------------- /naive-cpu/v1.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #define INPUT_SIZE 784 8 | #define HIDDEN_SIZE 1 9 | #define OUTPUT_SIZE 10 10 | #define TRAIN_SIZE 10000 11 | #define TEST_SIZE 1000 12 | #define BATCH_SIZE 4 13 | #define EPOCHS 10 14 | #define LEARNING_RATE 0.001 15 | 16 | typedef struct { 17 | float *weights1; 18 | float *weights2; 19 | float *bias1; 20 | float *bias2; 21 | float *grad_weights1; 22 | float *grad_weights2; 23 | float *grad_bias1; 24 | float *grad_bias2; 25 | } NeuralNetwork; 26 | 27 | 28 | // load batched img data 29 | void load_data(const char *filename, float *data, int size) { 30 | FILE *file = fopen(filename, "rb"); 31 | if (file == NULL) { 32 | fprintf(stderr, "Error opening file: %s\n", filename); 33 | exit(1); 34 | } 35 | size_t read_size = fread(data, sizeof(float), size, file); 36 | if (read_size != size) { 37 | fprintf(stderr, "Error reading data: expected %d elements, got %zu\n", size, read_size); 38 | exit(1); 39 | } 40 | fclose(file); 41 | } 42 | 43 | // load batch labels 44 | void load_labels(const char *filename, int *labels, int size) { 45 | FILE *file = fopen(filename, "rb"); 46 | if (file == NULL) { 47 | fprintf(stderr, "Error opening file: %s\n", filename); 48 | exit(1); 49 | } 50 | size_t read_size = fread(labels, sizeof(int), size, file); 51 | if (read_size != size) { 52 | fprintf(stderr, "Error reading labels: expected %d elements, got %zu\n", size, read_size); 53 | exit(1); 54 | } 55 | fclose(file); 56 | } 57 | 58 | // kaiming init func for weights 59 | void initialize_weights(float *weights, int size) { 60 | float scale = sqrtf(2.0f / size); 61 | for (int i = 0; i < size; i++) { 62 | weights[i] = ((float)rand() / RAND_MAX) * 
scale - (scale / 2.0f); 63 | } 64 | } 65 | 66 | // basic init for biases 67 | void initialize_bias(float *bias, int size) { 68 | for (int i = 0; i < size; i++) { 69 | bias[i] = 0.0f; 70 | } 71 | } 72 | 73 | // Modify softmax to work with batches 74 | void softmax(float *x, int batch_size, int size) { 75 | for (int b = 0; b < batch_size; b++) { 76 | float max = x[b * size]; 77 | for (int i = 1; i < size; i++) { 78 | if (x[b * size + i] > max) max = x[b * size + i]; 79 | } 80 | float sum = 0.0f; 81 | for (int i = 0; i < size; i++) { 82 | x[b * size + i] = expf(x[b * size + i] - max); 83 | sum += x[b * size + i]; 84 | } 85 | for (int i = 0; i < size; i++) { 86 | x[b * size + i] = fmaxf(x[b * size + i] / sum, 1e-7f); 87 | } 88 | } 89 | } 90 | 91 | void matmul_a_b(float *A, float *B, float *C, int m, int n, int k) { 92 | for (int i = 0; i < m; i++) { 93 | for (int j = 0; j < k; j++) { 94 | C[i * k + j] = 0.0f; 95 | for (int l = 0; l < n; l++) { 96 | C[i * k + j] += A[i * n + l] * B[l * k + j]; 97 | } 98 | } 99 | } 100 | } 101 | 102 | // Matrix multiplication A @ B.T 103 | void matmul_a_bt(float *A, float *B, float *C, int m, int n, int k) { 104 | for (int i = 0; i < m; i++) { 105 | for (int j = 0; j < k; j++) { 106 | C[i * k + j] = 0.0f; 107 | for (int l = 0; l < n; l++) { 108 | C[i * k + j] += A[i * n + l] * B[j * n + l]; 109 | } 110 | } 111 | } 112 | } 113 | 114 | // Matrix multiplication A.T @ B 115 | void matmul_at_b(float *A, float *B, float *C, int m, int n, int k) { 116 | for (int i = 0; i < n; i++) { 117 | for (int j = 0; j < k; j++) { 118 | C[i * k + j] = 0.0f; 119 | for (int l = 0; l < m; l++) { 120 | C[i * k + j] += A[l * n + i] * B[l * k + j]; 121 | } 122 | } 123 | } 124 | } 125 | 126 | // ReLU forward 127 | void relu_forward(float *x, int size) { 128 | for (int i = 0; i < size; i++) { 129 | x[i] = fmaxf(0.0f, x[i]); 130 | } 131 | } 132 | 133 | // Add bias 134 | void bias_forward(float *x, float *bias, int batch_size, int size) { 135 | for (int b = 0; b < batch_size; b++) { 136 | for (int i = 0; i < size; i++) { 137 | x[b * size + i] += bias[i]; 138 | } 139 | } 140 | } 141 | 142 | // Modified forward function 143 | void forward(NeuralNetwork *nn, float *input, float *hidden, float *output, int batch_size) { 144 | // Input to Hidden (X @ W1) 145 | matmul_a_b(input, nn->weights1, hidden, batch_size, INPUT_SIZE, HIDDEN_SIZE); 146 | 147 | // Add bias1 148 | bias_forward(hidden, nn->bias1, batch_size, HIDDEN_SIZE); 149 | 150 | // Apply ReLU 151 | relu_forward(hidden, batch_size * HIDDEN_SIZE); 152 | 153 | // Hidden to Output (Hidden @ W2) 154 | matmul_a_b(hidden, nn->weights2, output, batch_size, HIDDEN_SIZE, OUTPUT_SIZE); 155 | 156 | // Add bias2 157 | bias_forward(output, nn->bias2, batch_size, OUTPUT_SIZE); 158 | 159 | // Apply softmax 160 | softmax(output, batch_size, OUTPUT_SIZE); 161 | } 162 | 163 | // Modify cross_entropy_loss to work with batches 164 | float cross_entropy_loss(float *output, int *labels, int batch_size) { 165 | float total_loss = 0.0f; 166 | for (int b = 0; b < batch_size; b++) { 167 | total_loss -= logf(fmaxf(output[b * OUTPUT_SIZE + labels[b]], 1e-7f)); 168 | } 169 | return total_loss / batch_size; 170 | } 171 | 172 | 173 | // Zero out gradients 174 | void zero_grad(float *grad, int size) { 175 | memset(grad, 0, size * sizeof(float)); 176 | } 177 | 178 | // ReLU backward 179 | void relu_backward(float *grad, float *x, int size) { 180 | for (int i = 0; i < size; i++) { 181 | grad[i] *= (x[i] > 0); 182 | } 183 | } 184 | 185 | // Bias backward 186 | void 
bias_backward(float *grad_bias, float *grad, int batch_size, int size) { 187 | for (int i = 0; i < size; i++) { 188 | grad_bias[i] = 0.0f; 189 | for (int b = 0; b < batch_size; b++) { 190 | grad_bias[i] += grad[b * size + i]; 191 | } 192 | } 193 | } 194 | 195 | // Compute gradients for output layer 196 | void compute_output_gradients(float *grad_output, float *output, int *labels, int batch_size) { 197 | for (int b = 0; b < batch_size; b++) { 198 | for (int i = 0; i < OUTPUT_SIZE; i++) { 199 | grad_output[b * OUTPUT_SIZE + i] = output[b * OUTPUT_SIZE + i]; 200 | } 201 | grad_output[b * OUTPUT_SIZE + labels[b]] -= 1.0f; 202 | } 203 | } 204 | 205 | // Update gradients for weights and biases 206 | void update_gradients(float *grad_weights, float *grad_bias, float *grad_layer, float *prev_layer, int batch_size, int prev_size, int curr_size) { 207 | for (int i = 0; i < curr_size; i++) { 208 | for (int j = 0; j < prev_size; j++) { 209 | for (int b = 0; b < batch_size; b++) { 210 | grad_weights[i * prev_size + j] += grad_layer[b * curr_size + i] * prev_layer[b * prev_size + j]; 211 | } 212 | } 213 | for (int b = 0; b < batch_size; b++) { 214 | grad_bias[i] += grad_layer[b * curr_size + i]; 215 | } 216 | } 217 | } 218 | 219 | // Backward pass function 220 | void backward(NeuralNetwork *nn, float *input, float *hidden, float *output, int *labels, int batch_size) { 221 | 222 | // Initialize gradients to zero 223 | zero_grad(nn->grad_weights1, HIDDEN_SIZE * INPUT_SIZE); 224 | zero_grad(nn->grad_weights2, OUTPUT_SIZE * HIDDEN_SIZE); 225 | zero_grad(nn->grad_bias1, HIDDEN_SIZE); 226 | zero_grad(nn->grad_bias2, OUTPUT_SIZE); 227 | 228 | // Compute gradients for output layer 229 | float *grad_output = malloc(batch_size * OUTPUT_SIZE * sizeof(float)); 230 | compute_output_gradients(grad_output, output, labels, batch_size); 231 | 232 | // Update gradients for weights2 (W2.grad = grad_output.T @ hidden) 233 | matmul_at_b(hidden, grad_output, nn->grad_weights2, batch_size, HIDDEN_SIZE, OUTPUT_SIZE); 234 | 235 | // Update gradients for bias2 236 | bias_backward(nn->grad_bias2, grad_output, batch_size, OUTPUT_SIZE); 237 | 238 | // Compute dX2 (gradient of loss w.r.t. 
input of second layer) 239 | float *dX2 = malloc(batch_size * HIDDEN_SIZE * sizeof(float)); 240 | 241 | // grad_output @ W2.T = dX2 -> (B, 10) @ (10, 256) = (B, 256) 242 | matmul_a_bt(grad_output, nn->weights2, dX2, batch_size, OUTPUT_SIZE, HIDDEN_SIZE); 243 | 244 | // Compute d_ReLU_out (element-wise multiplication with ReLU derivative) 245 | float *d_ReLU_out = malloc(batch_size * HIDDEN_SIZE * sizeof(float)); 246 | for (int i = 0; i < batch_size * HIDDEN_SIZE; i++) { 247 | d_ReLU_out[i] = dX2[i] * (hidden[i] > 0); 248 | } 249 | // retains its shape since its just a point-wise operation 250 | // Update gradients for weights1 (W1.grad = d_ReLU_out.T @ input) 251 | matmul_at_b(input, d_ReLU_out, nn->grad_weights1, batch_size, INPUT_SIZE, HIDDEN_SIZE); 252 | 253 | // Update gradients for bias1 254 | bias_backward(nn->grad_bias1, d_ReLU_out, batch_size, HIDDEN_SIZE); 255 | 256 | // Free allocated memory 257 | free(grad_output); 258 | free(dX2); 259 | free(d_ReLU_out); 260 | } 261 | 262 | // gradient descent step 263 | void update_weights(NeuralNetwork *nn) { 264 | for (int i = 0; i < HIDDEN_SIZE * INPUT_SIZE; i++) { 265 | nn->weights1[i] -= LEARNING_RATE * nn->grad_weights1[i]; 266 | } 267 | for (int i = 0; i < OUTPUT_SIZE * HIDDEN_SIZE; i++) { 268 | nn->weights2[i] -= LEARNING_RATE * nn->grad_weights2[i]; 269 | } 270 | for (int i = 0; i < HIDDEN_SIZE; i++) { 271 | nn->bias1[i] -= LEARNING_RATE * nn->grad_bias1[i]; 272 | } 273 | for (int i = 0; i < OUTPUT_SIZE; i++) { 274 | nn->bias2[i] -= LEARNING_RATE * nn->grad_bias2[i]; 275 | } 276 | } 277 | 278 | // Modify train function to work with batches 279 | void train(NeuralNetwork *nn, float *X_train, int *y_train) { 280 | float *hidden = malloc(BATCH_SIZE * HIDDEN_SIZE * sizeof(float)); 281 | float *output = malloc(BATCH_SIZE * OUTPUT_SIZE * sizeof(float)); 282 | 283 | int num_batches = TRAIN_SIZE / BATCH_SIZE; 284 | 285 | for (int epoch = 0; epoch < EPOCHS; epoch++) { 286 | float total_loss = 0.0f; 287 | int correct = 0; 288 | 289 | for (int batch = 0; batch < num_batches; batch++) { 290 | int start_idx = batch * BATCH_SIZE; 291 | 292 | forward(nn, &X_train[start_idx * INPUT_SIZE], hidden, output, BATCH_SIZE); 293 | 294 | float loss = cross_entropy_loss(output, &y_train[start_idx], BATCH_SIZE); 295 | total_loss += loss; 296 | 297 | for (int i = 0; i < BATCH_SIZE; i++) { 298 | int predicted = 0; 299 | for (int j = 1; j < OUTPUT_SIZE; j++) { 300 | if (output[i * OUTPUT_SIZE + j] > output[i * OUTPUT_SIZE + predicted]) { 301 | predicted = j; 302 | } 303 | } 304 | if (predicted == y_train[start_idx + i]) { 305 | correct++; 306 | } 307 | } 308 | 309 | backward(nn, &X_train[start_idx * INPUT_SIZE], hidden, output, &y_train[start_idx], BATCH_SIZE); 310 | update_weights(nn); 311 | 312 | if ((batch + 1) % 100 == 0 || (epoch == 0 && batch == 0)) { 313 | printf("Epoch %d/%d, Iter %d/%d, Loss: %.4f, Accuracy: %.2f%%\n", 314 | epoch + 1, EPOCHS, batch + 1, num_batches, total_loss / (batch + 1), 315 | 100.0f * correct / ((batch + 1) * BATCH_SIZE)); 316 | } 317 | } 318 | 319 | printf("Epoch %d/%d completed, Loss: %.4f, Accuracy: %.2f%%\n", 320 | epoch + 1, EPOCHS, total_loss / num_batches, 100.0f * correct / TRAIN_SIZE); 321 | } 322 | 323 | free(hidden); 324 | free(output); 325 | } 326 | 327 | // Modify the initialize function to allocate memory for gradients 328 | void initialize_neural_network(NeuralNetwork *nn) { 329 | nn->weights1 = malloc(HIDDEN_SIZE * INPUT_SIZE * sizeof(float)); 330 | nn->weights2 = malloc(OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)); 
331 | nn->bias1 = malloc(HIDDEN_SIZE * sizeof(float)); 332 | nn->bias2 = malloc(OUTPUT_SIZE * sizeof(float)); 333 | nn->grad_weights1 = malloc(HIDDEN_SIZE * INPUT_SIZE * sizeof(float)); 334 | nn->grad_weights2 = malloc(OUTPUT_SIZE * HIDDEN_SIZE * sizeof(float)); 335 | nn->grad_bias1 = malloc(HIDDEN_SIZE * sizeof(float)); 336 | nn->grad_bias2 = malloc(OUTPUT_SIZE * sizeof(float)); 337 | 338 | initialize_weights(nn->weights1, HIDDEN_SIZE * INPUT_SIZE); 339 | initialize_weights(nn->weights2, OUTPUT_SIZE * HIDDEN_SIZE); 340 | initialize_bias(nn->bias1, HIDDEN_SIZE); 341 | initialize_bias(nn->bias2, OUTPUT_SIZE); 342 | } 343 | 344 | int main() { 345 | srand(time(NULL)); 346 | 347 | NeuralNetwork nn; 348 | initialize_neural_network(&nn); 349 | 350 | float *X_train = malloc(TRAIN_SIZE * INPUT_SIZE * sizeof(float)); 351 | int *y_train = malloc(TRAIN_SIZE * sizeof(int)); 352 | float *X_test = malloc(TEST_SIZE * INPUT_SIZE * sizeof(float)); 353 | int *y_test = malloc(TEST_SIZE * sizeof(int)); 354 | 355 | load_data("../mnist_data/X_train.bin", X_train, TRAIN_SIZE * INPUT_SIZE); 356 | load_labels("../mnist_data/y_train.bin", y_train, TRAIN_SIZE); 357 | load_data("../mnist_data/X_test.bin", X_test, TEST_SIZE * INPUT_SIZE); 358 | load_labels("../mnist_data/y_test.bin", y_test, TEST_SIZE); 359 | 360 | 361 | // print first image in the terminal 362 | for (int i = 0; i < 28; i++) { 363 | for (int j = 0; j < 28; j++) { 364 | if (X_train[0 * INPUT_SIZE + i * 28 + j] > 0.0f) { 365 | printf("X"); 366 | } else { 367 | printf(" "); 368 | } 369 | } 370 | printf("\n"); 371 | } 372 | 373 | printf("First 10 training labels: "); 374 | for (int i = 0; i < 10; i++) { 375 | printf("%d ", y_train[i]); 376 | } 377 | printf("\n"); 378 | 379 | train(&nn, X_train, y_train); 380 | 381 | free(nn.weights1); 382 | free(nn.weights2); 383 | free(nn.bias1); 384 | free(nn.bias2); 385 | free(nn.grad_weights1); 386 | free(nn.grad_weights2); 387 | free(nn.grad_bias1); 388 | free(nn.grad_bias2); 389 | free(X_train); 390 | free(y_train); 391 | free(X_test); 392 | free(y_test); 393 | 394 | return 0; 395 | } 396 | -------------------------------------------------------------------------------- /python/c-friendly.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torchvision import datasets, transforms 3 | 4 | # Load and preprocess the data 5 | transform = transforms.Compose([transforms.ToTensor()]) 6 | mnist_train = datasets.MNIST(root='mnist_data', train=True, download=True, transform=transform) 7 | mnist_test = datasets.MNIST(root='mnist_data', train=False, download=True, transform=transform) 8 | 9 | X_train = mnist_train.data.numpy().reshape(-1, 1, 28, 28)[:10000] / 255.0 10 | y_train = mnist_train.targets.numpy()[:10000] 11 | X_test = mnist_test.data.numpy().reshape(-1, 1, 28, 28) / 255.0 12 | y_test = mnist_test.targets.numpy() 13 | 14 | # print the shapes of the data 15 | print(X_train.shape, y_train.shape) 16 | print(X_test.shape, y_test.shape) 17 | # Activation functions 18 | def relu(x): 19 | return np.maximum(0, x) 20 | 21 | def relu_derivative(x): 22 | return (x > 0).astype(float) 23 | 24 | # Linear layer 25 | def initialize_weights(input_size, output_size): 26 | return np.random.randn(input_size, output_size) * np.sqrt(2.0 / input_size) 27 | 28 | def initialize_bias(output_size): 29 | return np.zeros((1, output_size)) 30 | 31 | def linear_forward(x, weights, bias): 32 | return x @ weights + bias 33 | 34 | def linear_backward(grad_output, x, weights): 35 | 
grad_weights = x.T @ grad_output 36 | grad_bias = np.sum(grad_output, axis=0, keepdims=True) 37 | grad_input = grad_output @ weights.T 38 | return grad_input, grad_weights, grad_bias 39 | 40 | # Softmax and Cross-Entropy Loss 41 | def softmax(x): 42 | exp_x = np.exp(x - np.max(x, axis=1, keepdims=True)) 43 | return exp_x / np.sum(exp_x, axis=1, keepdims=True) 44 | 45 | def cross_entropy_loss(y_pred, y_true): 46 | batch_size = y_pred.shape[0] 47 | probabilities = softmax(y_pred) 48 | correct_log_probs = np.log(probabilities[np.arange(batch_size), y_true]) 49 | loss = -np.sum(correct_log_probs) / batch_size 50 | return loss 51 | 52 | class NeuralNetwork: 53 | def __init__(self, input_size, hidden_size, output_size): 54 | self.weights1 = initialize_weights(input_size, hidden_size) 55 | self.bias1 = initialize_bias(hidden_size) 56 | self.weights2 = initialize_weights(hidden_size, output_size) 57 | self.bias2 = initialize_bias(output_size) 58 | 59 | def forward(self, x): 60 | batch_size = x.shape[0] 61 | fc1_input = x.reshape(batch_size, -1) 62 | fc1_output = linear_forward(fc1_input, self.weights1, self.bias1) 63 | relu_output = relu(fc1_output) 64 | fc2_output = linear_forward(relu_output, self.weights2, self.bias2) 65 | return fc2_output, (fc1_input, fc1_output, relu_output) 66 | 67 | def backward(self, grad_output, cache): 68 | x, fc1_output, relu_output = cache 69 | 70 | grad_fc2, grad_weights2, grad_bias2 = linear_backward(grad_output, relu_output, self.weights2) 71 | grad_relu = grad_fc2 * relu_derivative(fc1_output) 72 | grad_fc1, grad_weights1, grad_bias1 = linear_backward(grad_relu, x, self.weights1) 73 | return grad_weights1, grad_bias1, grad_weights2, grad_bias2 74 | 75 | def update_weights(self, grad_weights1, grad_bias1, grad_weights2, grad_bias2, learning_rate): 76 | self.weights1 -= learning_rate * grad_weights1 77 | self.bias1 -= learning_rate * grad_bias1 78 | self.weights2 -= learning_rate * grad_weights2 79 | self.bias2 -= learning_rate * grad_bias2 80 | 81 | def train(model, X_train, y_train, X_test, y_test, batch_size, epochs, learning_rate): 82 | for epoch in range(epochs): 83 | print(f"Epoch {epoch+1}/{epochs}") 84 | for i in range(0, len(X_train), batch_size): 85 | batch_X = X_train[i:i+batch_size] 86 | batch_y = y_train[i:i+batch_size] 87 | y_pred, cache = model.forward(batch_X) 88 | loss = cross_entropy_loss(y_pred, batch_y) 89 | 90 | softmax_probs = softmax(y_pred) 91 | y_true_one_hot = np.zeros_like(y_pred) 92 | y_true_one_hot[np.arange(len(batch_y)), batch_y] = 1 93 | grad_output = softmax_probs - y_true_one_hot 94 | 95 | grad_weights1, grad_bias1, grad_weights2, grad_bias2 = model.backward(grad_output, cache) 96 | model.update_weights(grad_weights1, grad_bias1, grad_weights2, grad_bias2, learning_rate) 97 | 98 | if (i//batch_size) % 100 == 0: 99 | print(f"Iteration: {i//batch_size} Loss: {loss:.4f}") 100 | 101 | y_pred, _ = model.forward(X_test) 102 | test_loss = cross_entropy_loss(y_pred, y_test) 103 | accuracy = np.mean(np.argmax(y_pred, axis=1) == y_test) 104 | print(f"Epoch {epoch+1} - Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.4f}") 105 | 106 | print("Training completed!") 107 | 108 | if __name__ == "__main__": 109 | input_size = 784 # 28x28 pixels 110 | hidden_size = 256 111 | output_size = 10 # 10 digits 112 | 113 | model = NeuralNetwork(input_size, hidden_size, output_size) 114 | 115 | batch_size = 4 116 | epochs = 3 117 | learning_rate = 0.001 118 | 119 | train(model, X_train, y_train, X_test, y_test, batch_size, epochs, learning_rate) 
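
The NumPy backward pass above is the same math the C version implements by hand: `grad_weights2 = relu_output.T @ grad_output`, `grad_weights1 = x.T @ grad_relu`, and each bias gradient is the column-sum of its layer's gradient. A quick way to build confidence in these hand-derived gradients is to compare them against PyTorch autograd on a tiny batch. The sketch below is illustrative only: it assumes the `NeuralNetwork` class and `softmax` helper defined above are in scope, and the batch values, hidden size, seed, and tolerance are arbitrary choices, not values from this repo.

```python
# Hedged sketch: compare the hand-written NumPy gradients above against
# PyTorch autograd on a tiny random batch.
import numpy as np
import torch

np.random.seed(0)

# Tiny model and batch; sizes are arbitrary, chosen only to keep the check fast.
net = NeuralNetwork(input_size=784, hidden_size=16, output_size=10)
x = np.random.randn(4, 784)
y = np.array([3, 1, 4, 1])

# Hand-written forward/backward from this file.
logits, cache = net.forward(x)
probs = softmax(logits)
one_hot = np.zeros_like(logits)
one_hot[np.arange(len(y)), y] = 1
gw1, gb1, gw2, gb2 = net.backward(probs - one_hot, cache)

# Same computation via autograd. train() uses the summed (not mean) loss
# gradient (probs - one_hot per row, no 1/B factor), so reduction="sum"
# makes the two sides comparable.
w1 = torch.tensor(net.weights1, requires_grad=True)
b1 = torch.tensor(net.bias1, requires_grad=True)
w2 = torch.tensor(net.weights2, requires_grad=True)
b2 = torch.tensor(net.bias2, requires_grad=True)
logits_t = torch.relu(torch.tensor(x) @ w1 + b1) @ w2 + b2
loss = torch.nn.functional.cross_entropy(logits_t, torch.tensor(y, dtype=torch.long), reduction="sum")
loss.backward()

for name, manual, auto in [("weights1", gw1, w1.grad), ("bias1", gb1, b1.grad),
                           ("weights2", gw2, w2.grad), ("bias2", gb2, b2.grad)]:
    print(name, "match:", np.allclose(manual, auto.numpy(), atol=1e-6))
```

The same check carries over to `naive-cpu/v1.c` and the CUDA kernels, since they compute identical quantities; only the tolerance needs loosening there because those paths run in float32.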
-------------------------------------------------------------------------------- /python/torch_reference.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import torch\n", 10 | "import torch.nn as nn\n", 11 | "import torch.nn.functional as F\n", 12 | "import torch.optim as optim\n", 13 | "from torch.utils.data import DataLoader\n", 14 | "from torchvision import datasets, transforms\n", 15 | "import numpy as np\n", 16 | "import time" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "Train Data Shape: torch.Size([60000, 1, 28, 28])\n", 29 | "Train Data Type: torch.float32\n", 30 | "Test Data Shape: torch.Size([10000, 1, 28, 28])\n", 31 | "Test Data Type: torch.float32\n", 32 | "Iters per epoch: 937\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "batch_size = 64\n", 38 | "# learning_rate = 0.01\n", 39 | "num_epochs = 5\n", 40 | "data_dir = '../../../data'\n", 41 | "\n", 42 | "torch.set_float32_matmul_precision('high')\n", 43 | "\n", 44 | "# MNIST Dataset\n", 45 | "transform = transforms.Compose([\n", 46 | " transforms.ToTensor(),\n", 47 | " transforms.Normalize((0.1307,), (0.3081,)) # Mean and std of MNIST\n", 48 | "])\n", 49 | "\n", 50 | "\n", 51 | "train_dataset = datasets.MNIST(root=data_dir, train=True, transform=transform, download=True)\n", 52 | "test_dataset = datasets.MNIST(root=data_dir, train=False, transform=transform, download=True)\n", 53 | "\n", 54 | "train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)\n", 55 | "test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)\n", 56 | "\n", 57 | "# Pre-allocate tensors of the appropriate size\n", 58 | "train_data = torch.zeros(len(train_dataset), 1, 28, 28)\n", 59 | "train_labels = torch.zeros(len(train_dataset), dtype=torch.long)\n", 60 | "test_data = torch.zeros(len(test_dataset), 1, 28, 28)\n", 61 | "test_labels = torch.zeros(len(test_dataset), dtype=torch.long)\n", 62 | "\n", 63 | "# Load all training data into RAM\n", 64 | "for idx, (data, label) in enumerate(train_loader):\n", 65 | " start_idx = idx * batch_size\n", 66 | " end_idx = start_idx + data.size(0)\n", 67 | " train_data[start_idx:end_idx] = data\n", 68 | " train_labels[start_idx:end_idx] = label\n", 69 | "\n", 70 | "print('Train Data Shape:', train_data.shape)\n", 71 | "print('Train Data Type:', train_data.dtype)\n", 72 | "\n", 73 | "# Load all test data into RAM\n", 74 | "for idx, (data, label) in enumerate(test_loader):\n", 75 | " start_idx = idx * batch_size\n", 76 | " end_idx = start_idx + data.size(0)\n", 77 | " test_data[start_idx:end_idx] = data\n", 78 | " test_labels[start_idx:end_idx] = label\n", 79 | "\n", 80 | "print('Test Data Shape:', test_data.shape)\n", 81 | "print('Test Data Type:', test_data.dtype)\n", 82 | "\n", 83 | "iters_per_epoch = len(train_dataset) // batch_size\n", 84 | "print('Iters per epoch:', iters_per_epoch)\n" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 3, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "learning_rate = 1e-3\n", 94 | "batch_size = 16\n", 95 | "\n", 96 | "class MLP(nn.Module):\n", 97 | " def __init__(self, in_features, hidden_features, num_classes):\n", 98 | " super(MLP, self).__init__()\n", 99 | " self.fc1 
= nn.Linear(in_features, hidden_features)\n", 100 | " self.relu = nn.ReLU()\n", 101 | " self.fc2 = nn.Linear(hidden_features, num_classes)\n", 102 | "\n", 103 | " def forward(self, x):\n", 104 | " x = x.reshape(batch_size, 28*28)\n", 105 | " x = self.fc1(x)\n", 106 | " x = self.relu(x)\n", 107 | " x = self.fc2(x)\n", 108 | " return x\n", 109 | " \n", 110 | "model = MLP(in_features=784, hidden_features=256, num_classes=10).to('cuda')\n", 111 | "# model = torch.compile(model)\n", 112 | "criterion = nn.CrossEntropyLoss()\n", 113 | "optimizer = optim.SGD(model.parameters(), lr=learning_rate)\n" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 4, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Epoch: 1, Iter: 1, Loss: 2.3299460411071777\n", 126 | "Iteration Time: 85.8350 ms\n", 127 | "Epoch: 1, Iter: 100, Loss: 2.140476703643799\n", 128 | "Iteration Time: 0.4425 ms\n", 129 | "Epoch: 1, Iter: 200, Loss: 2.0235793590545654\n", 130 | "Iteration Time: 0.4423 ms\n", 131 | "Epoch: 1, Iter: 300, Loss: 1.7592310905456543\n", 132 | "Iteration Time: 0.4220 ms\n", 133 | "Epoch: 1, Iter: 400, Loss: 1.6951887607574463\n", 134 | "Iteration Time: 0.4134 ms\n", 135 | "Epoch: 1, Iter: 500, Loss: 1.3808064460754395\n", 136 | "Iteration Time: 0.4227 ms\n", 137 | "Epoch: 1, Iter: 600, Loss: 1.2386987209320068\n", 138 | "Iteration Time: 0.4241 ms\n", 139 | "Epoch: 1, Iter: 700, Loss: 1.2353482246398926\n", 140 | "Iteration Time: 0.4146 ms\n", 141 | "Epoch: 1, Iter: 800, Loss: 1.1316126585006714\n", 142 | "Iteration Time: 0.4673 ms\n", 143 | "Epoch: 1, Iter: 900, Loss: 0.9632489681243896\n", 144 | "Iteration Time: 0.4680 ms\n", 145 | "Average Batch Accuracy: 81.23%\n", 146 | "Finished Training\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "\n", 152 | "# epochs = 2\n", 153 | "# Training the model\n", 154 | "def train(model, criterion, optimizer, epoch):\n", 155 | " model.train()\n", 156 | " running_loss = 0.0\n", 157 | "\n", 158 | " for i in range(iters_per_epoch):\n", 159 | " \n", 160 | " optimizer.zero_grad()\n", 161 | " data = train_data[i*batch_size:(i+1)*batch_size].to('cuda')\n", 162 | " target = train_labels[i*batch_size:(i+1)*batch_size].to('cuda')\n", 163 | " start = time.time()\n", 164 | " outputs = model(data)\n", 165 | " loss = criterion(outputs, target)\n", 166 | " loss.backward()\n", 167 | " optimizer.step()\n", 168 | " optimizer.zero_grad()\n", 169 | " end = time.time()\n", 170 | " running_loss += loss.item()\n", 171 | " if i % 100 == 99 or i == 0:\n", 172 | " print(f'Epoch: {epoch+1}, Iter: {i+1}, Loss: {loss}')\n", 173 | " print(f'Iteration Time: {(end - start) * 1e3:.4f} ms')\n", 174 | " running_loss = 0.0\n", 175 | "\n", 176 | "# Evaluation function to report average batch accuracy using the loaded test data\n", 177 | "def evaluate(model, test_data, test_labels):\n", 178 | " device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", 179 | " model.to(device)\n", 180 | " model.eval()\n", 181 | " \n", 182 | " total_batch_accuracy = torch.tensor(0.0, device=device)\n", 183 | " num_batches = 0\n", 184 | " \n", 185 | " with torch.no_grad():\n", 186 | " for i in range(len(test_data) // batch_size):\n", 187 | " data = test_data[i * batch_size: (i + 1) * batch_size].to(device)\n", 188 | " target = test_labels[i * batch_size: (i + 1) * batch_size].to(device)\n", 189 | " outputs = model(data)\n", 190 | " _, predicted = torch.max(outputs.data, 1)\n", 191 | " correct_batch = (predicted 
== target).sum().item()\n", 192 | " total_batch = target.size(0)\n", 193 | " if total_batch != 0: # Check to avoid division by zero\n", 194 | " batch_accuracy = correct_batch / total_batch\n", 195 | " total_batch_accuracy += batch_accuracy\n", 196 | " num_batches += 1\n", 197 | " \n", 198 | " avg_batch_accuracy = total_batch_accuracy / num_batches\n", 199 | " print(f'Average Batch Accuracy: {avg_batch_accuracy * 100:.2f}%')\n", 200 | "\n", 201 | "# Main\n", 202 | "for epoch in range(1):\n", 203 | " train(model, criterion, optimizer, epoch)\n", 204 | " evaluate(model, test_data, test_labels)\n", 205 | " \n", 206 | "print('Finished Training')" 207 | ] 208 | } 209 | ], 210 | "metadata": { 211 | "kernelspec": { 212 | "display_name": "Python 3", 213 | "language": "python", 214 | "name": "python3" 215 | }, 216 | "language_info": { 217 | "codemirror_mode": { 218 | "name": "ipython", 219 | "version": 3 220 | }, 221 | "file_extension": ".py", 222 | "mimetype": "text/x-python", 223 | "name": "python", 224 | "nbconvert_exporter": "python", 225 | "pygments_lexer": "ipython3", 226 | "version": "3.11.7" 227 | } 228 | }, 229 | "nbformat": 4, 230 | "nbformat_minor": 2 231 | } 232 | -------------------------------------------------------------------------------- /python/torch_reference.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.optim as optim 8 | from torch.utils.data import DataLoader 9 | from torchvision import datasets, transforms 10 | 11 | TRAIN_SIZE = 10000 12 | epochs = 3 13 | learning_rate = 1e-3 14 | batch_size = 4 15 | num_epochs = 3 16 | data_dir = "../../../data" 17 | 18 | torch.set_float32_matmul_precision("high") 19 | 20 | # MNIST Dataset 21 | transform = transforms.Compose( 22 | [ 23 | transforms.ToTensor(), 24 | transforms.Normalize((0.1307,), (0.3081,)), # Mean and std of MNIST 25 | ] 26 | ) 27 | 28 | 29 | train_dataset = datasets.MNIST( 30 | root=data_dir, train=True, transform=transform, download=True 31 | ) 32 | test_dataset = datasets.MNIST( 33 | root=data_dir, train=False, transform=transform, download=True 34 | ) 35 | 36 | train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True) 37 | test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False) 38 | 39 | # Pre-allocate tensors of the appropriate size 40 | train_data = torch.zeros(len(train_dataset), 1, 28, 28) 41 | train_labels = torch.zeros(len(train_dataset), dtype=torch.long) 42 | test_data = torch.zeros(len(test_dataset), 1, 28, 28) 43 | test_labels = torch.zeros(len(test_dataset), dtype=torch.long) 44 | 45 | # Load all training data into RAM 46 | for idx, (data, label) in enumerate(train_loader): 47 | start_idx = idx * batch_size 48 | end_idx = start_idx + data.size(0) 49 | train_data[start_idx:end_idx] = data 50 | train_labels[start_idx:end_idx] = label 51 | 52 | print("Train Data Shape:", train_data.shape) 53 | print("Train Data Type:", train_data.dtype) 54 | 55 | # Load all test data into RAM 56 | for idx, (data, label) in enumerate(test_loader): 57 | start_idx = idx * batch_size 58 | end_idx = start_idx + data.size(0) 59 | test_data[start_idx:end_idx] = data 60 | test_labels[start_idx:end_idx] = label 61 | 62 | print("Test Data Shape:", test_data.shape) 63 | print("Test Data Type:", test_data.dtype) 64 | 65 | iters_per_epoch = TRAIN_SIZE // batch_size 66 | print("Iters per epoch:", 
iters_per_epoch)
67 | 
68 | 
69 | class MLP(nn.Module):
70 |     def __init__(self, in_features, hidden_features, num_classes):
71 |         super(MLP, self).__init__()
72 |         self.fc1 = nn.Linear(in_features, hidden_features)
73 |         self.relu = nn.ReLU()
74 |         self.fc2 = nn.Linear(hidden_features, num_classes)
75 | 
76 |     def forward(self, x):
77 |         x = x.reshape(batch_size, 28 * 28)
78 |         x = self.fc1(x)
79 |         x = self.relu(x)
80 |         x = self.fc2(x)
81 |         return x
82 | 
83 | 
84 | model = MLP(in_features=784, hidden_features=256, num_classes=10).to("cuda")
85 | # model = torch.compile(model)
86 | criterion = nn.CrossEntropyLoss()
87 | optimizer = optim.SGD(model.parameters(), lr=learning_rate)
88 | 
89 | 
90 | # Training the model
91 | def train(model, criterion, optimizer, epoch):
92 |     model.train()
93 |     running_loss = 0.0
94 | 
95 |     for i in range(iters_per_epoch):
96 | 
97 |         optimizer.zero_grad()
98 |         data = train_data[i * batch_size : (i + 1) * batch_size].to("cuda")
99 |         target = train_labels[i * batch_size : (i + 1) * batch_size].to("cuda")
100 | 
101 |         start = time.time()
102 |         outputs = model(data)
103 |         loss = criterion(outputs, target)
104 |         loss.backward()
105 |         optimizer.step()
106 |         optimizer.zero_grad()
107 |         end = time.time()
108 |         running_loss += loss.item()
109 |         if i % 100 == 99 or i == 0:
110 |             print(f"Epoch: {epoch+1}, Iter: {i+1}, Loss: {loss}")
111 |             print(f"Iteration Time: {(end - start) * 1e3:.4f} ms")
112 |             running_loss = 0.0
113 | 
114 | 
115 | # Evaluation function to report average batch accuracy using the loaded test data
116 | def evaluate(model, test_data, test_labels):
117 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
118 |     model.to(device)
119 |     model.eval()
120 | 
121 |     total_batch_accuracy = torch.tensor(0.0, device=device)
122 |     num_batches = 0
123 | 
124 |     with torch.no_grad():
125 |         for i in range(len(test_data) // batch_size):
126 |             data = test_data[i * batch_size : (i + 1) * batch_size].to(device)
127 |             target = test_labels[i * batch_size : (i + 1) * batch_size].to(device)
128 |             outputs = model(data)
129 |             _, predicted = torch.max(outputs.data, 1)
130 |             correct_batch = (predicted == target).sum().item()
131 |             total_batch = target.size(0)
132 |             if total_batch != 0:  # Check to avoid division by zero
133 |                 batch_accuracy = correct_batch / total_batch
134 |                 total_batch_accuracy += batch_accuracy
135 |                 num_batches += 1
136 | 
137 |     avg_batch_accuracy = total_batch_accuracy / num_batches
138 |     print(f"Average Batch Accuracy: {avg_batch_accuracy * 100:.2f}%")
139 | 
140 | 
141 | # Main
142 | if __name__ == "__main__":
143 |     for epoch in range(epochs):
144 |         train(model, criterion, optimizer, epoch)
145 |         evaluate(model, test_data, test_labels)
146 | 
147 |     print("Finished Training")
148 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | torchvision
3 | numpy
4 | matplotlib
5 | 
--------------------------------------------------------------------------------
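
As an end-to-end sanity check of the data pipeline, the sketch below reloads the raw `.bin` files written by `downloader.py` and confirms the layout that `load_data` / `load_labels` in `naive-cpu/v1.c` (and the CUDA loaders) assume: float32 pixels flattened to 784 values per image in [0, 1], and int32 labels. The `mnist_data` directory and file names come from `downloader.py`; everything else here is an illustrative assumption, not part of the repo.

```python
# Hedged sketch: verify the binaries written by downloader.py have the layout
# the C/CUDA loaders expect. Run after `python downloader.py`.
import os

import numpy as np

save_dir = "mnist_data"

X_train = np.fromfile(os.path.join(save_dir, "X_train.bin"), dtype=np.float32)
y_train = np.fromfile(os.path.join(save_dir, "y_train.bin"), dtype=np.int32)

assert X_train.size % 784 == 0, "X_train.bin is not a whole number of 784-float images"
X_train = X_train.reshape(-1, 784)

assert X_train.shape[0] == y_train.shape[0], "image / label counts differ"
assert 0.0 <= X_train.min() and X_train.max() <= 1.0, "pixels should be normalized to [0, 1]"

print("images:", X_train.shape, X_train.dtype)
print("labels:", y_train.shape, y_train.dtype, "classes:", np.unique(y_train))
```

The same check applies to `X_test.bin` / `y_test.bin`.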