├── .gitignore
├── GPU_Puzzlers_C++.ipynb
├── GPU_Puzzlers_C++_Answers.ipynb
├── GPU_puzzlers.ipynb
├── GPU_puzzlers.py
├── GPU_puzzlers_exec
│   ├── 1dconv_runner.cu
│   ├── axis_sum_runner.cu
│   ├── blocks_runner.cu
│   ├── broadcast_runner.cu
│   ├── dotproduct_runner.cu
│   ├── guards_runner.cu
│   ├── map2d_block_runner.cu
│   ├── map2d_runner.cu
│   ├── map_runner.cu
│   ├── matmul_runner.cu
│   ├── pooling_runner.cu
│   ├── prefixsum_runner.cu
│   ├── shared_runner.cu
│   └── zip_runner.cu
├── GPU_puzzlers_files
│   ├── GPU_puzzlers_100_1.svg
│   ├── GPU_puzzlers_101_1.svg
│   ├── GPU_puzzlers_102_1.svg
│   ├── GPU_puzzlers_103_1.svg
│   ├── GPU_puzzlers_104_1.svg
│   ├── GPU_puzzlers_105_1.svg
│   ├── GPU_puzzlers_106_1.svg
│   ├── GPU_puzzlers_108_1.svg
│   ├── GPU_puzzlers_12_1.svg
│   ├── GPU_puzzlers_13_1.svg
│   ├── GPU_puzzlers_14_1.svg
│   ├── GPU_puzzlers_15_1.svg
│   ├── GPU_puzzlers_16_1.svg
│   ├── GPU_puzzlers_17_1.svg
│   ├── GPU_puzzlers_18_1.svg
│   ├── GPU_puzzlers_19_1.svg
│   ├── GPU_puzzlers_21_1.svg
│   ├── GPU_puzzlers_22_1.svg
│   ├── GPU_puzzlers_23_1.svg
│   ├── GPU_puzzlers_24_1.svg
│   ├── GPU_puzzlers_26_1.svg
│   ├── GPU_puzzlers_27_1.svg
│   ├── GPU_puzzlers_28_1.svg
│   ├── GPU_puzzlers_29_1.svg
│   ├── GPU_puzzlers_31_1.svg
│   ├── GPU_puzzlers_32_1.svg
│   ├── GPU_puzzlers_33_1.svg
│   ├── GPU_puzzlers_34_1.svg
│   ├── GPU_puzzlers_36_1.svg
│   ├── GPU_puzzlers_37_1.svg
│   ├── GPU_puzzlers_38_1.svg
│   ├── GPU_puzzlers_39_1.svg
│   ├── GPU_puzzlers_40_1.svg
│   ├── GPU_puzzlers_41_1.svg
│   ├── GPU_puzzlers_42_1.svg
│   ├── GPU_puzzlers_43_1.svg
│   ├── GPU_puzzlers_44_1.svg
│   ├── GPU_puzzlers_45_1.svg
│   ├── GPU_puzzlers_46_1.svg
│   ├── GPU_puzzlers_47_1.svg
│   ├── GPU_puzzlers_48_1.svg
│   ├── GPU_puzzlers_49_1.svg
│   ├── GPU_puzzlers_50_1.svg
│   ├── GPU_puzzlers_52_1.svg
│   ├── GPU_puzzlers_53_1.svg
│   ├── GPU_puzzlers_54_1.svg
│   ├── GPU_puzzlers_55_1.svg
│   ├── GPU_puzzlers_56_1.svg
│   ├── GPU_puzzlers_57_1.svg
│   ├── GPU_puzzlers_58_1.svg
│   ├── GPU_puzzlers_59_1.svg
│   ├── GPU_puzzlers_60_1.svg
│   ├── GPU_puzzlers_61_1.svg
│   ├── GPU_puzzlers_62_1.svg
│   ├── GPU_puzzlers_63_1.svg
│   ├── GPU_puzzlers_64_1.svg
│   ├── GPU_puzzlers_66_1.svg
│   ├── GPU_puzzlers_67_1.svg
│   ├── GPU_puzzlers_68_1.svg
│   ├── GPU_puzzlers_69_1.svg
│   ├── GPU_puzzlers_70_1.svg
│   ├── GPU_puzzlers_73_1.svg
│   ├── GPU_puzzlers_74_1.svg
│   ├── GPU_puzzlers_75_1.svg
│   ├── GPU_puzzlers_76_1.svg
│   ├── GPU_puzzlers_77_1.svg
│   ├── GPU_puzzlers_78_1.svg
│   ├── GPU_puzzlers_79_1.svg
│   ├── GPU_puzzlers_84_1.svg
│   ├── GPU_puzzlers_85_1.svg
│   ├── GPU_puzzlers_86_1.svg
│   ├── GPU_puzzlers_87_1.svg
│   ├── GPU_puzzlers_88_1.svg
│   ├── GPU_puzzlers_89_1.svg
│   ├── GPU_puzzlers_90_1.svg
│   ├── GPU_puzzlers_91_1.svg
│   ├── GPU_puzzlers_92_1.svg
│   ├── GPU_puzzlers_93_1.svg
│   ├── GPU_puzzlers_94_1.svg
│   ├── GPU_puzzlers_95_1.svg
│   ├── GPU_puzzlers_97_1.svg
│   ├── GPU_puzzlers_99_1.svg
│   └── robot.png
├── LICENSE
├── README.md
├── cuda.png
├── lib.py
└── robot.png

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/1dconv_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

extern __global__ void Conv1D(float* A, float* B, float* C, int a_size, int b_size);

const int TPB = 8;
const int MAX_CONV = 4;
const int TPB_MAX_CONV = TPB + MAX_CONV;

void runKernel() {
    const int size = 5;

    float A[size], B[size - 2], C[size];

    for (int i = 0; i < size; i++) {
        A[i] = i;
    }

    for (int j = 0; j < size - 2; j++) {
        B[j] = j;
    }

    float *d_A, *d_B, *d_C;

    cudaMalloc(&d_A, size * sizeof(float));
    cudaMalloc(&d_B, (size - 2) * sizeof(float));
    cudaMalloc(&d_C, size * sizeof(float));

    cudaMemcpy(d_A, A, size * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, (size - 2) * sizeof(float), cudaMemcpyHostToDevice);

    int blocksPerGrid = (size + TPB - 1) / TPB;

    int shared_size_a = sizeof(float) * (TPB + MAX_CONV);
    int shared_size_b = sizeof(float) * MAX_CONV;

    Conv1D<<<blocksPerGrid, TPB, shared_size_a + shared_size_b>>>(
        d_A, d_B, d_C, size, (size - 2)
    );

    cudaMemcpy(C, d_C, size * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    float host_C[] = {5, 8, 11, 4, 0};

    for (int i = 0; i < size; i++) {
        assert(host_C[i] == C[i]);
    }

    std::cout << "1D Convolution successful!" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
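
For reference, below is a minimal Conv1D kernel that satisfies this runner's assertions. It is only a sketch, not the solution from the answers notebook: it reads A and B straight from global memory, whereas the dynamic shared memory the runner allocates (shared_size_a + shared_size_b) suggests the intended solution stages both arrays in shared memory first.

// Hypothetical sketch: one thread per output element, global-memory reads only.
__global__ void Conv1D(float* A, float* B, float* C, int a_size, int b_size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < a_size) {
        float acc = 0.0f;
        // C[i] = sum over j of A[i + j] * B[j], truncated at the end of A.
        for (int j = 0; j < b_size; j++) {
            if (i + j < a_size) {
                acc += A[i + j] * B[j];
            }
        }
        C[i] = acc;
    }
}
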
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/axis_sum_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

extern __global__ void AxisSum(float* A, float* C, int size);

void runKernel() {
    const int size = 5;
    const int numBatches = 1;

    float A[size * numBatches], C[numBatches];

    for (int j = 0; j < numBatches; j++) {
        for (int i = 0; i < size; i++) {
            A[j * size + i] = i;
        }
    }

    float *d_A, *d_C;

    cudaMalloc(&d_A, size * numBatches * sizeof(float));
    cudaMalloc(&d_C, numBatches * sizeof(float));

    cudaMemcpy(d_A, A, size * numBatches * sizeof(float), cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(size, 1);
    dim3 blocksPerGrid(1, numBatches);
    int shared_size = threadsPerBlock.x * sizeof(float);

    AxisSum<<<blocksPerGrid, threadsPerBlock, shared_size>>>(d_A, d_C, size);

    cudaMemcpy(C, d_C, numBatches * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_C);

    assert(C[0] == 10);

    std::cout << "Axis sum successful!" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/blocks_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

extern __global__ void Blocks(float* A, float* C, float size);

void runKernel() {
    const int size = 5;
    float A[size], C[size];

    for (int i = 0; i < size; i++) {
        A[i] = static_cast<float>(i);
    }

    float *d_A, *d_C;

    cudaMalloc(&d_A, size * sizeof(float));
    cudaMalloc(&d_C, size * sizeof(float));

    cudaMemcpy(d_A, A, size * sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = size - 1;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;

    Blocks<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, size);

    cudaMemcpy(C, d_C, size * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_C);

    for (int i = 0; i < size; i++) {
        assert(C[i] == A[i] + 10);
    }

    std::cout << "Blocks successful!" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
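
A minimal Blocks kernel consistent with the runner above, shown as a sketch rather than the answers notebook's code. The launch uses two blocks of four threads for five elements, so the guard on the global index is the point of the exercise.

// Hypothetical sketch: global index across blocks, guarded against overshoot.
__global__ void Blocks(float* A, float* C, float size) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < static_cast<int>(size)) {
        C[i] = A[i] + 10.0f;
    }
}
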
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/broadcast_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

extern __global__ void Broadcast(float* A, float* B, float* C, int size);

void runKernel() {
    const int size = 4;
    float A[size][1], B[1][size], C[size][size];

    for (int i = 0; i < size; i++) {
        A[i][0] = static_cast<float>(i);
    }

    for (int j = 0; j < size; j++) {
        B[0][j] = static_cast<float>(j);
    }

    float *d_A, *d_B, *d_C;

    cudaMalloc(&d_A, size * sizeof(float));
    cudaMalloc(&d_B, size * sizeof(float));
    cudaMalloc(&d_C, (size * size) * sizeof(float));

    cudaMemcpy(d_A, A, size * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size * sizeof(float), cudaMemcpyHostToDevice);

    dim3 blockDim(size, size);

    Broadcast<<<1, blockDim>>>(d_A, d_B, d_C, size);

    cudaMemcpy(C, d_C, (size * size) * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            assert(C[i][j] == A[i][0] + B[0][j]);
        }
    }

    std::cout << "Broadcast successful" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/dotproduct_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

extern __global__ void DotProduct(float* A, float* B, float* C, float size);

void runKernel() {
    const int size = 8;
    float A[size], B[size], C[1];

    for (int i = 0; i < size; i++) {
        A[i] = i;
    }

    for (int j = 0; j < size; j++) {
        B[j] = j;
    }

    float *d_A, *d_B, *d_C;

    cudaMalloc(&d_A, size * sizeof(float));
    cudaMalloc(&d_B, size * sizeof(float));
    cudaMalloc(&d_C, sizeof(float));

    cudaMemcpy(d_A, A, size * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size * sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = size;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    int shared_size = threadsPerBlock * sizeof(float);

    DotProduct<<<blocksPerGrid, threadsPerBlock, shared_size>>>(d_A, d_B, d_C, size);

    cudaMemcpy(C, d_C, sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    int expected_dot_product = 0;
    for (int k = 0; k < size; k++) {
        expected_dot_product += A[k] * B[k];
    }
    assert(C[0] == expected_dot_product);

    std::cout << "Dot product successful!" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
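
A sketch of a DotProduct kernel that matches this runner's single-block launch and dynamic shared-memory size. Because size is 8, a power of two, a simple tree reduction works; treat this as illustrative rather than the official answer.

// Hypothetical sketch: per-thread products in shared memory, then a tree reduction.
__global__ void DotProduct(float* A, float* B, float* C, float size) {
    extern __shared__ float cache[];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;

    cache[tid] = (i < static_cast<int>(size)) ? A[i] * B[i] : 0.0f;
    __syncthreads();

    // Halve the active threads each step (valid here because blockDim.x is a power of two).
    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tid < stride) {
            cache[tid] += cache[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        C[0] = cache[0];
    }
}
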
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/guards_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

extern __global__ void Guards(float* A, float* C, float size);

void runKernel() {
    const int size = 3;
    float A[size], C[size];

    for (int i = 0; i < size; i++) {
        A[i] = static_cast<float>(i);
    }

    float *d_A, *d_C;

    cudaMalloc(&d_A, size * sizeof(float));
    cudaMalloc(&d_C, size * sizeof(float));

    cudaMemcpy(d_A, A, size * sizeof(float), cudaMemcpyHostToDevice);

    Guards<<<1, 10>>>(d_A, d_C, size);

    cudaMemcpy(C, d_C, size * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_C);

    for (int i = 0; i < size; i++) {
        assert(C[i] == A[i] + 10);
    }

    std::cout << "Guards successful!" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/map2d_block_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

extern __global__ void Map2DBlock(float* A, float* C, float size);

void runKernel() {
    const int size = 6;
    float A[size][size], C[size][size];

    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            A[i][j] = static_cast<float>(i) + static_cast<float>(j);
        }
    }

    float *d_A, *d_C;

    cudaMalloc(&d_A, (size * size) * sizeof(float));
    cudaMalloc(&d_C, (size * size) * sizeof(float));

    dim3 threadsPerBlock(size - 1, size - 1);
    dim3 blocksPerGrid((size + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (size + threadsPerBlock.y - 1) / threadsPerBlock.y);

    cudaMemcpy(d_A, A, (size * size) * sizeof(float), cudaMemcpyHostToDevice);

    Map2DBlock<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, size);

    cudaMemcpy(C, d_C, (size * size) * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_C);

    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            assert(C[i][j] == A[i][j] + 10);
        }
    }

    std::cout << "2D mapping successful" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/map2d_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

extern __global__ void Map2D(float* A, float* C, float size);

void runKernel() {
    const int size = 4;
    float A[size][size], C[size][size];

    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            A[i][j] = static_cast<float>(i) + static_cast<float>(j);
        }
    }

    float *d_A, *d_C;

    cudaMalloc(&d_A, (size * size) * sizeof(float));
    cudaMalloc(&d_C, (size * size) * sizeof(float));

    dim3 blockDim(size, size);

    cudaMemcpy(d_A, A, (size * size) * sizeof(float), cudaMemcpyHostToDevice);

    Map2D<<<1, blockDim>>>(d_A, d_C, size);

    cudaMemcpy(C, d_C, (size * size) * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_C);

    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            assert(C[i][j] == A[i][j] + 10);
        }
    }

    std::cout << "2D mapping successful" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
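
A minimal Map2D kernel consistent with the runner above, assuming the size x size matrix is stored row-major. This is a sketch, not the answers notebook's code; the same row/column indexing with a bounds guard also covers the multi-block launch used by map2d_block_runner.cu.

// Hypothetical sketch: one thread per element of a row-major size x size matrix.
__global__ void Map2D(float* A, float* C, float size) {
    int n = static_cast<int>(size);
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < n && col < n) {
        C[row * n + col] = A[row * n + col] + 10.0f;
    }
}
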
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/map_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

extern __global__ void ScalarAdd(float* A, float* C);

void runKernel() {
    const int N = 3;
    float A[N], C[N];

    for (int i = 0; i < N; i++) {
        A[i] = static_cast<float>(i);
    }

    float *d_A, *d_C;

    cudaMalloc(&d_A, N * sizeof(float));
    cudaMalloc(&d_C, N * sizeof(float));

    cudaMemcpy(d_A, A, N * sizeof(float), cudaMemcpyHostToDevice);

    ScalarAdd<<<1, N>>>(d_A, d_C);

    cudaMemcpy(C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_C);

    for (int i = 0; i < N; i++) {
        assert(C[i] == A[i] + 10);
    }

    std::cout << "Scalar addition is successful!" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/matmul_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

const int TPB = 3;

extern __global__ void Matmul(float* A, float* B, float* C, int size);

void runKernel() {
    const int size = 2;
    float A[size][size], B[size][size], C[size][size];

    for (int i = 0; i < size; i++) {
        for (int j = 0; j < size; j++) {
            A[i][j] = i * j;
            B[i][j] = i + j;
        }
    }

    float *d_A, *d_B, *d_C;
    cudaMalloc(&d_A, (size * size) * sizeof(float));
    cudaMalloc(&d_B, (size * size) * sizeof(float));
    cudaMalloc(&d_C, (size * size) * sizeof(float));

    cudaMemcpy(d_A, A, (size * size) * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, (size * size) * sizeof(float), cudaMemcpyHostToDevice);

    int BpG = (size + TPB - 1) / TPB;
    dim3 blocksPerGrid(BpG, BpG);
    dim3 threadsPerBlock(TPB, TPB);
    int sharedMemSize = 2 * (TPB * TPB) * sizeof(float);

    Matmul<<<blocksPerGrid, threadsPerBlock, sharedMemSize>>>(d_A, d_B, d_C, size);

    cudaMemcpy(C, d_C, (size * size) * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    assert(C[0][0] == 0);
    assert(C[0][1] == 0);
    assert(C[1][0] == 1);
    assert(C[1][1] == 2);

    std::cout << "Matrix multiplication successful!" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
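
A straightforward Matmul kernel that passes this runner's asserts. It is a sketch only: it ignores the dynamic shared memory the runner allocates, which hints that the intended solution tiles A and B into shared memory TPB x TPB at a time.

// Hypothetical sketch: naive row-major matrix multiply, one output element per thread.
__global__ void Matmul(float* A, float* B, float* C, int size) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < size && col < size) {
        float acc = 0.0f;
        for (int k = 0; k < size; k++) {
            acc += A[row * size + k] * B[k * size + col];
        }
        C[row * size + col] = acc;
    }
}
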
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/pooling_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

extern __global__ void Pooling(float* A, float* C, float size);

void runKernel() {
    const int size = 4;
    float A[size], C[size];

    for (int i = 0; i < size; i++) {
        A[i] = static_cast<float>(i);
    }

    float *d_A, *d_C;

    cudaMalloc(&d_A, size * sizeof(float));
    cudaMalloc(&d_C, size * sizeof(float));

    cudaMemcpy(d_A, A, size * sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = size;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    int shared_size = threadsPerBlock * sizeof(float);

    Pooling<<<blocksPerGrid, threadsPerBlock, shared_size>>>(d_A, d_C, size);

    cudaMemcpy(C, d_C, size * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_C);

    for (int i = 0; i < size; i++) {
        if (i >= 2) {
            assert(C[i] == A[i] + A[i-1] + A[i-2]);
        }
    }

    std::cout << "Pooling successful!" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
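
A Pooling kernel sketch that satisfies the asserts above by summing each element with its two predecessors. The runner's shared-memory allocation suggests the intended solution stages A in shared memory first; this version reads global memory directly for brevity and is illustrative only.

// Hypothetical sketch: sliding-window sum of the current and two previous elements.
__global__ void Pooling(float* A, float* C, float size) {
    int n = static_cast<int>(size);
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        float acc = 0.0f;
        for (int j = i - 2; j <= i; j++) {
            if (j >= 0) {
                acc += A[j];
            }
        }
        C[i] = acc;
    }
}
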
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/prefixsum_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

extern __global__ void PrefixSum(float* A, float* C, int size);

void runKernel() {
    const int size = 5;

    float A[size], C[size];

    for (int i = 0; i < size; i++) {
        A[i] = i;
    }

    float *d_A, *d_C;

    cudaMalloc(&d_A, size * sizeof(float));
    cudaMalloc(&d_C, size * sizeof(float));

    cudaMemcpy(d_A, A, size * sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = size;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    int shared_size = threadsPerBlock * sizeof(float);

    PrefixSum<<<blocksPerGrid, threadsPerBlock, shared_size>>>(
        d_A, d_C, size
    );

    cudaMemcpy(C, d_C, size * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_C);

    assert(C[0] == 10);

    std::cout << "Prefix sum successful!" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/shared_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

extern __global__ void Shared(float* A, float* C, float size);

void runKernel() {
    const int size = 5;
    float A[size], C[size];

    for (int i = 0; i < size; i++) {
        A[i] = static_cast<float>(i);
    }

    float *d_A, *d_C;

    cudaMalloc(&d_A, size * sizeof(float));
    cudaMalloc(&d_C, size * sizeof(float));

    cudaMemcpy(d_A, A, size * sizeof(float), cudaMemcpyHostToDevice);

    int threadsPerBlock = size - 1;
    int blocksPerGrid = (size + threadsPerBlock - 1) / threadsPerBlock;
    int shared_size = threadsPerBlock * sizeof(float);

    Shared<<<blocksPerGrid, threadsPerBlock, shared_size>>>(d_A, d_C, size);

    cudaMemcpy(C, d_C, size * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_C);

    for (int i = 0; i < size; i++) {
        assert(C[i] == A[i] + 10);
    }

    std::cout << "Shared successful!" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
--------------------------------------------------------------------------------
/GPU_puzzlers_exec/zip_runner.cu:
--------------------------------------------------------------------------------
#include <iostream>
#include <cassert>
#include <cuda_runtime.h>

extern __global__ void VecAdd(float* A, float* B, float* C);

void runKernel() {
    const int N = 3;
    float A[N], B[N], C[N];

    for (int i = 0; i < N; i++) {
        A[i] = static_cast<float>(i);
        B[i] = static_cast<float>(N - i);
    }

    float *d_A, *d_B, *d_C;

    cudaMalloc(&d_A, sizeof(float) * N);
    cudaMalloc(&d_B, sizeof(float) * N);
    cudaMalloc(&d_C, sizeof(float) * N);

    cudaMemcpy(d_A, A, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, N * sizeof(float), cudaMemcpyHostToDevice);

    VecAdd<<<1, N>>>(d_A, d_B, d_C);

    cudaMemcpy(C, d_C, N * sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    for (int i = 0; i < N; i++) {
        assert(C[i] == A[i] + B[i]);
    }

    std::cout << "Vector addition successful!" << std::endl;
}

int main() {
    runKernel();
    return 0;
}
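
Closing out the runners, here is a sketch of the Shared kernel expected by shared_runner.cu above. The assumption is that the dynamically sized buffer stages each block's slice of A before the +10 is applied; it is illustrative, not the answers notebook's code.

// Hypothetical sketch: stage a block's slice of A in shared memory, then add 10.
__global__ void Shared(float* A, float* C, float size) {
    extern __shared__ float cache[];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;

    if (i < static_cast<int>(size)) {
        cache[tid] = A[i];
    }
    __syncthreads();

    if (i < static_cast<int>(size)) {
        C[i] = cache[tid] + 10.0f;
    }
}
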
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_12_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Map" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_13_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Map" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_14_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Map" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_15_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Map" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_16_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Map" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_17_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Zip" puzzle diagram; labels: a, b, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_18_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Zip" puzzle diagram; labels: a, b, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_19_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Zip" puzzle diagram; labels: a, b, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_21_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Guard" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_22_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Zip" puzzle diagram; labels: a, b, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_23_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Zip" puzzle diagram; labels: a, b, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_24_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Map 2D" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_26_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Guard" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_28_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Map 2D" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_58_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Sum (Simple)" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_84_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Sum (Simple)" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_85_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Sum (Simple)" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_86_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Sum (Simple)" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/GPU_puzzlers_87_1.svg:
--------------------------------------------------------------------------------
[SVG figure: "Sum (Simple)" puzzle diagram; labels: a, out, Block 0]
--------------------------------------------------------------------------------
/GPU_puzzlers_files/robot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dshah3/GPU-Puzzles/966f30635343d59fa34c8fcefae6a0398886b0e9/GPU_puzzlers_files/robot.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Sasha Rush

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/cuda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dshah3/GPU-Puzzles/966f30635343d59fa34c8fcefae6a0398886b0e9/cuda.png
--------------------------------------------------------------------------------
/robot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dshah3/GPU-Puzzles/966f30635343d59fa34c8fcefae6a0398886b0e9/robot.png
--------------------------------------------------------------------------------