├── compile_n_run.sh ├── refs └── im2col.png ├── README.md ├── CMakeLists.txt ├── .gitignore ├── LICENSE ├── log ├── perflog_before.csv ├── perflog_final.csv ├── perflog_write_coalescing.csv └── perflog_loop_opt.csv ├── im2col.cu └── Visualise results.ipynb /compile_n_run.sh: -------------------------------------------------------------------------------- 1 | nvcc im2col.cu -o im2col.out && ./im2col.out 2 | -------------------------------------------------------------------------------- /refs/im2col.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/piojanu/CUDA-im2col-conv/HEAD/refs/im2col.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CUDA implementation of convolution with im2col algorithm 2 | CUDA project for uni subject 3 | 4 | ![im2col](refs/im2col.png) 5 | -------------------------------------------------------------------------------- /CMakeLists.txt: --------------------------------------------------------------------------------
# Builds the CUDA sources under src/ into the `cudalib` library.
cmake_minimum_required(VERSION 3.1.0 FATAL_ERROR)

# project() (rather than a hand-set PROJECT_NAME) defines PROJECT_NAME and
# PROJECT_SOURCE_DIR and enables the host compilers nvcc relies on.
project(CUDA-im2col-conv LANGUAGES C CXX)

# Find src files and include dir.
# PROJECT_SOURCE_DIR keeps the paths correct when this project is pulled in
# through add_subdirectory(); CMAKE_SOURCE_DIR would point at the parent build.
# NOTE(review): CPU_SOURCE_FILES is collected but not attached to any target
# in this file - confirm whether it is still needed.
file(GLOB CPU_SOURCE_FILES "${PROJECT_SOURCE_DIR}/src/*.c")
file(GLOB GPU_SOURCE_FILES "${PROJECT_SOURCE_DIR}/src/*.cu")
set(INCLUDE_DIRS "${PROJECT_SOURCE_DIR}/include")

# Fail early with a clear message instead of an obscure error from
# cuda_add_library() when the glob matched nothing.
if(NOT GPU_SOURCE_FILES)
  message(FATAL_ERROR "No CUDA sources (*.cu) found in ${PROJECT_SOURCE_DIR}/src")
endif()

# Create lib with CUDA code.
# REQUIRED already aborts configuration when CUDA is missing, so no separate
# CUDA_FOUND check is needed.
find_package(CUDA REQUIRED)

# cuda_include_directories() feeds nvcc; target_include_directories()
# propagates the same path to consumers of cudalib.
cuda_include_directories(${INCLUDE_DIRS})
cuda_add_library(cudalib ${GPU_SOURCE_FILES})
target_include_directories(cudalib PUBLIC ${INCLUDE_DIRS})
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## C 2 | 3 | # Prerequisites 4 | *.d 5 | 6 | # Object files 7 | *.o 8 | *.ko 9 | *.obj 10 | *.elf 11 | 12 | # Linker output 13 | *.ilk 14 | *.map 15 | *.exp 16 | 17 | # Precompiled Headers 18 | *.gch 19 | *.pch 20 | 21 | # Libraries 22 | *.lib 23 | *.a 24 | *.la 25 | *.lo 26 | 27 | # Shared objects (inc. Windows DLLs) 28 | *.dll 29 | *.so 30 | *.so.* 31 | *.dylib 32 | 33 | # Executables 34 | *.exe 35 | *.out 36 | *.app 37 | *.i*86 38 | *.x86_64 39 | *.hex 40 | 41 | # Debug files 42 | *.dSYM/ 43 | *.su 44 | *.idb 45 | *.pdb 46 | 47 | # Kernel Module Compile Results 48 | *.mod* 49 | *.cmd 50 | .tmp_versions/ 51 | modules.order 52 | Module.symvers 53 | Mkfile.old 54 | dkms.conf 55 | 56 | ## CMake 57 | 58 | CMakeCache.txt 59 | CMakeFiles 60 | CMakeScripts 61 | Testing 62 | Makefile 63 | cmake_install.cmake 64 | install_manifest.txt 65 | compile_commands.json 66 | CTestTestfile.cmake 67 | 68 | ## CUDA 69 | 70 | *.i 71 | *.ii 72 | *.gpu 73 | *.ptx 74 | *.cubin 75 | *.fatbin 76 | 77 | ## Python 78 | .ipynb_checkpoints/ 79 | 80 | ## Own 81 | /*.csv 82 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Piotr Januszewski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /log/perflog_before.csv: -------------------------------------------------------------------------------- 1 | 1,1,12335.5 2 | 1,2,6521.33 3 | 1,4,3578.7 4 | 1,8,1811.8 5 | 1,16,928.817 6 | 1,32,493.007 7 | 1,64,281.06 8 | 1,128,171.425 9 | 1,256,118.305 10 | 1,512,120.154 11 | 1,1024,121.042 12 | 1,2048,112.686 13 | 1,4096,108.361 14 | 1,8192,108.214 15 | 2,1,6560.73 16 | 2,2,3603.4 17 | 2,4,1827.46 18 | 2,8,936.064 19 | 2,16,502.304 20 | 2,32,277.921 21 | 2,64,173.525 22 | 2,128,116.03 23 | 2,256,87.663 24 | 2,512,89.921 25 | 2,1024,86.842 26 | 2,2048,82.098 27 | 2,4096,80.082 28 | 2,8192,79.777 29 | 4,1,3701.98 30 | 4,2,1867.73 31 | 4,4,965.522 32 | 4,8,514.046 33 | 4,16,289.178 34 | 4,32,173.232 35 | 4,64,118.07 36 | 4,128,89.374 37 | 4,256,74.707 38 | 4,512,74.362 39 | 4,1024,71.75 40 | 4,2048,69.598 41 | 4,4096,67.145 42 | 4,8192,66.548 43 | 8,1,2200.58 44 | 8,2,1122.74 45 | 8,4,585.689 46 | 8,8,324.973 47 | 8,16,188.025 48 | 8,32,120.502 49 | 8,64,88.089 50 | 8,128,71.886 51 | 8,256,66.106 52 | 8,512,65.853 53 | 8,1024,64.84 54 | 8,2048,63.527 55 | 8,4096,62.33 56 | 8,8192,62.167 57 | 16,1,1171.25 58 | 
16,2,606.87 59 | 16,4,333.99 60 | 16,8,193.129 61 | 16,16,122.567 62 | 16,32,88.134 63 | 16,64,70.963 64 | 16,128,63.573 65 | 16,256,63.514 66 | 16,512,63.801 67 | 16,1024,63.061 68 | 16,2048,62.676 69 | 16,4096,62.12 70 | 16,8192,62.018 71 | 32,1,658.194 72 | 32,2,353.969 73 | 32,4,202.668 74 | 32,8,127.259 75 | 32,16,89.846 76 | 32,32,71.798 77 | 32,64,62.943 78 | 32,128,61.343 79 | 32,256,72.229 80 | 32,512,78.764 81 | 32,1024,85.692 82 | 32,2048,84.395 83 | 32,4096,84.362 84 | 32,8192,84.387 85 | 64,1,361.558 86 | 64,2,204.838 87 | 64,4,127.67 88 | 64,8,89.902 89 | 64,16,71.828 90 | 64,32,63.231 91 | 64,64,61.849 92 | 64,128,68.834 93 | 64,256,96.987 94 | 64,512,93.555 95 | 64,1024,99.424 96 | 64,2048,99.648 97 | 64,4096,100.038 98 | 64,8192,99.836 99 | 128,1,208.545 100 | 128,2,129.423 101 | 128,4,90.392 102 | 128,8,71.593 103 | 128,16,63.001 104 | 128,32,61.937 105 | 128,64,70.22 106 | 128,128,97.454 107 | 128,256,93.509 108 | 128,512,99.498 109 | 128,1024,99.287 110 | 128,2048,99.762 111 | 128,4096,99.768 112 | 128,8192,99.873 113 | 256,1,135.636 114 | 256,2,93.734 115 | 256,4,72.453 116 | 256,8,62.804 117 | 256,16,61.1 118 | 256,32,71.208 119 | 256,64,96.793 120 | 256,128,95.216 121 | 256,256,99.653 122 | 256,512,99.785 123 | 256,1024,99.6 124 | 256,2048,99.89 125 | 256,4096,99.793 126 | 256,8192,99.767 127 | 512,1,121.334 128 | 512,2,85.972 129 | 512,4,69.091 130 | 512,8,61.132 131 | 512,16,68.24 132 | 512,32,98.08 133 | 512,64,95.228 134 | 512,128,100.05 135 | 512,256,99.999 136 | 512,512,99.784 137 | 512,1024,99.975 138 | 512,2048,99.92 139 | 512,4096,100.001 140 | 512,8192,100.041 141 | 1024,1,121.468 142 | 1024,2,87.468 143 | 1024,4,69.833 144 | 1024,8,70.728 145 | 1024,16,92.798 146 | 1024,32,94.592 147 | 1024,64,100.532 148 | 1024,128,99.389 149 | 1024,256,100.53 150 | 1024,512,100.512 151 | 1024,1024,100.228 152 | 1024,2048,100.599 153 | 1024,4096,100.572 154 | 1024,8192,-1 155 | 2048,1,50.468 156 | 2048,2,50.537 157 | 2048,4,50.418 158 | 
2048,8,50.429 159 | 2048,16,50.394 160 | 2048,32,50.455 161 | 2048,64,50.45 162 | 2048,128,50.552 163 | 2048,256,50.461 164 | 2048,512,50.522 165 | 2048,1024,50.286 166 | 2048,2048,50.548 167 | 2048,4096,-1 168 | 2048,8192,-1 169 | -------------------------------------------------------------------------------- /log/perflog_final.csv: -------------------------------------------------------------------------------- 1 | 1,1,18760.6 2 | 1,2,9748.33 3 | 1,4,5221.13 4 | 1,8,2985.59 5 | 1,16,1490.37 6 | 1,32,781.293 7 | 1,64,419.882 8 | 1,128,236.332 9 | 1,256,155.505 10 | 1,512,151.724 11 | 1,1024,154.243 12 | 1,2048,141.864 13 | 1,4096,135.765 14 | 1,8192,140.427 15 | 2,1,9822.23 16 | 2,2,5280.27 17 | 2,4,3014.08 18 | 2,8,1556.06 19 | 2,16,794.364 20 | 2,32,418.888 21 | 2,64,236.019 22 | 2,128,147.074 23 | 2,256,102.997 24 | 2,512,102.182 25 | 2,1024,103.043 26 | 2,2048,96.383 27 | 2,4096,95.599 28 | 2,8192,96.721 29 | 4,1,5280.99 30 | 4,2,3026.39 31 | 4,4,1535.26 32 | 4,8,792.703 33 | 4,16,421.224 34 | 4,32,236.061 35 | 4,64,145.334 36 | 4,128,100.395 37 | 4,256,78.57 38 | 4,512,78.292 39 | 4,1024,77.803 40 | 4,2048,75.752 41 | 4,4096,74.877 42 | 4,8192,74.816 43 | 8,1,3024.62 44 | 8,2,1541.38 45 | 8,4,796.194 46 | 8,8,423.651 47 | 8,16,237.76 48 | 8,32,145.101 49 | 8,64,101.267 50 | 8,128,76.595 51 | 8,256,66.337 52 | 8,512,65.811 53 | 8,1024,66.178 54 | 8,2048,64.928 55 | 8,4096,64.112 56 | 8,8192,64.303 57 | 16,1,1545.25 58 | 16,2,799.478 59 | 16,4,425.226 60 | 16,8,238.028 61 | 16,16,144.729 62 | 16,32,98.714 63 | 16,64,75.491 64 | 16,128,63.828 65 | 16,256,58.627 66 | 16,512,58.626 67 | 16,1024,59.015 68 | 16,2048,58.221 69 | 16,4096,58.033 70 | 16,8192,57.938 71 | 32,1,798.57 72 | 32,2,425.426 73 | 32,4,237.811 74 | 32,8,144.537 75 | 32,16,98.435 76 | 32,32,75.096 77 | 32,64,63.341 78 | 32,128,57.776 79 | 32,256,55.07 80 | 32,512,55.288 81 | 32,1024,55.344 82 | 32,2048,55.097 83 | 32,4096,54.905 84 | 32,8192,55.098 85 | 64,1,425.753 86 | 64,2,238.125 87 | 
64,4,144.916 88 | 64,8,98.689 89 | 64,16,74.994 90 | 64,32,63.204 91 | 64,64,57.783 92 | 64,128,55.139 93 | 64,256,54.456 94 | 64,512,54.392 95 | 64,1024,54.532 96 | 64,2048,54.044 97 | 64,4096,53.991 98 | 64,8192,53.976 99 | 128,1,239.596 100 | 128,2,145.527 101 | 128,4,98.679 102 | 128,8,74.607 103 | 128,16,63.355 104 | 128,32,57.706 105 | 128,64,55.05 106 | 128,128,53.813 107 | 128,256,54.095 108 | 128,512,54.223 109 | 128,1024,54.174 110 | 128,2048,54.002 111 | 128,4096,54.163 112 | 128,8192,53.953 113 | 256,1,147.662 114 | 256,2,100.686 115 | 256,4,75.762 116 | 256,8,63.359 117 | 256,16,57.772 118 | 256,32,55.658 119 | 256,64,53.97 120 | 256,128,54.173 121 | 256,256,54.548 122 | 256,512,54.058 123 | 256,1024,54.022 124 | 256,2048,53.892 125 | 256,4096,53.929 126 | 256,8192,53.85 127 | 512,1,102.41 128 | 512,2,76.818 129 | 512,4,64.11 130 | 512,8,57.721 131 | 512,16,55.162 132 | 512,32,54.403 133 | 512,64,54.533 134 | 512,128,54.491 135 | 512,256,54.112 136 | 512,512,53.941 137 | 512,1024,54.202 138 | 512,2048,53.857 139 | 512,4096,53.882 140 | 512,8192,53.49 141 | 1024,1,79.017 142 | 1024,2,65.097 143 | 1024,4,58.305 144 | 1024,8,55.016 145 | 1024,16,54.329 146 | 1024,32,54.231 147 | 1024,64,54.511 148 | 1024,128,54.249 149 | 1024,256,54.156 150 | 1024,512,53.894 151 | 1024,1024,54.028 152 | 1024,2048,53.847 153 | 1024,4096,53.716 154 | 1024,8192,-1 155 | 2048,1,50.569 156 | 2048,2,50.568 157 | 2048,4,50.472 158 | 2048,8,50.62 159 | 2048,16,50.425 160 | 2048,32,50.509 161 | 2048,64,50.446 162 | 2048,128,50.472 163 | 2048,256,50.482 164 | 2048,512,50.425 165 | 2048,1024,50.468 166 | 2048,2048,50.453 167 | 2048,4096,-1 168 | 2048,8192,-1 169 | -------------------------------------------------------------------------------- /log/perflog_write_coalescing.csv: -------------------------------------------------------------------------------- 1 | 1,1,12102.8 2 | 1,2,6116.13 3 | 1,4,3082.8 4 | 1,8,1563.37 5 | 1,16,812.534 6 | 1,32,433.483 7 | 1,64,245.882 8 | 
1,128,152.816 9 | 1,256,107.24 10 | 1,512,108.955 11 | 1,1024,110.169 12 | 1,2048,103.484 13 | 1,4096,100.103 14 | 1,8192,100.207 15 | 2,1,6170.66 16 | 2,2,3112.78 17 | 2,4,1567.84 18 | 2,8,813.777 19 | 2,16,435.126 20 | 2,32,246.311 21 | 2,64,150.198 22 | 2,128,103.59 23 | 2,256,80.396 24 | 2,512,82.923 25 | 2,1024,81.116 26 | 2,2048,77.607 27 | 2,4096,75.774 28 | 2,8192,75.637 29 | 4,1,3237.59 30 | 4,2,1632.09 31 | 4,4,836.935 32 | 4,8,444.462 33 | 4,16,250.202 34 | 4,32,152.982 35 | 4,64,103.544 36 | 4,128,78.532 37 | 4,256,69.666 38 | 4,512,70.319 39 | 4,1024,69.358 40 | 4,2048,67.76 41 | 4,4096,66.929 42 | 4,8192,66.683 43 | 8,1,1723.67 44 | 8,2,883.819 45 | 8,4,461.663 46 | 8,8,258.365 47 | 8,16,156.645 48 | 8,32,104.974 49 | 8,64,79.113 50 | 8,128,67.909 51 | 8,256,65.485 52 | 8,512,67.707 53 | 8,1024,66.044 54 | 8,2048,65.454 55 | 8,4096,65.116 56 | 8,8192,65.312 57 | 16,1,956.619 58 | 16,2,501.495 59 | 16,4,276.009 60 | 16,8,164.013 61 | 16,16,109.386 62 | 16,32,81.642 63 | 16,64,68.206 64 | 16,128,65.648 65 | 16,256,67.803 66 | 16,512,79.326 67 | 16,1024,81.367 68 | 16,2048,80.626 69 | 16,4096,79.482 70 | 16,8192,79.547 71 | 32,1,582.366 72 | 32,2,315.083 73 | 32,4,182.445 74 | 32,8,116.308 75 | 32,16,85.78 76 | 32,32,71.121 77 | 32,64,66.739 78 | 32,128,67.664 79 | 32,256,146.163 80 | 32,512,152.217 81 | 32,1024,183.698 82 | 32,2048,182.095 83 | 32,4096,182.226 84 | 32,8192,181.882 85 | 64,1,316.287 86 | 64,2,198.208 87 | 64,4,123.947 88 | 64,8,87.374 89 | 64,16,70.854 90 | 64,32,68.068 91 | 64,64,68.667 92 | 64,128,145.12 93 | 64,256,193.937 94 | 64,512,182.934 95 | 64,1024,193.799 96 | 64,2048,194.257 97 | 64,4096,194.004 98 | 64,8192,193.552 99 | 128,1,211.026 100 | 128,2,130.631 101 | 128,4,90.452 102 | 128,8,71.057 103 | 128,16,67.631 104 | 128,32,70.201 105 | 128,64,146.225 106 | 128,128,194.646 107 | 128,256,181.115 108 | 128,512,193.087 109 | 128,1024,194.073 110 | 128,2048,194.102 111 | 128,4096,193.607 112 | 128,8192,193.962 113 | 256,1,185.869 
114 | 256,2,118.111 115 | 256,4,84.164 116 | 256,8,67.776 117 | 256,16,69.087 118 | 256,32,149.079 119 | 256,64,190.994 120 | 256,128,184.555 121 | 256,256,194.638 122 | 256,512,194.456 123 | 256,1024,193.971 124 | 256,2048,194.488 125 | 256,4096,194.32 126 | 256,8192,194.541 127 | 512,1,185.692 128 | 512,2,118.092 129 | 512,4,84.422 130 | 512,8,68.728 131 | 512,16,127.55 132 | 512,32,192.398 133 | 512,64,184.655 134 | 512,128,194.034 135 | 512,256,194.263 136 | 512,512,194.655 137 | 512,1024,194.677 138 | 512,2048,193.154 139 | 512,4096,194.677 140 | 512,8192,194.243 141 | 1024,1,186.62 142 | 1024,2,119.321 143 | 1024,4,84.885 144 | 1024,8,152.063 145 | 1024,16,187.355 146 | 1024,32,183.525 147 | 1024,64,195.369 148 | 1024,128,195.171 149 | 1024,256,195.82 150 | 1024,512,193.369 151 | 1024,1024,194.857 152 | 1024,2048,195.121 153 | 1024,4096,195.102 154 | 1024,8192,-1 155 | 2048,1,50.697 156 | 2048,2,50.294 157 | 2048,4,50.171 158 | 2048,8,50.203 159 | 2048,16,49.981 160 | 2048,32,50.113 161 | 2048,64,50.011 162 | 2048,128,50.751 163 | 2048,256,50 164 | 2048,512,50.181 165 | 2048,1024,49.997 166 | 2048,2048,50.247 167 | 2048,4096,-1 168 | 2048,8192,-1 169 | -------------------------------------------------------------------------------- /log/perflog_loop_opt.csv: -------------------------------------------------------------------------------- 1 | 1,1,19777.8 2 | 1,2,10210.1 3 | 1,4,5327.24 4 | 1,8,2707.27 5 | 1,16,1410.82 6 | 1,32,771.147 7 | 1,64,455.442 8 | 1,128,282.433 9 | 1,256,200.02 10 | 1,512,214.428 11 | 1,1024,220.866 12 | 1,2048,207.9 13 | 1,4096,198.098 14 | 1,8192,192.221 15 | 2,1,10275.3 16 | 2,2,5388.86 17 | 2,4,2711.22 18 | 2,8,1394.84 19 | 2,16,785.108 20 | 2,32,478.856 21 | 2,64,284.793 22 | 2,128,195.494 23 | 2,256,151.204 24 | 2,512,163.083 25 | 2,1024,179.349 26 | 2,2048,165.49 27 | 2,4096,162.725 28 | 2,8192,147.019 29 | 4,1,5432.01 30 | 4,2,2727.77 31 | 4,4,1388.69 32 | 4,8,761.545 33 | 4,16,469.875 34 | 4,32,275.023 35 | 4,64,196.264 36 | 
4,128,149.329 37 | 4,256,126.235 38 | 4,512,135.413 39 | 4,1024,155.57 40 | 4,2048,139.183 41 | 4,4096,144.774 42 | 4,8192,147.576 43 | 8,1,2780.86 44 | 8,2,1426.92 45 | 8,4,764.726 46 | 8,8,439.434 47 | 8,16,275.293 48 | 8,32,187.966 49 | 8,64,148.949 50 | 8,128,124.672 51 | 8,256,115.682 52 | 8,512,127.123 53 | 8,1024,134.421 54 | 8,2048,130.728 55 | 8,4096,126.93 56 | 8,8192,132.641 57 | 16,1,1466.08 58 | 16,2,791.643 59 | 16,4,453.201 60 | 16,8,273.769 61 | 16,16,191.577 62 | 16,32,146.084 63 | 16,64,124.814 64 | 16,128,113.132 65 | 16,256,131.836 66 | 16,512,134.552 67 | 16,1024,124.039 68 | 16,2048,125.027 69 | 16,4096,118.56 70 | 16,8192,116.698 71 | 32,1,829.912 72 | 32,2,470.907 73 | 32,4,283.173 74 | 32,8,194.241 75 | 32,16,150.073 76 | 32,32,126.03 77 | 32,64,115.31 78 | 32,128,139.629 79 | 32,256,149.761 80 | 32,512,133.661 81 | 32,1024,121.487 82 | 32,2048,113.084 83 | 32,4096,111.309 84 | 32,8192,110.9 85 | 64,1,498.721 86 | 64,2,296.136 87 | 64,4,201.369 88 | 64,8,150.439 89 | 64,16,126.001 90 | 64,32,139.065 91 | 64,64,150.177 92 | 64,128,151.063 93 | 64,256,164.458 94 | 64,512,161.063 95 | 64,1024,152.713 96 | 64,2048,142.873 97 | 64,4096,129.514 98 | 64,8192,121.817 99 | 128,1,317.599 100 | 128,2,206.201 101 | 128,4,153.108 102 | 128,8,126.505 103 | 128,16,117.94 104 | 128,32,138.918 105 | 128,64,140.717 106 | 128,128,162.245 107 | 128,256,154.61 108 | 128,512,150.441 109 | 128,1024,141.16 110 | 128,2048,129.483 111 | 128,4096,120.215 112 | 128,8192,114.382 113 | 256,1,236.313 114 | 256,2,163.558 115 | 256,4,129.967 116 | 256,8,115.25 117 | 256,16,130.696 118 | 256,32,134.523 119 | 256,64,166.329 120 | 256,128,152.1 121 | 256,256,149.326 122 | 256,512,140.878 123 | 256,1024,127.63 124 | 256,2048,119.283 125 | 256,4096,113.892 126 | 256,8192,110.888 127 | 512,1,182.797 128 | 512,2,139.823 129 | 512,4,119.748 130 | 512,8,109.899 131 | 512,16,128.909 132 | 512,32,161.457 133 | 512,64,150.088 134 | 512,128,148.801 135 | 512,256,138.552 136 | 
512,512,128.332 137 | 512,1024,117.671 138 | 512,2048,113.067 139 | 512,4096,110.114 140 | 512,8192,109.484 141 | 1024,1,180.882 142 | 1024,2,139.147 143 | 1024,4,119.194 144 | 1024,8,121.629 145 | 1024,16,154.223 146 | 1024,32,148.09 147 | 1024,64,150.241 148 | 1024,128,137.208 149 | 1024,256,126.119 150 | 1024,512,116.484 151 | 1024,1024,111.622 152 | 1024,2048,110.031 153 | 1024,4096,108.593 154 | 1024,8192,-1 155 | 2048,1,93.653 156 | 2048,2,93.376 157 | 2048,4,93.317 158 | 2048,8,93.405 159 | 2048,16,93.397 160 | 2048,32,94.056 161 | 2048,64,93.311 162 | 2048,128,93.259 163 | 2048,256,93.282 164 | 2048,512,93.545 165 | 2048,1024,93.336 166 | 2048,2048,93.149 167 | 2048,4096,-1 168 | 2048,8192,-1 169 | -------------------------------------------------------------------------------- /im2col.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // Feature maps dimensionality descriptions and assumptions: 8 | // : Height : Width : Channels : Number : 9 | // INPUT / A | H | W | C | ------------------------- | 10 | // KERNELS / F | P = K | Q = K | R = C | D = number of kernels = 1 | 11 | // OUTPUT / B | L = H * (K - 1) | M = W * (K - 1) | N = D = 1 | ------------------------- | 12 | // [!] K must be odd number. 13 | // [!] Data layout for INPUT/OUTPUT: C x H x W. 14 | // [!] Data layout for KERNELS: D x R(=C) x P(=K) x Q(=K) 15 | 16 | // Turn on/off debug mode 17 | // #define DEBUG 18 | // #define FUNCTEST 19 | #define PERFTEST 20 | 21 | #ifdef DEBUG 22 | #define LOG(...) printf(__VA_ARGS__); fflush(stdout); 23 | #else 24 | #define LOG(...) 
;
#endif  // closes the DEBUG / LOG selection that starts before this section

// Input feature-map height, width and channel count, and the kernel size.
const unsigned int H = 256, W = 256, C = 80, K = 3;

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "[!] CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// HOST FUNCTION
// Takes matrix A [float *matA] and transforms it
// into column representation [float *matAc].
// For every output position (l, m) the K x K x C patch around
// (l + radiusF, m + radiusF) is copied into one contiguous run of `countF`
// floats. matA is indexed as r + C * (row + H * col), using the global H.
void im2colOnHost(float *matA, float *matAc, int radiusF, int countF, int L, int M, int K, int C)
{
    // For each spatial position in output...
    for (int m = 0; m < M; m++) {
        int w = m + radiusF;
        for (int l = 0; l < L; l++) {
            int h = l + radiusF;

            // Progress..
            LOG("\r[i] Calculation on CPU %3d%%...", ((m * L + l) * 100 / (M * L)));

            // For each kernel weight...
            for (int q = 0, oq = -radiusF; oq <= radiusF; q++, oq++) {
                for (int p = 0, op = -radiusF; op <= radiusF; p++, op++) {
                    for (int r = 0; r < C; r++) {
                        matAc[(r + C * (p + K * q)) + countF * (l + L * m)] = matA[r + C * ((h + op) + H * (w + oq))];
                    }
                }
            }
        }
    }
    LOG("\n");
}

// DEVICE KERNEL
// Takes matrix A [float *matA] and transforms it
// into column representation [float *matAc] on GPU.
// Each loop iteration handles one (output position, channel) triple decoded
// from the flat index; `n` is the number of such triples to process.
__global__
void im2colOnDevice(unsigned int n, float *matAc, float *matA, int radiusF, int countF, int L, int M, int K, int C)
{
    // Using grid-stride loop if too big problem size.
    // https://devblogs.nvidia.com/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
    const unsigned int stride = blockDim.x * gridDim.x;
    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride)
    {
        const int m = (idx / C) / L;
        const int l = (idx / C) % L;
        const int r = idx % C;

        // l < L and r < C hold by construction (modulo); only m can run past
        // the output when n exceeds L * M * C.
        if (m >= M) {
            continue;
        }

        const int w = m + radiusF;
        const int h = l + radiusF;
        // For each kernel weight...
        for (int q = 0, oq = -radiusF; oq <= radiusF; q++, oq++) {
            for (int p = 0, op = -radiusF; op <= radiusF; p++, op++) {
                matAc[(r + C * (p + K * q)) + countF * (l + L * m)] = matA[r + C * ((h + op) + H * (w + oq))];
            }
        }
    }
}

// DEVICE KERNEL
// Takes the column representation [float *matAc] and scatters it back
// into matrix A [float *matA] on GPU (inverse of im2colOnDevice).
// Neighbouring output positions cover overlapping patches, so several
// iterations store to the same matA element; the stores agree when matAc was
// produced by im2col from a single input.
__global__
void col2imOnDevice(unsigned int n, float *matA, float *matAc, int radiusF, int countF, int L, int M, int K, int C)
{
    // Using grid-stride loop if too big problem size.
    // https://devblogs.nvidia.com/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
    const unsigned int stride = blockDim.x * gridDim.x;
    for (unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride)
    {
        const int m = (idx / C) / L;
        const int l = (idx / C) % L;
        const int r = idx % C;

        // See im2colOnDevice: only m needs a bounds check.
        if (m >= M) {
            continue;
        }

        const int w = m + radiusF;
        const int h = l + radiusF;
        // For each kernel weight...
        for (int q = 0, oq = -radiusF; oq <= radiusF; q++, oq++) {
            for (int p = 0, op = -radiusF; op <= radiusF; p++, op++) {
                matA[r + C * ((h + op) + H * (w + oq))] = matAc[(r + C * (p + K * q)) + countF * (l + L * m)];
            }
        }
    }
}

// Runs one full im2col -> col2im round trip on the GPU: generates the input,
// copies it to the device, launches both kernels and copies the results back.
// With FUNCTEST defined the results are also checked against the CPU version.
//   blockSize - threads per block used for both kernel launches
//   gridSize  - blocks per grid; 0 means "enough blocks to cover every element"
// Returns true when every allocation, copy and kernel launch succeeded
// (callers that ignore the result behave exactly as before).
bool program(unsigned int blockSize, unsigned int gridSize = 0)
{
    // CONSTS AND VARIABLES

    // Input/kernel/output counts and sizes
    const unsigned int countA = H * W * C;
    const size_t sizeA = countA * sizeof(float);
    LOG("[i] INPUT PARAMS: %u height, %u width, %u channels, %u elems, %zu bytes\n", H, W, C, countA, sizeA);

    const unsigned int radiusF = (K - 1) / 2;
    const unsigned int countF = K * K * C;
    LOG("[i] FILTER PARAMS: %u radius, %u elems, %zu bytes\n", radiusF, countF, countF * sizeof(float));

    const unsigned int L = H - (K - 1);
    const unsigned int M = W - (K - 1);
    LOG("[i] OUTPUT PARAMS: %u height, %u width, %u channels\n", L, M, 1u);

    const unsigned int countLR = L * M;
    const unsigned int countAc = countF * countLR;
    const size_t sizeAc = countAc * sizeof(float);
    LOG("[i] INPUT IN COL PARAMS: %u elems, %zu bytes\n", countAc, sizeAc);


    // PREPARE DATA

    // Host buffers: input, col2im result, im2col result (and CPU reference).
    float *matA = (float *)malloc(sizeA);
    float *retA = (float *)malloc(sizeA);
    float *retAc = (float *)malloc(sizeAc);
    bool hostOk = (matA != NULL) && (retA != NULL) && (retAc != NULL);
#ifdef FUNCTEST
    float *matAc = (float *)malloc(sizeAc);
    hostOk = hostOk && (matAc != NULL);
#endif
    if (!hostOk) {
        fprintf(stderr, "[!] Host memory allocation failed\n");
        free(matA);
        free(retA);
        free(retAc);
#ifdef FUNCTEST
        free(matAc);
#endif
        return false;
    }

    // Generate input data
    for (unsigned int i = 0; i < countA; i++) {
        matA[i] = i;
    }
    LOG(" [!] FINISHED GENERATING INPUT\n");

#ifdef FUNCTEST
    // Calculate im2col result
    im2colOnHost(matA, matAc, radiusF, countF, L, M, K, C);
    LOG(" [!] FINISHED CALCULATING im2col RESULT ON CPU\n");
#endif


    // Alloc memory and copy data to device
    float *devA = NULL, *devAc = NULL;

    bool ok = cudaOk(cudaMalloc((void**)&devA, sizeA), "cudaMalloc(devA)");
    ok = ok && cudaOk(cudaMalloc((void**)&devAc, sizeAc), "cudaMalloc(devAc)");
    ok = ok && cudaOk(cudaMemcpy(devA, matA, sizeA, cudaMemcpyHostToDevice), "input copy to device");

    // Only touch the device buffers when they were actually set up.
    if (ok) {
        // Compute default grid size if it wasn't passed
        const unsigned int KERNELS_NUM = L * M * C;
        if (gridSize == 0) {
            gridSize = (KERNELS_NUM + blockSize - 1) / blockSize;
        }

        // Run im2col computation on device and copy results.
        // A rejected launch configuration (e.g. a block size above the device
        // limit) is only visible through cudaGetLastError().
        im2colOnDevice<<<gridSize, blockSize>>>(KERNELS_NUM, devAc, devA, radiusF, countF, L, M, K, C);
        ok = cudaOk(cudaGetLastError(), "im2colOnDevice launch") && ok;
        LOG(" [!] FINISHED CALCULATING im2col ON DEVICE\n");

        ok = cudaOk(cudaMemcpy(retAc, devAc, sizeAc, cudaMemcpyDeviceToHost), "im2col result copy to host") && ok;

#ifdef FUNCTEST
        // Compare results
        int success = 1;
        for (unsigned int i = 0; i < countAc; i++) {
            if (retAc[i] != matAc[i]) {
                success = 0;
                printf("TEST FAILED: im2col device kernel...\n");
                break;
            }
        }

        if (success) {
            printf("TEST PASSED: im2col device kernel!\n");
        }
#endif

        // Clear the device input so col2im has to rebuild it from matAc
        ok = cudaOk(cudaMemset(devA, 0, sizeA), "cudaMemset(devA)") && ok;

        // Run col2im computation on device and copy results
        col2imOnDevice<<<gridSize, blockSize>>>(KERNELS_NUM, devA, devAc, radiusF, countF, L, M, K, C);
        ok = cudaOk(cudaGetLastError(), "col2imOnDevice launch") && ok;
        LOG(" [!] FINISHED CALCULATING col2im ON DEVICE\n");

        ok = cudaOk(cudaMemcpy(retA, devA, sizeA, cudaMemcpyDeviceToHost), "col2im result copy to host") && ok;

#ifdef FUNCTEST
        // Compare results
        success = 1;
        for (unsigned int i = 0; i < countA; i++) {
            if (retA[i] != matA[i]) {
                success = 0;
                printf("TEST FAILED: col2im device kernel...\n");
                break;
            }
        }

        if (success) {
            printf("TEST PASSED: col2im device kernel!\n");
        }
#endif
    }

    // CLEAN UP
    cudaFree(devA);
    cudaFree(devAc);

    free(matA);
#ifdef FUNCTEST
    free(matAc);
#endif
    free(retA);
    free(retAc);

    return ok;
}

// Times `runs` consecutive calls of program(blockSize, gridSize) with
// gettimeofday and returns the mean wall-clock time in milliseconds,
// or -1 when any of the runs reported a failure.
static double averageRunTimeMs(unsigned int blockSize, unsigned int gridSize, int runs)
{
    double totalTime = 0;
    bool ok = true;

    for (int i = 0; i < runs; i++) {
        struct timeval t1, t2;

        // Start timer
        gettimeofday(&t1, NULL);

        // WORK HARD!
        ok = program(blockSize, gridSize) && ok;

        // Stop timer
        gettimeofday(&t2, NULL);

        // Compute the elapsed time in millisec
        totalTime += (t2.tv_sec - t1.tv_sec) * 1000.0;    // sec to ms
        totalTime += (t2.tv_usec - t1.tv_usec) / 1000.0;  // us to ms
    }
    LOG(" [!] Whole program took %.3fms averaged over %d runs\n", totalTime / runs, runs);

    return ok ? totalTime / runs : -1.0;
}

// Entry point: one warm-up run, then either a single timed run or (with
// PERFTEST) a sweep over block and grid sizes logged to perflog.csv as
// "blockSize,gridSize,average ms" rows; -1 marks configurations that are
// meaningless or that failed to run.
int main()
{
    // First warm-up run
    LOG("--------- WARM-UP ---------\n");
    program(256);
    LOG("--------- WARM-UP ---------\n\n");

#ifdef PERFTEST
    // Average over 10 runs
    const int totalRuns = 10;

    // Calculate max needed kernels/threads number
    const unsigned int L = H - (K - 1);
    const unsigned int M = W - (K - 1);
    const unsigned int KERNELS_NUM = L * M * C;

    // Open file for perf logs
    std::fstream fperflog("perflog.csv", std::ios::out);
    if (fperflog.good())
    {
        // Measure effect of different block sizes
        const unsigned int MAX_BLOCK_SIZE = 2048;
        for (unsigned int blockSize = 1; blockSize <= MAX_BLOCK_SIZE; blockSize *= 2) {
            const unsigned int MAX_GRID_SIZE = (KERNELS_NUM + blockSize - 1) / blockSize;
            LOG(" [!] For %u blocks, max grid size is %u\n", blockSize, MAX_GRID_SIZE);
            for (unsigned int gridSize = 1; gridSize <= 8192; gridSize *= 2) {
                // Meaningless data when the grid has more blocks than the
                // data can utilize: keep the -1 marker for those rows.
                double averageMs = -1.0;
                if (gridSize <= MAX_GRID_SIZE) {
                    averageMs = averageRunTimeMs(blockSize, gridSize, totalRuns);
                }
                // Log the average over all runs, not just the last sample.
                fperflog << blockSize << "," << gridSize << "," << averageMs << std::endl;
            }
        }

        // Close file
        fperflog.close();
    } else {
        fprintf(stderr, "[!] Could not open perflog.csv for writing\n");
    }
#else
    // Single timed run with the default block size and an auto-sized grid
    averageRunTimeMs(256, 0, 1);
#endif

    return EXIT_SUCCESS;
}
-------------------------------------------------------------------------------- /Visualise results.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%matplotlib inline" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Visualise im2col performance test results\n", 17 | "\n", 18 | "We measure program execution time in function of block and grid size before and after optimisation. Results are visualised using heatmap in matplotlib." 
19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 88, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from matplotlib import cm\n", 28 | "import matplotlib.pyplot as plt\n", 29 | "plt.rcParams['figure.figsize'] = 11.7,8.27\n", 30 | "import numpy as np\n", 31 | "import seaborn as sns\n", 32 | "\n", 33 | "PERFLOG_BEFORE_PATH = \"./log/perflog_write_coalescing.csv\"\n", 34 | "PERFLOG_AFTER_PATH = \"./log/perflog_final.csv\"\n", 35 | "GRID_SHAPE = (12, 14)\n", 36 | "\n", 37 | "# Load measurements\n", 38 | "data_before = np.loadtxt(PERFLOG_BEFORE_PATH, delimiter=',')\n", 39 | "data_after = np.loadtxt(PERFLOG_AFTER_PATH, delimiter=',')" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Preprocess data" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 89, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "before = data_before[:, 2].reshape(GRID_SHAPE)\n", 56 | "after = data_after[:, 2].reshape(GRID_SHAPE)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Visualise" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 90, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stderr", 73 | "output_type": "stream", 74 | "text": [ 75 | "/usr/lib64/python3.4/site-packages/matplotlib/figure.py:459: UserWarning: matplotlib is currently using a non-GUI backend, so cannot show the figure\n", 76 | " \"matplotlib is currently using a non-GUI backend, \"\n" 77 | ] 78 | }, 79 | { 80 | "data": { 81 | "image/png": "\n", 82 | "text/plain": [ 83 | "
" 84 | ] 85 | }, 86 | "metadata": {}, 87 | "output_type": "display_data" 88 | } 89 | ], 90 | "source": [ 91 | "fig = plt.figure(figsize=plt.figaspect(0.33))\n", 92 | "ax1 = fig.add_subplot(1, 2, 1)\n", 93 | "ax2 = fig.add_subplot(1, 2, 2)\n", 94 | "\n", 95 | "sns.heatmap(before, vmin=0, vmax=250, ax=ax1)\n", 96 | "ax1.set_title('Execution time in ms BEFORE optimisation')\n", 97 | "ax1.set_xlabel('Block Size [2^n]')\n", 98 | "ax1.set_ylabel('Grid Size [2^n]')\n", 99 | "\n", 100 | "sns.heatmap(after, vmin=0, vmax=250, ax=ax2)\n", 101 | "ax2.set_title('Execution time in ms AFTER optimisation')\n", 102 | "ax2.set_xlabel('Block Size [2^n]')\n", 103 | "ax2.set_ylabel('Grid Size [2^n]')\n", 104 | "\n", 105 | "fig.show()" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 91, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "image/png": "\n", 116 | "text/plain": [ 117 | "
" 118 | ] 119 | }, 120 | "metadata": {}, 121 | "output_type": "display_data" 122 | } 123 | ], 124 | "source": [ 125 | "ax = sns.heatmap(before - after, vmin=-170, vmax=170, center=0, annot=True, fmt=\"3.0f\")\n", 126 | "ax.set_title('Difference in execution time in ms')\n", 127 | "ax.set_xlabel('Block Size [2^n]')\n", 128 | "ax.set_ylabel('Grid Size [2^n]')\n", 129 | "\n", 130 | "plt.show()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [] 139 | } 140 | ], 141 | "metadata": { 142 | "kernelspec": { 143 | "display_name": "Python 3", 144 | "language": "python", 145 | "name": "python3" 146 | }, 147 | "language_info": { 148 | "codemirror_mode": { 149 | "name": "ipython", 150 | "version": 3 151 | }, 152 | "file_extension": ".py", 153 | "mimetype": "text/x-python", 154 | "name": "python", 155 | "nbconvert_exporter": "python", 156 | "pygments_lexer": "ipython3", 157 | "version": "3.4.6" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 2 162 | } 163 | --------------------------------------------------------------------------------