├── 00_googleDrive_CUDAExam.ipynb
├── 01_PyCUDA_simple_example.ipynb
├── 01_cuda_lab
│   ├── 01_simple.ipynb
│   ├── 02_openmp.ipynb
│   ├── 03_simple_avx.ipynb
│   ├── 04_helloCUDA.ipynb
│   ├── 05_vectorAdd.ipynb
│   ├── 06_2DIndex.ipynb
│   ├── 07_memoryType.ipynb
│   ├── 08_DeviceQuery_Bandwidth.ipynb
│   ├── 09_coalMemory.ipynb
│   ├── README.md
│   └── clock.cu
├── 02_cuda_lab
│   ├── 00_UnifiedMemory_SharedMem.ipynb
│   ├── 00_googleDrive_CUDAExam.ipynb
│   ├── 01_Transpose.ipynb
│   ├── 03_reduction.ipynb
│   ├── 04_atomic.ipynb
│   ├── README.md
│   ├── atomicAdd.cu
│   ├── clock.cu
│   ├── gpu_timer.h
│   └── reduction_all.cu
├── 03_cuda_lab
│   ├── 01_matmul.cu
│   ├── 02_matmul_tile.cu
│   ├── 03_matmul_tile2_mem_coel.cu
│   ├── 04_matmul_tile3_noBankConflict.cu
│   ├── 05_matmul_tile4_unroll.cu
│   ├── 07_async_streams.cu
│   ├── 08_stream_n_event.ipynb
│   ├── README.md
│   └── clock.cu
├── 03_numba_vectorize.ipynb
├── PPTs
│   ├── 001_Intro. Parallel Computing.pptx
│   ├── README.md
│   ├── Robot_02_CUDA I - Basic Programming.pdf
│   ├── Robot_03_CUDA II - Optimization - Transpose.pdf
│   ├── Robot_04_CUDA III - Optimization - Reductions.pdf
│   ├── Robot_05_CUDA IV - Optimization - Mat-Mat Multiplication.pdf
│   └── Robot_06_CUDA V - Synchronization Stream.pdf
├── README.md
├── colab_gdrive.ipynb
└── hello_CUDA.ipynb
/01_PyCUDA_simple_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "PyCUDA_example.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "accelerator": "GPU"
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "id": "view-in-github",
22 | "colab_type": "text"
23 | },
24 | "source": [
25 | "
"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "id": "Ab3KZ1wGAc9b",
32 | "colab_type": "text"
33 | },
34 | "source": [
35 | "## CUDA Programming on Colab with PyCUDA\n",
36 | "\n",
37 | "Colab에서 제공하는 python 환경을 효과적으로 사용한 CUDA Programming !\n",
38 | "\n",
39 | "- 한림대학교 이정근 교수\n",
40 | "- Email: jeonggun.lee@gmail.com"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {
46 | "id": "MbL_f-YCAtdA",
47 | "colab_type": "text"
48 | },
49 | "source": [
50 | "---\n",
51 | "Colab에서 PyCUDA를 사용하기 위해서는 PyCUDA를 먼저 설치해주어야 합니다.\n",
52 | "\n",
53 | "간단히, pip 명령어를 이용하여 설치 할 수 있습니다.\n"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "metadata": {
59 | "id": "BEZ-wZwUTUhf",
60 | "colab_type": "code",
61 | "outputId": "4723fcf7-432d-444d-e964-6b44636b52b7",
62 | "colab": {
63 | "base_uri": "https://localhost:8080/",
64 | "height": 600
65 | }
66 | },
67 | "source": [
68 | "!pip install pycuda"
69 | ],
70 | "execution_count": 1,
71 | "outputs": [
72 | {
73 | "output_type": "stream",
74 | "text": [
75 | "Collecting pycuda\n",
76 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/4d/29/5a3eb66c2f1a4adc681f6c8131e9ed677af31b0c8a78726d540bd44b3403/pycuda-2019.1.tar.gz (1.6MB)\n",
77 | "\u001b[K |████████████████████████████████| 1.6MB 4.8MB/s \n",
78 | "\u001b[?25hCollecting pytools>=2011.2 (from pycuda)\n",
79 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/00/96/00416762a3eda8876a17d007df4a946f46b2e4ee1057e0b9714926472ef8/pytools-2019.1.1.tar.gz (58kB)\n",
80 | "\u001b[K |████████████████████████████████| 61kB 24.6MB/s \n",
81 | "\u001b[?25hRequirement already satisfied: pytest>=2 in /usr/local/lib/python3.6/dist-packages (from pycuda) (3.6.4)\n",
82 | "Requirement already satisfied: decorator>=3.2.0 in /usr/local/lib/python3.6/dist-packages (from pycuda) (4.4.0)\n",
83 | "Collecting appdirs>=1.4.0 (from pycuda)\n",
84 | " Downloading https://files.pythonhosted.org/packages/56/eb/810e700ed1349edde4cbdc1b2a21e28cdf115f9faf263f6bbf8447c1abf3/appdirs-1.4.3-py2.py3-none-any.whl\n",
85 | "Collecting mako (from pycuda)\n",
86 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/0a/af/a6d8aa7b8909a36074f517b15222e3a2fbd5ef3452c0a686e3d43043dd3b/Mako-1.0.12.tar.gz (460kB)\n",
87 | "\u001b[K |████████████████████████████████| 460kB 52.9MB/s \n",
88 | "\u001b[?25hRequirement already satisfied: six>=1.8.0 in /usr/local/lib/python3.6/dist-packages (from pytools>=2011.2->pycuda) (1.12.0)\n",
89 | "Requirement already satisfied: numpy>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from pytools>=2011.2->pycuda) (1.16.4)\n",
90 | "Requirement already satisfied: more-itertools>=4.0.0 in /usr/local/lib/python3.6/dist-packages (from pytest>=2->pycuda) (7.0.0)\n",
91 | "Requirement already satisfied: pluggy<0.8,>=0.5 in /usr/local/lib/python3.6/dist-packages (from pytest>=2->pycuda) (0.7.1)\n",
92 | "Requirement already satisfied: atomicwrites>=1.0 in /usr/local/lib/python3.6/dist-packages (from pytest>=2->pycuda) (1.3.0)\n",
93 | "Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from pytest>=2->pycuda) (19.1.0)\n",
94 | "Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from pytest>=2->pycuda) (41.0.1)\n",
95 | "Requirement already satisfied: py>=1.5.0 in /usr/local/lib/python3.6/dist-packages (from pytest>=2->pycuda) (1.8.0)\n",
96 | "Requirement already satisfied: MarkupSafe>=0.9.2 in /usr/local/lib/python3.6/dist-packages (from mako->pycuda) (1.1.1)\n",
97 | "Building wheels for collected packages: pycuda, pytools, mako\n",
98 | " Building wheel for pycuda (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
99 | " Stored in directory: /root/.cache/pip/wheels/de/c2/d5/351a6b47b20d417e82a669cf53f8cb4d7b55a57f73cbd05184\n",
100 | " Building wheel for pytools (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
101 | " Stored in directory: /root/.cache/pip/wheels/83/df/0b/75ac4572aaa93e3eba6a58472635d0fda907f5f4cf884a3a0c\n",
102 | " Building wheel for mako (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
103 | " Stored in directory: /root/.cache/pip/wheels/b3/7b/ae/5addd138cd8175503b9782737bada30d0c88310d08c106f9bf\n",
104 | "Successfully built pycuda pytools mako\n",
105 | "Installing collected packages: appdirs, pytools, mako, pycuda\n",
106 | "Successfully installed appdirs-1.4.3 mako-1.0.12 pycuda-2019.1 pytools-2019.1.1\n"
107 | ],
108 | "name": "stdout"
109 | }
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "metadata": {
115 | "id": "XeL9F-WmSw-1",
116 | "colab_type": "code",
117 | "outputId": "68ae61c1-4142-4d8f-e110-052196bc1387",
118 | "colab": {
119 | "base_uri": "https://localhost:8080/",
120 | "height": 105
121 | }
122 | },
123 | "source": [
124 | "import pycuda.autoinit\n",
125 | "import pycuda.driver as drv\n",
126 | "import numpy\n",
127 | "\n",
128 | "from pycuda.compiler import SourceModule\n",
129 | "\n",
130 | "mod = SourceModule(\"\"\"\n",
131 | "__global__ void multiply_them(float *dest, float *a, float *b)\n",
132 | "{\n",
133 | " const int i = threadIdx.x;\n",
134 | " dest[i] = a[i] * b[i];\n",
135 | "}\n",
136 | "\"\"\")\n",
137 | "\n",
138 | "multiply_them = mod.get_function(\"multiply_them\")\n",
139 | "\n",
140 | "a = numpy.random.randn(10).astype(numpy.float32)\n",
141 | "b = numpy.random.randn(10).astype(numpy.float32)\n",
142 | "\n",
143 | "print(a)\n",
144 | "print(b)\n",
145 | "\n",
146 | "dest = numpy.zeros_like(a)\n",
147 | "multiply_them(\n",
148 | " drv.Out(dest), drv.In(a), drv.In(b),\n",
149 | " block=(400,1,1), grid=(1,1))\n",
150 | "\n",
151 | "print(dest-a*b)"
152 | ],
153 | "execution_count": 2,
154 | "outputs": [
155 | {
156 | "output_type": "stream",
157 | "text": [
158 | "[ 0.80295664 -1.2928731 1.3868954 0.7483168 1.532189 -0.8343009\n",
159 | " -0.64259315 0.49740788 -1.4714196 1.5013071 ]\n",
160 | "[ 0.4420121 -0.3540332 0.2971032 1.5825133 0.00366833 0.37378067\n",
161 | " 1.2133838 -0.68266094 -0.42567483 -0.39658982]\n",
162 | "[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n"
163 | ],
164 | "name": "stdout"
165 | }
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "metadata": {
171 | "id": "woAIxy0fUqUa",
172 | "colab_type": "code",
173 | "colab": {}
174 | },
175 | "source": [
176 | ""
177 | ],
178 | "execution_count": 0,
179 | "outputs": []
180 | }
181 | ]
182 | }
--------------------------------------------------------------------------------
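
The multiply_them kernel above indexes purely by threadIdx.x and relies on the launch configuration matching the array length exactly. A minimal sketch of a guarded variant (the length parameter n is an assumption, not part of the notebook) stays correct even when more threads are launched than there are elements:

```cuda
// Guarded element-wise multiply (sketch): the extra parameter n is hypothetical.
__global__ void multiply_them_guarded(float *dest, const float *a,
                                      const float *b, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)                      // bounds check: surplus threads do nothing
        dest[i] = a[i] * b[i];
}
```

With PyCUDA, n can be passed as numpy.int32(a.size) alongside the drv.In/drv.Out arguments.
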
/01_cuda_lab/02_openmp.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "02_openmp.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {
20 | "id": "view-in-github",
21 | "colab_type": "text"
22 | },
23 | "source": [
24 | "
"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {
30 | "id": "0aLN7JFf3war",
31 | "colab_type": "text"
32 | },
33 | "source": [
34 | "# OpenMP Exercise"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "metadata": {
40 | "id": "7lf4rqk9xgwC",
41 | "colab_type": "code",
42 | "colab": {
43 | "base_uri": "https://localhost:8080/",
44 | "height": 35
45 | },
46 | "outputId": "ef6329f9-7852-4781-81e1-12da47553643"
47 | },
48 | "source": [
49 | "%%writefile openmp1.c\n",
50 | "\n",
51 | "#include\n",
52 | "\n",
53 | "int main(void)\n",
54 | "{\n",
55 | " #pragma omp parallel\n",
56 | " {\n",
57 | " printf(\"Hello, world.\\n\");\n",
58 | " }\n",
59 | "\n",
60 | " return 0;\n",
61 | "}"
62 | ],
63 | "execution_count": 7,
64 | "outputs": [
65 | {
66 | "output_type": "stream",
67 | "text": [
68 | "Overwriting openmp1.c\n"
69 | ],
70 | "name": "stdout"
71 | }
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "metadata": {
77 | "id": "xCoKVJiaxzWu",
78 | "colab_type": "code",
79 | "colab": {}
80 | },
81 | "source": [
82 | "!gcc -fopenmp openmp1.c -o openmp1"
83 | ],
84 | "execution_count": 0,
85 | "outputs": []
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {
90 | "id": "1TQb9w95yI4t",
91 | "colab_type": "text"
92 | },
93 | "source": [
94 | "colab 시스템이 기본적으로 two core 시스템이기 때문에 2번 진행"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "metadata": {
100 | "id": "Y5l7a06OyAym",
101 | "colab_type": "code",
102 | "colab": {
103 | "base_uri": "https://localhost:8080/",
104 | "height": 53
105 | },
106 | "outputId": "087da35e-966a-4fa6-e180-4b691ca0c1a4"
107 | },
108 | "source": [
109 | "!./openmp1"
110 | ],
111 | "execution_count": 6,
112 | "outputs": [
113 | {
114 | "output_type": "stream",
115 | "text": [
116 | "Hello, world.\n",
117 | "Hello, world.\n"
118 | ],
119 | "name": "stdout"
120 | }
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "metadata": {
126 | "id": "jG-E21e62fw2",
127 | "colab_type": "code",
128 | "colab": {
129 | "base_uri": "https://localhost:8080/",
130 | "height": 35
131 | },
132 | "outputId": "1c9919e9-7916-486d-be02-c23910489f6f"
133 | },
134 | "source": [
135 | "%%writefile mmul.c\n",
136 | "\n",
137 | "#include \n",
138 | "#include \n",
139 | "#include \n",
140 | "\n",
141 | "void transpose(double *A, double *B, int n) {\n",
142 | " int i,j;\n",
143 | " for(i=0; i
"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "metadata": {
30 | "id": "ne9p904zlLgF",
31 | "colab_type": "code",
32 | "colab": {
33 | "base_uri": "https://localhost:8080/",
34 | "height": 994
35 | },
36 | "outputId": "32bcdc97-e43a-4150-f39b-e91ba2a0d803"
37 | },
38 | "source": [
39 | "!cat /proc/cpuinfo"
40 | ],
41 | "execution_count": 1,
42 | "outputs": [
43 | {
44 | "output_type": "stream",
45 | "text": [
46 | "processor\t: 0\n",
47 | "vendor_id\t: GenuineIntel\n",
48 | "cpu family\t: 6\n",
49 | "model\t\t: 79\n",
50 | "model name\t: Intel(R) Xeon(R) CPU @ 2.20GHz\n",
51 | "stepping\t: 0\n",
52 | "microcode\t: 0x1\n",
53 | "cpu MHz\t\t: 2200.000\n",
54 | "cache size\t: 56320 KB\n",
55 | "physical id\t: 0\n",
56 | "siblings\t: 2\n",
57 | "core id\t\t: 0\n",
58 | "cpu cores\t: 1\n",
59 | "apicid\t\t: 0\n",
60 | "initial apicid\t: 0\n",
61 | "fpu\t\t: yes\n",
62 | "fpu_exception\t: yes\n",
63 | "cpuid level\t: 13\n",
64 | "wp\t\t: yes\n",
65 | "flags\t\t: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities\n",
66 | "bugs\t\t: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs\n",
67 | "bogomips\t: 4400.00\n",
68 | "clflush size\t: 64\n",
69 | "cache_alignment\t: 64\n",
70 | "address sizes\t: 46 bits physical, 48 bits virtual\n",
71 | "power management:\n",
72 | "\n",
73 | "processor\t: 1\n",
74 | "vendor_id\t: GenuineIntel\n",
75 | "cpu family\t: 6\n",
76 | "model\t\t: 79\n",
77 | "model name\t: Intel(R) Xeon(R) CPU @ 2.20GHz\n",
78 | "stepping\t: 0\n",
79 | "microcode\t: 0x1\n",
80 | "cpu MHz\t\t: 2200.000\n",
81 | "cache size\t: 56320 KB\n",
82 | "physical id\t: 0\n",
83 | "siblings\t: 2\n",
84 | "core id\t\t: 0\n",
85 | "cpu cores\t: 1\n",
86 | "apicid\t\t: 1\n",
87 | "initial apicid\t: 1\n",
88 | "fpu\t\t: yes\n",
89 | "fpu_exception\t: yes\n",
90 | "cpuid level\t: 13\n",
91 | "wp\t\t: yes\n",
92 | "flags\t\t: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities\n",
93 | "bugs\t\t: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs\n",
94 | "bogomips\t: 4400.00\n",
95 | "clflush size\t: 64\n",
96 | "cache_alignment\t: 64\n",
97 | "address sizes\t: 46 bits physical, 48 bits virtual\n",
98 | "power management:\n",
99 | "\n"
100 | ],
101 | "name": "stdout"
102 | }
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {
108 | "id": "Sg_Mr--y-e1K",
109 | "colab_type": "text"
110 | },
111 | "source": [
112 | "# Simple Example of SIMD (AVX)"
113 | ]
114 | },
115 | {
116 | "cell_type": "markdown",
117 | "metadata": {
118 | "id": "jEjJGm8V-fTp",
119 | "colab_type": "text"
120 | },
121 | "source": [
122 | ""
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "metadata": {
128 | "id": "2af1XxZT4amU",
129 | "colab_type": "code",
130 | "outputId": "dcb81f6f-378b-468e-d1d9-c2d915fa9e5b",
131 | "colab": {
132 | "base_uri": "https://localhost:8080/",
133 | "height": 35
134 | }
135 | },
136 | "source": [
137 | "%%writefile avx.cpp\n",
138 | "\n",
139 | "#include \n",
140 | "\n",
141 | "#ifdef __AVX__\n",
142 | " #include \n",
143 | "#else\n",
144 | " #warning No AVX support - will not compile\n",
145 | "#endif\n",
146 | "\n",
147 | "int main(int argc, char **argv)\n",
148 | "{\n",
149 | " __m256 a = _mm256_set_ps(8.0, 7.0, 6.0, 5.0, \n",
150 | " 4.0, 3.0, 2.0, 1.0);\n",
151 | " __m256 b = _mm256_set_ps(18.0, 17.0, 16.0, 15.0, \n",
152 | " 14.0, 13.0, 12.0, 11.0);\n",
153 | "\n",
154 | " __m256 c = _mm256_add_ps(a, b);\n",
155 | "\n",
156 | " float d[8];\n",
157 | " _mm256_storeu_ps(d, c);\n",
158 | "\n",
159 | " std::cout << \"result equals \" << d[0] << \",\" << d[1]\n",
160 | " << \",\" << d[2] << \",\" << d[3] << \",\"\n",
161 | " << d[4] << \",\" << d[5] << \",\" << d[6] << \",\"\n",
162 | " << d[7] << std::endl;\n",
163 | "\n",
164 | " return 0;\n",
165 | "}"
166 | ],
167 | "execution_count": 2,
168 | "outputs": [
169 | {
170 | "output_type": "stream",
171 | "text": [
172 | "Writing avx.cpp\n"
173 | ],
174 | "name": "stdout"
175 | }
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "metadata": {
181 | "id": "8TkDqqyl4hPZ",
182 | "colab_type": "code",
183 | "colab": {}
184 | },
185 | "source": [
186 | "!g++ --std=c++14 -S -mavx avx.cpp"
187 | ],
188 | "execution_count": 0,
189 | "outputs": []
190 | },
191 | {
192 | "cell_type": "code",
193 | "metadata": {
194 | "id": "VO0SfpASnTkA",
195 | "colab_type": "code",
196 | "colab": {
197 | "base_uri": "https://localhost:8080/",
198 | "height": 35
199 | },
200 | "outputId": "7d5d47f6-79b8-4d2b-a398-09ebad4369d5"
201 | },
202 | "source": [
203 | "!ls"
204 | ],
205 | "execution_count": 15,
206 | "outputs": [
207 | {
208 | "output_type": "stream",
209 | "text": [
210 | "avx avx.cpp avx.s sample_data\n"
211 | ],
212 | "name": "stdout"
213 | }
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "metadata": {
219 | "id": "zWVMevlFnUNu",
220 | "colab_type": "code",
221 | "colab": {
222 | "base_uri": "https://localhost:8080/",
223 | "height": 1000
224 | },
225 | "outputId": "897e6cc6-2aa7-4cd2-aed8-4d28479ab416"
226 | },
227 | "source": [
228 | "!cat avx.s"
229 | ],
230 | "execution_count": 16,
231 | "outputs": [
232 | {
233 | "output_type": "stream",
234 | "text": [
235 | "\t.file\t\"avx.cpp\"\n",
236 | "\t.text\n",
237 | "\t.section\t.rodata\n",
238 | "\t.type\t_ZStL19piecewise_construct, @object\n",
239 | "\t.size\t_ZStL19piecewise_construct, 1\n",
240 | "_ZStL19piecewise_construct:\n",
241 | "\t.zero\t1\n",
242 | "\t.local\t_ZStL8__ioinit\n",
243 | "\t.comm\t_ZStL8__ioinit,1,1\n",
244 | ".LC16:\n",
245 | "\t.string\t\"result equals \"\n",
246 | ".LC17:\n",
247 | "\t.string\t\",\"\n",
248 | "\t.text\n",
249 | "\t.globl\tmain\n",
250 | "\t.type\tmain, @function\n",
251 | "main:\n",
252 | ".LFB5151:\n",
253 | "\t.cfi_startproc\n",
254 | "\tleaq\t8(%rsp), %r10\n",
255 | "\t.cfi_def_cfa 10, 0\n",
256 | "\tandq\t$-32, %rsp\n",
257 | "\tpushq\t-8(%r10)\n",
258 | "\tpushq\t%rbp\n",
259 | "\t.cfi_escape 0x10,0x6,0x2,0x76,0\n",
260 | "\tmovq\t%rsp, %rbp\n",
261 | "\tpushq\t%r10\n",
262 | "\t.cfi_escape 0xf,0x3,0x76,0x78,0x6\n",
263 | "\tsubq\t$360, %rsp\n",
264 | "\tmovl\t%edi, -356(%rbp)\n",
265 | "\tmovq\t%rsi, -368(%rbp)\n",
266 | "\tmovq\t%fs:40, %rax\n",
267 | "\tmovq\t%rax, -24(%rbp)\n",
268 | "\txorl\t%eax, %eax\n",
269 | "\tvmovss\t.LC0(%rip), %xmm0\n",
270 | "\tvmovss\t%xmm0, -312(%rbp)\n",
271 | "\tvmovss\t.LC1(%rip), %xmm0\n",
272 | "\tvmovss\t%xmm0, -308(%rbp)\n",
273 | "\tvmovss\t.LC2(%rip), %xmm0\n",
274 | "\tvmovss\t%xmm0, -304(%rbp)\n",
275 | "\tvmovss\t.LC3(%rip), %xmm0\n",
276 | "\tvmovss\t%xmm0, -300(%rbp)\n",
277 | "\tvmovss\t.LC4(%rip), %xmm0\n",
278 | "\tvmovss\t%xmm0, -296(%rbp)\n",
279 | "\tvmovss\t.LC5(%rip), %xmm0\n",
280 | "\tvmovss\t%xmm0, -292(%rbp)\n",
281 | "\tvmovss\t.LC6(%rip), %xmm0\n",
282 | "\tvmovss\t%xmm0, -288(%rbp)\n",
283 | "\tvmovss\t.LC7(%rip), %xmm0\n",
284 | "\tvmovss\t%xmm0, -284(%rbp)\n",
285 | "\tvmovss\t-312(%rbp), %xmm1\n",
286 | "\tvmovss\t-308(%rbp), %xmm0\n",
287 | "\tvunpcklps\t%xmm1, %xmm0, %xmm2\n",
288 | "\tvmovss\t-304(%rbp), %xmm1\n",
289 | "\tvmovss\t-300(%rbp), %xmm0\n",
290 | "\tvunpcklps\t%xmm1, %xmm0, %xmm1\n",
291 | "\tvmovss\t-296(%rbp), %xmm3\n",
292 | "\tvmovss\t-292(%rbp), %xmm0\n",
293 | "\tvunpcklps\t%xmm3, %xmm0, %xmm3\n",
294 | "\tvmovss\t-288(%rbp), %xmm4\n",
295 | "\tvmovss\t-284(%rbp), %xmm0\n",
296 | "\tvunpcklps\t%xmm4, %xmm0, %xmm0\n",
297 | "\tvmovlhps\t%xmm3, %xmm0, %xmm0\n",
298 | "\tvmovlhps\t%xmm2, %xmm1, %xmm1\n",
299 | "\tvinsertf128\t$0x1, %xmm1, %ymm0, %ymm0\n",
300 | "\tvmovaps\t%ymm0, -272(%rbp)\n",
301 | "\tvmovss\t.LC8(%rip), %xmm0\n",
302 | "\tvmovss\t%xmm0, -344(%rbp)\n",
303 | "\tvmovss\t.LC9(%rip), %xmm0\n",
304 | "\tvmovss\t%xmm0, -340(%rbp)\n",
305 | "\tvmovss\t.LC10(%rip), %xmm0\n",
306 | "\tvmovss\t%xmm0, -336(%rbp)\n",
307 | "\tvmovss\t.LC11(%rip), %xmm0\n",
308 | "\tvmovss\t%xmm0, -332(%rbp)\n",
309 | "\tvmovss\t.LC12(%rip), %xmm0\n",
310 | "\tvmovss\t%xmm0, -328(%rbp)\n",
311 | "\tvmovss\t.LC13(%rip), %xmm0\n",
312 | "\tvmovss\t%xmm0, -324(%rbp)\n",
313 | "\tvmovss\t.LC14(%rip), %xmm0\n",
314 | "\tvmovss\t%xmm0, -320(%rbp)\n",
315 | "\tvmovss\t.LC15(%rip), %xmm0\n",
316 | "\tvmovss\t%xmm0, -316(%rbp)\n",
317 | "\tvmovss\t-344(%rbp), %xmm1\n",
318 | "\tvmovss\t-340(%rbp), %xmm0\n",
319 | "\tvunpcklps\t%xmm1, %xmm0, %xmm2\n",
320 | "\tvmovss\t-336(%rbp), %xmm1\n",
321 | "\tvmovss\t-332(%rbp), %xmm0\n",
322 | "\tvunpcklps\t%xmm1, %xmm0, %xmm1\n",
323 | "\tvmovss\t-328(%rbp), %xmm3\n",
324 | "\tvmovss\t-324(%rbp), %xmm0\n",
325 | "\tvunpcklps\t%xmm3, %xmm0, %xmm3\n",
326 | "\tvmovss\t-320(%rbp), %xmm4\n",
327 | "\tvmovss\t-316(%rbp), %xmm0\n",
328 | "\tvunpcklps\t%xmm4, %xmm0, %xmm0\n",
329 | "\tvmovlhps\t%xmm3, %xmm0, %xmm0\n",
330 | "\tvmovlhps\t%xmm2, %xmm1, %xmm1\n",
331 | "\tvinsertf128\t$0x1, %xmm1, %ymm0, %ymm0\n",
332 | "\tvmovaps\t%ymm0, -240(%rbp)\n",
333 | "\tvmovaps\t-272(%rbp), %ymm0\n",
334 | "\tvmovaps\t%ymm0, -144(%rbp)\n",
335 | "\tvmovaps\t-240(%rbp), %ymm0\n",
336 | "\tvmovaps\t%ymm0, -112(%rbp)\n",
337 | "\tvmovaps\t-144(%rbp), %ymm0\n",
338 | "\tvaddps\t-112(%rbp), %ymm0, %ymm0\n",
339 | "\tvmovaps\t%ymm0, -208(%rbp)\n",
340 | "\tleaq\t-64(%rbp), %rax\n",
341 | "\tmovq\t%rax, -280(%rbp)\n",
342 | "\tvmovaps\t-208(%rbp), %ymm0\n",
343 | "\tvmovaps\t%ymm0, -176(%rbp)\n",
344 | "\tvmovaps\t-176(%rbp), %ymm0\n",
345 | "\tmovq\t-280(%rbp), %rax\n",
346 | "\tvmovups\t%xmm0, (%rax)\n",
347 | "\tvextractf128\t$0x1, %ymm0, 16(%rax)\n",
348 | "\tleaq\t.LC16(%rip), %rsi\n",
349 | "\tleaq\t_ZSt4cout(%rip), %rdi\n",
350 | "\tcall\t_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc@PLT\n",
351 | "\tmovq\t%rax, %rdx\n",
352 | "\tmovl\t-64(%rbp), %eax\n",
353 | "\tmovl\t%eax, -360(%rbp)\n",
354 | "\tvmovss\t-360(%rbp), %xmm0\n",
355 | "\tmovq\t%rdx, %rdi\n",
356 | "\tcall\t_ZNSolsEf@PLT\n",
357 | "\tleaq\t.LC17(%rip), %rsi\n",
358 | "\tmovq\t%rax, %rdi\n",
359 | "\tcall\t_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc@PLT\n",
360 | "\tmovq\t%rax, %rdx\n",
361 | "\tmovl\t-60(%rbp), %eax\n",
362 | "\tmovl\t%eax, -360(%rbp)\n",
363 | "\tvmovss\t-360(%rbp), %xmm0\n",
364 | "\tmovq\t%rdx, %rdi\n",
365 | "\tcall\t_ZNSolsEf@PLT\n",
366 | "\tleaq\t.LC17(%rip), %rsi\n",
367 | "\tmovq\t%rax, %rdi\n",
368 | "\tcall\t_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc@PLT\n",
369 | "\tmovq\t%rax, %rdx\n",
370 | "\tmovl\t-56(%rbp), %eax\n",
371 | "\tmovl\t%eax, -360(%rbp)\n",
372 | "\tvmovss\t-360(%rbp), %xmm0\n",
373 | "\tmovq\t%rdx, %rdi\n",
374 | "\tcall\t_ZNSolsEf@PLT\n",
375 | "\tleaq\t.LC17(%rip), %rsi\n",
376 | "\tmovq\t%rax, %rdi\n",
377 | "\tcall\t_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc@PLT\n",
378 | "\tmovq\t%rax, %rdx\n",
379 | "\tmovl\t-52(%rbp), %eax\n",
380 | "\tmovl\t%eax, -360(%rbp)\n",
381 | "\tvmovss\t-360(%rbp), %xmm0\n",
382 | "\tmovq\t%rdx, %rdi\n",
383 | "\tcall\t_ZNSolsEf@PLT\n",
384 | "\tleaq\t.LC17(%rip), %rsi\n",
385 | "\tmovq\t%rax, %rdi\n",
386 | "\tcall\t_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc@PLT\n",
387 | "\tmovq\t%rax, %rdx\n",
388 | "\tmovl\t-48(%rbp), %eax\n",
389 | "\tmovl\t%eax, -360(%rbp)\n",
390 | "\tvmovss\t-360(%rbp), %xmm0\n",
391 | "\tmovq\t%rdx, %rdi\n",
392 | "\tcall\t_ZNSolsEf@PLT\n",
393 | "\tleaq\t.LC17(%rip), %rsi\n",
394 | "\tmovq\t%rax, %rdi\n",
395 | "\tcall\t_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc@PLT\n",
396 | "\tmovq\t%rax, %rdx\n",
397 | "\tmovl\t-44(%rbp), %eax\n",
398 | "\tmovl\t%eax, -360(%rbp)\n",
399 | "\tvmovss\t-360(%rbp), %xmm0\n",
400 | "\tmovq\t%rdx, %rdi\n",
401 | "\tcall\t_ZNSolsEf@PLT\n",
402 | "\tleaq\t.LC17(%rip), %rsi\n",
403 | "\tmovq\t%rax, %rdi\n",
404 | "\tcall\t_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc@PLT\n",
405 | "\tmovq\t%rax, %rdx\n",
406 | "\tmovl\t-40(%rbp), %eax\n",
407 | "\tmovl\t%eax, -360(%rbp)\n",
408 | "\tvmovss\t-360(%rbp), %xmm0\n",
409 | "\tmovq\t%rdx, %rdi\n",
410 | "\tcall\t_ZNSolsEf@PLT\n",
411 | "\tleaq\t.LC17(%rip), %rsi\n",
412 | "\tmovq\t%rax, %rdi\n",
413 | "\tcall\t_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc@PLT\n",
414 | "\tmovq\t%rax, %rdx\n",
415 | "\tmovl\t-36(%rbp), %eax\n",
416 | "\tmovl\t%eax, -360(%rbp)\n",
417 | "\tvmovss\t-360(%rbp), %xmm0\n",
418 | "\tmovq\t%rdx, %rdi\n",
419 | "\tcall\t_ZNSolsEf@PLT\n",
420 | "\tmovq\t%rax, %rdx\n",
421 | "\tmovq\t_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_@GOTPCREL(%rip), %rax\n",
422 | "\tmovq\t%rax, %rsi\n",
423 | "\tmovq\t%rdx, %rdi\n",
424 | "\tcall\t_ZNSolsEPFRSoS_E@PLT\n",
425 | "\tmovl\t$0, %eax\n",
426 | "\tmovq\t-24(%rbp), %rcx\n",
427 | "\txorq\t%fs:40, %rcx\n",
428 | "\tje\t.L6\n",
429 | "\tcall\t__stack_chk_fail@PLT\n",
430 | ".L6:\n",
431 | "\taddq\t$360, %rsp\n",
432 | "\tpopq\t%r10\n",
433 | "\t.cfi_def_cfa 10, 0\n",
434 | "\tpopq\t%rbp\n",
435 | "\tleaq\t-8(%r10), %rsp\n",
436 | "\t.cfi_def_cfa 7, 8\n",
437 | "\tret\n",
438 | "\t.cfi_endproc\n",
439 | ".LFE5151:\n",
440 | "\t.size\tmain, .-main\n",
441 | "\t.type\t_Z41__static_initialization_and_destruction_0ii, @function\n",
442 | "_Z41__static_initialization_and_destruction_0ii:\n",
443 | ".LFB5641:\n",
444 | "\t.cfi_startproc\n",
445 | "\tpushq\t%rbp\n",
446 | "\t.cfi_def_cfa_offset 16\n",
447 | "\t.cfi_offset 6, -16\n",
448 | "\tmovq\t%rsp, %rbp\n",
449 | "\t.cfi_def_cfa_register 6\n",
450 | "\tsubq\t$16, %rsp\n",
451 | "\tmovl\t%edi, -4(%rbp)\n",
452 | "\tmovl\t%esi, -8(%rbp)\n",
453 | "\tcmpl\t$1, -4(%rbp)\n",
454 | "\tjne\t.L9\n",
455 | "\tcmpl\t$65535, -8(%rbp)\n",
456 | "\tjne\t.L9\n",
457 | "\tleaq\t_ZStL8__ioinit(%rip), %rdi\n",
458 | "\tcall\t_ZNSt8ios_base4InitC1Ev@PLT\n",
459 | "\tleaq\t__dso_handle(%rip), %rdx\n",
460 | "\tleaq\t_ZStL8__ioinit(%rip), %rsi\n",
461 | "\tmovq\t_ZNSt8ios_base4InitD1Ev@GOTPCREL(%rip), %rax\n",
462 | "\tmovq\t%rax, %rdi\n",
463 | "\tcall\t__cxa_atexit@PLT\n",
464 | ".L9:\n",
465 | "\tnop\n",
466 | "\tleave\n",
467 | "\t.cfi_def_cfa 7, 8\n",
468 | "\tret\n",
469 | "\t.cfi_endproc\n",
470 | ".LFE5641:\n",
471 | "\t.size\t_Z41__static_initialization_and_destruction_0ii, .-_Z41__static_initialization_and_destruction_0ii\n",
472 | "\t.type\t_GLOBAL__sub_I_main, @function\n",
473 | "_GLOBAL__sub_I_main:\n",
474 | ".LFB5642:\n",
475 | "\t.cfi_startproc\n",
476 | "\tpushq\t%rbp\n",
477 | "\t.cfi_def_cfa_offset 16\n",
478 | "\t.cfi_offset 6, -16\n",
479 | "\tmovq\t%rsp, %rbp\n",
480 | "\t.cfi_def_cfa_register 6\n",
481 | "\tmovl\t$65535, %esi\n",
482 | "\tmovl\t$1, %edi\n",
483 | "\tcall\t_Z41__static_initialization_and_destruction_0ii\n",
484 | "\tpopq\t%rbp\n",
485 | "\t.cfi_def_cfa 7, 8\n",
486 | "\tret\n",
487 | "\t.cfi_endproc\n",
488 | ".LFE5642:\n",
489 | "\t.size\t_GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main\n",
490 | "\t.section\t.init_array,\"aw\"\n",
491 | "\t.align 8\n",
492 | "\t.quad\t_GLOBAL__sub_I_main\n",
493 | "\t.section\t.rodata\n",
494 | "\t.align 4\n",
495 | ".LC0:\n",
496 | "\t.long\t1090519040\n",
497 | "\t.align 4\n",
498 | ".LC1:\n",
499 | "\t.long\t1088421888\n",
500 | "\t.align 4\n",
501 | ".LC2:\n",
502 | "\t.long\t1086324736\n",
503 | "\t.align 4\n",
504 | ".LC3:\n",
505 | "\t.long\t1084227584\n",
506 | "\t.align 4\n",
507 | ".LC4:\n",
508 | "\t.long\t1082130432\n",
509 | "\t.align 4\n",
510 | ".LC5:\n",
511 | "\t.long\t1077936128\n",
512 | "\t.align 4\n",
513 | ".LC6:\n",
514 | "\t.long\t1073741824\n",
515 | "\t.align 4\n",
516 | ".LC7:\n",
517 | "\t.long\t1065353216\n",
518 | "\t.align 4\n",
519 | ".LC8:\n",
520 | "\t.long\t1099956224\n",
521 | "\t.align 4\n",
522 | ".LC9:\n",
523 | "\t.long\t1099431936\n",
524 | "\t.align 4\n",
525 | ".LC10:\n",
526 | "\t.long\t1098907648\n",
527 | "\t.align 4\n",
528 | ".LC11:\n",
529 | "\t.long\t1097859072\n",
530 | "\t.align 4\n",
531 | ".LC12:\n",
532 | "\t.long\t1096810496\n",
533 | "\t.align 4\n",
534 | ".LC13:\n",
535 | "\t.long\t1095761920\n",
536 | "\t.align 4\n",
537 | ".LC14:\n",
538 | "\t.long\t1094713344\n",
539 | "\t.align 4\n",
540 | ".LC15:\n",
541 | "\t.long\t1093664768\n",
542 | "\t.hidden\t__dso_handle\n",
543 | "\t.ident\t\"GCC: (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0\"\n",
544 | "\t.section\t.note.GNU-stack,\"\",@progbits\n"
545 | ],
546 | "name": "stdout"
547 | }
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "metadata": {
553 | "id": "zrPthIOjoRbF",
554 | "colab_type": "code",
555 | "colab": {}
556 | },
557 | "source": [
558 | ""
559 | ],
560 | "execution_count": 0,
561 | "outputs": []
562 | }
563 | ]
564 | }
--------------------------------------------------------------------------------
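
The AVX notebook above adds eight floats with a single 256-bit instruction on the CPU. As a bridge to the CUDA notebooks that follow, here is a minimal sketch (not part of the repository) of the same eight-element addition expressed as one GPU thread per element:

```cuda
#include <cstdio>

// One thread per element: the SIMT analogue of the 8-wide AVX add.
__global__ void add8(const float *a, const float *b, float *c)
{
    int i = threadIdx.x;              // threads 0..7, one per element
    c[i] = a[i] + b[i];
}

int main()
{
    float ha[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    float hb[8] = {11, 12, 13, 14, 15, 16, 17, 18};
    float hc[8];

    float *da, *db, *dc;
    cudaMalloc(&da, sizeof(ha));
    cudaMalloc(&db, sizeof(hb));
    cudaMalloc(&dc, sizeof(hc));
    cudaMemcpy(da, ha, sizeof(ha), cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, sizeof(hb), cudaMemcpyHostToDevice);

    add8<<<1, 8>>>(da, db, dc);       // one block of eight threads
    cudaMemcpy(hc, dc, sizeof(hc), cudaMemcpyDeviceToHost);

    for (int i = 0; i < 8; i++) printf("%g ", hc[i]);   // 12 14 16 18 20 22 24 26
    printf("\n");

    cudaFree(da); cudaFree(db); cudaFree(dc);
    return 0;
}
```

It compiles the same way as the later examples, e.g. `!nvcc -o add8 add8.cu`.
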
/01_cuda_lab/05_vectorAdd.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "05_vectorAdd.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "accelerator": "GPU"
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "id": "view-in-github",
22 | "colab_type": "text"
23 | },
24 | "source": [
25 | "
"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "id": "TCTlP-YloFJP",
32 | "colab_type": "text"
33 | },
34 | "source": [
35 | "## Vector Addition"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "metadata": {
41 | "id": "27t6Hx4MLUpA",
42 | "colab_type": "code",
43 | "outputId": "9b8c1799-0020-42e0-8afb-8d587266d1c2",
44 | "colab": {
45 | "base_uri": "https://localhost:8080/",
46 | "height": 35
47 | }
48 | },
49 | "source": [
50 | "%%writefile vecAdd.cu\n",
51 | "\n",
52 | "#include \n",
53 | "#include \n",
54 | "using namespace std;\n",
55 | "\n",
56 | "int *a, *b; // host data\n",
57 | "int *c, *c2; // results\n",
58 | "\n",
59 | "__global__ void vecAdd(int *A,int *B,int *C,int N)\n",
60 | "{\n",
61 | " int i = blockIdx.x * blockDim.x + threadIdx.x;\n",
62 | " C[i] = A[i] + B[i]; \n",
63 | "}\n",
64 | "\n",
65 | "void vecAdd_h(int *A1,int *B1, int *C1, int N)\n",
66 | "{\n",
67 | " for(int i=0;i>>(a_d,b_d,c_d,n);\n",
104 | " cudaDeviceSynchronize();\n",
105 | " clock_t end_d = clock();\n",
106 | " clock_t start_h = clock();\n",
107 | " \n",
108 | " printf(\"Doing CPU Vector add\\n\");\n",
109 | " vecAdd_h(a,b,c2,n);\n",
110 | " clock_t end_h = clock();\n",
111 | " \n",
112 | " double time_d = (double)(end_d-start_d)/CLOCKS_PER_SEC;\n",
113 | " double time_h = (double)(end_h-start_h)/CLOCKS_PER_SEC;\n",
114 | " \n",
115 | " cudaMemcpy(c,c_d,n*sizeof(int),cudaMemcpyDeviceToHost);\n",
116 | " \n",
117 | " printf(\"on GPU: %f, on CPU: %f\\n\",time_d,time_h);\n",
118 | " printf(\"Speedup: %f \\n\", time_h/time_d);\n",
119 | " cudaFree(a_d);\n",
120 | " cudaFree(b_d);\n",
121 | " cudaFree(c_d);\n",
122 | " return 0;\n",
123 | "}"
124 | ],
125 | "execution_count": 0,
126 | "outputs": [
127 | {
128 | "output_type": "stream",
129 | "text": [
130 | "Overwriting vecAdd.cu\n"
131 | ],
132 | "name": "stdout"
133 | }
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "metadata": {
139 | "id": "7fN5slLYLdqI",
140 | "colab_type": "code",
141 | "outputId": "2f1d7fe2-4b07-4aa7-f71e-3b5c49152451",
142 | "colab": {
143 | "base_uri": "https://localhost:8080/",
144 | "height": 35
145 | }
146 | },
147 | "source": [
148 | "!nvcc -o vecAdd vecAdd.cu\n",
149 | "!ls"
150 | ],
151 | "execution_count": 0,
152 | "outputs": [
153 | {
154 | "output_type": "stream",
155 | "text": [
156 | "sample_data vecAdd vecAdd.cu\n"
157 | ],
158 | "name": "stdout"
159 | }
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "metadata": {
165 | "id": "aur_m904LgN4",
166 | "colab_type": "code",
167 | "outputId": "a227bd6c-30b7-4b30-83f7-a42ab25bcf1e",
168 | "colab": {
169 | "base_uri": "https://localhost:8080/",
170 | "height": 143
171 | }
172 | },
173 | "source": [
174 | "!./vecAdd"
175 | ],
176 | "execution_count": 0,
177 | "outputs": [
178 | {
179 | "output_type": "stream",
180 | "text": [
181 | "Begin \n",
182 | "Allocating device memory on host..\n",
183 | "Copying to device..\n",
184 | "Doing GPU Vector add\n",
185 | "Doing CPU Vector add\n",
186 | "on GPU: 0.000010, on CPU: 0.091319\n",
187 | "Speedup: 9131.900000 \n"
188 | ],
189 | "name": "stdout"
190 | }
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "metadata": {
196 | "id": "606PvSijob9P",
197 | "colab_type": "code",
198 | "colab": {
199 | "base_uri": "https://localhost:8080/",
200 | "height": 335
201 | },
202 | "outputId": "943c733d-3fdb-4c64-f65f-896eb7084dbe"
203 | },
204 | "source": [
205 | "!nvidia-smi"
206 | ],
207 | "execution_count": 1,
208 | "outputs": [
209 | {
210 | "output_type": "stream",
211 | "text": [
212 | "Mon May 27 14:12:00 2019 \n",
213 | "+-----------------------------------------------------------------------------+\n",
214 | "| NVIDIA-SMI 418.67 Driver Version: 410.79 CUDA Version: 10.0 |\n",
215 | "|-------------------------------+----------------------+----------------------+\n",
216 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
217 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
218 | "|===============================+======================+======================|\n",
219 | "| 0 Tesla K80 Off | 00000000:00:04.0 Off | 0 |\n",
220 | "| N/A 33C P8 30W / 149W | 0MiB / 11441MiB | 0% Default |\n",
221 | "+-------------------------------+----------------------+----------------------+\n",
222 | " \n",
223 | "+-----------------------------------------------------------------------------+\n",
224 | "| Processes: GPU Memory |\n",
225 | "| GPU PID Type Process name Usage |\n",
226 | "|=============================================================================|\n",
227 | "| No running processes found |\n",
228 | "+-----------------------------------------------------------------------------+\n"
229 | ],
230 | "name": "stdout"
231 | }
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {
237 | "id": "E47A8kq7ovll",
238 | "colab_type": "text"
239 | },
240 | "source": [
241 | ""
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "metadata": {
247 | "id": "wYMwOdXBoNKf",
248 | "colab_type": "text"
249 | },
250 | "source": [
251 | "\n",
252 | "---\n",
253 | "\n",
254 | "---\n",
255 | "\n",
256 | "###Note: nvprof cannot be work correctly on Colab.\n"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "metadata": {
262 | "id": "fxWZrnhiMh04",
263 | "colab_type": "code",
264 | "outputId": "ebd09273-9c1d-4391-9059-521239839394",
265 | "colab": {
266 | "base_uri": "https://localhost:8080/",
267 | "height": 179
268 | }
269 | },
270 | "source": [
271 | "!nvprof --print-gpu-trace ./vecAdd"
272 | ],
273 | "execution_count": 0,
274 | "outputs": [
275 | {
276 | "output_type": "stream",
277 | "text": [
278 | "======== Warning: CUDA device error, GPU profiling skipped\n",
279 | "Begin \n",
280 | "Allocating device memory on host..\n",
281 | "Copying to device..\n",
282 | "Doing GPU Vector add\n",
283 | "Doing CPU Vector add\n",
284 | "on GPU: 0.000008, on CPU: 0.090094\n",
285 | "Speedup: 11261.750000 \n",
286 | "======== Warning: No profile data collected.\n"
287 | ],
288 | "name": "stdout"
289 | }
290 | ]
291 | }
292 | ]
293 | }
--------------------------------------------------------------------------------
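
The vecAdd notebook times the kernel with host-side clock(). A common alternative (a sketch, assuming 256-thread blocks and that n is a multiple of 256, since the original launch line is truncated above) is to time it with CUDA events, which measure elapsed time on the GPU itself:

```cuda
// Timing the kernel with CUDA events; a_d, b_d, c_d and n are as in the notebook.
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
vecAdd<<<n / 256, 256>>>(a_d, b_d, c_d, n);   // assumed launch configuration
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);                   // wait for the kernel to finish

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);       // milliseconds between the two events
printf("GPU kernel time: %f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);
```
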
/01_cuda_lab/06_2DIndex.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "06_2DIndex.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "accelerator": "GPU"
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "id": "view-in-github",
22 | "colab_type": "text"
23 | },
24 | "source": [
25 | "
"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "id": "yQ6ize_D8LwU",
32 | "colab_type": "text"
33 | },
34 | "source": [
35 | "# 2 Dimensional Indexing"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "metadata": {
41 | "id": "b2WRUswXqQmB",
42 | "colab_type": "code",
43 | "colab": {
44 | "base_uri": "https://localhost:8080/",
45 | "height": 36
46 | },
47 | "outputId": "bf42dbb9-9d77-455e-a691-f3772d65b226"
48 | },
49 | "source": [
50 | "%%writefile cuda2Dindex.cu\n",
51 | "\n",
52 | "#include \n",
53 | "__global__ void kernel( int *a )\n",
54 | "{\n",
55 | " int ix = blockIdx.x*blockDim.x + threadIdx.x;\n",
56 | " int iy = blockIdx.y*blockDim.y + threadIdx.y;\n",
57 | " int idx = iy * blockDim.x * gridDim.x + ix;\n",
58 | "\n",
59 | " a[idx] = a[idx] + 1;\n",
60 | "}\n",
61 | "\n",
62 | "int main()\n",
63 | "{\n",
64 | " int *host_array;\n",
65 | " int *dev_array;\n",
66 | "\n",
67 | " host_array = (int *) malloc(sizeof(int)*16);\n",
68 | " \n",
69 | " cudaMalloc(&dev_array, sizeof(int)*16);\n",
70 | " cudaMemset(dev_array, 0, sizeof(int)*16);\n",
71 | "\n",
72 | " dim3 block(2,2);\n",
73 | " dim3 threadPerBlock(2,2);\n",
74 | " kernel<<>>(dev_array); \n",
75 | " cudaMemcpy(host_array, dev_array, sizeof(int)*16, cudaMemcpyDeviceToHost);\n",
76 | "\n",
77 | " for(int i = 0; i < 16; i++) printf(\" %d \", host_array[i]);\n",
78 | " printf(\"\\n\");\n",
79 | "\n",
80 | " free(host_array);\n",
81 | " cudaFree(dev_array);\n",
82 | "\n",
83 | " return 0;\n",
84 | "}"
85 | ],
86 | "execution_count": 1,
87 | "outputs": [
88 | {
89 | "output_type": "stream",
90 | "text": [
91 | "Writing cuda2Dindex.cu\n"
92 | ],
93 | "name": "stdout"
94 | }
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "metadata": {
100 | "id": "dKTtkEQlqaBV",
101 | "colab_type": "code",
102 | "colab": {}
103 | },
104 | "source": [
105 | "!nvcc -o cuda2Dindex cuda2Dindex.cu"
106 | ],
107 | "execution_count": 0,
108 | "outputs": []
109 | },
110 | {
111 | "cell_type": "code",
112 | "metadata": {
113 | "id": "F3g-cBNdqfas",
114 | "colab_type": "code",
115 | "colab": {
116 | "base_uri": "https://localhost:8080/",
117 | "height": 36
118 | },
119 | "outputId": "a90bcb9f-b5ec-4a9f-8990-f2afdf7f789f"
120 | },
121 | "source": [
122 | "!./cuda2Dindex"
123 | ],
124 | "execution_count": 3,
125 | "outputs": [
126 | {
127 | "output_type": "stream",
128 | "text": [
129 | " 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \n"
130 | ],
131 | "name": "stdout"
132 | }
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "metadata": {
138 | "id": "QCaKN9CMqkXG",
139 | "colab_type": "code",
140 | "colab": {}
141 | },
142 | "source": [
143 | ""
144 | ],
145 | "execution_count": 0,
146 | "outputs": []
147 | }
148 | ]
149 | }
--------------------------------------------------------------------------------
/01_cuda_lab/07_memoryType.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "07_memoryType.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "accelerator": "GPU"
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "id": "view-in-github",
22 | "colab_type": "text"
23 | },
24 | "source": [
25 | "
"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "id": "jB9CAgYq9M9A",
32 | "colab_type": "text"
33 | },
34 | "source": [
35 | "## Memory Architecutre\n",
36 | "### GPU의 메모리 구조를 고려한 최적 Coding\n",
37 | "\n",
38 | "- Local Memory\n",
39 | "- Global Memory\n",
40 | "- Shared Memory\n",
41 | "\n",
42 | "### 참조\n",
43 | "\n",
44 | "- https://github.com/jeonggunlee/cs344"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "metadata": {
50 | "id": "5zIW-1c583Rl",
51 | "colab_type": "code",
52 | "outputId": "b8fc62cb-d607-490b-bead-1e0001e1fc46",
53 | "colab": {
54 | "base_uri": "https://localhost:8080/",
55 | "height": 35
56 | }
57 | },
58 | "source": [
59 | "%%writefile memoryType.cu\n",
60 | "\n",
61 | "// Convenience function for checking CUDA runtime API results\n",
62 | "// can be wrapped around any runtime API call. No-op in release builds.\n",
63 | "inline\n",
64 | "cudaError_t checkCuda(cudaError_t result)\n",
65 | "{\n",
66 | "#if defined(DEBUG) || defined(_DEBUG)\n",
67 | " if (result != cudaSuccess) {\n",
68 | " fprintf(stderr, \"CUDA Runtime Error: %s\\n\", cudaGetErrorString(result));\n",
69 | " assert(result == cudaSuccess);\n",
70 | " }\n",
71 | "#endif\n",
72 | " return result;\n",
73 | "}\n",
74 | "\n",
75 | "// CUDA에서 제공하는 서로 다른 타입의 메모리 공간 활용하기\n",
76 | "\n",
77 | "#include \n",
78 | "\n",
79 | "/**********************\n",
80 | " * using local memory *\n",
81 | " **********************/\n",
82 | "\n",
83 | "// a __device__ or __global__ function runs on the GPU\n",
84 | "__global__ void use_local_memory_GPU(float in)\n",
85 | "{\n",
86 | " float f; // variable \"f\" is in local memory and private to each thread\n",
87 | " f = in; // parameter \"in\" is in local memory and private to each thread\n",
88 | " // ... real code would presumably do other stuff here ... \n",
89 | "\n",
90 | " // ADDED\n",
91 | " int i, index = threadIdx.x;\n",
92 | " float average, sum = 0.0f;\n",
93 | "\n",
94 | " for (i=0; i average) { array[index] = average; } \n",
120 | "}\n",
121 | "\n",
122 | "/**********************\n",
123 | " * using shared memory *\n",
124 | " **********************/\n",
125 | "\n",
126 | "// (for clarity, hardcoding 128 threads/elements and omitting out-of-bounds checks)\n",
127 | "__global__ void use_shared_memory_GPU(float *array)\n",
128 | "{\n",
129 | " // local variables, private to each thread\n",
130 | " int i, index = threadIdx.x;\n",
131 | " float average, sum = 0.0f;\n",
132 | "\n",
133 | " // __shared__ variables are visible to all threads in the thread block\n",
134 | " // and have the same lifetime as the thread block\n",
135 | " __shared__ float sh_arr[128];\n",
136 | "\n",
137 | " // copy data from \"array\" in global memory to sh_arr in shared memory.\n",
138 | " // here, each thread is responsible for copying a single element.\n",
139 | " sh_arr[index] = array[index];\n",
140 | "\n",
141 | " __syncthreads(); // ensure all the writes to shared memory have completed\n",
142 | "\n",
143 | " // now, sh_arr is fully populated. Let's find the average of all previous elements\n",
144 | " for (i=0; i average) { array[index] = average; } \n",
148 | "\n",
149 | "}\n",
150 | "\n",
151 | "int main(int argc, char **argv)\n",
152 | "{\n",
153 | " int blockSize = 256;\n",
154 | " int nBlock= 1024;\n",
155 | " float ms;\n",
156 | " cudaEvent_t startEvent, stopEvent;\n",
157 | " \n",
158 | " checkCuda( cudaEventCreate(&startEvent) );\n",
159 | " checkCuda( cudaEventCreate(&stopEvent) );\n",
160 | " \n",
161 | " /*\n",
162 | " * First, call a kernel that shows using local memory \n",
163 | " */\n",
164 | " checkCuda( cudaEventRecord(startEvent,0) );\n",
165 | " use_local_memory_GPU<<>>(2.0f);\n",
166 | " checkCuda( cudaEventRecord(stopEvent,0) );\n",
167 | " checkCuda( cudaEventSynchronize(stopEvent) );\n",
168 | " checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );\n",
169 | " printf(\"Local : %f\\n\", ms);\n",
170 | " \n",
171 | " /*\n",
172 | " * Next, call a kernel that shows using global memory\n",
173 | " */\n",
174 | " float h_arr[blockSize]; // convention: h_ variables live on host\n",
175 | " float *d_arr; // convention: d_ variables live on device (GPU global mem)\n",
176 | "\n",
177 | " // allocate global memory on the device, place result in \"d_arr\"\n",
178 | " cudaMalloc((void **) &d_arr, sizeof(float) * blockSize);\n",
179 | " // now copy data from host memory \"h_arr\" to device memory \"d_arr\"\n",
180 | " cudaMemcpy((void *)d_arr, (void *)h_arr, sizeof(float) * blockSize, cudaMemcpyHostToDevice);\n",
181 | "\n",
182 | " // launch the kernel (1 block of 128 threads)\n",
183 | " checkCuda( cudaEventRecord(startEvent,0) );\n",
184 | " use_global_memory_GPU<<>>(d_arr); // modifies the contents of array at d_arr\n",
185 | " checkCuda( cudaEventRecord(stopEvent,0) );\n",
186 | " checkCuda( cudaEventSynchronize(stopEvent) );\n",
187 | " checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );\n",
188 | " printf(\"Global: %f\\n\", ms);\n",
189 | "\n",
190 | " \n",
191 | " // copy the modified array back to the host, overwriting contents of h_arr\n",
192 | " cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * blockSize, cudaMemcpyDeviceToHost);\n",
193 | " // ... do other stuff ...\n",
194 | "\n",
195 | " /*\n",
196 | " * Next, call a kernel that shows using shared memory\n",
197 | " */\n",
198 | "\n",
199 | " // as before, pass in a pointer to data in global memory\n",
200 | " checkCuda( cudaEventRecord(startEvent,0) );\n",
201 | " use_shared_memory_GPU<<>>(d_arr); \n",
202 | " checkCuda( cudaEventRecord(stopEvent,0) );\n",
203 | " checkCuda( cudaEventSynchronize(stopEvent) );\n",
204 | " checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );\n",
205 | " printf(\"Shared: %f\\n\", ms);\n",
206 | "\n",
207 | " \n",
208 | " // copy the modified array back to the host\n",
209 | " cudaMemcpy((void *)h_arr, (void *)d_arr, sizeof(float) * blockSize, cudaMemcpyHostToDevice);\n",
210 | " // ... do other stuff ...\n",
211 | " return 0;\n",
212 | "}"
213 | ],
214 | "execution_count": 2,
215 | "outputs": [
216 | {
217 | "output_type": "stream",
218 | "text": [
219 | "Overwriting memoryType.cu\n"
220 | ],
221 | "name": "stdout"
222 | }
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "metadata": {
228 | "id": "9cuuy1wx9C6q",
229 | "colab_type": "code",
230 | "outputId": "503a2f08-992b-4de1-f71a-d0aa2b3bd74e",
231 | "colab": {
232 | "base_uri": "https://localhost:8080/",
233 | "height": 89
234 | }
235 | },
236 | "source": [
237 | "!nvcc -o memoryType memoryType.cu"
238 | ],
239 | "execution_count": 3,
240 | "outputs": [
241 | {
242 | "output_type": "stream",
243 | "text": [
244 | "memoryType.cu(27): warning: variable \"f\" was set but never used\n",
245 | "\n",
246 | "memoryType.cu(33): warning: variable \"average\" was set but never used\n",
247 | "\n"
248 | ],
249 | "name": "stdout"
250 | }
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "metadata": {
256 | "id": "jY_CmtQ-9FIB",
257 | "colab_type": "code",
258 | "outputId": "9f4435ba-822e-47fc-e7bb-e4266b14d1f9",
259 | "colab": {
260 | "base_uri": "https://localhost:8080/",
261 | "height": 71
262 | }
263 | },
264 | "source": [
265 | "!./memoryType"
266 | ],
267 | "execution_count": 8,
268 | "outputs": [
269 | {
270 | "output_type": "stream",
271 | "text": [
272 | "Local : 0.020672\n",
273 | "Global: 0.122144\n",
274 | "Shared: 0.122144\n"
275 | ],
276 | "name": "stdout"
277 | }
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "metadata": {
283 | "id": "yTQ2WVPM9HVa",
284 | "colab_type": "code",
285 | "outputId": "9a30137c-d3a9-45a5-a437-d614d031f086",
286 | "colab": {
287 | "base_uri": "https://localhost:8080/",
288 | "height": 449
289 | }
290 | },
291 | "source": [
292 | "!nvprof ./memoryType"
293 | ],
294 | "execution_count": 0,
295 | "outputs": [
296 | {
297 | "output_type": "stream",
298 | "text": [
299 | "==683== NVPROF is profiling process 683, command: ./memoryType\n",
300 | "Local : 0.039360\n",
301 | "Global: 0.026976\n",
302 | "Shared: 0.026976\n",
303 | "==683== Profiling application: ./memoryType\n",
304 | "==683== Warning: 5 records have invalid timestamps due to insufficient device buffer space. You can configure the buffer space using the option --device-buffer-size.\n",
305 | "==683== Warning: 2 records have invalid timestamps due to insufficient semaphore pool size. You can configure the pool size using the option --profiling-semaphore-pool-size.\n",
306 | "==683== Profiling result:\n",
307 | "No kernels were profiled.\n",
308 | " Type Time(%) Time Calls Avg Min Max Name\n",
309 | " API calls: 85.54% 1.28212s 3 427.37ms 9.2140us 1.28210s cudaEventSynchronize\n",
310 | " 14.41% 215.93ms 2 107.96ms 1.1800us 215.93ms cudaEventCreate\n",
311 | " 0.02% 330.56us 1 330.56us 330.56us 330.56us cudaMalloc\n",
312 | " 0.01% 175.42us 1 175.42us 175.42us 175.42us cuDeviceTotalMem\n",
313 | " 0.01% 153.00us 96 1.5930us 131ns 62.566us cuDeviceGetAttribute\n",
314 | " 0.00% 60.460us 3 20.153us 11.842us 28.395us cudaLaunchKernel\n",
315 | " 0.00% 56.360us 3 18.786us 3.2680us 30.320us cudaMemcpy\n",
316 | " 0.00% 25.164us 1 25.164us 25.164us 25.164us cuDeviceGetName\n",
317 | " 0.00% 19.051us 6 3.1750us 2.2870us 5.1510us cudaEventRecord\n",
318 | " 0.00% 10.373us 3 3.4570us 2.6280us 5.0840us cudaEventElapsedTime\n",
319 | " 0.00% 2.8170us 1 2.8170us 2.8170us 2.8170us cuDeviceGetPCIBusId\n",
320 | " 0.00% 2.0260us 3 675ns 148ns 1.3160us cuDeviceGetCount\n",
321 | " 0.00% 1.4640us 2 732ns 282ns 1.1820us cuDeviceGet\n",
322 | " 0.00% 243ns 1 243ns 243ns 243ns cuDeviceGetUuid\n"
323 | ],
324 | "name": "stdout"
325 | }
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "metadata": {
331 | "id": "jDm9OcUwR7sk",
332 | "colab_type": "code",
333 | "colab": {}
334 | },
335 | "source": [
336 | ""
337 | ],
338 | "execution_count": 0,
339 | "outputs": []
340 | }
341 | ]
342 | }
--------------------------------------------------------------------------------
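
The 07_memoryType notebook contrasts local (per-thread), shared (per-block), and global (device-wide) memory. A compact sketch (not taken from the notebook) that touches all three in one kernel, intended for a single block of 128 threads:

```cuda
// Global -> shared -> local register -> global, in one kernel.
__global__ void subtract_block_max(float *data)   // hypothetical example kernel
{
    __shared__ float tile[128];                   // shared: visible to the whole block
    int idx = threadIdx.x;                        // local: private to this thread

    tile[idx] = data[idx];                        // stage global data in shared memory
    __syncthreads();                              // all writes visible before any read

    float maxv = tile[0];                         // local register
    for (int i = 1; i < blockDim.x; i++)
        if (tile[i] > maxv) maxv = tile[i];

    data[idx] = tile[idx] - maxv;                 // write result back to global memory
}
```

Launched as subtract_block_max<<<1, 128>>>(d_arr), each thread redundantly scans the shared tile; the reduction material in 02_cuda_lab shows how to avoid that redundant work.
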
/01_cuda_lab/08_DeviceQuery_Bandwidth.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "08_DeviceQuery_Bandwidth.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "accelerator": "GPU"
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "id": "view-in-github",
22 | "colab_type": "text"
23 | },
24 | "source": [
25 | "
"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "id": "sn6MituUpzLH",
32 | "colab_type": "text"
33 | },
34 | "source": [
35 | "## Device Query"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "metadata": {
41 | "id": "zz6ywIx3o-Um",
42 | "colab_type": "code",
43 | "colab": {
44 | "base_uri": "https://localhost:8080/",
45 | "height": 35
46 | },
47 | "outputId": "0f354f6f-d357-4041-aa92-b07ec3e5bdc7"
48 | },
49 | "source": [
50 | "%cd /usr/local"
51 | ],
52 | "execution_count": 1,
53 | "outputs": [
54 | {
55 | "output_type": "stream",
56 | "text": [
57 | "/usr/local\n"
58 | ],
59 | "name": "stdout"
60 | }
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "metadata": {
66 | "id": "P5z-PXTUpJgh",
67 | "colab_type": "code",
68 | "colab": {
69 | "base_uri": "https://localhost:8080/",
70 | "height": 35
71 | },
72 | "outputId": "4cf51c30-2091-476f-fce0-97e3034e6999"
73 | },
74 | "source": [
75 | "!ls"
76 | ],
77 | "execution_count": 2,
78 | "outputs": [
79 | {
80 | "output_type": "stream",
81 | "text": [
82 | "bin cuda cuda-10.0 etc games include lib\tman sbin share src xgboost\n"
83 | ],
84 | "name": "stdout"
85 | }
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "metadata": {
91 | "id": "fSn8icEapJ8Y",
92 | "colab_type": "code",
93 | "colab": {
94 | "base_uri": "https://localhost:8080/",
95 | "height": 35
96 | },
97 | "outputId": "6b29dccc-6f64-4313-b6eb-b72f04c7cb5e"
98 | },
99 | "source": [
100 | "%cd cuda-10.0"
101 | ],
102 | "execution_count": 3,
103 | "outputs": [
104 | {
105 | "output_type": "stream",
106 | "text": [
107 | "/usr/local/cuda-10.0\n"
108 | ],
109 | "name": "stdout"
110 | }
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "metadata": {
116 | "id": "CrnBRNf3pNQX",
117 | "colab_type": "code",
118 | "colab": {
119 | "base_uri": "https://localhost:8080/",
120 | "height": 71
121 | },
122 | "outputId": "972fc893-2dda-48c4-ac6f-f9c463fb95df"
123 | },
124 | "source": [
125 | "!ls"
126 | ],
127 | "execution_count": 4,
128 | "outputs": [
129 | {
130 | "output_type": "stream",
131 | "text": [
132 | "bin\textras\t libnsight NsightCompute-1.0 nvvm\tshare\t tools\n",
133 | "compat\tinclude libnvvp nsightee_plugins README\tsrc\t version.txt\n",
134 | "doc\tlib64\t LICENSE nvml\t samples\ttargets\n"
135 | ],
136 | "name": "stdout"
137 | }
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "metadata": {
143 | "id": "T2zbnJbKpN6C",
144 | "colab_type": "code",
145 | "colab": {
146 | "base_uri": "https://localhost:8080/",
147 | "height": 35
148 | },
149 | "outputId": "4977391c-1fad-449c-ab9c-33f8299ee274"
150 | },
151 | "source": [
152 | "%cd samples"
153 | ],
154 | "execution_count": 5,
155 | "outputs": [
156 | {
157 | "output_type": "stream",
158 | "text": [
159 | "/usr/local/cuda-10.0/samples\n"
160 | ],
161 | "name": "stdout"
162 | }
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "metadata": {
168 | "id": "vT2Jae9spP43",
169 | "colab_type": "code",
170 | "colab": {
171 | "base_uri": "https://localhost:8080/",
172 | "height": 53
173 | },
174 | "outputId": "3e15480b-4ec3-4c4b-a4a6-79032a00ea45"
175 | },
176 | "source": [
177 | "!ls"
178 | ],
179 | "execution_count": 6,
180 | "outputs": [
181 | {
182 | "output_type": "stream",
183 | "text": [
184 | "0_Simple 2_Graphics 4_Finance\t6_Advanced\t common Makefile\n",
185 | "1_Utilities 3_Imaging\t 5_Simulations\t7_CUDALibraries EULA.txt\n"
186 | ],
187 | "name": "stdout"
188 | }
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "metadata": {
194 | "id": "OeFVstEipQlH",
195 | "colab_type": "code",
196 | "colab": {
197 | "base_uri": "https://localhost:8080/",
198 | "height": 35
199 | },
200 | "outputId": "7a337c9d-7627-40a7-e3de-e78eea498212"
201 | },
202 | "source": [
203 | "%cd 1_Utilities"
204 | ],
205 | "execution_count": 8,
206 | "outputs": [
207 | {
208 | "output_type": "stream",
209 | "text": [
210 | "/usr/local/cuda-10.0/samples/1_Utilities\n"
211 | ],
212 | "name": "stdout"
213 | }
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "metadata": {
219 | "id": "aRqzfQKvpSCf",
220 | "colab_type": "code",
221 | "colab": {
222 | "base_uri": "https://localhost:8080/",
223 | "height": 53
224 | },
225 | "outputId": "fbe5d7cb-529c-42fd-bc75-84f6d537adbd"
226 | },
227 | "source": [
228 | "!ls"
229 | ],
230 | "execution_count": 9,
231 | "outputs": [
232 | {
233 | "output_type": "stream",
234 | "text": [
235 | "bandwidthTest deviceQueryDrv\t\ttopologyQuery\n",
236 | "deviceQuery p2pBandwidthLatencyTest\tUnifiedMemoryPerf\n"
237 | ],
238 | "name": "stdout"
239 | }
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "metadata": {
245 | "id": "2oAW-NqSpWbH",
246 | "colab_type": "code",
247 | "colab": {
248 | "base_uri": "https://localhost:8080/",
249 | "height": 35
250 | },
251 | "outputId": "2e5617c7-fc9b-4f69-84b1-2329a7ae0db8"
252 | },
253 | "source": [
254 | "%cd deviceQuery"
255 | ],
256 | "execution_count": 10,
257 | "outputs": [
258 | {
259 | "output_type": "stream",
260 | "text": [
261 | "/usr/local/cuda-10.0/samples/1_Utilities/deviceQuery\n"
262 | ],
263 | "name": "stdout"
264 | }
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "metadata": {
270 | "id": "A1ERGh-bpYjf",
271 | "colab_type": "code",
272 | "colab": {
273 | "base_uri": "https://localhost:8080/",
274 | "height": 35
275 | },
276 | "outputId": "8bfe7e95-5c9a-4935-c330-ccc266de74ee"
277 | },
278 | "source": [
279 | "!ls"
280 | ],
281 | "execution_count": 11,
282 | "outputs": [
283 | {
284 | "output_type": "stream",
285 | "text": [
286 | "deviceQuery.cpp Makefile NsightEclipse.xml readme.txt\n"
287 | ],
288 | "name": "stdout"
289 | }
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "metadata": {
295 | "id": "MgC5FxRfpZWI",
296 | "colab_type": "code",
297 | "colab": {
298 | "base_uri": "https://localhost:8080/",
299 | "height": 109
300 | },
301 | "outputId": "67a2f06c-8c69-4f1e-ad3d-f310c53815d5"
302 | },
303 | "source": [
304 | "!make"
305 | ],
306 | "execution_count": 12,
307 | "outputs": [
308 | {
309 | "output_type": "stream",
310 | "text": [
311 | "/usr/local/cuda-10.0/bin/nvcc -ccbin g++ -I../../common/inc -m64 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -o deviceQuery.o -c deviceQuery.cpp\n",
312 | "/usr/local/cuda-10.0/bin/nvcc -ccbin g++ -m64 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -o deviceQuery deviceQuery.o \n",
313 | "mkdir -p ../../bin/x86_64/linux/release\n",
314 | "cp deviceQuery ../../bin/x86_64/linux/release\n"
315 | ],
316 | "name": "stdout"
317 | }
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "metadata": {
323 | "id": "eYnncOvXpabH",
324 | "colab_type": "code",
325 | "colab": {
326 | "base_uri": "https://localhost:8080/",
327 | "height": 809
328 | },
329 | "outputId": "400a8c24-a9de-4d46-9ed4-45e3c7d35991"
330 | },
331 | "source": [
332 | "!./deviceQuery"
333 | ],
334 | "execution_count": 13,
335 | "outputs": [
336 | {
337 | "output_type": "stream",
338 | "text": [
339 | "./deviceQuery Starting...\n",
340 | "\n",
341 | " CUDA Device Query (Runtime API) version (CUDART static linking)\n",
342 | "\n",
343 | "Detected 1 CUDA Capable device(s)\n",
344 | "\n",
345 | "Device 0: \"Tesla T4\"\n",
346 | " CUDA Driver Version / Runtime Version 10.0 / 10.0\n",
347 | " CUDA Capability Major/Minor version number: 7.5\n",
348 | " Total amount of global memory: 15080 MBytes (15812263936 bytes)\n",
349 | " (40) Multiprocessors, ( 64) CUDA Cores/MP: 2560 CUDA Cores\n",
350 | " GPU Max Clock rate: 1590 MHz (1.59 GHz)\n",
351 | " Memory Clock rate: 5001 Mhz\n",
352 | " Memory Bus Width: 256-bit\n",
353 | " L2 Cache Size: 4194304 bytes\n",
354 | " Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)\n",
355 | " Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers\n",
356 | " Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers\n",
357 | " Total amount of constant memory: 65536 bytes\n",
358 | " Total amount of shared memory per block: 49152 bytes\n",
359 | " Total number of registers available per block: 65536\n",
360 | " Warp size: 32\n",
361 | " Maximum number of threads per multiprocessor: 1024\n",
362 | " Maximum number of threads per block: 1024\n",
363 | " Max dimension size of a thread block (x,y,z): (1024, 1024, 64)\n",
364 | " Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)\n",
365 | " Maximum memory pitch: 2147483647 bytes\n",
366 | " Texture alignment: 512 bytes\n",
367 | " Concurrent copy and kernel execution: Yes with 3 copy engine(s)\n",
368 | " Run time limit on kernels: No\n",
369 | " Integrated GPU sharing Host Memory: No\n",
370 | " Support host page-locked memory mapping: Yes\n",
371 | " Alignment requirement for Surfaces: Yes\n",
372 | " Device has ECC support: Enabled\n",
373 | " Device supports Unified Addressing (UVA): Yes\n",
374 | " Device supports Compute Preemption: Yes\n",
375 | " Supports Cooperative Kernel Launch: Yes\n",
376 | " Supports MultiDevice Co-op Kernel Launch: Yes\n",
377 | " Device PCI Domain ID / Bus ID / location ID: 0 / 0 / 4\n",
378 | " Compute Mode:\n",
379 | " < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >\n",
380 | "\n",
381 | "deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 10.0, CUDA Runtime Version = 10.0, NumDevs = 1\n",
382 | "Result = PASS\n"
383 | ],
384 | "name": "stdout"
385 | }
386 | ]
387 | },
388 | {
389 | "cell_type": "markdown",
390 | "metadata": {
391 | "id": "Ti9g0cZMp4jY",
392 | "colab_type": "text"
393 | },
394 | "source": [
395 | "## Bandwidth Test"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "metadata": {
401 | "id": "-ZDLNjdZpcZ_",
402 | "colab_type": "code",
403 | "colab": {
404 | "base_uri": "https://localhost:8080/",
405 | "height": 35
406 | },
407 | "outputId": "b5110dca-2c1a-453a-e6f0-b874d3e88d45"
408 | },
409 | "source": [
410 | "%cd /usr/local/cuda-10.0"
411 | ],
412 | "execution_count": 15,
413 | "outputs": [
414 | {
415 | "output_type": "stream",
416 | "text": [
417 | "/usr/local/cuda-10.0\n"
418 | ],
419 | "name": "stdout"
420 | }
421 | ]
422 | },
423 | {
424 | "cell_type": "code",
425 | "metadata": {
426 | "id": "IlxH8K7WpfY_",
427 | "colab_type": "code",
428 | "colab": {
429 | "base_uri": "https://localhost:8080/",
430 | "height": 35
431 | },
432 | "outputId": "bc724dfd-dbca-4120-b572-891a1025f9e0"
433 | },
434 | "source": [
435 | "%cd samples"
436 | ],
437 | "execution_count": 16,
438 | "outputs": [
439 | {
440 | "output_type": "stream",
441 | "text": [
442 | "/usr/local/cuda-10.0/samples\n"
443 | ],
444 | "name": "stdout"
445 | }
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "metadata": {
451 | "id": "RFYzt1MfplOo",
452 | "colab_type": "code",
453 | "colab": {
454 | "base_uri": "https://localhost:8080/",
455 | "height": 53
456 | },
457 | "outputId": "b3233261-4bb4-483a-b047-1f837c640ac9"
458 | },
459 | "source": [
460 | "!ls"
461 | ],
462 | "execution_count": 17,
463 | "outputs": [
464 | {
465 | "output_type": "stream",
466 | "text": [
467 | "0_Simple 2_Graphics 4_Finance\t6_Advanced\t bin\t EULA.txt\n",
468 | "1_Utilities 3_Imaging\t 5_Simulations\t7_CUDALibraries common Makefile\n"
469 | ],
470 | "name": "stdout"
471 | }
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "metadata": {
477 | "id": "W7q1WiDQplpI",
478 | "colab_type": "code",
479 | "colab": {
480 | "base_uri": "https://localhost:8080/",
481 | "height": 35
482 | },
483 | "outputId": "7f4627d7-5756-4b08-efcb-63389388638d"
484 | },
485 | "source": [
486 | "%cd 1_Utilities"
487 | ],
488 | "execution_count": 18,
489 | "outputs": [
490 | {
491 | "output_type": "stream",
492 | "text": [
493 | "/usr/local/cuda-10.0/samples/1_Utilities\n"
494 | ],
495 | "name": "stdout"
496 | }
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "metadata": {
502 | "id": "GeUh0DO8popv",
503 | "colab_type": "code",
504 | "colab": {
505 | "base_uri": "https://localhost:8080/",
506 | "height": 53
507 | },
508 | "outputId": "27111e17-67c4-4e3f-84f3-496e97040544"
509 | },
510 | "source": [
511 | "!ls"
512 | ],
513 | "execution_count": 19,
514 | "outputs": [
515 | {
516 | "output_type": "stream",
517 | "text": [
518 | "bandwidthTest deviceQueryDrv\t\ttopologyQuery\n",
519 | "deviceQuery p2pBandwidthLatencyTest\tUnifiedMemoryPerf\n"
520 | ],
521 | "name": "stdout"
522 | }
523 | ]
524 | },
525 | {
526 | "cell_type": "code",
527 | "metadata": {
528 | "id": "DW4X1v_ZppNn",
529 | "colab_type": "code",
530 | "colab": {
531 | "base_uri": "https://localhost:8080/",
532 | "height": 35
533 | },
534 | "outputId": "f74024fe-a91e-43fa-b095-0844e7b74082"
535 | },
536 | "source": [
537 | "%cd bandwidthTest"
538 | ],
539 | "execution_count": 20,
540 | "outputs": [
541 | {
542 | "output_type": "stream",
543 | "text": [
544 | "/usr/local/cuda-10.0/samples/1_Utilities/bandwidthTest\n"
545 | ],
546 | "name": "stdout"
547 | }
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "metadata": {
553 | "id": "wQjborwQprlg",
554 | "colab_type": "code",
555 | "colab": {
556 | "base_uri": "https://localhost:8080/",
557 | "height": 35
558 | },
559 | "outputId": "2b30257d-7d5d-4af8-f6c8-606bdae85305"
560 | },
561 | "source": [
562 | "!ls"
563 | ],
564 | "execution_count": 21,
565 | "outputs": [
566 | {
567 | "output_type": "stream",
568 | "text": [
569 | "bandwidthTest.cu Makefile NsightEclipse.xml readme.txt\n"
570 | ],
571 | "name": "stdout"
572 | }
573 | ]
574 | },
575 | {
576 | "cell_type": "code",
577 | "metadata": {
578 | "id": "ak6adm0wpsKg",
579 | "colab_type": "code",
580 | "colab": {
581 | "base_uri": "https://localhost:8080/",
582 | "height": 109
583 | },
584 | "outputId": "d5b23d3b-6838-4219-e82e-4ccad0360c93"
585 | },
586 | "source": [
587 | "!make"
588 | ],
589 | "execution_count": 22,
590 | "outputs": [
591 | {
592 | "output_type": "stream",
593 | "text": [
594 | "/usr/local/cuda-10.0/bin/nvcc -ccbin g++ -I../../common/inc -m64 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -o bandwidthTest.o -c bandwidthTest.cu\n",
595 | "/usr/local/cuda-10.0/bin/nvcc -ccbin g++ -m64 -gencode arch=compute_30,code=sm_30 -gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_52,code=sm_52 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_75,code=compute_75 -o bandwidthTest bandwidthTest.o \n",
596 | "mkdir -p ../../bin/x86_64/linux/release\n",
597 | "cp bandwidthTest ../../bin/x86_64/linux/release\n"
598 | ],
599 | "name": "stdout"
600 | }
601 | ]
602 | },
603 | {
604 | "cell_type": "code",
605 | "metadata": {
606 | "id": "xbL3i63kptaX",
607 | "colab_type": "code",
608 | "colab": {
609 | "base_uri": "https://localhost:8080/",
610 | "height": 449
611 | },
612 | "outputId": "0be882c9-8e7b-4a54-8d40-e62c203c6a42"
613 | },
614 | "source": [
615 | "!./bandwidthTest"
616 | ],
617 | "execution_count": 23,
618 | "outputs": [
619 | {
620 | "output_type": "stream",
621 | "text": [
622 | "[CUDA Bandwidth Test] - Starting...\n",
623 | "Running on...\n",
624 | "\n",
625 | " Device 0: Tesla T4\n",
626 | " Quick Mode\n",
627 | "\n",
628 | " Host to Device Bandwidth, 1 Device(s)\n",
629 | " PINNED Memory Transfers\n",
630 | " Transfer Size (Bytes)\tBandwidth(MB/s)\n",
631 | " 33554432\t\t\t11859.1\n",
632 | "\n",
633 | " Device to Host Bandwidth, 1 Device(s)\n",
634 | " PINNED Memory Transfers\n",
635 | " Transfer Size (Bytes)\tBandwidth(MB/s)\n",
636 | " 33554432\t\t\t12822.5\n",
637 | "\n",
638 | " Device to Device Bandwidth, 1 Device(s)\n",
639 | " PINNED Memory Transfers\n",
640 | " Transfer Size (Bytes)\tBandwidth(MB/s)\n",
641 | " 33554432\t\t\t235308.2\n",
642 | "\n",
643 | "Result = PASS\n",
644 | "\n",
645 | "NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n"
646 | ],
647 | "name": "stdout"
648 | }
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "metadata": {
654 | "id": "yn9-hK0Fpwyf",
655 | "colab_type": "code",
656 | "colab": {}
657 | },
658 | "source": [
659 | ""
660 | ],
661 | "execution_count": 0,
662 | "outputs": []
663 | }
664 | ]
665 | }
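The PINNED transfer numbers that bandwidthTest reports above come from page-locked (pinned) host buffers, which the GPU can DMA to and from directly. A minimal sketch of timing one pinned host-to-device copy with the runtime API (illustrative only; the buffer size, event-based timing, and the MB/s formula are assumptions, not code from the sample):

```c
#include <stdio.h>
#include <cuda_runtime.h>

int main(void)
{
    const size_t bytes = 32 << 20;          // 32 MiB, roughly the transfer size bandwidthTest uses
    float *h_pinned = NULL, *d_buf = NULL;
    cudaMallocHost(&h_pinned, bytes);       // page-locked host allocation (DMA-able)
    cudaMalloc(&d_buf, bytes);

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    cudaMemcpy(d_buf, h_pinned, bytes, cudaMemcpyHostToDevice);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    printf("Host to Device: %.1f MB/s\n", (bytes / (1024.0 * 1024.0)) / (ms / 1000.0));

    cudaFreeHost(h_pinned);
    cudaFree(d_buf);
    return 0;
}
```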
--------------------------------------------------------------------------------
/01_cuda_lab/09_coalMemory.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "09_coalMemory.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "accelerator": "GPU"
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "id": "view-in-github",
22 | "colab_type": "text"
23 | },
24 | "source": [
25 | "
"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "id": "D0UyXU-y6in5",
32 | "colab_type": "text"
33 | },
34 | "source": [
35 | "## Memory Performance Comparison\n",
36 | "\n",
37 | "- stride effect\n",
38 | "- offset effect\n",
39 | "\n",
40 | "---\n",
41 | "---\n",
42 | "\n",
43 | "**offset function**\n",
44 | "```c\n",
45 | "template \n",
46 | "__global__ void offset(T* a, int s)\n",
47 | "{\n",
48 | " int i = blockDim.x * blockIdx.x + threadIdx.x + s;\n",
49 | " a[i] = a[i] + 1;\n",
50 | "}\n",
51 | "```\n",
52 | "\n",
53 | "**stride function**\n",
54 | "```c\n",
55 | "template \n",
56 | "__global__ void stride(T* a, int s)\n",
57 | "{\n",
58 | " int i = (blockDim.x * blockIdx.x + threadIdx.x) * s;\n",
59 | " a[i] = a[i] + 1;\n",
60 | "}\n",
61 | "\n",
62 | "```"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "metadata": {
68 | "id": "RK3Q1A7Y57Q2",
69 | "colab_type": "code",
70 | "colab": {
71 | "base_uri": "https://localhost:8080/",
72 | "height": 35
73 | },
74 | "outputId": "fe8084bc-4db3-4351-d935-3b1f4e8a26bd"
75 | },
76 | "source": [
77 | "%%writefile coalMemory.cu\n",
78 | "\n",
79 | "/* Copyright (c) 1993-2015, NVIDIA CORPORATION. All rights reserved.\n",
80 | " *\n",
81 | " * Redistribution and use in source and binary forms, with or without\n",
82 | " * modification, are permitted provided that the following conditions\n",
83 | " * are met:\n",
84 | " * * Redistributions of source code must retain the above copyright\n",
85 | " * notice, this list of conditions and the following disclaimer.\n",
86 | " * * Redistributions in binary form must reproduce the above copyright\n",
87 | " * notice, this list of conditions and the following disclaimer in the\n",
88 | " * documentation and/or other materials provided with the distribution.\n",
89 | " * * Neither the name of NVIDIA CORPORATION nor the names of its\n",
90 | " * contributors may be used to endorse or promote products derived\n",
91 | " * from this software without specific prior written permission.\n",
92 | " *\n",
93 | " * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n",
94 | " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n",
95 | " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n",
96 | " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n",
97 | " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n",
98 | " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n",
99 | " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n",
100 | " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n",
101 | " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n",
102 | " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n",
103 | " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n",
104 | " */\n",
105 | "#include \n",
106 | "#include \n",
107 | "\n",
108 | "// Convenience function for checking CUDA runtime API results\n",
109 | "// can be wrapped around any runtime API call. No-op in release builds.\n",
110 | "inline\n",
111 | "cudaError_t checkCuda(cudaError_t result)\n",
112 | "{\n",
113 | "#if defined(DEBUG) || defined(_DEBUG)\n",
114 | " if (result != cudaSuccess) {\n",
115 | " fprintf(stderr, \"CUDA Runtime Error: %s\\n\", cudaGetErrorString(result));\n",
116 | " assert(result == cudaSuccess);\n",
117 | " }\n",
118 | "#endif\n",
119 | " return result;\n",
120 | "}\n",
121 | "\n",
122 | "template \n",
123 | "__global__ void offset(T* a, int s)\n",
124 | "{\n",
125 | " int i = blockDim.x * blockIdx.x + threadIdx.x + s;\n",
126 | " a[i] = a[i] + 1;\n",
127 | "}\n",
128 | "\n",
129 | "template \n",
130 | "__global__ void stride(T* a, int s)\n",
131 | "{\n",
132 | " int i = (blockDim.x * blockIdx.x + threadIdx.x) * s;\n",
133 | " a[i] = a[i] + 1;\n",
134 | "}\n",
135 | "\n",
136 | "template \n",
137 | "void runTest(int deviceId, int nMB)\n",
138 | "{\n",
139 | " int blockSize = 256;\n",
140 | " float ms;\n",
141 | "\n",
142 | " T *d_a;\n",
143 | " cudaEvent_t startEvent, stopEvent;\n",
144 | " \n",
145 | " int n = nMB*1024*1024/sizeof(T);\n",
146 | "\n",
147 | " // NB: d_a(33*nMB) for stride case\n",
148 | " checkCuda( cudaMalloc(&d_a, n * 33 * sizeof(T)) );\n",
149 | "\n",
150 | " checkCuda( cudaEventCreate(&startEvent) );\n",
151 | " checkCuda( cudaEventCreate(&stopEvent) );\n",
152 | "\n",
153 | " printf(\"Offset, Bandwidth (GB/s):\\n\");\n",
154 | " \n",
155 | " offset<<>>(d_a, 0); // warm up\n",
156 | "\n",
157 | " for (int i = 0; i <= 32; i++) {\n",
158 | " checkCuda( cudaMemset(d_a, 0, n * sizeof(T)) );\n",
159 | "\n",
160 | " checkCuda( cudaEventRecord(startEvent,0) );\n",
161 | " offset<<>>(d_a, i);\n",
162 | " checkCuda( cudaEventRecord(stopEvent,0) );\n",
163 | " checkCuda( cudaEventSynchronize(stopEvent) );\n",
164 | "\n",
165 | " checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );\n",
166 | " printf(\"%d, %f\\n\", i, 2*nMB/ms);\n",
167 | " }\n",
168 | "\n",
169 | " printf(\"\\n\");\n",
170 | " printf(\"Stride, Bandwidth (GB/s):\\n\");\n",
171 | "\n",
172 | " stride<<>>(d_a, 1); // warm up\n",
173 | " for (int i = 1; i <= 32; i++) {\n",
174 | " checkCuda( cudaMemset(d_a, 0, n * sizeof(T)) );\n",
175 | "\n",
176 | " checkCuda( cudaEventRecord(startEvent,0) );\n",
177 | " stride<<>>(d_a, i);\n",
178 | " checkCuda( cudaEventRecord(stopEvent,0) );\n",
179 | " checkCuda( cudaEventSynchronize(stopEvent) );\n",
180 | "\n",
181 | " checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );\n",
182 | " printf(\"%d, %f\\n\", i, 2*nMB/ms);\n",
183 | " }\n",
184 | "\n",
185 | " checkCuda( cudaEventDestroy(startEvent) );\n",
186 | " checkCuda( cudaEventDestroy(stopEvent) );\n",
187 | " cudaFree(d_a);\n",
188 | "}\n",
189 | "\n",
190 | "int main(int argc, char **argv)\n",
191 | "{\n",
192 | " int nMB = 4;\n",
193 | " int deviceId = 0;\n",
194 | " bool bFp64 = false;\n",
195 | "\n",
196 | " for (int i = 1; i < argc; i++) { \n",
197 | " if (!strncmp(argv[i], \"dev=\", 4))\n",
198 | " deviceId = atoi((char*)(&argv[i][4]));\n",
199 | " else if (!strcmp(argv[i], \"fp64\"))\n",
200 | " bFp64 = true;\n",
201 | " }\n",
202 | " \n",
203 | " cudaDeviceProp prop;\n",
204 | " \n",
205 | " checkCuda( cudaSetDevice(deviceId) )\n",
206 | " ;\n",
207 | " checkCuda( cudaGetDeviceProperties(&prop, deviceId) );\n",
208 | " printf(\"Device: %s\\n\", prop.name);\n",
209 | " printf(\"Transfer size (MB): %d\\n\", nMB);\n",
210 | " \n",
211 | " printf(\"%s Precision\\n\", bFp64 ? \"Double\" : \"Single\");\n",
212 | " \n",
213 | " if (bFp64) runTest(deviceId, nMB);\n",
214 | " else runTest(deviceId, nMB);\n",
215 | "}"
216 | ],
217 | "execution_count": 2,
218 | "outputs": [
219 | {
220 | "output_type": "stream",
221 | "text": [
222 | "Writing coalMemory.cu\n"
223 | ],
224 | "name": "stdout"
225 | }
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "metadata": {
231 | "id": "WCtVyn-A6FPQ",
232 | "colab_type": "code",
233 | "colab": {}
234 | },
235 | "source": [
236 | "!nvcc -o coalMemory coalMemory.cu"
237 | ],
238 | "execution_count": 0,
239 | "outputs": []
240 | },
241 | {
242 | "cell_type": "code",
243 | "metadata": {
244 | "id": "kTNlZQaU6JeG",
245 | "colab_type": "code",
246 | "colab": {
247 | "base_uri": "https://localhost:8080/",
248 | "height": 1295
249 | },
250 | "outputId": "35368bc2-c3a9-44d0-9307-6cb16e51dfbb"
251 | },
252 | "source": [
253 | "!./coalMemory"
254 | ],
255 | "execution_count": 4,
256 | "outputs": [
257 | {
258 | "output_type": "stream",
259 | "text": [
260 | "Device: Tesla T4\n",
261 | "Transfer size (MB): 4\n",
262 | "Single Precision\n",
263 | "Offset, Bandwidth (GB/s):\n",
264 | "0, 269.687164\n",
265 | "1, 260.960327\n",
266 | "2, 263.435181\n",
267 | "3, 261.233002\n",
268 | "4, 271.149689\n",
269 | "5, 260.416687\n",
270 | "6, 263.157898\n",
271 | "7, 260.416687\n",
272 | "8, 269.978394\n",
273 | "9, 269.978394\n",
274 | "10, 265.957458\n",
275 | "11, 261.233002\n",
276 | "12, 276.243073\n",
277 | "13, 261.506287\n",
278 | "14, 274.122803\n",
279 | "15, 260.688202\n",
280 | "16, 278.086761\n",
281 | "17, 260.416687\n",
282 | "18, 266.240692\n",
283 | "19, 260.688202\n",
284 | "20, 275.938202\n",
285 | "21, 260.960327\n",
286 | "22, 272.034821\n",
287 | "23, 260.416687\n",
288 | "24, 275.938202\n",
289 | "25, 263.435181\n",
290 | "26, 267.379669\n",
291 | "27, 260.416687\n",
292 | "28, 271.149689\n",
293 | "29, 260.960327\n",
294 | "30, 266.240692\n",
295 | "31, 261.233002\n",
296 | "32, 275.633972\n",
297 | "\n",
298 | "Stride, Bandwidth (GB/s):\n",
299 | "1, 273.224060\n",
300 | "2, 144.425186\n",
301 | "3, 90.876045\n",
302 | "4, 65.789474\n",
303 | "5, 49.495148\n",
304 | "6, 41.118420\n",
305 | "7, 34.270050\n",
306 | "8, 30.048077\n",
307 | "9, 26.757999\n",
308 | "10, 24.492994\n",
309 | "11, 22.323421\n",
310 | "12, 20.490124\n",
311 | "13, 19.357336\n",
312 | "14, 18.513033\n",
313 | "15, 17.204597\n",
314 | "16, 16.723526\n",
315 | "17, 15.141420\n",
316 | "18, 14.575560\n",
317 | "19, 13.357555\n",
318 | "20, 13.240824\n",
319 | "21, 12.118862\n",
320 | "22, 11.626285\n",
321 | "23, 10.735603\n",
322 | "24, 10.728693\n",
323 | "25, 10.361406\n",
324 | "26, 9.977252\n",
325 | "27, 9.870889\n",
326 | "28, 9.316886\n",
327 | "29, 8.758409\n",
328 | "30, 9.010957\n",
329 | "31, 8.436255\n",
330 | "32, 9.151140\n"
331 | ],
332 | "name": "stdout"
333 | }
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "metadata": {
339 | "id": "t3KE5ahS6Yz1",
340 | "colab_type": "code",
341 | "colab": {}
342 | },
343 | "source": [
344 | ""
345 | ],
346 | "execution_count": 0,
347 | "outputs": []
348 | }
349 | ]
350 | }
--------------------------------------------------------------------------------
/01_cuda_lab/README.md:
--------------------------------------------------------------------------------
1 | ## CUDA Lab Files
2 | 
3 | - 01_simple.ipynb: basic C programming and introductory pthread programming
4 | - 02_openmp.ipynb: basic OpenMP test
5 | - 03_simple_avx.ipynb: SIMD AVX test
6 | - 04_helloCUDA.ipynb: introductory CUDA programming
7 | - 05_vectorAdd.ipynb: vector addition programming
8 | - 06_2DIndex.ipynb: 2-D thread indexing (see the sketch below)
9 |
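As a quick illustration of the 2-D thread indexing covered in 06_2DIndex.ipynb, a minimal sketch (illustrative only; the kernel and launch shape are assumptions, not code from the notebook):

```c
// Each thread computes its global 2-D coordinate and stores its linearized index.
__global__ void index2D(int *out, int width, int height)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col < width && row < height)
        out[row * width + col] = row * width + col;
}

// Example launch for a width x height matrix:
//   dim3 block(16, 16);
//   dim3 grid((width + 15) / 16, (height + 15) / 16);
//   index2D<<<grid, block>>>(d_out, width, height);
```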
--------------------------------------------------------------------------------
/01_cuda_lab/clock.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
3 | *
4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
5 | * with this source code for terms and conditions that govern your use of
6 | * this software. Any use, reproduction, disclosure, or distribution of
7 | * this software and related documentation outside the terms of the EULA
8 | * is strictly prohibited.
9 | *
10 | */
11 |
12 | // This example shows how to use the clock function to measure the performance of
13 | // a kernel accurately.
14 | //
15 | // Blocks are executed in parallel and out of order. Since there's no synchronization
16 | // mechanism between blocks, we measure the clock once for each block. The clock
17 | // samples are written to device memory.
18 |
19 | // System includes
20 | #include <stdio.h>
21 | #include <stdint.h>
22 | #include <assert.h>
23 |
24 | // CUDA runtime
25 | #include <cuda_runtime.h>
26 |
27 | // helper functions and utilities to work with CUDA
28 | #include <helper_functions.h>
29 | #include <helper_cuda.h>
30 |
31 | // This kernel computes a standard parallel reduction and evaluates the
32 | // time it takes to do that for each block. The timing results are stored
33 | // in device memory.
34 | __global__ static void timedReduction(const float *input, float *output, clock_t *timer)
35 | {
36 | // __shared__ float shared[2 * blockDim.x];
37 | extern __shared__ float shared[];
38 |
39 | const int tid = threadIdx.x;
40 | const int bid = blockIdx.x;
41 |
42 | if (tid == 0) timer[bid] = clock();
43 |
44 | // Copy input.
45 | shared[tid] = input[tid];
46 | shared[tid + blockDim.x] = input[tid + blockDim.x];
47 |
48 | // Perform reduction to find minimum.
49 | for (int d = blockDim.x; d > 0; d /= 2)
50 | {
51 | __syncthreads();
52 |
53 | if (tid < d)
54 | {
55 | float f0 = shared[tid];
56 | float f1 = shared[tid + d];
57 |
58 | if (f1 < f0)
59 | {
60 | shared[tid] = f1;
61 | }
62 | }
63 | }
64 |
65 | // Write result.
66 | if (tid == 0) output[bid] = shared[0];
67 |
68 | __syncthreads();
69 |
70 | if (tid == 0) timer[bid+gridDim.x] = clock();
71 | }
72 |
73 |
74 | // This example shows how to use the clock function to measure the performance of
75 | // a kernel accurately.
76 | //
77 | // Blocks are executed in parallel and out of order. Since there's no synchronization
78 | // mechanism between blocks, we measure the clock once for each block. The clock
79 | // samples are written to device memory.
80 |
81 | #define NUM_BLOCKS 64
82 | #define NUM_THREADS 256
83 |
84 | // It's interesting to change the number of blocks and the number of threads to
85 | // understand how to keep the hardware busy.
86 | //
87 | // Here are some numbers I get on my G80:
88 | // blocks - clocks
89 | // 1 - 3096
90 | // 8 - 3232
91 | // 16 - 3364
92 | // 32 - 4615
93 | // 64 - 9981
94 | //
95 | // With less than 16 blocks some of the multiprocessors of the device are idle. With
96 | // more than 16 you are using all the multiprocessors, but there's only one block per
97 | // multiprocessor and that doesn't allow you to hide the latency of the memory. With
98 | // more than 32 the speed scales linearly.
99 |
100 | // Start the main CUDA Sample here
101 | int main(int argc, char **argv)
102 | {
103 | printf("CUDA Clock sample\n");
104 |
105 | // This will pick the best possible CUDA capable device
106 | int dev = findCudaDevice(argc, (const char **)argv);
107 |
108 | float *dinput = NULL;
109 | float *doutput = NULL;
110 | clock_t *dtimer = NULL;
111 |
112 | clock_t timer[NUM_BLOCKS * 2];
113 | float input[NUM_THREADS * 2];
114 |
115 | for (int i = 0; i < NUM_THREADS * 2; i++)
116 | {
117 | input[i] = (float)i;
118 | }
119 |
120 | checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
121 | checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
122 | checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
123 |
124 | checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
125 |
126 | timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
127 |
128 | checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
129 |
130 | checkCudaErrors(cudaFree(dinput));
131 | checkCudaErrors(cudaFree(doutput));
132 | checkCudaErrors(cudaFree(dtimer));
133 |
134 |
135 | // Compute the difference between the last block end and the first block start.
136 | clock_t minStart = timer[0];
137 | clock_t maxEnd = timer[NUM_BLOCKS];
138 |
139 | for (int i = 1; i < NUM_BLOCKS; i++)
140 | {
141 | minStart = timer[i] < minStart ? timer[i] : minStart;
142 | maxEnd = timer[NUM_BLOCKS+i] > maxEnd ? timer[NUM_BLOCKS+i] : maxEnd;
143 | }
144 |
145 | printf("Total clocks = %Lf\n", (long double)(maxEnd - minStart));
146 |
147 |
148 | // cudaDeviceReset causes the driver to clean up all state. While
149 | // not mandatory in normal operation, it is good practice. It is also
150 | // needed to ensure correct operation when the application is being
151 | // profiled. Calling cudaDeviceReset causes all profile data to be
152 | // flushed before the application exits
153 | cudaDeviceReset();
154 |
155 | return EXIT_SUCCESS;
156 | }
157 |
--------------------------------------------------------------------------------
/02_cuda_lab/00_UnifiedMemory_SharedMem.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "00_UnifiedMemory_SharedMem.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "accelerator": "GPU"
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "id": "view-in-github",
22 | "colab_type": "text"
23 | },
24 | "source": [
25 | "
"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "id": "NC8OEmdvu52k",
32 | "colab_type": "text"
33 | },
34 | "source": [
35 | "## Unified Memory Test & Shared Memory vs Global Memory Test\n",
36 | "\n",
37 | "# Unified Memory Test\n",
38 | "\n",
39 | "참조: https://devblogs.nvidia.com/unified-memory-cuda-beginners/\n",
40 | "\n",
41 | "\n",
42 | "cudaMalloc 및 cudaMemCpy 등에 대한 고려 없이 마치 host에서 메모리를 사용하 듯이 GPU의 메모리를 사용함\n",
43 | "\n",
44 | "**cudaMallocManaged** 를 사용합니다!!!\n",
45 | "\n"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "metadata": {
51 | "id": "XiDceBCqu0TF",
52 | "colab_type": "code",
53 | "colab": {
54 | "base_uri": "https://localhost:8080/",
55 | "height": 35
56 | },
57 | "outputId": "4d66307b-a3d5-4d75-d0f3-2e09ba18e5fd"
58 | },
59 | "source": [
60 | "%%writefile unifiedMem.cu\n",
61 | "\n",
62 | "#include \n",
63 | "#include \n",
64 | " \n",
65 | "// CUDA kernel to add elements of two arrays\n",
66 | "__global__\n",
67 | "void add(int n, float *x, float *y)\n",
68 | "{\n",
69 | " int index = blockIdx.x * blockDim.x + threadIdx.x;\n",
70 | " int stride = blockDim.x * gridDim.x;\n",
71 | " for (int i = index; i < n; i += stride)\n",
72 | " y[i] = x[i] + y[i];\n",
73 | "}\n",
74 | " \n",
75 | "int main(void)\n",
76 | "{\n",
77 | " int N = 1<<20;\n",
78 | " float *x, *y;\n",
79 | " \n",
80 | " // Allocate Unified Memory -- accessible from CPU or GPU\n",
81 | " cudaMallocManaged(&x, N*sizeof(float));\n",
82 | " cudaMallocManaged(&y, N*sizeof(float));\n",
83 | " \n",
84 | " // initialize x and y arrays on the host\n",
85 | " for (int i = 0; i < N; i++) {\n",
86 | " x[i] = 1.0f;\n",
87 | " y[i] = 2.0f;\n",
88 | " }\n",
89 | " \n",
90 | " // Launch kernel on 1M elements on the GPU\n",
91 | " int blockSize = 256;\n",
92 | " int numBlocks = (N + blockSize - 1) / blockSize;\n",
93 | " add<<>>(N, x, y);\n",
94 | " \n",
95 | " // Wait for GPU to finish before accessing on host\n",
96 | " cudaDeviceSynchronize();\n",
97 | " \n",
98 | " // Check for errors (all values should be 3.0f)\n",
99 | " float maxError = 0.0f;\n",
100 | " for (int i = 0; i < N; i++)\n",
101 | " maxError = fmax(maxError, fabs(y[i]-3.0f));\n",
102 | " std::cout << \"Max error: \" << maxError << std::endl;\n",
103 | " \n",
104 | " // Free memory\n",
105 | " cudaFree(x);\n",
106 | " cudaFree(y);\n",
107 | " \n",
108 | " return 0;\n",
109 | "}"
110 | ],
111 | "execution_count": 1,
112 | "outputs": [
113 | {
114 | "output_type": "stream",
115 | "text": [
116 | "Writing unifiedMem.cu\n"
117 | ],
118 | "name": "stdout"
119 | }
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "metadata": {
125 | "id": "AJuad17dw3i7",
126 | "colab_type": "code",
127 | "colab": {}
128 | },
129 | "source": [
130 | "!nvcc -o unifiedMem unifiedMem.cu"
131 | ],
132 | "execution_count": 0,
133 | "outputs": []
134 | },
135 | {
136 | "cell_type": "code",
137 | "metadata": {
138 | "id": "5g_RA52kw3UX",
139 | "colab_type": "code",
140 | "colab": {
141 | "base_uri": "https://localhost:8080/",
142 | "height": 35
143 | },
144 | "outputId": "b981cae8-c5ae-40ba-e462-528b385daebf"
145 | },
146 | "source": [
147 | "!./unifiedMem"
148 | ],
149 | "execution_count": 3,
150 | "outputs": [
151 | {
152 | "output_type": "stream",
153 | "text": [
154 | "Max error: 0\n"
155 | ],
156 | "name": "stdout"
157 | }
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "metadata": {
163 | "id": "chCSPzl9xOHb",
164 | "colab_type": "code",
165 | "colab": {
166 | "base_uri": "https://localhost:8080/",
167 | "height": 449
168 | },
169 | "outputId": "4e72a62e-8c47-4057-840e-74dfac2e3c1a"
170 | },
171 | "source": [
172 | "!nvprof ./unifiedMem"
173 | ],
174 | "execution_count": 4,
175 | "outputs": [
176 | {
177 | "output_type": "stream",
178 | "text": [
179 | "==404== NVPROF is profiling process 404, command: ./unifiedMem\n",
180 | "Max error: 0\n",
181 | "==404== Profiling application: ./unifiedMem\n",
182 | "==404== Profiling result:\n",
183 | " Type Time(%) Time Calls Avg Min Max Name\n",
184 | " GPU activities: 100.00% 102.46us 1 102.46us 102.46us 102.46us add(int, float*, float*)\n",
185 | " API calls: 98.33% 219.67ms 2 109.84ms 803.66us 218.87ms cudaMallocManaged\n",
186 | " 0.83% 1.8441ms 1 1.8441ms 1.8441ms 1.8441ms cudaLaunchKernel\n",
187 | " 0.52% 1.1627ms 2 581.33us 486.14us 676.53us cudaFree\n",
188 | " 0.17% 379.82us 96 3.9560us 157ns 150.05us cuDeviceGetAttribute\n",
189 | " 0.08% 189.25us 1 189.25us 189.25us 189.25us cuDeviceTotalMem\n",
190 | " 0.05% 116.81us 1 116.81us 116.81us 116.81us cudaDeviceSynchronize\n",
191 | " 0.01% 23.703us 1 23.703us 23.703us 23.703us cuDeviceGetName\n",
192 | " 0.00% 3.2300us 1 3.2300us 3.2300us 3.2300us cuDeviceGetPCIBusId\n",
193 | " 0.00% 2.1020us 3 700ns 183ns 1.1680us cuDeviceGetCount\n",
194 | " 0.00% 1.6160us 2 808ns 262ns 1.3540us cuDeviceGet\n",
195 | " 0.00% 354ns 1 354ns 354ns 354ns cuDeviceGetUuid\n",
196 | "\n",
197 | "==404== Unified Memory profiling result:\n",
198 | "Device \"Tesla K80 (0)\"\n",
199 | " Count Avg Size Min Size Max Size Total Size Total Time Name\n",
200 | " 6 1.3333MB 256.00KB 2.0000MB 8.000000MB 1.184960ms Host To Device\n",
201 | " 90 136.53KB 4.0000KB 0.9961MB 12.00000MB 1.893344ms Device To Host\n",
202 | "Total CPU Page faults: 45\n"
203 | ],
204 | "name": "stdout"
205 | }
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "metadata": {
211 | "id": "NOB94YC0x8sN",
212 | "colab_type": "text"
213 | },
214 | "source": [
215 | "- 참조: \n",
216 | " - https://devblogs.nvidia.com/unified-memory-cuda-beginners/\n",
217 | " - https://devblogs.nvidia.com/maximizing-unified-memory-performance-cuda/"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {
223 | "id": "pg3EPDZVvGaf",
224 | "colab_type": "text"
225 | },
226 | "source": [
227 | "# Shared Memory vs Global Memory Test\n"
228 | ]
229 | }
230 | ]
231 | }
--------------------------------------------------------------------------------
/02_cuda_lab/03_reduction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "03_reduction.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {
20 | "id": "view-in-github",
21 | "colab_type": "text"
22 | },
23 | "source": [
24 | "
"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {
30 | "id": "gjiSKFe4SA7Z",
31 | "colab_type": "text"
32 | },
33 | "source": [
34 | ""
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "metadata": {
40 | "id": "dkJq9vwsR_tn",
41 | "colab_type": "code",
42 | "colab": {
43 | "base_uri": "https://localhost:8080/",
44 | "height": 35
45 | },
46 | "outputId": "5239573b-0882-4e59-8941-34b0b480416b"
47 | },
48 | "source": [
49 | "%%writefile reduction.cu\n",
50 | "\n",
51 | "#include \n",
52 | "\n",
53 | "__global__ void reduce0(int *g_idata, int *g_odata) {\n",
54 | "\n",
55 | " extern __shared__ int sdata[];\n",
56 | " // each thread loads one element from global to shared mem\n",
57 | " unsigned int tid = threadIdx.x;\n",
58 | " unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;\n",
59 | " sdata[tid] = g_idata[i];\n",
60 | " __syncthreads();\n",
61 | " // do reduction in shared mem\n",
62 | " for(unsigned int s=1; s < blockDim.x; s *= 2) {\n",
63 | " if (tid % (2*s) == 0) {\n",
64 | " sdata[tid] += sdata[tid + s];\n",
65 | " }\n",
66 | " __syncthreads();\n",
67 | " }\n",
68 | " // write result for this block to global mem\n",
69 | " if (tid == 0) g_odata[blockIdx.x] = sdata[0];\n",
70 | "}\n",
71 | "\n",
72 | "__global__ void reduce1(int *g_idata, int *g_odata) {\n",
73 | "\n",
74 | " extern __shared__ int sdata[];\n",
75 | " // each thread loads one element from global to shared mem\n",
76 | " unsigned int tid = threadIdx.x;\n",
77 | " unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;\n",
78 | " sdata[tid] = g_idata[i];\n",
79 | " __syncthreads();\n",
80 | " // do reduction in shared mem\n",
81 | " for(unsigned int s=1; s < blockDim.x; s *= 2) {\n",
82 | " int index = 2 * s * tid;\n",
83 | " if (index < blockDim.x) {\n",
84 | " sdata[index] += sdata[index + s];\n",
85 | " }\n",
86 | " __syncthreads();\n",
87 | " }\n",
88 | " // write result for this block to global mem\n",
89 | " if (tid == 0) g_odata[blockIdx.x] = sdata[0];\n",
90 | "}\n",
91 | "__global__ void reduce2(int *g_idata, int *g_odata) {\n",
92 | "\n",
93 | " extern __shared__ int sdata[];\n",
94 | " // each thread loads one element from global to shared mem\n",
95 | " unsigned int tid = threadIdx.x;\n",
96 | " unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;\n",
97 | " sdata[tid] = g_idata[i];\n",
98 | " __syncthreads();\n",
99 | " // do reduction in shared mem\n",
100 | " for (unsigned int s=blockDim.x/2; s>0; s>>=1) {\n",
101 | " if (tid < s) {\n",
102 | " sdata[tid] += sdata[tid + s];\n",
103 | " }\n",
104 | " __syncthreads();\n",
105 | " }\n",
106 | "\n",
107 | " // write result for this block to global mem\n",
108 | " if (tid == 0) g_odata[blockIdx.x] = sdata[0];\n",
109 | "}\n",
110 | "\n",
111 | "__global__ void reduce3(int *g_idata, int *g_odata) {\n",
112 | "\n",
113 | " extern __shared__ int sdata[];\n",
114 | " // perform first level of reduction, reading from global memory, writing to shared memory\n",
115 | " unsigned int tid = threadIdx.x;\n",
116 | " unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;\n",
117 | " sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];\n",
118 | " __syncthreads();\n",
119 | "\n",
120 | " // do reduction in shared mem\n",
121 | " for (unsigned int s=blockDim.x/2; s>0; s>>=1) {\n",
122 | " if (tid < s) {\n",
123 | " sdata[tid] += sdata[tid + s];\n",
124 | " }\n",
125 | " __syncthreads();\n",
126 | " }\n",
127 | "\n",
128 | " // write result for this block to global mem\n",
129 | " if (tid == 0) g_odata[blockIdx.x] = sdata[0];\n",
130 | "}\n",
131 | "\n",
132 | "\n",
133 | "template \n",
134 | "__device__ void warpReduce(volatile int* sdata, int tid) {\n",
135 | " if (blockSize >= 64) sdata[tid] += sdata[tid + 32];\n",
136 | " if (blockSize >= 32) sdata[tid] += sdata[tid + 16];\n",
137 | " if (blockSize >= 16) sdata[tid] += sdata[tid + 8];\n",
138 | " if (blockSize >= 8) sdata[tid] += sdata[tid + 4];\n",
139 | " if (blockSize >= 4) sdata[tid] += sdata[tid + 2];\n",
140 | " if (blockSize >= 2) sdata[tid] += sdata[tid + 1];\n",
141 | "}\n",
142 | "\n",
143 | "__global__ void reduce4(int *g_idata, int *g_odata) {\n",
144 | "\n",
145 | " extern __shared__ int sdata[];\n",
146 | " // perform first level of reduction, reading from global memory, writing to shared memory\n",
147 | " unsigned int tid = threadIdx.x;\n",
148 | " unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;\n",
149 | " sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];\n",
150 | " __syncthreads();\n",
151 | "\n",
152 | " // do reduction in shared mem\n",
153 | " for (unsigned int s=blockDim.x/2; s>32; s>>=1) {\n",
154 | " if (tid < s) {\n",
155 | " sdata[tid] += sdata[tid + s];\n",
156 | " }\n",
157 | " __syncthreads();\n",
158 | " }\n",
159 | " if (tid < 32) warpReduce<512>(sdata, tid);\n",
160 | "\n",
161 | " // write result for this block to global mem\n",
162 | " if (tid == 0) g_odata[blockIdx.x] = sdata[0];\n",
163 | "}\n",
164 | "\n",
165 | "template \n",
166 | "__global__ void reduce5(int *g_idata, int *g_odata) {\n",
167 | "\n",
168 | " extern __shared__ int sdata[];\n",
169 | " // perform first level of reduction, reading from global memory, writing to shared memory\n",
170 | " unsigned int tid = threadIdx.x;\n",
171 | " unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;\n",
172 | " sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];\n",
173 | " __syncthreads();\n",
174 | "\n",
175 | " // do reduction in shared mem\n",
176 | " if (blockSize >= 512) {\n",
177 | " if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads();\n",
178 | " }\n",
179 | " if (blockSize >= 256) {\n",
180 | " if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads();\n",
181 | " }\n",
182 | " if (blockSize >= 128) {\n",
183 | " if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads();\n",
184 | " }\n",
185 | "\n",
186 | " if (tid < 32) warpReduce(sdata, tid);\n",
187 | "\n",
188 | " // write result for this block to global mem\n",
189 | " if (tid == 0) g_odata[blockIdx.x] = sdata[0];\n",
190 | "}\n",
191 | "\n",
192 | "template \n",
193 | "__global__ void reduce6(int *g_idata, int *g_odata, int n) {\n",
194 | "\n",
195 | " extern __shared__ int sdata[];\n",
196 | " // perform first level of reduction, reading from global memory, writing to shared memory\n",
197 | " unsigned int tid = threadIdx.x;\n",
198 | " unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;\n",
199 | " unsigned int gridSize = blockSize*2*gridDim.x;\n",
200 | " sdata[tid] = 0;\n",
201 | " while (i < n) {\n",
202 | " sdata[tid] += g_idata[i] + g_idata[i+blockSize];\n",
203 | " i += gridSize;\n",
204 | " }\n",
205 | " __syncthreads();\n",
206 | "\n",
207 | " // do reduction in shared mem\n",
208 | " if (blockSize >= 512) {\n",
209 | " if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads();\n",
210 | " }\n",
211 | " if (blockSize >= 256) {\n",
212 | " if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads();\n",
213 | " }\n",
214 | " if (blockSize >= 128) {\n",
215 | " if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads();\n",
216 | " }\n",
217 | "\n",
218 | " if (tid < 32) warpReduce(sdata, tid);\n",
219 | "\n",
220 | " // write result for this block to global mem\n",
221 | " if (tid == 0) g_odata[blockIdx.x] = sdata[0];\n",
222 | "}\n",
223 | "\n",
224 | "\n",
225 | "\n",
226 | "int main(void)\n",
227 | "{\n",
228 | " long int size = 1 << 22;\n",
229 | " long int s;\n",
230 | " int sizeByte = size*sizeof(int);\n",
231 | " int* h_data = (int*) malloc(sizeByte);\n",
232 | "\n",
233 | " for(int i = 0; i < size; i++) {\n",
234 | " // h_data[i] = rand() & 0xFF;\n",
235 | " h_data[i] = i % 10;\n",
236 | " }\n",
237 | "\n",
238 | " long long int sum = 0;\n",
239 | " for(int i = 0; i < size; i++) sum += h_data[i];\n",
240 | " printf(\"CPU results = %lld \\n\", sum);\n",
241 | "\n",
242 | " int* d_idata = NULL;\n",
243 | " int* d_odata = NULL;\n",
244 | " cudaMalloc(&d_idata, sizeByte);\n",
245 | " cudaMalloc(&d_odata, sizeByte);\n",
246 | "\n",
247 | " cudaMemcpy(d_idata, h_data, sizeByte, cudaMemcpyHostToDevice);\n",
248 | "\n",
249 | " s = size >> 2;\n",
250 | " int blocks = (s+512-1)/512;\n",
251 | " reduce6<512><<>>(d_idata, d_odata, size);\n",
252 | " cudaDeviceSynchronize();\n",
253 | " printf(\"The size of array is %ld and it is processed on # of Blocks: %d \\n\", s, blocks/2);\n",
254 | "/*\n",
255 | " s = blocks/2;\n",
256 | " blocks = (s+512-1)/512;\n",
257 | " reduce5<512><<>>(d_odata, d_idata);\n",
258 | " cudaDeviceSynchronize();\n",
259 | " printf(\"The size of array is %ld and it is processed on # of Blocks: %d \\n\", s, blocks/2);\n",
260 | " s = blocks;\n",
261 | " int threadsPerBlock;\n",
262 | " if( s <= 512 ) { threadsPerBlock = s; blocks = 1; }\n",
263 | " reduce2<<>>(d_idata, d_odata);\n",
264 | " cudaDeviceSynchronize();\n",
265 | " */\n",
266 | " cudaMemcpy(h_data, d_odata, sizeof(int), cudaMemcpyDeviceToHost);\n",
267 | " printf(\"GPU result = %d\\n\", h_data[0]);\n",
268 | "\n",
269 | " cudaFree(d_idata);\n",
270 | " cudaFree(d_odata);\n",
271 | " free(h_data);\n",
272 | "}"
273 | ],
274 | "execution_count": 1,
275 | "outputs": [
276 | {
277 | "output_type": "stream",
278 | "text": [
279 | "Writing reduction.cu\n"
280 | ],
281 | "name": "stdout"
282 | }
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "metadata": {
288 | "id": "vgQIqpOQSQDy",
289 | "colab_type": "code",
290 | "colab": {}
291 | },
292 | "source": [
293 | "!nvcc -o reduction reduction.cu"
294 | ],
295 | "execution_count": 0,
296 | "outputs": []
297 | },
298 | {
299 | "cell_type": "code",
300 | "metadata": {
301 | "id": "hLIkKkncSSlF",
302 | "colab_type": "code",
303 | "colab": {
304 | "base_uri": "https://localhost:8080/",
305 | "height": 71
306 | },
307 | "outputId": "b91ce673-e747-40e2-f599-c608e746ff6c"
308 | },
309 | "source": [
310 | "!./reduction"
311 | ],
312 | "execution_count": 3,
313 | "outputs": [
314 | {
315 | "output_type": "stream",
316 | "text": [
317 | "CPU results = 18874356 \n",
318 | "The size of array is 1048576 and it is processed on # of Blocks: 1024 \n",
319 | "GPU result = 0\n"
320 | ],
321 | "name": "stdout"
322 | }
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "metadata": {
328 | "id": "e8CxESR7SUDK",
329 | "colab_type": "code",
330 | "colab": {}
331 | },
332 | "source": [
333 | ""
334 | ],
335 | "execution_count": 0,
336 | "outputs": []
337 | }
338 | ]
339 | }
--------------------------------------------------------------------------------
/02_cuda_lab/04_atomic.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "04_atomic.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "accelerator": "GPU"
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "id": "view-in-github",
22 | "colab_type": "text"
23 | },
24 | "source": [
25 | "
"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "id": "t5TzSbHmXJdZ",
32 | "colab_type": "text"
33 | },
34 | "source": [
35 | "## Testing Atomic Operation !\n",
36 | "\n",
37 | "Please check the youtube video : [Atomic Memory Operations - Intro to Parallel Programming](https://www.youtube.com/watch?v=r-WtkvzKcVA)\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "metadata": {
43 | "id": "ftRy1pmLWFR3",
44 | "colab_type": "code",
45 | "colab": {
46 | "base_uri": "https://localhost:8080/",
47 | "height": 35
48 | },
49 | "outputId": "331825c8-8ebc-4717-b11a-8d2c6b73f641"
50 | },
51 | "source": [
52 | "%%writefile atomic.cu\n",
53 | "\n",
54 | "#include \n",
55 | "#include \"gputimer.h\"\n",
56 | "#define NUM_THREADS 10000000\n",
57 | "#define ARRAY_SIZE 100\n",
58 | "#define BLOCK_WIDTH 1000\n",
59 | "\n",
60 | "void print_array(int *array, int size)\n",
61 | "{\n",
62 | " printf(\"{ \");\n",
63 | " for (int i = 0; i < size; i++) { printf(\"%d \", array[i]); }\n",
64 | " printf(\"}\\n\");\n",
65 | "}\n",
66 | "\n",
67 | "__global__ void increment_naive(int *g)\n",
68 | "{\n",
69 | "\t// which thread is this?\n",
70 | "\tint i = blockIdx.x * blockDim.x + threadIdx.x; \n",
71 | "\n",
72 | " // each thread to increment consecutive elements, wrapping at ARRAY_SIZE\n",
73 | "\ti = i % ARRAY_SIZE; \n",
74 | "\tg[i] = g[i] + 1;\n",
75 | "}\n",
76 | "\n",
77 | "__global__ void increment_atomic(int *g)\n",
78 | "{\n",
79 | "\t// which thread is this?\n",
80 | "\tint i = blockIdx.x * blockDim.x + threadIdx.x; \n",
81 | "\n",
82 | " // each thread to increment consecutive elements, wrapping at ARRAY_SIZE\n",
83 | "\ti = i % ARRAY_SIZE; \n",
84 | "\tatomicAdd(& g[i], 1);\n",
85 | "}\n",
86 | "\n",
87 | "int main(int argc,char **argv)\n",
88 | "{ \n",
89 | " GpuTimer timer;\n",
90 | " printf(\"%d total threads in %d blocks writing into %d array elements\\n\",\n",
91 | " NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);\n",
92 | "\n",
93 | " // declare and allocate host memory\n",
94 | " int h_array[ARRAY_SIZE];\n",
95 | " const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);\n",
96 | "\n",
97 | " // declare, allocate, and zero out GPU memory\n",
98 | " int * d_array;\n",
99 | "\n",
100 | " cudaMalloc((void **) &d_array, ARRAY_BYTES);\n",
101 | " cudaMemset((void *) d_array, 0, ARRAY_BYTES); \n",
102 | "\n",
103 | " // launch the kernel - comment out one of these\n",
104 | " timer.Start();\n",
105 | "\n",
106 | " // Instructions: This program is needed for the next quiz\n",
107 | " // uncomment increment_naive to measure speed and accuracy \n",
108 | " // of non-atomic increments or uncomment increment_atomic to\n",
109 | " // measure speed and accuracy of atomic icrements\n",
110 | " increment_naive<<>>(d_array);\n",
111 | " //increment_atomic<<>>(d_array);\n",
112 | " timer.Stop(); \n",
113 | "\n",
114 | " // copy back the array of sums from GPU and print\n",
115 | " cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);\n",
116 | " print_array(h_array, ARRAY_SIZE);\n",
117 | " printf(\"Time elapsed = %g ms\\n\", timer.Elapsed());\n",
118 | "\n",
119 | " // free GPU memory allocation and exit\n",
120 | " cudaFree(d_array);\n",
121 | " return 0;\n",
122 | "}"
123 | ],
124 | "execution_count": 7,
125 | "outputs": [
126 | {
127 | "output_type": "stream",
128 | "text": [
129 | "Overwriting atomic.cu\n"
130 | ],
131 | "name": "stdout"
132 | }
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "metadata": {
138 | "id": "uQEYoHuUWmL9",
139 | "colab_type": "code",
140 | "colab": {
141 | "base_uri": "https://localhost:8080/",
142 | "height": 35
143 | },
144 | "outputId": "31bb7334-a000-49ff-8483-e0c81a11ab54"
145 | },
146 | "source": [
147 | "%%writefile gputimer.h\n",
148 | "\n",
149 | "#ifndef __GPU_TIMER_H__\n",
150 | "#define __GPU_TIMER_H__\n",
151 | "\n",
152 | "struct GpuTimer\n",
153 | "{\n",
154 | " cudaEvent_t start;\n",
155 | " cudaEvent_t stop;\n",
156 | " \n",
157 | " GpuTimer()\n",
158 | " {\n",
159 | " cudaEventCreate(&start);\n",
160 | " cudaEventCreate(&stop);\n",
161 | " }\n",
162 | " \n",
163 | " ~GpuTimer()\n",
164 | " {\n",
165 | " cudaEventDestroy(start);\n",
166 | " cudaEventDestroy(stop);\n",
167 | " }\n",
168 | " \n",
169 | " void Start()\n",
170 | " {\n",
171 | " cudaEventRecord(start, 0);\n",
172 | " }\n",
173 | " \n",
174 | " void Stop()\n",
175 | " {\n",
176 | " cudaEventRecord(stop, 0);\n",
177 | " }\n",
178 | " \n",
179 | " float Elapsed()\n",
180 | " {\n",
181 | " float elapsed;\n",
182 | " cudaEventSynchronize(stop);\n",
183 | " cudaEventElapsedTime(&elapsed, start, stop);\n",
184 | " return elapsed;\n",
185 | " }\n",
186 | "};\n",
187 | "\n",
188 | "#endif /* __GPU_TIMER_H__ */"
189 | ],
190 | "execution_count": 4,
191 | "outputs": [
192 | {
193 | "output_type": "stream",
194 | "text": [
195 | "Writing gputimer.h\n"
196 | ],
197 | "name": "stdout"
198 | }
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "metadata": {
204 | "id": "0kKVgJpnWJFV",
205 | "colab_type": "code",
206 | "colab": {}
207 | },
208 | "source": [
209 | "!nvcc -o atomic atomic.cu"
210 | ],
211 | "execution_count": 0,
212 | "outputs": []
213 | },
214 | {
215 | "cell_type": "markdown",
216 | "metadata": {
217 | "id": "KvuUZSHXXO0w",
218 | "colab_type": "text"
219 | },
220 | "source": [
221 | "## with atomic operation\n",
222 | "\n",
223 | " // increment_naive<<>>(d_array);\n",
224 | " \n",
225 | " increment_atomic<<>>(d_array);"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "metadata": {
231 | "id": "3B7O1VY3WPl-",
232 | "colab_type": "code",
233 | "colab": {
234 | "base_uri": "https://localhost:8080/",
235 | "height": 91
236 | },
237 | "outputId": "7ca6dccc-f58f-4ba8-fd2a-84d363db81c9"
238 | },
239 | "source": [
240 | "!./atomic"
241 | ],
242 | "execution_count": 6,
243 | "outputs": [
244 | {
245 | "output_type": "stream",
246 | "text": [
247 | "10000000 total threads in 10000 blocks writing into 100 array elements\n",
248 | "{ 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 100000 }\n",
249 | "Time elapsed = 1.49888 ms\n"
250 | ],
251 | "name": "stdout"
252 | }
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "metadata": {
258 | "id": "mAA-RvGZXTbf",
259 | "colab_type": "text"
260 | },
261 | "source": [
262 | "## without atomic operation\n",
263 | "\n",
264 | " increment_naive<<>>(d_array);\n",
265 | " \n",
266 | " //increment_atomic<<>>(d_array);"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "metadata": {
272 | "id": "T5_LSzp7W54Z",
273 | "colab_type": "code",
274 | "colab": {
275 | "base_uri": "https://localhost:8080/",
276 | "height": 91
277 | },
278 | "outputId": "bc968432-6866-4d65-9092-c28a980e6051"
279 | },
280 | "source": [
281 | "!./atomic"
282 | ],
283 | "execution_count": 9,
284 | "outputs": [
285 | {
286 | "output_type": "stream",
287 | "text": [
288 | "10000000 total threads in 10000 blocks writing into 100 array elements\n",
289 | "{ 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 251 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 251 251 251 251 }\n",
290 | "Time elapsed = 0.237568 ms\n"
291 | ],
292 | "name": "stdout"
293 | }
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "metadata": {
299 | "id": "R578xGaqW5sH",
300 | "colab_type": "code",
301 | "colab": {}
302 | },
303 | "source": [
304 | ""
305 | ],
306 | "execution_count": 0,
307 | "outputs": []
308 | }
309 | ]
310 | }
--------------------------------------------------------------------------------
/02_cuda_lab/README.md:
--------------------------------------------------------------------------------
1 | ## 02_cuda_lab
2 |
3 |
--------------------------------------------------------------------------------
/02_cuda_lab/atomicAdd.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include "gputimer.h"
3 | #define NUM_THREADS 10000000
4 | #define ARRAY_SIZE 100
5 | #define BLOCK_WIDTH 1000
6 |
7 | void print_array(int *array, int size)
8 | {
9 | printf("{ ");
10 | for (int i = 0; i < size; i++) { printf("%d ", array[i]); }
11 | printf("}\n");
12 | }
13 |
14 | __global__ void increment_naive(int *g)
15 | {
16 | // which thread is this?
17 | int i = blockIdx.x * blockDim.x + threadIdx.x;
18 |
19 | // each thread to increment consecutive elements, wrapping at ARRAY_SIZE
20 | i = i % ARRAY_SIZE;
21 | g[i] = g[i] + 1;
22 | }
23 |
24 | __global__ void increment_atomic(int *g)
25 | {
26 | // which thread is this?
27 | int i = blockIdx.x * blockDim.x + threadIdx.x;
28 |
29 | // each thread to increment consecutive elements, wrapping at ARRAY_SIZE
30 | i = i % ARRAY_SIZE;
31 | atomicAdd(& g[i], 1);
32 | }
33 |
34 | int main(int argc,char **argv)
35 | {
36 | GpuTimer timer;
37 | printf("%d total threads in %d blocks writing into %d array elements\n",
38 | NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, ARRAY_SIZE);
39 |
40 | // declare and allocate host memory
41 | int h_array[ARRAY_SIZE];
42 | const int ARRAY_BYTES = ARRAY_SIZE * sizeof(int);
43 |
44 | // declare, allocate, and zero out GPU memory
45 | int * d_array;
46 |
47 | cudaMalloc((void **) &d_array, ARRAY_BYTES);
48 | cudaMemset((void *) d_array, 0, ARRAY_BYTES);
49 |
50 | // launch the kernel - comment out one of these
51 | timer.Start();
52 |
53 | // Instructions: This program is needed for the next quiz
54 | // uncomment increment_naive to measure speed and accuracy
55 | // of non-atomic increments or uncomment increment_atomic to
56 | // measure speed and accuracy of atomic increments
57 | // increment_naive<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
58 | increment_atomic<<<NUM_THREADS/BLOCK_WIDTH, BLOCK_WIDTH>>>(d_array);
59 | timer.Stop();
60 |
61 | // copy back the array of sums from GPU and print
62 | cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost);
63 | print_array(h_array, ARRAY_SIZE);
64 | printf("Time elapsed = %g ms\n", timer.Elapsed());
65 |
66 | // free GPU memory allocation and exit
67 | cudaFree(d_array);
68 | return 0;
69 | }
70 |
--------------------------------------------------------------------------------
/02_cuda_lab/clock.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
3 | *
4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
5 | * with this source code for terms and conditions that govern your use of
6 | * this software. Any use, reproduction, disclosure, or distribution of
7 | * this software and related documentation outside the terms of the EULA
8 | * is strictly prohibited.
9 | *
10 | */
11 |
12 | // This example shows how to use the clock function to measure the performance of
13 | // a kernel accurately.
14 | //
15 | // Blocks are executed in parallel and out of order. Since there's no synchronization
16 | // mechanism between blocks, we measure the clock once for each block. The clock
17 | // samples are written to device memory.
18 |
19 | // System includes
20 | #include <stdio.h>
21 | #include <stdint.h>
22 | #include <assert.h>
23 |
24 | // CUDA runtime
25 | #include <cuda_runtime.h>
26 |
27 | // helper functions and utilities to work with CUDA
28 | #include <helper_functions.h>
29 | #include <helper_cuda.h>
30 |
31 | // This kernel computes a standard parallel reduction and evaluates the
32 | // time it takes to do that for each block. The timing results are stored
33 | // in device memory.
34 | __global__ static void timedReduction(const float *input, float *output, clock_t *timer)
35 | {
36 | // __shared__ float shared[2 * blockDim.x];
37 | extern __shared__ float shared[];
38 |
39 | const int tid = threadIdx.x;
40 | const int bid = blockIdx.x;
41 |
42 | if (tid == 0) timer[bid] = clock();
43 |
44 | // Copy input.
45 | shared[tid] = input[tid];
46 | shared[tid + blockDim.x] = input[tid + blockDim.x];
47 |
48 | // Perform reduction to find minimum.
49 | for (int d = blockDim.x; d > 0; d /= 2)
50 | {
51 | __syncthreads();
52 |
53 | if (tid < d)
54 | {
55 | float f0 = shared[tid];
56 | float f1 = shared[tid + d];
57 |
58 | if (f1 < f0)
59 | {
60 | shared[tid] = f1;
61 | }
62 | }
63 | }
64 |
65 | // Write result.
66 | if (tid == 0) output[bid] = shared[0];
67 |
68 | __syncthreads();
69 |
70 | if (tid == 0) timer[bid+gridDim.x] = clock();
71 | }
72 |
73 |
74 | // This example shows how to use the clock function to measure the performance of
75 | // a kernel accurately.
76 | //
77 | // Blocks are executed in parallel and out of order. Since there's no synchronization
78 | // mechanism between blocks, we measure the clock once for each block. The clock
79 | // samples are written to device memory.
80 |
81 | #define NUM_BLOCKS 64
82 | #define NUM_THREADS 256
83 |
84 | // It's interesting to change the number of blocks and the number of threads to
85 | // understand how to keep the hardware busy.
86 | //
87 | // Here are some numbers I get on my G80:
88 | // blocks - clocks
89 | // 1 - 3096
90 | // 8 - 3232
91 | // 16 - 3364
92 | // 32 - 4615
93 | // 64 - 9981
94 | //
95 | // With less than 16 blocks some of the multiprocessors of the device are idle. With
96 | // more than 16 you are using all the multiprocessors, but there's only one block per
97 | // multiprocessor and that doesn't allow you to hide the latency of the memory. With
98 | // more than 32 the speed scales linearly.
99 |
100 | // Start the main CUDA Sample here
101 | int main(int argc, char **argv)
102 | {
103 | printf("CUDA Clock sample\n");
104 |
105 | // This will pick the best possible CUDA capable device
106 | int dev = findCudaDevice(argc, (const char **)argv);
107 |
108 | float *dinput = NULL;
109 | float *doutput = NULL;
110 | clock_t *dtimer = NULL;
111 |
112 | clock_t timer[NUM_BLOCKS * 2];
113 | float input[NUM_THREADS * 2];
114 |
115 | for (int i = 0; i < NUM_THREADS * 2; i++)
116 | {
117 | input[i] = (float)i;
118 | }
119 |
120 | checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
121 | checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
122 | checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
123 |
124 | checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
125 |
126 |     timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
127 |
128 | checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
129 |
130 | checkCudaErrors(cudaFree(dinput));
131 | checkCudaErrors(cudaFree(doutput));
132 | checkCudaErrors(cudaFree(dtimer));
133 |
134 |
135 | // Compute the difference between the last block end and the first block start.
136 | clock_t minStart = timer[0];
137 | clock_t maxEnd = timer[NUM_BLOCKS];
138 |
139 | for (int i = 1; i < NUM_BLOCKS; i++)
140 | {
141 | minStart = timer[i] < minStart ? timer[i] : minStart;
142 | maxEnd = timer[NUM_BLOCKS+i] > maxEnd ? timer[NUM_BLOCKS+i] : maxEnd;
143 | }
144 |
145 | printf("Total clocks = %Lf\n", (long double)(maxEnd - minStart));
146 |
147 |
148 | // cudaDeviceReset causes the driver to clean up all state. While
149 | // not mandatory in normal operation, it is good practice. It is also
150 | // needed to ensure correct operation when the application is being
151 | // profiled. Calling cudaDeviceReset causes all profile data to be
152 | // flushed before the application exits
153 | cudaDeviceReset();
154 |
155 | return EXIT_SUCCESS;
156 | }
157 |
--------------------------------------------------------------------------------
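The sample above only prints the span between the earliest block start and the latest block end. If per-block behaviour is of interest, the same timer[] array can be post-processed further; the helper below is a hedged sketch (not part of the NVIDIA sample) assuming the layout used above: start stamps in timer[0..NUM_BLOCKS-1], end stamps in timer[NUM_BLOCKS..2*NUM_BLOCKS-1].

    // Average number of clock ticks a single block spends inside timedReduction.
    long double avgClocksPerBlock(const clock_t *timer)
    {
        long double sum = 0;
        for (int i = 0; i < NUM_BLOCKS; i++)
            sum += (long double)(timer[NUM_BLOCKS + i] - timer[i]);
        return sum / NUM_BLOCKS;
    }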
/02_cuda_lab/gpu_timer.h:
--------------------------------------------------------------------------------
1 | #ifndef __GPU_TIMER_H__
2 | #define __GPU_TIMER_H__
3 |
4 | struct GpuTimer
5 | {
6 | cudaEvent_t start;
7 | cudaEvent_t stop;
8 |
9 | GpuTimer()
10 | {
11 | cudaEventCreate(&start);
12 | cudaEventCreate(&stop);
13 | }
14 |
15 | ~GpuTimer()
16 | {
17 | cudaEventDestroy(start);
18 | cudaEventDestroy(stop);
19 | }
20 |
21 | void Start()
22 | {
23 | cudaEventRecord(start, 0);
24 | }
25 |
26 | void Stop()
27 | {
28 | cudaEventRecord(stop, 0);
29 | }
30 |
31 | float Elapsed()
32 | {
33 | float elapsed;
34 | cudaEventSynchronize(stop);
35 | cudaEventElapsedTime(&elapsed, start, stop);
36 | return elapsed;
37 | }
38 | };
39 |
40 | #endif /* __GPU_TIMER_H__ */
41 |
--------------------------------------------------------------------------------
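gpu_timer.h is the event-based timer used by atomicAdd.cu above. A minimal usage sketch follows (someKernel, blocks, and threads are placeholders, not names from this repository):

    #include "gpu_timer.h"

    GpuTimer timer;
    timer.Start();                                      // records the start event on stream 0
    someKernel<<<blocks, threads>>>(/* args */);
    timer.Stop();                                       // records the stop event
    printf("Time elapsed = %g ms\n", timer.Elapsed());  // Elapsed() waits on the stop event

Because Elapsed() calls cudaEventSynchronize on the stop event, no separate cudaDeviceSynchronize is needed before reading the elapsed time.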
/02_cuda_lab/reduction_all.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | __global__ void reduce0(int *g_idata, int *g_odata) {
4 |
5 | extern __shared__ int sdata[];
6 | // each thread loads one element from global to shared mem
7 | unsigned int tid = threadIdx.x;
8 | unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
9 | sdata[tid] = g_idata[i];
10 | __syncthreads();
11 | // do reduction in shared mem
12 | for(unsigned int s=1; s < blockDim.x; s *= 2) {
13 | if (tid % (2*s) == 0) {
14 | sdata[tid] += sdata[tid + s];
15 | }
16 | __syncthreads();
17 | }
18 | // write result for this block to global mem
19 | if (tid == 0) g_odata[blockIdx.x] = sdata[0];
20 | }
21 |
22 | __global__ void reduce1(int *g_idata, int *g_odata) {
23 |
24 | extern __shared__ int sdata[];
25 | // each thread loads one element from global to shared mem
26 | unsigned int tid = threadIdx.x;
27 | unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
28 | sdata[tid] = g_idata[i];
29 | __syncthreads();
30 | // do reduction in shared mem
31 | for(unsigned int s=1; s < blockDim.x; s *= 2) {
32 | int index = 2 * s * tid;
33 | if (index < blockDim.x) {
34 | sdata[index] += sdata[index + s];
35 | }
36 | __syncthreads();
37 | }
38 | // write result for this block to global mem
39 | if (tid == 0) g_odata[blockIdx.x] = sdata[0];
40 | }
41 | __global__ void reduce2(int *g_idata, int *g_odata) {
42 |
43 | extern __shared__ int sdata[];
44 | // each thread loads one element from global to shared mem
45 | unsigned int tid = threadIdx.x;
46 | unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
47 | sdata[tid] = g_idata[i];
48 | __syncthreads();
49 | // do reduction in shared mem
50 | for (unsigned int s=blockDim.x/2; s>0; s>>=1) {
51 | if (tid < s) {
52 | sdata[tid] += sdata[tid + s];
53 | }
54 | __syncthreads();
55 | }
56 |
57 | // write result for this block to global mem
58 | if (tid == 0) g_odata[blockIdx.x] = sdata[0];
59 | }
60 |
61 | __global__ void reduce3(int *g_idata, int *g_odata) {
62 |
63 | extern __shared__ int sdata[];
64 | // perform first level of reduction, reading from global memory, writing to shared memory
65 | unsigned int tid = threadIdx.x;
66 | unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
67 | sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];
68 | __syncthreads();
69 |
70 | // do reduction in shared mem
71 | for (unsigned int s=blockDim.x/2; s>0; s>>=1) {
72 | if (tid < s) {
73 | sdata[tid] += sdata[tid + s];
74 | }
75 | __syncthreads();
76 | }
77 |
78 | // write result for this block to global mem
79 | if (tid == 0) g_odata[blockIdx.x] = sdata[0];
80 | }
81 |
82 |
83 | template <unsigned int blockSize>
84 | __device__ void warpReduce(volatile int* sdata, int tid) {
85 | if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
86 | if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
87 | if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
88 | if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
89 | if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
90 | if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
91 | }
92 |
93 | __global__ void reduce4(int *g_idata, int *g_odata) {
94 |
95 | extern __shared__ int sdata[];
96 | // perform first level of reduction, reading from global memory, writing to shared memory
97 | unsigned int tid = threadIdx.x;
98 | unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
99 | sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];
100 | __syncthreads();
101 |
102 | // do reduction in shared mem
103 | for (unsigned int s=blockDim.x/2; s>32; s>>=1) {
104 | if (tid < s) {
105 | sdata[tid] += sdata[tid + s];
106 | }
107 | __syncthreads();
108 | }
109 | if (tid < 32) warpReduce<512>(sdata, tid);
110 |
111 | // write result for this block to global mem
112 | if (tid == 0) g_odata[blockIdx.x] = sdata[0];
113 | }
114 |
115 | template <unsigned int blockSize>
116 | __global__ void reduce5(int *g_idata, int *g_odata) {
117 |
118 | extern __shared__ int sdata[];
119 | // perform first level of reduction, reading from global memory, writing to shared memory
120 | unsigned int tid = threadIdx.x;
121 | unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x;
122 | sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];
123 | __syncthreads();
124 |
125 | // do reduction in shared mem
126 | if (blockSize >= 512) {
127 | if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads();
128 | }
129 | if (blockSize >= 256) {
130 | if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads();
131 | }
132 | if (blockSize >= 128) {
133 | if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads();
134 | }
135 |
136 |     if (tid < 32) warpReduce<blockSize>(sdata, tid);
137 |
138 | // write result for this block to global mem
139 | if (tid == 0) g_odata[blockIdx.x] = sdata[0];
140 | }
141 |
142 | template <unsigned int blockSize>
143 | __global__ void reduce6(int *g_idata, int *g_odata, int n) {
144 |
145 | extern __shared__ int sdata[];
146 | // perform first level of reduction, reading from global memory, writing to shared memory
147 | unsigned int tid = threadIdx.x;
148 | unsigned int i = blockIdx.x*(blockSize*2) + threadIdx.x;
149 | unsigned int gridSize = blockSize*2*gridDim.x;
150 | sdata[tid] = 0;
151 | while (i < n) {
152 | sdata[tid] += g_idata[i] + g_idata[i+blockSize];
153 | i += gridSize;
154 | }
155 | __syncthreads();
156 |
157 | // do reduction in shared mem
158 | if (blockSize >= 512) {
159 | if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads();
160 | }
161 | if (blockSize >= 256) {
162 | if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads();
163 | }
164 | if (blockSize >= 128) {
165 | if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads();
166 | }
167 |
168 |     if (tid < 32) warpReduce<blockSize>(sdata, tid);
169 |
170 | // write result for this block to global mem
171 | if (tid == 0) g_odata[blockIdx.x] = sdata[0];
172 | }
173 |
174 |
175 |
176 | int main(void)
177 | {
178 | long int size = 1 << 22;
179 | long int s;
180 | int sizeByte = size*sizeof(int);
181 | int* h_data = (int*) malloc(sizeByte);
182 |
183 | for(int i = 0; i < size; i++) {
184 | // h_data[i] = rand() & 0xFF;
185 | h_data[i] = i % 10;
186 | }
187 |
188 | long long int sum = 0;
189 | for(int i = 0; i < size; i++) sum += h_data[i];
190 | printf("CPU results = %lld \n", sum);
191 |
192 | int* d_idata = NULL;
193 | int* d_odata = NULL;
194 | cudaMalloc(&d_idata, sizeByte);
195 | cudaMalloc(&d_odata, sizeByte);
196 |
197 | cudaMemcpy(d_idata, h_data, sizeByte, cudaMemcpyHostToDevice);
198 |
199 | s = size >> 2;
200 | int blocks = (s+512-1)/512;
201 |     reduce6<512><<<blocks/2, 512, 512*sizeof(int)>>>(d_idata, d_odata, size);
202 | cudaDeviceSynchronize();
203 | printf("The size of array is %ld and it is processed on # of Blocks: %d \n", s, blocks/2);
204 | /*
205 | s = blocks/2;
206 | blocks = (s+512-1)/512;
207 |     reduce5<512><<<blocks/2, 512, 512*sizeof(int)>>>(d_odata, d_idata);
208 | cudaDeviceSynchronize();
209 | printf("The size of array is %ld and it is processed on # of Blocks: %d \n", s, blocks/2);
210 |
211 |
212 | s = blocks;
213 | int threadsPerBlock;
214 | if( s <= 512 ) { threadsPerBlock = s; blocks = 1; }
215 |     reduce2<<<blocks, threadsPerBlock, threadsPerBlock*sizeof(int)>>>(d_idata, d_odata);
216 | cudaDeviceSynchronize();
217 | */
218 | cudaMemcpy(h_data, d_odata, sizeof(int), cudaMemcpyDeviceToHost);
219 | printf("GPU result = %d\n", h_data[0]);
220 |
221 | cudaFree(d_idata);
222 | cudaFree(d_odata);
223 | free(h_data);
224 | }
225 |
226 |
--------------------------------------------------------------------------------
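Two points about driving the reduce* kernels above: they all declare extern __shared__, so the dynamic shared-memory size must be passed as the third launch parameter, and each launch only leaves one partial sum per block in the output array, so the reduction has to be repeated until a single value remains. The loop below is a hedged sketch of such a driver (it assumes size is a power of two, as in the main() above, and that result is a host int; it is not the staging used in the original file, where part of that logic is commented out):

    int result;
    int n = size;
    int *in = d_idata, *out = d_odata;
    while (n > 1) {
        int threads = (n >= 1024) ? 512 : n / 2;     // reduce3 consumes 2*threads values per block
        int blocks  = n / (threads * 2);
        reduce3<<<blocks, threads, threads * sizeof(int)>>>(in, out);
        n = blocks;                                  // one partial sum per block remains
        int *tmp = in; in = out; out = tmp;          // ping-pong the two buffers
    }
    cudaMemcpy(&result, in, sizeof(int), cudaMemcpyDeviceToHost);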
/03_cuda_lab/01_matmul.cu:
--------------------------------------------------------------------------------
1 | // 03 Matrix Multiplication
2 | #include <stdio.h>
3 |
4 | __global__ void MatrixMul(int *M, int *N, int *P, int width)
5 | {
6 | int accu = 0;
7 |
8 | // Block index
9 | int bx = blockIdx.x;
10 | int by = blockIdx.y;
11 |
12 | // Thread index
13 | int tx = threadIdx.x;
14 | int ty = threadIdx.y;
15 |
16 | int i = by * blockDim.y + ty;
17 | int j = bx * blockDim.x + tx;
18 |
19 |     for(int k=0; k<width; k++) accu += M[i*width+k] * N[k*width+j];
20 |     P[i*width+j] = accu;
21 | }
    ...
68 |     MatrixMul<<<blocks, threads>>>(d_A, d_B, d_C, size);
69 | cudaDeviceSynchronize();
70 | printf("GPU Computing Finished !\n");
71 | cudaMemcpy(h_gC, d_C, sizeByte, cudaMemcpyDeviceToHost);
72 |
73 | for(i = 0; i < size; i++) {
74 | for(j = 0; j < size; j++)
75 | printf("%d ", h_gC[i*size+j]);
76 | printf("\n");
77 | }
78 |
79 | for(i = 0; i < size; i++)
80 | for(j = 0; j < size; j++)
81 | if( h_C[i*size+j] != h_gC[i*size+j] ) {
82 | printf("Error !\n");
83 | cudaFree(d_A);
84 | cudaFree(d_B);
85 | cudaFree(d_C);
86 | free(h_A);
87 | free(h_B);
88 | free(h_C);
89 | free(h_gC);
90 | exit(1);
91 | }
92 |
93 | printf("Success ! \n");
94 |
95 | cudaFree(d_A);
96 | cudaFree(d_B);
97 | cudaFree(d_C);
98 | free(h_A);
99 | free(h_B);
100 | free(h_C);
101 | free(h_gC);
102 |
103 | exit(0);
104 |
105 | }
106 |
--------------------------------------------------------------------------------
/03_cuda_lab/02_matmul_tile.cu:
--------------------------------------------------------------------------------
1 | // Tiled Version 1
2 | 
3 | #include <stdio.h>
4 |
5 | __global__ void MatrixMul(int *M, int *N, int *P, int width)
6 | {
7 | int bx = blockIdx.x;
8 | int by = blockIdx.y;
9 |
10 | int tx = threadIdx.x;
11 | int ty = threadIdx.y;
12 |
13 | //int i = by * blockDim.y + ty;
14 | //int j = bx * blockDim.x + tx;
15 |
16 | const int tile_size = 16;
17 |
18 | __shared__ int As[tile_size][tile_size];
19 | __shared__ int Bs[tile_size][tile_size];
20 |
21 | int aBegin = width * tile_size * by;
22 | int aEnd = aBegin + width - 1;
23 | int aStep = tile_size;
24 |
25 | int bBegin = tile_size * bx;
26 | int bStep = tile_size * width;
27 |
28 | int Csub = 0;
29 | int a, b;
30 |
31 | for (a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
32 | {
33 | As[ty][tx] = M[a + width * ty + tx];
34 |         Bs[tx][ty] = N[b + width * tx + ty]; // Not memory coalescing !!!
35 | __syncthreads();
36 |
37 | for (int k = 0; k < tile_size; ++k)
38 | {
39 | Csub += As[ty][k] * Bs[k][tx]; // No Bank Conflict
40 | }
41 | __syncthreads();
42 | }
43 |
44 | int c = width * tile_size * by + tile_size * bx;
45 | P[c + width * ty + tx] = Csub;
46 | }
47 |
48 | int main(void)
49 | {
50 | int i, j, k;
51 | int size=1024;
52 | int *h_A, *h_B, *h_C, *h_gC;
53 | int *d_A, *d_B, *d_C;
54 |
55 | int sizeByte = sizeof(int)*size*size;
56 | h_A = (int *) malloc(sizeByte);
57 | h_B = (int *) malloc(sizeByte);
58 | h_C = (int *) malloc(sizeByte);
59 | h_gC = (int *) malloc(sizeByte);
60 |
61 | for(i = 0; i < size*size; i++) h_A[i] = 1;
62 | for(i = 0; i < size*size; i++) h_B[i] = 2;
63 |
64 |     printf("Host Computing Starts !\n");
65 | for(i = 0; i < size; i++)
66 | for(j = 0; j < size; j++) {
67 | h_C[i*size+j] = 0;
68 | for(k = 0; k < size; k++)
69 | h_C[i*size+j] += h_A[i*size+k]*h_B[k*size+j];
70 | }
71 | printf("Host Computing Finished !\n");
72 | /*
73 | for(i = 0; i < size; i++) {
74 | for(j = 0; j < size; j++)
75 | printf("%d ", h_C[i*size+j]);
76 | printf("\n");
77 | }
78 | */
79 | cudaMalloc(&d_A, sizeByte);
80 | cudaMalloc(&d_B, sizeByte);
81 | cudaMalloc(&d_C, sizeByte);
82 |
83 | cudaMemcpy(d_A, h_A, sizeByte, cudaMemcpyHostToDevice);
84 | cudaMemcpy(d_B, h_B, sizeByte, cudaMemcpyHostToDevice);
85 |
86 |
87 |     printf("GPU Computing Starts !\n");
88 | dim3 blocks(size/16, size/16);
89 | dim3 threads(16, 16);
90 |     MatrixMul<<<blocks, threads>>>(d_A, d_B, d_C, size);
91 | cudaDeviceSynchronize();
92 | printf("GPU Computing Finished !\n");
93 | cudaMemcpy(h_gC, d_C, sizeByte, cudaMemcpyDeviceToHost);
94 | /*
95 | for(i = 0; i < size; i++) {
96 | for(j = 0; j < size; j++)
97 | printf("%d ", h_gC[i*size+j]);
98 | printf("\n");
99 | }
100 | */
101 |     for(i = 0; i < size; i++)
102 |        for(j = 0; j < size; j++)
103 |           if( h_C[i*size+j] != h_gC[i*size+j] ) {
104 |              printf("Error !\n");
105 |              cudaFree(d_A);
106 |              cudaFree(d_B);
107 |              cudaFree(d_C);
108 |              free(h_A);
109 |              free(h_B);
110 |              free(h_C);
111 |              free(h_gC);
112 |              exit(1);
113 |           }
117 |
118 | printf("Success ! \n");
119 |
120 | cudaFree(d_A);
121 | cudaFree(d_B);
122 | cudaFree(d_C);
123 | free(h_A);
124 | free(h_B);
125 | free(h_C);
126 | free(h_gC);
127 |
128 | exit(0);
129 |
130 | }
131 |
--------------------------------------------------------------------------------
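The only difference between this kernel and the next file (03_matmul_tile2_mem_coel.cu) is how the B tile is fetched from global memory. A short annotated comparison, written as a comment-only sketch with the indices taken from the two kernels:

    // Load in 02_matmul_tile.cu (uncoalesced):
    //   Bs[tx][ty] = N[b + width * tx + ty];
    //   threads tx = 0,1,2,... of a warp read N[b + ty], N[b + width + ty], N[b + 2*width + ty], ...
    //   consecutive threads are width elements (width * sizeof(int) bytes) apart, so the
    //   accesses cannot be combined into a single memory transaction.
    //
    // Load in 03_matmul_tile2_mem_coel.cu (coalesced):
    //   Bs[tx][ty] = N[b + width * ty + tx];
    //   threads tx = 0,1,2,... read N[b + width*ty + 0], N[b + width*ty + 1], ...
    //   consecutive addresses, serviced as one (or a few) transactions per warp.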
/03_cuda_lab/03_matmul_tile2_mem_coel.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | __global__ void MatrixMul(int *M, int *N, int *P, int width)
4 | {
5 | int bx = blockIdx.x;
6 | int by = blockIdx.y;
7 |
8 | int tx = threadIdx.x;
9 | int ty = threadIdx.y;
10 |
11 | //int i = by * blockDim.y + ty;
12 | //int j = bx * blockDim.x + tx;
13 |
14 | const int tile_size = 16;
15 |
16 | __shared__ int As[tile_size][tile_size];
17 | __shared__ int Bs[tile_size][tile_size];
18 |
19 | int aBegin = width * tile_size * by;
20 | int aEnd = aBegin + width - 1;
21 | int aStep = tile_size;
22 |
23 | int bBegin = tile_size * bx;
24 | int bStep = tile_size * width;
25 |
26 | int Csub = 0;
27 | int a, b;
28 |
29 | for (a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
30 | {
31 |         As[ty][tx] = M[a + width * ty + tx]; // Memory coalescing !
32 |         Bs[tx][ty] = N[b + width * ty + tx]; // Memory coalescing !
33 | __syncthreads();
34 |
35 | for (int k = 0; k < tile_size; ++k)
36 | {
37 | // For Memory Coalescing : Bs[k][tx] -> Bs[tx][k]
38 | Csub += As[ty][k] * Bs[tx][k]; // Bank Conflict on Bs[tx][k]
39 | // It causes Bank Conflict
40 | }
41 | __syncthreads();
42 | }
43 |
44 | int c = width * tile_size * by + tile_size * bx;
45 | P[c + width * ty + tx] = Csub;
46 | }
47 |
48 | int main(void)
49 | {
50 | int i, j, k;
51 | int size=1024;
52 | int *h_A, *h_B, *h_C, *h_gC;
53 | int *d_A, *d_B, *d_C;
54 |
55 | int sizeByte = sizeof(int)*size*size;
56 | h_A = (int *) malloc(sizeByte);
57 | h_B = (int *) malloc(sizeByte);
58 | h_C = (int *) malloc(sizeByte);
59 | h_gC = (int *) malloc(sizeByte);
60 |
61 | for(i = 0; i < size*size; i++) h_A[i] = 1;
62 | for(i = 0; i < size*size; i++) h_B[i] = 2;
63 |
64 |     printf("Host Computing Starts !\n");
65 | for(i = 0; i < size; i++)
66 | for(j = 0; j < size; j++) {
67 | h_C[i*size+j] = 0;
68 | for(k = 0; k < size; k++)
69 | h_C[i*size+j] += h_A[i*size+k]*h_B[k*size+j];
70 | }
71 | printf("Host Computing Finished !\n");
72 | /*
73 | for(i = 0; i < size; i++) {
74 | for(j = 0; j < size; j++)
75 | printf("%d ", h_C[i*size+j]);
76 | printf("\n");
77 | }
78 | */
79 | cudaMalloc(&d_A, sizeByte);
80 | cudaMalloc(&d_B, sizeByte);
81 | cudaMalloc(&d_C, sizeByte);
82 |
83 | cudaMemcpy(d_A, h_A, sizeByte, cudaMemcpyHostToDevice);
84 | cudaMemcpy(d_B, h_B, sizeByte, cudaMemcpyHostToDevice);
85 |
86 |
87 |     printf("GPU Computing Starts !\n");
88 | dim3 blocks(size/16, size/16);
89 | dim3 threads(16, 16);
90 |     MatrixMul<<<blocks, threads>>>(d_A, d_B, d_C, size);
91 | cudaDeviceSynchronize();
92 | printf("GPU Computing Finished !\n");
93 | cudaMemcpy(h_gC, d_C, sizeByte, cudaMemcpyDeviceToHost);
94 | /*
95 | for(i = 0; i < size; i++) {
96 | for(j = 0; j < size; j++)
97 | printf("%d ", h_gC[i*size+j]);
98 | printf("\n");
99 | }
100 | */
101 |     for(i = 0; i < size; i++)
102 |        for(j = 0; j < size; j++)
103 |           if( h_C[i*size+j] != h_gC[i*size+j] ) {
104 |              printf("Error !\n");
105 |              cudaFree(d_A);
106 |              cudaFree(d_B);
107 |              cudaFree(d_C);
108 |              free(h_A);
109 |              free(h_B);
110 |              free(h_C);
111 |              free(h_gC);
112 |              exit(1);
113 |           }
118 |
119 | printf("Success ! \n");
120 |
121 | cudaFree(d_A);
122 | cudaFree(d_B);
123 | cudaFree(d_C);
124 | free(h_A);
125 | free(h_B);
126 | free(h_C);
127 | free(h_gC);
128 |
129 | exit(0);
130 |
131 | }
132 |
--------------------------------------------------------------------------------
/03_cuda_lab/04_matmul_tile3_noBankConflict.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | __global__ void MatrixMul(int *M, int *N, int *P, int width)
4 | {
5 | int bx = blockIdx.x;
6 | int by = blockIdx.y;
7 |
8 | int tx = threadIdx.x;
9 | int ty = threadIdx.y;
10 |
11 | //int i = by * blockDim.y + ty;
12 | //int j = bx * blockDim.x + tx;
13 |
14 | const int tile_size = 16; // tile size
15 |
16 | __shared__ int As[tile_size][tile_size];
17 | __shared__ int Bs[tile_size][tile_size];
18 |
19 | int aBegin = width * tile_size * by;
20 | int aEnd = aBegin + width - 1;
21 | int aStep = tile_size;
22 |
23 | int bBegin = tile_size * bx;
24 | int bStep = tile_size * width;
25 |
26 | int Csub = 0;
27 | int a, b;
28 |
29 | for (a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
30 | {
31 | As[ty][tx] = M[a + width * ty + tx]; // <<-----------
32 | Bs[ty][tx] = N[b + width * ty + tx]; // <<------------
33 | __syncthreads();
34 |
35 | for (int k = 0; k < tile_size; ++k)
36 | {
37 | //Avoid Bank Conflict : Bs[tx][k] -> Bs[k][tx]
38 |             Csub += As[ty][k] * Bs[k][tx]; // No bank conflict on Bs with interleaved memory banks
39 |             // As[ty][k] is broadcast to all threads in the warp
40 | }
41 | __syncthreads();
42 | }
43 |
44 | int c = width * tile_size * by + tile_size * bx;
45 | P[c + width * ty + tx] = Csub;
46 | }
47 |
48 | int main(void)
49 | {
50 | int i, j, k;
51 | int size=1024;
52 | int *h_A, *h_B, *h_C, *h_gC;
53 | int *d_A, *d_B, *d_C;
54 |
55 | int sizeByte = sizeof(int)*size*size;
56 | h_A = (int *) malloc(sizeByte);
57 | h_B = (int *) malloc(sizeByte);
58 | h_C = (int *) malloc(sizeByte);
59 | h_gC = (int *) malloc(sizeByte);
60 |
61 | for(i = 0; i < size*size; i++) h_A[i] = 1;
62 | for(i = 0; i < size*size; i++) h_B[i] = 2;
63 |
64 |     printf("Host Computing Starts !\n");
65 | for(i = 0; i < size; i++)
66 | for(j = 0; j < size; j++) {
67 | h_C[i*size+j] = 0;
68 | for(k = 0; k < size; k++)
69 | h_C[i*size+j] += h_A[i*size+k]*h_B[k*size+j];
70 | }
71 | printf("Host Computing Finished !\n");
72 | /*
73 | for(i = 0; i < size; i++) {
74 | for(j = 0; j < size; j++)
75 | printf("%d ", h_C[i*size+j]);
76 | printf("\n");
77 | }
78 | */
79 | cudaMalloc(&d_A, sizeByte);
80 | cudaMalloc(&d_B, sizeByte);
81 | cudaMalloc(&d_C, sizeByte);
82 | cudaMemcpy(d_A, h_A, sizeByte, cudaMemcpyHostToDevice);
83 | cudaMemcpy(d_B, h_B, sizeByte, cudaMemcpyHostToDevice);
84 |
85 |
86 |     printf("GPU Computing Starts !\n");
87 | dim3 blocks(size/16, size/16);
88 | dim3 threads(16, 16);
89 |     MatrixMul<<<blocks, threads>>>(d_A, d_B, d_C, size);
90 | cudaDeviceSynchronize();
91 | printf("GPU Computing Finished !\n");
92 | cudaMemcpy(h_gC, d_C, sizeByte, cudaMemcpyDeviceToHost);
93 | /*
94 | for(i = 0; i < size; i++) {
95 | for(j = 0; j < size; j++)
96 | printf("%d ", h_gC[i*size+j]);
97 | printf("\n");
98 | }
99 | */
100 |     for(i = 0; i < size; i++)
101 |        for(j = 0; j < size; j++)
102 |           if( h_C[i*size+j] != h_gC[i*size+j] ) {
103 |              printf("Error !\n");
104 |              cudaFree(d_A);
105 |              cudaFree(d_B);
106 |              cudaFree(d_C);
107 |              free(h_A);
108 |              free(h_B);
109 |              free(h_C);
110 |              free(h_gC);
111 |              exit(1);
112 |           }
117 |
118 | printf("Success ! \n");
119 |
120 | cudaFree(d_A);
121 | cudaFree(d_B);
122 | cudaFree(d_C);
123 | free(h_A);
124 | free(h_B);
125 | free(h_C);
126 | free(h_gC);
127 |
128 | exit(0);
129 | }
130 |
--------------------------------------------------------------------------------
/03_cuda_lab/05_matmul_tile4_unroll.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | __global__ void MatrixMul(int *M, int *N, int *P, int width)
4 | {
5 | int bx = blockIdx.x;
6 | int by = blockIdx.y;
7 |
8 | int tx = threadIdx.x;
9 | int ty = threadIdx.y;
10 |
11 | //int i = by * blockDim.y + ty;
12 | //int j = bx * blockDim.x + tx;
13 |
14 | const int tile_size = 16; // tile size
15 |
16 | __shared__ int As[tile_size][tile_size];
17 | __shared__ int Bs[tile_size][tile_size];
18 |
19 | int aBegin = width * tile_size * by;
20 | int aEnd = aBegin + width - 1;
21 | int aStep = tile_size;
22 |
23 | int bBegin = tile_size * bx;
24 | int bStep = tile_size * width;
25 |
26 | int Csub = 0;
27 | int a, b;
28 |
29 | for (a = aBegin, b = bBegin; a <= aEnd; a += aStep, b += bStep)
30 | {
31 | As[ty][tx] = M[a + width * ty + tx];
32 | Bs[ty][tx] = N[b + width * ty + tx];
33 | __syncthreads();
34 |
35 | //for (int k = 0; k < tile_size; ++k)
36 | //{
37 | // Csub += As[ty][k] * Bs[k][tx];
38 | //}
39 | // Loop Unrolling
40 | Csub += As[ty][0] * Bs[0][tx];
41 | Csub += As[ty][1] * Bs[1][tx];
42 | Csub += As[ty][2] * Bs[2][tx];
43 | Csub += As[ty][3] * Bs[3][tx];
44 | Csub += As[ty][4] * Bs[4][tx];
45 | Csub += As[ty][5] * Bs[5][tx];
46 | Csub += As[ty][6] * Bs[6][tx];
47 | Csub += As[ty][7] * Bs[7][tx];
48 | Csub += As[ty][8] * Bs[8][tx];
49 | Csub += As[ty][9] * Bs[9][tx];
50 | Csub += As[ty][10] * Bs[10][tx];
51 | Csub += As[ty][11] * Bs[11][tx];
52 | Csub += As[ty][12] * Bs[12][tx];
53 | Csub += As[ty][13] * Bs[13][tx];
54 | Csub += As[ty][14] * Bs[14][tx];
55 | Csub += As[ty][15] * Bs[15][tx];
56 | __syncthreads();
57 | }
58 |
59 | int c = width * tile_size * by + tile_size * bx;
60 | P[c + width * ty + tx] = Csub;
61 | }
62 |
63 | int main(void)
64 | {
65 | int i, j, k;
66 | int size=1024;
67 | int *h_A, *h_B, *h_C, *h_gC;
68 | int *d_A, *d_B, *d_C;
69 |
70 | int sizeByte = sizeof(int)*size*size;
71 | h_A = (int *) malloc(sizeByte);
72 | h_B = (int *) malloc(sizeByte);
73 | h_C = (int *) malloc(sizeByte);
74 | h_gC = (int *) malloc(sizeByte);
75 |
76 | for(i = 0; i < size*size; i++) h_A[i] = 1;
77 | for(i = 0; i < size*size; i++) h_B[i] = 2;
78 |
79 |     printf("Host Computing Starts !\n");
80 | for(i = 0; i < size; i++)
81 | for(j = 0; j < size; j++) {
82 | h_C[i*size+j] = 0;
83 | for(k = 0; k < size; k++)
84 | h_C[i*size+j] += h_A[i*size+k]*h_B[k*size+j];
85 | }
86 | printf("Host Computing Finished !\n");
87 | /*
88 | for(i = 0; i < size; i++) {
89 | for(j = 0; j < size; j++)
90 | printf("%d ", h_C[i*size+j]);
91 | printf("\n");
92 | }
93 | */
94 | cudaMalloc(&d_A, sizeByte);
95 | cudaMalloc(&d_B, sizeByte);
96 | cudaMalloc(&d_C, sizeByte);
97 |
98 | cudaMemcpy(d_A, h_A, sizeByte, cudaMemcpyHostToDevice);
99 | cudaMemcpy(d_B, h_B, sizeByte, cudaMemcpyHostToDevice);
100 |
101 |
102 |     printf("GPU Computing Starts !\n");
103 | dim3 blocks(size/16, size/16);
104 | dim3 threads(16, 16);
105 |     MatrixMul<<<blocks, threads>>>(d_A, d_B, d_C, size);
106 | cudaDeviceSynchronize();
107 | printf("GPU Computing Finished !\n");
108 | cudaMemcpy(h_gC, d_C, sizeByte, cudaMemcpyDeviceToHost);
109 | /*
110 | for(i = 0; i < size; i++) {
111 | for(j = 0; j < size; j++)
112 | printf("%d ", h_gC[i*size+j]);
113 | printf("\n");
114 | }
115 | */
116 |     for(i = 0; i < size; i++)
117 |        for(j = 0; j < size; j++)
118 |           if( h_C[i*size+j] != h_gC[i*size+j] ) {
119 |              printf("Error !\n");
120 |              cudaFree(d_A);
121 |              cudaFree(d_B);
122 |              cudaFree(d_C);
123 |              free(h_A);
124 |              free(h_B);
125 |              free(h_C);
126 |              free(h_gC);
127 |              exit(1);
128 |           }
133 |
134 | printf("Success ! \n");
135 |
136 | cudaFree(d_A);
137 | cudaFree(d_B);
138 | cudaFree(d_C);
139 | free(h_A);
140 | free(h_B);
141 | free(h_C);
142 | free(h_gC);
143 |
144 | exit(0);
145 |
146 | }
147 |
--------------------------------------------------------------------------------
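The 16 explicit multiply-adds above pin the kernel to tile_size == 16. A more compact alternative, shown here as a hedged sketch rather than a drop-in change to the file, is to keep the loop and ask nvcc to unroll it with #pragma unroll; for a compile-time-constant trip count the generated code is typically the same as the manual unrolling:

    // Equivalent inner product written with compiler-driven unrolling.
    #pragma unroll
    for (int k = 0; k < tile_size; ++k)
        Csub += As[ty][k] * Bs[k][tx];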
/03_cuda_lab/07_async_streams.cu:
--------------------------------------------------------------------------------
1 | /* Copyright (c) 1993-2015, NVIDIA CORPORATION. All rights reserved.
2 | *
3 | * Redistribution and use in source and binary forms, with or without
4 | * modification, are permitted provided that the following conditions
5 | * are met:
6 | * * Redistributions of source code must retain the above copyright
7 | * notice, this list of conditions and the following disclaimer.
8 | * * Redistributions in binary form must reproduce the above copyright
9 | * notice, this list of conditions and the following disclaimer in the
10 | * documentation and/or other materials provided with the distribution.
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its
12 | * contributors may be used to endorse or promote products derived
13 | * from this software without specific prior written permission.
14 | *
15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | */
27 |
28 | #include <stdio.h>
29 |
30 | // Convenience function for checking CUDA runtime API results
31 | // can be wrapped around any runtime API call. No-op in release builds.
32 | inline
33 | cudaError_t checkCuda(cudaError_t result)
34 | {
35 | #if defined(DEBUG) || defined(_DEBUG)
36 | if (result != cudaSuccess) {
37 | fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
38 | assert(result == cudaSuccess);
39 | }
40 | #endif
41 | return result;
42 | }
43 |
44 | __global__ void kernel(float *a, int offset)
45 | {
46 | int i = offset + threadIdx.x + blockIdx.x*blockDim.x;
47 | float x = (float)i;
48 | float s = sinf(x);
49 | float c = cosf(x);
50 | a[i] = a[i] + sqrtf(s*s+c*c);
51 | }
52 |
53 | float maxError(float *a, int n)
54 | {
55 | float maxE = 0;
56 | for (int i = 0; i < n; i++) {
57 | float error = fabs(a[i]-1.0f);
58 | if (error > maxE) maxE = error;
59 | }
60 | return maxE;
61 | }
62 |
63 | int main(int argc, char **argv)
64 | {
65 | const int blockSize = 256, nStreams = 4;
66 | const int n = 4 * 1024 * blockSize * nStreams;
67 | const int streamSize = n / nStreams;
68 | const int streamBytes = streamSize * sizeof(float);
69 | const int bytes = n * sizeof(float);
70 |
71 | int devId = 0;
72 | if (argc > 1) devId = atoi(argv[1]);
73 |
74 | cudaDeviceProp prop;
75 | checkCuda( cudaGetDeviceProperties(&prop, devId));
76 | printf("Device : %s\n", prop.name);
77 | checkCuda( cudaSetDevice(devId) );
78 |
79 | // allocate pinned host memory and device memory
80 | float *a, *d_a;
81 | checkCuda( cudaMallocHost((void**)&a, bytes) ); // host pinned
82 | checkCuda( cudaMalloc((void**)&d_a, bytes) ); // device
83 |
84 | float ms; // elapsed time in milliseconds
85 |
86 | // create events and streams
87 | cudaEvent_t startEvent, stopEvent, dummyEvent;
88 | cudaStream_t stream[nStreams];
89 | checkCuda( cudaEventCreate(&startEvent) );
90 | checkCuda( cudaEventCreate(&stopEvent) );
91 | checkCuda( cudaEventCreate(&dummyEvent) );
92 | for (int i = 0; i < nStreams; ++i)
93 | checkCuda( cudaStreamCreate(&stream[i]) );
94 |
95 | // baseline case - sequential transfer and execute
96 | memset(a, 0, bytes);
97 | checkCuda( cudaEventRecord(startEvent,0) );
98 | checkCuda( cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice) );
99 |   kernel<<<n/blockSize, blockSize>>>(d_a, 0);
100 | checkCuda( cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost) );
101 | checkCuda( cudaEventRecord(stopEvent, 0) );
102 | checkCuda( cudaEventSynchronize(stopEvent) );
103 | checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
104 | printf("Time for sequential transfer and execute (ms): %f\n", ms);
105 | printf(" max error: %e\n", maxError(a, n));
106 |
107 | // asynchronous version 1: loop over {copy, kernel, copy}
108 | memset(a, 0, bytes);
109 | checkCuda( cudaEventRecord(startEvent,0) );
110 | for (int i = 0; i < nStreams; ++i) {
111 | int offset = i * streamSize;
112 | checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset],
113 | streamBytes, cudaMemcpyHostToDevice,
114 | stream[i]) );
115 |     kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
116 | checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset],
117 | streamBytes, cudaMemcpyDeviceToHost,
118 | stream[i]) );
119 | }
120 | checkCuda( cudaEventRecord(stopEvent, 0) );
121 | checkCuda( cudaEventSynchronize(stopEvent) );
122 | checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
123 | printf("Time for asynchronous V1 transfer and execute (ms): %f\n", ms);
124 | printf(" max error: %e\n", maxError(a, n));
125 |
126 | // asynchronous version 2:
127 | // loop over copy, loop over kernel, loop over copy
128 | memset(a, 0, bytes);
129 | checkCuda( cudaEventRecord(startEvent,0) );
130 | for (int i = 0; i < nStreams; ++i)
131 | {
132 | int offset = i * streamSize;
133 | checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset],
134 | streamBytes, cudaMemcpyHostToDevice,
135 | stream[i]) );
136 | }
137 | for (int i = 0; i < nStreams; ++i)
138 | {
139 | int offset = i * streamSize;
140 |     kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
141 | }
142 | for (int i = 0; i < nStreams; ++i)
143 | {
144 | int offset = i * streamSize;
145 | checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset],
146 | streamBytes, cudaMemcpyDeviceToHost,
147 | stream[i]) );
148 | }
149 | checkCuda( cudaEventRecord(stopEvent, 0) );
150 | checkCuda( cudaEventSynchronize(stopEvent) );
151 | checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );
152 | printf("Time for asynchronous V2 transfer and execute (ms): %f\n", ms);
153 | printf(" max error: %e\n", maxError(a, n));
154 |
155 | // cleanup
156 | checkCuda( cudaEventDestroy(startEvent) );
157 | checkCuda( cudaEventDestroy(stopEvent) );
158 | checkCuda( cudaEventDestroy(dummyEvent) );
159 | for (int i = 0; i < nStreams; ++i)
160 | checkCuda( cudaStreamDestroy(stream[i]) );
161 | cudaFree(d_a);
162 | cudaFreeHost(a);
163 |
164 | return 0;
165 | }
166 |
--------------------------------------------------------------------------------
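One prerequisite for the overlap this example measures: the host buffer must be pinned (page-locked), which is why it is allocated with cudaMallocHost. With ordinary pageable memory, cudaMemcpyAsync is staged through an internal pinned buffer and will not reliably overlap with kernel execution. A minimal sketch of the two allocation styles (bytes as defined in the program above; h_pinned and h_pageable are illustrative names):

    float *h_pinned, *h_pageable;
    cudaMallocHost((void**)&h_pinned, bytes);   // page-locked: eligible for async overlap
    h_pageable = (float*)malloc(bytes);         // pageable: async copies behave like blocking ones
    // ... transfers and kernel launches ...
    cudaFreeHost(h_pinned);
    free(h_pageable);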
/03_cuda_lab/08_stream_n_event.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "08_stream_n_event.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "accelerator": "GPU"
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "id": "view-in-github",
22 | "colab_type": "text"
23 | },
24 | "source": [
25 | ""
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "id": "kxcA86P9S6GD",
32 | "colab_type": "text"
33 | },
34 | "source": [
35 | "## Stream and Events\n",
36 | "\n",
37 | "CUDA API functions for synchronization"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "metadata": {
43 | "id": "9aBuaeoeSQ2f",
44 | "colab_type": "code",
45 | "colab": {
46 | "base_uri": "https://localhost:8080/",
47 | "height": 35
48 | },
49 | "outputId": "a4914ccf-395d-4721-ea47-d4285d48ba89"
50 | },
51 | "source": [
52 | "%%writefile sync.cu\n",
53 | "\n",
54 | "/* Copyright (c) 1993-2015, NVIDIA CORPORATION. All rights reserved.\n",
55 | " *\n",
56 | " * Redistribution and use in source and binary forms, with or without\n",
57 | " * modification, are permitted provided that the following conditions\n",
58 | " * are met:\n",
59 | " * * Redistributions of source code must retain the above copyright\n",
60 | " * notice, this list of conditions and the following disclaimer.\n",
61 | " * * Redistributions in binary form must reproduce the above copyright\n",
62 | " * notice, this list of conditions and the following disclaimer in the\n",
63 | " * documentation and/or other materials provided with the distribution.\n",
64 | " * * Neither the name of NVIDIA CORPORATION nor the names of its\n",
65 | " * contributors may be used to endorse or promote products derived\n",
66 | " * from this software without specific prior written permission.\n",
67 | " *\n",
68 | " * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n",
69 | " * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n",
70 | " * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n",
71 | " * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR\n",
72 | " * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,\n",
73 | " * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,\n",
74 | " * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\n",
75 | " * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY\n",
76 | " * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT\n",
77 | " * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE\n",
78 | " * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n",
79 | " */\n",
80 | "\n",
81 | "#include <stdio.h>\n",
82 | "\n",
83 | "// Convenience function for checking CUDA runtime API results\n",
84 | "// can be wrapped around any runtime API call. No-op in release builds.\n",
85 | "inline\n",
86 | "cudaError_t checkCuda(cudaError_t result)\n",
87 | "{\n",
88 | "#if defined(DEBUG) || defined(_DEBUG)\n",
89 | " if (result != cudaSuccess) {\n",
90 | " fprintf(stderr, \"CUDA Runtime Error: %s\\n\", cudaGetErrorString(result));\n",
91 | " assert(result == cudaSuccess);\n",
92 | " }\n",
93 | "#endif\n",
94 | " return result;\n",
95 | "}\n",
96 | "\n",
97 | "__global__ void kernel(float *a, int offset)\n",
98 | "{\n",
99 | " int i = offset + threadIdx.x + blockIdx.x*blockDim.x;\n",
100 | " float x = (float)i;\n",
101 | " float s = sinf(x); \n",
102 | " float c = cosf(x);\n",
103 | " a[i] = a[i] + sqrtf(s*s+c*c);\n",
104 | "}\n",
105 | "\n",
106 | "float maxError(float *a, int n) \n",
107 | "{\n",
108 | " float maxE = 0;\n",
109 | " for (int i = 0; i < n; i++) {\n",
110 | " float error = fabs(a[i]-1.0f);\n",
111 | " if (error > maxE) maxE = error;\n",
112 | " }\n",
113 | " return maxE;\n",
114 | "}\n",
115 | "\n",
116 | "int main(int argc, char **argv)\n",
117 | "{\n",
118 | " const int blockSize = 256, nStreams = 4;\n",
119 | " const int n = 4 * 1024 * blockSize * nStreams;\n",
120 | " const int streamSize = n / nStreams;\n",
121 | " const int streamBytes = streamSize * sizeof(float);\n",
122 | " const int bytes = n * sizeof(float);\n",
123 | " \n",
124 | " int devId = 0;\n",
125 | " if (argc > 1) devId = atoi(argv[1]);\n",
126 | "\n",
127 | " cudaDeviceProp prop;\n",
128 | " checkCuda( cudaGetDeviceProperties(&prop, devId));\n",
129 | " printf(\"Device : %s\\n\", prop.name);\n",
130 | " checkCuda( cudaSetDevice(devId) );\n",
131 | " \n",
132 | " // allocate pinned host memory and device memory\n",
133 | " float *a, *d_a;\n",
134 | " checkCuda( cudaMallocHost((void**)&a, bytes) ); // host pinned\n",
135 | " checkCuda( cudaMalloc((void**)&d_a, bytes) ); // device\n",
136 | "\n",
137 | " float ms; // elapsed time in milliseconds\n",
138 | " \n",
139 | " // create events and streams\n",
140 | " cudaEvent_t startEvent, stopEvent, dummyEvent;\n",
141 | " cudaStream_t stream[nStreams];\n",
142 | " checkCuda( cudaEventCreate(&startEvent) );\n",
143 | " checkCuda( cudaEventCreate(&stopEvent) );\n",
144 | " checkCuda( cudaEventCreate(&dummyEvent) );\n",
145 | " for (int i = 0; i < nStreams; ++i)\n",
146 | " checkCuda( cudaStreamCreate(&stream[i]) );\n",
147 | "\n",
148 | " ///////////////////////////////////////////////////////////////\n",
149 | " /////////////////////////////////////////////////////////////// \n",
150 | " // baseline case - sequential transfer and execute\n",
151 | " memset(a, 0, bytes);\n",
152 | " checkCuda( cudaEventRecord(startEvent,0) );\n",
153 | " checkCuda( cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice) );\n",
154 | "  kernel<<<n/blockSize, blockSize>>>(d_a, 0);\n",
155 | " checkCuda( cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost) );\n",
156 | " checkCuda( cudaEventRecord(stopEvent, 0) );\n",
157 | " checkCuda( cudaEventSynchronize(stopEvent) );\n",
158 | " checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );\n",
159 | " printf(\"Time for sequential transfer and execute (ms): %f\\n\", ms);\n",
160 | " printf(\" max error: %e\\n\", maxError(a, n));\n",
161 | "\n",
162 | " ///////////////////////////////////////////////////////////////\n",
163 | " ///////////////////////////////////////////////////////////////\n",
164 | " // asynchronous version 1: loop over {copy, kernel, copy}\n",
165 | " memset(a, 0, bytes);\n",
166 | " checkCuda( cudaEventRecord(startEvent,0) );\n",
167 | " for (int i = 0; i < nStreams; ++i) {\n",
168 | " int offset = i * streamSize;\n",
169 | " checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset], \n",
170 | " streamBytes, cudaMemcpyHostToDevice, \n",
171 | " stream[i]) );\n",
172 | "    kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);\n",
173 | " checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset], \n",
174 | " streamBytes, cudaMemcpyDeviceToHost,\n",
175 | " stream[i]) );\n",
176 | " }\n",
177 | " checkCuda( cudaEventRecord(stopEvent, 0) );\n",
178 | " checkCuda( cudaEventSynchronize(stopEvent) );\n",
179 | " checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );\n",
180 | " printf(\"Time for asynchronous V1 transfer and execute (ms): %f\\n\", ms);\n",
181 | " printf(\" max error: %e\\n\", maxError(a, n));\n",
182 | "\n",
183 | " ///////////////////////////////////////////////////////////////\n",
184 | " /////////////////////////////////////////////////////////////// \n",
185 | " // asynchronous version 2: \n",
186 | " // loop over copy, loop over kernel, loop over copy\n",
187 | " \n",
188 | " memset(a, 0, bytes);\n",
189 | " checkCuda( cudaEventRecord(startEvent,0) );\n",
190 | " for (int i = 0; i < nStreams; ++i)\n",
191 | " {\n",
192 | " int offset = i * streamSize;\n",
193 | " checkCuda( cudaMemcpyAsync(&d_a[offset], &a[offset], \n",
194 | " streamBytes, cudaMemcpyHostToDevice,\n",
195 | " stream[i]) );\n",
196 | " }\n",
197 | " for (int i = 0; i < nStreams; ++i)\n",
198 | " {\n",
199 | " int offset = i * streamSize;\n",
200 | "    kernel<<<streamSize/blockSize, blockSize, 0, stream[i]>>>(d_a, offset);\n",
201 | " }\n",
202 | " for (int i = 0; i < nStreams; ++i)\n",
203 | " {\n",
204 | " int offset = i * streamSize;\n",
205 | " checkCuda( cudaMemcpyAsync(&a[offset], &d_a[offset], \n",
206 | " streamBytes, cudaMemcpyDeviceToHost,\n",
207 | " stream[i]) );\n",
208 | " }\n",
209 | " checkCuda( cudaEventRecord(stopEvent, 0) );\n",
210 | " checkCuda( cudaEventSynchronize(stopEvent) );\n",
211 | " checkCuda( cudaEventElapsedTime(&ms, startEvent, stopEvent) );\n",
212 | " printf(\"Time for asynchronous V2 transfer and execute (ms): %f\\n\", ms);\n",
213 | " printf(\" max error: %e\\n\", maxError(a, n));\n",
214 | "\n",
215 | " // cleanup\n",
216 | " checkCuda( cudaEventDestroy(startEvent) );\n",
217 | " checkCuda( cudaEventDestroy(stopEvent) );\n",
218 | " checkCuda( cudaEventDestroy(dummyEvent) );\n",
219 | " for (int i = 0; i < nStreams; ++i)\n",
220 | " checkCuda( cudaStreamDestroy(stream[i]) );\n",
221 | " cudaFree(d_a);\n",
222 | " cudaFreeHost(a);\n",
223 | "\n",
224 | " return 0;\n",
225 | "}"
226 | ],
227 | "execution_count": 2,
228 | "outputs": [
229 | {
230 | "output_type": "stream",
231 | "text": [
232 | "Writing sync.cu\n"
233 | ],
234 | "name": "stdout"
235 | }
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "metadata": {
241 | "id": "2HOWoTl3SUXq",
242 | "colab_type": "code",
243 | "colab": {}
244 | },
245 | "source": [
246 | "!nvcc -o sync sync.cu"
247 | ],
248 | "execution_count": 0,
249 | "outputs": []
250 | },
251 | {
252 | "cell_type": "code",
253 | "metadata": {
254 | "id": "kGOoLb82SbwL",
255 | "colab_type": "code",
256 | "colab": {
257 | "base_uri": "https://localhost:8080/",
258 | "height": 143
259 | },
260 | "outputId": "bd964375-822e-44ea-dd23-a307a9a13a5f"
261 | },
262 | "source": [
263 | "!./sync"
264 | ],
265 | "execution_count": 4,
266 | "outputs": [
267 | {
268 | "output_type": "stream",
269 | "text": [
270 | "Device : Tesla T4\n",
271 | "Time for sequential transfer and execute (ms): 3.384064\n",
272 | " max error: 1.192093e-07\n",
273 | "Time for asynchronous V1 transfer and execute (ms): 2.023168\n",
274 | " max error: 1.192093e-07\n",
275 | "Time for asynchronous V2 transfer and execute (ms): 1.992160\n",
276 | " max error: 1.192093e-07\n"
277 | ],
278 | "name": "stdout"
279 | }
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "metadata": {
285 | "id": "yGM7cXi0SdL3",
286 | "colab_type": "code",
287 | "colab": {
288 | "base_uri": "https://localhost:8080/",
289 | "height": 323
290 | },
291 | "outputId": "84f25bce-6654-4248-d739-eed95e3e0988"
292 | },
293 | "source": [
294 | "!nvidia-smi"
295 | ],
296 | "execution_count": 5,
297 | "outputs": [
298 | {
299 | "output_type": "stream",
300 | "text": [
301 | "Sun Jun 9 08:13:49 2019 \n",
302 | "+-----------------------------------------------------------------------------+\n",
303 | "| NVIDIA-SMI 418.67 Driver Version: 410.79 CUDA Version: 10.0 |\n",
304 | "|-------------------------------+----------------------+----------------------+\n",
305 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
306 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
307 | "|===============================+======================+======================|\n",
308 | "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
309 | "| N/A 59C P8 16W / 70W | 0MiB / 15079MiB | 0% Default |\n",
310 | "+-------------------------------+----------------------+----------------------+\n",
311 | " \n",
312 | "+-----------------------------------------------------------------------------+\n",
313 | "| Processes: GPU Memory |\n",
314 | "| GPU PID Type Process name Usage |\n",
315 | "|=============================================================================|\n",
316 | "| No running processes found |\n",
317 | "+-----------------------------------------------------------------------------+\n"
318 | ],
319 | "name": "stdout"
320 | }
321 | ]
322 | },
323 | {
324 | "cell_type": "code",
325 | "metadata": {
326 | "id": "QZhxoGQjTPb6",
327 | "colab_type": "code",
328 | "colab": {}
329 | },
330 | "source": [
331 | ""
332 | ],
333 | "execution_count": 0,
334 | "outputs": []
335 | }
336 | ]
337 | }
--------------------------------------------------------------------------------
/03_cuda_lab/README.md:
--------------------------------------------------------------------------------
1 | ## 03_cuda_lab
2 |
3 |
--------------------------------------------------------------------------------
/03_cuda_lab/clock.cu:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
3 | *
4 | * Please refer to the NVIDIA end user license agreement (EULA) associated
5 | * with this source code for terms and conditions that govern your use of
6 | * this software. Any use, reproduction, disclosure, or distribution of
7 | * this software and related documentation outside the terms of the EULA
8 | * is strictly prohibited.
9 | *
10 | */
11 |
12 | // This example shows how to use the clock function to measure the performance of
13 | // a kernel accurately.
14 | //
15 | // Blocks are executed in parallel and out of order. Since there's no synchronization
16 | // mechanism between blocks, we measure the clock once for each block. The clock
17 | // samples are written to device memory.
18 |
19 | // System includes
20 | #include <assert.h>
21 | #include <stdio.h>
22 | #include <stdint.h>
23 |
24 | // CUDA runtime
25 | #include <cuda_runtime.h>
26 |
27 | // helper functions and utilities to work with CUDA
28 | #include <helper_functions.h>
29 | #include <helper_cuda.h>
30 |
31 | // This kernel computes a standard parallel reduction and evaluates the
32 | // time it takes to do that for each block. The timing results are stored
33 | // in device memory.
34 | __global__ static void timedReduction(const float *input, float *output, clock_t *timer)
35 | {
36 | // __shared__ float shared[2 * blockDim.x];
37 | extern __shared__ float shared[];
38 |
39 | const int tid = threadIdx.x;
40 | const int bid = blockIdx.x;
41 |
42 | if (tid == 0) timer[bid] = clock();
43 |
44 | // Copy input.
45 | shared[tid] = input[tid];
46 | shared[tid + blockDim.x] = input[tid + blockDim.x];
47 |
48 | // Perform reduction to find minimum.
49 | for (int d = blockDim.x; d > 0; d /= 2)
50 | {
51 | __syncthreads();
52 |
53 | if (tid < d)
54 | {
55 | float f0 = shared[tid];
56 | float f1 = shared[tid + d];
57 |
58 | if (f1 < f0)
59 | {
60 | shared[tid] = f1;
61 | }
62 | }
63 | }
64 |
65 | // Write result.
66 | if (tid == 0) output[bid] = shared[0];
67 |
68 | __syncthreads();
69 |
70 | if (tid == 0) timer[bid+gridDim.x] = clock();
71 | }
72 |
73 |
74 | // This example shows how to use the clock function to measure the performance of
75 | // a kernel accurately.
76 | //
77 | // Blocks are executed in parallel and out of order. Since there's no synchronization
78 | // mechanism between blocks, we measure the clock once for each block. The clock
79 | // samples are written to device memory.
80 |
81 | #define NUM_BLOCKS 64
82 | #define NUM_THREADS 256
83 |
84 | // It's interesting to change the number of blocks and the number of threads to
85 | // understand how to keep the hardware busy.
86 | //
87 | // Here are some numbers I get on my G80:
88 | // blocks - clocks
89 | // 1 - 3096
90 | // 8 - 3232
91 | // 16 - 3364
92 | // 32 - 4615
93 | // 64 - 9981
94 | //
95 | // With less than 16 blocks some of the multiprocessors of the device are idle. With
96 | // more than 16 you are using all the multiprocessors, but there's only one block per
97 | // multiprocessor and that doesn't allow you to hide the latency of the memory. With
98 | // more than 32 the speed scales linearly.
99 |
100 | // Start the main CUDA Sample here
101 | int main(int argc, char **argv)
102 | {
103 | printf("CUDA Clock sample\n");
104 |
105 | // This will pick the best possible CUDA capable device
106 | int dev = findCudaDevice(argc, (const char **)argv);
107 |
108 | float *dinput = NULL;
109 | float *doutput = NULL;
110 | clock_t *dtimer = NULL;
111 |
112 | clock_t timer[NUM_BLOCKS * 2];
113 | float input[NUM_THREADS * 2];
114 |
115 | for (int i = 0; i < NUM_THREADS * 2; i++)
116 | {
117 | input[i] = (float)i;
118 | }
119 |
120 | checkCudaErrors(cudaMalloc((void **)&dinput, sizeof(float) * NUM_THREADS * 2));
121 | checkCudaErrors(cudaMalloc((void **)&doutput, sizeof(float) * NUM_BLOCKS));
122 | checkCudaErrors(cudaMalloc((void **)&dtimer, sizeof(clock_t) * NUM_BLOCKS * 2));
123 |
124 | checkCudaErrors(cudaMemcpy(dinput, input, sizeof(float) * NUM_THREADS * 2, cudaMemcpyHostToDevice));
125 |
126 |     timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer);
127 |
128 | checkCudaErrors(cudaMemcpy(timer, dtimer, sizeof(clock_t) * NUM_BLOCKS * 2, cudaMemcpyDeviceToHost));
129 |
130 | checkCudaErrors(cudaFree(dinput));
131 | checkCudaErrors(cudaFree(doutput));
132 | checkCudaErrors(cudaFree(dtimer));
133 |
134 |
135 | // Compute the difference between the last block end and the first block start.
136 | clock_t minStart = timer[0];
137 | clock_t maxEnd = timer[NUM_BLOCKS];
138 |
139 | for (int i = 1; i < NUM_BLOCKS; i++)
140 | {
141 | minStart = timer[i] < minStart ? timer[i] : minStart;
142 | maxEnd = timer[NUM_BLOCKS+i] > maxEnd ? timer[NUM_BLOCKS+i] : maxEnd;
143 | }
144 |
145 | printf("Total clocks = %Lf\n", (long double)(maxEnd - minStart));
146 |
147 |
148 | // cudaDeviceReset causes the driver to clean up all state. While
149 | // not mandatory in normal operation, it is good practice. It is also
150 | // needed to ensure correct operation when the application is being
151 | // profiled. Calling cudaDeviceReset causes all profile data to be
152 | // flushed before the application exits
153 | cudaDeviceReset();
154 |
155 | return EXIT_SUCCESS;
156 | }
157 |
--------------------------------------------------------------------------------
/PPTs/001_Intro. Parallel Computing.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeonggunlee/Parallel_Programming/f5349f3626cc2a5b94e76648d4f864111b0bbba0/PPTs/001_Intro. Parallel Computing.pptx
--------------------------------------------------------------------------------
/PPTs/README.md:
--------------------------------------------------------------------------------
1 | ## Powerpoint course material
2 | ## 파워포인트 수업 자료
3 |
--------------------------------------------------------------------------------
/PPTs/Robot_02_CUDA I - Basic Programming.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeonggunlee/Parallel_Programming/f5349f3626cc2a5b94e76648d4f864111b0bbba0/PPTs/Robot_02_CUDA I - Basic Programming.pdf
--------------------------------------------------------------------------------
/PPTs/Robot_03_CUDA II - Optimization - Transpose.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeonggunlee/Parallel_Programming/f5349f3626cc2a5b94e76648d4f864111b0bbba0/PPTs/Robot_03_CUDA II - Optimization - Transpose.pdf
--------------------------------------------------------------------------------
/PPTs/Robot_04_CUDA III - Optimization - Reductions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeonggunlee/Parallel_Programming/f5349f3626cc2a5b94e76648d4f864111b0bbba0/PPTs/Robot_04_CUDA III - Optimization - Reductions.pdf
--------------------------------------------------------------------------------
/PPTs/Robot_05_CUDA IV - Optimization - Mat-Mat Multiplication.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeonggunlee/Parallel_Programming/f5349f3626cc2a5b94e76648d4f864111b0bbba0/PPTs/Robot_05_CUDA IV - Optimization - Mat-Mat Multiplication.pdf
--------------------------------------------------------------------------------
/PPTs/Robot_06_CUDA V - Synchronization Stream.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jeonggunlee/Parallel_Programming/f5349f3626cc2a5b94e76648d4f864111b0bbba0/PPTs/Robot_06_CUDA V - Synchronization Stream.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Big Data Major Course, School of Software Convergence, Hallym University / Fall Semester 2019
2 | # Parallel Programming with GPUs (CUDA Parallel Programming)
3 | ## High Performance Parallel Programming with Nvidia GPU and CUDA
4 |
5 | - 강사: 이정근, 한림대학교 소프트웨어융합대학 교수 (Lecturer: Jeong-Gun Lee, Hallym University)
6 | - Web: www.onchip.net
7 |
8 | ### 주요 자료 (Main Teaching Materials)
9 | - Lecture materials:
10 | - All the lecture materials can be found in the [PPT Directory](https://github.com/jeonggunlee/Parallel_Programming/tree/master/PPTs)
11 | - 실습 자료 (Lab materials):
12 | - [01_a_lab](https://github.com/jeonggunlee/Parallel_Programming/tree/master/01_cuda_lab)
13 | - [02_a_lab](https://github.com/jeonggunlee/Parallel_Programming/tree/master/02_cuda_lab)
14 | - [03_a_lab](https://github.com/jeonggunlee/Parallel_Programming/tree/master/03_cuda_lab)
15 | - All the GPU lab code was developed to run in the ```Google Colab``` environment, so you can do the lab practice in a web browser without a local GPU.
16 | - [Hallym SW-centered University video series: GPU/CUDA parallel programming](https://www.youtube.com/playlist?list=PLKZ28p5qq0DGLcO6QZdMSG_jsprRtG15C) (short introductory lecture clips on YouTube)
17 |
18 | ## Schedule
19 | - Introduction to parallel programming ([PPT](https://github.com/jeonggunlee/Parallel_Programming/tree/master/PPTs))
20 | - GPU CUDA parallel programming basics
21 | - GPU architecture
22 | - GPU CUDA parallel programming: hands-on labs
23 |     - Using a GPU on colab.research.google.com ([hello_CUDA.ipynb](./hello_CUDA.ipynb))
24 |     - [Linking with Google Drive](https://github.com/jeonggunlee/Parallel_Programming/blob/master/colab_gdrive.ipynb)
25 |     - [Example files from the Udacity GPU Programming course](https://github.com/jeonggunlee/cs344/)
26 | - GPU CUDA programming: optimization
27 | - GPU CUDA programming: optimization labs
28 |     - Parallel transpose optimization
29 |     - CUDA optimization labs
30 |     - Parallel CUDA vector/matrix multiplication optimization
31 | - [Optional] Parallel reduction optimization
32 | - [Optional] Host-GPU interfacing: streams
33 | - [Optional] Host-GPU interfacing: streams lab
34 |
35 | ## Capstone Project
36 | - Intelligent CCTV development project using an ```Nvidia Jetson Board```
37 |     - [Nvidia site information](https://developer.nvidia.com/embedded/jetson-nano-developer-kit)
38 |     - [Jetson Nano object detection demo](https://www.youtube.com/watch?v=k5pXXmTkPNM)
39 |     - [YOLO object detection GitHub](https://github.com/pjreddie/darknet)
40 |     - [YOLO object detection main homepage](https://pjreddie.com/darknet/)
41 |     - [YOLO object detection video](https://www.youtube.com/watch?v=MPU2HistivI)
42 |
43 | - Graduate student mentoring support
44 |
45 | * * *
46 |
47 | ## References
48 | - Good-to-See Source Example: https://github.com/jeonggunlee/cs344
49 | - CUDA Sample Directory: C:\ProgramData\NVIDIA Corporation\CUDA Samples
50 | - Highly recommended CUDA course: Udacity [Intro to Parallel Programming](https://www.youtube.com/watch?v=F620ommtjqk&list=PLAwxTw4SYaPnFKojVQrmyOGFCqHTxfdv2)
51 | - Udacity [High Performance Computer Architecture](https://www.youtube.com/watch?v=tawb_aeYQ2g&list=PLAwxTw4SYaPmqpjgrmf4-DGlaeV0om4iP&index=1)
52 | - Udacity [High Performance Computing](https://www.youtube.com/watch?v=grD5en6_IiQ&list=PLAwxTw4SYaPk8NaXIiFQXWK6VPnrtMRXC)
53 | - [CUDA LECTURE](https://www.youtube.com/watch?v=sxhvmTveO2A) - Oklahoma State University
54 | - [Cloud (AWS) setup](https://github.com/jeonggunlee/CUDATeaching/blob/master/gpu4cloud.md) for coding practice
55 |
--------------------------------------------------------------------------------
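As referenced in the README above, the labs run entirely in Google Colab without a local GPU. The sketch below is a rough illustration of the kind of self-contained kernel the introductory CUDA labs build on: a simple vector addition. The file name `vectorAdd.cu`, the array size, and the block size are illustrative assumptions, not taken from the repository; in a Colab cell such a file could be compiled and run with something like `!nvcc -o vectorAdd vectorAdd.cu && ./vectorAdd`.

```cuda
// vectorAdd.cu -- minimal sketch, not a file from this repository.
// Assumed Colab usage: !nvcc -o vectorAdd vectorAdd.cu && ./vectorAdd
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Each thread computes one element of c = a + b.
__global__ void vectorAdd(const float *a, const float *b, float *c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) c[i] = a[i] + b[i];
}

int main() {
    const int n = 1 << 20;                  // 1M elements (illustrative size)
    const size_t bytes = n * sizeof(float);

    // Allocate and fill host buffers.
    float *ha = (float *)malloc(bytes);
    float *hb = (float *)malloc(bytes);
    float *hc = (float *)malloc(bytes);
    for (int i = 0; i < n; ++i) { ha[i] = 1.0f; hb[i] = 2.0f; }

    // Allocate device buffers and copy the inputs over.
    float *da, *db, *dc;
    cudaMalloc(&da, bytes);
    cudaMalloc(&db, bytes);
    cudaMalloc(&dc, bytes);
    cudaMemcpy(da, ha, bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(db, hb, bytes, cudaMemcpyHostToDevice);

    // Launch enough 256-thread blocks to cover all n elements.
    const int threads = 256;
    const int blocks = (n + threads - 1) / threads;
    vectorAdd<<<blocks, threads>>>(da, db, dc, n);
    cudaDeviceSynchronize();

    // Copy the result back and spot-check one element.
    cudaMemcpy(hc, dc, bytes, cudaMemcpyDeviceToHost);
    printf("c[0] = %.1f (expected 3.0)\n", hc[0]);

    cudaFree(da); cudaFree(db); cudaFree(dc);
    free(ha); free(hb); free(hc);
    return 0;
}
```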
/colab_gdrive.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "name": "Untitled1.ipynb",
7 | "version": "0.3.2",
8 | "provenance": [],
9 | "include_colab_link": true
10 | },
11 | "kernelspec": {
12 | "name": "python3",
13 | "display_name": "Python 3"
14 | },
15 | "accelerator": "GPU"
16 | },
17 | "cells": [
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "id": "view-in-github",
22 | "colab_type": "text"
23 | },
24 | "source": [
25 | "
"
26 | ]
27 | },
28 | {
29 | "metadata": {
30 | "id": "lwt6hjvawgy5",
31 | "colab_type": "code",
32 | "colab": {}
33 | },
34 | "cell_type": "code",
35 | "source": [
36 | "from google.colab import drive"
37 | ],
38 | "execution_count": 0,
39 | "outputs": []
40 | },
41 | {
42 | "metadata": {
43 | "id": "JfuMBZhrwkEz",
44 | "colab_type": "code",
45 | "colab": {
46 | "base_uri": "https://localhost:8080/",
47 | "height": 131
48 | },
49 | "outputId": "e4efcbaa-2885-4f63-aefa-c335c4d6a022"
50 | },
51 | "cell_type": "code",
52 | "source": [
53 | "drive.mount('/content/gdrive')"
54 | ],
55 | "execution_count": 2,
56 | "outputs": [
57 | {
58 | "output_type": "stream",
59 | "text": [
60 | "Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code\n",
61 | "\n",
62 | "Enter your authorization code:\n",
63 | "··········\n",
64 | "Mounted at /content/gdrive\n"
65 | ],
66 | "name": "stdout"
67 | }
68 | ]
69 | },
70 | {
71 | "metadata": {
72 | "id": "c4SIRCeAxuZ6",
73 | "colab_type": "code",
74 | "colab": {
75 | "base_uri": "https://localhost:8080/",
76 | "height": 36
77 | },
78 | "outputId": "ac4742b6-d2ab-4fb3-8824-2b6be6d5dd75"
79 | },
80 | "cell_type": "code",
81 | "source": [
82 | "%cd /content"
83 | ],
84 | "execution_count": 3,
85 | "outputs": [
86 | {
87 | "output_type": "stream",
88 | "text": [
89 | "/content\n"
90 | ],
91 | "name": "stdout"
92 | }
93 | ]
94 | },
95 | {
96 | "metadata": {
97 | "id": "0T7r6E_wx7sb",
98 | "colab_type": "code",
99 | "colab": {
100 | "base_uri": "https://localhost:8080/",
101 | "height": 36
102 | },
103 | "outputId": "2442164e-d6ef-4969-ed28-21c4095495ec"
104 | },
105 | "cell_type": "code",
106 | "source": [
107 | "!ls"
108 | ],
109 | "execution_count": 4,
110 | "outputs": [
111 | {
112 | "output_type": "stream",
113 | "text": [
114 | "gdrive\tsample_data\n"
115 | ],
116 | "name": "stdout"
117 | }
118 | ]
119 | },
120 | {
121 | "metadata": {
122 | "id": "zgJG8sc5x8b6",
123 | "colab_type": "code",
124 | "colab": {
125 | "base_uri": "https://localhost:8080/",
126 | "height": 36
127 | },
128 | "outputId": "fc4b3cb7-1fbe-4569-f886-15800e63a7a8"
129 | },
130 | "cell_type": "code",
131 | "source": [
132 | "%cd gdrive"
133 | ],
134 | "execution_count": 5,
135 | "outputs": [
136 | {
137 | "output_type": "stream",
138 | "text": [
139 | "/content/gdrive\n"
140 | ],
141 | "name": "stdout"
142 | }
143 | ]
144 | },
145 | {
146 | "metadata": {
147 | "id": "IOks_A0px-gL",
148 | "colab_type": "code",
149 | "colab": {
150 | "base_uri": "https://localhost:8080/",
151 | "height": 36
152 | },
153 | "outputId": "5747474a-6e66-4ff0-89c3-2952107b36e8"
154 | },
155 | "cell_type": "code",
156 | "source": [
157 | "!ls"
158 | ],
159 | "execution_count": 6,
160 | "outputs": [
161 | {
162 | "output_type": "stream",
163 | "text": [
164 | "'My Drive'\n"
165 | ],
166 | "name": "stdout"
167 | }
168 | ]
169 | },
170 | {
171 | "metadata": {
172 | "id": "vJmG1Mhpx_dj",
173 | "colab_type": "code",
174 | "colab": {
175 | "base_uri": "https://localhost:8080/",
176 | "height": 36
177 | },
178 | "outputId": "f4684259-1487-4923-d35b-b685fb14059a"
179 | },
180 | "cell_type": "code",
181 | "source": [
182 | "%cd My Drive"
183 | ],
184 | "execution_count": 7,
185 | "outputs": [
186 | {
187 | "output_type": "stream",
188 | "text": [
189 | "/content/gdrive/My Drive\n"
190 | ],
191 | "name": "stdout"
192 | }
193 | ]
194 | }
195 | ]
196 | }
--------------------------------------------------------------------------------
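The colab_gdrive.ipynb walkthrough above ends after changing into `/content/gdrive/My Drive`. A common reason for mounting Drive in these labs is to keep CUDA sources and compiled binaries where they survive Colab session resets. The sketch below is a hedged illustration of that workflow: the file name `hello.cu`, the save path on Drive, and the nvcc invocation are assumptions for illustration, not steps taken from the notebook.

```cuda
// hello.cu -- minimal sketch of a kernel kept on the mounted Drive.
// Assumed Colab workflow (not shown in the notebook above):
//   1. Save this file to "/content/gdrive/My Drive/hello.cu"
//   2. !nvcc -o hello "/content/gdrive/My Drive/hello.cu" && ./hello
#include <cstdio>
#include <cuda_runtime.h>

// Every launched thread prints its block and thread index.
__global__ void hello_from_gpu() {
    printf("Hello from block %d, thread %d\n", blockIdx.x, threadIdx.x);
}

int main() {
    hello_from_gpu<<<2, 4>>>();   // 2 blocks x 4 threads = 8 greetings
    cudaDeviceSynchronize();      // wait so device-side printf output is flushed
    return 0;
}
```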